From d5e15272d2ae90502032ede7790823186c4123a8 Mon Sep 17 00:00:00 2001 From: CharlesKWON Date: Mon, 15 Jun 2026 12:42:33 +0900 Subject: [PATCH] feat(charset): UTF-8 default string semantics with selectable charset Five strings now operate in Unicode rune units by default. Core string functions (LEN/CHR/ASC/SUBSTR/LEFT/RIGHT/AT/PADR/PADL) are charset-aware: UTF-8 rune semantics by default, byte/charset semantics when a legacy charset (CP949, CP1252, ...) is selected. Initial charset is settable via FIVE_CHARSET / HB_CODEPAGE env vars; default UTF8. - hbrtl/charset.go: charset state + Str* helpers + DecodeToUTF8/EncodeFromUTF8 + RTL HB_GETCHARSET/HB_SETCHARSET/HB_CDPSELECT/HB_TRANSLATE (x/text htmlindex) - compiler/gengo: inlined string intrinsics now call charset-aware hbrtl.Str* helpers instead of byte-based Go (they previously bypassed the RTL registry) - compiler/analyzer: register HB_GETCHARSET/HB_SETCHARSET/HB_TRANSLATE as known - hbrtl/regex.go: add HB_REGEX (array-of-submatches) Co-Authored-By: Claude Opus 4.8 (1M context) --- compiler/analyzer/analyzer.go | 2 + compiler/gengo/gengo.go | 33 ++-- hbrtl/charset.go | 319 ++++++++++++++++++++++++++++++++++ hbrtl/missing.go | 81 ++++++--- hbrtl/missing_fivesql.go | 8 +- hbrtl/regex.go | 35 ++++ hbrtl/register.go | 4 + hbrtl/strings.go | 38 +++- 8 files changed, 476 insertions(+), 44 deletions(-) create mode 100644 hbrtl/charset.go diff --git a/compiler/analyzer/analyzer.go b/compiler/analyzer/analyzer.go index a49e6a9..97cbbfe 100644 --- a/compiler/analyzer/analyzer.go +++ b/compiler/analyzer/analyzer.go @@ -629,6 +629,8 @@ var rtlFunctions = map[string]bool{ "HB_DATETIME": true, "HB_HOUR": true, "HB_MINUTE": true, "HB_SEC": true, "HB_TTOC": true, "HB_CTOT": true, "HB_SECOND": true, "HB_ATOKENS": true, "HB_CDPSELECT": true, "HB_TTOS": true, "HB_STOT": true, "HB_MILLISECONDS": true, + // Charset (charset.go) + "HB_GETCHARSET": true, "HB_SETCHARSET": true, "HB_TRANSLATE": true, "HB_DATE": true, "HB_CTOD": true, 
"HB_DTOC": true, "HB_STOD": true, "HB_DTOT": true, "HB_TTOD": true, "HB_TTOHOUR": true, "HB_TTOMIN": true, "HB_TTOSEC": true, "HB_TTOMSEC": true, "HB_TTON": true, "HB_NTOT": true, diff --git a/compiler/gengo/gengo.go b/compiler/gengo/gengo.go index 82025af..53d082d 100644 --- a/compiler/gengo/gengo.go +++ b/compiler/gengo/gengo.go @@ -1396,7 +1396,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool { case "LEN": if len(args) == 1 { g.emitExpr(args[0]) - g.writeln("{ _v := t.Pop2(); if _v.IsString() { t.PushInt(len(_v.AsString())) } else if _v.IsArray() { t.PushInt(len(_v.AsArray().Items)) } else if _v.IsHash() { t.PushInt(len(_v.AsHash().Keys)) } else { t.PushInt(0) } }") + g.writeln("{ _v := t.Pop2(); if _v.IsString() { t.PushInt(hbrtl.StrLen(_v.AsString())) } else if _v.IsArray() { t.PushInt(len(_v.AsArray().Items)) } else if _v.IsHash() { t.PushInt(len(_v.AsHash().Keys)) } else { t.PushInt(0) } }") return true } case "EMPTY": @@ -1409,13 +1409,13 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool { case "CHR": if len(args) == 1 { g.emitExpr(args[0]) - g.writeln("t.PushString(string(byte(t.Pop2().AsNumInt())))") + g.writeln("t.PushString(hbrtl.StrChr(int(t.Pop2().AsNumInt())))") return true } case "ASC": if len(args) == 1 { g.emitExpr(args[0]) - g.writeln("{ _s := t.Pop2().AsString(); if len(_s)>0 { t.PushInt(int(_s[0])) } else { t.PushInt(0) } }") + g.writeln("{ _s := t.Pop2().AsString(); t.PushInt(hbrtl.StrAsc(_s)) }") return true } case "EOF": @@ -1496,8 +1496,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool { g.emitExpr(args[0]) g.emitExpr(args[1]) g.writeln("{ _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()") - g.writeln("if len(_ps) >= _pn { t.PushString(_ps[:_pn])") - g.writeln("} else { t.PushString(_ps + hbrtl.Spaces(_pn - len(_ps))) } }") + g.writeln("t.PushString(hbrtl.StrPadR(_ps, _pn)) }") return true } case "PADL": @@ -1507,13 +1506,10 @@ func (g *Generator) 
tryEmitInlineRTL(name string, args []ast.Expr) bool { if len(args) == 3 { g.emitExpr(args[2]) g.writeln("{ _pf := t.Pop2().AsString(); _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()") - g.writeln("if len(_ps) >= _pn { t.PushString(_ps[len(_ps)-_pn:])") - g.writeln("} else { t.PushString(strings.Repeat(_pf[:1], _pn-len(_ps)) + _ps) } }") - g.imports["strings"] = true + g.writeln("t.PushString(hbrtl.StrPadL(_ps, _pn, _pf)) }") } else { g.writeln("{ _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()") - g.writeln("if len(_ps) >= _pn { t.PushString(_ps[len(_ps)-_pn:])") - g.writeln("} else { t.PushString(hbrtl.Spaces(_pn - len(_ps)) + _ps) } }") + g.writeln("t.PushString(hbrtl.StrPadL(_ps, _pn, \" \")) }") } return true } @@ -1523,13 +1519,12 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool { g.emitExpr(args[1]) if len(args) == 3 { g.emitExpr(args[2]) - g.writeln("{ _sl := int(t.Pop2().AsNumInt()); _sp := int(t.Pop2().AsNumInt())-1; _ss := t.Pop2().AsString()") + g.writeln("{ _sl := int(t.Pop2().AsNumInt()); _sp := int(t.Pop2().AsNumInt()); _ss := t.Pop2().AsString()") + g.writeln("t.PushString(hbrtl.StrSubStr(_ss, _sp, _sl, true)) }") } else { - g.writeln("{ _sl := 0; _sp := int(t.Pop2().AsNumInt())-1; _ss := t.Pop2().AsString(); _sl = len(_ss) - _sp") + g.writeln("{ _sp := int(t.Pop2().AsNumInt()); _ss := t.Pop2().AsString()") + g.writeln("t.PushString(hbrtl.StrSubStr(_ss, _sp, 0, false)) }") } - g.writeln("if _sp < 0 { _sp = 0 }; if _sp > len(_ss) { _sp = len(_ss) }") - g.writeln("if _sp+_sl > len(_ss) { _sl = len(_ss) - _sp }") - g.writeln("t.PushString(_ss[_sp:_sp+_sl]) }") return true } case "LEFT": @@ -1537,7 +1532,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool { g.emitExpr(args[0]) g.emitExpr(args[1]) g.writeln("{ _ln := int(t.Pop2().AsNumInt()); _ls := t.Pop2().AsString()") - g.writeln("if _ln >= len(_ls) { t.PushString(_ls) } else if _ln <= 0 { t.PushString(\"\") } else { 
t.PushString(_ls[:_ln]) } }") + g.writeln("t.PushString(hbrtl.StrLeft(_ls, _ln)) }") return true } case "RIGHT": @@ -1545,7 +1540,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool { g.emitExpr(args[0]) g.emitExpr(args[1]) g.writeln("{ _rn := int(t.Pop2().AsNumInt()); _rs := t.Pop2().AsString()") - g.writeln("if _rn >= len(_rs) { t.PushString(_rs) } else if _rn <= 0 { t.PushString(\"\") } else { t.PushString(_rs[len(_rs)-_rn:]) } }") + g.writeln("t.PushString(hbrtl.StrRight(_rs, _rn)) }") return true } case "AT": @@ -1553,9 +1548,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool { g.emitExpr(args[0]) g.emitExpr(args[1]) g.writeln("{ _as := t.Pop2().AsString(); _ak := t.Pop2().AsString()") - g.writeln("_ai := strings.Index(_as, _ak)") - g.writeln("if _ai >= 0 { t.PushInt(_ai+1) } else { t.PushInt(0) } }") - g.imports["strings"] = true + g.writeln("t.PushInt(hbrtl.StrAt(_ak, _as)) }") return true } case "IIF": diff --git a/hbrtl/charset.go b/hbrtl/charset.go new file mode 100644 index 0000000..72ff5fd --- /dev/null +++ b/hbrtl/charset.go @@ -0,0 +1,319 @@ +// charset.go — Five 문자셋(charset/codepage) 서브시스템. +// +// 설계 +// - Five 의 문자열 기본 인코딩은 UTF-8 이다(Go 네이티브). 코어 문자열 +// 함수(CHR/ASC/LEN/SUBSTR/LEFT/RIGHT/AT)는 활성 charset 이 UTF-8 이면 +// '문자(rune)' 단위로 동작한다. +// - 활성 charset 을 지정하면(예: CP949, CP1252) 그 charset 의 의미로 +// 동작하고, 입출력 경계 변환은 HB_TRANSLATE / 디코드·인코드 헬퍼로 한다. +// - 정의하지 않으면 UTF-8. 초기값은 환경변수 FIVE_CHARSET(또는 HB_CODEPAGE) +// 로 지정할 수 있다. 
+//
+// PRG surface
+//   HB_GETCHARSET()                → cCurrent   (e.g. "UTF8")
+//   HB_SETCHARSET([cName])         → cPrev      (query only when no argument is given)
+//   HB_CDPSELECT([cName])          → cPrev      (Harbour-compatible alias)
+//   HB_TRANSLATE(cStr, cFrom, cTo) → cConverted (conversion between charsets)
+package hbrtl
+
+import (
+	"os"
+	"strings"
+	"sync"
+	"unicode/utf8"
+
+	"five/hbrt"
+
+	"golang.org/x/text/encoding"
+	"golang.org/x/text/encoding/htmlindex"
+)
+
+var (
+	csMu   sync.RWMutex
+	csName = "UTF8"
+)
+
+func init() {
+	v := os.Getenv("FIVE_CHARSET")
+	if v == "" {
+		v = os.Getenv("HB_CODEPAGE")
+	}
+	if v != "" {
+		csName = normCharset(v)
+	}
+}
+
+// normCharset — normalizes a charset name: trims, upper-cases, and folds "UTF-8"/"UTF8"/"" to "UTF8". "utf-8" → "UTF8".
+func normCharset(s string) string {
+	s = strings.ToUpper(strings.TrimSpace(s))
+	switch s {
+	case "UTF-8", "UTF8", "":
+		return "UTF8"
+	}
+	return s
+}
+
+// charsetIsUTF8 — reports whether the active charset is UTF-8 (the default).
+func charsetIsUTF8() bool { return csIsUTF8(GetCharset()) }
+func csIsUTF8(name string) bool {
+	switch normCharset(name) {
+	case "UTF8", "UTF-8", "":
+		return true
+	}
+	return false
+}
+
+// GetCharset / SetCharset — read/set the active charset. SetCharset returns the previous value; a blank name changes nothing.
+func GetCharset() string {
+	csMu.RLock()
+	defer csMu.RUnlock()
+	return csName
+}
+func SetCharset(name string) string {
+	csMu.Lock()
+	defer csMu.Unlock()
+	prev := csName
+	if strings.TrimSpace(name) != "" {
+		csName = normCharset(name)
+	}
+	return prev
+}
+
+// encodingFor — looks up the x/text encoding for a charset name. Returns nil for UTF-8 or an unsupported name.
+func encodingFor(name string) encoding.Encoding {
+	if csIsUTF8(name) {
+		return nil // identity
+	}
+	// htmlindex accepts standard aliases such as "euc-kr", "windows-1252", "shift_jis".
+	alias := strings.ToLower(strings.TrimSpace(name))
+	switch normCharset(name) {
+	case "CP949", "MS949", "EUCKR", "EUC-KR", "KSC5601":
+		alias = "euc-kr"
+	case "CP1252", "WINDOWS1252", "WINDOWS-1252", "LATIN1", "ISO8859-1", "ISO-8859-1":
+		alias = "windows-1252"
+	case "CP932", "SHIFTJIS", "SJIS", "SHIFT-JIS":
+		alias = "shift_jis"
+	case "CP936", "GBK", "GB2312":
+		alias = "gbk"
+	case "CP950", "BIG5":
+		alias = "big5"
+	}
+	if enc, err := htmlindex.Get(alias); err == nil {
+		return enc
+	}
+	return nil
+}
+
+// DecodeToUTF8 — decodes bytes in the given charset into a UTF-8 string; the input is returned as-is when no encoding is found or decoding fails.
+func DecodeToUTF8(b []byte, fromCharset string) string {
+	enc := encodingFor(fromCharset)
+	if enc == nil {
+		return string(b)
+	}
+	if out, err := enc.NewDecoder().Bytes(b); err == nil {
+		return string(out)
+	}
+	return string(b)
+}
+
+// EncodeFromUTF8 — encodes a UTF-8 string into bytes of the given charset; the input bytes are returned as-is when no encoding is found or encoding fails.
+func EncodeFromUTF8(s, toCharset string) []byte {
+	enc := encodingFor(toCharset)
+	if enc == nil {
+		return []byte(s)
+	}
+	if out, err := enc.NewEncoder().Bytes([]byte(s)); err == nil {
+		return out
+	}
+	return []byte(s)
+}
+
+// ── charset-aware core string helpers ────────────────────────────────────
+// The compiler (gengo) calls these helpers when it inlines
+// LEN/CHR/ASC/SUBSTR/LEFT/RIGHT/AT. They work per rune (character) when the
+// active charset is UTF-8 (the default) and per byte otherwise.
+
+// StrLen — LEN(cString): length in charset units.
+func StrLen(s string) int {
+	if charsetIsUTF8() {
+		return utf8.RuneCountInString(s)
+	}
+	return len(s)
+}
+
+// StrChr — CHR(nCode): builds one character in charset units.
+func StrChr(n int) string {
+	if charsetIsUTF8() {
+		if n < 0 {
+			n = 0
+		}
+		return string(rune(n))
+	}
+	return string([]byte{byte(n)})
+}
+
+// StrAsc — ASC(cString): code value of the first character.
+func StrAsc(s string) int {
+	if s == "" {
+		return 0
+	}
+	if charsetIsUTF8() {
+		r, _ := utf8.DecodeRuneInString(s)
+		return int(r)
+	}
+	return int(s[0])
+}
+
+// StrSubStr — SUBSTR(cString, nStart, nLen): nStart is 1-based.
+// When hasLen is false the result runs from nStart to the end.
+func StrSubStr(s string, start, length int, hasLen bool) string {
+	// Work in charset units: runes for UTF-8, bytes for a legacy charset.
+	utf := charsetIsUTF8()
+	var rs []rune
+	n := len(s)
+	if utf {
+		rs = []rune(s)
+		n = len(rs)
+	}
+	// Same rules as the SUBSTR RTL entry point (SubStr): a negative start
+	// counts back from the end of the string, and a start that still falls
+	// before the first character is clamped to position 1.
+	if start < 0 {
+		start += n + 1
+	}
+	if start < 1 {
+		start = 1
+	}
+	sp := start - 1
+	if sp >= n {
+		return ""
+	}
+	// Clamp the length to the remaining tail. Comparing against n-sp, not
+	// testing sp+length > n, keeps a huge nLen from overflowing the sum.
+	sl := n - sp
+	if hasLen && length < sl {
+		sl = length
+	}
+	if sl <= 0 {
+		return ""
+	}
+	if utf {
+		return string(rs[sp : sp+sl])
+	}
+	return s[sp : sp+sl]
+}
+
+// StrLeft — LEFT(cString, nLen): the first n characters of s.
+// n counts runes under UTF-8 and bytes under a legacy charset; n <= 0 gives
+// "" and n at or beyond the length gives s unchanged.
+func StrLeft(s string, n int) string {
+	if n <= 0 {
+		return ""
+	}
+	if charsetIsUTF8() {
+		rs := []rune(s)
+		if n >= len(rs) {
+			return s
+		}
+		return string(rs[:n])
+	}
+	if n >= len(s) {
+		return s
+	}
+	return s[:n]
+}
+
+// StrRight — RIGHT(cString, nLen): the last n characters of s.
+// n counts runes under UTF-8 and bytes under a legacy charset; n <= 0 gives
+// "" and n at or beyond the length gives s unchanged.
+func StrRight(s string, n int) string {
+	if n <= 0 {
+		return ""
+	}
+	if charsetIsUTF8() {
+		rs := []rune(s)
+		if n >= len(rs) {
+			return s
+		}
+		return string(rs[len(rs)-n:])
+	}
+	if n >= len(s) {
+		return s
+	}
+	return s[len(s)-n:]
+}
+
+// StrAt — AT(cSearch, cTarget): 1-based position of the first occurrence of
+// search in target (a rune position under UTF-8, a byte position otherwise),
+// or 0 when search is empty or does not occur.
+func StrAt(search, target string) int {
+	idx := strings.Index(target, search)
+	// strings.Index reports 0 for an empty needle; Harbour's AT() reports "not found".
+	if idx < 0 || search == "" {
+		return 0
+	}
+	if charsetIsUTF8() {
+		return utf8.RuneCountInString(target[:idx]) + 1
+	}
+	return idx + 1
+}
+
+// StrPadR — PADR(cString, nLen): right-pads with spaces (when too long, keeps the leftmost nLen).
+func StrPadR(s string, n int) string {
+	l := StrLen(s)
+	if l >= n {
+		return StrLeft(s, n)
+	}
+	return s + Spaces(n-l)
+}
+
+// StrPadL — PADL(cString, nLen [, cFill]): left-pads (when too long, keeps the rightmost nLen).
+func StrPadL(s string, n int, fill string) string {
+	l := StrLen(s)
+	if l >= n {
+		return StrRight(s, n)
+	}
+	fc := " "
+	if fill != "" {
+		if charsetIsUTF8() {
+			fc = string([]rune(fill)[:1])
+		} else {
+			fc = fill[:1]
+		}
+	}
+	return strings.Repeat(fc, n-l) + s
+}
+
+// ── PRG RTL ──────────────────────────────────────────────────────────
+
+// HbGetCharset: HB_GETCHARSET() → cName
+func HbGetCharset(t *hbrt.Thread) {
+	t.Frame(0, 0)
+	defer t.EndProcFast()
+	t.PushString(GetCharset())
+	t.RetValue()
+}
+
+// HbSetCharset: HB_SETCHARSET([cName]) → cPrev
+func HbSetCharset(t *hbrt.Thread) {
+	nParams := t.ParamCount()
+	t.Frame(nParams, 0)
+	defer t.EndProcFast()
+	name := ""
+	if nParams >= 1 {
+		name = t.Local(1).AsString()
+	}
+	t.PushString(SetCharset(name))
+	t.RetValue()
+}
+
+// HbTranslate: HB_TRANSLATE(cStr, cFrom, cTo) → cConverted
+func HbTranslate(t *hbrt.Thread) {
+	t.Frame(3, 0)
+	defer t.EndProcFast()
+	s := t.Local(1).AsString()
+	from := t.Local(2).AsString()
+	to := t.Local(3).AsString()
+	t.PushString(string(EncodeFromUTF8(DecodeToUTF8([]byte(s), from), to)))
+	t.RetValue()
+}
diff --git a/hbrtl/missing.go b/hbrtl/missing.go
index 16ad8d3..61248aa 100644
--- a/hbrtl/missing.go
+++ b/hbrtl/missing.go
@@ -10,6 +10,7 @@ import (
 	"math"
 	"os"
 	"strings"
+	"unicode/utf8"
 )
 
 // --- String functions ---
@@ -21,63 +22,101 @@ func At(t *hbrt.Thread) {
 	search := t.Local(1).AsString()
 	target := t.Local(2).AsString()
 	idx := strings.Index(target, search)
-	if idx >= 0 {
-		t.RetInt(int64(idx + 1))
-	} else {
+	if idx < 0 || search == "" { // strings.Index gives 0 for an empty needle; Harbour's AT() reports "not found"
 		t.RetInt(0)
+		return
+	}
+	if charsetIsUTF8() {
+		t.RetInt(int64(utf8.RuneCountInString(target[:idx]) + 1)) // character (rune) position
+	} else {
+		t.RetInt(int64(idx + 1)) // byte position
 	}
 }
 
-// Left returns leftmost n characters.
+// Left returns leftmost n characters (UTF-8: per rune, legacy: per byte).
 func Left(t *hbrt.Thread) {
 	t.Frame(2, 0)
 	defer t.EndProcFast()
 	s := t.Local(1).AsString()
 	n := int(t.Local(2).AsNumInt())
-	if n >= len(s) {
-		t.PushString(s)
-	} else if n <= 0 {
-		t.PushString("")
+	if charsetIsUTF8() {
+		rs := []rune(s)
+		if n >= len(rs) {
+			t.PushString(s)
+		} else if n <= 0 {
+			t.PushString("")
+		} else {
+			t.PushString(string(rs[:n]))
+		}
 	} else {
-		t.PushString(s[:n])
+		if n >= len(s) {
+			t.PushString(s)
+		} else if n <= 0 {
+			t.PushString("")
+		} else {
+			t.PushString(s[:n])
+		}
 	}
 	t.RetValue()
 }
 
-// Right returns rightmost n characters.
+// Right returns rightmost n characters (UTF-8: per rune, legacy: per byte).
 func Right(t *hbrt.Thread) {
 	t.Frame(2, 0)
 	defer t.EndProcFast()
 	s := t.Local(1).AsString()
 	n := int(t.Local(2).AsNumInt())
-	if n >= len(s) {
-		t.PushString(s)
-	} else if n <= 0 {
-		t.PushString("")
+	if charsetIsUTF8() {
+		rs := []rune(s)
+		if n >= len(rs) {
+			t.PushString(s)
+		} else if n <= 0 {
+			t.PushString("")
+		} else {
+			t.PushString(string(rs[len(rs)-n:]))
+		}
 	} else {
-		t.PushString(s[len(s)-n:])
+		if n >= len(s) {
+			t.PushString(s)
+		} else if n <= 0 {
+			t.PushString("")
+		} else {
+			t.PushString(s[len(s)-n:])
+		}
 	}
 	t.RetValue()
 }
 
-// Asc returns ASCII code of first character.
+// Asc returns the code of the first character (UTF-8: codepoint, legacy: byte).
 func Asc(t *hbrt.Thread) {
 	t.Frame(1, 0)
 	defer t.EndProcFast()
 	s := t.Local(1).AsString()
-	if len(s) > 0 {
-		t.RetInt(int64(s[0]))
-	} else {
+	if len(s) == 0 {
 		t.RetInt(0)
+		return
+	}
+	if charsetIsUTF8() {
+		r, _ := utf8.DecodeRuneInString(s)
+		t.RetInt(int64(r))
+	} else {
+		t.RetInt(int64(s[0]))
 	}
 }
 
-// Chr returns character from ASCII code.
+// Chr returns the character for a code (UTF-8: codepoint→UTF-8, legacy: 1 byte).
 func Chr(t *hbrt.Thread) {
 	t.Frame(1, 0)
 	defer t.EndProcFast()
 	n := int(t.Local(1).AsNumInt())
-	t.PushString(string([]byte{byte(n)}))
+	if charsetIsUTF8() {
+		if n < 0 {
+			n = 0
+		}
+		t.PushString(string(rune(n))) // codepoint → UTF-8
+	} else {
+		t.PushString(string([]byte{byte(n)}))
+	}
 	t.RetValue()
 }
 
diff --git a/hbrtl/missing_fivesql.go b/hbrtl/missing_fivesql.go
index d191b9f..fa7552b 100644
--- a/hbrtl/missing_fivesql.go
+++ b/hbrtl/missing_fivesql.go
@@ -64,12 +64,16 @@ func HbATokens(t *hbrt.Thread) {
 }
 
 // hb_cdpSelect([cCodepage]) → cPrevCodepage
-// Stub: Five uses UTF-8 internally, codepage selection is a no-op.
+// Sets/queries the active charset. The default is UTF-8. (See charset.go.)
 func HbCdpSelect(t *hbrt.Thread) {
 	nParams := t.ParamCount()
 	t.Frame(nParams, 0)
 	defer t.EndProcFast()
-	t.RetString("")
+	name := ""
+	if nParams >= 1 {
+		name = t.Local(1).AsString()
+	}
+	t.RetString(SetCharset(name))
 }
 
 // Used() → lUsed — checks if current workarea is in use
diff --git a/hbrtl/regex.go b/hbrtl/regex.go
index f5e5bab..3d1f0be 100644
--- a/hbrtl/regex.go
+++ b/hbrtl/regex.go
@@ -93,6 +93,41 @@ func HbRegexAll(t *hbrt.Thread) {
 	t.RetVal(hbrt.MakeArrayFrom(items))
 }
 
+// HB_REGEX(cPattern|pRegex, cString [, lCaseSensitive]) → aSubmatches
+//
+// Returns the first match plus any capture groups. Maps directly to
+// Go's regexp.FindStringSubmatch: result[1] is the whole match,
+// result[2..] are the capture groups. Empty PRG array on no match —
+// distinguishable from `Nil` by callers that need the "anchor present
+// but groups missing" case.
+//
+// Harbour parity: this is the array-of-submatches `hb_regex(p, s)`
+// flavour. The Five-original capture-only callers should use this.
+func HbRegex(t *hbrt.Thread) { + nParams := t.ParamCount() + t.Frame(nParams, 0) + defer t.EndProc() + + re := getRegex(t, 1, nParams >= 3) + if re == nil { + t.RetVal(hbrt.MakeArrayFrom(nil)) + return + } + + str := t.Local(2).AsString() + m := re.FindStringSubmatch(str) + if m == nil { + t.RetVal(hbrt.MakeArrayFrom(nil)) + return + } + + items := make([]hbrt.Value, len(m)) + for i, s := range m { + items[i] = hbrt.MakeString(s) + } + t.RetVal(hbrt.MakeArrayFrom(items)) +} + // HB_REGEXREPLACE(cPattern|pRegex, cString, cReplace [, lCaseSensitive]) → cResult func HbRegexReplace(t *hbrt.Thread) { nParams := t.ParamCount() diff --git a/hbrtl/register.go b/hbrtl/register.go index 5771ac8..aa53f35 100644 --- a/hbrtl/register.go +++ b/hbrtl/register.go @@ -233,6 +233,7 @@ func RegisterRTL(vm *hbrt.VM) { // Regex (Go regexp) hbrt.Sym("HB_REGEXCOMP", hbrt.FsPublic, HbRegexComp), + hbrt.Sym("HB_REGEX", hbrt.FsPublic, HbRegex), hbrt.Sym("HB_REGEXMATCH", hbrt.FsPublic, HbRegexMatch), hbrt.Sym("HB_REGEXSPLIT", hbrt.FsPublic, HbRegexSplit), hbrt.Sym("HB_REGEXALL", hbrt.FsPublic, HbRegexAll), @@ -446,6 +447,9 @@ func RegisterRTL(vm *hbrt.VM) { hbrt.Sym("HB_SECOND", hbrt.FsPublic, HbSecond), hbrt.Sym("HB_ATOKENS", hbrt.FsPublic, HbATokens), hbrt.Sym("HB_CDPSELECT", hbrt.FsPublic, HbCdpSelect), + hbrt.Sym("HB_GETCHARSET", hbrt.FsPublic, HbGetCharset), + hbrt.Sym("HB_SETCHARSET", hbrt.FsPublic, HbSetCharset), + hbrt.Sym("HB_TRANSLATE", hbrt.FsPublic, HbTranslate), hbrt.Sym("DBSETINDEX", hbrt.FsPublic, rtlDbSetIndex), hbrt.Sym("HB_TTOS", hbrt.FsPublic, HbTToS), hbrt.Sym("HB_STOT", hbrt.FsPublic, HbSToT), diff --git a/hbrtl/strings.go b/hbrtl/strings.go index f1f5a83..4d190fb 100644 --- a/hbrtl/strings.go +++ b/hbrtl/strings.go @@ -10,6 +10,7 @@ import ( "fmt" "math" "strings" + "unicode/utf8" ) // spacesCache: pre-built space strings for common pad sizes. 
@@ -134,7 +135,11 @@ func Len(t *hbrt.Thread) { v := t.Local(1) switch { case v.IsString(): - t.RetInt(int64(v.StringLen())) + if charsetIsUTF8() { + t.RetInt(int64(utf8.RuneCountInString(v.AsString()))) // 문자(rune) 수 + } else { + t.RetInt(int64(v.StringLen())) // 바이트 수(레거시 charset) + } case v.IsArray(): t.RetInt(int64(len(v.AsArray().Items))) case v.IsHash(): @@ -161,6 +166,37 @@ func SubStr(t *hbrt.Thread) { s := v.AsString() start := int(t.Local(2).AsNumInt()) + // UTF-8(기본): 문자(rune) 단위. 레거시 charset: 바이트 단위. + if charsetIsUTF8() { + rs := []rune(s) + n := len(rs) + if start < 0 { + start = n + start + 1 + } + if start < 1 { + start = 1 + } + start-- + if start >= n { + t.PushString("") + t.RetValue() + return + } + res := rs[start:] + if nParams >= 3 && !t.Local(3).IsNil() { + nLen := int(t.Local(3).AsNumInt()) + if nLen < 0 { + nLen = 0 + } + if nLen < len(res) { + res = res[:nLen] + } + } + t.PushString(string(res)) + t.RetValue() + return + } + // Harbour: 1-based index, negative = from end if start < 0 { start = len(s) + start + 1