From d5e15272d2ae90502032ede7790823186c4123a8 Mon Sep 17 00:00:00 2001
From: CharlesKWON <charleskwon@CharlesKWONui-iMac-2.local>
Date: Mon, 15 Jun 2026 12:42:33 +0900
Subject: [PATCH] feat(charset): UTF-8 default string semantics with selectable
 charset

Five strings now operate in Unicode rune units by default. Core string
functions (LEN/CHR/ASC/SUBSTR/LEFT/RIGHT/AT/PADR/PADL) are charset-aware:
UTF-8 rune semantics by default, byte/charset semantics when a legacy
charset (CP949, CP1252, ...) is selected. Initial charset is settable via
FIVE_CHARSET / HB_CODEPAGE env vars; default UTF8.

- hbrtl/charset.go: charset state + Str* helpers + DecodeToUTF8/EncodeFromUTF8
  + RTL HB_GETCHARSET/HB_SETCHARSET/HB_CDPSELECT/HB_TRANSLATE (x/text htmlindex)
- compiler/gengo: inlined string intrinsics now call charset-aware hbrtl.Str*
  helpers instead of byte-based Go (they previously bypassed the RTL registry)
- compiler/analyzer: register HB_GETCHARSET/HB_SETCHARSET/HB_TRANSLATE as known
- hbrtl/regex.go: add HB_REGEX (array-of-submatches)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 compiler/analyzer/analyzer.go |   2 +
 compiler/gengo/gengo.go       |  33 ++--
 hbrtl/charset.go              | 319 ++++++++++++++++++++++++++++++++++
 hbrtl/missing.go              |  81 ++++++---
 hbrtl/missing_fivesql.go      |   8 +-
 hbrtl/regex.go                |  35 ++++
 hbrtl/register.go             |   4 +
 hbrtl/strings.go              |  38 +++-
 8 files changed, 476 insertions(+), 44 deletions(-)
 create mode 100644 hbrtl/charset.go

diff --git a/compiler/analyzer/analyzer.go b/compiler/analyzer/analyzer.go
index a49e6a9..97cbbfe 100644
--- a/compiler/analyzer/analyzer.go
+++ b/compiler/analyzer/analyzer.go
@@ -629,6 +629,8 @@ var rtlFunctions = map[string]bool{
 	"HB_DATETIME": true, "HB_HOUR": true, "HB_MINUTE": true, "HB_SEC": true,
 	"HB_TTOC": true, "HB_CTOT": true, "HB_SECOND": true, "HB_ATOKENS": true,
 	"HB_CDPSELECT": true, "HB_TTOS": true, "HB_STOT": true, "HB_MILLISECONDS": true,
+	// Charset (charset.go)
+	"HB_GETCHARSET": true, "HB_SETCHARSET": true, "HB_TRANSLATE": true,
 	"HB_DATE": true, "HB_CTOD": true, "HB_DTOC": true, "HB_STOD": true,
 	"HB_DTOT": true, "HB_TTOD": true, "HB_TTOHOUR": true, "HB_TTOMIN": true,
 	"HB_TTOSEC": true, "HB_TTOMSEC": true, "HB_TTON": true, "HB_NTOT": true,
diff --git a/compiler/gengo/gengo.go b/compiler/gengo/gengo.go
index 82025af..53d082d 100644
--- a/compiler/gengo/gengo.go
+++ b/compiler/gengo/gengo.go
@@ -1396,7 +1396,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
 	case "LEN":
 		if len(args) == 1 {
 			g.emitExpr(args[0])
-			g.writeln("{ _v := t.Pop2(); if _v.IsString() { t.PushInt(len(_v.AsString())) } else if _v.IsArray() { t.PushInt(len(_v.AsArray().Items)) } else if _v.IsHash() { t.PushInt(len(_v.AsHash().Keys)) } else { t.PushInt(0) } }")
+			g.writeln("{ _v := t.Pop2(); if _v.IsString() { t.PushInt(hbrtl.StrLen(_v.AsString())) } else if _v.IsArray() { t.PushInt(len(_v.AsArray().Items)) } else if _v.IsHash() { t.PushInt(len(_v.AsHash().Keys)) } else { t.PushInt(0) } }")
 			return true
 		}
 	case "EMPTY":
@@ -1409,13 +1409,13 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
 	case "CHR":
 		if len(args) == 1 {
 			g.emitExpr(args[0])
-			g.writeln("t.PushString(string(byte(t.Pop2().AsNumInt())))")
+			g.writeln("t.PushString(hbrtl.StrChr(int(t.Pop2().AsNumInt())))")
 			return true
 		}
 	case "ASC":
 		if len(args) == 1 {
 			g.emitExpr(args[0])
-			g.writeln("{ _s := t.Pop2().AsString(); if len(_s)>0 { t.PushInt(int(_s[0])) } else { t.PushInt(0) } }")
+			g.writeln("{ _s := t.Pop2().AsString(); t.PushInt(hbrtl.StrAsc(_s)) }")
 			return true
 		}
 	case "EOF":
@@ -1496,8 +1496,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
 			g.emitExpr(args[0])
 			g.emitExpr(args[1])
 			g.writeln("{ _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()")
-			g.writeln("if len(_ps) >= _pn { t.PushString(_ps[:_pn])")
-			g.writeln("} else { t.PushString(_ps + hbrtl.Spaces(_pn - len(_ps))) } }")
+			g.writeln("t.PushString(hbrtl.StrPadR(_ps, _pn)) }")
 			return true
 		}
 	case "PADL":
@@ -1507,13 +1506,10 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
 			if len(args) == 3 {
 				g.emitExpr(args[2])
 				g.writeln("{ _pf := t.Pop2().AsString(); _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()")
-				g.writeln("if len(_ps) >= _pn { t.PushString(_ps[len(_ps)-_pn:])")
-				g.writeln("} else { t.PushString(strings.Repeat(_pf[:1], _pn-len(_ps)) + _ps) } }")
-				g.imports["strings"] = true
+				g.writeln("t.PushString(hbrtl.StrPadL(_ps, _pn, _pf)) }")
 			} else {
 				g.writeln("{ _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()")
-				g.writeln("if len(_ps) >= _pn { t.PushString(_ps[len(_ps)-_pn:])")
-				g.writeln("} else { t.PushString(hbrtl.Spaces(_pn - len(_ps)) + _ps) } }")
+				g.writeln("t.PushString(hbrtl.StrPadL(_ps, _pn, \" \")) }")
 			}
 			return true
 		}
@@ -1523,13 +1519,12 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
 			g.emitExpr(args[1])
 			if len(args) == 3 {
 				g.emitExpr(args[2])
-				g.writeln("{ _sl := int(t.Pop2().AsNumInt()); _sp := int(t.Pop2().AsNumInt())-1; _ss := t.Pop2().AsString()")
+				g.writeln("{ _sl := int(t.Pop2().AsNumInt()); _sp := int(t.Pop2().AsNumInt()); _ss := t.Pop2().AsString()")
+				g.writeln("t.PushString(hbrtl.StrSubStr(_ss, _sp, _sl, true)) }")
 			} else {
-				g.writeln("{ _sl := 0; _sp := int(t.Pop2().AsNumInt())-1; _ss := t.Pop2().AsString(); _sl = len(_ss) - _sp")
+				g.writeln("{ _sp := int(t.Pop2().AsNumInt()); _ss := t.Pop2().AsString()")
+				g.writeln("t.PushString(hbrtl.StrSubStr(_ss, _sp, 0, false)) }")
 			}
-			g.writeln("if _sp < 0 { _sp = 0 }; if _sp > len(_ss) { _sp = len(_ss) }")
-			g.writeln("if _sp+_sl > len(_ss) { _sl = len(_ss) - _sp }")
-			g.writeln("t.PushString(_ss[_sp:_sp+_sl]) }")
 			return true
 		}
 	case "LEFT":
@@ -1537,7 +1532,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
 			g.emitExpr(args[0])
 			g.emitExpr(args[1])
 			g.writeln("{ _ln := int(t.Pop2().AsNumInt()); _ls := t.Pop2().AsString()")
-			g.writeln("if _ln >= len(_ls) { t.PushString(_ls) } else if _ln <= 0 { t.PushString(\"\") } else { t.PushString(_ls[:_ln]) } }")
+			g.writeln("t.PushString(hbrtl.StrLeft(_ls, _ln)) }")
 			return true
 		}
 	case "RIGHT":
@@ -1545,7 +1540,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
 			g.emitExpr(args[0])
 			g.emitExpr(args[1])
 			g.writeln("{ _rn := int(t.Pop2().AsNumInt()); _rs := t.Pop2().AsString()")
-			g.writeln("if _rn >= len(_rs) { t.PushString(_rs) } else if _rn <= 0 { t.PushString(\"\") } else { t.PushString(_rs[len(_rs)-_rn:]) } }")
+			g.writeln("t.PushString(hbrtl.StrRight(_rs, _rn)) }")
 			return true
 		}
 	case "AT":
@@ -1553,9 +1548,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
 			g.emitExpr(args[0])
 			g.emitExpr(args[1])
 			g.writeln("{ _as := t.Pop2().AsString(); _ak := t.Pop2().AsString()")
-			g.writeln("_ai := strings.Index(_as, _ak)")
-			g.writeln("if _ai >= 0 { t.PushInt(_ai+1) } else { t.PushInt(0) } }")
-			g.imports["strings"] = true
+			g.writeln("t.PushInt(hbrtl.StrAt(_ak, _as)) }")
 			return true
 		}
 	case "IIF":
diff --git a/hbrtl/charset.go b/hbrtl/charset.go
new file mode 100644
index 0000000..72ff5fd
--- /dev/null
+++ b/hbrtl/charset.go
@@ -0,0 +1,319 @@
+// charset.go — the Five charset (codepage) subsystem.
+//
+// Design
+//   - Five strings are UTF-8 by default (Go native). When the active charset
+//     is UTF-8, the core string functions (CHR/ASC/LEN/SUBSTR/LEFT/RIGHT/AT/
+//     PADR/PADL) operate on characters (runes).
+//   - With another active charset (e.g. CP949, CP1252) they work in bytes;
+//     convert at I/O boundaries with HB_TRANSLATE or the decode/encode helpers.
+//   - Unless set, the charset is UTF-8. The initial value can be supplied via
+//     the FIVE_CHARSET (or HB_CODEPAGE) environment variable.
+//
+// PRG surface
+//   HB_GETCHARSET()                  → cCurrent   (e.g. "UTF8")
+//   HB_SETCHARSET([cName])           → cPrev      (query only when omitted)
+//   HB_CDPSELECT([cName])            → cPrev      (Harbour-compatible alias)
+//   HB_TRANSLATE(cStr, cFrom, cTo)   → cConverted (conversion between charsets)
+package hbrtl
+
+import (
+	"os"
+	"strings"
+	"sync"
+	"unicode/utf8"
+
+	"five/hbrt"
+
+	"golang.org/x/text/encoding"
+	"golang.org/x/text/encoding/htmlindex"
+)
+
+var (
+	csMu   sync.RWMutex
+	csName = "UTF8"
+)
+
+// init seeds the active charset from FIVE_CHARSET, falling back to HB_CODEPAGE.
+func init() {
+	for _, key := range []string{"FIVE_CHARSET", "HB_CODEPAGE"} {
+		if v := os.Getenv(key); v != "" {
+			csName = normCharset(v)
+			return
+		}
+	}
+}
+
+// normCharset canonicalizes a charset name: trimmed and upper-cased, with
+// "UTF-8" and the empty string folded to "UTF8" (e.g. "utf-8" → "UTF8").
+func normCharset(s string) string {
+	name := strings.ToUpper(strings.TrimSpace(s))
+	if name == "" || name == "UTF-8" {
+		return "UTF8"
+	}
+	return name
+}
+
+// charsetIsUTF8 reports whether the active charset is UTF-8 (the default).
+func charsetIsUTF8() bool { return csIsUTF8(GetCharset()) }
+
+// csIsUTF8 reports whether name denotes UTF-8. normCharset already folds
+// "UTF-8" and the empty string into "UTF8", so comparing against that single
+// canonical spelling is sufficient.
+func csIsUTF8(name string) bool {
+	return normCharset(name) == "UTF8"
+}
+
+// GetCharset / SetCharset query and set the active charset; SetCharset returns the previous name.
+func GetCharset() string {
+	csMu.RLock()
+	defer csMu.RUnlock()
+	return csName
+}
+func SetCharset(name string) string {
+	csMu.Lock()
+	defer csMu.Unlock()
+	previous := csName
+	if next := strings.TrimSpace(name); next != "" { // blank name: query only
+		csName = normCharset(next)
+	}
+	return previous
+}
+
+// encodingFor maps a charset name to an x/text encoding; nil means UTF-8 or unsupported.
+func encodingFor(name string) encoding.Encoding {
+	if csIsUTF8(name) {
+		return nil // identity
+	}
+	// htmlindex accepts standard labels such as "euc-kr", "windows-1252", "shift_jis".
+	alias := strings.ToLower(strings.TrimSpace(name))
+	switch normCharset(name) {
+	case "CP949", "MS949", "EUCKR", "EUC-KR", "KSC5601":
+		alias = "euc-kr"
+	case "CP1252", "WINDOWS1252", "WINDOWS-1252", "LATIN1", "ISO8859-1", "ISO-8859-1":
+		alias = "windows-1252"
+	case "CP932", "SHIFTJIS", "SJIS", "SHIFT-JIS":
+		alias = "shift_jis"
+	case "CP936", "GBK", "GB2312":
+		alias = "gbk"
+	case "CP950", "BIG5":
+		alias = "big5"
+	}
+	if enc, err := htmlindex.Get(alias); err == nil {
+		return enc
+	}
+	return nil
+}
+
+// DecodeToUTF8 decodes fromCharset bytes into an internal UTF-8 string. The
+// bytes are returned unchanged when the charset is UTF-8, when it is not
+// recognized, or when the decoder reports an error.
+func DecodeToUTF8(b []byte, fromCharset string) string {
+	if enc := encodingFor(fromCharset); enc != nil {
+		if out, err := enc.NewDecoder().Bytes(b); err == nil {
+			return string(out)
+		}
+	}
+	return string(b)
+}
+
+// EncodeFromUTF8 encodes UTF-8 text as toCharset bytes (unchanged for UTF-8 or unknown
+// names); runes the target cannot represent are substituted, not left as raw UTF-8.
+func EncodeFromUTF8(s, toCharset string) []byte {
+	if enc := encodingFor(toCharset); enc != nil {
+		e := encoding.ReplaceUnsupported(enc.NewEncoder())
+		if out, err := e.Bytes([]byte(s)); err == nil {
+			return out
+		}
+	}
+	return []byte(s)
+}
+
+// ── charset-aware core string helpers ────────────────────────────────
+// The compiler (gengo) calls these helpers when it expands LEN/CHR/ASC/SUBSTR/
+// LEFT/RIGHT/AT/PADR/PADL inline. They work in runes (characters) when the
+// active charset is UTF-8 (the default) and in bytes otherwise.
+
+// StrLen implements LEN(cString): rune count under UTF-8, byte count otherwise.
+func StrLen(s string) int {
+	if !charsetIsUTF8() {
+		return len(s)
+	}
+	return utf8.RuneCountInString(s)
+}
+
+// StrChr implements CHR(nCode): a UTF-8 encoded code point, or one raw byte for other charsets.
+func StrChr(n int) string {
+	if !charsetIsUTF8() {
+		return string([]byte{byte(n)})
+	}
+	if n < 0 {
+		n = 0
+	}
+	return string(rune(n))
+}
+
+// StrAsc implements ASC(cString): code of the first character (rune or byte); 0 for "".
+func StrAsc(s string) int {
+	switch {
+	case s == "":
+		return 0
+	case charsetIsUTF8():
+		r, _ := utf8.DecodeRuneInString(s)
+		return int(r)
+	}
+	return int(s[0])
+}
+
+// StrSubStr implements SUBSTR(cString, nStart [, nLen]) in charset units:
+// runes under UTF-8, bytes under a legacy charset.
+//
+//   - start is 1-based; 0 behaves like 1.
+//   - A negative start counts back from the end (-1 is the last unit), the
+//     same rule the SubStr RTL entry point applies, so inlined and
+//     dynamically dispatched calls agree.
+//   - length is read only when hasLen is true; otherwise the result runs to
+//     the end of the string. A negative length yields "".
+//   - Out-of-range requests are clamped; the function never panics.
+func StrSubStr(s string, start, length int, hasLen bool) string {
+	if charsetIsUTF8() {
+		rs := []rune(s)
+		lo, hi := strSubStrBounds(len(rs), start, length, hasLen)
+		return string(rs[lo:hi])
+	}
+	lo, hi := strSubStrBounds(len(s), start, length, hasLen)
+	return s[lo:hi]
+}
+
+// strSubStrBounds turns SUBSTR arguments into a half-open [lo, hi) range that
+// is valid for a sequence of n units (n must be non-negative). It is shared
+// by the rune and byte paths of StrSubStr so that both clamp identically.
+// The returned bounds always satisfy 0 <= lo <= hi <= n.
+func strSubStrBounds(n, start, length int, hasLen bool) (lo, hi int) {
+	if start < 0 {
+		start += n + 1 // negative start: offset from the end
+	}
+	if start < 1 {
+		start = 1
+	}
+	lo = start - 1
+	if lo >= n {
+		return n, n
+	}
+	// Compare against the remaining count instead of computing lo+length,
+	// which could overflow for a huge nLen and produce a negative bound.
+	if !hasLen || length > n-lo {
+		return lo, n
+	}
+	if length < 0 {
+		length = 0
+	}
+	return lo, lo + length
+}
+
+// StrLeft implements LEFT(cString, nLen): the first n characters (runes under
+// UTF-8, bytes otherwise); "" when n <= 0, all of s when n covers it.
+func StrLeft(s string, n int) string {
+	if n <= 0 {
+		return ""
+	}
+	if !charsetIsUTF8() {
+		if n < len(s) {
+			return s[:n]
+		}
+		return s
+	}
+	if rs := []rune(s); n < len(rs) {
+		return string(rs[:n])
+	}
+	return s
+}
+
+// StrRight implements RIGHT(cString, nLen): the last n characters (runes under
+// UTF-8, bytes otherwise); "" when n <= 0, all of s when n covers it.
+func StrRight(s string, n int) string {
+	if n <= 0 {
+		return ""
+	}
+	if !charsetIsUTF8() {
+		if n < len(s) {
+			return s[len(s)-n:]
+		}
+		return s
+	}
+	if rs := []rune(s); n < len(rs) {
+		return string(rs[len(rs)-n:])
+	}
+	return s
+}
+
+// StrAt implements AT(cSearch, cTarget): 1-based position of search in target, 0 if absent.
+func StrAt(search, target string) int {
+	switch idx := strings.Index(target, search); {
+	case idx < 0:
+		return 0
+	case charsetIsUTF8():
+		return utf8.RuneCountInString(target[:idx]) + 1
+	default:
+		return idx + 1
+	}
+}
+
+// StrPadR implements PADR(cString, nLen): pads with blanks on the right up to
+// n characters; a longer string is cut down to its leftmost n characters.
+func StrPadR(s string, n int) string {
+	if l := StrLen(s); l < n {
+		return s + Spaces(n-l)
+	}
+	return StrLeft(s, n)
+}
+
+// StrPadL implements PADL(cString, nLen [, cFill]): left-pads to n; a longer s keeps its right n.
+func StrPadL(s string, n int, fill string) string {
+	l := StrLen(s)
+	if l >= n {
+		return StrRight(s, n)
+	}
+	pad := " "
+	switch {
+	case fill == "": // keep the default blank
+	case charsetIsUTF8():
+		pad = string([]rune(fill)[:1])
+	default:
+		pad = fill[:1]
+	}
+	return strings.Repeat(pad, n-l) + s
+}
+
+// ── PRG RTL ──────────────────────────────────────────────────────────
+
+// HbGetCharset implements HB_GETCHARSET() → cName: pushes the active charset name as the result.
+func HbGetCharset(t *hbrt.Thread) {
+	t.Frame(0, 0)
+	defer t.EndProcFast()
+	t.PushString(GetCharset())
+	t.RetValue()
+}
+
+// HbSetCharset implements HB_SETCHARSET([cName]) → cPrev; no argument means query only.
+func HbSetCharset(t *hbrt.Thread) {
+	argc := t.ParamCount()
+	t.Frame(argc, 0)
+	defer t.EndProcFast()
+	var requested string
+	if argc > 0 {
+		requested = t.Local(1).AsString()
+	}
+	t.PushString(SetCharset(requested))
+	t.RetValue()
+}
+
+// HbTranslate implements HB_TRANSLATE(cStr, cFrom, cTo) → cConverted: cStr is
+// decoded from cFrom into UTF-8 and the result is re-encoded as cTo.
+func HbTranslate(t *hbrt.Thread) {
+	t.Frame(3, 0)
+	defer t.EndProcFast()
+	src := []byte(t.Local(1).AsString())
+	decoded := DecodeToUTF8(src, t.Local(2).AsString())
+	t.PushString(string(EncodeFromUTF8(decoded, t.Local(3).AsString())))
+	t.RetValue()
+}
diff --git a/hbrtl/missing.go b/hbrtl/missing.go
index 16ad8d3..61248aa 100644
--- a/hbrtl/missing.go
+++ b/hbrtl/missing.go
@@ -10,6 +10,7 @@ import (
 	"math"
 	"os"
 	"strings"
+	"unicode/utf8"
 )
 
 // --- String functions ---
@@ -21,63 +22,101 @@ func At(t *hbrt.Thread) {
 	search := t.Local(1).AsString()
 	target := t.Local(2).AsString()
 	idx := strings.Index(target, search)
-	if idx >= 0 {
-		t.RetInt(int64(idx + 1))
-	} else {
+	if idx < 0 {
 		t.RetInt(0)
+		return
+	}
+	if charsetIsUTF8() {
+		t.RetInt(int64(utf8.RuneCountInString(target[:idx]) + 1)) // 문자 위치
+	} else {
+		t.RetInt(int64(idx + 1)) // 바이트 위치
 	}
 }
 
-// Left returns leftmost n characters.
+// Left returns the leftmost n characters (runes under UTF-8, bytes under a legacy charset).
 func Left(t *hbrt.Thread) {
 	t.Frame(2, 0)
 	defer t.EndProcFast()
 	s := t.Local(1).AsString()
 	n := int(t.Local(2).AsNumInt())
-	if n >= len(s) {
-		t.PushString(s)
-	} else if n <= 0 {
-		t.PushString("")
+	if charsetIsUTF8() {
+		rs := []rune(s)
+		if n >= len(rs) {
+			t.PushString(s)
+		} else if n <= 0 {
+			t.PushString("")
+		} else {
+			t.PushString(string(rs[:n]))
+		}
 	} else {
-		t.PushString(s[:n])
+		if n >= len(s) {
+			t.PushString(s)
+		} else if n <= 0 {
+			t.PushString("")
+		} else {
+			t.PushString(s[:n])
+		}
 	}
 	t.RetValue()
 }
 
-// Right returns rightmost n characters.
+// Right returns the rightmost n characters (runes under UTF-8, bytes under a legacy charset).
 func Right(t *hbrt.Thread) {
 	t.Frame(2, 0)
 	defer t.EndProcFast()
 	s := t.Local(1).AsString()
 	n := int(t.Local(2).AsNumInt())
-	if n >= len(s) {
-		t.PushString(s)
-	} else if n <= 0 {
-		t.PushString("")
+	if charsetIsUTF8() {
+		rs := []rune(s)
+		if n >= len(rs) {
+			t.PushString(s)
+		} else if n <= 0 {
+			t.PushString("")
+		} else {
+			t.PushString(string(rs[len(rs)-n:]))
+		}
 	} else {
-		t.PushString(s[len(s)-n:])
+		if n >= len(s) {
+			t.PushString(s)
+		} else if n <= 0 {
+			t.PushString("")
+		} else {
+			t.PushString(s[len(s)-n:])
+		}
 	}
 	t.RetValue()
 }
 
-// Asc returns ASCII code of first character.
+// Asc returns the code of the first character (UTF-8: code point, legacy charset: byte).
 func Asc(t *hbrt.Thread) {
 	t.Frame(1, 0)
 	defer t.EndProcFast()
 	s := t.Local(1).AsString()
-	if len(s) > 0 {
-		t.RetInt(int64(s[0]))
-	} else {
+	if len(s) == 0 {
 		t.RetInt(0)
+		return
+	}
+	if charsetIsUTF8() {
+		r, _ := utf8.DecodeRuneInString(s)
+		t.RetInt(int64(r))
+	} else {
+		t.RetInt(int64(s[0]))
 	}
 }
 
-// Chr returns character from ASCII code.
+// Chr returns the character for a code (UTF-8: code point encoded as UTF-8, legacy charset: 1 byte).
 func Chr(t *hbrt.Thread) {
 	t.Frame(1, 0)
 	defer t.EndProcFast()
 	n := int(t.Local(1).AsNumInt())
-	t.PushString(string([]byte{byte(n)}))
+	if charsetIsUTF8() {
+		if n < 0 {
+			n = 0
+		}
+		t.PushString(string(rune(n))) // 코드포인트 → UTF-8
+	} else {
+		t.PushString(string([]byte{byte(n)}))
+	}
 	t.RetValue()
 }
 
diff --git a/hbrtl/missing_fivesql.go b/hbrtl/missing_fivesql.go
index d191b9f..fa7552b 100644
--- a/hbrtl/missing_fivesql.go
+++ b/hbrtl/missing_fivesql.go
@@ -64,12 +64,16 @@ func HbATokens(t *hbrt.Thread) {
 }
 
 // hb_cdpSelect([cCodepage]) → cPrevCodepage
-// Stub: Five uses UTF-8 internally, codepage selection is a no-op.
+// Sets or queries the active charset; the default is UTF-8 (see charset.go).
 func HbCdpSelect(t *hbrt.Thread) {
 	nParams := t.ParamCount()
 	t.Frame(nParams, 0)
 	defer t.EndProcFast()
-	t.RetString("")
+	name := ""
+	if nParams >= 1 {
+		name = t.Local(1).AsString()
+	}
+	t.RetString(SetCharset(name))
 }
 
 // Used() → lUsed — checks if current workarea is in use
diff --git a/hbrtl/regex.go b/hbrtl/regex.go
index f5e5bab..3d1f0be 100644
--- a/hbrtl/regex.go
+++ b/hbrtl/regex.go
@@ -93,6 +93,41 @@ func HbRegexAll(t *hbrt.Thread) {
 	t.RetVal(hbrt.MakeArrayFrom(items))
 }
 
+// HB_REGEX(cPattern|pRegex, cString [, lCaseSensitive]) → aSubmatches
+//
+// Returns the first match together with its capture groups, exactly as Go's
+// regexp.FindStringSubmatch reports them: in the PRG array, element 1 is the
+// whole match and elements 2.. are the capture groups.
+//
+// When getRegex yields no regex, or when nothing matches, the result is an
+// empty array rather than Nil.
+//
+// Harbour parity: this is the array-of-submatches `hb_regex(p, s)` flavour.
+func HbRegex(t *hbrt.Thread) {
+	argc := t.ParamCount()
+	t.Frame(argc, 0)
+	defer t.EndProc()
+
+	re := getRegex(t, 1, argc >= 3)
+	if re == nil {
+		t.RetVal(hbrt.MakeArrayFrom(nil))
+		return
+	}
+
+	groups := re.FindStringSubmatch(t.Local(2).AsString())
+	if groups == nil {
+		t.RetVal(hbrt.MakeArrayFrom(nil))
+		return
+	}
+
+	// The whole match comes first, followed by the capture groups in order.
+	items := make([]hbrt.Value, 0, len(groups))
+	for _, g := range groups {
+		items = append(items, hbrt.MakeString(g))
+	}
+	t.RetVal(hbrt.MakeArrayFrom(items))
+}
+
 // HB_REGEXREPLACE(cPattern|pRegex, cString, cReplace [, lCaseSensitive]) → cResult
 func HbRegexReplace(t *hbrt.Thread) {
 	nParams := t.ParamCount()
diff --git a/hbrtl/register.go b/hbrtl/register.go
index 5771ac8..aa53f35 100644
--- a/hbrtl/register.go
+++ b/hbrtl/register.go
@@ -233,6 +233,7 @@ func RegisterRTL(vm *hbrt.VM) {
 
 		// Regex (Go regexp)
 		hbrt.Sym("HB_REGEXCOMP", hbrt.FsPublic, HbRegexComp),
+		hbrt.Sym("HB_REGEX", hbrt.FsPublic, HbRegex),
 		hbrt.Sym("HB_REGEXMATCH", hbrt.FsPublic, HbRegexMatch),
 		hbrt.Sym("HB_REGEXSPLIT", hbrt.FsPublic, HbRegexSplit),
 		hbrt.Sym("HB_REGEXALL", hbrt.FsPublic, HbRegexAll),
@@ -446,6 +447,9 @@ func RegisterRTL(vm *hbrt.VM) {
 		hbrt.Sym("HB_SECOND", hbrt.FsPublic, HbSecond),
 		hbrt.Sym("HB_ATOKENS", hbrt.FsPublic, HbATokens),
 		hbrt.Sym("HB_CDPSELECT", hbrt.FsPublic, HbCdpSelect),
+		hbrt.Sym("HB_GETCHARSET", hbrt.FsPublic, HbGetCharset),
+		hbrt.Sym("HB_SETCHARSET", hbrt.FsPublic, HbSetCharset),
+		hbrt.Sym("HB_TRANSLATE", hbrt.FsPublic, HbTranslate),
 		hbrt.Sym("DBSETINDEX", hbrt.FsPublic, rtlDbSetIndex),
 		hbrt.Sym("HB_TTOS", hbrt.FsPublic, HbTToS),
 		hbrt.Sym("HB_STOT", hbrt.FsPublic, HbSToT),
diff --git a/hbrtl/strings.go b/hbrtl/strings.go
index f1f5a83..4d190fb 100644
--- a/hbrtl/strings.go
+++ b/hbrtl/strings.go
@@ -10,6 +10,7 @@ import (
 	"fmt"
 	"math"
 	"strings"
+	"unicode/utf8"
 )
 
 // spacesCache: pre-built space strings for common pad sizes.
@@ -134,7 +135,11 @@ func Len(t *hbrt.Thread) {
 	v := t.Local(1)
 	switch {
 	case v.IsString():
-		t.RetInt(int64(v.StringLen()))
+		if charsetIsUTF8() {
+			t.RetInt(int64(utf8.RuneCountInString(v.AsString()))) // 문자(rune) 수
+		} else {
+			t.RetInt(int64(v.StringLen())) // 바이트 수(레거시 charset)
+		}
 	case v.IsArray():
 		t.RetInt(int64(len(v.AsArray().Items)))
 	case v.IsHash():
@@ -161,6 +166,37 @@ func SubStr(t *hbrt.Thread) {
 	s := v.AsString()
 	start := int(t.Local(2).AsNumInt())
 
+	// UTF-8(기본): 문자(rune) 단위. 레거시 charset: 바이트 단위.
+	if charsetIsUTF8() {
+		rs := []rune(s)
+		n := len(rs)
+		if start < 0 {
+			start = n + start + 1
+		}
+		if start < 1 {
+			start = 1
+		}
+		start--
+		if start >= n {
+			t.PushString("")
+			t.RetValue()
+			return
+		}
+		res := rs[start:]
+		if nParams >= 3 && !t.Local(3).IsNil() {
+			nLen := int(t.Local(3).AsNumInt())
+			if nLen < 0 {
+				nLen = 0
+			}
+			if nLen < len(res) {
+				res = res[:nLen]
+			}
+		}
+		t.PushString(string(res))
+		t.RetValue()
+		return
+	}
+
 	// Harbour: 1-based index, negative = from end
 	if start < 0 {
 		start = len(s) + start + 1