Files
five/compiler/lexer/lexer.go
Charles KWON OhJun 59568f3301 Five v0.9 — Harbour + Go fusion language
- Compiler: PP → Lexer → Parser → Analyzer → Gengo pipeline
- Parser: 232/236 (98%) Harbour compatibility, registry-based dispatch
- RTL: 351 Harbour-compatible functions
- RDD: DBF/NTX/CDX engines with Rushmore bitmap optimization
- Go Interop: IMPORT + pkg.Func() + obj:Method() with FastPath (15M calls/sec)
- HB_FUNC API: Full Harbour C API compatible Go bridge
- Concurrency: SPAWN/LAUNCH/GOROUTINE, <-, WATCH, PARALLEL FOR, ASYNC/AWAIT
- Extensions: Multi-return, DEFER, Slice, f-string, Nil-safe ?:, CONST
- Macro Compiler: Runtime AST parsing and evaluation
- Debugger: TUI debugger with source display, breakpoints, stepping
- FRB: Native + Pcode dual mode runtime binary
- Tests: 13 packages ALL PASS

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 09:41:50 +09:00

744 lines
18 KiB
Go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.
// Lexer for the Five language (Harbour-compatible).
// Hand-written scanner — no generated code.
// Handles Harbour's case-insensitive keywords, .T./.F./.AND./.OR./.NOT. literals,
// line-continuation with semicolon, and multiple comment styles.
//
// tsgo reference: ref/typescript-go/internal/scanner/ for scanning patterns.
// Key insight from tsgo: substring slicing into original source (zero-copy tokens).
package lexer
import (
"five/compiler/token"
"unicode/utf8"
)
// Lexer scans Harbour/Five source code into tokens.
//
// The scanner is byte-oriented: pos, col and lineStart are all measured in
// bytes of src, and col is bumped once per byte consumed by advance.
type Lexer struct {
src string // source code (immutable, tsgo pattern: substring slicing)
file string // filename for error reporting
pos int // current byte position
line int // current line (1-based)
col int // current column (1-based)
lineStart int // byte offset of current line start
// lastKind is written by NextToken after every token and read by
// isStringBracket to decide whether '[' opens a string or a subscript.
lastKind token.Kind // previous token kind (for [string] detection)
}
// New returns a Lexer positioned at the first byte of source.
//
// filename is only recorded so it can be copied into token positions. Line
// and column counters start at 1; the byte position and line-start offset
// start at 0.
func New(filename, source string) *Lexer {
	lx := &Lexer{src: source, file: filename}
	lx.line = 1
	lx.col = 1
	return lx
}
// NextToken scans and returns the next token, remembering its kind so the
// following call can tell a bracket string from an array subscript.
func (l *Lexer) NextToken() token.Token {
	next := l.nextTokenInner()
	l.lastKind = next.Kind
	return next
}
// nextTokenInner scans one token without updating lastKind.
//
// Whitespace, comments and line continuations are skipped first. The checks
// then run in this order: end of input, line ending, quoted string, number,
// dot-prefixed form, identifier/keyword, and finally operator/punctuation.
//
// Every returned token carries the position of its first byte. The CR/CRLF
// NEWLINE token and the plain DOT token are therefore built from the start
// position, not from the position after the token has been consumed.
func (l *Lexer) nextTokenInner() token.Token {
	l.skipWhitespaceAndComments()
	if l.pos >= len(l.src) {
		return l.makeToken(token.EOF, "")
	}

	ch := l.src[l.pos]
	switch {
	case ch == '\n' || ch == '\r':
		// Newline = statement terminator. LF, CRLF and a lone CR each yield a
		// single NEWLINE token whose literal is normalised to "\n".
		tok := l.makeToken(token.NEWLINE, "\n") // capture position before consuming
		l.advance()
		if ch == '\r' && l.pos < len(l.src) && l.src[l.pos] == '\n' {
			l.advance()
		}
		l.line++
		l.col = 1
		l.lineStart = l.pos
		return tok

	case ch == '"' || ch == '\'':
		// String literals
		return l.scanString(ch)

	case ch >= '0' && ch <= '9':
		// Numbers
		return l.scanNumber()

	case ch == '.':
		// Dot-prefixed: .12 = numeric, .T., .F., .AND., .OR., .NOT.
		if l.pos+1 < len(l.src) && l.src[l.pos+1] >= '0' && l.src[l.pos+1] <= '9' {
			return l.scanNumber() // scanNumber handles leading dot
		}
		if dot := l.scanDotToken(); dot.Kind != token.ILLEGAL {
			return dot
		}
		// Plain dot: report the position of the dot itself.
		start := l.pos
		l.advance()
		return l.makeTokenAt(token.DOT, ".", start)

	case isIdentStart(ch):
		// Identifiers and keywords
		return l.scanIdent()
	}

	// Operators and punctuation
	return l.scanOperator()
}
// Tokenize lexes source to completion and returns every token in order.
// The final element is always the EOF token.
func Tokenize(filename, source string) []token.Token {
	var out []token.Token
	lx := New(filename, source)
	for tok := lx.NextToken(); ; tok = lx.NextToken() {
		out = append(out, tok)
		if tok.Kind == token.EOF {
			return out
		}
	}
}
// --- Internal scanning methods ---

// advance consumes one byte and bumps the column counter. It does nothing
// once the end of the source has been reached. Line bookkeeping is left to
// the caller.
func (l *Lexer) advance() {
	if l.pos >= len(l.src) {
		return
	}
	l.pos++
	l.col++
}
// peek returns the byte at the current position without consuming it, or 0
// when the lexer is at end of input.
func (l *Lexer) peek() byte {
	if l.pos >= len(l.src) {
		return 0
	}
	return l.src[l.pos]
}
// peekAt returns the byte offset bytes away from the current position
// without consuming anything. It returns 0 when the target lies outside the
// source on either side, so a negative offset cannot cause an
// out-of-range panic.
func (l *Lexer) peekAt(offset int) byte {
	p := l.pos + offset
	if p < 0 || p >= len(l.src) {
		return 0
	}
	return l.src[p]
}
// makeToken builds a token stamped with the lexer's current file, line,
// column and byte offset. Callers that have already consumed the token text
// use makeTokenAt instead.
func (l *Lexer) makeToken(kind token.Kind, literal string) token.Token {
	here := token.Position{File: l.file, Line: l.line, Col: l.col, Offset: l.pos}
	return token.Token{Kind: kind, Literal: literal, Pos: here}
}
// skipWhitespaceAndComments advances past everything that never yields a
// token and stops at the first byte that does (or at end of input):
//
//   - spaces and tabs (newlines are tokens and are NOT skipped here)
//   - ';' line continuation, including a trailing // or && comment
//   - '\' line continuation (Harbour extension), unless doubled
//   - // and && single-line comments
//   - /* ... */ block comments
//   - '*' as the first non-blank character of a line (Clipper comment)
//   - NOTE at the very start of a line
//
// NOTE(review): a ';' or single '\' that is not followed by a line ending is
// still consumed silently. Whether mid-line ';' should instead act as a
// statement separator is not decided here.
func (l *Lexer) skipWhitespaceAndComments() {
	for l.pos < len(l.src) {
		ch := l.src[l.pos]
		switch {
		case ch == ' ' || ch == '\t':
			l.advance()
		case ch == ';':
			l.advance()
			l.skipContinuationTail(true)
		case ch == '\\' && l.peekAt(1) != '\\':
			l.advance()
			l.skipContinuationTail(false)
		case ch == '/' && l.peekAt(1) == '/':
			l.skipToEndOfLine()
		case ch == '/' && l.peekAt(1) == '*':
			l.skipBlockComment()
		case ch == '&' && l.peekAt(1) == '&':
			l.skipToEndOfLine()
		case ch == '*' && l.isFirstNonWhitespace():
			// Also covers indented comments such as "   * comment".
			l.skipToEndOfLine()
		case (ch == 'N' || ch == 'n') && l.pos == l.lineStart && l.matchWordAt("NOTE"):
			l.skipToEndOfLine()
		default:
			return
		}
	}
}

// skipContinuationTail consumes what may follow a line-continuation marker
// on the same line: blanks, then (when allowComment is set) a trailing // or
// && comment, then the line ending itself. LF, CRLF and a lone CR all count
// as one line ending and update the line bookkeeping. If no line ending
// follows, nothing further is consumed.
func (l *Lexer) skipContinuationTail(allowComment bool) {
	for l.pos < len(l.src) && (l.src[l.pos] == ' ' || l.src[l.pos] == '\t') {
		l.advance()
	}
	if allowComment && l.pos+1 < len(l.src) {
		a, b := l.src[l.pos], l.src[l.pos+1]
		if (a == '/' && b == '/') || (a == '&' && b == '&') {
			l.skipToEndOfLine()
		}
	}
	if l.pos >= len(l.src) {
		return
	}
	switch l.src[l.pos] {
	case '\r':
		l.advance()
		if l.pos < len(l.src) && l.src[l.pos] == '\n' {
			l.advance()
		}
	case '\n':
		l.advance()
	default:
		return
	}
	l.line++
	l.col = 1
	l.lineStart = l.pos
}
// isFirstNonWhitespace reports whether only spaces and tabs lie between the
// start of the current line and the current position.
func (l *Lexer) isFirstNonWhitespace() bool {
	i := l.lineStart
	for i < l.pos && (l.src[i] == ' ' || l.src[i] == '\t') {
		i++
	}
	return i >= l.pos
}
// skipToEndOfLine advances up to, but not past, the next CR or LF (or to
// end of input), so the line ending is still seen by the caller.
func (l *Lexer) skipToEndOfLine() {
	for l.pos < len(l.src) {
		if c := l.src[l.pos]; c == '\n' || c == '\r' {
			return
		}
		l.advance()
	}
}
// skipBlockComment consumes a /* ... */ comment starting at the current
// position, which must be on the opening '/'. Block comments do not nest.
//
// Line and column bookkeeping is updated for every line ending inside the
// comment (LF, CRLF, or a lone CR). An unterminated comment swallows the
// rest of the input, with the counters still kept accurate so the following
// EOF token reports the true end position.
func (l *Lexer) skipBlockComment() {
	l.advance() // skip /
	l.advance() // skip *
	for l.pos < len(l.src) {
		ch := l.src[l.pos]
		if ch == '*' && l.pos+1 < len(l.src) && l.src[l.pos+1] == '/' {
			l.advance() // skip *
			l.advance() // skip /
			return
		}
		l.advance()
		// A CR immediately followed by LF is counted once, on the LF.
		loneCR := ch == '\r' && (l.pos >= len(l.src) || l.src[l.pos] != '\n')
		if ch == '\n' || loneCR {
			l.line++
			l.col = 1
			l.lineStart = l.pos
		}
	}
}
// matchWordAt reports whether word occurs at the current position, compared
// ASCII case-insensitively, and is not immediately followed by an
// identifier character (so "NOTE" does not match inside "NOTES").
//
// The comparison goes through matchCI so only true upper/lower letter pairs
// are treated as equal. A bare +/-32 byte comparison would also accept
// unrelated punctuation (for example '.' in place of 'N').
func (l *Lexer) matchWordAt(word string) bool {
	end := l.pos + len(word)
	if end > len(l.src) {
		return false
	}
	if !matchCI(l.src[l.pos:end], word) {
		return false
	}
	// Must be followed by a non-identifier byte or end of input.
	return end == len(l.src) || !isIdentChar(l.src[end])
}
// --- String scanning ---

// scanString scans a literal delimited by quote, starting on the opening
// quote. On success it returns a STRING token whose literal is the text
// between the quotes, sliced directly from the source (zero-copy).
//
// Harbour has no C-style escapes, so a backslash is an ordinary character
// and "\" is a complete one-character string. A string may not span lines:
// hitting CR, LF or end of input first yields an ILLEGAL token holding the
// text consumed so far, including the opening quote.
func (l *Lexer) scanString(quote byte) token.Token {
	open := l.pos
	l.advance() // opening quote
	for l.pos < len(l.src) {
		switch c := l.src[l.pos]; c {
		case quote:
			body := l.src[open+1 : l.pos]
			l.advance() // closing quote
			return l.makeTokenAt(token.STRING, body, open)
		case '\n', '\r':
			// Unterminated string: stop before the line ending.
			return l.makeTokenAt(token.ILLEGAL, l.src[open:l.pos], open)
		}
		l.advance()
	}
	// Unterminated string at end of input.
	return l.makeTokenAt(token.ILLEGAL, l.src[open:l.pos], open)
}
// isStringBracket reports whether a '[' should open a bracket string rather
// than an array subscript.
//
// Harbour treats [text] as a string unless the bracket follows something
// that can be indexed. The answer is false when the previous token was an
// identifier, ')', ']' or a literal, or when its kind is FUNCTION_KW or
// higher (any keyword may double as a variable name in Harbour). It is also
// false when the byte at the current position is ']' (an empty "[]").
func (l *Lexer) isStringBracket() bool {
	prev := l.lastKind
	indexable := prev == token.IDENT || prev == token.RPAREN || prev == token.RBRACKET ||
		prev == token.INT || prev == token.LONG || prev == token.DOUBLE || prev == token.STRING ||
		prev == token.TRUE || prev == token.FALSE || prev == token.NIL_LIT
	if indexable || prev >= token.FUNCTION_KW {
		return false // array index context
	}
	emptyPair := l.pos < len(l.src) && l.src[l.pos] == ']'
	return !emptyPair
}
// scanBracketString scans a Harbour bracket string, [text], as a STRING
// token whose literal is the text between the outermost brackets.
//
// start is the byte offset of the opening '['. The operator scanner has
// already consumed that bracket before calling, so it is skipped here only
// if the lexer is still sitting on it. Advancing unconditionally would drop
// the first character of the string.
//
// Nested brackets are balanced, so [a[1]] yields "a[1]". The string may not
// span lines: reaching CR, LF or end of input before the closing bracket
// yields an ILLEGAL token holding the text consumed so far.
func (l *Lexer) scanBracketString(start int) token.Token {
	if l.pos == start {
		l.advance() // skip [
	}
	contentStart := l.pos
	depth := 1
	for l.pos < len(l.src) {
		switch l.src[l.pos] {
		case '[':
			depth++
		case ']':
			depth--
			if depth == 0 {
				literal := l.src[contentStart:l.pos]
				l.advance() // skip ]
				return l.makeTokenAt(token.STRING, literal, start)
			}
		case '\n', '\r':
			// Unterminated: stop before the line ending.
			return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start)
		}
		l.advance()
	}
	return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start)
}
// --- Number scanning ---

// scanNumber scans a numeric literal starting at the current position and
// returns it with the literal sliced from the source. Three forms exist:
//
//   - 0x / 0X followed by hex digits          -> INT
//   - a leading dot followed by digits (.12)  -> DOUBLE
//   - digits, optionally ".digits"            -> INT or DOUBLE
//
// A '.' after the integer part is only consumed when a digit follows it, so
// a trailing dot is left for the next token.
func (l *Lexer) scanNumber() token.Token {
	start := l.pos
	digits := func() {
		for l.pos < len(l.src) && l.src[l.pos] >= '0' && l.src[l.pos] <= '9' {
			l.advance()
		}
	}

	// Hex: 0x...
	if l.src[start] == '0' && start+1 < len(l.src) && (l.src[start+1] == 'x' || l.src[start+1] == 'X') {
		l.advance() // 0
		l.advance() // x
		for l.pos < len(l.src) && isHexDigit(l.src[l.pos]) {
			l.advance()
		}
		return l.makeTokenAt(token.INT, l.src[start:l.pos], start)
	}

	// Leading dot: .12
	if l.src[start] == '.' {
		l.advance() // skip .
		digits()
		return l.makeTokenAt(token.DOUBLE, l.src[start:l.pos], start)
	}

	// Integer part, then an optional fraction.
	digits()
	hasFraction := l.pos+1 < len(l.src) && l.src[l.pos] == '.' &&
		l.src[l.pos+1] >= '0' && l.src[l.pos+1] <= '9'
	if hasFraction {
		l.advance() // skip .
		digits()
	}

	text := l.src[start:l.pos]
	if hasFraction {
		return l.makeTokenAt(token.DOUBLE, text, start)
	}
	return l.makeTokenAt(token.INT, text, start)
}
// --- Dot-prefixed tokens ---

// scanDotToken tries to scan a dot-delimited word at the current position:
// .T. / .F. (logical literals) or .AND. / .OR. / .NOT. (logical operators),
// in any letter case. The returned literal is always the canonical
// upper-case spelling.
//
// When nothing matches, nothing is consumed and a token of kind ILLEGAL is
// returned as a sentinel so the caller can fall back to a plain DOT.
func (l *Lexer) scanDotToken() token.Token {
	start := l.pos
	// emit consumes len(lit) bytes and builds the token at start.
	emit := func(kind token.Kind, lit string) token.Token {
		l.pos += len(lit)
		l.col += len(lit)
		return l.makeTokenAt(kind, lit, start)
	}

	// .T. / .F.
	if l.pos+2 < len(l.src) && l.src[l.pos+2] == '.' {
		switch l.src[l.pos+1] {
		case 'T', 't':
			return emit(token.TRUE, ".T.")
		case 'F', 'f':
			return emit(token.FALSE, ".F.")
		}
	}

	// .AND. / .OR. / .NOT.
	switch {
	case l.matchDotKeyword(".AND."):
		return emit(token.AND, ".AND.")
	case l.matchDotKeyword(".OR."):
		return emit(token.OR, ".OR.")
	case l.matchDotKeyword(".NOT."):
		return emit(token.NOT, ".NOT.")
	}

	return token.Token{Kind: token.ILLEGAL} // let caller handle plain DOT
}
// matchDotKeyword reports whether kw occurs at the current position, with
// ASCII letters compared case-insensitively and every other byte compared
// exactly. Nothing is consumed.
func (l *Lexer) matchDotKeyword(kw string) bool {
	end := l.pos + len(kw)
	if end > len(l.src) {
		return false
	}
	window := l.src[l.pos:end]
	for i := 0; i < len(kw); i++ {
		got, want := window[i], kw[i]
		switch {
		case got == want:
			// exact match
		case got >= 'a' && got <= 'z' && got-32 == want:
			// lower-case source letter against upper-case keyword letter
		case got >= 'A' && got <= 'Z' && got+32 == want:
			// upper-case source letter against lower-case keyword letter
		default:
			return false
		}
	}
	return true
}
// --- Identifier scanning ---

// scanIdent consumes a run of identifier characters and classifies it via
// token.LookupKeyword. The literal is a zero-copy slice of the source with
// its original spelling preserved.
func (l *Lexer) scanIdent() token.Token {
	begin := l.pos
	for ; l.pos < len(l.src); l.advance() {
		if !isIdentChar(l.src[l.pos]) {
			break
		}
	}
	word := l.src[begin:l.pos]
	return l.makeTokenAt(token.LookupKeyword(word), word, begin)
}
// --- Operator scanning ---

// scanOperator scans one operator or punctuation token starting at the
// current byte, preferring the longest match (for example "**=" over "**"
// over "*"). The caller guarantees the lexer is not at end of input.
//
// Special cases:
//   - '#' followed by a letter starts a preprocessor directive; otherwise
//     it is the Clipper not-equal operator.
//   - '[' opens a bracket string or is a plain LBRACKET, depending on the
//     previous token (see isStringBracket).
//   - Any other byte yields ILLEGAL. A non-ASCII lead byte consumes the
//     whole UTF-8 sequence so the token never splits a rune. The column
//     counter is advanced by the sequence length exactly once, since the
//     first byte has already been counted on entry.
func (l *Lexer) scanOperator() token.Token {
	start := l.pos
	ch := l.src[l.pos]
	l.advance()

	// tok builds a token anchored at the operator's first byte.
	tok := func(kind token.Kind, lit string) token.Token {
		return l.makeTokenAt(kind, lit, start)
	}
	// accept consumes the next byte when it equals b.
	accept := func(b byte) bool {
		if l.peek() != b {
			return false
		}
		l.advance()
		return true
	}

	switch ch {
	case '+':
		switch {
		case accept('='):
			return tok(token.PLUSEQ, "+=")
		case accept('+'):
			return tok(token.INC, "++")
		}
		return tok(token.PLUS, "+")
	case '-':
		switch {
		case accept('='):
			return tok(token.MINUSEQ, "-=")
		case accept('-'):
			return tok(token.DEC, "--")
		case accept('>'):
			return tok(token.ARROW, "->")
		}
		return tok(token.MINUS, "-")
	case '*':
		if accept('*') {
			if accept('=') {
				return tok(token.POWEREQ, "**=")
			}
			return tok(token.POWER, "**")
		}
		if accept('=') {
			return tok(token.STAREQ, "*=")
		}
		return tok(token.STAR, "*")
	case '/':
		if accept('=') {
			return tok(token.SLASHEQ, "/=")
		}
		return tok(token.SLASH, "/")
	case '%':
		if accept('=') {
			return tok(token.PERCENTEQ, "%=")
		}
		return tok(token.PERCENT, "%")
	case '=':
		switch {
		case accept('='):
			return tok(token.EXEQ, "==")
		case accept('>'):
			return tok(token.DBLARROW, "=>")
		}
		return tok(token.EQ, "=")
	case '!':
		if accept('=') {
			return tok(token.NEQ, "!=")
		}
		return tok(token.NOT, "!")
	case '<':
		switch {
		case accept('-'):
			return tok(token.ARROW_LEFT, "<-")
		case accept('='):
			return tok(token.LTE, "<=")
		case accept('>'):
			return tok(token.NEQ, "<>")
		}
		return tok(token.LT, "<")
	case '>':
		if accept('=') {
			return tok(token.GTE, ">=")
		}
		return tok(token.GT, ">")
	case '#':
		// # alone = not-equal (Clipper), #keyword = preprocessor
		if c := l.peek(); (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') {
			return l.scanPreprocessor(start)
		}
		return tok(token.NEQ, "#")
	case ':':
		switch {
		case accept('='):
			return tok(token.ASSIGN, ":=")
		case accept(':'):
			return tok(token.COLONCOLON, "::")
		}
		return tok(token.COLON, ":")
	case '&':
		return tok(token.AMPERSAND, "&")
	case '@':
		return tok(token.AT, "@")
	case '$':
		return tok(token.DOLLAR, "$")
	case '?':
		if accept('?') {
			return tok(token.QQMARK, "??")
		}
		return tok(token.QMARK, "?")
	case '(':
		return tok(token.LPAREN, "(")
	case ')':
		return tok(token.RPAREN, ")")
	case '[':
		// Harbour: [text] is string literal when NOT preceded by ident/)/]/literal
		// a[1] = array index, but ? [Hello] = string
		if l.isStringBracket() {
			return l.scanBracketString(start)
		}
		return tok(token.LBRACKET, "[")
	case ']':
		return tok(token.RBRACKET, "]")
	case '{':
		return tok(token.LBRACE, "{")
	case '}':
		return tok(token.RBRACE, "}")
	case ',':
		return tok(token.COMMA, ",")
	case '|':
		return tok(token.PIPE, "|")
	case '^':
		if accept('=') {
			return tok(token.POWEREQ, "^=")
		}
		return tok(token.POWER, "^")
	}

	// Unrecognised byte. For a multi-byte UTF-8 sequence, swallow the whole
	// rune; the first byte was already counted by the advance above, so only
	// the remaining size-1 bytes are added to the column.
	if ch >= utf8.RuneSelf {
		_, size := utf8.DecodeRuneInString(l.src[start:])
		l.pos = start + size
		l.col += size - 1
	}
	return tok(token.ILLEGAL, l.src[start:l.pos])
}
// scanPreprocessor scans the directive name that follows a '#' the caller
// has already consumed. start is the byte offset of that '#'.
//
// The name is matched case-insensitively against the supported directives
// and the matching PP_* token is returned. Any other name yields ILLEGAL.
// In both cases the token literal is the full source text, '#' included.
func (l *Lexer) scanPreprocessor(start int) token.Token {
	nameStart := l.pos
	for l.pos < len(l.src) && isIdentChar(l.src[l.pos]) {
		l.advance()
	}
	directive := l.src[nameStart:l.pos]
	full := l.src[start:l.pos]

	switch {
	case matchCI(directive, "include"):
		return l.makeTokenAt(token.PP_INCLUDE, full, start)
	case matchCI(directive, "define"):
		return l.makeTokenAt(token.PP_DEFINE, full, start)
	case matchCI(directive, "undef"):
		return l.makeTokenAt(token.PP_UNDEF, full, start)
	case matchCI(directive, "ifdef"):
		return l.makeTokenAt(token.PP_IFDEF, full, start)
	case matchCI(directive, "ifndef"):
		return l.makeTokenAt(token.PP_IFNDEF, full, start)
	case matchCI(directive, "else"):
		return l.makeTokenAt(token.PP_ELSE, full, start)
	case matchCI(directive, "endif"):
		return l.makeTokenAt(token.PP_ENDIF, full, start)
	case matchCI(directive, "command"):
		return l.makeTokenAt(token.PP_COMMAND, full, start)
	case matchCI(directive, "translate"):
		return l.makeTokenAt(token.PP_TRANSLATE, full, start)
	case matchCI(directive, "pragma"):
		return l.makeTokenAt(token.PP_PRAGMA, full, start)
	default:
		return l.makeTokenAt(token.ILLEGAL, full, start)
	}
}
// makeTokenAt builds a token whose position is the byte offset startPos
// rather than the lexer's current position. The column is derived from the
// distance to the start of the current line; the line is the lexer's
// current line.
func (l *Lexer) makeTokenAt(kind token.Kind, literal string, startPos int) token.Token {
	at := token.Position{File: l.file, Line: l.line, Offset: startPos}
	at.Col = startPos - l.lineStart + 1
	return token.Token{Kind: kind, Literal: literal, Pos: at}
}
// --- Character classification ---

// isIdentStart reports whether ch may begin an identifier: an ASCII letter
// or an underscore.
func isIdentStart(ch byte) bool {
	switch {
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	}
	return ch == '_'
}
// isIdentChar reports whether ch may appear inside an identifier: anything
// accepted by isIdentStart, plus the ASCII digits.
func isIdentChar(ch byte) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	return isIdentStart(ch)
}
// isHexDigit reports whether ch is an ASCII hexadecimal digit
// (0-9, a-f or A-F).
func isHexDigit(ch byte) bool {
	switch {
	case '0' <= ch && ch <= '9':
		return true
	case 'a' <= ch && ch <= 'f', 'A' <= ch && ch <= 'F':
		return true
	}
	return false
}
// matchCI reports whether a and b are equal when ASCII upper-case letters
// are folded to lower case. Bytes outside A-Z are compared exactly, so no
// Unicode case folding takes place.
func matchCI(a, b string) bool {
	if len(a) != len(b) {
		return false
	}
	lower := func(c byte) byte {
		if 'A' <= c && c <= 'Z' {
			return c + ('a' - 'A')
		}
		return c
	}
	for i := 0; i < len(a); i++ {
		if lower(a[i]) != lower(b[i]) {
			return false
		}
	}
	return true
}