// five/compiler/lexer/lexer.go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.

// Package lexer implements the lexer for the Five language (Harbour-compatible).
// Hand-written scanner — no generated code.
// Handles Harbour's case-insensitive keywords, .T./.F./.AND./.OR./.NOT. literals,
// line-continuation with semicolon, and multiple comment styles.
//
// tsgo reference: ref/typescript-go/internal/scanner/ for scanning patterns.
// Key insight from tsgo: substring slicing into original source (zero-copy tokens).
package lexer

import (
	"five/compiler/token"
	"unicode/utf8"
)

// Lexer is a hand-written scanner that turns Harbour/Five source text into
// tokens. It walks the source one byte at a time and keeps track of where it
// is so that every token can carry a file/line/column/offset position.
type Lexer struct {
	// src is the source text being scanned. Most token literals are
	// substrings of it (tsgo pattern) rather than copies.
	src string
	// file is the filename recorded in every token position.
	file string
	// pos is the byte offset of the next unread byte in src.
	pos int
	// line is the current line number, starting at 1.
	line int
	// col is the current column, starting at 1 and advanced once per byte.
	col int
	// lineStart is the byte offset at which the current line begins.
	lineStart int
	// lastKind is the kind of the token most recently returned by NextToken.
	// It is consulted to decide whether '[' opens a [string] literal or an
	// array subscript.
	lastKind token.Kind
}

// New returns a Lexer positioned at the first byte of source. filename is
// only used to label token positions. Line and column numbering start at 1.
func New(filename, source string) *Lexer {
	lx := new(Lexer) // pos and lineStart start at their zero value
	lx.src = source
	lx.file = filename
	lx.line, lx.col = 1, 1
	return lx
}

// NextToken scans and returns the next token. Once the input is exhausted it
// returns an EOF token.
func (l *Lexer) NextToken() token.Token {
	next := l.nextTokenInner()
	// Remember what was just produced: the meaning of a following '['
	// depends on the previous token.
	l.lastKind = next.Kind
	return next
}

// nextTokenInner scans one token without touching lastKind.
//
// It first skips blanks, comments and line continuations, then dispatches on
// the first remaining byte:
//   - a line break ("\n", "\r\n" or a lone "\r") becomes a NEWLINE token,
//     which acts as the statement terminator;
//   - a single or double quote starts a string literal;
//   - a digit, or a dot immediately followed by a digit, starts a number;
//   - any other dot is tried as .T./.F./.AND./.OR./.NOT. and otherwise
//     returned as a plain DOT;
//   - an identifier-start byte starts an identifier or keyword;
//   - everything else is handed to scanOperator.
//
// Every returned token is positioned at its first byte. When the input is
// exhausted an EOF token is returned.
func (l *Lexer) nextTokenInner() token.Token {
	l.skipWhitespaceAndComments()

	if l.pos >= len(l.src) {
		return l.makeToken(token.EOF, "")
	}

	ch := l.src[l.pos]
	switch {
	case ch == '\n' || ch == '\r':
		// Build the token before consuming anything so that its position is
		// where the line break starts, whichever line-ending style is used.
		tok := l.makeToken(token.NEWLINE, "\n")
		l.advance()
		if ch == '\r' && l.peek() == '\n' {
			l.advance() // "\r\n" counts as a single line break
		}
		l.line++
		l.col = 1
		l.lineStart = l.pos
		return tok

	case ch == '"' || ch == '\'':
		return l.scanString(ch)

	case ch >= '0' && ch <= '9':
		return l.scanNumber()

	case ch == '.':
		// ".12" is a number written with a leading decimal point;
		// scanNumber handles the leading dot itself.
		if next := l.peekAt(1); next >= '0' && next <= '9' {
			return l.scanNumber()
		}
		// scanDotToken reports ILLEGAL when no dot-keyword matches; in that
		// case nothing has been consumed and the dot stands on its own.
		if dot := l.scanDotToken(); dot.Kind != token.ILLEGAL {
			return dot
		}
		start := l.pos
		l.advance()
		return l.makeTokenAt(token.DOT, ".", start)

	case isIdentStart(ch):
		return l.scanIdent()
	}

	// Operators, punctuation and anything unrecognised.
	return l.scanOperator()
}

// Tokenize scans source from start to finish and returns every token in
// order. The final element is always the EOF token.
func Tokenize(filename, source string) []token.Token {
	var out []token.Token
	for lx := New(filename, source); ; {
		t := lx.NextToken()
		out = append(out, t)
		if t.Kind == token.EOF {
			return out
		}
	}
}

// --- Internal scanning methods ---

// advance consumes one byte, moving pos and col forward together. At end of
// input it does nothing.
func (l *Lexer) advance() {
	if l.pos >= len(l.src) {
		return
	}
	l.pos++
	l.col++
}

// peek returns the byte at the current position without consuming it, or 0
// when the input is exhausted.
func (l *Lexer) peek() byte {
	if l.pos >= len(l.src) {
		return 0
	}
	return l.src[l.pos]
}

// peekAt returns the byte offset bytes ahead of the current position without
// consuming anything, or 0 when that position lies past the end of the input.
func (l *Lexer) peekAt(offset int) byte {
	if idx := l.pos + offset; idx < len(l.src) {
		return l.src[idx]
	}
	return 0
}

// makeToken builds a token of the given kind and literal whose position is
// the lexer's current line, column and byte offset.
func (l *Lexer) makeToken(kind token.Kind, literal string) token.Token {
	here := token.Position{
		File:   l.file,
		Line:   l.line,
		Col:    l.col,
		Offset: l.pos,
	}
	return token.Token{Kind: kind, Literal: literal, Pos: here}
}

// skipWhitespaceAndComments consumes everything that does not produce a token:
//   - spaces and tabs (line breaks are NOT skipped; they are tokens);
//   - ';' line continuation: the semicolon, trailing blanks, an optional
//     trailing // comment and the line break that follows;
//   - '\' line continuation (Harbour extension): the backslash, trailing
//     blanks and the line break that follows; a doubled "\\" is left alone;
//   - // and && comments, up to but not including the line break;
//   - /* ... */ block comments;
//   - '*' comments, when '*' is the first non-blank byte of the line;
//   - NOTE comments, when NOTE starts in the first column of the line.
//
// It returns with pos on the first byte that belongs to a token, or at end of
// input.
func (l *Lexer) skipWhitespaceAndComments() {
	for l.pos < len(l.src) {
		ch := l.src[l.pos]

		switch {
		case ch == ' ' || ch == '\t':
			l.advance()

		case ch == ';':
			l.advance()
			l.skipInlineBlanks()
			// A trailing // comment may sit between ';' and the line break.
			if l.peek() == '/' && l.peekAt(1) == '/' {
				l.skipToEndOfLine()
			}
			l.consumeContinuationBreak()

		case ch == '\\' && l.peekAt(1) != '\\':
			l.advance()
			l.skipInlineBlanks()
			l.consumeContinuationBreak()

		case ch == '/' && l.peekAt(1) == '/':
			l.skipToEndOfLine()

		case ch == '/' && l.peekAt(1) == '*':
			l.skipBlockComment()

		case ch == '&' && l.peekAt(1) == '&':
			l.skipToEndOfLine()

		case ch == '*' && l.isFirstNonWhitespace():
			// Clipper-style comment line; indentation before '*' is allowed.
			l.skipToEndOfLine()

		case (ch == 'N' || ch == 'n') && l.pos == l.lineStart && l.matchWordAt("NOTE"):
			l.skipToEndOfLine()

		default:
			return
		}
	}
}

// skipInlineBlanks consumes a run of spaces and tabs.
func (l *Lexer) skipInlineBlanks() {
	for l.pos < len(l.src) && (l.src[l.pos] == ' ' || l.src[l.pos] == '\t') {
		l.advance()
	}
}

// consumeContinuationBreak consumes one line break ("\n", "\r\n" or a lone
// "\r") at the current position, if there is one, and starts a new line in
// the position bookkeeping. No NEWLINE token results, which is what joins the
// two physical lines into one logical line.
func (l *Lexer) consumeContinuationBreak() {
	switch l.peek() {
	case '\r':
		l.advance()
		if l.peek() == '\n' {
			l.advance()
		}
	case '\n':
		l.advance()
	default:
		return // not at a line break: nothing to join
	}
	l.line++
	l.col = 1
	l.lineStart = l.pos
}

// isFirstNonWhitespace reports whether only spaces and tabs lie between the
// start of the current line and the current position.
func (l *Lexer) isFirstNonWhitespace() bool {
	// Walk back from the current position towards the start of the line.
	for i := l.pos - 1; i >= l.lineStart; i-- {
		if c := l.src[i]; c != ' ' && c != '\t' {
			return false
		}
	}
	return true
}

// skipToEndOfLine consumes bytes up to, but not including, the next '\n' or
// '\r', or up to the end of input.
func (l *Lexer) skipToEndOfLine() {
	for l.pos < len(l.src) {
		if c := l.src[l.pos]; c == '\n' || c == '\r' {
			return
		}
		l.advance()
	}
}

// skipBlockComment consumes a /* ... */ comment. It must be called with pos
// on the opening '/'. Each '\n' inside the comment starts a new line in the
// position bookkeeping. An unterminated comment silently consumes the rest of
// the input, byte by byte, so line and column stay accurate right up to the
// end.
func (l *Lexer) skipBlockComment() {
	l.advance() // skip /
	l.advance() // skip *
	for l.pos < len(l.src) {
		ch := l.src[l.pos]
		if ch == '*' && l.peekAt(1) == '/' {
			l.advance() // skip *
			l.advance() // skip /
			return
		}
		l.advance()
		if ch == '\n' {
			l.line++
			l.col = 1
			l.lineStart = l.pos
		}
	}
}

// matchWordAt reports whether word occurs at the current position, compared
// ASCII case-insensitively, and is not immediately followed by an identifier
// character (so "NOTE" matches "note ..." but not "notebook"). Nothing is
// consumed.
func (l *Lexer) matchWordAt(word string) bool {
	end := l.pos + len(word)
	if end > len(l.src) {
		return false
	}
	// matchCI folds only the letters A-Z, so punctuation that happens to sit
	// 32 code points away from a letter can never be mistaken for it.
	if !matchCI(l.src[l.pos:end], word) {
		return false
	}
	// The word must end here: end of input, or a non-identifier byte.
	return end == len(l.src) || !isIdentChar(l.src[end])
}

// --- String scanning ---

// scanString scans a string literal delimited by quote, starting with pos on
// the opening quote. On success it returns a STRING token whose literal is
// the text between the quotes, taken as a substring of the source. A string
// that reaches a line break or the end of input before its closing quote
// yields an ILLEGAL token covering what was read, opening quote included.
//
// Harbour does NOT use C-style escape sequences in strings: "\" is a valid
// string containing a single backslash, so no byte gets special treatment.
func (l *Lexer) scanString(quote byte) token.Token {
	open := l.pos
	l.advance() // step over the opening quote

	for l.pos < len(l.src) {
		c := l.src[l.pos]
		if c == quote {
			body := l.src[open+1 : l.pos]
			l.advance() // step over the closing quote
			return l.makeTokenAt(token.STRING, body, open)
		}
		if c == '\n' || c == '\r' {
			break // strings cannot span lines
		}
		l.advance()
	}

	return l.makeTokenAt(token.ILLEGAL, l.src[open:l.pos], open)
}

// isStringBracket reports whether a '[' should open a [text] string literal
// rather than an array subscript. It reads the byte at the current position
// as the one following the '['.
//
// Harbour: '[' is a subscript when it follows something that can be indexed —
// an identifier, ')' or ']', or a literal — and a string delimiter otherwise.
func (l *Lexer) isStringBracket() bool {
	prev := l.lastKind
	switch {
	case prev == token.IDENT, prev == token.RPAREN, prev == token.RBRACKET:
		return false // subscript after a name or a closing bracket
	case prev == token.INT, prev == token.LONG, prev == token.DOUBLE, prev == token.STRING:
		return false // subscript after a numeric or string literal
	case prev == token.TRUE, prev == token.FALSE, prev == token.NIL_LIT:
		return false // subscript after a logical or NIL literal
	case prev >= token.FUNCTION_KW:
		// Any keyword could be a variable name in Harbour (begin, return,
		// for, ...), so treat '[' after one as a subscript.
		// NOTE(review): relies on keyword kinds being declared from
		// FUNCTION_KW onward — confirm against the token package.
		return false
	}
	// "[]" is an empty array, not an empty string.
	return l.peek() != ']'
}

// scanBracketString scans a [text] string literal. start is the byte offset
// of the opening '['.
//
// scanOperator calls this after it has already consumed the '[', so pos is
// normally start+1 on entry; a call with pos still on the '[' is accepted as
// well. Either way the literal begins right after the opening bracket.
//
// Brackets nest: the literal ends at the ']' that balances the opening '['.
// On success a STRING token holding the text between the outer brackets is
// returned. If a line break or the end of input comes first, an ILLEGAL token
// covering what was read, opening bracket included, is returned.
func (l *Lexer) scanBracketString(start int) token.Token {
	if l.pos == start {
		l.advance() // skip [
	}
	strStart := start + 1

	depth := 1
	for l.pos < len(l.src) {
		switch l.src[l.pos] {
		case '[':
			depth++
		case ']':
			depth--
			if depth == 0 {
				literal := l.src[strStart:l.pos]
				l.advance() // skip ]
				return l.makeTokenAt(token.STRING, literal, start)
			}
		case '\n', '\r':
			// Unterminated: bracket strings cannot span lines.
			return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start)
		}
		l.advance()
	}
	return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start)
}

// --- Number scanning ---

// scanNumber scans a numeric literal starting at the current position, which
// must be a digit or a '.' followed by a digit. The literal is returned as a
// substring of the source. Three forms are recognised:
//   - "0x"/"0X" followed by any number of hex digits -> INT
//   - '.' followed by decimal digits (".12")          -> DOUBLE
//   - decimal digits, optionally followed by '.' and more digits
//     -> DOUBLE with the fractional part, INT without it
//
// A '.' that is not followed by a digit is left unconsumed, so it can still
// be scanned as a separate token after the integer.
func (l *Lexer) scanNumber() token.Token {
	start := l.pos

	isDigit := func(b byte) bool { return b >= '0' && b <= '9' }
	skipDigits := func() {
		for l.pos < len(l.src) && isDigit(l.src[l.pos]) {
			l.advance()
		}
	}

	first := l.src[start]

	// Hexadecimal: 0x...
	if marker := l.peekAt(1); first == '0' && (marker == 'x' || marker == 'X') {
		l.advance() // 0
		l.advance() // x
		for l.pos < len(l.src) && isHexDigit(l.src[l.pos]) {
			l.advance()
		}
		return l.makeTokenAt(token.INT, l.src[start:l.pos], start)
	}

	// Leading decimal point: .12
	if first == '.' {
		l.advance()
		skipDigits()
		return l.makeTokenAt(token.DOUBLE, l.src[start:l.pos], start)
	}

	// Integer part.
	skipDigits()

	// Fractional part, only when the '.' is directly followed by a digit.
	if l.peek() == '.' && isDigit(l.peekAt(1)) {
		l.advance()
		skipDigits()
		return l.makeTokenAt(token.DOUBLE, l.src[start:l.pos], start)
	}
	return l.makeTokenAt(token.INT, l.src[start:l.pos], start)
}

// --- Dot-prefixed tokens ---

// scanDotToken tries to scan one of the dot-delimited words at the current
// position, which must be a '.': .T. and .F. (TRUE/FALSE) and .AND., .OR.,
// .NOT. (AND/OR/NOT), all case-insensitive. The returned literal is always
// the upper-case spelling.
//
// When nothing matches, nothing is consumed and a token of kind ILLEGAL is
// returned as a "no match" signal; the caller then treats the '.' as a plain
// DOT.
func (l *Lexer) scanDotToken() token.Token {
	start := l.pos
	// consume steps over n bytes that are known not to contain a line break.
	consume := func(n int) {
		l.pos += n
		l.col += n
	}

	// Three-byte forms: .T. / .F.
	if l.peekAt(2) == '.' {
		switch l.src[l.pos+1] {
		case 'T', 't':
			consume(3)
			return l.makeTokenAt(token.TRUE, ".T.", start)
		case 'F', 'f':
			consume(3)
			return l.makeTokenAt(token.FALSE, ".F.", start)
		}
	}

	// Logical operators: .AND. / .OR. / .NOT.
	switch {
	case l.matchDotKeyword(".AND."):
		consume(len(".AND."))
		return l.makeTokenAt(token.AND, ".AND.", start)
	case l.matchDotKeyword(".OR."):
		consume(len(".OR."))
		return l.makeTokenAt(token.OR, ".OR.", start)
	case l.matchDotKeyword(".NOT."):
		consume(len(".NOT."))
		return l.makeTokenAt(token.NOT, ".NOT.", start)
	}

	return token.Token{Kind: token.ILLEGAL}
}

// matchDotKeyword reports whether kw occurs at the current position, ignoring
// ASCII letter case. Non-letter bytes such as the dots must match exactly.
// Nothing is consumed.
func (l *Lexer) matchDotKeyword(kw string) bool {
	end := l.pos + len(kw)
	if end > len(l.src) {
		return false
	}

	// fold maps A-Z onto a-z and leaves every other byte untouched.
	fold := func(b byte) byte {
		if b >= 'A' && b <= 'Z' {
			return b + ('a' - 'A')
		}
		return b
	}

	candidate := l.src[l.pos:end]
	for i := 0; i < len(kw); i++ {
		if fold(candidate[i]) != fold(kw[i]) {
			return false
		}
	}
	return true
}

// --- Identifier scanning ---

// scanIdent scans a run of identifier characters starting at the current
// position and returns it with the kind token.LookupKeyword assigns to it.
// The literal is a substring of the source (tsgo pattern), spelled exactly as
// written.
func (l *Lexer) scanIdent() token.Token {
	begin := l.pos
	for l.pos < len(l.src) {
		if !isIdentChar(l.src[l.pos]) {
			break
		}
		l.advance()
	}
	word := l.src[begin:l.pos]
	return l.makeTokenAt(token.LookupKeyword(word), word, begin)
}

// --- Operator scanning ---

// scanOperator scans an operator or punctuation token starting at the current
// byte. Multi-byte operators are matched longest-first ("**=" before "**"
// before "*"). Special cases:
//   - '#' followed by a letter starts a preprocessor directive; a lone '#' is
//     the Clipper not-equal operator;
//   - '[' starts a [text] string literal when isStringBracket says so, and is
//     LBRACKET otherwise;
//   - any byte that matches nothing is returned as an ILLEGAL token; for a
//     non-ASCII byte the token covers the whole UTF-8 sequence it begins.
//
// The returned token is positioned at the operator's first byte.
func (l *Lexer) scanOperator() token.Token {
	start := l.pos
	ch := l.src[l.pos]
	l.advance()

	// emit builds a token anchored at the operator's first byte.
	emit := func(kind token.Kind, literal string) token.Token {
		return l.makeTokenAt(kind, literal, start)
	}
	// accept consumes the next byte if, and only if, it equals want.
	accept := func(want byte) bool {
		if l.peek() != want {
			return false
		}
		l.advance()
		return true
	}

	switch ch {
	case '+':
		switch {
		case accept('='):
			return emit(token.PLUSEQ, "+=")
		case accept('+'):
			return emit(token.INC, "++")
		}
		return emit(token.PLUS, "+")
	case '-':
		switch {
		case accept('='):
			return emit(token.MINUSEQ, "-=")
		case accept('-'):
			return emit(token.DEC, "--")
		case accept('>'):
			return emit(token.ARROW, "->")
		}
		return emit(token.MINUS, "-")
	case '*':
		if accept('*') {
			if accept('=') {
				return emit(token.POWEREQ, "**=")
			}
			return emit(token.POWER, "**")
		}
		if accept('=') {
			return emit(token.STAREQ, "*=")
		}
		return emit(token.STAR, "*")
	case '/':
		if accept('=') {
			return emit(token.SLASHEQ, "/=")
		}
		return emit(token.SLASH, "/")
	case '%':
		if accept('=') {
			return emit(token.PERCENTEQ, "%=")
		}
		return emit(token.PERCENT, "%")
	case '=':
		switch {
		case accept('='):
			return emit(token.EXEQ, "==")
		case accept('>'):
			return emit(token.DBLARROW, "=>")
		}
		return emit(token.EQ, "=")
	case '!':
		if accept('=') {
			return emit(token.NEQ, "!=")
		}
		return emit(token.NOT, "!")
	case '<':
		switch {
		case accept('-'):
			return emit(token.ARROW_LEFT, "<-")
		case accept('='):
			return emit(token.LTE, "<=")
		case accept('>'):
			return emit(token.NEQ, "<>")
		}
		return emit(token.LT, "<")
	case '>':
		if accept('=') {
			return emit(token.GTE, ">=")
		}
		return emit(token.GT, ">")
	case '#':
		// # alone = not-equal (Clipper), #keyword = preprocessor directive.
		if next := l.peek(); (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z') {
			return l.scanPreprocessor(start)
		}
		return emit(token.NEQ, "#")
	case ':':
		switch {
		case accept('='):
			return emit(token.ASSIGN, ":=")
		case accept(':'):
			return emit(token.COLONCOLON, "::")
		}
		return emit(token.COLON, ":")
	case '&':
		return emit(token.AMPERSAND, "&")
	case '@':
		return emit(token.AT, "@")
	case '$':
		return emit(token.DOLLAR, "$")
	case '?':
		if accept('?') {
			return emit(token.QQMARK, "??")
		}
		return emit(token.QMARK, "?")
	case '(':
		return emit(token.LPAREN, "(")
	case ')':
		return emit(token.RPAREN, ")")
	case '[':
		// Harbour: [text] is a string literal when NOT preceded by
		// ident/)/]/literal: a[1] is an array index, but ? [Hello] is a string.
		if l.isStringBracket() {
			return l.scanBracketString(start)
		}
		return emit(token.LBRACKET, "[")
	case ']':
		return emit(token.RBRACKET, "]")
	case '{':
		return emit(token.LBRACE, "{")
	case '}':
		return emit(token.RBRACE, "}")
	case ',':
		return emit(token.COMMA, ",")
	case '|':
		return emit(token.PIPE, "|")
	case '^':
		if accept('=') {
			return emit(token.POWEREQ, "^=")
		}
		return emit(token.POWER, "^")
	}

	// Unrecognised byte. A non-ASCII byte is reported together with the rest
	// of its UTF-8 sequence so that one character yields one ILLEGAL token.
	if ch >= 0x80 {
		_, size := utf8.DecodeRuneInString(l.src[start:])
		l.pos = start + size
		// advance() above already counted the first byte of the sequence.
		l.col += size - 1
	}
	return emit(token.ILLEGAL, l.src[start:l.pos])
}

// scanPreprocessor scans a preprocessor directive. start is the byte offset
// of the '#', which the caller has already consumed; pos is on the first byte
// of the directive name.
//
// The name is matched ASCII case-insensitively against the supported
// directives. The returned literal is the directive as written, '#' included.
// An unrecognised name yields an ILLEGAL token with that same literal.
func (l *Lexer) scanPreprocessor(start int) token.Token {
	nameStart := l.pos
	for l.pos < len(l.src) && isIdentChar(l.src[l.pos]) {
		l.advance()
	}
	directive := l.src[nameStart:l.pos]
	full := l.src[start:l.pos]

	switch {
	case matchCI(directive, "include"):
		return l.makeTokenAt(token.PP_INCLUDE, full, start)
	case matchCI(directive, "define"):
		return l.makeTokenAt(token.PP_DEFINE, full, start)
	case matchCI(directive, "undef"):
		return l.makeTokenAt(token.PP_UNDEF, full, start)
	case matchCI(directive, "ifdef"):
		return l.makeTokenAt(token.PP_IFDEF, full, start)
	case matchCI(directive, "ifndef"):
		return l.makeTokenAt(token.PP_IFNDEF, full, start)
	case matchCI(directive, "else"):
		return l.makeTokenAt(token.PP_ELSE, full, start)
	case matchCI(directive, "endif"):
		return l.makeTokenAt(token.PP_ENDIF, full, start)
	case matchCI(directive, "command"):
		return l.makeTokenAt(token.PP_COMMAND, full, start)
	case matchCI(directive, "translate"):
		return l.makeTokenAt(token.PP_TRANSLATE, full, start)
	case matchCI(directive, "pragma"):
		return l.makeTokenAt(token.PP_PRAGMA, full, start)
	default:
		return l.makeTokenAt(token.ILLEGAL, full, start)
	}
}

// makeTokenAt builds a token that begins at byte offset startPos. The column
// is derived from startPos relative to the start of the current line; the
// line number is the lexer's current line.
func (l *Lexer) makeTokenAt(kind token.Kind, literal string, startPos int) token.Token {
	at := token.Position{
		File:   l.file,
		Line:   l.line,
		Col:    startPos - l.lineStart + 1,
		Offset: startPos,
	}
	return token.Token{Kind: kind, Literal: literal, Pos: at}
}

// --- Character classification ---

// isIdentStart reports whether ch may begin an identifier: an ASCII letter or
// an underscore.
func isIdentStart(ch byte) bool {
	if ch == '_' {
		return true
	}
	// Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z' and leaves 'a'-'z' as is; no
	// other byte lands in that range.
	lower := ch | 0x20
	return 'a' <= lower && lower <= 'z'
}

// isIdentChar reports whether ch may appear inside an identifier: anything
// that may begin one, or a decimal digit.
func isIdentChar(ch byte) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	return isIdentStart(ch)
}

// isHexDigit reports whether ch is an ASCII hexadecimal digit (0-9, a-f or
// A-F).
func isHexDigit(ch byte) bool {
	switch {
	case '0' <= ch && ch <= '9':
		return true
	case 'a' <= ch && ch <= 'f':
		return true
	case 'A' <= ch && ch <= 'F':
		return true
	}
	return false
}

// matchCI reports whether a and b are equal when the ASCII letters A-Z are
// treated as their lower-case counterparts. All other bytes, non-ASCII ones
// included, must match exactly.
func matchCI(a, b string) bool {
	if len(a) != len(b) {
		return false
	}

	// lower folds A-Z to a-z and passes every other byte through.
	lower := func(c byte) byte {
		if 'A' <= c && c <= 'Z' {
			return c + ('a' - 'A')
		}
		return c
	}

	for i := 0; i < len(a); i++ {
		if lower(a[i]) != lower(b[i]) {
			return false
		}
	}
	return true
}