- Compiler: PP → Lexer → Parser → Analyzer → Gengo pipeline - Parser: 232/236 (98%) Harbour compatibility, registry-based dispatch - RTL: 351 Harbour-compatible functions - RDD: DBF/NTX/CDX engines with Rushmore bitmap optimization - Go Interop: IMPORT + pkg.Func() + obj:Method() with FastPath (15M calls/sec) - HB_FUNC API: Full Harbour C API compatible Go bridge - Concurrency: SPAWN/LAUNCH/GOROUTINE, <-, WATCH, PARALLEL FOR, ASYNC/AWAIT - Extensions: Multi-return, DEFER, Slice, f-string, Nil-safe ?:, CONST - Macro Compiler: Runtime AST parsing and evaluation - Debugger: TUI debugger with source display, breakpoints, stepping - FRB: Native + Pcode dual mode runtime binary - Tests: 13 packages ALL PASS Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
744 lines
18 KiB
Go
744 lines
18 KiB
Go
// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
|
|
// All rights reserved.
|
|
|
|
// Lexer for the Five language (Harbour-compatible).
|
|
// Hand-written scanner — no generated code.
|
|
// Handles Harbour's case-insensitive keywords, .T./.F./.AND./.OR./.NOT. literals,
|
|
// line-continuation with semicolon, and multiple comment styles.
|
|
//
|
|
// tsgo reference: ref/typescript-go/internal/scanner/ for scanning patterns.
|
|
// Key insight from tsgo: substring slicing into original source (zero-copy tokens).
|
|
package lexer
|
|
|
|
import (
|
|
"five/compiler/token"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// Lexer scans Harbour/Five source code into tokens.
|
|
type Lexer struct {
|
|
src string // source code (immutable, tsgo pattern: substring slicing)
|
|
file string // filename for error reporting
|
|
pos int // current byte position
|
|
line int // current line (1-based)
|
|
col int // current column (1-based)
|
|
lineStart int // byte offset of current line start
|
|
lastKind token.Kind // previous token kind (for [string] detection)
|
|
}
|
|
|
|
// New creates a new Lexer for the given source.
|
|
func New(filename, source string) *Lexer {
|
|
return &Lexer{
|
|
src: source,
|
|
file: filename,
|
|
pos: 0,
|
|
line: 1,
|
|
col: 1,
|
|
lineStart: 0,
|
|
}
|
|
}
|
|
|
|
// NextToken returns the next token from the source.
|
|
func (l *Lexer) NextToken() token.Token {
|
|
tok := l.nextTokenInner()
|
|
l.lastKind = tok.Kind
|
|
return tok
|
|
}
|
|
|
|
func (l *Lexer) nextTokenInner() token.Token {
|
|
l.skipWhitespaceAndComments()
|
|
|
|
if l.pos >= len(l.src) {
|
|
return l.makeToken(token.EOF, "")
|
|
}
|
|
|
|
ch := l.src[l.pos]
|
|
|
|
// Newline = statement terminator
|
|
if ch == '\n' {
|
|
tok := l.makeToken(token.NEWLINE, "\n")
|
|
l.advance()
|
|
l.line++
|
|
l.col = 1
|
|
l.lineStart = l.pos
|
|
return tok
|
|
}
|
|
if ch == '\r' {
|
|
l.advance()
|
|
if l.pos < len(l.src) && l.src[l.pos] == '\n' {
|
|
l.advance()
|
|
}
|
|
tok := l.makeToken(token.NEWLINE, "\n")
|
|
l.line++
|
|
l.col = 1
|
|
l.lineStart = l.pos
|
|
return tok
|
|
}
|
|
|
|
// String literals
|
|
if ch == '"' || ch == '\'' {
|
|
return l.scanString(ch)
|
|
}
|
|
|
|
// Numbers
|
|
if ch >= '0' && ch <= '9' {
|
|
return l.scanNumber()
|
|
}
|
|
|
|
// Dot-prefixed: .12 = numeric, .T., .F., .AND., .OR., .NOT.
|
|
if ch == '.' {
|
|
// .12 — numeric starting with decimal point
|
|
if l.pos+1 < len(l.src) && l.src[l.pos+1] >= '0' && l.src[l.pos+1] <= '9' {
|
|
return l.scanNumber() // scanNumber handles leading dot
|
|
}
|
|
if dot := l.scanDotToken(); dot.Kind != token.ILLEGAL {
|
|
return dot
|
|
}
|
|
l.advance()
|
|
return l.makeToken(token.DOT, ".")
|
|
}
|
|
|
|
// Identifiers and keywords
|
|
if isIdentStart(ch) {
|
|
return l.scanIdent()
|
|
}
|
|
|
|
// Operators and punctuation
|
|
return l.scanOperator()
|
|
}
|
|
|
|
// Tokenize returns all tokens from the source.
|
|
func Tokenize(filename, source string) []token.Token {
|
|
l := New(filename, source)
|
|
var tokens []token.Token
|
|
for {
|
|
tok := l.NextToken()
|
|
tokens = append(tokens, tok)
|
|
if tok.Kind == token.EOF {
|
|
break
|
|
}
|
|
}
|
|
return tokens
|
|
}
|
|
|
|
// --- Internal scanning methods ---
|
|
|
|
func (l *Lexer) advance() {
|
|
if l.pos < len(l.src) {
|
|
l.pos++
|
|
l.col++
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) peek() byte {
|
|
if l.pos < len(l.src) {
|
|
return l.src[l.pos]
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (l *Lexer) peekAt(offset int) byte {
|
|
p := l.pos + offset
|
|
if p < len(l.src) {
|
|
return l.src[p]
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (l *Lexer) makeToken(kind token.Kind, literal string) token.Token {
|
|
return token.Token{
|
|
Kind: kind,
|
|
Literal: literal,
|
|
Pos: token.Position{
|
|
File: l.file,
|
|
Line: l.line,
|
|
Col: l.col,
|
|
Offset: l.pos,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) skipWhitespaceAndComments() {
|
|
for l.pos < len(l.src) {
|
|
ch := l.src[l.pos]
|
|
|
|
// Spaces and tabs (not newlines — those are tokens)
|
|
if ch == ' ' || ch == '\t' {
|
|
l.advance()
|
|
continue
|
|
}
|
|
|
|
// Semicolon = line continuation (skip semicolon + following newline)
|
|
if ch == ';' {
|
|
l.advance()
|
|
// Skip whitespace until newline
|
|
for l.pos < len(l.src) && (l.src[l.pos] == ' ' || l.src[l.pos] == '\t') {
|
|
l.advance()
|
|
}
|
|
// Skip trailing // comment before newline
|
|
if l.pos+1 < len(l.src) && l.src[l.pos] == '/' && l.src[l.pos+1] == '/' {
|
|
for l.pos < len(l.src) && l.src[l.pos] != '\n' && l.src[l.pos] != '\r' {
|
|
l.advance()
|
|
}
|
|
}
|
|
// Skip the newline itself
|
|
if l.pos < len(l.src) && l.src[l.pos] == '\r' {
|
|
l.advance()
|
|
}
|
|
if l.pos < len(l.src) && l.src[l.pos] == '\n' {
|
|
l.advance()
|
|
l.line++
|
|
l.col = 1
|
|
l.lineStart = l.pos
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Backslash = alternate line continuation (Harbour extension)
|
|
if ch == '\\' && l.peekAt(1) != '\\' {
|
|
l.advance()
|
|
for l.pos < len(l.src) && (l.src[l.pos] == ' ' || l.src[l.pos] == '\t') {
|
|
l.advance()
|
|
}
|
|
if l.pos < len(l.src) && l.src[l.pos] == '\r' {
|
|
l.advance()
|
|
}
|
|
if l.pos < len(l.src) && l.src[l.pos] == '\n' {
|
|
l.advance()
|
|
l.line++
|
|
l.col = 1
|
|
l.lineStart = l.pos
|
|
}
|
|
continue
|
|
}
|
|
|
|
// // single-line comment
|
|
if ch == '/' && l.peekAt(1) == '/' {
|
|
l.skipToEndOfLine()
|
|
continue
|
|
}
|
|
|
|
// /* ... */ multi-line comment
|
|
if ch == '/' && l.peekAt(1) == '*' {
|
|
l.skipBlockComment()
|
|
continue
|
|
}
|
|
|
|
// && single-line comment (Harbour style)
|
|
if ch == '&' && l.peekAt(1) == '&' {
|
|
l.skipToEndOfLine()
|
|
continue
|
|
}
|
|
|
|
// * at start of line = comment (Harbour/Clipper style)
|
|
// Also handles indented * comments: " * comment"
|
|
if ch == '*' && l.isFirstNonWhitespace() {
|
|
l.skipToEndOfLine()
|
|
continue
|
|
}
|
|
|
|
// NOTE at start of line (Harbour)
|
|
if (ch == 'N' || ch == 'n') && l.pos == l.lineStart {
|
|
if l.matchWordAt("NOTE") {
|
|
l.skipToEndOfLine()
|
|
continue
|
|
}
|
|
}
|
|
|
|
break
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) isFirstNonWhitespace() bool {
|
|
for i := l.lineStart; i < l.pos; i++ {
|
|
if l.src[i] != ' ' && l.src[i] != '\t' {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
func (l *Lexer) skipToEndOfLine() {
|
|
for l.pos < len(l.src) && l.src[l.pos] != '\n' && l.src[l.pos] != '\r' {
|
|
l.advance()
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) skipBlockComment() {
|
|
l.advance() // skip /
|
|
l.advance() // skip *
|
|
for l.pos < len(l.src)-1 {
|
|
if l.src[l.pos] == '*' && l.src[l.pos+1] == '/' {
|
|
l.advance() // skip *
|
|
l.advance() // skip /
|
|
return
|
|
}
|
|
if l.src[l.pos] == '\n' {
|
|
l.line++
|
|
l.col = 0
|
|
l.lineStart = l.pos + 1
|
|
}
|
|
l.advance()
|
|
}
|
|
// Unterminated comment — consume rest
|
|
l.pos = len(l.src)
|
|
}
|
|
|
|
func (l *Lexer) matchWordAt(word string) bool {
|
|
if l.pos+len(word) > len(l.src) {
|
|
return false
|
|
}
|
|
for i := 0; i < len(word); i++ {
|
|
c := l.src[l.pos+i]
|
|
w := word[i]
|
|
if c != w && c != w+32 && c != w-32 {
|
|
return false
|
|
}
|
|
}
|
|
// Must be followed by space or newline (not part of identifier)
|
|
if l.pos+len(word) < len(l.src) {
|
|
next := l.src[l.pos+len(word)]
|
|
if isIdentChar(next) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// --- String scanning ---
|
|
|
|
func (l *Lexer) scanString(quote byte) token.Token {
|
|
start := l.pos
|
|
l.advance() // skip opening quote
|
|
for l.pos < len(l.src) {
|
|
ch := l.src[l.pos]
|
|
if ch == quote {
|
|
l.advance() // skip closing quote
|
|
// tsgo pattern: substring slice (zero-copy)
|
|
literal := l.src[start+1 : l.pos-1]
|
|
return l.makeTokenAt(token.STRING, literal, start)
|
|
}
|
|
// Note: Harbour does NOT use C-style escape sequences in strings.
|
|
// "\" is a valid string containing a single backslash.
|
|
if ch == '\n' || ch == '\r' {
|
|
break // unterminated string
|
|
}
|
|
l.advance()
|
|
}
|
|
// Unterminated string
|
|
return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start)
|
|
}
|
|
|
|
// isStringBracket returns true if [ should be treated as string delimiter.
|
|
// Harbour: [text] is string when not preceded by ident, ), ], literal.
|
|
func (l *Lexer) isStringBracket() bool {
|
|
switch l.lastKind {
|
|
case token.IDENT, token.RPAREN, token.RBRACKET,
|
|
token.INT, token.LONG, token.DOUBLE, token.STRING,
|
|
token.TRUE, token.FALSE, token.NIL_LIT:
|
|
return false // array index context
|
|
}
|
|
// Keywords used as variable names (begin, return, for, etc.) — treat as subscript
|
|
// Any keyword token could be a variable name in Harbour
|
|
if l.lastKind >= token.FUNCTION_KW {
|
|
return false
|
|
}
|
|
// Also check if next char is ] (empty []) — that's array
|
|
if l.pos < len(l.src) && l.src[l.pos] == ']' {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// scanBracketString scans [text] as a string literal.
|
|
func (l *Lexer) scanBracketString(start int) token.Token {
|
|
l.advance() // skip [
|
|
strStart := l.pos
|
|
depth := 1
|
|
for l.pos < len(l.src) && depth > 0 {
|
|
if l.src[l.pos] == '[' {
|
|
depth++
|
|
} else if l.src[l.pos] == ']' {
|
|
depth--
|
|
if depth == 0 {
|
|
literal := l.src[strStart:l.pos]
|
|
l.advance() // skip ]
|
|
return l.makeTokenAt(token.STRING, literal, start)
|
|
}
|
|
} else if l.src[l.pos] == '\n' || l.src[l.pos] == '\r' {
|
|
break // unterminated
|
|
}
|
|
l.advance()
|
|
}
|
|
return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start)
|
|
}
|
|
|
|
// --- Number scanning ---
|
|
|
|
func (l *Lexer) scanNumber() token.Token {
|
|
start := l.pos
|
|
isDouble := false
|
|
|
|
// Hex: 0x...
|
|
if l.src[l.pos] == '0' && l.pos+1 < len(l.src) && (l.src[l.pos+1] == 'x' || l.src[l.pos+1] == 'X') {
|
|
l.advance() // 0
|
|
l.advance() // x
|
|
for l.pos < len(l.src) && isHexDigit(l.src[l.pos]) {
|
|
l.advance()
|
|
}
|
|
return l.makeTokenAt(token.INT, l.src[start:l.pos], start)
|
|
}
|
|
|
|
// Leading dot: .12 → 0.12
|
|
if l.src[start] == '.' {
|
|
isDouble = true
|
|
l.advance() // skip .
|
|
for l.pos < len(l.src) && l.src[l.pos] >= '0' && l.src[l.pos] <= '9' {
|
|
l.advance()
|
|
}
|
|
return l.makeTokenAt(token.DOUBLE, l.src[start:l.pos], start)
|
|
}
|
|
|
|
// Decimal digits
|
|
for l.pos < len(l.src) && l.src[l.pos] >= '0' && l.src[l.pos] <= '9' {
|
|
l.advance()
|
|
}
|
|
|
|
// Decimal point
|
|
if l.pos < len(l.src) && l.src[l.pos] == '.' {
|
|
// Check it's not a method call (123.method) or range
|
|
if l.pos+1 < len(l.src) && l.src[l.pos+1] >= '0' && l.src[l.pos+1] <= '9' {
|
|
isDouble = true
|
|
l.advance() // skip .
|
|
for l.pos < len(l.src) && l.src[l.pos] >= '0' && l.src[l.pos] <= '9' {
|
|
l.advance()
|
|
}
|
|
}
|
|
}
|
|
|
|
literal := l.src[start:l.pos]
|
|
if isDouble {
|
|
return l.makeTokenAt(token.DOUBLE, literal, start)
|
|
}
|
|
return l.makeTokenAt(token.INT, literal, start)
|
|
}
|
|
|
|
// --- Dot-prefixed tokens ---
|
|
|
|
func (l *Lexer) scanDotToken() token.Token {
|
|
start := l.pos
|
|
|
|
// .T. / .F.
|
|
if l.pos+2 < len(l.src) && l.src[l.pos+2] == '.' {
|
|
mid := l.src[l.pos+1]
|
|
if mid == 'T' || mid == 't' {
|
|
l.pos += 3
|
|
l.col += 3
|
|
return l.makeTokenAt(token.TRUE, ".T.", start)
|
|
}
|
|
if mid == 'F' || mid == 'f' {
|
|
l.pos += 3
|
|
l.col += 3
|
|
return l.makeTokenAt(token.FALSE, ".F.", start)
|
|
}
|
|
}
|
|
|
|
// .AND. / .OR. / .NOT.
|
|
for _, kw := range []struct {
|
|
text string
|
|
kind token.Kind
|
|
}{
|
|
{".AND.", token.AND},
|
|
{".OR.", token.OR},
|
|
{".NOT.", token.NOT},
|
|
} {
|
|
if l.matchDotKeyword(kw.text) {
|
|
l.pos += len(kw.text)
|
|
l.col += len(kw.text)
|
|
return l.makeTokenAt(kw.kind, kw.text, start)
|
|
}
|
|
}
|
|
|
|
return token.Token{Kind: token.ILLEGAL} // let caller handle plain DOT
|
|
}
|
|
|
|
func (l *Lexer) matchDotKeyword(kw string) bool {
|
|
if l.pos+len(kw) > len(l.src) {
|
|
return false
|
|
}
|
|
for i := 0; i < len(kw); i++ {
|
|
c := l.src[l.pos+i]
|
|
k := kw[i]
|
|
if c == k {
|
|
continue
|
|
}
|
|
// Case-insensitive for letters
|
|
if c >= 'a' && c <= 'z' && c-32 == k {
|
|
continue
|
|
}
|
|
if c >= 'A' && c <= 'Z' && c+32 == k {
|
|
continue
|
|
}
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// --- Identifier scanning ---
|
|
|
|
func (l *Lexer) scanIdent() token.Token {
|
|
start := l.pos
|
|
for l.pos < len(l.src) && isIdentChar(l.src[l.pos]) {
|
|
l.advance()
|
|
}
|
|
// tsgo pattern: substring slice (zero-copy from source)
|
|
literal := l.src[start:l.pos]
|
|
kind := token.LookupKeyword(literal)
|
|
return l.makeTokenAt(kind, literal, start)
|
|
}
|
|
|
|
// --- Operator scanning ---
|
|
|
|
func (l *Lexer) scanOperator() token.Token {
|
|
start := l.pos
|
|
ch := l.src[l.pos]
|
|
l.advance()
|
|
|
|
switch ch {
|
|
case '+':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.PLUSEQ, "+=", start)
|
|
}
|
|
if l.peek() == '+' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.INC, "++", start)
|
|
}
|
|
return l.makeTokenAt(token.PLUS, "+", start)
|
|
case '-':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.MINUSEQ, "-=", start)
|
|
}
|
|
if l.peek() == '-' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.DEC, "--", start)
|
|
}
|
|
if l.peek() == '>' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.ARROW, "->", start)
|
|
}
|
|
return l.makeTokenAt(token.MINUS, "-", start)
|
|
case '*':
|
|
if l.peek() == '*' {
|
|
l.advance()
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.POWEREQ, "**=", start)
|
|
}
|
|
return l.makeTokenAt(token.POWER, "**", start)
|
|
}
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.STAREQ, "*=", start)
|
|
}
|
|
return l.makeTokenAt(token.STAR, "*", start)
|
|
case '/':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.SLASHEQ, "/=", start)
|
|
}
|
|
return l.makeTokenAt(token.SLASH, "/", start)
|
|
case '%':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.PERCENTEQ, "%=", start)
|
|
}
|
|
return l.makeTokenAt(token.PERCENT, "%", start)
|
|
case '=':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.EXEQ, "==", start)
|
|
}
|
|
if l.peek() == '>' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.DBLARROW, "=>", start)
|
|
}
|
|
return l.makeTokenAt(token.EQ, "=", start)
|
|
case '!':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.NEQ, "!=", start)
|
|
}
|
|
return l.makeTokenAt(token.NOT, "!", start)
|
|
case '<':
|
|
if l.peek() == '-' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.ARROW_LEFT, "<-", start)
|
|
}
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.LTE, "<=", start)
|
|
}
|
|
if l.peek() == '>' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.NEQ, "<>", start)
|
|
}
|
|
return l.makeTokenAt(token.LT, "<", start)
|
|
case '>':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.GTE, ">=", start)
|
|
}
|
|
return l.makeTokenAt(token.GT, ">", start)
|
|
case '#':
|
|
// # alone = not-equal (Clipper), #keyword = preprocessor
|
|
if l.peek() >= 'a' && l.peek() <= 'z' || l.peek() >= 'A' && l.peek() <= 'Z' {
|
|
return l.scanPreprocessor(start)
|
|
}
|
|
return l.makeTokenAt(token.NEQ, "#", start)
|
|
case ':':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.ASSIGN, ":=", start)
|
|
}
|
|
if l.peek() == ':' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.COLONCOLON, "::", start)
|
|
}
|
|
return l.makeTokenAt(token.COLON, ":", start)
|
|
case '&':
|
|
return l.makeTokenAt(token.AMPERSAND, "&", start)
|
|
case '@':
|
|
return l.makeTokenAt(token.AT, "@", start)
|
|
case '$':
|
|
return l.makeTokenAt(token.DOLLAR, "$", start)
|
|
case '?':
|
|
if l.peek() == '?' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.QQMARK, "??", start)
|
|
}
|
|
return l.makeTokenAt(token.QMARK, "?", start)
|
|
case '(':
|
|
return l.makeTokenAt(token.LPAREN, "(", start)
|
|
case ')':
|
|
return l.makeTokenAt(token.RPAREN, ")", start)
|
|
case '[':
|
|
// Harbour: [text] is string literal when NOT preceded by ident/)/]/literal
|
|
// a[1] = array index, but ? [Hello] = string
|
|
if l.isStringBracket() {
|
|
return l.scanBracketString(start)
|
|
}
|
|
return l.makeTokenAt(token.LBRACKET, "[", start)
|
|
case ']':
|
|
return l.makeTokenAt(token.RBRACKET, "]", start)
|
|
case '{':
|
|
return l.makeTokenAt(token.LBRACE, "{", start)
|
|
case '}':
|
|
return l.makeTokenAt(token.RBRACE, "}", start)
|
|
case ',':
|
|
return l.makeTokenAt(token.COMMA, ",", start)
|
|
case '|':
|
|
return l.makeTokenAt(token.PIPE, "|", start)
|
|
case '^':
|
|
if l.peek() == '=' {
|
|
l.advance()
|
|
return l.makeTokenAt(token.POWEREQ, "^=", start)
|
|
}
|
|
return l.makeTokenAt(token.POWER, "^", start)
|
|
default:
|
|
// Handle multi-byte UTF-8 characters in identifiers
|
|
if ch >= 0x80 {
|
|
l.pos = start
|
|
_, size := utf8.DecodeRuneInString(l.src[l.pos:])
|
|
l.pos += size
|
|
l.col += size
|
|
return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start)
|
|
}
|
|
return l.makeTokenAt(token.ILLEGAL, string(ch), start)
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) scanPreprocessor(start int) token.Token {
|
|
// Already consumed '#', now scan the directive name
|
|
kwStart := l.pos
|
|
for l.pos < len(l.src) && isIdentChar(l.src[l.pos]) {
|
|
l.advance()
|
|
}
|
|
directive := l.src[kwStart:l.pos]
|
|
upper := token.LookupKeyword(directive)
|
|
_ = upper
|
|
|
|
full := l.src[start:l.pos]
|
|
switch {
|
|
case matchCI(directive, "include"):
|
|
return l.makeTokenAt(token.PP_INCLUDE, full, start)
|
|
case matchCI(directive, "define"):
|
|
return l.makeTokenAt(token.PP_DEFINE, full, start)
|
|
case matchCI(directive, "undef"):
|
|
return l.makeTokenAt(token.PP_UNDEF, full, start)
|
|
case matchCI(directive, "ifdef"):
|
|
return l.makeTokenAt(token.PP_IFDEF, full, start)
|
|
case matchCI(directive, "ifndef"):
|
|
return l.makeTokenAt(token.PP_IFNDEF, full, start)
|
|
case matchCI(directive, "else"):
|
|
return l.makeTokenAt(token.PP_ELSE, full, start)
|
|
case matchCI(directive, "endif"):
|
|
return l.makeTokenAt(token.PP_ENDIF, full, start)
|
|
case matchCI(directive, "command"):
|
|
return l.makeTokenAt(token.PP_COMMAND, full, start)
|
|
case matchCI(directive, "translate"):
|
|
return l.makeTokenAt(token.PP_TRANSLATE, full, start)
|
|
case matchCI(directive, "pragma"):
|
|
return l.makeTokenAt(token.PP_PRAGMA, full, start)
|
|
default:
|
|
return l.makeTokenAt(token.ILLEGAL, full, start)
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) makeTokenAt(kind token.Kind, literal string, startPos int) token.Token {
|
|
return token.Token{
|
|
Kind: kind,
|
|
Literal: literal,
|
|
Pos: token.Position{
|
|
File: l.file,
|
|
Line: l.line,
|
|
Col: startPos - l.lineStart + 1,
|
|
Offset: startPos,
|
|
},
|
|
}
|
|
}
|
|
|
|
// --- Character classification ---
|
|
|
|
// isIdentStart reports whether ch may begin an identifier: an ASCII letter
// or an underscore.
func isIdentStart(ch byte) bool {
	switch {
	case ch == '_':
		return true
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	}
	return false
}
|
|
|
|
func isIdentChar(ch byte) bool {
|
|
return isIdentStart(ch) || (ch >= '0' && ch <= '9')
|
|
}
|
|
|
|
// isHexDigit reports whether ch is a hexadecimal digit: 0-9, a-f or A-F.
func isHexDigit(ch byte) bool {
	switch {
	case '0' <= ch && ch <= '9':
		return true
	case 'a' <= ch && ch <= 'f':
		return true
	case 'A' <= ch && ch <= 'F':
		return true
	}
	return false
}
|
|
|
|
// matchCI reports whether a and b are equal when ASCII letters are compared
// without regard to case. Only 'A'-'Z' are folded; every other byte,
// including non-ASCII ones, must match exactly.
func matchCI(a, b string) bool {
	if len(a) != len(b) {
		return false
	}
	lower := func(c byte) byte {
		if 'A' <= c && c <= 'Z' {
			return c + ('a' - 'A')
		}
		return c
	}
	for i := 0; i < len(a); i++ {
		if lower(a[i]) != lower(b[i]) {
			return false
		}
	}
	return true
}
|