// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com) // All rights reserved. // Lexer for the Five language (Harbour-compatible). // Hand-written scanner — no generated code. // Handles Harbour's case-insensitive keywords, .T./.F./.AND./.OR./.NOT. literals, // line-continuation with semicolon, and multiple comment styles. // // tsgo reference: ref/typescript-go/internal/scanner/ for scanning patterns. // Key insight from tsgo: substring slicing into original source (zero-copy tokens). package lexer import ( "five/compiler/token" "unicode/utf8" ) // Lexer scans Harbour/Five source code into tokens. type Lexer struct { src string // source code (immutable, tsgo pattern: substring slicing) file string // filename for error reporting pos int // current byte position line int // current line (1-based) col int // current column (1-based) lineStart int // byte offset of current line start lastKind token.Kind // previous token kind (for [string] detection) } // New creates a new Lexer for the given source. func New(filename, source string) *Lexer { return &Lexer{ src: source, file: filename, pos: 0, line: 1, col: 1, lineStart: 0, } } // NextToken returns the next token from the source. func (l *Lexer) NextToken() token.Token { tok := l.nextTokenInner() l.lastKind = tok.Kind return tok } func (l *Lexer) nextTokenInner() token.Token { l.skipWhitespaceAndComments() if l.pos >= len(l.src) { return l.makeToken(token.EOF, "") } ch := l.src[l.pos] // Newline = statement terminator if ch == '\n' { tok := l.makeToken(token.NEWLINE, "\n") l.advance() l.line++ l.col = 1 l.lineStart = l.pos return tok } if ch == '\r' { l.advance() if l.pos < len(l.src) && l.src[l.pos] == '\n' { l.advance() } tok := l.makeToken(token.NEWLINE, "\n") l.line++ l.col = 1 l.lineStart = l.pos return tok } // String literals if ch == '"' || ch == '\'' { return l.scanString(ch) } // Numbers if ch >= '0' && ch <= '9' { return l.scanNumber() } // Dot-prefixed: .12 = numeric, .T., .F., .AND., .OR., .NOT. if ch == '.' { // .12 — numeric starting with decimal point if l.pos+1 < len(l.src) && l.src[l.pos+1] >= '0' && l.src[l.pos+1] <= '9' { return l.scanNumber() // scanNumber handles leading dot } if dot := l.scanDotToken(); dot.Kind != token.ILLEGAL { return dot } l.advance() return l.makeToken(token.DOT, ".") } // Identifiers and keywords if isIdentStart(ch) { return l.scanIdent() } // Operators and punctuation return l.scanOperator() } // Tokenize returns all tokens from the source. func Tokenize(filename, source string) []token.Token { l := New(filename, source) var tokens []token.Token for { tok := l.NextToken() tokens = append(tokens, tok) if tok.Kind == token.EOF { break } } return tokens } // --- Internal scanning methods --- func (l *Lexer) advance() { if l.pos < len(l.src) { l.pos++ l.col++ } } func (l *Lexer) peek() byte { if l.pos < len(l.src) { return l.src[l.pos] } return 0 } func (l *Lexer) peekAt(offset int) byte { p := l.pos + offset if p < len(l.src) { return l.src[p] } return 0 } func (l *Lexer) makeToken(kind token.Kind, literal string) token.Token { return token.Token{ Kind: kind, Literal: literal, Pos: token.Position{ File: l.file, Line: l.line, Col: l.col, Offset: l.pos, }, } } func (l *Lexer) skipWhitespaceAndComments() { for l.pos < len(l.src) { ch := l.src[l.pos] // Spaces and tabs (not newlines — those are tokens) if ch == ' ' || ch == '\t' { l.advance() continue } // Semicolon = line continuation (skip semicolon + following newline) if ch == ';' { l.advance() // Skip whitespace until newline for l.pos < len(l.src) && (l.src[l.pos] == ' ' || l.src[l.pos] == '\t') { l.advance() } // Skip trailing // comment before newline if l.pos+1 < len(l.src) && l.src[l.pos] == '/' && l.src[l.pos+1] == '/' { for l.pos < len(l.src) && l.src[l.pos] != '\n' && l.src[l.pos] != '\r' { l.advance() } } // Skip the newline itself if l.pos < len(l.src) && l.src[l.pos] == '\r' { l.advance() } if l.pos < len(l.src) && l.src[l.pos] == '\n' { l.advance() l.line++ l.col = 1 l.lineStart = l.pos } continue } // Backslash = alternate line continuation (Harbour extension) if ch == '\\' && l.peekAt(1) != '\\' { l.advance() for l.pos < len(l.src) && (l.src[l.pos] == ' ' || l.src[l.pos] == '\t') { l.advance() } if l.pos < len(l.src) && l.src[l.pos] == '\r' { l.advance() } if l.pos < len(l.src) && l.src[l.pos] == '\n' { l.advance() l.line++ l.col = 1 l.lineStart = l.pos } continue } // // single-line comment if ch == '/' && l.peekAt(1) == '/' { l.skipToEndOfLine() continue } // /* ... */ multi-line comment if ch == '/' && l.peekAt(1) == '*' { l.skipBlockComment() continue } // && single-line comment (Harbour style) if ch == '&' && l.peekAt(1) == '&' { l.skipToEndOfLine() continue } // * at start of line = comment (Harbour/Clipper style) // Also handles indented * comments: " * comment" if ch == '*' && l.isFirstNonWhitespace() { l.skipToEndOfLine() continue } // NOTE at start of line (Harbour) if (ch == 'N' || ch == 'n') && l.pos == l.lineStart { if l.matchWordAt("NOTE") { l.skipToEndOfLine() continue } } break } } func (l *Lexer) isFirstNonWhitespace() bool { for i := l.lineStart; i < l.pos; i++ { if l.src[i] != ' ' && l.src[i] != '\t' { return false } } return true } func (l *Lexer) skipToEndOfLine() { for l.pos < len(l.src) && l.src[l.pos] != '\n' && l.src[l.pos] != '\r' { l.advance() } } func (l *Lexer) skipBlockComment() { l.advance() // skip / l.advance() // skip * for l.pos < len(l.src)-1 { if l.src[l.pos] == '*' && l.src[l.pos+1] == '/' { l.advance() // skip * l.advance() // skip / return } if l.src[l.pos] == '\n' { l.line++ l.col = 0 l.lineStart = l.pos + 1 } l.advance() } // Unterminated comment — consume rest l.pos = len(l.src) } func (l *Lexer) matchWordAt(word string) bool { if l.pos+len(word) > len(l.src) { return false } for i := 0; i < len(word); i++ { c := l.src[l.pos+i] w := word[i] if c != w && c != w+32 && c != w-32 { return false } } // Must be followed by space or newline (not part of identifier) if l.pos+len(word) < len(l.src) { next := l.src[l.pos+len(word)] if isIdentChar(next) { return false } } return true } // --- String scanning --- func (l *Lexer) scanString(quote byte) token.Token { start := l.pos l.advance() // skip opening quote for l.pos < len(l.src) { ch := l.src[l.pos] if ch == quote { l.advance() // skip closing quote // tsgo pattern: substring slice (zero-copy) literal := l.src[start+1 : l.pos-1] return l.makeTokenAt(token.STRING, literal, start) } // Note: Harbour does NOT use C-style escape sequences in strings. // "\" is a valid string containing a single backslash. if ch == '\n' || ch == '\r' { break // unterminated string } l.advance() } // Unterminated string return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start) } // isStringBracket returns true if [ should be treated as string delimiter. // Harbour: [text] is string when not preceded by ident, ), ], literal. func (l *Lexer) isStringBracket() bool { switch l.lastKind { case token.IDENT, token.RPAREN, token.RBRACKET, token.INT, token.LONG, token.DOUBLE, token.STRING, token.TRUE, token.FALSE, token.NIL_LIT: return false // array index context } // Keywords used as variable names (begin, return, for, etc.) — treat as subscript // Any keyword token could be a variable name in Harbour if l.lastKind >= token.FUNCTION_KW { return false } // Also check if next char is ] (empty []) — that's array if l.pos < len(l.src) && l.src[l.pos] == ']' { return false } return true } // scanBracketString scans [text] as a string literal. func (l *Lexer) scanBracketString(start int) token.Token { l.advance() // skip [ strStart := l.pos depth := 1 for l.pos < len(l.src) && depth > 0 { if l.src[l.pos] == '[' { depth++ } else if l.src[l.pos] == ']' { depth-- if depth == 0 { literal := l.src[strStart:l.pos] l.advance() // skip ] return l.makeTokenAt(token.STRING, literal, start) } } else if l.src[l.pos] == '\n' || l.src[l.pos] == '\r' { break // unterminated } l.advance() } return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start) } // --- Number scanning --- func (l *Lexer) scanNumber() token.Token { start := l.pos isDouble := false // Hex: 0x... if l.src[l.pos] == '0' && l.pos+1 < len(l.src) && (l.src[l.pos+1] == 'x' || l.src[l.pos+1] == 'X') { l.advance() // 0 l.advance() // x for l.pos < len(l.src) && isHexDigit(l.src[l.pos]) { l.advance() } return l.makeTokenAt(token.INT, l.src[start:l.pos], start) } // Leading dot: .12 → 0.12 if l.src[start] == '.' { isDouble = true l.advance() // skip . for l.pos < len(l.src) && l.src[l.pos] >= '0' && l.src[l.pos] <= '9' { l.advance() } return l.makeTokenAt(token.DOUBLE, l.src[start:l.pos], start) } // Decimal digits for l.pos < len(l.src) && l.src[l.pos] >= '0' && l.src[l.pos] <= '9' { l.advance() } // Decimal point if l.pos < len(l.src) && l.src[l.pos] == '.' { // Check it's not a method call (123.method) or range if l.pos+1 < len(l.src) && l.src[l.pos+1] >= '0' && l.src[l.pos+1] <= '9' { isDouble = true l.advance() // skip . for l.pos < len(l.src) && l.src[l.pos] >= '0' && l.src[l.pos] <= '9' { l.advance() } } } literal := l.src[start:l.pos] if isDouble { return l.makeTokenAt(token.DOUBLE, literal, start) } return l.makeTokenAt(token.INT, literal, start) } // --- Dot-prefixed tokens --- func (l *Lexer) scanDotToken() token.Token { start := l.pos // .T. / .F. if l.pos+2 < len(l.src) && l.src[l.pos+2] == '.' { mid := l.src[l.pos+1] if mid == 'T' || mid == 't' { l.pos += 3 l.col += 3 return l.makeTokenAt(token.TRUE, ".T.", start) } if mid == 'F' || mid == 'f' { l.pos += 3 l.col += 3 return l.makeTokenAt(token.FALSE, ".F.", start) } } // .AND. / .OR. / .NOT. for _, kw := range []struct { text string kind token.Kind }{ {".AND.", token.AND}, {".OR.", token.OR}, {".NOT.", token.NOT}, } { if l.matchDotKeyword(kw.text) { l.pos += len(kw.text) l.col += len(kw.text) return l.makeTokenAt(kw.kind, kw.text, start) } } return token.Token{Kind: token.ILLEGAL} // let caller handle plain DOT } func (l *Lexer) matchDotKeyword(kw string) bool { if l.pos+len(kw) > len(l.src) { return false } for i := 0; i < len(kw); i++ { c := l.src[l.pos+i] k := kw[i] if c == k { continue } // Case-insensitive for letters if c >= 'a' && c <= 'z' && c-32 == k { continue } if c >= 'A' && c <= 'Z' && c+32 == k { continue } return false } return true } // --- Identifier scanning --- func (l *Lexer) scanIdent() token.Token { start := l.pos for l.pos < len(l.src) && isIdentChar(l.src[l.pos]) { l.advance() } // tsgo pattern: substring slice (zero-copy from source) literal := l.src[start:l.pos] kind := token.LookupKeyword(literal) return l.makeTokenAt(kind, literal, start) } // --- Operator scanning --- func (l *Lexer) scanOperator() token.Token { start := l.pos ch := l.src[l.pos] l.advance() switch ch { case '+': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.PLUSEQ, "+=", start) } if l.peek() == '+' { l.advance() return l.makeTokenAt(token.INC, "++", start) } return l.makeTokenAt(token.PLUS, "+", start) case '-': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.MINUSEQ, "-=", start) } if l.peek() == '-' { l.advance() return l.makeTokenAt(token.DEC, "--", start) } if l.peek() == '>' { l.advance() return l.makeTokenAt(token.ARROW, "->", start) } return l.makeTokenAt(token.MINUS, "-", start) case '*': if l.peek() == '*' { l.advance() if l.peek() == '=' { l.advance() return l.makeTokenAt(token.POWEREQ, "**=", start) } return l.makeTokenAt(token.POWER, "**", start) } if l.peek() == '=' { l.advance() return l.makeTokenAt(token.STAREQ, "*=", start) } return l.makeTokenAt(token.STAR, "*", start) case '/': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.SLASHEQ, "/=", start) } return l.makeTokenAt(token.SLASH, "/", start) case '%': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.PERCENTEQ, "%=", start) } return l.makeTokenAt(token.PERCENT, "%", start) case '=': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.EXEQ, "==", start) } if l.peek() == '>' { l.advance() return l.makeTokenAt(token.DBLARROW, "=>", start) } return l.makeTokenAt(token.EQ, "=", start) case '!': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.NEQ, "!=", start) } return l.makeTokenAt(token.NOT, "!", start) case '<': if l.peek() == '-' { l.advance() return l.makeTokenAt(token.ARROW_LEFT, "<-", start) } if l.peek() == '=' { l.advance() return l.makeTokenAt(token.LTE, "<=", start) } if l.peek() == '>' { l.advance() return l.makeTokenAt(token.NEQ, "<>", start) } return l.makeTokenAt(token.LT, "<", start) case '>': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.GTE, ">=", start) } return l.makeTokenAt(token.GT, ">", start) case '#': // # alone = not-equal (Clipper), #keyword = preprocessor if l.peek() >= 'a' && l.peek() <= 'z' || l.peek() >= 'A' && l.peek() <= 'Z' { return l.scanPreprocessor(start) } return l.makeTokenAt(token.NEQ, "#", start) case ':': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.ASSIGN, ":=", start) } if l.peek() == ':' { l.advance() return l.makeTokenAt(token.COLONCOLON, "::", start) } return l.makeTokenAt(token.COLON, ":", start) case '&': return l.makeTokenAt(token.AMPERSAND, "&", start) case '@': return l.makeTokenAt(token.AT, "@", start) case '$': return l.makeTokenAt(token.DOLLAR, "$", start) case '?': if l.peek() == '?' { l.advance() return l.makeTokenAt(token.QQMARK, "??", start) } return l.makeTokenAt(token.QMARK, "?", start) case '(': return l.makeTokenAt(token.LPAREN, "(", start) case ')': return l.makeTokenAt(token.RPAREN, ")", start) case '[': // Harbour: [text] is string literal when NOT preceded by ident/)/]/literal // a[1] = array index, but ? [Hello] = string if l.isStringBracket() { return l.scanBracketString(start) } return l.makeTokenAt(token.LBRACKET, "[", start) case ']': return l.makeTokenAt(token.RBRACKET, "]", start) case '{': return l.makeTokenAt(token.LBRACE, "{", start) case '}': return l.makeTokenAt(token.RBRACE, "}", start) case ',': return l.makeTokenAt(token.COMMA, ",", start) case '|': return l.makeTokenAt(token.PIPE, "|", start) case '^': if l.peek() == '=' { l.advance() return l.makeTokenAt(token.POWEREQ, "^=", start) } return l.makeTokenAt(token.POWER, "^", start) default: // Handle multi-byte UTF-8 characters in identifiers if ch >= 0x80 { l.pos = start _, size := utf8.DecodeRuneInString(l.src[l.pos:]) l.pos += size l.col += size return l.makeTokenAt(token.ILLEGAL, l.src[start:l.pos], start) } return l.makeTokenAt(token.ILLEGAL, string(ch), start) } } func (l *Lexer) scanPreprocessor(start int) token.Token { // Already consumed '#', now scan the directive name kwStart := l.pos for l.pos < len(l.src) && isIdentChar(l.src[l.pos]) { l.advance() } directive := l.src[kwStart:l.pos] upper := token.LookupKeyword(directive) _ = upper full := l.src[start:l.pos] switch { case matchCI(directive, "include"): return l.makeTokenAt(token.PP_INCLUDE, full, start) case matchCI(directive, "define"): return l.makeTokenAt(token.PP_DEFINE, full, start) case matchCI(directive, "undef"): return l.makeTokenAt(token.PP_UNDEF, full, start) case matchCI(directive, "ifdef"): return l.makeTokenAt(token.PP_IFDEF, full, start) case matchCI(directive, "ifndef"): return l.makeTokenAt(token.PP_IFNDEF, full, start) case matchCI(directive, "else"): return l.makeTokenAt(token.PP_ELSE, full, start) case matchCI(directive, "endif"): return l.makeTokenAt(token.PP_ENDIF, full, start) case matchCI(directive, "command"): return l.makeTokenAt(token.PP_COMMAND, full, start) case matchCI(directive, "translate"): return l.makeTokenAt(token.PP_TRANSLATE, full, start) case matchCI(directive, "pragma"): return l.makeTokenAt(token.PP_PRAGMA, full, start) default: return l.makeTokenAt(token.ILLEGAL, full, start) } } func (l *Lexer) makeTokenAt(kind token.Kind, literal string, startPos int) token.Token { return token.Token{ Kind: kind, Literal: literal, Pos: token.Position{ File: l.file, Line: l.line, Col: startPos - l.lineStart + 1, Offset: startPos, }, } } // --- Character classification --- func isIdentStart(ch byte) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_' } func isIdentChar(ch byte) bool { return isIdentStart(ch) || (ch >= '0' && ch <= '9') } func isHexDigit(ch byte) bool { return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F') } func matchCI(a, b string) bool { if len(a) != len(b) { return false } for i := 0; i < len(a); i++ { ca, cb := a[i], b[i] if ca >= 'A' && ca <= 'Z' { ca += 32 } if cb >= 'A' && cb <= 'Z' { cb += 32 } if ca != cb { return false } } return true }