use crate::token::{Span, Token, TokenKind};
use unicode_xid::UnicodeXID;

/// A hand-written single-pass lexer over a borrowed source string.
pub struct Lexer<'src> {
    src: &'src str,
    /// Current byte offset into `src`. Always kept on a UTF-8 char boundary.
    pos: usize,
}

impl<'src> Lexer<'src> {
    /// Create a lexer positioned at the start of `src`.
    pub fn new(src: &'src str) -> Self {
        Self { src, pos: 0 }
    }

    // ── Low-level cursor primitives ──────────────────────────────────────────

    /// Peek at the next character without consuming it.
    #[inline]
    fn peek(&self) -> Option<char> {
        self.src[self.pos..].chars().next()
    }

    /// Consume and return the next character.
    ///
    /// # Panics
    /// Panics if called at end-of-input (always guard with `peek` first).
    #[inline]
    fn advance(&mut self) -> char {
        let c = self.src[self.pos..]
            .chars()
            .next()
            .expect("advance called at end of input");
        self.pos += c.len_utf8();
        c
    }

    /// Advance while `pred` holds.
    #[inline]
    fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) {
        while self.peek().is_some_and(|c| pred(c)) {
            self.advance();
        }
    }

    /// Check whether the two bytes at the current position equal `[a, b]`.
    /// Safe even when the source contains multi-byte chars because we compare
    /// raw bytes and `a`/`b` are always ASCII.
    #[inline]
    fn at_ascii2(&self, a: u8, b: u8) -> bool {
        let bytes = self.src.as_bytes();
        self.pos + 1 < bytes.len() && bytes[self.pos] == a && bytes[self.pos + 1] == b
    }

    /// Build a token from `[start, self.pos)`.
    #[inline]
    fn make(&self, kind: TokenKind, start: usize) -> Token<'src> {
        Token::new(
            kind,
            Span::new(start as u32, self.pos as u32),
            &self.src[start..self.pos],
        )
    }

    // ── Trivia skipping ──────────────────────────────────────────────────────

    /// Skip all whitespace and comments (`//…` and `/*…*/`).
fn skip_trivia(&mut self) { loop { // Whitespace self.skip_while(|c| c.is_ascii_whitespace()); if self.at_ascii2(b'/', b'/') { // Line comment — skip everything up to (but not including) '\n' self.pos += 2; self.skip_while(|c| c != '\n'); } else if self.at_ascii2(b'/', b'*') { // Block comment — skip until matching `*/` self.pos += 2; loop { if self.at_ascii2(b'*', b'/') { self.pos += 2; break; } if self.peek().is_none() { break; // unterminated block comment — stop at EOF } self.advance(); } } else { break; } } } // ── Literal scanners ───────────────────────────────────────────────────── /// Scan the body and closing `"` of a string literal. /// The opening `"` has already been consumed. fn scan_string(&mut self) { loop { match self.peek() { None => break, // unterminated — stop at EOF Some('"') => { self.advance(); break; } Some('\\') => { self.advance(); self.scan_escape(); } _ => { self.advance(); } } } } /// Scan the body and closing `'` of a char literal. /// The opening `'` has already been consumed. fn scan_char(&mut self) { match self.peek() { None => return, Some('\\') => { self.advance(); self.scan_escape(); } _ => { self.advance(); } } if self.peek() == Some('\'') { self.advance(); } } /// Scan the tail of an escape sequence (the leading `\` is already consumed). fn scan_escape(&mut self) { match self.peek() { Some('u') => { self.advance(); if self.peek() == Some('{') { self.advance(); self.skip_while(|c| c.is_ascii_hexdigit()); if self.peek() == Some('}') { self.advance(); } } } Some(_) => { self.advance(); } // n, t, r, \, ", ', 0, … None => {} // EOF inside escape — stop } } /// Scan a numeric literal. The first character `first` has already been /// consumed; `start` is its byte offset. 
fn scan_number(&mut self, first: char) -> TokenKind { // Prefix detection for non-decimal bases (only after a leading `0`) if first == '0' { match self.peek() { Some('x') | Some('X') => { self.advance(); self.skip_while(|c| c.is_ascii_hexdigit() || c == '_'); return TokenKind::IntLit; } Some('o') | Some('O') => { self.advance(); self.skip_while(|c| matches!(c, '0'..='7') || c == '_'); return TokenKind::IntLit; } Some('b') | Some('B') => { self.advance(); self.skip_while(|c| matches!(c, '0' | '1') || c == '_'); return TokenKind::IntLit; } _ => {} } } // Remaining decimal digits (with optional `_` separators) self.skip_while(|c| c.is_ascii_digit() || c == '_'); // Fractional part: `.` followed by at least one digit. // We peek at the *byte* after `.` to avoid claiming the `.` in // member-access expressions like `42.to_string()`. let mut is_float = false; let bytes = self.src.as_bytes(); if bytes.get(self.pos) == Some(&b'.') { if bytes.get(self.pos + 1).is_some_and(|b| b.is_ascii_digit()) { self.advance(); // consume '.' self.skip_while(|c| c.is_ascii_digit() || c == '_'); is_float = true; } } // Optional exponent: `e` or `E`, optional sign, digits if matches!(self.peek(), Some('e') | Some('E')) { self.advance(); if matches!(self.peek(), Some('+') | Some('-')) { self.advance(); } self.skip_while(|c| c.is_ascii_digit() || c == '_'); is_float = true; } if is_float { TokenKind::FloatLit } else { TokenKind::IntLit } } /// Scan an identifier and map it to the correct keyword token (if any). /// The first character has already been consumed; `start` is its byte offset. 
fn scan_ident_or_kw(&mut self, start: usize) -> TokenKind { self.skip_while(|c| UnicodeXID::is_xid_continue(c)); match &self.src[start..self.pos] { // Control flow "if" => TokenKind::If, "else" => TokenKind::Else, "while" => TokenKind::While, "loop" => TokenKind::Loop, "break" => TokenKind::Break, "continue" => TokenKind::Continue, "return" => TokenKind::Return, // Declarations "fn" => TokenKind::Fn, "struct" => TokenKind::Struct, "let" => TokenKind::Let, "mut" => TokenKind::Mut, // Operator keywords "and" => TokenKind::And, "or" => TokenKind::Or, // Boolean literals "true" => TokenKind::True, "false" => TokenKind::False, // Primitive types "u8" => TokenKind::U8, "u16" => TokenKind::U16, "u32" => TokenKind::U32, "u64" => TokenKind::U64, "i8" => TokenKind::I8, "i16" => TokenKind::I16, "i32" => TokenKind::I32, "i64" => TokenKind::I64, "f32" => TokenKind::F32, "f64" => TokenKind::F64, "bool" => TokenKind::Bool, "char" => TokenKind::Char, // Pointer keyword "opaque" => TokenKind::Opaque, _ => TokenKind::Ident, } } // ── Public API ─────────────────────────────────────────────────────────── /// Lex and return the next meaningful token. /// All leading whitespace and comments are silently skipped. /// Once input is exhausted, every subsequent call returns `Eof`. pub fn next_token(&mut self) -> Token<'src> { self.skip_trivia(); let start = self.pos; let Some(c) = self.peek() else { return self.make(TokenKind::Eof, start); }; self.advance(); let kind = match c { // ── Unambiguous single-character tokens ────────────────────────── '+' => TokenKind::Plus, '*' => TokenKind::Star, '/' => TokenKind::Slash, '%' => TokenKind::Percent, '&' => TokenKind::Amp, '|' => TokenKind::Pipe, '^' => TokenKind::Caret, '~' => TokenKind::Tilde, '.' 
=> TokenKind::Dot, '(' => TokenKind::LParen, ')' => TokenKind::RParen, '[' => TokenKind::LBracket, ']' => TokenKind::RBracket, '{' => TokenKind::LCurly, '}' => TokenKind::RCurly, ',' => TokenKind::Comma, ';' => TokenKind::Semicolon, ':' => TokenKind::Colon, // ── Tokens that may be the prefix of a longer token ────────────── '-' => { if self.peek() == Some('>') { self.advance(); TokenKind::Arrow } else { TokenKind::Minus } } '!' => { if self.peek() == Some('=') { self.advance(); TokenKind::BangEq } else { TokenKind::Bang } } '=' => { if self.peek() == Some('=') { self.advance(); TokenKind::EqEq } else { TokenKind::Eq } } '<' => { if self.peek() == Some('=') { self.advance(); TokenKind::LtEq } else { TokenKind::Lt } } '>' => { if self.peek() == Some('=') { self.advance(); TokenKind::GtEq } else { TokenKind::Gt } } // ── Literals ───────────────────────────────────────────────────── '"' => { self.scan_string(); TokenKind::StringLit } '\'' => { self.scan_char(); TokenKind::CharLit } '0'..='9' => self.scan_number(c), // ── Identifiers and keywords ───────────────────────────────────── // `_` is XID_Continue but not XID_Start; Flux allows it as a // leading character (e.g. `_bar`, `__builtin`). c if c == '_' || UnicodeXID::is_xid_start(c) => self.scan_ident_or_kw(start), // ── Anything unrecognised ──────────────────────────────────────── _ => TokenKind::Unknown, }; self.make(kind, start) } /// Collect every token (including the trailing `Eof`) into a `Vec`. pub fn tokenize(mut self) -> Vec> { let mut tokens = Vec::new(); loop { let tok = self.next_token(); let done = tok.is(TokenKind::Eof); tokens.push(tok); if done { break; } } tokens } } /// `Lexer` implements `Iterator` over non-`Eof` tokens, making it easy to use /// in `for` loops or with iterator adaptors. 
impl<'src> Iterator for Lexer<'src> { type Item = Token<'src>; fn next(&mut self) -> Option> { let tok = self.next_token(); if tok.is(TokenKind::Eof) { None } else { Some(tok) } } } // ── Tests ──────────────────────────────────────────────────────────────────── #[cfg(test)] mod tests { use super::*; use TokenKind::*; fn kinds(src: &str) -> Vec { Lexer::new(src) .tokenize() .into_iter() .map(|t| t.kind) .collect() } fn texts(src: &str) -> Vec<&str> { Lexer::new(src) .tokenize() .into_iter() .map(|t| t.text) .collect() } #[test] fn empty_input() { assert_eq!(kinds(""), vec![Eof]); } #[test] fn whitespace_only() { assert_eq!(kinds(" \t\n "), vec![Eof]); } #[test] fn line_comment_skipped() { assert_eq!(kinds("// this is a comment\n42"), vec![IntLit, Eof]); } #[test] fn block_comment_skipped() { assert_eq!(kinds("/* hello */ 1 /* world */"), vec![IntLit, Eof]); } #[test] fn block_comment_multiline() { assert_eq!(kinds("/*\n ignored\n*/\ntrue"), vec![True, Eof]); } #[test] fn keywords() { let src = "fn struct let mut return if else while loop break continue and or true false opaque"; assert_eq!( kinds(src), vec![ Fn, Struct, Let, Mut, Return, If, Else, While, Loop, Break, Continue, And, Or, True, False, Opaque, Eof ] ); } #[test] fn type_keywords() { let src = "u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 bool char"; assert_eq!( kinds(src), vec![ U8, U16, U32, U64, I8, I16, I32, I64, F32, F64, Bool, Char, Eof ] ); } #[test] fn identifier() { let toks = Lexer::new("foo _bar baz42").tokenize(); assert_eq!(toks[0].kind, Ident); assert_eq!(toks[0].text, "foo"); assert_eq!(toks[1].kind, Ident); assert_eq!(toks[1].text, "_bar"); assert_eq!(toks[2].kind, Ident); assert_eq!(toks[2].text, "baz42"); assert_eq!(toks[3].kind, Eof); } #[test] fn integer_literals() { assert_eq!( kinds("42 0xFF 0o77 0b1010 1_000_000"), vec![IntLit, IntLit, IntLit, IntLit, IntLit, Eof] ); let ts = texts("42 0xFF 0o77 0b1010 1_000_000"); assert_eq!(ts, vec!["42", "0xFF", "0o77", "0b1010", "1_000_000", ""]); } 
#[test] fn float_literals() { assert_eq!( kinds("3.14 1.0e-9 2e4 0.5"), vec![FloatLit, FloatLit, FloatLit, FloatLit, Eof] ); } #[test] fn dot_not_stolen_from_integer() { // `0.bar` should lex as IntLit Dot Ident, not FloatLit Ident let ts = Lexer::new("0.bar").tokenize(); assert_eq!(ts[0].kind, IntLit); assert_eq!(ts[1].kind, Dot); assert_eq!(ts[2].kind, Ident); } #[test] fn string_literal() { let toks = Lexer::new(r#""hello\nworld""#).tokenize(); assert_eq!(toks[0].kind, StringLit); assert_eq!(toks[0].text, "\"hello\\nworld\""); } #[test] fn char_literal() { let toks = Lexer::new(r"'\u{1F600}'").tokenize(); assert_eq!(toks[0].kind, CharLit); } #[test] fn operators() { let src = "-> == != <= >= < > = + - * / % & | ^ ~ !"; assert_eq!( kinds(src), vec![ Arrow, EqEq, BangEq, LtEq, GtEq, Lt, Gt, Eq, Plus, Minus, Star, Slash, Percent, Amp, Pipe, Caret, Tilde, Bang, Eof ] ); } #[test] fn punctuation() { assert_eq!( kinds("( ) [ ] { } , ; : ."), vec![ LParen, RParen, LBracket, RBracket, LCurly, RCurly, Comma, Semicolon, Colon, Dot, Eof ] ); } #[test] fn spans_are_correct() { let toks = Lexer::new("fn foo").tokenize(); assert_eq!((toks[0].span.start, toks[0].span.end), (0, 2)); // "fn" assert_eq!((toks[1].span.start, toks[1].span.end), (3, 6)); // "foo" } #[test] fn small_function() { let src = "fn add(a: i32, b: i32) -> i32 { return a + b; }"; let toks = Lexer::new(src).tokenize(); let ks: Vec<_> = toks.iter().map(|t| t.kind).collect(); assert_eq!( ks, vec![ Fn, Ident, LParen, Ident, Colon, I32, Comma, Ident, Colon, I32, RParen, Arrow, I32, LCurly, Return, Ident, Plus, Ident, Semicolon, RCurly, Eof ] ); } }