use super::token::{Span, Token, TokenKind}; use std::{iter::Peekable, str::Chars}; /// Splits a source string into a sequence of [`Token`]s to be consumed by the parser. /// /// Internally tracks a [`Peekable`] char iterator and a byte-offset cursor /// to produce accurate source spans on every token. pub struct Lexer<'src> { source: &'src str, chars: Peekable>, cursor: usize, } impl<'src> Lexer<'src> { /// Creates a new lexer with given input source. pub fn new(source: &'src str) -> Self { let chars = source.chars().peekable(); Self { source, chars, cursor: 0, } } /// Advances the lexer updating the cursor position. fn advance(&mut self) -> Option { let ch = self.chars.next()?; self.cursor += ch.len_utf8(); Some(ch) } /// Advances the lexer as long as the predicate matches. fn advance_while(&mut self, predicate: impl Fn(char) -> bool) { while self.peek().is_some_and(&predicate) { self.advance(); } } /// Peeks at the next character without advancing. fn peek(&mut self) -> Option { self.chars.peek().copied() } /// Skip whitespace and line comments. fn skip_whitespace_and_comments(&mut self) { loop { self.advance_while(char::is_whitespace); if self.source[self.cursor..].starts_with("//") { self.advance_while(|ch| ch != '\n'); continue; } break; } } /// Lexes the next identifier or keyword. fn lex_identifier_or_keyword(&mut self) -> TokenKind { let start = self.cursor; self.advance_while(|ch| matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')); match &self.source[start..self.cursor] { "fn" => TokenKind::Fn, "if" => TokenKind::If, "else" => TokenKind::Else, "return" => TokenKind::Return, "i8" => TokenKind::I8, "i16" => TokenKind::I16, "i32" => TokenKind::I32, "i64" => TokenKind::I64, "u8" => TokenKind::U8, "u16" => TokenKind::U16, "u32" => TokenKind::U32, "u64" => TokenKind::U64, "bool" => TokenKind::Bool, "true" | "false" => TokenKind::BooleanLit, _ => TokenKind::Identifier, } } /// Lexes the next number. fn lex_number(&mut self) -> TokenKind { let radix = match (self.advance(), self.peek()) { (Some('0'), Some('x' | 'X')) => { self.advance(); 16 } (Some('0'), Some('o' | 'O')) => { self.advance(); 8 } (Some('0'), Some('b' | 'B')) => { self.advance(); 2 } _ => 10, }; self.advance_while(|ch| ch.is_digit(radix)); TokenKind::IntegerLit } /// Lexes the next [Token]. pub fn next_token(&mut self) -> Option> { self.skip_whitespace_and_comments(); let start = self.cursor; macro_rules! token { ($kind:expr) => {{ self.advance(); $kind }}; ($default:expr, $($ch:expr => $kind:expr),+ $(,)?) => {{ self.advance(); match self.peek() { $( Some($ch) => { self.advance(); $kind } )+ _ => $default, } }}; } let kind = match self.peek()? { 'a'..='z' | 'A'..='Z' | '_' => self.lex_identifier_or_keyword(), '0'..='9' => self.lex_number(), '+' => token!(TokenKind::Plus), '-' => token!(TokenKind::Minus, '>' => TokenKind::Arrow), '*' => token!(TokenKind::Star), '/' => token!(TokenKind::Slash), '%' => token!(TokenKind::Percent), '!' => token!(TokenKind::Bang, '=' => TokenKind::Unequal), '=' => token!(TokenKind::Assign, '=' => TokenKind::Equal), '<' => token!(TokenKind::LessThan, '=' => TokenKind::LessEqual), '>' => token!(TokenKind::GreaterThan, '=' => TokenKind::GreaterEqual), '.' => token!(TokenKind::Dot), ',' => token!(TokenKind::Comma), ':' => token!(TokenKind::Colon), ';' => token!(TokenKind::Semicolon), '(' => token!(TokenKind::LParen), ')' => token!(TokenKind::RParen), '{' => token!(TokenKind::LBrace), '}' => token!(TokenKind::RBrace), '[' => token!(TokenKind::LBracket), ']' => token!(TokenKind::RBracket), _ => token!(TokenKind::Invalid), }; let span = Span::new(start, self.cursor); let text = &self.source[start..self.cursor]; Some(Token::new(kind, text, span)) } } impl<'src> Iterator for Lexer<'src> { type Item = Token<'src>; fn next(&mut self) -> Option { self.next_token() } } #[cfg(test)] mod test { use crate::frontend::{ lexer::Lexer, token::{Span, Token, TokenKind}, }; fn tokenize<'src>(source: &'src str) -> Vec> { Lexer::new(source).collect() } #[test] fn comments() { assert_eq!( tokenize("// this is a comment\nhello"), vec![Token::new( TokenKind::Identifier, "hello", Span::new(21, 26) )] ) } #[test] fn identifiers() { assert_eq!( tokenize("HELLO _hello _0@ fn if else return"), vec![ Token::new(TokenKind::Identifier, "HELLO", Span::new(0, 5)), Token::new(TokenKind::Identifier, "_hello", Span::new(6, 12)), Token::new(TokenKind::Identifier, "_0", Span::new(13, 15)), Token::new(TokenKind::Invalid, "@", Span::new(15, 16)), Token::new(TokenKind::Fn, "fn", Span::new(17, 19)), Token::new(TokenKind::If, "if", Span::new(20, 22)), Token::new(TokenKind::Else, "else", Span::new(23, 27)), Token::new(TokenKind::Return, "return", Span::new(28, 34)), ] ) } #[test] fn integer_literals() { assert_eq!( tokenize("0xBEEF 0o777 0b1001 1337"), vec![ Token::new(TokenKind::IntegerLit, "0xBEEF", Span::new(0, 6)), Token::new(TokenKind::IntegerLit, "0o777", Span::new(7, 12)), Token::new(TokenKind::IntegerLit, "0b1001", Span::new(13, 19)), Token::new(TokenKind::IntegerLit, "1337", Span::new(20, 24)), ] ) } #[test] fn boolean_literals() { assert_eq!( tokenize("true false"), vec![ Token::new(TokenKind::BooleanLit, "true", Span::new(0, 4)), Token::new(TokenKind::BooleanLit, "false", Span::new(5, 10)), ] ); } #[test] fn types() { assert_eq!( tokenize("i8 i16 i32 i64 u8 u16 u32 u64 bool"), vec![ Token::new(TokenKind::I8, "i8", Span::new(0, 2)), Token::new(TokenKind::I16, "i16", Span::new(3, 6)), Token::new(TokenKind::I32, "i32", Span::new(7, 10)), Token::new(TokenKind::I64, "i64", Span::new(11, 14)), Token::new(TokenKind::U8, "u8", Span::new(15, 17)), Token::new(TokenKind::U16, "u16", Span::new(18, 21)), Token::new(TokenKind::U32, "u32", Span::new(22, 25)), Token::new(TokenKind::U64, "u64", Span::new(26, 29)), Token::new(TokenKind::Bool, "bool", Span::new(30, 34)), ] ) } #[test] fn operators() { assert_eq!( tokenize("+ - * / %"), vec![ Token::new(TokenKind::Plus, "+", Span::new(0, 1)), Token::new(TokenKind::Minus, "-", Span::new(2, 3)), Token::new(TokenKind::Star, "*", Span::new(4, 5)), Token::new(TokenKind::Slash, "/", Span::new(6, 7)), Token::new(TokenKind::Percent, "%", Span::new(8, 9)), ] ) } #[test] fn punctuation() { assert_eq!( tokenize(". , : ; -> ( ) { } [ ]"), vec![ Token::new(TokenKind::Dot, ".", Span::new(0, 1)), Token::new(TokenKind::Comma, ",", Span::new(2, 3)), Token::new(TokenKind::Colon, ":", Span::new(4, 5)), Token::new(TokenKind::Semicolon, ";", Span::new(6, 7)), Token::new(TokenKind::Arrow, "->", Span::new(8, 10)), Token::new(TokenKind::LParen, "(", Span::new(11, 12)), Token::new(TokenKind::RParen, ")", Span::new(13, 14)), Token::new(TokenKind::LBrace, "{", Span::new(15, 16)), Token::new(TokenKind::RBrace, "}", Span::new(17, 18)), Token::new(TokenKind::LBracket, "[", Span::new(19, 20)), Token::new(TokenKind::RBracket, "]", Span::new(21, 22)), ] ) } #[test] fn comparison_and_logical() { assert_eq!( tokenize("== != < <= > >= ! ="), vec![ Token::new(TokenKind::Equal, "==", Span::new(0, 2)), Token::new(TokenKind::Unequal, "!=", Span::new(3, 5)), Token::new(TokenKind::LessThan, "<", Span::new(6, 7)), Token::new(TokenKind::LessEqual, "<=", Span::new(8, 10)), Token::new(TokenKind::GreaterThan, ">", Span::new(11, 12)), Token::new(TokenKind::GreaterEqual, ">=", Span::new(13, 15)), Token::new(TokenKind::Bang, "!", Span::new(16, 17)), Token::new(TokenKind::Assign, "=", Span::new(18, 19)), ] ) } }