use std::{iter::Peekable, str::Chars}; use unicode_xid::UnicodeXID; use crate::token::{Span, Token, TokenKind}; pub struct Lexer<'src> { chars: Peekable>, source: &'src str, position: usize, } impl<'src> Lexer<'src> { pub fn new(source: &'src str) -> Self { Self { chars: source.chars().peekable(), source, position: 0, } } /// Peek at the next character without consuming it. fn peek(&mut self) -> Option { self.chars.peek().copied() } /// Consume and return the next character. /// This method panics if called at the end of input. fn advance(&mut self) -> char { let ch = self.chars.next().expect("failed to advance the lexer"); self.position += ch.len_utf8(); ch } /// Advance while `condition` holds. fn advance_while(&mut self, condition: impl FnMut(char) -> bool + Copy) { while self.peek().is_some_and(condition) { self.advance(); } } /// Build a token from `[start, self.pos)`. fn make(&self, kind: TokenKind, start: usize) -> Token<'src> { Token { kind, span: Span::new(start as u32, self.position as u32), text: &self.source[start..self.position], } } /// Skip all whitespace and comments. fn skip_whitespace_and_comments(&mut self) { loop { self.advance_while(char::is_whitespace); if self.peek() == Some('#') { self.advance_while(|ch| ch != '\n'); } else { break; } } } /// Lexes the next identifier token. fn next_identifier(&mut self) -> TokenKind { let start = self.position; self.advance(); self.advance_while(|ch| ch.is_xid_continue()); match &self.source[start..self.position] { "and" => TokenKind::KwAnd, "or" => TokenKind::KwOr, "u8" => TokenKind::TyU8, "u16" => TokenKind::TyU16, "u32" => TokenKind::TyU32, "u64" => TokenKind::TyU64, "i8" => TokenKind::TyI8, "i16" => TokenKind::TyI16, "i32" => TokenKind::TyI32, "i64" => TokenKind::TyI64, "true" | "false" => TokenKind::LitBool, _ => TokenKind::Identifier, } } /// Lexes the next number token. fn next_number(&mut self) -> TokenKind { let radix = match self.advance() { '0' => match self.peek() { Some('x') => { self.advance(); 16 } Some('o') => { self.advance(); 8 } Some('b') => { self.advance(); 2 } _ => 10, }, _ => 10, }; self.advance_while(|ch| ch.is_digit(radix)); TokenKind::LitInt } /// Lexes the next string token. fn next_string(&mut self) -> TokenKind { let mut escaped = false; self.advance(); while let Some(ch) = self.peek() { if escaped { self.advance(); escaped = false; } else if ch == '\\' { self.advance(); escaped = true; } else if ch == '"' { self.advance(); break; } else { self.advance(); } } TokenKind::LitString } } impl<'src> Iterator for Lexer<'src> { type Item = Token<'src>; fn next(&mut self) -> Option { self.skip_whitespace_and_comments(); let start = self.position; macro_rules! token { // Case 1: Simple token (no lookahead) ($default:expr) => {{ self.advance(); $default }}; // Case 2: Multi-character lookahead entry point ($($c:expr => $kind:expr),+ ; $default:expr) => {{ self.advance(); token!(@step $($c => $kind),+ ; $default) }}; // Internal Recursive step: More than one pair remains (@step $c:expr => $kind:expr, $($rest_c:expr => $rest_k:expr),+ ; $default:expr) => { if self.peek() == Some($c) { self.advance(); $kind } else { token!(@step $($rest_c => $rest_k),+ ; $default) } }; // Internal Base case: Last pair in the lookahead chain (@step $c:expr => $kind:expr ; $default:expr) => { if self.peek() == Some($c) { self.advance(); $kind } else { $default } }; } let kind = match self.peek()? { ch if ch.is_xid_start() || ch == '_' => self.next_identifier(), '0'..='9' => self.next_number(), '"' => self.next_string(), '+' => token!(TokenKind::Plus), '-' => token!( '>' => TokenKind::Arrow; TokenKind::Minus ), '*' => token!(TokenKind::Star), '/' => token!(TokenKind::Slash), '%' => token!(TokenKind::Percent), '&' => token!(TokenKind::Amp), '|' => token!(TokenKind::Pipe), '^' => token!(TokenKind::Caret), '~' => token!(TokenKind::Tilde), '<' => token!( '<' => TokenKind::Shl, '=' => TokenKind::Le; TokenKind::Lt ), '>' => token!( '>' => TokenKind::Shr, '=' => TokenKind::Ge; TokenKind::Gt ), '!' => token!( '=' => TokenKind::Ne; TokenKind::Bang ), '=' => token!( '=' => TokenKind::Eq; TokenKind::Assign ), '.' => token!(TokenKind::Dot), ',' => token!(TokenKind::Comma), ':' => token!(TokenKind::Colon), ';' => token!(TokenKind::Semi), '(' => token!(TokenKind::LParen), ')' => token!(TokenKind::RParen), '[' => token!(TokenKind::LBracket), ']' => token!(TokenKind::RBracket), '{' => token!(TokenKind::LCurly), '}' => token!(TokenKind::RCurly), _ => token!(TokenKind::Unknown), }; Some(self.make(kind, start)) } }