use std::str::Chars; use multipeek::{IteratorExt, MultiPeek}; use unicode_xid::UnicodeXID; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Span { pub start: usize, pub end: usize, } impl Span { pub const fn new(start: usize, end: usize) -> Self { Self { start, end } } pub const fn from_offset_and_length(start: usize, length: usize) -> Self { Self { start, end: start + length, } } } #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Token<'src> { pub kind: TokenKind, pub span: Span, pub text: &'src str, } impl<'src> Token<'src> { pub const fn new(kind: TokenKind, span: Span, text: &'src str) -> Self { Self { kind, span, text } } } #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum TokenKind { InvalidCharacter, KwFn, KwIf, KwLet, KwElse, KwLoop, KwWhile, KwBreak, KwReturn, KwNot, KwAnd, KwOr, Identifier, Integer, Boolean, Plus, Minus, Asterisk, Slash, Percent, Ampersand, Pipe, Caret, Tilde, Equal, Unequal, LessThan, LessThanOrEqual, GreaterThan, GreaterThanOrEqual, Assign, Dot, Comma, Colon, Semicolon, LeftParen, RightParen, LeftBrace, RightBrace, LeftBracket, RightBracket, } pub struct Tokenizer<'src> { input: &'src str, chars: MultiPeek>, position: usize, } impl<'src> Tokenizer<'src> { pub fn new(input: &'src str) -> Self { Self { input, chars: input.chars().multipeek(), position: 0, } } fn span(&self, start: usize) -> Span { Span::new(start, self.position) } fn text(&self, span: Span) -> &'src str { &self.input[span.start..span.end] } fn peek(&mut self) -> Option { self.chars.peek().copied() } fn peek_nth(&mut self, n: usize) -> Option { self.chars.peek_nth(n).copied() } fn consume(&mut self) -> Option { let ch = self.chars.next()?; self.position += 1; Some(ch) } fn skip_whitespace(&mut self) { while self.peek().is_some_and(char::is_whitespace) { self.consume(); } } fn skip_line(&mut self) { while self.peek().is_some_and(|ch| ch != '\n') { self.consume(); } self.consume(); } fn skip_block_comment(&mut self) { let mut indent = 1; self.consume(); self.consume(); while indent > 0 { let Some(peek_1st) = self.peek() else { break; }; let peek_2nd = self.peek_nth(1).unwrap_or('\0'); match (peek_1st, peek_2nd) { ('/', '*') => { indent += 1; self.consume(); self.consume(); } ('*', '/') => { indent -= 1; self.consume(); self.consume(); } _ => { self.consume(); } } } } fn next_identifier(&mut self) -> Token<'src> { let start = self.position; self.consume(); while self.peek().is_some_and(|ch| ch.is_xid_continue()) { self.consume(); } let span = self.span(start); let text = self.text(span); let kind = match text { "fn" => TokenKind::KwFn, "if" => TokenKind::KwIf, "let" => TokenKind::KwLet, "else" => TokenKind::KwElse, "loop" => TokenKind::KwLoop, "while" => TokenKind::KwWhile, "break" => TokenKind::KwBreak, "return" => TokenKind::KwReturn, "or" => TokenKind::KwOr, "and" => TokenKind::KwAnd, "not" => TokenKind::KwNot, _ => TokenKind::Identifier, }; Token::new(kind, span, text) } fn next_integer(&mut self) -> Token<'src> { let start = self.position; self.consume(); while self.peek().is_some_and(|ch| ch.is_ascii_digit()) { self.consume(); } let span = self.span(start); let text = self.text(span); Token::new(TokenKind::Integer, span, text) } fn next_punctuation(&mut self) -> Token<'src> { let start = self.position; macro_rules! single { ($kind:expr) => {{ self.consume(); $kind }}; } let kind = match self.peek().unwrap() { '+' => single!(TokenKind::Plus), '-' => single!(TokenKind::Minus), '*' => single!(TokenKind::Asterisk), '/' => single!(TokenKind::Slash), '%' => single!(TokenKind::Percent), '&' => single!(TokenKind::Ampersand), '|' => single!(TokenKind::Pipe), '^' => single!(TokenKind::Caret), '~' => single!(TokenKind::Tilde), '.' => single!(TokenKind::Dot), ',' => single!(TokenKind::Comma), ':' => single!(TokenKind::Colon), ';' => single!(TokenKind::Semicolon), '=' => { self.consume(); if self.peek() == Some('=') { self.consume(); TokenKind::Equal } else { TokenKind::Assign } } '!' => { self.consume(); if self.peek() == Some('=') { self.consume(); TokenKind::Unequal } else { TokenKind::InvalidCharacter } } '<' => { self.consume(); if self.peek() == Some('=') { self.consume(); TokenKind::LessThanOrEqual } else { TokenKind::LessThan } } '>' => { self.consume(); if self.peek() == Some('=') { self.consume(); TokenKind::GreaterThanOrEqual } else { TokenKind::GreaterThan } } '(' => single!(TokenKind::LeftParen), ')' => single!(TokenKind::RightParen), '{' => single!(TokenKind::LeftBrace), '}' => single!(TokenKind::RightBrace), '[' => single!(TokenKind::LeftBracket), ']' => single!(TokenKind::RightBracket), _ => single!(TokenKind::InvalidCharacter), }; let span = self.span(start); let text = self.text(span); Token::new(kind, span, text) } } impl<'src> Iterator for Tokenizer<'src> { type Item = Token<'src>; fn next(&mut self) -> Option { loop { self.skip_whitespace(); if self.peek() == Some('/') && self.peek_nth(1) == Some('/') { self.skip_line(); continue; } if self.peek() == Some('/') && self.peek_nth(1) == Some('*') { self.skip_block_comment(); continue; } let ch = self.peek()?; if ch.is_xid_start() || ch == '_' { return Some(self.next_identifier()); } if ch.is_ascii_digit() { return Some(self.next_integer()); } return Some(self.next_punctuation()); } } }