diff --git a/PLAN.md b/PLAN.md
index df56f19..daee6eb 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -8,10 +8,10 @@ A Rust-flavored, C-targeting language - built pipeline-first.
 ## Phase 1 - Lexer
 
 - [x] Define token enum (int literal, bool literal, ident, keywords, operators, punctuation)
-- [ ] Implement character-by-character scanner loop
-- [ ] Handle whitespace & single-line comments (`//`)
-- [ ] Produce source spans (file, line, col) on every token
-- [ ] Unit-test: known inputs → expected token streams
+- [x] Implement character-by-character scanner loop
+- [x] Handle whitespace & single-line comments (`//`)
+- [x] Produce source spans (file, line, col) on every token
+- [x] Unit-test: known inputs → expected token streams
 
 ## Phase 2 - Parser
 
diff --git a/src/frontend/lexer.rs b/src/frontend/lexer.rs
new file mode 100644
index 0000000..9f0ea69
--- /dev/null
+++ b/src/frontend/lexer.rs
@@ -0,0 +1,305 @@
+use super::token::{Span, Token, TokenKind};
+use std::{iter::Peekable, str::Chars};
+
+/// Splits a source string into a sequence of [`Token`]s to be consumed by the parser.
+///
+/// Internally tracks a [`Peekable`] char iterator and a byte-offset cursor
+/// to produce accurate source spans on every token.
+pub struct Lexer<'src> {
+    source: &'src str,
+    chars: Peekable<Chars<'src>>,
+    cursor: usize,
+}
+
+impl<'src> Lexer<'src> {
+    /// Creates a new lexer over the given input source.
+    pub fn new(source: &'src str) -> Self {
+        let chars = source.chars().peekable();
+
+        Self {
+            source,
+            chars,
+            cursor: 0,
+        }
+    }
+
+    /// Consumes the next character and moves the cursor past its UTF-8 encoding.
+    fn advance(&mut self) -> Option<char> {
+        let ch = self.chars.next()?;
+        self.cursor += ch.len_utf8();
+        Some(ch)
+    }
+
+    /// Advances the lexer as long as the predicate matches.
+    fn advance_while(&mut self, predicate: impl Fn(char) -> bool) {
+        while self.peek().is_some_and(&predicate) {
+            self.advance();
+        }
+    }
+
+    /// Peeks at the next character without advancing.
+    fn peek(&mut self) -> Option<char> {
+        self.chars.peek().copied()
+    }
+
+    /// Skips whitespace and `//` line comments.
+    fn skip_whitespace_and_comments(&mut self) {
+        loop {
+            self.advance_while(char::is_whitespace);
+
+            if self.source[self.cursor..].starts_with("//") {
+                // Stop at the newline; the next iteration consumes it as whitespace.
+                self.advance_while(|ch| ch != '\n');
+                continue;
+            }
+
+            break;
+        }
+    }
+
+    /// Lexes the next identifier or keyword.
+    fn lex_identifier_or_keyword(&mut self) -> TokenKind {
+        let start = self.cursor;
+
+        self.advance_while(|ch| matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_'));
+
+        match &self.source[start..self.cursor] {
+            "fn" => TokenKind::Fn,
+            "return" => TokenKind::Return,
+
+            "i8" => TokenKind::I8,
+            "i16" => TokenKind::I16,
+            "i32" => TokenKind::I32,
+            "i64" => TokenKind::I64,
+            "u8" => TokenKind::U8,
+            "u16" => TokenKind::U16,
+            "u32" => TokenKind::U32,
+            "u64" => TokenKind::U64,
+            "bool" => TokenKind::Bool,
+
+            "true" | "false" => TokenKind::BooleanLit,
+
+            _ => TokenKind::Identifier,
+        }
+    }
+
+    /// Lexes the next integer literal.
+    ///
+    /// Accepts decimal digits or a `0x`/`0o`/`0b` prefix (either case) followed by
+    /// digits of that radix. A prefix with no digits after it is [`TokenKind::Invalid`].
+    fn lex_number(&mut self) -> TokenKind {
+        let radix = match (self.advance(), self.peek()) {
+            (Some('0'), Some('x' | 'X')) => 16,
+            (Some('0'), Some('o' | 'O')) => 8,
+            (Some('0'), Some('b' | 'B')) => 2,
+            _ => {
+                self.advance_while(|ch| ch.is_ascii_digit());
+                return TokenKind::IntegerLit;
+            }
+        };
+
+        // Consume the radix marker, then require at least one digit after it.
+        self.advance();
+        let digits_start = self.cursor;
+        self.advance_while(|ch| ch.is_digit(radix));
+
+        if self.cursor == digits_start {
+            TokenKind::Invalid
+        } else {
+            TokenKind::IntegerLit
+        }
+    }
+
+    /// Lexes the next [`Token`].
+    ///
+    /// Returns `None` once only whitespace and comments remain in the input.
+    pub fn next_token(&mut self) -> Option<Token<'src>> {
+        self.skip_whitespace_and_comments();
+
+        let start = self.cursor;
+
+        macro_rules! token {
+            ($kind:expr) => {{
+                self.advance();
+                $kind
+            }};
+
+            ($default:expr, $($ch:expr => $kind:expr),+ $(,)?) => {{
+                self.advance();
+
+                match self.peek() {
+                    $(
+                        Some($ch) => {
+                            self.advance();
+                            $kind
+                        }
+                    )+
+                    _ => $default,
+                }
+            }};
+        }
+
+        let kind = match self.peek()? {
+            'a'..='z' | 'A'..='Z' | '_' => self.lex_identifier_or_keyword(),
+            '0'..='9' => self.lex_number(),
+
+            '+' => token!(TokenKind::Plus),
+            '-' => token!(TokenKind::Minus, '>' => TokenKind::Arrow),
+            '*' => token!(TokenKind::Star),
+            '/' => token!(TokenKind::Slash),
+            '%' => token!(TokenKind::Percent),
+
+            '.' => token!(TokenKind::Dot),
+            ',' => token!(TokenKind::Comma),
+            ':' => token!(TokenKind::Colon),
+            ';' => token!(TokenKind::Semicolon),
+
+            '(' => token!(TokenKind::LParen),
+            ')' => token!(TokenKind::RParen),
+            '{' => token!(TokenKind::LBrace),
+            '}' => token!(TokenKind::RBrace),
+            '[' => token!(TokenKind::LBracket),
+            ']' => token!(TokenKind::RBracket),
+
+            _ => token!(TokenKind::Invalid),
+        };
+
+        let span = Span::new(start, self.cursor);
+        let text = &self.source[start..self.cursor];
+
+        Some(Token::new(kind, text, span))
+    }
+}
+
+impl<'src> Iterator for Lexer<'src> {
+    type Item = Token<'src>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_token()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::frontend::{
+        lexer::Lexer,
+        token::{Span, Token, TokenKind},
+    };
+
+    fn tokenize<'src>(source: &'src str) -> Vec<Token<'src>> {
+        Lexer::new(source).collect()
+    }
+
+    #[test]
+    fn comments() {
+        assert_eq!(
+            tokenize("// this is a comment\nhello"),
+            vec![Token::new(
+                TokenKind::Identifier,
+                "hello",
+                Span::new(21, 26)
+            )]
+        )
+    }
+
+    #[test]
+    fn identifiers() {
+        assert_eq!(
+            tokenize("HELLO _hello _0@"),
+            vec![
+                Token::new(TokenKind::Identifier, "HELLO", Span::new(0, 5)),
+                Token::new(TokenKind::Identifier, "_hello", Span::new(6, 12)),
+                Token::new(TokenKind::Identifier, "_0", Span::new(13, 15)),
+                Token::new(TokenKind::Invalid, "@", Span::new(15, 16)),
+            ]
+        )
+    }
+
+    #[test]
+    fn integer_literals() {
+        assert_eq!(
+            tokenize("0xBEEF 0o777 0b1001 1337"),
+            vec![
+                Token::new(TokenKind::IntegerLit, "0xBEEF", Span::new(0, 6)),
+                Token::new(TokenKind::IntegerLit, "0o777", Span::new(7, 12)),
+                Token::new(TokenKind::IntegerLit, "0b1001", Span::new(13, 19)),
+                Token::new(TokenKind::IntegerLit, "1337", Span::new(20, 24)),
+            ]
+        )
+    }
+
+    #[test]
+    fn integer_prefix_without_digits() {
+        assert_eq!(
+            tokenize("0x 0b2"),
+            vec![
+                Token::new(TokenKind::Invalid, "0x", Span::new(0, 2)),
+                Token::new(TokenKind::Invalid, "0b", Span::new(3, 5)),
+                Token::new(TokenKind::IntegerLit, "2", Span::new(5, 6)),
+            ]
+        )
+    }
+
+    #[test]
+    fn boolean_literals() {
+        assert_eq!(
+            tokenize("true false"),
+            vec![
+                Token::new(TokenKind::BooleanLit, "true", Span::new(0, 4)),
+                Token::new(TokenKind::BooleanLit, "false", Span::new(5, 10)),
+            ]
+        );
+    }
+
+    #[test]
+    fn types() {
+        assert_eq!(
+            tokenize("i8 i16 i32 i64 u8 u16 u32 u64 bool"),
+            vec![
+                Token::new(TokenKind::I8, "i8", Span::new(0, 2)),
+                Token::new(TokenKind::I16, "i16", Span::new(3, 6)),
+                Token::new(TokenKind::I32, "i32", Span::new(7, 10)),
+                Token::new(TokenKind::I64, "i64", Span::new(11, 14)),
+                Token::new(TokenKind::U8, "u8", Span::new(15, 17)),
+                Token::new(TokenKind::U16, "u16", Span::new(18, 21)),
+                Token::new(TokenKind::U32, "u32", Span::new(22, 25)),
+                Token::new(TokenKind::U64, "u64", Span::new(26, 29)),
+                Token::new(TokenKind::Bool, "bool", Span::new(30, 34)),
+            ]
+        )
+    }
+
+    #[test]
+    fn operators() {
+        assert_eq!(
+            tokenize("+ - * / %"),
+            vec![
+                Token::new(TokenKind::Plus, "+", Span::new(0, 1)),
+                Token::new(TokenKind::Minus, "-", Span::new(2, 3)),
+                Token::new(TokenKind::Star, "*", Span::new(4, 5)),
+                Token::new(TokenKind::Slash, "/", Span::new(6, 7)),
+                Token::new(TokenKind::Percent, "%", Span::new(8, 9)),
+            ]
+        )
+    }
+
+    #[test]
+    fn punctuation() {
+        assert_eq!(
+            tokenize(". , : ; -> ( ) { } [ ]"),
+            vec![
+                Token::new(TokenKind::Dot, ".", Span::new(0, 1)),
+                Token::new(TokenKind::Comma, ",", Span::new(2, 3)),
+                Token::new(TokenKind::Colon, ":", Span::new(4, 5)),
+                Token::new(TokenKind::Semicolon, ";", Span::new(6, 7)),
+                Token::new(TokenKind::Arrow, "->", Span::new(8, 10)),
+                Token::new(TokenKind::LParen, "(", Span::new(11, 12)),
+                Token::new(TokenKind::RParen, ")", Span::new(13, 14)),
+                Token::new(TokenKind::LBrace, "{", Span::new(15, 16)),
+                Token::new(TokenKind::RBrace, "}", Span::new(17, 18)),
+                Token::new(TokenKind::LBracket, "[", Span::new(19, 20)),
+                Token::new(TokenKind::RBracket, "]", Span::new(21, 22)),
+            ]
+        )
+    }
+}
diff --git a/src/frontend/mod.rs b/src/frontend/mod.rs
index 79c66ba..e12719b 100644
--- a/src/frontend/mod.rs
+++ b/src/frontend/mod.rs
@@ -1 +1,2 @@
+pub mod lexer;
 pub mod token;