From 51bd07d313042d74eed6aff8b0708cfe0a6cbf3e Mon Sep 17 00:00:00 2001 From: Jooris Hadeler Date: Wed, 11 Mar 2026 23:18:05 +0100 Subject: [PATCH] feat: Add token definitions and lexer logic. This commit adds the `Token` and `TokenKind` definitions in `src/token.rs`, in `src/lexer.rs` I've added the `Lexer` logic. --- Cargo.lock | 7 ++ Cargo.toml | 1 + examples/fibonacci.bky | 7 ++ examples/hello-world.bky | 5 + src/cli.rs | 8 +- src/lexer.rs | 231 +++++++++++++++++++++++++++++++++++++++ src/main.rs | 27 ++++- src/token.rs | 146 +++++++++++++++++++++++++ 8 files changed, 426 insertions(+), 6 deletions(-) create mode 100644 examples/fibonacci.bky create mode 100644 examples/hello-world.bky create mode 100644 src/lexer.rs create mode 100644 src/token.rs diff --git a/Cargo.lock b/Cargo.lock index 57aee68..5d9546b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,9 +6,16 @@ version = 4 name = "buckyc" version = "0.1.0" dependencies = [ + "unicode-xid", "yansi", ] +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "yansi" version = "1.0.1" diff --git a/Cargo.toml b/Cargo.toml index e185849..33af498 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,4 +4,5 @@ version = "0.1.0" edition = "2024" [dependencies] +unicode-xid = "0.2.6" yansi = "1.0.1" diff --git a/examples/fibonacci.bky b/examples/fibonacci.bky new file mode 100644 index 0000000..2d27996 --- /dev/null +++ b/examples/fibonacci.bky @@ -0,0 +1,7 @@ +fn fib(n: u64) -> u64 { + if n < 2 { + return n; + } + + return fib(n - 1) + fib(n - 2); +} \ No newline at end of file diff --git a/examples/hello-world.bky b/examples/hello-world.bky new file mode 100644 index 0000000..b9ee248 --- /dev/null +++ b/examples/hello-world.bky @@ -0,0 +1,5 @@ +extern puts(text: *char); + +fn main() { + puts("Hello, World!"); +} \ No newline at end of file diff --git a/src/cli.rs 
b/src/cli.rs index 0c0187a..d3b25bb 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -16,12 +16,12 @@ pub fn print_help() { println!(); println!("{}", "OPTIONS:".bold().yellow()); println!( - " {}, {} Print this help message", + " {}, {} Print this help message", "-h".bold(), "--help".bold() ); println!( - " {}, {} Print version information", + " {}, {} Print version information", "-V".bold(), "--version".bold() ); @@ -34,7 +34,7 @@ pub fn print_help() { "-c".bold() ); println!( - " {} {} Write output to <output>", + " {} {} Write output to <output>", "-o".bold(), "<output>".bold(), ); @@ -42,7 +42,7 @@ pub fn print_help() { println!(); println!("{}", "ARGS:".bold().yellow()); println!( - " {} One or more Flux source files to compile", + " {} One or more source files to compile", "<files>".bold(), ); } diff --git a/src/lexer.rs b/src/lexer.rs new file mode 100644 index 0000000..4254b77 --- /dev/null +++ b/src/lexer.rs @@ -0,0 +1,231 @@ +use std::{iter::Peekable, str::Chars}; + +use unicode_xid::UnicodeXID; + +use crate::token::{Span, Token, TokenKind}; + +pub struct Lexer<'src> { + chars: Peekable<Chars<'src>>, + source: &'src str, + position: usize, +} + +impl<'src> Lexer<'src> { + pub fn new(source: &'src str) -> Self { + Self { + chars: source.chars().peekable(), + source, + position: 0, + } + } + + /// Peek at the next character without consuming it. + fn peek(&mut self) -> Option<char> { + self.chars.peek().copied() + } + + /// Consume and return the next character. + /// This method panics if called at the end of input. + fn advance(&mut self) -> char { + let ch = self.chars.next().expect("failed to advance the lexer"); + self.position += ch.len_utf8(); + ch + } + + /// Advance while `condition` holds. + fn advance_while(&mut self, condition: impl FnMut(char) -> bool + Copy) { + while self.peek().is_some_and(condition) { + self.advance(); + } + } + + /// Build a token from `[start, self.position)`.
+ fn make(&self, kind: TokenKind, start: usize) -> Token<'src> { + Token { + kind, + span: Span::new(start as u32, self.position as u32), + text: &self.source[start..self.position], + } + } + + /// Skip all whitespace and comments. + fn skip_whitespace_and_comments(&mut self) { + loop { + self.advance_while(char::is_whitespace); + + if self.peek() == Some('#') { + self.advance_while(|ch| ch != '\n'); + } else { + break; + } + } + } + + /// Lexes the next identifier token. + fn next_identifier(&mut self) -> TokenKind { + let start = self.position; + + self.advance(); + self.advance_while(|ch| ch.is_xid_continue()); + + match &self.source[start..self.position] { + "and" => TokenKind::KwAnd, + "or" => TokenKind::KwOr, + "u8" => TokenKind::TyU8, + "u16" => TokenKind::TyU16, + "u32" => TokenKind::TyU32, + "u64" => TokenKind::TyU64, + "i8" => TokenKind::TyI8, + "i16" => TokenKind::TyI16, + "i32" => TokenKind::TyI32, + "i64" => TokenKind::TyI64, + "true" | "false" => TokenKind::LitBool, + _ => TokenKind::Identifier, + } + } + + /// Lexes the next number token. + fn next_number(&mut self) -> TokenKind { + let radix = match self.advance() { + '0' => match self.peek() { + Some('x') => { + self.advance(); + 16 + } + Some('o') => { + self.advance(); + 8 + } + Some('b') => { + self.advance(); + 2 + } + _ => 10, + }, + _ => 10, + }; + + self.advance_while(|ch| ch.is_digit(radix)); + + TokenKind::LitInt + } + + /// Lexes the next string token. 
+ fn next_string(&mut self) -> TokenKind { + let mut escaped = false; + + self.advance(); + + while let Some(ch) = self.peek() { + if escaped { + self.advance(); + escaped = false; + } else if ch == '\\' { + self.advance(); + escaped = true; + } else if ch == '"' { + self.advance(); + break; + } else { + self.advance(); + } + } + + TokenKind::LitString + } +} + +impl<'src> Iterator for Lexer<'src> { + type Item = Token<'src>; + + fn next(&mut self) -> Option<Self::Item> { + self.skip_whitespace_and_comments(); + + let start = self.position; + + macro_rules! token { + // Case 1: Simple token (no lookahead) + ($default:expr) => {{ + self.advance(); + $default + }}; + + // Case 2: Multi-character lookahead entry point + ($($c:expr => $kind:expr),+ ; $default:expr) => {{ + self.advance(); + token!(@step $($c => $kind),+ ; $default) + }}; + + // Internal Recursive step: More than one pair remains + (@step $c:expr => $kind:expr, $($rest_c:expr => $rest_k:expr),+ ; $default:expr) => { + if self.peek() == Some($c) { + self.advance(); + $kind + } else { + token!(@step $($rest_c => $rest_k),+ ; $default) + } + }; + + // Internal Base case: Last pair in the lookahead chain + (@step $c:expr => $kind:expr ; $default:expr) => { + if self.peek() == Some($c) { + self.advance(); + $kind + } else { + $default + } + }; + } + + let kind = match self.peek()?
{ + ch if ch.is_xid_start() || ch == '_' => self.next_identifier(), + '0'..='9' => self.next_number(), + '"' => self.next_string(), + + '+' => token!(TokenKind::Plus), + '-' => token!( + '>' => TokenKind::Arrow; + TokenKind::Minus + ), + '*' => token!(TokenKind::Star), + '/' => token!(TokenKind::Slash), + '%' => token!(TokenKind::Percent), + '&' => token!(TokenKind::Amp), + '|' => token!(TokenKind::Pipe), + '^' => token!(TokenKind::Caret), + '~' => token!(TokenKind::Tilde), + '<' => token!( + '<' => TokenKind::Shl, + '=' => TokenKind::Le; + TokenKind::Lt + ), + '>' => token!( + '>' => TokenKind::Shr, + '=' => TokenKind::Ge; + TokenKind::Gt + ), + '!' => token!( + '=' => TokenKind::Ne; + TokenKind::Bang + ), + '=' => token!( + '=' => TokenKind::Eq; + TokenKind::Assign + ), + '.' => token!(TokenKind::Dot), + ',' => token!(TokenKind::Comma), + ':' => token!(TokenKind::Colon), + ';' => token!(TokenKind::Semi), + '(' => token!(TokenKind::LParen), + ')' => token!(TokenKind::RParen), + '[' => token!(TokenKind::LBracket), + ']' => token!(TokenKind::RBracket), + '{' => token!(TokenKind::LCurly), + '}' => token!(TokenKind::RCurly), + + _ => token!(TokenKind::Unknown), + }; + + Some(self.make(kind, start)) + } +} diff --git a/src/main.rs b/src/main.rs index c50cc74..b6851bc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,32 @@ -use crate::cli::parse_args; +use std::fs; + +use crate::{ + cli::{fatal, parse_args}, + lexer::Lexer, +}; mod cli; +mod lexer; +mod token; fn main() { let opts = parse_args(); - println!("{opts:#?}"); + for file in &opts.files { + let content = match fs::read_to_string(file) { + Ok(content) => content, + Err(error) => { + fatal(format!( + "failed to read {}: {:?}", + file.display(), + error.kind() + )); + } + }; + + println!("-- {} --", file.display()); + for token in Lexer::new(&content) { + println!("{}", token); + } + } } diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..f63a347 --- /dev/null +++ b/src/token.rs @@ 
-0,0 +1,146 @@ +use std::fmt; + +/// A Span is a half-open byte range `[start, end)` which marks a location in +/// the source string. The start and end positions are stored as a [u32] which +/// limits us to a maximum source file size of 4 gigabytes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct Span { + pub start: u32, + pub end: u32, +} + +impl Span { + pub const fn new(start: u32, end: u32) -> Self { + Self { start, end } + } + + pub fn len(&self) -> u32 { + self.end.saturating_sub(self.start) + } + + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Extend this [Span] to cover `other` as well. + pub fn extend(self, other: Self) -> Self { + Self { + start: self.start.min(other.start), + end: self.end.max(other.end), + } + } +} + +impl fmt::Display for Span { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +/// This macro helps with defining the different kinds of [Token]s. It +/// simultaneously defines a variant and its [fmt::Display] implementation. +macro_rules! define_tokens { + ($($name:ident => $repr:literal),* $(,)?) => { + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] + pub enum TokenKind { + $($name),* + } + + impl fmt::Display for TokenKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + $(TokenKind::$name => $repr),* + }) + } + } + }; +} + +define_tokens! 
{ + // -- Identifier -- + Identifier => "identifier", + + // -- Literals -- + LitInt => "integer literal", + LitBool => "boolean literal", + LitString => "string literal", + + // -- Keywords -- + KwAnd => "`and`", + KwOr => "`or`", + + // -- Type Keywords -- + TyU8 => "`u8`", + TyU16 => "`u16`", + TyU32 => "`u32`", + TyU64 => "`u64`", + TyI8 => "`i8`", + TyI16 => "`i16`", + TyI32 => "`i32`", + TyI64 => "`i64`", + + // -- Arithmetic Operators -- + Plus => "`+`", + Minus => "`-`", + Star => "`*`", + Slash => "`/`", + Percent => "`%`", + + // -- Bitwise / Logical Operators -- + Amp => "`&`", + Pipe => "`|`", + Caret => "`^`", + Tilde => "`~`", + Shl => "`<<`", + Shr => "`>>`", + Bang => "`!`", + + // -- Comparison Operators -- + Eq => "`==`", + Ne => "`!=`", + Lt => "`<`", + Le => "`<=`", + Gt => "`>`", + Ge => "`>=`", + + // -- Punctuation -- + Assign => "`=`", + Arrow => "`->`", + Dot => "`.`", + Colon => "`:`", + Comma => "`,`", + Semi => "`;`", + + // -- Delimiters -- + LParen => "`(`", + RParen => "`)`", + LBracket => "`[`", + RBracket => "`]`", + LCurly => "`{`", + RCurly => "`}`", + + // -- Special -- + Unknown => "unknown character" +} + +/// A Token represents the smallest continuous unit of the source code. It holds +/// its [TokenKind], [Span] and source text. +#[derive(Debug, Clone, Copy)] +pub struct Token<'src> { + pub kind: TokenKind, + pub span: Span, + pub text: &'src str, +} + +impl<'src> Token<'src> { + /// Checks if the current [Token] is of the given [TokenKind]. + pub fn is(&self, kind: TokenKind) -> bool { + self.kind == kind + } +} + +impl<'src> fmt::Display for Token<'src> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?} `{}` @ {}", self.kind, self.text, self.span) + } +}