From 0599a5fb98a06745a16012b57288b64f3334bd67 Mon Sep 17 00:00:00 2001
From: Jooris Hadeler
Date: Mon, 12 Jan 2026 16:06:55 +0100
Subject: [PATCH] init: Add `Token` definition and `Tokenizer` logic.

---
 .gitignore       |   8 ++
 Cargo.lock       |  23 ++++
 Cargo.toml       |   8 ++
 example/main.bky |   8 ++
 src/main.rs      |  11 ++
 src/token.rs     | 320 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 378 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 example/main.bky
 create mode 100644 src/main.rs
 create mode 100644 src/token.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a5ff07f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+/target
+
+
+# Added by cargo
+#
+# already existing elements were commented out
+
+#/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..1035792
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,23 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "bucky"
+version = "0.1.0"
+dependencies = [
+ "multipeek",
+ "unicode-xid",
+]
+
+[[package]]
+name = "multipeek"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d6b1cf1c2ae7c8c3898cbf8354ee836bc7037e35592d3739a9901d53c97b6a2"
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..06725b9
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "bucky"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+multipeek = "0.1.2"
+unicode-xid = "0.2.6"
diff --git a/example/main.bky b/example/main.bky
new file mode 100644
index 0000000..a668ad3
--- /dev/null
+++ b/example/main.bky
@@ -0,0 +1,8 @@
+/// This function computes the n-th value of the Fibonacci sequence.
+fn fib(n: u64): u64 {
+    if n < 2 {
+        return n;
+    }
+
+    return fib(n - 1) + fib(n - 2);
+}
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..aa44cdb
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,11 @@
+use crate::token::Tokenizer;
+
+pub mod token;
+
+/// Tokenizes the example program, which is embedded into the binary at
+/// compile time, and prints every token using its `Debug` representation.
+fn main() {
+    let input = include_str!("../example/main.bky");
+
+    for token in Tokenizer::new(input) {
+        println!("{token:?}");
+    }
+}
diff --git a/src/token.rs b/src/token.rs
new file mode 100644
index 0000000..5fab5c0
--- /dev/null
+++ b/src/token.rs
@@ -0,0 +1,320 @@
+use std::str::Chars;
+
+use multipeek::{IteratorExt, MultiPeek};
+use unicode_xid::UnicodeXID;
+
+/// A half-open range `start..end` of positions in the tokenizer's input.
+///
+/// The tokenizer uses these bounds directly to slice the source text.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Span {
+    /// Position of the first element covered by the span.
+    pub start: usize,
+    /// Position one past the last element covered by the span.
+    pub end: usize,
+}
+
+impl Span {
+    /// Creates a span from its two bounds; no ordering check is performed.
+    pub const fn new(start: usize, end: usize) -> Self {
+        Self { start, end }
+    }
+
+    /// Creates a span that begins at `start` and ends at `start + length`.
+    pub const fn from_offset_and_length(start: usize, length: usize) -> Self {
+        Self {
+            start,
+            end: start + length,
+        }
+    }
+}
+
+/// A single lexical token produced by [`Tokenizer`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Token<'src> {
+    /// The lexical category of this token.
+    pub kind: TokenKind,
+    /// Where the token sits in the source; `text` is the source sliced by it.
+    pub span: Span,
+    /// The exact source text the token was read from.
+    pub text: &'src str,
+}
+
+impl<'src> Token<'src> {
+    /// Creates a token from its parts.
+    pub const fn new(kind: TokenKind, span: Span, text: &'src str) -> Self {
+        Self { kind, span, text }
+    }
+}
+
+/// The lexical category of a [`Token`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenKind {
+    /// A character that does not start any known token.
+    InvalidCharacter,
+
+    /// The `fn` keyword.
+    KwFn,
+    /// The `if` keyword.
+    KwIf,
+    /// The `let` keyword.
+    KwLet,
+    /// The `loop` keyword.
+    KwLoop,
+    /// The `while` keyword.
+    KwWhile,
+    /// The `break` keyword.
+    KwBreak,
+    /// The `return` keyword.
+    KwReturn,
+
+    /// A name that is not one of the keywords.
+    Identifier,
+    /// A run of ASCII decimal digits.
+    Integer,
+    /// NOTE(review): never produced by the tokenizer in this file; `true` and
+    /// `false` currently lex as [`TokenKind::Identifier`] - confirm intent.
+    Boolean,
+
+    /// `+`
+    Plus,
+    /// `-`
+    Minus,
+    /// `*`
+    Asterisk,
+    /// `/`
+    Slash,
+    /// `%`
+    Percent,
+    /// `&`
+    Ampersand,
+    /// `|`
+    Pipe,
+    /// `^`
+    Caret,
+    /// `!`
+    Bang,
+
+    /// `==`
+    Equal,
+    /// `!=`
+    Unequal,
+    /// `<`
+    LessThan,
+    /// `<=`
+    LessThanOrEqual,
+    /// `>`
+    GreaterThan,
+    /// `>=`
+    GreaterThanOrEqual,
+
+    /// `=`
+    Assign,
+    /// `.`
+    Dot,
+    /// `,`
+    Comma,
+    /// `:`
+    Colon,
+    /// `;`
+    Semicolon,
+
+    /// `(`
+    LeftParen,
+    /// `)`
+    RightParen,
+    /// `{`
+    LeftBrace,
+    /// `}`
+    RightBrace,
+    /// `[`
+    LeftBracket,
+    /// `]`
+    RightBracket,
+}
+
+/// Lazily splits source text into [`Token`]s.
+///
+/// Whitespace, `//` line comments and nestable `/* ... */` block comments are
+/// skipped. A character that starts no known token is reported as
+/// [`TokenKind::InvalidCharacter`] instead of stopping iteration.
+pub struct Tokenizer<'src> {
+    /// The complete source text; token text is sliced out of it.
+    input: &'src str,
+    /// Character stream with multi-character lookahead.
+    chars: MultiPeek<Chars<'src>>,
+    /// Byte offset into `input` of the next unread character.
+    position: usize,
+}
+
+impl<'src> Tokenizer<'src> {
+    /// Creates a tokenizer positioned at the start of `input`.
+    pub fn new(input: &'src str) -> Self {
+        Self {
+            input,
+            chars: input.chars().multipeek(),
+            position: 0,
+        }
+    }
+
+    /// Returns the span from `start` up to the current position.
+    fn span(&self, start: usize) -> Span {
+        Span::new(start, self.position)
+    }
+
+    /// Returns the source text covered by `span`.
+    fn text(&self, span: Span) -> &'src str {
+        &self.input[span.start..span.end]
+    }
+
+    /// Builds a token of `kind` covering everything consumed since `start`.
+    fn finish(&self, kind: TokenKind, start: usize) -> Token<'src> {
+        let span = self.span(start);
+        Token::new(kind, span, self.text(span))
+    }
+
+    /// Returns the next character without consuming it.
+    fn peek(&mut self) -> Option<char> {
+        self.chars.peek().copied()
+    }
+
+    /// Returns the character `n` places ahead without consuming anything;
+    /// `peek_nth(0)` is the same character as [`Self::peek`].
+    fn peek_nth(&mut self, n: usize) -> Option<char> {
+        self.chars.peek_nth(n).copied()
+    }
+
+    /// Consumes and returns the next character, advancing `position`.
+    fn consume(&mut self) -> Option<char> {
+        let ch = self.chars.next()?;
+        // `position` is used to slice a `&str`, so it must advance by the
+        // character's encoded width. Advancing by one desynchronises every
+        // span after the first non-ASCII character and can make `text` slice
+        // inside a code point, which panics.
+        self.position += ch.len_utf8();
+        Some(ch)
+    }
+
+    /// Consumes characters for as long as they are whitespace.
+    fn skip_whitespace(&mut self) {
+        while self.peek().is_some_and(char::is_whitespace) {
+            self.consume();
+        }
+    }
+
+    /// Consumes the rest of the current line, including its `\n` if present.
+    fn skip_line(&mut self) {
+        while self.peek().is_some_and(|ch| ch != '\n') {
+            self.consume();
+        }
+
+        self.consume();
+    }
+
+    /// Consumes a block comment; the next two characters must be `/*`.
+    ///
+    /// Block comments nest. An unterminated comment runs to the end of the
+    /// input without reporting an error.
+    fn skip_block_comment(&mut self) {
+        // Opening `/*`.
+        self.consume();
+        self.consume();
+
+        let mut depth = 1usize;
+        while depth > 0 {
+            let Some(first) = self.consume() else {
+                break;
+            };
+
+            match (first, self.peek()) {
+                ('/', Some('*')) => {
+                    self.consume();
+                    depth += 1;
+                }
+                ('*', Some('/')) => {
+                    self.consume();
+                    depth -= 1;
+                }
+                _ => {}
+            }
+        }
+    }
+
+    /// Reads an identifier or keyword; the caller has checked the first
+    /// character, so it is consumed unconditionally.
+    fn next_identifier(&mut self) -> Token<'src> {
+        let start = self.position;
+
+        self.consume();
+        while self.peek().is_some_and(|ch| ch.is_xid_continue()) {
+            self.consume();
+        }
+
+        let kind = match self.text(self.span(start)) {
+            "fn" => TokenKind::KwFn,
+            "if" => TokenKind::KwIf,
+            "let" => TokenKind::KwLet,
+            "loop" => TokenKind::KwLoop,
+            "while" => TokenKind::KwWhile,
+            "break" => TokenKind::KwBreak,
+            "return" => TokenKind::KwReturn,
+
+            _ => TokenKind::Identifier,
+        };
+
+        self.finish(kind, start)
+    }
+
+    /// Reads a run of ASCII digits; the caller has checked the first one.
+    fn next_integer(&mut self) -> Token<'src> {
+        let start = self.position;
+
+        self.consume();
+        while self.peek().is_some_and(|ch| ch.is_ascii_digit()) {
+            self.consume();
+        }
+
+        self.finish(TokenKind::Integer, start)
+    }
+
+    /// Consumes a directly following `=` if there is one and returns
+    /// `with_eq`; otherwise consumes nothing and returns `without_eq`.
+    fn eq_suffixed(&mut self, with_eq: TokenKind, without_eq: TokenKind) -> TokenKind {
+        if self.peek() == Some('=') {
+            self.consume();
+            with_eq
+        } else {
+            without_eq
+        }
+    }
+
+    /// Reads an operator or delimiter, or a single invalid character.
+    ///
+    /// Must only be called while a character is pending.
+    fn next_punctuation(&mut self) -> Token<'src> {
+        let start = self.position;
+
+        let first = self
+            .consume()
+            .expect("next_punctuation is only called when a character is pending");
+
+        let kind = match first {
+            '+' => TokenKind::Plus,
+            '-' => TokenKind::Minus,
+            '*' => TokenKind::Asterisk,
+            '/' => TokenKind::Slash,
+            '%' => TokenKind::Percent,
+            '&' => TokenKind::Ampersand,
+            '|' => TokenKind::Pipe,
+            '^' => TokenKind::Caret,
+
+            '.' => TokenKind::Dot,
+            ',' => TokenKind::Comma,
+            ':' => TokenKind::Colon,
+            ';' => TokenKind::Semicolon,
+
+            // The lookahead happens after the first character has been
+            // consumed, so it inspects the character that follows the
+            // operator rather than the operator itself.
+            '=' => self.eq_suffixed(TokenKind::Equal, TokenKind::Assign),
+            '!' => self.eq_suffixed(TokenKind::Unequal, TokenKind::Bang),
+            '<' => self.eq_suffixed(TokenKind::LessThanOrEqual, TokenKind::LessThan),
+            '>' => self.eq_suffixed(TokenKind::GreaterThanOrEqual, TokenKind::GreaterThan),
+
+            '(' => TokenKind::LeftParen,
+            ')' => TokenKind::RightParen,
+            '{' => TokenKind::LeftBrace,
+            '}' => TokenKind::RightBrace,
+            '[' => TokenKind::LeftBracket,
+            ']' => TokenKind::RightBracket,
+
+            _ => TokenKind::InvalidCharacter,
+        };
+
+        self.finish(kind, start)
+    }
+}
+
+impl<'src> Iterator for Tokenizer<'src> {
+    type Item = Token<'src>;
+
+    /// Returns the next token, or `None` once only whitespace and comments
+    /// (or nothing) remain.
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            self.skip_whitespace();
+
+            let first = self.peek()?;
+
+            if first == '/' {
+                match self.peek_nth(1) {
+                    Some('/') => {
+                        self.skip_line();
+                        continue;
+                    }
+                    Some('*') => {
+                        self.skip_block_comment();
+                        continue;
+                    }
+                    _ => {}
+                }
+            }
+
+            let token = if first.is_xid_start() || first == '_' {
+                self.next_identifier()
+            } else if first.is_ascii_digit() {
+                self.next_integer()
+            } else {
+                self.next_punctuation()
+            };
+
+            return Some(token);
+        }
+    }
+}