init: Add Token definition and Tokenizer logic.

2026-01-12 16:06:55 +01:00
commit 0599a5fb98
6 changed files with 378 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,8 @@
 /target
 # Added by cargo
 #
 # already existing elements were commented out
 #/target
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -0,0 +1,23 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 version = 4
 [[package]]
 name = "bucky"
 version = "0.1.0"
 dependencies = [
 "multipeek",
 "unicode-xid",
 ]
 [[package]]
 name = "multipeek"
 version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1d6b1cf1c2ae7c8c3898cbf8354ee836bc7037e35592d3739a9901d53c97b6a2"
 [[package]]
 name = "unicode-xid"
 version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
 [package]
 name = "bucky"
 version = "0.1.0"
 edition = "2024"
 [dependencies]
 multipeek = "0.1.2"
 unicode-xid = "0.2.6"
--- a/example/main.bky
+++ b/example/main.bky
@@ -0,0 +1,8 @@
 /// This function computes the n-th value of the Fibonacci sequence.
 ///
 /// It recurses on the two preceding values, with fib(0) = 0 and fib(1) = 1.
 fn fib(n: u64): u64 {
    // Base cases: for n = 0 and n = 1 the result is n itself.
    if n < 2 {
        return n;
    }
    return fib(n - 1) + fib(n - 2);
 }
--- a/src/main.rs
+++ b/src/main.rs
@@ -0,0 +1,11 @@
 use crate::token::Tokenizer;
 pub mod token;
 /// Tokenizes the embedded example program and prints the debug form of each token.
 fn main() {
    let source = include_str!("../example/main.bky");
    Tokenizer::new(source).for_each(|token| println!("{token:?}"));
 }
--- a/src/token.rs
+++ b/src/token.rs
@@ -0,0 +1,320 @@
 use std::str::Chars;
 use multipeek::{IteratorExt, MultiPeek};
 use unicode_xid::UnicodeXID;
 /// A half-open range `start..end` of offsets into the source text.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Span {
    /// Offset at which the range begins (inclusive).
    pub start: usize,
    /// Offset at which the range ends (exclusive).
    pub end: usize,
 }
 impl Span {
    /// Builds a span from its two endpoints.
    pub const fn new(start: usize, end: usize) -> Self {
        Span { start, end }
    }
    /// Builds a span that begins at `start` and covers `length` units.
    pub const fn from_offset_and_length(start: usize, length: usize) -> Self {
        let end = start + length;
        Self::new(start, end)
    }
 }
 /// A single lexical token that borrows its text from the source.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Token<'src> {
    /// Classification of the token.
    pub kind: TokenKind,
    /// Location of the token within the source.
    pub span: Span,
    /// The slice of source text the token was read from.
    pub text: &'src str,
 }
 impl<'src> Token<'src> {
    /// Bundles a kind, a span and the matching source slice into a token.
    pub const fn new(kind: TokenKind, span: Span, text: &'src str) -> Self {
        Token { kind, span, text }
    }
 }
 /// The category of a lexed token.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum TokenKind {
    /// Fallback kind for a character that matches no other token rule.
    InvalidCharacter,
    /// The `fn` keyword.
    KwFn,
    /// The `if` keyword.
    KwIf,
    /// The `let` keyword.
    KwLet,
    /// The `loop` keyword.
    KwLoop,
    /// The `while` keyword.
    KwWhile,
    /// The `break` keyword.
    KwBreak,
    /// The `return` keyword.
    KwReturn,
    /// An identifier-shaped word that is not one of the keywords.
    Identifier,
    /// An integer literal made of ASCII digits.
    Integer,
    /// A boolean literal.
    /// NOTE(review): nothing in the visible tokenizer produces this kind — confirm where it is
    /// meant to be emitted.
    Boolean,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Asterisk,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `&`
    Ampersand,
    /// `|`
    Pipe,
    /// `^`
    Caret,
    /// `!`
    Bang,
    /// `==`
    Equal,
    /// `!=`
    Unequal,
    /// `<`
    LessThan,
    /// `<=`
    LessThanOrEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterThanOrEqual,
    /// `=`
    Assign,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
 }
 /// Splits source text into [`Token`]s; tokens are obtained by iterating over the tokenizer.
 pub struct Tokenizer<'src> {
    /// The complete source text; token texts are slices of it.
    input: &'src str,
    /// Character stream over `input` that supports looking ahead without consuming.
    chars: MultiPeek<Chars<'src>>,
    /// Offset of the next unconsumed character; used to build token spans and to slice `input`.
    position: usize,
 }
 impl<'src> Tokenizer<'src> {
    pub fn new(input: &'src str) -> Self {
        Self {
            input,
            chars: input.chars().multipeek(),
            position: 0,
        }
    }
    fn span(&self, start: usize) -> Span {
        Span::new(start, self.position)
    }
    fn text(&self, span: Span) -> &'src str {
        &self.input[span.start..span.end]
    }
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }
    fn peek_nth(&mut self, n: usize) -> Option<char> {
        self.chars.peek_nth(n).copied()
    }
    fn consume(&mut self) -> Option<char> {
        let ch = self.chars.next()?;
        self.position += 1;
        Some(ch)
    }
    fn skip_whitespace(&mut self) {
        while self.peek().is_some_and(char::is_whitespace) {
            self.consume();
        }
    }
    fn skip_line(&mut self) {
        while self.peek().is_some_and(|ch| ch != '\n') {
            self.consume();
        }
        self.consume();
    }
    fn skip_block_comment(&mut self) {
        let mut indent = 1;
        self.consume();
        self.consume();
        while indent > 0 {
            let Some(peek_1st) = self.peek() else {
                break;
            };
            let peek_2nd = self.peek_nth(1).unwrap_or('\0');
            match (peek_1st, peek_2nd) {
                ('/', '*') => {
                    indent += 1;
                    self.consume();
                    self.consume();
                }
                ('*', '/') => {
                    indent -= 1;
                    self.consume();
                    self.consume();
                }
                _ => {
                    self.consume();
                }
            }
        }
    }
    fn next_identifier(&mut self) -> Token<'src> {
        let start = self.position;
        self.consume();
        while self.peek().is_some_and(|ch| ch.is_xid_continue()) {
            self.consume();
        }
        let span = self.span(start);
        let text = self.text(span);
        let kind = match text {
            "fn" => TokenKind::KwFn,
            "if" => TokenKind::KwIf,
            "let" => TokenKind::KwLet,
            "loop" => TokenKind::KwLoop,
            "while" => TokenKind::KwWhile,
            "break" => TokenKind::KwBreak,
            "return" => TokenKind::KwReturn,
            _ => TokenKind::Identifier,
        };
        Token::new(kind, span, text)
    }
    fn next_integer(&mut self) -> Token<'src> {
        let start = self.position;
        self.consume();
        while self.peek().is_some_and(|ch| ch.is_ascii_digit()) {
            self.consume();
        }
        let span = self.span(start);
        let text = self.text(span);
        Token::new(TokenKind::Integer, span, text)
    }
    fn next_punctuation(&mut self) -> Token<'src> {
        let start = self.position;
        macro_rules! single {
            ($kind:expr) => {{
                self.consume();
                $kind
            }};
        }
        let kind = match self.peek().unwrap() {
            '+' => single!(TokenKind::Plus),
            '-' => single!(TokenKind::Minus),
            '*' => single!(TokenKind::Asterisk),
            '/' => single!(TokenKind::Slash),
            '%' => single!(TokenKind::Percent),
            '&' => single!(TokenKind::Ampersand),
            '|' => single!(TokenKind::Pipe),
            '^' => single!(TokenKind::Caret),
            '.' => single!(TokenKind::Dot),
            ',' => single!(TokenKind::Comma),
            ':' => single!(TokenKind::Colon),
            ';' => single!(TokenKind::Semicolon),
            '=' => {
                if self.peek() == Some('=') {
                    self.consume();
                    self.consume();
                    TokenKind::Equal
                } else {
                    self.consume();
                    TokenKind::Assign
                }
            }
            '!' => {
                if self.peek() == Some('=') {
                    self.consume();
                    self.consume();
                    TokenKind::Unequal
                } else {
                    self.consume();
                    TokenKind::Bang
                }
            }
            '<' => {
                if self.peek() == Some('=') {
                    self.consume();
                    self.consume();
                    TokenKind::LessThanOrEqual
                } else {
                    self.consume();
                    TokenKind::LessThan
                }
            }
            '>' => {
                if self.peek() == Some('=') {
                    self.consume();
                    self.consume();
                    TokenKind::GreaterThanOrEqual
                } else {
                    self.consume();
                    TokenKind::GreaterThan
                }
            }
            '(' => single!(TokenKind::LeftParen),
            ')' => single!(TokenKind::RightParen),
            '{' => single!(TokenKind::LeftBrace),
            '}' => single!(TokenKind::RightBrace),
            '[' => single!(TokenKind::LeftBracket),
            ']' => single!(TokenKind::RightBracket),
            _ => single!(TokenKind::InvalidCharacter),
        };
        let span = self.span(start);
        let text = self.text(span);
        Token::new(kind, span, text)
    }
 }
 impl<'src> Iterator for Tokenizer<'src> {
    type Item = Token<'src>;
    /// Produces the next token, skipping whitespace, `//` line comments and `/* */` block
    /// comments; returns `None` once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            self.skip_whitespace();
            let current = self.peek()?;
            // A slash may open a comment; if so, skip it and start over.
            if current == '/' {
                match self.peek_nth(1) {
                    Some('/') => {
                        self.skip_line();
                        continue;
                    }
                    Some('*') => {
                        self.skip_block_comment();
                        continue;
                    }
                    _ => {}
                }
            }
            // Dispatch on the first character of the token.
            let token = if current.is_xid_start() || current == '_' {
                self.next_identifier()
            } else if current.is_ascii_digit() {
                self.next_integer()
            } else {
                self.next_punctuation()
            };
            return Some(token);
        }
    }
 }