init: Add Token definition and Tokenizer logic.

This commit is contained in:
Jooris Hadeler
2026-01-12 16:06:55 +01:00
commit 0599a5fb98
6 changed files with 378 additions and 0 deletions

11
src/main.rs Normal file
View File

@@ -0,0 +1,11 @@
use crate::token::Tokenizer;
pub mod token;
/// Tokenizes the bundled example program and prints each token's debug form on its own line.
fn main() {
    let source = include_str!("../example/main.bky");
    Tokenizer::new(source).for_each(|token| println!("{token:?}"));
}

320
src/token.rs Normal file
View File

@@ -0,0 +1,320 @@
use std::str::Chars;
use multipeek::{IteratorExt, MultiPeek};
use unicode_xid::UnicodeXID;
/// A half-open range `start..end` of offsets into the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Offset at which the range begins (inclusive).
    pub start: usize,
    /// Offset at which the range stops (exclusive).
    pub end: usize,
}

impl Span {
    /// Builds a span from explicit `start` and `end` offsets.
    pub const fn new(start: usize, end: usize) -> Self {
        Span { start, end }
    }

    /// Builds a span that begins at `start` and extends over `length` positions.
    pub const fn from_offset_and_length(start: usize, length: usize) -> Self {
        Span::new(start, start + length)
    }
}
/// A single lexeme together with its classification and its location in the source.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Token<'src> {
    /// What sort of lexeme this is.
    pub kind: TokenKind,
    /// Where the lexeme sits in the source text.
    pub span: Span,
    /// The slice of source text the token was built from.
    pub text: &'src str,
}

impl<'src> Token<'src> {
    /// Bundles a kind, a span and the matching source slice into a token.
    pub const fn new(kind: TokenKind, span: Span, text: &'src str) -> Self {
        Token { kind, span, text }
    }
}
/// Classification of a lexeme produced by the tokenizer.
///
/// Each variant's comment names the lexeme the variant stands for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
/// A character that starts no known token.
InvalidCharacter,
/// The `fn` keyword.
KwFn,
/// The `if` keyword.
KwIf,
/// The `let` keyword.
KwLet,
/// The `loop` keyword.
KwLoop,
/// The `while` keyword.
KwWhile,
/// The `break` keyword.
KwBreak,
/// The `return` keyword.
KwReturn,
/// A name that is not one of the keywords above.
Identifier,
/// A run of ASCII decimal digits.
Integer,
/// A boolean literal.
/// NOTE(review): no tokenizer path visible here produces this kind yet — confirm where it is meant to be emitted.
Boolean,
/// `+`
Plus,
/// `-`
Minus,
/// `*`
Asterisk,
/// `/`
Slash,
/// `%`
Percent,
/// `&`
Ampersand,
/// `|`
Pipe,
/// `^`
Caret,
/// `!`
Bang,
/// `==`
Equal,
/// `!=`
Unequal,
/// `<`
LessThan,
/// `<=`
LessThanOrEqual,
/// `>`
GreaterThan,
/// `>=`
GreaterThanOrEqual,
/// `=`
Assign,
/// `.`
Dot,
/// `,`
Comma,
/// `:`
Colon,
/// `;`
Semicolon,
/// `(`
LeftParen,
/// `)`
RightParen,
/// `{`
LeftBrace,
/// `}`
RightBrace,
/// `[`
LeftBracket,
/// `]`
RightBracket,
}
/// Lexer over a source string; iterating it yields one `Token` at a time.
pub struct Tokenizer<'src> {
/// The complete source text; token text is sliced out of it.
input: &'src str,
/// Character stream over `input` that supports looking more than one character ahead.
chars: MultiPeek<Chars<'src>>,
/// Current offset into `input`, advanced by `consume`; marks token starts and span ends.
position: usize,
}
impl<'src> Tokenizer<'src> {
pub fn new(input: &'src str) -> Self {
Self {
input,
chars: input.chars().multipeek(),
position: 0,
}
}
fn span(&self, start: usize) -> Span {
Span::new(start, self.position)
}
fn text(&self, span: Span) -> &'src str {
&self.input[span.start..span.end]
}
/// Returns the next character without consuming it, or `None` at end of input.
fn peek(&mut self) -> Option<char> {
    let upcoming = self.chars.peek()?;
    Some(*upcoming)
}
/// Returns the character `n` places ahead without consuming anything, or `None` if the
/// input ends before that.
fn peek_nth(&mut self, n: usize) -> Option<char> {
    let upcoming = self.chars.peek_nth(n)?;
    Some(*upcoming)
}
/// Advances past the next character and returns it, or returns `None` at end of input.
///
/// `position` is used to slice `input` (see `text`), and `str` slicing is indexed in
/// bytes. The offset therefore grows by the character's UTF-8 width rather than by one;
/// counting characters would make spans drift (or panic on a non-boundary index) as soon
/// as the source contains a non-ASCII character.
fn consume(&mut self) -> Option<char> {
    let ch = self.chars.next()?;
    self.position += ch.len_utf8();
    Some(ch)
}
/// Consumes characters for as long as the next one is whitespace.
fn skip_whitespace(&mut self) {
    while let Some(ch) = self.peek() {
        if !ch.is_whitespace() {
            break;
        }
        self.consume();
    }
}
/// Consumes the rest of the current line, including its terminating `\n` if there is one.
fn skip_line(&mut self) {
    loop {
        match self.consume() {
            // Stop right after the newline, or when the input runs out.
            Some('\n') | None => break,
            Some(_) => {}
        }
    }
}
/// Consumes a `/* ... */` comment, honouring nested comments.
///
/// The first two characters are consumed unconditionally as the opening `/*`. If the input
/// ends before every level has been closed, the method simply stops at end of input.
fn skip_block_comment(&mut self) {
    // Step over the opening delimiter.
    self.consume();
    self.consume();

    let mut depth = 1;
    while depth > 0 {
        let Some(first) = self.peek() else {
            break;
        };

        // How many characters this step swallows: two for a delimiter, one otherwise.
        let step = match (first, self.peek_nth(1)) {
            ('/', Some('*')) => {
                depth += 1;
                2
            }
            ('*', Some('/')) => {
                depth -= 1;
                2
            }
            _ => 1,
        };

        for _ in 0..step {
            self.consume();
        }
    }
}
fn next_identifier(&mut self) -> Token<'src> {
let start = self.position;
self.consume();
while self.peek().is_some_and(|ch| ch.is_xid_continue()) {
self.consume();
}
let span = self.span(start);
let text = self.text(span);
let kind = match text {
"fn" => TokenKind::KwFn,
"if" => TokenKind::KwIf,
"let" => TokenKind::KwLet,
"loop" => TokenKind::KwLoop,
"while" => TokenKind::KwWhile,
"break" => TokenKind::KwBreak,
"return" => TokenKind::KwReturn,
_ => TokenKind::Identifier,
};
Token::new(kind, span, text)
}
/// Lexes an integer literal beginning at the current position.
///
/// The first character is consumed unconditionally, followed by every ASCII digit
/// that comes directly after it.
fn next_integer(&mut self) -> Token<'src> {
    let begin = self.position;
    self.consume();
    while let Some(ch) = self.peek() {
        if !ch.is_ascii_digit() {
            break;
        }
        self.consume();
    }

    let span = self.span(begin);
    Token::new(TokenKind::Integer, span, self.text(span))
}
/// Lexes one operator or delimiter beginning at the current position.
///
/// The two-character comparison operators (`==`, `!=`, `<=`, `>=`) are recognised by
/// looking one character *past* the current one; a lone `=`, `!`, `<` or `>` yields the
/// single-character kind. A character that starts no known token is consumed on its own
/// and reported as `TokenKind::InvalidCharacter`, so the caller always makes progress.
///
/// # Panics
///
/// Panics if called at end of input.
fn next_punctuation(&mut self) -> Token<'src> {
    let start = self.position;
    let first = self
        .peek()
        .expect("next_punctuation requires a pending character");
    // The follow-up character has to come from `peek_nth(1)`: calling `peek()` again
    // would only return `first` a second time.
    let second = self.peek_nth(1);

    // Decide the kind and how many characters the lexeme spans before consuming anything.
    let (kind, width) = match (first, second) {
        ('=', Some('=')) => (TokenKind::Equal, 2),
        ('!', Some('=')) => (TokenKind::Unequal, 2),
        ('<', Some('=')) => (TokenKind::LessThanOrEqual, 2),
        ('>', Some('=')) => (TokenKind::GreaterThanOrEqual, 2),
        ('=', _) => (TokenKind::Assign, 1),
        ('!', _) => (TokenKind::Bang, 1),
        ('<', _) => (TokenKind::LessThan, 1),
        ('>', _) => (TokenKind::GreaterThan, 1),
        ('+', _) => (TokenKind::Plus, 1),
        ('-', _) => (TokenKind::Minus, 1),
        ('*', _) => (TokenKind::Asterisk, 1),
        ('/', _) => (TokenKind::Slash, 1),
        ('%', _) => (TokenKind::Percent, 1),
        ('&', _) => (TokenKind::Ampersand, 1),
        ('|', _) => (TokenKind::Pipe, 1),
        ('^', _) => (TokenKind::Caret, 1),
        ('.', _) => (TokenKind::Dot, 1),
        (',', _) => (TokenKind::Comma, 1),
        (':', _) => (TokenKind::Colon, 1),
        (';', _) => (TokenKind::Semicolon, 1),
        ('(', _) => (TokenKind::LeftParen, 1),
        (')', _) => (TokenKind::RightParen, 1),
        ('{', _) => (TokenKind::LeftBrace, 1),
        ('}', _) => (TokenKind::RightBrace, 1),
        ('[', _) => (TokenKind::LeftBracket, 1),
        (']', _) => (TokenKind::RightBracket, 1),
        _ => (TokenKind::InvalidCharacter, 1),
    };

    for _ in 0..width {
        self.consume();
    }

    let span = self.span(start);
    let text = self.text(span);
    Token::new(kind, span, text)
}
}
/// Yields the tokens of the input in order, skipping whitespace and comments.
impl<'src> Iterator for Tokenizer<'src> {
    type Item = Token<'src>;

    /// Produces the next token, or `None` once only whitespace and comments remain.
    fn next(&mut self) -> Option<Self::Item> {
        // Discard whitespace, `//` line comments and `/* */` block comments until
        // something else (or end of input) is at the front.
        loop {
            self.skip_whitespace();
            if self.peek() != Some('/') {
                break;
            }
            match self.peek_nth(1) {
                Some('/') => self.skip_line(),
                Some('*') => self.skip_block_comment(),
                // A lone slash is an operator, not a comment.
                _ => break,
            }
        }

        let first = self.peek()?;
        let token = if first.is_xid_start() || first == '_' {
            self.next_identifier()
        } else if first.is_ascii_digit() {
            self.next_integer()
        } else {
            self.next_punctuation()
        };
        Some(token)
    }
}