init: Add Token definition and Tokenizer logic.
8  .gitignore  vendored  Normal file
@@ -0,0 +1,8 @@
/target


# Added by cargo
#
# already existing elements were commented out

#/target
23  Cargo.lock  generated  Normal file
@@ -0,0 +1,23 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4

[[package]]
name = "bucky"
version = "0.1.0"
dependencies = [
 "multipeek",
 "unicode-xid",
]

[[package]]
name = "multipeek"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d6b1cf1c2ae7c8c3898cbf8354ee836bc7037e35592d3739a9901d53c97b6a2"

[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
8  Cargo.toml  Normal file
@@ -0,0 +1,8 @@
[package]
name = "bucky"
version = "0.1.0"
edition = "2024"

[dependencies]
multipeek = "0.1.2"
unicode-xid = "0.2.6"
8  example/main.bky  Normal file
@@ -0,0 +1,8 @@
/// This function computes the n-th value of the Fibonacci sequence.
fn fib(n: u64): u64 {
    if n < 2 {
        return n;
    }

    return fib(n - 1) + fib(n - 2);
}
11  src/main.rs  Normal file
@@ -0,0 +1,11 @@
use crate::token::Tokenizer;

pub mod token;

fn main() {
    let input = include_str!("../example/main.bky");

    for token in Tokenizer::new(input) {
        println!("{token:?}");
    }
}
320  src/token.rs  Normal file
@@ -0,0 +1,320 @@
use std::str::Chars;

use multipeek::{IteratorExt, MultiPeek};
use unicode_xid::UnicodeXID;
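
/// A half-open byte range `[start, end)` into the source text.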
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

impl Span {
    pub const fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }

    pub const fn from_offset_and_length(start: usize, length: usize) -> Self {
        Self {
            start,
            end: start + length,
        }
    }
}
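
/// A single token: what kind it is, where it occurs, and the source text it covers.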
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Token<'src> {
    pub kind: TokenKind,
    pub span: Span,
    pub text: &'src str,
}

impl<'src> Token<'src> {
    pub const fn new(kind: TokenKind, span: Span, text: &'src str) -> Self {
        Self { kind, span, text }
    }
}
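
/// Every kind of token the tokenizer can produce, including an explicit
/// `InvalidCharacter` kind so that lexing never has to fail outright.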
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    InvalidCharacter,

    KwFn,
    KwIf,
    KwLet,
    KwLoop,
    KwWhile,
    KwBreak,
    KwReturn,

    Identifier,
    Integer,
    Boolean,

    Plus,
    Minus,
    Asterisk,
    Slash,
    Percent,
    Ampersand,
    Pipe,
    Caret,
    Bang,

    Equal,
    Unequal,
    LessThan,
    LessThanOrEqual,
    GreaterThan,
    GreaterThanOrEqual,

    Assign,
    Dot,
    Comma,
    Colon,
    Semicolon,

    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
}
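
/// A hand-written lexer over `input`. `position` is the byte offset of the
/// next unconsumed character; `chars` allows multi-character lookahead.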
pub struct Tokenizer<'src> {
    input: &'src str,
    chars: MultiPeek<Chars<'src>>,
    position: usize,
}

impl<'src> Tokenizer<'src> {
    pub fn new(input: &'src str) -> Self {
        Self {
            input,
            chars: input.chars().multipeek(),
            position: 0,
        }
    }

    fn span(&self, start: usize) -> Span {
        Span::new(start, self.position)
    }

    fn text(&self, span: Span) -> &'src str {
        &self.input[span.start..span.end]
    }

    fn peek(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }

    fn peek_nth(&mut self, n: usize) -> Option<char> {
        self.chars.peek_nth(n).copied()
    }

    fn consume(&mut self) -> Option<char> {
        let ch = self.chars.next()?;
        // Advance the position in bytes, not characters: `text` slices the
        // input by byte offsets, and the two diverge as soon as a multi-byte
        // UTF-8 character (e.g. a Unicode identifier) appears.
        self.position += ch.len_utf8();
        Some(ch)
    }

    fn skip_whitespace(&mut self) {
        while self.peek().is_some_and(char::is_whitespace) {
            self.consume();
        }
    }

    // Skips the rest of the current line, including the trailing newline.
    fn skip_line(&mut self) {
        while self.peek().is_some_and(|ch| ch != '\n') {
            self.consume();
        }

        self.consume();
    }

    // Skips a (possibly nested) `/* ... */` block comment. `depth` tracks the
    // current nesting level; the comment ends once it drops back to zero.
    fn skip_block_comment(&mut self) {
        let mut depth = 1;

        // Consume the opening `/*`.
        self.consume();
        self.consume();

        while depth > 0 {
            let Some(peek_1st) = self.peek() else {
                break;
            };

            let peek_2nd = self.peek_nth(1).unwrap_or('\0');

            match (peek_1st, peek_2nd) {
                ('/', '*') => {
                    depth += 1;
                    self.consume();
                    self.consume();
                }
                ('*', '/') => {
                    depth -= 1;
                    self.consume();
                    self.consume();
                }
                _ => {
                    self.consume();
                }
            }
        }
    }

    fn next_identifier(&mut self) -> Token<'src> {
        let start = self.position;

        self.consume();
        while self.peek().is_some_and(|ch| ch.is_xid_continue()) {
            self.consume();
        }

        let span = self.span(start);
        let text = self.text(span);

        let kind = match text {
            "fn" => TokenKind::KwFn,
            "if" => TokenKind::KwIf,
            "let" => TokenKind::KwLet,
            "loop" => TokenKind::KwLoop,
            "while" => TokenKind::KwWhile,
            "break" => TokenKind::KwBreak,
            "return" => TokenKind::KwReturn,

            // `true` and `false` lex as boolean literals; without this arm
            // the `Boolean` kind would never be produced.
            "true" | "false" => TokenKind::Boolean,

            _ => TokenKind::Identifier,
        };

        Token::new(kind, span, text)
    }

    fn next_integer(&mut self) -> Token<'src> {
        let start = self.position;

        self.consume();
        while self.peek().is_some_and(|ch| ch.is_ascii_digit()) {
            self.consume();
        }

        let span = self.span(start);
        let text = self.text(span);

        Token::new(TokenKind::Integer, span, text)
    }

    fn next_punctuation(&mut self) -> Token<'src> {
        let start = self.position;

        macro_rules! single {
            ($kind:expr) => {{
                self.consume();
                $kind
            }};
        }

        let kind = match self.peek().unwrap() {
            '+' => single!(TokenKind::Plus),
            '-' => single!(TokenKind::Minus),
            '*' => single!(TokenKind::Asterisk),
            '/' => single!(TokenKind::Slash),
            '%' => single!(TokenKind::Percent),
            '&' => single!(TokenKind::Ampersand),
            '|' => single!(TokenKind::Pipe),
            '^' => single!(TokenKind::Caret),

            '.' => single!(TokenKind::Dot),
            ',' => single!(TokenKind::Comma),
            ':' => single!(TokenKind::Colon),
            ';' => single!(TokenKind::Semicolon),

            // For the two-character operators we must inspect the *second*
            // character: nothing has been consumed yet, so `peek()` would
            // still return the character we just matched on.
            '=' => {
                if self.peek_nth(1) == Some('=') {
                    self.consume();
                    self.consume();
                    TokenKind::Equal
                } else {
                    self.consume();
                    TokenKind::Assign
                }
            }
            '!' => {
                if self.peek_nth(1) == Some('=') {
                    self.consume();
                    self.consume();
                    TokenKind::Unequal
                } else {
                    self.consume();
                    TokenKind::Bang
                }
            }
            '<' => {
                if self.peek_nth(1) == Some('=') {
                    self.consume();
                    self.consume();
                    TokenKind::LessThanOrEqual
                } else {
                    self.consume();
                    TokenKind::LessThan
                }
            }
            '>' => {
                if self.peek_nth(1) == Some('=') {
                    self.consume();
                    self.consume();
                    TokenKind::GreaterThanOrEqual
                } else {
                    self.consume();
                    TokenKind::GreaterThan
                }
            }

            '(' => single!(TokenKind::LeftParen),
            ')' => single!(TokenKind::RightParen),
            '{' => single!(TokenKind::LeftBrace),
            '}' => single!(TokenKind::RightBrace),
            '[' => single!(TokenKind::LeftBracket),
            ']' => single!(TokenKind::RightBracket),

            _ => single!(TokenKind::InvalidCharacter),
        };

        let span = self.span(start);
        let text = self.text(span);

        Token::new(kind, span, text)
    }
}
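
// The tokenizer is consumed as a plain `Iterator`: each call to `next` skips
// whitespace and comments, then dispatches on the first significant character.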
impl<'src> Iterator for Tokenizer<'src> {
    type Item = Token<'src>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            self.skip_whitespace();

            if self.peek() == Some('/') && self.peek_nth(1) == Some('/') {
                self.skip_line();
                continue;
            }

            if self.peek() == Some('/') && self.peek_nth(1) == Some('*') {
                self.skip_block_comment();
                continue;
            }

            let ch = self.peek()?;

            if ch.is_xid_start() || ch == '_' {
                return Some(self.next_identifier());
            }

            if ch.is_ascii_digit() {
                return Some(self.next_integer());
            }

            return Some(self.next_punctuation());
        }
    }
}
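
A minimal smoke test, sketched here rather than taken from this commit: it drives the tokenizer through the public API above, and the expected kinds follow from the signature line of example/main.bky.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lexes_a_function_header() {
        let kinds: Vec<TokenKind> = Tokenizer::new("fn fib(n: u64): u64 {")
            .map(|token| token.kind)
            .collect();

        assert_eq!(
            kinds,
            [
                TokenKind::KwFn,
                TokenKind::Identifier, // fib
                TokenKind::LeftParen,
                TokenKind::Identifier, // n
                TokenKind::Colon,
                TokenKind::Identifier, // u64
                TokenKind::RightParen,
                TokenKind::Colon,
                TokenKind::Identifier, // u64
                TokenKind::LeftBrace,
            ]
        );
    }
}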