332 lines
7.6 KiB
Rust
332 lines
7.6 KiB
Rust
use std::str::Chars;
|
|
|
|
use multipeek::{IteratorExt, MultiPeek};
|
|
use unicode_xid::UnicodeXID;
|
|
|
|
/// A half-open range `start..end` identifying a region of the source text.
///
/// `end` is exclusive: the tokenizer slices its input with
/// `input[span.start..span.end]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Offset at which the region begins (inclusive).
    pub start: usize,
    /// Offset at which the region ends (exclusive).
    pub end: usize,
}

impl Span {
    /// Builds a span directly from its two bounds.
    pub const fn new(start: usize, end: usize) -> Self {
        Span { start, end }
    }

    /// Builds a span that begins at `start` and extends for `length`,
    /// i.e. one whose `end` is `start + length`.
    pub const fn from_offset_and_length(start: usize, length: usize) -> Self {
        let end = start + length;
        Self::new(start, end)
    }
}
|
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
pub struct Token<'src> {
|
|
pub kind: TokenKind,
|
|
pub span: Span,
|
|
pub text: &'src str,
|
|
}
|
|
|
|
impl<'src> Token<'src> {
|
|
pub const fn new(kind: TokenKind, span: Span, text: &'src str) -> Self {
|
|
Self { kind, span, text }
|
|
}
|
|
}
|
|
|
|
/// Every category of token the lexer distinguishes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// A character that starts no known token, or a `!` that is not
    /// followed by `=`.
    InvalidCharacter,

    // ---- Keywords -------------------------------------------------------
    /// The keyword `fn`.
    KwFn,
    /// The keyword `if`.
    KwIf,
    /// The keyword `let`.
    KwLet,
    /// The keyword `else`.
    KwElse,
    /// The keyword `loop`.
    KwLoop,
    /// The keyword `while`.
    KwWhile,
    /// The keyword `break`.
    KwBreak,
    /// The keyword `return`.
    KwReturn,

    // ---- Word operators -------------------------------------------------
    /// The keyword `not`.
    KwNot,
    /// The keyword `and`.
    KwAnd,
    /// The keyword `or`.
    KwOr,

    // ---- Names and literals ---------------------------------------------
    /// A name that is not one of the keywords above.
    Identifier,
    /// A run of ASCII digits.
    Integer,
    /// A boolean literal.
    ///
    /// NOTE(review): the tokenizer in this file never emits this kind;
    /// presumably it is assigned elsewhere. Confirm before relying on it.
    Boolean,

    // ---- Arithmetic and bitwise operators -------------------------------
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Asterisk,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `&`
    Ampersand,
    /// `|`
    Pipe,
    /// `^`
    Caret,
    /// `~`
    Tilde,

    // ---- Comparison operators -------------------------------------------
    /// `==`
    Equal,
    /// `!=`
    Unequal,
    /// `<`
    LessThan,
    /// `<=`
    LessThanOrEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterThanOrEqual,

    // ---- Other punctuation ----------------------------------------------
    /// `=`
    Assign,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `;`
    Semicolon,

    // ---- Delimiters -----------------------------------------------------
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
}
|
|
|
|
pub struct Tokenizer<'src> {
|
|
input: &'src str,
|
|
chars: MultiPeek<Chars<'src>>,
|
|
position: usize,
|
|
}
|
|
|
|
impl<'src> Tokenizer<'src> {
|
|
pub fn new(input: &'src str) -> Self {
|
|
Self {
|
|
input,
|
|
chars: input.chars().multipeek(),
|
|
position: 0,
|
|
}
|
|
}
|
|
|
|
fn span(&self, start: usize) -> Span {
|
|
Span::new(start, self.position)
|
|
}
|
|
|
|
fn text(&self, span: Span) -> &'src str {
|
|
&self.input[span.start..span.end]
|
|
}
|
|
|
|
fn peek(&mut self) -> Option<char> {
|
|
self.chars.peek().copied()
|
|
}
|
|
|
|
fn peek_nth(&mut self, n: usize) -> Option<char> {
|
|
self.chars.peek_nth(n).copied()
|
|
}
|
|
|
|
fn consume(&mut self) -> Option<char> {
|
|
let ch = self.chars.next()?;
|
|
self.position += 1;
|
|
Some(ch)
|
|
}
|
|
|
|
fn skip_whitespace(&mut self) {
|
|
while self.peek().is_some_and(char::is_whitespace) {
|
|
self.consume();
|
|
}
|
|
}
|
|
|
|
fn skip_line(&mut self) {
|
|
while self.peek().is_some_and(|ch| ch != '\n') {
|
|
self.consume();
|
|
}
|
|
|
|
self.consume();
|
|
}
|
|
|
|
fn skip_block_comment(&mut self) {
|
|
let mut indent = 1;
|
|
|
|
self.consume();
|
|
self.consume();
|
|
|
|
while indent > 0 {
|
|
let Some(peek_1st) = self.peek() else {
|
|
break;
|
|
};
|
|
|
|
let peek_2nd = self.peek_nth(1).unwrap_or('\0');
|
|
|
|
match (peek_1st, peek_2nd) {
|
|
('/', '*') => {
|
|
indent += 1;
|
|
self.consume();
|
|
self.consume();
|
|
}
|
|
('*', '/') => {
|
|
indent -= 1;
|
|
self.consume();
|
|
self.consume();
|
|
}
|
|
_ => {
|
|
self.consume();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
fn next_identifier(&mut self) -> Token<'src> {
|
|
let start = self.position;
|
|
|
|
self.consume();
|
|
while self.peek().is_some_and(|ch| ch.is_xid_continue()) {
|
|
self.consume();
|
|
}
|
|
|
|
let span = self.span(start);
|
|
let text = self.text(span);
|
|
|
|
let kind = match text {
|
|
"fn" => TokenKind::KwFn,
|
|
"if" => TokenKind::KwIf,
|
|
"let" => TokenKind::KwLet,
|
|
"else" => TokenKind::KwElse,
|
|
"loop" => TokenKind::KwLoop,
|
|
"while" => TokenKind::KwWhile,
|
|
"break" => TokenKind::KwBreak,
|
|
"return" => TokenKind::KwReturn,
|
|
|
|
"or" => TokenKind::KwOr,
|
|
"and" => TokenKind::KwAnd,
|
|
"not" => TokenKind::KwNot,
|
|
|
|
_ => TokenKind::Identifier,
|
|
};
|
|
|
|
Token::new(kind, span, text)
|
|
}
|
|
|
|
fn next_integer(&mut self) -> Token<'src> {
|
|
let start = self.position;
|
|
|
|
self.consume();
|
|
while self.peek().is_some_and(|ch| ch.is_ascii_digit()) {
|
|
self.consume();
|
|
}
|
|
|
|
let span = self.span(start);
|
|
let text = self.text(span);
|
|
|
|
Token::new(TokenKind::Integer, span, text)
|
|
}
|
|
|
|
fn next_punctuation(&mut self) -> Token<'src> {
|
|
let start = self.position;
|
|
|
|
macro_rules! single {
|
|
($kind:expr) => {{
|
|
self.consume();
|
|
$kind
|
|
}};
|
|
}
|
|
|
|
let kind = match self.peek().unwrap() {
|
|
'+' => single!(TokenKind::Plus),
|
|
'-' => single!(TokenKind::Minus),
|
|
'*' => single!(TokenKind::Asterisk),
|
|
'/' => single!(TokenKind::Slash),
|
|
'%' => single!(TokenKind::Percent),
|
|
'&' => single!(TokenKind::Ampersand),
|
|
'|' => single!(TokenKind::Pipe),
|
|
'^' => single!(TokenKind::Caret),
|
|
'~' => single!(TokenKind::Tilde),
|
|
|
|
'.' => single!(TokenKind::Dot),
|
|
',' => single!(TokenKind::Comma),
|
|
':' => single!(TokenKind::Colon),
|
|
';' => single!(TokenKind::Semicolon),
|
|
|
|
'=' => {
|
|
self.consume();
|
|
|
|
if self.peek() == Some('=') {
|
|
self.consume();
|
|
TokenKind::Equal
|
|
} else {
|
|
TokenKind::Assign
|
|
}
|
|
}
|
|
'!' => {
|
|
self.consume();
|
|
|
|
if self.peek() == Some('=') {
|
|
self.consume();
|
|
TokenKind::Unequal
|
|
} else {
|
|
TokenKind::InvalidCharacter
|
|
}
|
|
}
|
|
'<' => {
|
|
self.consume();
|
|
|
|
if self.peek() == Some('=') {
|
|
self.consume();
|
|
TokenKind::LessThanOrEqual
|
|
} else {
|
|
TokenKind::LessThan
|
|
}
|
|
}
|
|
'>' => {
|
|
self.consume();
|
|
|
|
if self.peek() == Some('=') {
|
|
self.consume();
|
|
TokenKind::GreaterThanOrEqual
|
|
} else {
|
|
TokenKind::GreaterThan
|
|
}
|
|
}
|
|
|
|
'(' => single!(TokenKind::LeftParen),
|
|
')' => single!(TokenKind::RightParen),
|
|
'{' => single!(TokenKind::LeftBrace),
|
|
'}' => single!(TokenKind::RightBrace),
|
|
'[' => single!(TokenKind::LeftBracket),
|
|
']' => single!(TokenKind::RightBracket),
|
|
|
|
_ => single!(TokenKind::InvalidCharacter),
|
|
};
|
|
|
|
let span = self.span(start);
|
|
let text = self.text(span);
|
|
|
|
Token::new(kind, span, text)
|
|
}
|
|
}
|
|
|
|
impl<'src> Iterator for Tokenizer<'src> {
|
|
type Item = Token<'src>;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
loop {
|
|
self.skip_whitespace();
|
|
|
|
if self.peek() == Some('/') && self.peek_nth(1) == Some('/') {
|
|
self.skip_line();
|
|
continue;
|
|
}
|
|
|
|
if self.peek() == Some('/') && self.peek_nth(1) == Some('*') {
|
|
self.skip_block_comment();
|
|
continue;
|
|
}
|
|
|
|
let ch = self.peek()?;
|
|
|
|
if ch.is_xid_start() || ch == '_' {
|
|
return Some(self.next_identifier());
|
|
}
|
|
|
|
if ch.is_ascii_digit() {
|
|
return Some(self.next_integer());
|
|
}
|
|
|
|
return Some(self.next_punctuation());
|
|
}
|
|
}
|
|
}
|