feat: Add parsing for expressions.

This commit adds support for parsing expressions using the Pratt parsing
approach.
This commit is contained in:
2026-03-12 12:14:00 +01:00
parent 9ac8a79151
commit 93f08d1944
5 changed files with 632 additions and 3 deletions

125
src/ast.rs Normal file
View File

@@ -0,0 +1,125 @@
use std::fmt::Debug;
use crate::token::Span;
/// The [Phase] trait is used for type state. The AST can be in one of multiple
/// type states:
/// 1. [Parsed] - AST that was produced through parsing.
pub trait Phase {
    /// Per-phase payload attached to every [Expression] node; `()` for phases
    /// that carry no extra information (e.g. [Parsed]).
    type ExtraData: PartialEq + Debug;
}
/// See [Phase] for more information.
#[derive(Debug)]
pub struct Parsed;

impl Phase for Parsed {
    // Freshly parsed nodes carry no extra data.
    type ExtraData = ();
}

/// Convenience alias for an [Expression] in the [Parsed] phase.
pub type ParsedExpression = Expression<Parsed>;
/// This represents an expression in the source code. It holds the
/// [ExpressionKind], the [Span] and extra information according to the [Phase].
#[derive(Debug, PartialEq)]
pub struct Expression<P: Phase> {
    // Which kind of expression this node is (literal, unary, binary, ...).
    pub kind: ExpressionKind<P>,
    // Source region covered by the whole expression.
    pub span: Span,
    // Phase-dependent payload; `()` while merely [Parsed].
    pub extra: P::ExtraData,
}
/// Represents the different kinds of [Expression]s, e.g. literals, unary or
/// binary expressions.
#[derive(Debug, PartialEq)]
pub enum ExpressionKind<P: Phase> {
    /// A name, e.g. a variable or function identifier.
    Identifier(String),
    /// A string literal.
    LitString(String),
    /// An integer literal. Stored unsigned; a leading `-` is parsed as a
    /// separate [UnaryOp::Neg].
    LitInteger(u64),
    /// A boolean literal (`true` / `false`).
    LitBool(bool),
    /// A prefix operator applied to a single operand, e.g. `-x`.
    Unary {
        op: UnaryOp,
        /// Span of the operator token itself (the whole expression's span
        /// lives on the enclosing [Expression]).
        op_span: Span,
        operand: Box<Expression<P>>,
    },
    /// An infix operator applied to two operands, e.g. `a + b`.
    Binary {
        op: BinaryOp,
        /// Span of the operator token itself.
        op_span: Span,
        left: Box<Expression<P>>,
        right: Box<Expression<P>>,
    },
    /// A call expression, e.g. `f(a, b)`.
    Call {
        func: Box<Expression<P>>,
        args: Vec<Expression<P>>,
    },
    /// An index expression, e.g. `a[i]`.
    Index {
        expr: Box<Expression<P>>,
        index: Box<Expression<P>>,
    },
}
/// Prefix (single-operand) operators.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnaryOp {
    /// Bitwise Not
    BitNot,
    /// Logical Not
    Not,
    /// Negate
    Neg,
    /// Address Of
    AddrOf,
    /// Deref
    Deref,
}
/// Infix (two-operand) operators, including assignment and member access.
/// Precedence is not encoded here; it lives in the parser's binding-power
/// tables.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinaryOp {
    /// Addition
    Add,
    /// Subtraction
    Sub,
    /// Multiplication
    Mul,
    /// Division
    Div,
    /// Remainder
    Rem,
    /// Bitwise And
    BitAnd,
    /// Bitwise Or
    BitOr,
    /// Bitwise Xor
    BitXor,
    /// Bitwise Shift Left
    BitShl,
    /// Bitwise Shift Right
    BitShr,
    /// Logical And
    And,
    /// Logical Or
    Or,
    /// Equal
    Eq,
    /// Not Equal
    Ne,
    /// Less than
    Lt,
    /// Less than or Equal
    Le,
    /// Greater than
    Gt,
    /// Greater than or Equal
    Ge,
    /// Assign
    Assign,
    /// Member Access
    Dot,
}

196
src/diagnostic.rs Normal file
View File

@@ -0,0 +1,196 @@
use std::{fmt::Display, path::Path, process::exit};
use yansi::Paint;
use crate::token::Span;
/// How serious a [Diagnostic] is. Variants are declared from least to most
/// severe so the derived [Ord]/[PartialOrd] order matches severity order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Informational remark.
    Note,
    /// Suspicious, but reporting continues.
    Warning,
    /// A real error; reporting still returns to the caller.
    Error,
    /// Unrecoverable: [Diagnostic::report] terminates the process.
    Critical,
}
impl Display for Severity {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Severity::Note => write!(f, "{}", "Note".bold().cyan()),
Severity::Warning => write!(f, "{}", "Warning".bold().yellow()),
Severity::Error => write!(f, "{}", "Error".bold().red()),
Severity::Critical => write!(f, "{}", "Critical".bold().magenta()),
}
}
}
/// A single compiler message that can be rendered against the source text.
pub struct Diagnostic {
    // How severe the message is; `Critical` aborts the process on report.
    pub severity: Severity,
    // Primary source location; `None` for file-level messages.
    pub span: Option<Span>,
    // The headline message printed on the first line.
    pub message: String,
    // Extra labelled locations. A label whose span equals the primary span
    // is rendered inline with the primary underline; others get their own
    // snippet.
    pub labels: Vec<(Span, String)>,
}
impl Diagnostic {
    /// Creates a diagnostic with the given [Severity] and headline message,
    /// with no source location and no labels.
    pub fn new(severity: Severity, message: impl ToString) -> Self {
        Self {
            severity,
            span: None,
            message: message.to_string(),
            labels: Vec::new(),
        }
    }

    /// Attaches the primary source [Span] (builder style).
    pub fn with_span(mut self, span: Span) -> Self {
        self.span = Some(span);
        self
    }

    /// Adds a labelled secondary [Span] (builder style).
    pub fn add_label(mut self, span: Span, message: impl ToString) -> Self {
        self.labels.push((span, message.to_string()));
        self
    }

    /// Prints the diagnostic to stderr in a rustc-like format.
    ///
    /// If `severity` is [Severity::Critical] this terminates the process via
    /// `exit(-1)` (exit code 255 on Unix) after printing.
    pub fn report(self, file_name: &Path, source: &str) {
        // Headline: "<Severity>: <message>".
        eprintln!("{}: {}", self.severity, self.message.bold());
        // No primary span: print only the file name and stop.
        let Some(primary_span) = self.span else {
            eprintln!(" {} {}", "-->".bright_black(), file_name.display());
            if self.severity == Severity::Critical {
                exit(-1);
            }
            return;
        };
        // Guard: no source context available (e.g. critical error before any
        // file is read).
        if source.is_empty() || primary_span.start as usize >= source.len() {
            eprintln!(" {} {}", "-->".bright_black(), file_name.display());
            if self.severity == Severity::Critical {
                exit(-1);
            }
            return;
        }
        let (primary_line, primary_col) = get_line_col(source, primary_span.start);
        // Partition labels: those on the *exact same span* as the primary are
        // merged into the primary underline as inline text. All others are
        // rendered as separate snippets below the primary.
        let (same_span, other_span): (Vec<_>, Vec<_>) = self
            .labels
            .into_iter()
            .partition(|(s, _)| *s == primary_span);
        // Only the first same-span label is shown; any further ones are
        // silently dropped.
        let primary_label: Option<String> = same_span.into_iter().next().map(|(_, m)| m);
        // Gutter must be wide enough for the highest line number we'll print.
        let max_line = other_span
            .iter()
            .filter(|(s, _)| (s.start as usize) < source.len())
            .map(|(s, _)| get_line_col(source, s.start).0)
            .fold(primary_line, usize::max);
        let gutter_w = count_digits(max_line);
        let pad = " ".repeat(gutter_w);
        // " --> file:line:col"
        eprintln!(
            "{} {}:{}:{}",
            format!("{pad} -->").bright_black(),
            file_name.display(),
            primary_line,
            primary_col,
        );
        eprintln!("{}", format!("{pad} |").bright_black());
        // Primary snippet.
        render_snippet(
            source,
            primary_span,
            primary_label.as_deref(),
            gutter_w,
            self.severity,
        );
        // Additional-context labels (different locations); out-of-range spans
        // are skipped rather than panicking.
        for (span, msg) in &other_span {
            if (span.start as usize) < source.len() {
                render_snippet(source, *span, Some(msg.as_str()), gutter_w, Severity::Note);
            }
        }
        eprintln!("{}", format!("{pad} |").bright_black());
        if self.severity == Severity::Critical {
            exit(-1);
        }
    }
}
/// Render a single source-line snippet: the numbered source line followed by
/// a `^^^` underline. When `label` is `Some`, the text is appended after the
/// carets on the same line.
///
/// Callers must ensure `span.start` is a valid byte index into `source`
/// (both call sites in [Diagnostic::report] check this).
fn render_snippet(
    source: &str,
    span: Span,
    label: Option<&str>,
    gutter_w: usize,
    severity: Severity,
) {
    let (line_num, _) = get_line_col(source, span.start);
    let (line_start, line_content) = get_line_content(source, span.start);
    let pad = " ".repeat(gutter_w);
    let bar = format!("{}", "|".bright_black());
    let line_num_str = format!("{:>width$}", line_num, width = gutter_w);
    // "N | source text"
    eprintln!("{} {bar} {line_content}", line_num_str.bright_black());
    // Byte offsets of the underlined range, clamped to the current line.
    let start_byte = span.start as usize - line_start;
    let end_byte = (span.end as usize)
        .min(line_start + line_content.len())
        .saturating_sub(line_start)
        .max(start_byte);
    // Count *characters*, not bytes, so the caret padding and length stay
    // aligned on lines containing multi-byte (non-ASCII) text. This matches
    // the char-based column printed by `get_line_col`; for pure-ASCII lines
    // the result is identical to the old byte-based arithmetic.
    let spaces = " ".repeat(line_content[..start_byte].chars().count());
    let underline_len = line_content[start_byte..end_byte].chars().count().max(1);
    let carets = "^".repeat(underline_len);
    let colored_carets = paint_severity(&carets, severity);
    let label_text = label
        .map(|l| format!(" {}", paint_severity(l, severity)))
        .unwrap_or_default();
    // " | ^^^ label"
    eprintln!("{pad} {bar} {spaces}{colored_carets}{label_text}");
}
fn paint_severity(s: &str, severity: Severity) -> String {
match severity {
Severity::Note => format!("{}", s.bold().bright_cyan()),
Severity::Warning => format!("{}", s.bold().bright_yellow()),
Severity::Error | Severity::Critical => format!("{}", s.bold().bright_red()),
}
}
/// Returns the number of decimal digits needed to print `n` (at least 1).
fn count_digits(n: usize) -> usize {
    // `checked_ilog10` avoids allocating a temporary `String` just to measure
    // its length; it returns `None` for 0, which still prints as one digit.
    n.checked_ilog10().map_or(1, |d| d as usize + 1)
}
/// Returns `(line_start_byte, line_content)` for the line that contains
/// `position`. The returned content does *not* include the trailing newline.
fn get_line_content(source: &str, position: u32) -> (usize, &str) {
    let pos = position as usize;
    // Byte offset of the first character after the previous newline, or the
    // start of the source when `position` is on the first line.
    let start = match source[..pos].rfind('\n') {
        Some(nl) => nl + 1,
        None => 0,
    };
    // Slice up to (not including) the next newline, or to end of input.
    let tail = &source[start..];
    let content = match tail.find('\n') {
        Some(end) => &tail[..end],
        None => tail,
    };
    (start, content)
}
/// Returns the 1-based `(line, column)` of byte offset `position`. The
/// column counts characters (not bytes) on the containing line.
fn get_line_col(source: &str, position: u32) -> (usize, usize) {
    let head = &source[..position as usize];
    // Walk the prefix once, tracking both the newline count and where the
    // current line begins.
    let mut line = 1;
    let mut line_start = 0;
    for (idx, byte) in head.bytes().enumerate() {
        if byte == b'\n' {
            line += 1;
            line_start = idx + 1;
        }
    }
    let col = head[line_start..].chars().count() + 1;
    (line, col)
}

View File

@@ -2,11 +2,14 @@ use std::fs;
use crate::{
cli::{fatal, parse_args},
lexer::Lexer,
parser::Parser,
};
mod ast;
mod cli;
mod diagnostic;
mod lexer;
mod parser;
mod token;
fn main() {
@@ -25,8 +28,11 @@ fn main() {
};
println!("-- {} --", file.display());
for token in Lexer::new(&content) {
println!("{}", token);
let mut parser = Parser::new(&content);
match parser.parse_expression(0) {
Ok(ast) => println!("{ast:#?}"),
Err(diag) => diag.report(file, &content),
}
}
}

301
src/parser.rs Normal file
View File

@@ -0,0 +1,301 @@
use std::iter::Peekable;
use crate::ast;
use crate::diagnostic::{Diagnostic, Severity};
use crate::lexer::Lexer;
use crate::token::{Token, TokenKind};
/// The [Parser] consumes the [Token]s produced by the [Lexer] and constructs
/// an [ast] in the [ast::Parsed] phase.
pub struct Parser<'src> {
    // Lookahead-1 token stream over the source text.
    tokens: Peekable<Lexer<'src>>,
    // NOTE(review): never read or written anywhere in this file yet —
    // presumably reserved for multi-error recovery; confirm before relying
    // on it.
    errors: Vec<Diagnostic>,
}
impl<'src> Parser<'src> {
    /// Constructs a new [Parser] with the given source text.
    pub fn new(source: &'src str) -> Self {
        Self {
            tokens: Lexer::new(source).peekable(),
            errors: Vec::new(),
        }
    }

    /// Peek at the next [Token] without consuming it.
    /// Tokens are `Copy` (see `.copied()`), so this hands out a cheap copy.
    fn peek(&mut self) -> Option<Token<'src>> {
        self.tokens.peek().copied()
    }

    /// Peek at the next [Token] and return a [Diagnostic] if we reached the end of input.
    fn peek_no_eof(&mut self) -> Result<Token<'src>, Diagnostic> {
        self.peek()
            .ok_or_else(|| Diagnostic::new(Severity::Error, "unexpected end of input"))
    }

    /// Check if the peek [Token] is of a given [TokenKind].
    /// Returns `false` at end of input.
    fn is_peek(&mut self, kind: TokenKind) -> bool {
        self.peek().map_or(false, |tok| tok.is(kind))
    }

    /// Check if we have reached the end of input.
    fn is_at_eof(&mut self) -> bool {
        self.peek().is_none()
    }

    /// Consumes and returns the next [Token].
    /// This method panics if called at the end of input, so callers must
    /// have peeked first.
    fn advance(&mut self) -> Token<'src> {
        self.tokens.next().expect("failed to advance the parser")
    }

    /// Consumes and returns the next [Token], if it is of a given [TokenKind],
    /// otherwise returns an [Err].
    fn expect(&mut self, kind: TokenKind) -> Result<Token<'src>, Diagnostic> {
        match self.peek() {
            Some(tok) if tok.is(kind) => Ok(self.advance()),
            Some(tok) => Err(Diagnostic::new(Severity::Error, "unexpected token found")
                .with_span(tok.span)
                .add_label(
                    tok.span,
                    format!("expected {} but found {} instead", kind, tok.kind),
                )),
            None => Err(Diagnostic::new(Severity::Error, "unexpected end of input")),
        }
    }

    /// Parses an [ast::Expression] using the pratt parsing algorithm.
    ///
    /// `min_bp` is the minimum binding power: the loop below only consumes
    /// operators whose left binding power is at least `min_bp`, which is how
    /// precedence and associativity are encoded. Top-level callers pass `0`.
    pub fn parse_expression(&mut self, min_bp: u8) -> Result<ast::ParsedExpression, Diagnostic> {
        let peek_token = self.peek_no_eof()?;
        // Prefix position: either a unary operator applied recursively...
        let mut left = if let Some((right_bp, op)) = prefix_binding_power(peek_token.kind) {
            let op_span = self.advance().span;
            let operand = Box::new(self.parse_expression(right_bp)?);
            // The unary node spans from the operator to the end of its operand.
            let span = op_span.extend(operand.span);
            ast::ParsedExpression {
                kind: ast::ExpressionKind::Unary {
                    op,
                    op_span,
                    operand,
                },
                span,
                extra: (),
            }
        } else {
            // ...or a primary expression (identifier / literal).
            self.parse_primary_expression()?
        };
        // Operator loop: repeatedly fold `left` into postfix or infix nodes
        // while the next operator binds tightly enough.
        while let Some(peek_token) = self.peek() {
            // Postfix operators (call `(` and index `[`) are tried first.
            if let Some(left_bp) = postfix_binding_power(peek_token.kind) {
                if left_bp < min_bp {
                    break;
                }
                left = match peek_token.kind {
                    TokenKind::LParen => self.parse_call_expr(left)?,
                    TokenKind::LBracket => self.parse_index_expr(left)?,
                    // `postfix_binding_power` only matches the two kinds above.
                    _ => unreachable!(),
                };
                continue;
            }
            if let Some((left_bp, right_bp, op)) = infix_binding_power(peek_token.kind) {
                if left_bp < min_bp {
                    break;
                }
                let op_span = self.advance().span;
                // Recurse with the operator's *right* binding power; combined
                // with the strict `<` check above, `left < right` yields left
                // associativity and equal powers (assignment) yield right
                // associativity.
                let right = self.parse_expression(right_bp)?;
                let span = left.span.extend(right.span);
                left = ast::ParsedExpression {
                    kind: ast::ExpressionKind::Binary {
                        op,
                        op_span,
                        left: Box::new(left),
                        right: Box::new(right),
                    },
                    span,
                    extra: (),
                };
                continue;
            }
            // Not an operator we know: leave it for the caller.
            break;
        }
        Ok(left)
    }

    /// Parses a primary expression: an identifier, an integer literal or a
    /// boolean literal.
    ///
    /// NOTE(review): [ast::ExpressionKind::LitString] exists but string
    /// literals are not handled here, and parenthesized grouping is not
    /// parsed either — presumably both come in a later commit; confirm.
    fn parse_primary_expression(&mut self) -> Result<ast::ParsedExpression, Diagnostic> {
        let peek_token = self.peek_no_eof()?;
        match peek_token.kind {
            TokenKind::Identifier => {
                let name = self.advance().text.to_string();
                Ok(ast::ParsedExpression {
                    kind: ast::ExpressionKind::Identifier(name),
                    span: peek_token.span,
                    extra: (),
                })
            }
            TokenKind::LitInt => {
                let tok = self.advance();
                // Detect an optional radix prefix (hex/octal/binary);
                // fall back to base 10 with the full token text.
                let (radix, src) = [("0x", 16), ("0o", 8), ("0b", 2)]
                    .into_iter()
                    .find_map(|(prefix, radix)| {
                        tok.text.strip_prefix(prefix).map(|text| (radix, text))
                    })
                    .unwrap_or((10, tok.text));
                // Overflow or stray characters surface as a diagnostic rather
                // than a panic.
                let value = u64::from_str_radix(src, radix).map_err(|_| {
                    Diagnostic::new(Severity::Error, "invalid integer literal")
                        .with_span(tok.span)
                        .add_label(tok.span, "this is an invalid integer literal")
                })?;
                Ok(ast::ParsedExpression {
                    kind: ast::ExpressionKind::LitInteger(value),
                    span: tok.span,
                    extra: (),
                })
            }
            TokenKind::LitBool => {
                // The lexer guarantees the text is `true` or `false`;
                // anything not `true` maps to `false` here.
                let value = self.advance().text == "true";
                Ok(ast::ParsedExpression {
                    kind: ast::ExpressionKind::LitBool(value),
                    span: peek_token.span,
                    extra: (),
                })
            }
            _ => Err(Diagnostic::new(
                Severity::Error,
                format!(
                    "expected one of {}, {} or {} but found {} instead",
                    TokenKind::Identifier,
                    TokenKind::LitInt,
                    TokenKind::LitBool,
                    peek_token.kind
                ),
            )
            .with_span(peek_token.span)),
        }
    }

    /// Parses a [ast::ExpressionKind::Call] expression.
    /// `func` is the already-parsed callee; the current token is `(`.
    /// Trailing commas are not accepted.
    fn parse_call_expr(
        &mut self,
        func: ast::ParsedExpression,
    ) -> Result<ast::ParsedExpression, Diagnostic> {
        self.expect(TokenKind::LParen)?;
        let mut args = Vec::new();
        while !self.is_at_eof() && !self.is_peek(TokenKind::RParen) {
            // Arguments after the first must be preceded by a comma.
            if !args.is_empty() {
                self.expect(TokenKind::Comma)?;
            }
            args.push(self.parse_expression(0)?);
        }
        let rparen_token = self.expect(TokenKind::RParen)?;
        // The call node spans from the callee through the closing paren.
        let span = func.span.extend(rparen_token.span);
        Ok(ast::ParsedExpression {
            kind: ast::ExpressionKind::Call {
                func: Box::new(func),
                args,
            },
            span,
            extra: (),
        })
    }

    /// Parses an [ast::ExpressionKind::Index] expression.
    /// `expr` is the already-parsed indexee; the current token is `[`.
    fn parse_index_expr(
        &mut self,
        expr: ast::ParsedExpression,
    ) -> Result<ast::ParsedExpression, Diagnostic> {
        self.expect(TokenKind::LBracket)?;
        let index = self.parse_expression(0)?;
        let rbracket_token = self.expect(TokenKind::RBracket)?;
        // The index node spans from the indexee through the closing bracket.
        let span = expr.span.extend(rbracket_token.span);
        Ok(ast::ParsedExpression {
            kind: ast::ExpressionKind::Index {
                expr: Box::new(expr),
                index: Box::new(index),
            },
            span,
            extra: (),
        })
    }
}
/// Maps an infix operator token to its `(left, right)` binding powers and
/// the matching [ast::BinaryOp]; returns [None] for non-infix tokens.
///
/// With the strict `<` comparison in `parse_expression`, `left < right`
/// makes an operator left-associative, while equal powers (assignment)
/// make it right-associative. Higher numbers bind tighter.
fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> {
    match kind {
        TokenKind::Assign => Some((2, 2, ast::BinaryOp::Assign)),
        TokenKind::KwOr => Some((10, 11, ast::BinaryOp::Or)),
        TokenKind::KwAnd => Some((20, 21, ast::BinaryOp::And)),
        TokenKind::Pipe => Some((30, 31, ast::BinaryOp::BitOr)),
        TokenKind::Caret => Some((40, 41, ast::BinaryOp::BitXor)),
        TokenKind::Amp => Some((50, 51, ast::BinaryOp::BitAnd)),
        TokenKind::Eq => Some((55, 56, ast::BinaryOp::Eq)),
        TokenKind::Ne => Some((55, 56, ast::BinaryOp::Ne)),
        TokenKind::Lt => Some((57, 58, ast::BinaryOp::Lt)),
        TokenKind::Le => Some((57, 58, ast::BinaryOp::Le)),
        TokenKind::Gt => Some((57, 58, ast::BinaryOp::Gt)),
        TokenKind::Ge => Some((57, 58, ast::BinaryOp::Ge)),
        TokenKind::Plus => Some((60, 61, ast::BinaryOp::Add)),
        TokenKind::Minus => Some((60, 61, ast::BinaryOp::Sub)),
        TokenKind::Shl => Some((65, 66, ast::BinaryOp::BitShl)),
        TokenKind::Shr => Some((65, 66, ast::BinaryOp::BitShr)),
        TokenKind::Star => Some((70, 71, ast::BinaryOp::Mul)),
        TokenKind::Slash => Some((70, 71, ast::BinaryOp::Div)),
        TokenKind::Percent => Some((70, 71, ast::BinaryOp::Rem)),
        TokenKind::Dot => Some((90, 91, ast::BinaryOp::Dot)),
        _ => None,
    }
}
/// Maps a prefix operator token to its right binding power and the matching
/// [ast::UnaryOp]; returns [None] for tokens that are not prefix operators.
fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> {
    let op = match kind {
        TokenKind::Minus => ast::UnaryOp::Neg,
        TokenKind::Amp => ast::UnaryOp::AddrOf,
        TokenKind::Tilde => ast::UnaryOp::BitNot,
        TokenKind::Star => ast::UnaryOp::Deref,
        TokenKind::Bang => ast::UnaryOp::Not,
        _ => return None,
    };
    // Every prefix operator shares the same binding power (80), tighter than
    // all infix operators except member access.
    Some((80, op))
}
/// Returns the left binding power of a postfix operator token — `(` for
/// calls and `[` for indexing — or [None] for anything else.
fn postfix_binding_power(kind: TokenKind) -> Option<u8> {
    // Both postfix forms bind equally tightly (90), the same level as `.`.
    if matches!(kind, TokenKind::LParen | TokenKind::LBracket) {
        Some(90)
    } else {
        None
    }
}

1
test.bky Normal file
View File

@@ -0,0 +1 @@
foo.bar(12, 3) - 5