From 93f08d1944c2a16930405457a6e7aad5dccb72c6 Mon Sep 17 00:00:00 2001 From: Jooris Hadeler Date: Thu, 12 Mar 2026 12:14:00 +0100 Subject: [PATCH] feat: Add parsing for expressions. This commit adds support for parsing expression using the pratt parsing approach. --- src/ast.rs | 125 +++++++++++++++++++ src/diagnostic.rs | 196 ++++++++++++++++++++++++++++++ src/main.rs | 12 +- src/parser.rs | 301 ++++++++++++++++++++++++++++++++++++++++++++++ test.bky | 1 + 5 files changed, 632 insertions(+), 3 deletions(-) create mode 100644 src/ast.rs create mode 100644 src/diagnostic.rs create mode 100644 src/parser.rs create mode 100644 test.bky diff --git a/src/ast.rs b/src/ast.rs new file mode 100644 index 0000000..b87d181 --- /dev/null +++ b/src/ast.rs @@ -0,0 +1,125 @@ +use std::fmt::Debug; + +use crate::token::Span; + +/// The [Phase] trait is used for type state. The AST can be in one of multiple +/// type states: +/// 1. [Parsed] - AST that was produced through parsing. +pub trait Phase { + type ExtraData: PartialEq + Debug; +} + +/// See [Phase] for more information. +#[derive(Debug)] +pub struct Parsed; + +impl Phase for Parsed { + type ExtraData = (); +} + +pub type ParsedExpression = Expression; + +/// This represents an expression in the source code. It holds the +/// [ExpressionKind], the [Span] and extra information according to the [Phase]. +#[derive(Debug, PartialEq)] +pub struct Expression { + pub kind: ExpressionKind

, + pub span: Span, + pub extra: P::ExtraData, +} + +/// Represents the different kinds of [Expression]s, e.g. literals, unary or +/// binary expressions. +#[derive(Debug, PartialEq)] +pub enum ExpressionKind { + Identifier(String), + + LitString(String), + LitInteger(u64), + LitBool(bool), + + Unary { + op: UnaryOp, + op_span: Span, + operand: Box>, + }, + + Binary { + op: BinaryOp, + op_span: Span, + left: Box>, + right: Box>, + }, + + Call { + func: Box>, + args: Vec>, + }, + + Index { + expr: Box>, + index: Box>, + }, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum UnaryOp { + /// Bitwise Not + BitNot, + /// Logical Not + Not, + /// Negate + Neg, + /// Address Of + AddrOf, + /// Deref + Deref, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BinaryOp { + /// Addition + Add, + /// Subtraction + Sub, + /// Multiplication + Mul, + /// Division + Div, + /// Remainder + Rem, + + /// Bitwise And + BitAnd, + /// Bitwise Or + BitOr, + /// Bitwise Xor + BitXor, + /// Bitwise Shift Left + BitShl, + /// Bitwise Shift Right + BitShr, + + /// Logical And + And, + /// Logical Or + Or, + + /// Equal + Eq, + /// Not Equal + Ne, + /// Less than + Lt, + /// Less than or Equal + Le, + /// Greater than + Gt, + /// Greater than or Equal + Ge, + + /// Assign + Assign, + /// Member Access + Dot, +} diff --git a/src/diagnostic.rs b/src/diagnostic.rs new file mode 100644 index 0000000..efd7ca8 --- /dev/null +++ b/src/diagnostic.rs @@ -0,0 +1,196 @@ +use std::{fmt::Display, path::Path, process::exit}; + +use yansi::Paint; + +use crate::token::Span; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub enum Severity { + Note, + Warning, + Error, + Critical, +} + +impl Display for Severity { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Severity::Note => write!(f, "{}", "Note".bold().cyan()), + Severity::Warning => write!(f, "{}", "Warning".bold().yellow()), + Severity::Error => write!(f, "{}", "Error".bold().red()), + Severity::Critical => write!(f, "{}", "Critical".bold().magenta()), + } + } +} + +pub struct Diagnostic { + pub severity: Severity, + pub span: Option, + pub message: String, + pub labels: Vec<(Span, String)>, +} + +impl Diagnostic { + pub fn new(severity: Severity, message: impl ToString) -> Self { + Self { + severity, + span: None, + message: message.to_string(), + labels: Vec::new(), + } + } + + pub fn with_span(mut self, span: Span) -> Self { + self.span = Some(span); + self + } + + pub fn add_label(mut self, span: Span, message: impl ToString) -> Self { + self.labels.push((span, message.to_string())); + self + } + + pub fn report(self, file_name: &Path, source: &str) { + eprintln!("{}: {}", self.severity, self.message.bold()); + + let Some(primary_span) = self.span else { + eprintln!(" {} {}", "-->".bright_black(), file_name.display()); + if self.severity == Severity::Critical { + exit(-1); + } + return; + }; + + // Guard: no source context available (e.g. critical error before any + // file is read). + if source.is_empty() || primary_span.start as usize >= source.len() { + eprintln!(" {} {}", "-->".bright_black(), file_name.display()); + if self.severity == Severity::Critical { + exit(-1); + } + return; + } + + let (primary_line, primary_col) = get_line_col(source, primary_span.start); + + // Partition labels: those on the *exact same span* as the primary are + // merged into the primary underline as inline text. All others are + // rendered as separate snippets below the primary. + let (same_span, other_span): (Vec<_>, Vec<_>) = self + .labels + .into_iter() + .partition(|(s, _)| *s == primary_span); + + let primary_label: Option = same_span.into_iter().next().map(|(_, m)| m); + + // Gutter must be wide enough for the highest line number we'll print. + let max_line = other_span + .iter() + .filter(|(s, _)| (s.start as usize) < source.len()) + .map(|(s, _)| get_line_col(source, s.start).0) + .fold(primary_line, usize::max); + let gutter_w = count_digits(max_line); + let pad = " ".repeat(gutter_w); + + // " --> file:line:col" + eprintln!( + "{} {}:{}:{}", + format!("{pad} -->").bright_black(), + file_name.display(), + primary_line, + primary_col, + ); + eprintln!("{}", format!("{pad} |").bright_black()); + + // Primary snippet. + render_snippet( + source, + primary_span, + primary_label.as_deref(), + gutter_w, + self.severity, + ); + + // Additional-context labels (different locations). + for (span, msg) in &other_span { + if (span.start as usize) < source.len() { + render_snippet(source, *span, Some(msg.as_str()), gutter_w, Severity::Note); + } + } + + eprintln!("{}", format!("{pad} |").bright_black()); + + if self.severity == Severity::Critical { + exit(-1); + } + } +} + +/// Render a single source-line snippet: the numbered source line followed by +/// a `^^^` underline. When `label` is `Some`, the text is appended after the +/// carets on the same line. +fn render_snippet( + source: &str, + span: Span, + label: Option<&str>, + gutter_w: usize, + severity: Severity, +) { + let (line_num, _) = get_line_col(source, span.start); + let (line_start, line_content) = get_line_content(source, span.start); + + let pad = " ".repeat(gutter_w); + let bar = format!("{}", "|".bright_black()); + let line_num_str = format!("{:>width$}", line_num, width = gutter_w); + + // "N | source text" + eprintln!("{} {bar} {line_content}", line_num_str.bright_black()); + + // Caret underline, clamped to the current line. + let col_offset = span.start as usize - line_start; + let line_end_byte = line_start + line_content.len(); + let underline_len = (span.end as usize) + .min(line_end_byte) + .saturating_sub(span.start as usize) + .max(1); + + let spaces = " ".repeat(col_offset); + let carets = "^".repeat(underline_len); + let colored_carets = paint_severity(&carets, severity); + let label_text = label + .map(|l| format!(" {}", paint_severity(l, severity))) + .unwrap_or_default(); + + // " | ^^^label" + eprintln!("{pad} {bar} {spaces}{colored_carets}{label_text}"); +} + +fn paint_severity(s: &str, severity: Severity) -> String { + match severity { + Severity::Note => format!("{}", s.bold().bright_cyan()), + Severity::Warning => format!("{}", s.bold().bright_yellow()), + Severity::Error | Severity::Critical => format!("{}", s.bold().bright_red()), + } +} + +fn count_digits(n: usize) -> usize { + format!("{n}").len() +} + +/// Returns `(line_start_byte, line_content)` for the line that contains +/// `position`. The returned content does *not* include the trailing newline. +fn get_line_content(source: &str, position: u32) -> (usize, &str) { + let pos = position as usize; + let line_start = source[..pos].rfind('\n').map(|i| i + 1).unwrap_or(0); + let rest = &source[line_start..]; + let line_len = rest.find('\n').unwrap_or(rest.len()); + (line_start, &rest[..line_len]) +} + +fn get_line_col(source: &str, position: u32) -> (usize, usize) { + let prefix = &source[..position as usize]; + let line = prefix.bytes().filter(|&b| b == b'\n').count() + 1; + let line_start_byte = prefix.rfind('\n').map(|i| i + 1).unwrap_or(0); + let col = prefix[line_start_byte..].chars().count() + 1; + (line, col) +} diff --git a/src/main.rs b/src/main.rs index b6851bc..89d2dd3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,11 +2,14 @@ use std::fs; use crate::{ cli::{fatal, parse_args}, - lexer::Lexer, + parser::Parser, }; +mod ast; mod cli; +mod diagnostic; mod lexer; +mod parser; mod token; fn main() { @@ -25,8 +28,11 @@ fn main() { }; println!("-- {} --", file.display()); - for token in Lexer::new(&content) { - println!("{}", token); + let mut parser = Parser::new(&content); + + match parser.parse_expression(0) { + Ok(ast) => println!("{ast:#?}"), + Err(diag) => diag.report(file, &content), } } } diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..66dc7dd --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,301 @@ +use std::iter::Peekable; + +use crate::ast; +use crate::diagnostic::{Diagnostic, Severity}; +use crate::lexer::Lexer; +use crate::token::{Token, TokenKind}; + +/// The [Parser] consumes the [Token]s produced by the [Lexer] and constructs +/// an [ast] in the [ast::Parsed] phase. +pub struct Parser<'src> { + tokens: Peekable>, + errors: Vec, +} + +impl<'src> Parser<'src> { + /// Constructs a new [Parser] with the given source text. + pub fn new(source: &'src str) -> Self { + Self { + tokens: Lexer::new(source).peekable(), + errors: Vec::new(), + } + } + + /// Peek at the next [Token] without consuming it. + fn peek(&mut self) -> Option> { + self.tokens.peek().copied() + } + + /// Peek at the next [Token] and return a [Diagnostic] if we reached the end of input. + fn peek_no_eof(&mut self) -> Result, Diagnostic> { + self.peek() + .ok_or_else(|| Diagnostic::new(Severity::Error, "unexpected end of input")) + } + + /// Check if the peek [Token] is of a given [TokenKind]. + fn is_peek(&mut self, kind: TokenKind) -> bool { + self.peek().map_or(false, |tok| tok.is(kind)) + } + + /// Check if we have reached the end of input. + fn is_at_eof(&mut self) -> bool { + self.peek().is_none() + } + + /// Consumes and returns the next [Token]. + /// This method panics if called at the end of input. + fn advance(&mut self) -> Token<'src> { + self.tokens.next().expect("failed to advance the parser") + } + + /// Consumes and returns the next [Token], if it is of a given [TokenKind], + /// otherwise returns an [Err]. + fn expect(&mut self, kind: TokenKind) -> Result, Diagnostic> { + match self.peek() { + Some(tok) if tok.is(kind) => Ok(self.advance()), + Some(tok) => Err(Diagnostic::new(Severity::Error, "unexpected token found") + .with_span(tok.span) + .add_label( + tok.span, + format!("expected {} but found {} instead", kind, tok.kind), + )), + None => Err(Diagnostic::new(Severity::Error, "unexpected end of input")), + } + } + + /// Parses an [ast::Expression] using the pratt parsing algorithm. + pub fn parse_expression(&mut self, min_bp: u8) -> Result { + let peek_token = self.peek_no_eof()?; + + let mut left = if let Some((right_bp, op)) = prefix_binding_power(peek_token.kind) { + let op_span = self.advance().span; + let operand = Box::new(self.parse_expression(right_bp)?); + + let span = op_span.extend(operand.span); + + ast::ParsedExpression { + kind: ast::ExpressionKind::Unary { + op, + op_span, + operand, + }, + span, + extra: (), + } + } else { + self.parse_primary_expression()? + }; + + while let Some(peek_token) = self.peek() { + if let Some(left_bp) = postfix_binding_power(peek_token.kind) { + if left_bp < min_bp { + break; + } + + left = match peek_token.kind { + TokenKind::LParen => self.parse_call_expr(left)?, + TokenKind::LBracket => self.parse_index_expr(left)?, + + _ => unreachable!(), + }; + + continue; + } + + if let Some((left_bp, right_bp, op)) = infix_binding_power(peek_token.kind) { + if left_bp < min_bp { + break; + } + + let op_span = self.advance().span; + + let right = self.parse_expression(right_bp)?; + let span = left.span.extend(right.span); + + left = ast::ParsedExpression { + kind: ast::ExpressionKind::Binary { + op, + op_span, + left: Box::new(left), + right: Box::new(right), + }, + span, + extra: (), + }; + + continue; + } + + break; + } + + Ok(left) + } + + /// Parses a primary expression, e.g. literals, unary or grouped expression. + fn parse_primary_expression(&mut self) -> Result { + let peek_token = self.peek_no_eof()?; + + match peek_token.kind { + TokenKind::Identifier => { + let name = self.advance().text.to_string(); + + Ok(ast::ParsedExpression { + kind: ast::ExpressionKind::Identifier(name), + span: peek_token.span, + extra: (), + }) + } + TokenKind::LitInt => { + let tok = self.advance(); + + let (radix, src) = [("0x", 16), ("0o", 8), ("0b", 2)] + .into_iter() + .find_map(|(prefix, radix)| { + tok.text.strip_prefix(prefix).map(|text| (radix, text)) + }) + .unwrap_or((10, tok.text)); + + let value = u64::from_str_radix(src, radix).map_err(|_| { + Diagnostic::new(Severity::Error, "invalid integer literal") + .with_span(tok.span) + .add_label(tok.span, "this is an invalid integer literal") + })?; + + Ok(ast::ParsedExpression { + kind: ast::ExpressionKind::LitInteger(value), + span: tok.span, + extra: (), + }) + } + TokenKind::LitBool => { + let value = self.advance().text == "true"; + + Ok(ast::ParsedExpression { + kind: ast::ExpressionKind::LitBool(value), + span: peek_token.span, + extra: (), + }) + } + + _ => Err(Diagnostic::new( + Severity::Error, + format!( + "expected one of {}, {} or {} but found {} instead", + TokenKind::Identifier, + TokenKind::LitInt, + TokenKind::LitBool, + peek_token.kind + ), + ) + .with_span(peek_token.span)), + } + } + + /// Parses a [ast::ExpressionKind::Call] expression. + fn parse_call_expr( + &mut self, + func: ast::ParsedExpression, + ) -> Result { + self.expect(TokenKind::LParen)?; + let mut args = Vec::new(); + + while !self.is_at_eof() && !self.is_peek(TokenKind::RParen) { + if !args.is_empty() { + self.expect(TokenKind::Comma)?; + } + + args.push(self.parse_expression(0)?); + } + + let rparen_token = self.expect(TokenKind::RParen)?; + let span = func.span.extend(rparen_token.span); + + Ok(ast::ParsedExpression { + kind: ast::ExpressionKind::Call { + func: Box::new(func), + args, + }, + span, + extra: (), + }) + } + + /// Parses an [ast::ExpressionKind::Index] expression. + fn parse_index_expr( + &mut self, + expr: ast::ParsedExpression, + ) -> Result { + self.expect(TokenKind::LBracket)?; + + let index = self.parse_expression(0)?; + + let rbracket_token = self.expect(TokenKind::RBracket)?; + let span = expr.span.extend(rbracket_token.span); + + Ok(ast::ParsedExpression { + kind: ast::ExpressionKind::Index { + expr: Box::new(expr), + index: Box::new(index), + }, + span, + extra: (), + }) + } +} + +fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> { + Some(match kind { + TokenKind::Assign => (2, 2, ast::BinaryOp::Assign), + + TokenKind::KwOr => (10, 11, ast::BinaryOp::Or), + TokenKind::KwAnd => (20, 21, ast::BinaryOp::And), + + TokenKind::Pipe => (30, 31, ast::BinaryOp::BitOr), + TokenKind::Caret => (40, 41, ast::BinaryOp::BitXor), + TokenKind::Amp => (50, 51, ast::BinaryOp::BitAnd), + + TokenKind::Eq => (55, 56, ast::BinaryOp::Eq), + TokenKind::Ne => (55, 56, ast::BinaryOp::Ne), + + TokenKind::Lt => (57, 58, ast::BinaryOp::Lt), + TokenKind::Le => (57, 58, ast::BinaryOp::Le), + TokenKind::Gt => (57, 58, ast::BinaryOp::Gt), + TokenKind::Ge => (57, 58, ast::BinaryOp::Ge), + + TokenKind::Plus => (60, 61, ast::BinaryOp::Add), + TokenKind::Minus => (60, 61, ast::BinaryOp::Sub), + + TokenKind::Shl => (65, 66, ast::BinaryOp::BitShl), + TokenKind::Shr => (65, 66, ast::BinaryOp::BitShr), + + TokenKind::Star => (70, 71, ast::BinaryOp::Mul), + TokenKind::Slash => (70, 71, ast::BinaryOp::Div), + TokenKind::Percent => (70, 71, ast::BinaryOp::Rem), + + TokenKind::Dot => (90, 91, ast::BinaryOp::Dot), + + _ => return None, + }) +} + +fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> { + Some(match kind { + TokenKind::Minus => (80, ast::UnaryOp::Neg), + TokenKind::Amp => (80, ast::UnaryOp::AddrOf), + TokenKind::Tilde => (80, ast::UnaryOp::BitNot), + TokenKind::Star => (80, ast::UnaryOp::Deref), + TokenKind::Bang => (80, ast::UnaryOp::Not), + + _ => return None, + }) +} + +fn postfix_binding_power(kind: TokenKind) -> Option { + Some(match kind { + TokenKind::LParen => 90, + TokenKind::LBracket => 90, + + _ => return None, + }) +} diff --git a/test.bky b/test.bky new file mode 100644 index 0000000..534e282 --- /dev/null +++ b/test.bky @@ -0,0 +1 @@ +foo.bar(12, 3) - 5