From 4981a78a8b885109fffac81e33180672852ddffe Mon Sep 17 00:00:00 2001
From: Jooris Hadeler
Date: Mon, 20 Apr 2026 21:41:58 +0200
Subject: [PATCH] feat: add ast and parser implementation

---
 PLAN.md                |  16 +-
 src/frontend/ast.rs    | 103 ++++++
 src/frontend/mod.rs    |   2 +
 src/frontend/parser.rs | 767 +++++++++++++++++++++++++++++++++++++++++
 src/frontend/token.rs  |   2 +
 5 files changed, 881 insertions(+), 9 deletions(-)
 create mode 100644 src/frontend/ast.rs
 create mode 100644 src/frontend/parser.rs

diff --git a/PLAN.md b/PLAN.md
index daee6eb..e4bccf0 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -15,15 +15,13 @@ A Rust-flavored, C-targeting language - built pipeline-first.
 
 ## Phase 2 - Parser
 
-- [ ] Write grammar for the base subset
-  - `fn` declarations, `return`, `let`, int/bool literals
-  - Arithmetic (`+`, `-`, `*`, `/`), comparison (`==`, `!=`, `<`, `>`)
-  - Function call expressions
-- [ ] Implement recursive-descent parser
-- [ ] Build typed AST: `FnDecl`, `Block`, `ReturnStmt`, `LetStmt`, `BinExpr`, `CallExpr`, `Literal`, `Ident`
-- [ ] Attach source spans to every AST node
-- [ ] Emit structured parse errors with span info
-- [ ] Unit-test: parse valid snippets, expect correct AST shapes
+- [x] Write grammar for the base subset
+  - `fn` declarations, `return`, int/bool literals
+  - Arithmetic (`+`, `-`, `*`, `/`)
+- [x] Implement recursive-descent parser
+- [x] Attach source spans to every AST node
+- [x] Emit structured parse errors with span info
+- [x] Unit-test: parse valid snippets, expect correct AST shapes
 
 ## Phase 3 - Semantic Analysis
 
diff --git a/src/frontend/ast.rs b/src/frontend/ast.rs
new file mode 100644
index 0000000..3c1d664
--- /dev/null
+++ b/src/frontend/ast.rs
@@ -0,0 +1,103 @@
+use crate::frontend::token::Span;
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Module {
+    pub decls: Vec<Decl>,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Decl {
+    pub kind: DeclKind,
+    pub span: Span,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum DeclKind {
+    Function {
+        name: String,
+        name_span: Span,
+        params: Vec<FunctionParam>,
+        return_type: Option<Type>,
+        body: Stmt,
+    },
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct FunctionParam {
+    pub name: String,
+    pub name_span: Span,
+    pub ty: Type,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Type {
+    pub kind: TypeKind,
+    pub span: Span,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum TypeKind {
+    I8,
+    I16,
+    I32,
+    I64,
+    U8,
+    U16,
+    U32,
+    U64,
+    Bool,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Stmt {
+    pub kind: StmtKind,
+    pub span: Span,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum StmtKind {
+    Compound { inner: Vec<Stmt> },
+    Return { value: Option<Expr> },
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Expr {
+    pub kind: ExprKind,
+    pub span: Span,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum ExprKind {
+    Identifier {
+        name: String,
+    },
+    Integer {
+        value: u64,
+    },
+    Boolean {
+        value: bool,
+    },
+    Unary {
+        op: UnaryOp,
+        expr: Box<Expr>,
+    },
+    Binary {
+        op: BinaryOp,
+        lhs: Box<Expr>,
+        rhs: Box<Expr>,
+    },
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum UnaryOp {
+    Neg,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BinaryOp {
+    Add,
+    Sub,
+    Mul,
+    Div,
+    Rem,
+}
diff --git a/src/frontend/mod.rs b/src/frontend/mod.rs
index e12719b..1b8d4c4 100644
--- a/src/frontend/mod.rs
+++ b/src/frontend/mod.rs
@@ -1,2 +1,4 @@
+pub mod ast;
 pub mod lexer;
+pub mod parser;
 pub mod token;
diff --git a/src/frontend/parser.rs b/src/frontend/parser.rs
new file mode 100644
index 0000000..b0b81ac
--- /dev/null
+++ b/src/frontend/parser.rs
@@ -0,0 +1,767 @@
+use std::{fmt::Display, iter::Peekable};
+
+use crate::frontend::{
+    ast::*,
+    lexer::Lexer,
+    token::{Span, Token, TokenKind},
+};
+
+/// A structured error produced during parsing, carrying a human-readable
+/// message and the [Span] of the offending token for precise diagnostics.
+#[derive(Debug, PartialEq, Eq)]
+pub struct ParseError {
+    /// Human-readable description of what went wrong.
+    pub message: String,
+    /// Source location of the offending token.
+    pub span: Span,
+}
+
+impl ParseError {
+    /// Creates a new [ParseError] with the given message and source span.
+    pub fn new(message: impl Display, span: Span) -> Self {
+        Self {
+            message: message.to_string(),
+            span,
+        }
+    }
+}
+
+/// Convenience alias for parser operations that may fail with a [ParseError].
+type ParseResult<T> = Result<T, ParseError>;
+
+/// Consumes a [`Lexer`] token stream and builds an AST.
+///
+/// The parser operates as a recursive-descent parser, peeking one token
+/// ahead to make branching decisions without backtracking.
+pub struct Parser<'src> {
+    lexer: Peekable<Lexer<'src>>,
+    errors: Vec<ParseError>,
+}
+
+impl<'src> Parser<'src> {
+    /// Creates a new [Parser] with the given source text.
+    pub fn new(source: &'src str) -> Self {
+        Self {
+            lexer: Lexer::new(source).peekable(),
+            errors: Vec::new(),
+        }
+    }
+
+    /// Finishes parsing, returning the recorded errors, or `None` if none have occurred.
+    pub fn finish(self) -> Option<Vec<ParseError>> {
+        (!self.errors.is_empty()).then_some(self.errors)
+    }
+
+    /// Advances the lexer and returns the next [`Token`], or `None` at end of file.
+    fn advance(&mut self) -> Option<Token<'src>> {
+        self.lexer.next()
+    }
+
+    /// Returns a copy of the next [Token] without consuming it, or `None` at end of file.
+    fn peek(&mut self) -> Option<Token<'src>> {
+        self.lexer.peek().copied()
+    }
+
+    /// Returns a copy of the next [Token], or `Err` at end of file.
+    fn peek_no_eof(&mut self) -> ParseResult<Token<'src>> {
+        self.peek()
+            .ok_or_else(|| ParseError::new("unexpected end of file", Span::EOF))
+    }
+
+    /// Returns `true` if we have reached the end of file.
+    fn is_at_eof(&mut self) -> bool {
+        self.lexer.peek().is_none()
+    }
+
+    /// Returns `true` if the next [Token] matches the given [TokenKind].
+    fn is_peek(&mut self, kind: TokenKind) -> bool {
+        self.peek().is_some_and(|tok| tok.kind == kind)
+    }
+
+    /// Consumes the next [Token] and returns it if it matches `expected`; otherwise leaves
+    /// it in place and returns a [ParseError] describing the mismatch or unexpected EOF.
+    fn expect(&mut self, expected: TokenKind) -> ParseResult<Token<'src>> {
+        let token = self.peek_no_eof()?;
+
+        if token.kind != expected {
+            return Err(ParseError::new(
+                format!("expected {} but found {} instead", expected, token.kind),
+                token.span,
+            ));
+        }
+
+        self.advance();
+
+        Ok(token)
+    }
+
+    /// Skips tokens until the next one is listed in `kinds` or the end of file is reached.
+    fn synchronize(&mut self, kinds: &[TokenKind]) {
+        while self.peek().is_some_and(|tok| !kinds.contains(&tok.kind)) {
+            self.advance();
+        }
+    }
+
+    // ====== Recursive Descent Parser Implementation ======
+
+    /// Parses a module, recording errors and resuming at the next `fn` token.
+    ///
+    /// ```ebnf
+    /// module = { decl } ;
+    /// ```
+    pub fn parse_module(&mut self) -> Module {
+        let mut decls = Vec::new();
+
+        while !self.is_at_eof() {
+            match self.parse_decl() {
+                Ok(decl) => decls.push(decl),
+                Err(err) => {
+                    self.errors.push(err);
+                    self.synchronize(&[TokenKind::Fn]);
+                }
+            }
+        }
+
+        Module { decls }
+    }
+
+    /// Parses a declaration.
+    ///
+    /// ```ebnf
+    /// decl = function_decl ;
+    /// ```
+    pub fn parse_decl(&mut self) -> ParseResult<Decl> {
+        let peek_token = self.peek_no_eof()?;
+
+        match peek_token.kind {
+            TokenKind::Fn => self.parse_function_decl(),
+
+            _ => Err(ParseError::new(
+                format!(
+                    "expected a declaration but found {} instead",
+                    peek_token.kind
+                ),
+                peek_token.span,
+            )),
+        }
+    }
+
+    /// Parses a function declaration.
+    ///
+    /// ```ebnf
+    /// function_decl = "fn" IDENTIFIER "(" function_params ")" [ "->" type ] compound_stmt ;
+    /// ```
+    fn parse_function_decl(&mut self) -> ParseResult<Decl> {
+        let fn_token = self.expect(TokenKind::Fn)?;
+
+        let (name, name_span) = {
+            let ident_token = self.expect(TokenKind::Identifier)?;
+            (ident_token.text.to_string(), ident_token.span)
+        };
+
+        self.expect(TokenKind::LParen)?;
+        let params = self.parse_function_params()?;
+        self.expect(TokenKind::RParen)?;
+
+        let return_type = if self.is_peek(TokenKind::Arrow) {
+            self.advance();
+            Some(self.parse_type()?)
+        } else {
+            None
+        };
+
+        let body = self.parse_compound_stmt()?;
+        let span = fn_token.span.join(body.span);
+
+        Ok(Decl {
+            kind: DeclKind::Function {
+                name,
+                name_span,
+                params,
+                return_type,
+                body,
+            },
+            span,
+        })
+    }
+
+    /// Parses the function parameter list, stopping before the closing `)`.
+    ///
+    /// ```ebnf
+    /// function_params = [ function_param { "," function_param } ] ;
+    /// function_param = IDENTIFIER ":" type ;
+    /// ```
+    fn parse_function_params(&mut self) -> ParseResult<Vec<FunctionParam>> {
+        let mut params = Vec::new();
+
+        while !self.is_at_eof() && !self.is_peek(TokenKind::RParen) {
+            if !params.is_empty() {
+                self.expect(TokenKind::Comma)?;
+            }
+
+            let (name, name_span) = {
+                let ident_token = self.expect(TokenKind::Identifier)?;
+                (ident_token.text.to_string(), ident_token.span)
+            };
+
+            self.expect(TokenKind::Colon)?;
+
+            let ty = self.parse_type()?;
+
+            params.push(FunctionParam {
+                name,
+                name_span,
+                ty,
+            });
+        }
+
+        Ok(params)
+    }
+
+    /// Parses a type.
+    ///
+    /// ```ebnf
+    /// type = "i8" | "i16" | "i32" | "i64"
+    ///      | "u8" | "u16" | "u32" | "u64"
+    ///      | "bool" ;
+    /// ```
+    pub fn parse_type(&mut self) -> ParseResult<Type> {
+        let peek_token = self.peek_no_eof()?;
+
+        let kind = match peek_token.kind {
+            TokenKind::I8 => {
+                self.advance();
+                TypeKind::I8
+            }
+            TokenKind::I16 => {
+                self.advance();
+                TypeKind::I16
+            }
+            TokenKind::I32 => {
+                self.advance();
+                TypeKind::I32
+            }
+            TokenKind::I64 => {
+                self.advance();
+                TypeKind::I64
+            }
+            TokenKind::U8 => {
+                self.advance();
+                TypeKind::U8
+            }
+            TokenKind::U16 => {
+                self.advance();
+                TypeKind::U16
+            }
+            TokenKind::U32 => {
+                self.advance();
+                TypeKind::U32
+            }
+            TokenKind::U64 => {
+                self.advance();
+                TypeKind::U64
+            }
+            TokenKind::Bool => {
+                self.advance();
+                TypeKind::Bool
+            }
+
+            _ => {
+                return Err(ParseError::new(
+                    format!("expected a type but found {} instead", peek_token.kind),
+                    peek_token.span,
+                ));
+            }
+        };
+
+        Ok(Type {
+            kind,
+            span: peek_token.span,
+        })
+    }
+
+    /// Parses a statement, dispatching on the next token without consuming it.
+    ///
+    /// ```ebnf
+    /// stmt = compound_stmt
+    ///      | return_stmt ;
+    /// ```
+    pub fn parse_stmt(&mut self) -> ParseResult<Stmt> {
+        let peek_token = self.peek_no_eof()?;
+
+        match peek_token.kind {
+            TokenKind::LBrace => self.parse_compound_stmt(),
+            TokenKind::Return => self.parse_return_stmt(),
+
+            _ => Err(ParseError::new(
+                format!("expected a statement but found {} instead", peek_token.kind),
+                peek_token.span,
+            )),
+        }
+    }
+
+    /// Parses a compound statement.
+    ///
+    /// ```ebnf
+    /// compound_stmt = "{" { stmt } "}";
+    /// ```
+    fn parse_compound_stmt(&mut self) -> ParseResult<Stmt> {
+        let lbrace_token = self.expect(TokenKind::LBrace)?;
+        let mut inner = Vec::new();
+
+        while !self.is_at_eof() && !self.is_peek(TokenKind::RBrace) {
+            match self.parse_stmt() {
+                Ok(stmt) => inner.push(stmt),
+                Err(error) => {
+                    self.errors.push(error);
+
+                    // Skip to a statement border; a `;` border is consumed, others are kept.
+                    self.synchronize(&[TokenKind::Semicolon, TokenKind::Return, TokenKind::RBrace]);
+                    if self.is_peek(TokenKind::Semicolon) {
+                        self.advance();
+                    }
+                }
+            }
+        }
+
+        let rbrace_token = self.expect(TokenKind::RBrace)?;
+        let span = lbrace_token.span.join(rbrace_token.span);
+
+        Ok(Stmt {
+            kind: StmtKind::Compound { inner },
+            span,
+        })
+    }
+
+    /// Parses a return statement; its span runs from `return` through the `;`.
+    ///
+    /// ```ebnf
+    /// return_stmt = "return" [ expr ] ";" ;
+    /// ```
+    fn parse_return_stmt(&mut self) -> ParseResult<Stmt> {
+        let return_token = self.expect(TokenKind::Return)?;
+
+        let value = if !self.is_peek(TokenKind::Semicolon) {
+            Some(self.parse_expr()?)
+        } else {
+            None
+        };
+
+        let semi_token = self.expect(TokenKind::Semicolon)?;
+        let span = return_token.span.join(semi_token.span);
+
+        Ok(Stmt {
+            kind: StmtKind::Return { value },
+            span,
+        })
+    }
+
+    // ====== Pratt Parsing Implementation ======
+
+    /// Parses an expression, starting from the lowest binding power.
+    pub fn parse_expr(&mut self) -> ParseResult<Expr> {
+        self.parse_expr_bp(0)
+    }
+
+    /// Pratt parsing implementation for expressions.
+    fn parse_expr_bp(&mut self, min_bp: u8) -> ParseResult<Expr> {
+        let mut lhs = self.parse_leading_expr()?;
+
+        loop {
+            // End of input ends the expression; the caller reports any missing terminator.
+            let Some(next) = self.peek() else { break };
+            let Some((op, left_bp, right_bp)) = self.infix_operator(next.kind) else {
+                break; // Not an infix operator
+            };
+
+            if left_bp < min_bp {
+                break; // The operator binds less tightly than the current context
+            }
+
+            self.advance(); // consume the operator
+
+            let rhs = self.parse_expr_bp(right_bp)?;
+            let span = lhs.span.join(rhs.span);
+
+            lhs = Expr {
+                kind: ExprKind::Binary {
+                    op,
+                    lhs: Box::new(lhs),
+                    rhs: Box::new(rhs),
+                },
+                span,
+            };
+        }
+
+        Ok(lhs)
+    }
+
+    /// Parses a leading expression: an identifier, an integer or boolean literal,
+    /// a parenthesized expression, or a prefix-operator expression.
+    fn parse_leading_expr(&mut self) -> ParseResult<Expr> {
+        let peek_token = self.peek_no_eof()?;
+
+        match peek_token.kind {
+            TokenKind::Identifier => {
+                let token = self.advance().unwrap();
+
+                Ok(Expr {
+                    kind: ExprKind::Identifier {
+                        name: token.text.to_string(),
+                    },
+                    span: token.span,
+                })
+            }
+
+            TokenKind::IntegerLit => {
+                let token = self.advance().unwrap();
+                let text = token.text;
+
+                let value = if text.starts_with("0x") || text.starts_with("0X") {
+                    u64::from_str_radix(&text[2..], 16)
+                } else if text.starts_with("0o") || text.starts_with("0O") {
+                    u64::from_str_radix(&text[2..], 8)
+                } else if text.starts_with("0b") || text.starts_with("0B") {
+                    u64::from_str_radix(&text[2..], 2)
+                } else {
+                    text.parse()
+                }
+                .map_err(|_| ParseError::new("invalid integer literal", token.span))?;
+
+                Ok(Expr {
+                    kind: ExprKind::Integer { value },
+                    span: token.span,
+                })
+            }
+
+            TokenKind::BooleanLit => {
+                let token = self.advance().unwrap();
+
+                Ok(Expr {
+                    kind: ExprKind::Boolean {
+                        value: token.text == "true",
+                    },
+                    span: token.span,
+                })
+            }
+
+            TokenKind::LParen => {
+                let lparen = self.advance().unwrap();
+                let expr = self.parse_expr_bp(0)?;
+                let rparen = self.expect(TokenKind::RParen)?;
+
+                Ok(Expr {
+                    kind: expr.kind,
+                    span: lparen.span.join(rparen.span),
+                })
+            }
+
+            kind if let Some((op, r_bp)) = self.prefix_operator(kind) => {
+                let op_token = self.advance().unwrap();
+                let rhs = self.parse_expr_bp(r_bp)?;
+
+                Ok(Expr {
+                    span: op_token.span.join(rhs.span),
+                    kind: ExprKind::Unary {
+                        op,
+                        expr: Box::new(rhs),
+                    },
+                })
+            }
+
+            _ => Err(ParseError::new(
+                format!("expected an expression but found {}", peek_token.kind),
+                peek_token.span,
+            )),
+        }
+    }
+
+    /// Returns the [UnaryOp] and right binding power of a prefix operator,
+    /// or `None` if the [TokenKind] is not a valid prefix operator.
+    fn prefix_operator(&self, op: TokenKind) -> Option<(UnaryOp, u8)> {
+        match op {
+            TokenKind::Minus => Some((UnaryOp::Neg, 30)),
+
+            _ => None,
+        }
+    }
+
+    /// Returns the [BinaryOp], left and right binding powers of an infix operator,
+    /// or `None` if the [TokenKind] is not a valid infix operator.
+    fn infix_operator(&self, op: TokenKind) -> Option<(BinaryOp, u8, u8)> {
+        match op {
+            TokenKind::Plus => Some((BinaryOp::Add, 10, 11)),
+            TokenKind::Minus => Some((BinaryOp::Sub, 10, 11)),
+
+            TokenKind::Star => Some((BinaryOp::Mul, 20, 21)),
+            TokenKind::Slash => Some((BinaryOp::Div, 20, 21)),
+            TokenKind::Percent => Some((BinaryOp::Rem, 20, 21)),
+
+            _ => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::fmt::Debug;
+
+    use crate::frontend::{
+        ast::*,
+        parser::{ParseError, ParseResult, Parser},
+        token::Span,
+    };
+
+    #[derive(Debug, PartialEq, Eq)]
+    enum TestResult<T> {
+        Success(T),
+        Recovered(T, Vec<ParseError>),
+        Error(Vec<ParseError>),
+    }
+
+    use TestResult::*;
+
+    fn parse<'src, T: Debug + Eq + PartialEq>(
+        source: &'src str,
+        method: impl Fn(&mut Parser<'src>) -> ParseResult<T>,
+    ) -> TestResult<T> {
+        let mut parser = Parser::new(source);
+
+        match method(&mut parser) {
+            Ok(result) => {
+                if parser.errors.is_empty() {
+                    TestResult::Success(result)
+                } else {
+                    TestResult::Recovered(result, parser.errors)
+                }
+            }
+            Err(error) => {
+                parser.errors.push(error);
+                TestResult::Error(parser.errors)
+            }
+        }
+    }
+
+    #[test]
+    fn integer_literals() {
+        assert_eq!(
+            parse("0xBEEF;", Parser::parse_expr), // hexadecimal prefix
+            Success(Expr {
+                kind: ExprKind::Integer { value: 0xBEEF },
+                span: Span::new(0, 6)
+            })
+        );
+
+        assert_eq!(
+            parse("0o777;", Parser::parse_expr), // octal prefix
+            Success(Expr {
+                kind: ExprKind::Integer { value: 0o777 },
+                span: Span::new(0, 5)
+            })
+        );
+
+        assert_eq!(
+            parse("0b1001;", Parser::parse_expr), // binary prefix
+            Success(Expr {
+                kind: ExprKind::Integer { value: 0b1001 },
+                span: Span::new(0, 6)
+            })
+        );
+
+        assert_eq!(
+            parse("1337;", Parser::parse_expr), // plain decimal
+            Success(Expr {
+                kind: ExprKind::Integer { value: 1337 },
+                span: Span::new(0, 4)
+            })
+        );
+    }
+
+    #[test]
+    fn boolean_literals() {
+        assert_eq!(
+            parse("true;", Parser::parse_expr),
+            Success(Expr {
+                kind: ExprKind::Boolean { value: true },
+                span: Span::new(0, 4)
+            })
+        );
+
+        assert_eq!(
+            parse("false;", Parser::parse_expr),
+            Success(Expr {
+                kind: ExprKind::Boolean { value: false },
+                span: Span::new(0, 5)
+            })
+        );
+    }
+
+    #[test]
+    fn unary_expr() {
+        assert_eq!(
+            parse("-5;", Parser::parse_expr),
+            Success(Expr {
+                kind: ExprKind::Unary {
+                    op: UnaryOp::Neg,
+                    expr: Box::new(Expr {
+                        kind: ExprKind::Integer { value: 5 },
+                        span: Span::new(1, 2)
+                    })
+                },
+                span: Span::new(0, 2) // operator through operand
+            })
+        );
+    }
+
+    #[test]
+    fn binary_expr() {
+        assert_eq!(
+            parse("12 + 3 * 6;", Parser::parse_expr), // `*` must bind tighter than `+`
+            Success(Expr {
+                kind: ExprKind::Binary {
+                    op: BinaryOp::Add,
+                    lhs: Box::new(Expr {
+                        kind: ExprKind::Integer { value: 12 },
+                        span: Span::new(0, 2)
+                    }),
+                    rhs: Box::new(Expr {
+                        kind: ExprKind::Binary {
+                            op: BinaryOp::Mul,
+                            lhs: Box::new(Expr {
+                                kind: ExprKind::Integer { value: 3 },
+                                span: Span::new(5, 6)
+                            }),
+                            rhs: Box::new(Expr {
+                                kind: ExprKind::Integer { value: 6 },
+                                span: Span::new(9, 10)
+                            })
+                        },
+                        span: Span::new(5, 10)
+                    })
+                },
+                span: Span::new(0, 10)
+            })
+        );
+    }
+
+    #[test]
+    fn return_stmt() {
+        assert_eq!(
+            parse("return;", Parser::parse_stmt), // no value
+            Success(Stmt {
+                kind: StmtKind::Return { value: None },
+                span: Span::new(0, 7)
+            })
+        );
+
+        assert_eq!(
+            parse("return 0;", Parser::parse_stmt),
+            Success(Stmt {
+                kind: StmtKind::Return {
+                    value: Some(Expr {
+                        kind: ExprKind::Integer { value: 0 },
+                        span: Span::new(7, 8)
+                    })
+                },
+                span: Span::new(0, 9) // includes the trailing `;`
+            })
+        );
+    }
+
+    #[test]
+    fn compound_stmt() {
+        assert_eq!(
+            parse("{ return; }", Parser::parse_stmt),
+            Success(Stmt {
+                kind: StmtKind::Compound {
+                    inner: vec![Stmt {
+                        kind: StmtKind::Return { value: None },
+                        span: Span::new(2, 9)
+                    }]
+                },
+                span: Span::new(0, 11)
+            })
+        );
+
+        assert_eq!(
+            parse("{ return 0 }", Parser::parse_stmt), // missing `;`: recorded, block recovers
+            Recovered(
+                Stmt {
+                    kind: StmtKind::Compound { inner: vec![] },
+                    span: Span::new(0, 12)
+                },
+                vec![ParseError::new(
+                    "expected `;` but found `}` instead",
+                    Span::new(11, 12)
+                )]
+            )
+        );
+    }
+
+    #[test]
+    fn function_decl() {
+        assert_eq!(
+            parse(
+                "fn add(a: i32, b: i32) -> i32 { return a + b; }",
+                Parser::parse_decl
+            ),
+            Success(Decl {
+                kind: DeclKind::Function {
+                    name: "add".to_string(),
+                    name_span: Span::new(3, 6),
+                    params: vec![
+                        FunctionParam {
+                            name: "a".to_string(),
+                            name_span: Span::new(7, 8),
+                            ty: Type {
+                                kind: TypeKind::I32,
+                                span: Span::new(10, 13)
+                            }
+                        },
+                        FunctionParam {
+                            name: "b".to_string(),
+                            name_span: Span::new(15, 16),
+                            ty: Type {
+                                kind: TypeKind::I32,
+                                span: Span::new(18, 21)
+                            }
+                        }
+                    ],
+                    return_type: Some(Type {
+                        kind: TypeKind::I32,
+                        span: Span::new(26, 29)
+                    }),
+                    body: Stmt {
+                        kind: StmtKind::Compound {
+                            inner: vec![Stmt {
+                                kind: StmtKind::Return {
+                                    value: Some(Expr {
+                                        kind: ExprKind::Binary {
+                                            op: BinaryOp::Add,
+                                            lhs: Box::new(Expr {
+                                                kind: ExprKind::Identifier {
+                                                    name: "a".to_string()
+                                                },
+                                                span: Span::new(39, 40)
+                                            }),
+                                            rhs: Box::new(Expr {
+                                                kind: ExprKind::Identifier {
+                                                    name: "b".to_string()
+                                                },
+                                                span: Span::new(43, 44)
+                                            })
+                                        },
+                                        span: Span::new(39, 44)
+                                    })
+                                },
+                                span: Span::new(32, 45)
+                            }]
+                        },
+                        span: Span::new(30, 47)
+                    }
+                },
+                span: Span::new(0, 47) // `fn` keyword through the body's closing brace
+            })
+        )
+    }
+}
diff --git a/src/frontend/token.rs b/src/frontend/token.rs
index d38b323..d2efc4b 100644 --- a/src/frontend/token.rs +++ b/src/frontend/token.rs @@ -8,6 +8,8 @@ pub struct Span { } impl Span { + pub const EOF: Self = Self::new(0, 0); + /// Create a new [Span] from the start and end positions. pub const fn new(start: usize, end: usize) -> Self { debug_assert!(start <= end);