feat: add ast and parser implementation

This commit is contained in:
2026-04-20 21:41:58 +02:00
parent 1006301255
commit 4981a78a8b
5 changed files with 881 additions and 9 deletions
+7 -9
View File
@@ -15,15 +15,13 @@ A Rust-flavored, C-targeting language - built pipeline-first.
## Phase 2 - Parser ## Phase 2 - Parser
- [ ] Write grammar for the base subset - [x] Write grammar for the base subset
- `fn` declarations, `return`, `let`, int/bool literals - `fn` declarations, `return`, int/bool literals
- Arithmetic (`+`, `-`, `*`, `/`), comparison (`==`, `!=`, `<`, `>`) - Arithmetic (`+`, `-`, `*`, `/`)
- Function call expressions - [x] Implement recursive-descent parser
- [ ] Implement recursive-descent parser - [x] Attach source spans to every AST node
- [ ] Build typed AST: `FnDecl`, `Block`, `ReturnStmt`, `LetStmt`, `BinExpr`, `CallExpr`, `Literal`, `Ident` - [x] Emit structured parse errors with span info
- [ ] Attach source spans to every AST node - [x] Unit-test: parse valid snippets, expect correct AST shapes
- [ ] Emit structured parse errors with span info
- [ ] Unit-test: parse valid snippets, expect correct AST shapes
## Phase 3 - Semantic Analysis ## Phase 3 - Semantic Analysis
+103
View File
@@ -0,0 +1,103 @@
use crate::frontend::token::Span;
/// Root node of the AST: the declarations parsed from one source text.
#[derive(Debug, PartialEq, Eq)]
pub struct Module {
/// Top-level declarations, in the order the parser produced them.
pub decls: Vec<Decl>,
}
/// A top-level declaration paired with the source span it was parsed from.
#[derive(Debug, PartialEq, Eq)]
pub struct Decl {
/// Which kind of declaration this is, together with its payload.
pub kind: DeclKind,
/// Source location of the whole declaration.
pub span: Span,
}
/// The different kinds of top-level declarations.
#[derive(Debug, PartialEq, Eq)]
pub enum DeclKind {
/// A function declaration.
Function {
/// Name of the function.
name: String,
/// Source location of the name identifier.
name_span: Span,
/// Declared parameters, in source order.
params: Vec<FunctionParam>,
/// Declared return type, or `None` when no return type was written.
return_type: Option<Type>,
/// The statement forming the function body.
body: Stmt,
},
}
/// A single parameter in a function's parameter list.
#[derive(Debug, PartialEq, Eq)]
pub struct FunctionParam {
/// Name of the parameter.
pub name: String,
/// Source location of the parameter name.
pub name_span: Span,
/// Declared type of the parameter.
pub ty: Type,
}
/// A type as written in the source, paired with its source span.
#[derive(Debug, PartialEq, Eq)]
pub struct Type {
/// Which type was named.
pub kind: TypeKind,
/// Source location of the type.
pub span: Span,
}
/// The built-in types that can be named in source code.
#[derive(Debug, PartialEq, Eq)]
pub enum TypeKind {
/// The `i8` type.
I8,
/// The `i16` type.
I16,
/// The `i32` type.
I32,
/// The `i64` type.
I64,
/// The `u8` type.
U8,
/// The `u16` type.
U16,
/// The `u32` type.
U32,
/// The `u64` type.
U64,
/// The `bool` type.
Bool,
}
/// A statement paired with the source span it was parsed from.
#[derive(Debug, PartialEq, Eq)]
pub struct Stmt {
/// Which kind of statement this is, together with its payload.
pub kind: StmtKind,
/// Source location of the whole statement.
pub span: Span,
}
/// The different kinds of statements.
#[derive(Debug, PartialEq, Eq)]
pub enum StmtKind {
/// A brace-delimited sequence of statements; `inner` holds them in source order.
Compound { inner: Vec<Stmt> },
/// A `return` statement; `value` is `None` when no expression was given.
Return { value: Option<Expr> },
}
/// An expression paired with the source span it was parsed from.
#[derive(Debug, PartialEq, Eq)]
pub struct Expr {
/// Which kind of expression this is, together with its payload.
pub kind: ExprKind,
/// Source location of the whole expression.
pub span: Span,
}
/// The different kinds of expressions.
#[derive(Debug, PartialEq, Eq)]
pub enum ExprKind {
/// A reference to a name.
Identifier {
/// The identifier text.
name: String,
},
/// An integer literal.
Integer {
/// Value of the literal.
value: u64,
},
/// A boolean literal.
Boolean {
/// Value of the literal.
value: bool,
},
/// A prefix operator applied to one operand.
Unary {
/// The operator.
op: UnaryOp,
/// The operand.
expr: Box<Expr>,
},
/// An infix operator applied to two operands.
Binary {
/// The operator.
op: BinaryOp,
/// Left-hand operand.
lhs: Box<Expr>,
/// Right-hand operand.
rhs: Box<Expr>,
},
}
/// Prefix (unary) operators.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnaryOp {
/// Negation, written as a prefix `-`.
Neg,
}
/// Infix (binary) operators.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinaryOp {
/// Addition, written `+`.
Add,
/// Subtraction, written `-`.
Sub,
/// Multiplication, written `*`.
Mul,
/// Division, written `/`.
Div,
/// Remainder, written `%`.
Rem,
}
+2
View File
@@ -1,2 +1,4 @@
pub mod ast;
pub mod lexer; pub mod lexer;
pub mod parser;
pub mod token; pub mod token;
+767
View File
@@ -0,0 +1,767 @@
use std::{fmt::Display, iter::Peekable};
use crate::frontend::{
ast::*,
lexer::Lexer,
token::{Span, Token, TokenKind},
};
/// A structured error produced during parsing, carrying a human-readable
/// message and the [`Span`] of the offending token for precise diagnostics.
#[derive(Debug, PartialEq, Eq)]
pub struct ParseError {
/// Human-readable description of what went wrong.
pub message: String,
/// Source location of the offending token.
pub span: Span,
}
impl ParseError {
    /// Builds a [`ParseError`] for `span`, rendering `message` to a `String`
    /// through its [`Display`] implementation.
    pub fn new(message: impl Display, span: Span) -> Self {
        let message = message.to_string();
        Self { message, span }
    }
}
/// Convenience alias for parser operations that may fail with a [`ParseError`].
type ParseResult<T> = Result<T, ParseError>;
/// Consumes a [`Lexer`] token stream and builds an AST.
///
/// The parser operates as a recursive-descent parser, peeking one token
/// ahead to make branching decisions without backtracking.
///
/// Errors the parser recovers from are collected internally and handed out
/// by [`Parser::finish`].
pub struct Parser<'src> {
/// Token source wrapped so the next token can be inspected before it is consumed.
lexer: Peekable<Lexer<'src>>,
/// Errors recorded by the recovering parse routines.
errors: Vec<ParseError>,
}
impl<'src> Parser<'src> {
/// Creates a new [`Parser`] that reads tokens lexed from `source`.
///
/// The parser starts out with no recorded errors.
pub fn new(source: &'src str) -> Self {
    let lexer = Lexer::new(source).peekable();
    let errors = Vec::new();
    Self { lexer, errors }
}
/// Finishes parsing and hands back the recorded errors.
///
/// Returns `None` when no error was recorded, otherwise `Some` with every
/// recorded [`ParseError`].
pub fn finish(self) -> Option<Vec<ParseError>> {
    if self.errors.is_empty() {
        None
    } else {
        Some(self.errors)
    }
}
/// Consumes the next [`Token`] from the lexer and returns it, or returns
/// `None` once the token stream is exhausted (end of file).
fn advance(&mut self) -> Option<Token<'src>> {
self.lexer.next()
}
/// Returns a copy of the next [`Token`] without consuming it, or `None` at end of file.
///
/// Takes `&mut self` because peeking may need to pull the token out of the
/// underlying lexer first.
fn peek(&mut self) -> Option<Token<'src>> {
self.lexer.peek().copied()
}
/// Returns a copy of the next [`Token`] without consuming it.
///
/// # Errors
///
/// Fails with an "unexpected end of file" [`ParseError`] located at
/// [`Span::EOF`] when no token is left.
fn peek_no_eof(&mut self) -> ParseResult<Token<'src>> {
    match self.peek() {
        Some(token) => Ok(token),
        None => Err(ParseError::new("unexpected end of file", Span::EOF)),
    }
}
/// Returns `true` if no token is left, i.e. we have reached the end of file.
/// Does not consume anything.
fn is_at_eof(&mut self) -> bool {
self.lexer.peek().is_none()
}
/// Returns `true` if there is a next [`Token`] and its kind equals `kind`.
/// Returns `false` at end of file. Does not consume anything.
fn is_peek(&mut self, kind: TokenKind) -> bool {
    matches!(self.peek(), Some(tok) if tok.kind == kind)
}
/// Consumes the next [`Token`] and returns it, provided its kind is `expected`.
///
/// # Errors
///
/// Fails without consuming anything when the next token has a different
/// kind (the [`ParseError`] points at that token) or when the end of file
/// has been reached.
fn expect(&mut self, expected: TokenKind) -> ParseResult<Token<'src>> {
    let token = self.peek_no_eof()?;
    if token.kind == expected {
        self.advance();
        return Ok(token);
    }
    Err(ParseError::new(
        format!("expected {} but found {} instead", expected, token.kind),
        token.span,
    ))
}
/// Skips tokens until the next one has a kind listed in `kinds` or the end
/// of file is reached. The synchronization token itself is left unconsumed.
fn synchronize(&mut self, kinds: &[TokenKind]) {
    while let Some(tok) = self.peek() {
        if kinds.contains(&tok.kind) {
            break;
        }
        self.advance();
    }
}
// ====== Recursive Descent Parser Implementation ======
/// Parses a module, consuming tokens until the end of file.
///
/// A declaration that fails to parse is not fatal: its error is recorded
/// and tokens are skipped up to the next `fn` keyword before parsing resumes.
///
/// ```ebnf
/// module = { decl } ;
/// ```
pub fn parse_module(&mut self) -> Module {
    let mut decls = Vec::new();
    while self.peek().is_some() {
        if let Err(err) = self.parse_decl().map(|decl| decls.push(decl)) {
            self.errors.push(err);
            self.synchronize(&[TokenKind::Fn]);
        }
    }
    Module { decls }
}
/// Parses a single declaration.
///
/// ```ebnf
/// decl = function_decl ;
/// ```
///
/// # Errors
///
/// Fails when the next token cannot start a declaration, at end of file,
/// or when the declaration itself is malformed.
pub fn parse_decl(&mut self) -> ParseResult<Decl> {
    let next = self.peek_no_eof()?;
    if next.kind == TokenKind::Fn {
        return self.parse_function_decl();
    }
    Err(ParseError::new(
        format!("expected a declaration but found {} instead", next.kind),
        next.span,
    ))
}
/// Parses a function declaration.
///
/// The span of the resulting [`Decl`] is the span of the `fn` keyword joined
/// with the span of the body.
///
/// ```ebnf
/// function_decl = "fn" IDENTIFIER "(" function_params ")" [ "->" type ] stmt ;
/// ```
fn parse_function_decl(&mut self) -> ParseResult<Decl> {
    let fn_keyword = self.expect(TokenKind::Fn)?;
    let ident = self.expect(TokenKind::Identifier)?;

    self.expect(TokenKind::LParen)?;
    let params = self.parse_function_params()?;
    self.expect(TokenKind::RParen)?;

    // The return type is optional and is introduced by `->`.
    let mut return_type = None;
    if self.is_peek(TokenKind::Arrow) {
        self.advance();
        return_type = Some(self.parse_type()?);
    }

    let body = self.parse_compound_stmt()?;
    let span = fn_keyword.span.join(body.span);
    let kind = DeclKind::Function {
        name: ident.text.to_string(),
        name_span: ident.span,
        params,
        return_type,
        body,
    };
    Ok(Decl { kind, span })
}
/// Parses the function parameter list, stopping in front of the closing `)`
/// (which is left for the caller) or at end of file.
///
/// ```ebnf
/// function_params = [ function_param { "," function_param } ] ;
/// function_param = IDENTIFIER ":" type ;
/// ```
fn parse_function_params(&mut self) -> ParseResult<Vec<FunctionParam>> {
    let mut params = Vec::new();
    loop {
        if self.is_at_eof() || self.is_peek(TokenKind::RParen) {
            break;
        }
        // Every parameter except the first must be preceded by a comma.
        if !params.is_empty() {
            self.expect(TokenKind::Comma)?;
        }
        let ident = self.expect(TokenKind::Identifier)?;
        self.expect(TokenKind::Colon)?;
        let ty = self.parse_type()?;
        params.push(FunctionParam {
            name: ident.text.to_string(),
            name_span: ident.span,
            ty,
        });
    }
    Ok(params)
}
/// Parses a type, which is always a single type-keyword token.
///
/// ```ebnf
/// type = "i8" | "i16" | "i32" | "i64"
/// | "u8" | "u16" | "u32" | "u64"
/// | "bool" ;
/// ```
///
/// # Errors
///
/// Fails without consuming anything when the next token is not a type
/// keyword, or at end of file.
pub fn parse_type(&mut self) -> ParseResult<Type> {
    let token = self.peek_no_eof()?;
    let kind = match token.kind {
        TokenKind::I8 => TypeKind::I8,
        TokenKind::I16 => TypeKind::I16,
        TokenKind::I32 => TypeKind::I32,
        TokenKind::I64 => TypeKind::I64,
        TokenKind::U8 => TypeKind::U8,
        TokenKind::U16 => TypeKind::U16,
        TokenKind::U32 => TypeKind::U32,
        TokenKind::U64 => TypeKind::U64,
        TokenKind::Bool => TypeKind::Bool,
        _ => {
            return Err(ParseError::new(
                format!("expected a type but found {} instead", token.kind),
                token.span,
            ));
        }
    };
    // Only reached for a type keyword: consume it now.
    self.advance();
    Ok(Type {
        kind,
        span: token.span,
    })
}
/// Parses a statement, choosing the production from the next token.
///
/// ```ebnf
/// stmt = compound_stmt
/// | return_stmt ;
/// ```
///
/// # Errors
///
/// Fails when the next token cannot start a statement, at end of file, or
/// when the chosen statement is malformed.
pub fn parse_stmt(&mut self) -> ParseResult<Stmt> {
    let next = self.peek_no_eof()?;
    if next.kind == TokenKind::LBrace {
        return self.parse_compound_stmt();
    }
    if next.kind == TokenKind::Return {
        return self.parse_return_stmt();
    }
    Err(ParseError::new(
        format!("expected a statement but found {} instead", next.kind),
        next.span,
    ))
}
/// Parses a compound statement.
///
/// A statement inside the braces that fails to parse is dropped: its error
/// is recorded and parsing resumes at the next statement border.
///
/// ```ebnf
/// compound_stmt = "{" { stmt } "}";
/// ```
fn parse_compound_stmt(&mut self) -> ParseResult<Stmt> {
    let open = self.expect(TokenKind::LBrace)?;
    let mut inner = Vec::new();
    loop {
        if self.is_at_eof() || self.is_peek(TokenKind::RBrace) {
            break;
        }
        let error = match self.parse_stmt() {
            Ok(stmt) => {
                inner.push(stmt);
                continue;
            }
            Err(error) => error,
        };
        self.errors.push(error);
        // Skip ahead to a statement border; a `;` border belongs to the
        // broken statement and is consumed as well.
        self.synchronize(&[TokenKind::Semicolon, TokenKind::Return, TokenKind::RBrace]);
        if self.is_peek(TokenKind::Semicolon) {
            self.advance();
        }
    }
    let close = self.expect(TokenKind::RBrace)?;
    Ok(Stmt {
        span: open.span.join(close.span),
        kind: StmtKind::Compound { inner },
    })
}
/// Parses a return statement. The statement's span runs from the `return`
/// keyword through the terminating `;`.
///
/// ```ebnf
/// return_stmt = "return" [ expr ] ";" ;
/// ```
fn parse_return_stmt(&mut self) -> ParseResult<Stmt> {
    let keyword = self.expect(TokenKind::Return)?;
    // A `;` directly after the keyword means there is no return value.
    let value = if self.is_peek(TokenKind::Semicolon) {
        None
    } else {
        Some(self.parse_expr()?)
    };
    let terminator = self.expect(TokenKind::Semicolon)?;
    Ok(Stmt {
        span: keyword.span.join(terminator.span),
        kind: StmtKind::Return { value },
    })
}
// ====== Pratt Parsing Implementation ======
/// Parses an expression.
///
/// Entry point of the Pratt parser: starts with a minimum binding power of
/// `0`, so every infix operator is accepted at the top level.
pub fn parse_expr(&mut self) -> ParseResult<Expr> {
self.parse_expr_bp(0)
}
/// Pratt parsing implementation for expressions.
///
/// Parses a leading expression and then keeps folding infix operators into
/// it for as long as the next operator's left binding power is at least
/// `min_bp`. The loop stops, leaving the next token unconsumed, at the
/// first token that is not an infix operator or binds too loosely.
///
/// Reaching the end of file after a complete operand is not an error here:
/// it simply ends the expression. Callers that require a following token
/// (such as `;` or `)`) report its absence themselves.
///
/// # Errors
///
/// Fails when the leading expression or the right-hand side of an operator
/// cannot be parsed.
fn parse_expr_bp(&mut self, min_bp: u8) -> ParseResult<Expr> {
    let mut lhs = self.parse_leading_expr()?;

    while let Some(next) = self.peek() {
        let Some((op, left_bp, right_bp)) = self.infix_operator(next.kind) else {
            break; // Not an infix operator
        };
        if left_bp < min_bp {
            break; // The operator binds less tightly than the current context
        }

        self.advance(); // consume the operator
        let rhs = self.parse_expr_bp(right_bp)?;
        let span = lhs.span.join(rhs.span);
        lhs = Expr {
            kind: ExprKind::Binary {
                op,
                lhs: Box::new(lhs),
                rhs: Box::new(rhs),
            },
            span,
        };
    }

    Ok(lhs)
}
/// Parses a leading expression such as identifiers, integer and boolean literals
/// or prefix expressions.
fn parse_leading_expr(&mut self) -> ParseResult<Expr> {
let peek_token = self.peek_no_eof()?;
match peek_token.kind {
TokenKind::Identifier => {
let token = self.advance().unwrap();
Ok(Expr {
kind: ExprKind::Identifier {
name: token.text.to_string(),
},
span: token.span,
})
}
TokenKind::IntegerLit => {
let token = self.advance().unwrap();
let text = token.text;
let value = if text.starts_with("0x") || text.starts_with("0X") {
u64::from_str_radix(&text[2..], 16)
} else if text.starts_with("0o") || text.starts_with("0O") {
u64::from_str_radix(&text[2..], 8)
} else if text.starts_with("0b") || text.starts_with("0B") {
u64::from_str_radix(&text[2..], 2)
} else {
text.parse()
}
.unwrap();
Ok(Expr {
kind: ExprKind::Integer { value },
span: token.span,
})
}
TokenKind::BooleanLit => {
let token = self.advance().unwrap();
Ok(Expr {
kind: ExprKind::Boolean {
value: token.text == "true",
},
span: token.span,
})
}
TokenKind::LParen => {
let lparen = self.advance().unwrap();
let expr = self.parse_expr_bp(0)?;
let rparen = self.expect(TokenKind::RParen)?;
Ok(Expr {
kind: expr.kind,
span: lparen.span.join(rparen.span),
})
}
kind if let Some((op, r_bp)) = self.prefix_operator(kind) => {
let op_token = self.advance().unwrap();
let rhs = self.parse_expr_bp(r_bp)?;
Ok(Expr {
span: op_token.span.join(rhs.span),
kind: ExprKind::Unary {
op,
expr: Box::new(rhs),
},
})
}
_ => Err(ParseError::new(
format!("expected an expression but found {}", peek_token.kind),
peek_token.span,
)),
}
}
/// Looks up a prefix operator: returns its [`UnaryOp`] and right binding
/// power, or `None` if the [`TokenKind`] is not a valid prefix operator.
fn prefix_operator(&self, op: TokenKind) -> Option<(UnaryOp, u8)> {
    // `-` is the only prefix operator.
    (op == TokenKind::Minus).then_some((UnaryOp::Neg, 30))
}
/// Looks up an infix operator: returns its [`BinaryOp`] with its left and
/// right binding powers, or `None` if the [`TokenKind`] is not a valid infix
/// operator.
///
/// Each right binding power is one above the left one, so operators of
/// equal precedence group to the left.
fn infix_operator(&self, op: TokenKind) -> Option<(BinaryOp, u8, u8)> {
    let (binary_op, left_bp) = match op {
        TokenKind::Plus => (BinaryOp::Add, 10),
        TokenKind::Minus => (BinaryOp::Sub, 10),
        TokenKind::Star => (BinaryOp::Mul, 20),
        TokenKind::Slash => (BinaryOp::Div, 20),
        TokenKind::Percent => (BinaryOp::Rem, 20),
        _ => return None,
    };
    Some((binary_op, left_bp, left_bp + 1))
}
}
#[cfg(test)]
mod test {
use std::fmt::Debug;
use crate::frontend::{
ast::*,
parser::{ParseError, ParseResult, Parser},
token::Span,
};
/// Outcome of running one parser method in a test, combining the method's
/// return value with the errors the parser recorded along the way.
#[derive(Debug, PartialEq, Eq)]
enum TestResult<T: Debug + Eq + PartialEq> {
/// The method returned `Ok` and no error was recorded.
Success(T),
/// The method returned `Ok`, but errors were recorded during recovery.
Recovered(T, Vec<ParseError>),
/// The method returned `Err`; holds the recorded errors followed by that error.
Error(Vec<ParseError>),
}
use TestResult::*;
fn parse<'src, T: Debug + Eq + PartialEq>(
source: &'src str,
method: impl Fn(&mut Parser<'src>) -> ParseResult<T>,
) -> TestResult<T> {
let mut parser = Parser::new(source);
match method(&mut parser) {
Ok(result) => {
if parser.errors.is_empty() {
TestResult::Success(result)
} else {
TestResult::Recovered(result, parser.errors)
}
}
Err(error) => {
parser.errors.push(error);
TestResult::Error(parser.errors)
}
}
}
/// Integer literals in hexadecimal, octal, binary and decimal notation parse
/// to their value, with a span covering exactly the literal.
#[test]
fn integer_literals() {
    // (source, expected value, expected end of the literal's span)
    let cases: [(&str, u64, usize); 4] = [
        ("0xBEEF;", 0xBEEF, 6),
        ("0o777;", 0o777, 5),
        ("0b1001;", 0b1001, 6),
        ("1337;", 1337, 4),
    ];
    for (source, value, end) in cases {
        assert_eq!(
            parse(source, Parser::parse_expr),
            Success(Expr {
                kind: ExprKind::Integer { value },
                span: Span::new(0, end)
            })
        );
    }
}
/// `true` and `false` parse to boolean literals spanning the keyword.
#[test]
fn boolean_literals() {
    // (source, expected value, expected end of the literal's span)
    let cases: [(&str, bool, usize); 2] = [("true;", true, 4), ("false;", false, 5)];
    for (source, value, end) in cases {
        assert_eq!(
            parse(source, Parser::parse_expr),
            Success(Expr {
                kind: ExprKind::Boolean { value },
                span: Span::new(0, end)
            })
        );
    }
}
/// A prefix `-` wraps its operand in a unary negation whose span starts at
/// the operator and ends with the operand.
#[test]
fn unary_expr() {
    let operand = Expr {
        kind: ExprKind::Integer { value: 5 },
        span: Span::new(1, 2),
    };
    let expected = Expr {
        kind: ExprKind::Unary {
            op: UnaryOp::Neg,
            expr: Box::new(operand),
        },
        span: Span::new(0, 2),
    };
    assert_eq!(parse("-5;", Parser::parse_expr), Success(expected));
}
/// `*` binds tighter than `+`, so `12 + 3 * 6` parses as `12 + (3 * 6)`.
#[test]
fn binary_expr() {
    let int = |value: u64, start: usize, end: usize| Expr {
        kind: ExprKind::Integer { value },
        span: Span::new(start, end),
    };
    let product = Expr {
        kind: ExprKind::Binary {
            op: BinaryOp::Mul,
            lhs: Box::new(int(3, 5, 6)),
            rhs: Box::new(int(6, 9, 10)),
        },
        span: Span::new(5, 10),
    };
    let sum = Expr {
        kind: ExprKind::Binary {
            op: BinaryOp::Add,
            lhs: Box::new(int(12, 0, 2)),
            rhs: Box::new(product),
        },
        span: Span::new(0, 10),
    };
    assert_eq!(parse("12 + 3 * 6;", Parser::parse_expr), Success(sum));
}
/// `return` parses both without and with a value; the statement span
/// includes the terminating `;`.
#[test]
fn return_stmt() {
    let bare = Stmt {
        kind: StmtKind::Return { value: None },
        span: Span::new(0, 7),
    };
    assert_eq!(parse("return;", Parser::parse_stmt), Success(bare));

    let zero = Expr {
        kind: ExprKind::Integer { value: 0 },
        span: Span::new(7, 8),
    };
    let with_value = Stmt {
        kind: StmtKind::Return { value: Some(zero) },
        span: Span::new(0, 9),
    };
    assert_eq!(parse("return 0;", Parser::parse_stmt), Success(with_value));
}
/// A compound statement collects its inner statements; a statement missing
/// its `;` is dropped and reported while the block itself still parses.
#[test]
fn compound_stmt() {
    let inner_return = Stmt {
        kind: StmtKind::Return { value: None },
        span: Span::new(2, 9),
    };
    let well_formed = Stmt {
        kind: StmtKind::Compound {
            inner: vec![inner_return],
        },
        span: Span::new(0, 11),
    };
    assert_eq!(parse("{ return; }", Parser::parse_stmt), Success(well_formed));

    let emptied = Stmt {
        kind: StmtKind::Compound { inner: vec![] },
        span: Span::new(0, 12),
    };
    let missing_semicolon =
        ParseError::new("expected `;` but found `}` instead", Span::new(11, 12));
    assert_eq!(
        parse("{ return 0 }", Parser::parse_stmt),
        Recovered(emptied, vec![missing_semicolon])
    );
}
/// A complete function declaration with two parameters, a return type and a
/// body yields the full AST with the expected spans on every node.
#[test]
fn function_decl() {
    let source = "fn add(a: i32, b: i32) -> i32 { return a + b; }";

    let i32_at = |start: usize, end: usize| Type {
        kind: TypeKind::I32,
        span: Span::new(start, end),
    };
    let ident = |name: &str, start: usize, end: usize| Expr {
        kind: ExprKind::Identifier {
            name: name.to_string(),
        },
        span: Span::new(start, end),
    };

    let sum = Expr {
        kind: ExprKind::Binary {
            op: BinaryOp::Add,
            lhs: Box::new(ident("a", 39, 40)),
            rhs: Box::new(ident("b", 43, 44)),
        },
        span: Span::new(39, 44),
    };
    let ret = Stmt {
        kind: StmtKind::Return { value: Some(sum) },
        span: Span::new(32, 45),
    };
    let body = Stmt {
        kind: StmtKind::Compound { inner: vec![ret] },
        span: Span::new(30, 47),
    };
    let params = vec![
        FunctionParam {
            name: "a".to_string(),
            name_span: Span::new(7, 8),
            ty: i32_at(10, 13),
        },
        FunctionParam {
            name: "b".to_string(),
            name_span: Span::new(15, 16),
            ty: i32_at(18, 21),
        },
    ];
    let expected = Decl {
        kind: DeclKind::Function {
            name: "add".to_string(),
            name_span: Span::new(3, 6),
            params,
            return_type: Some(i32_at(26, 29)),
            body,
        },
        span: Span::new(0, 47),
    };

    assert_eq!(parse(source, Parser::parse_decl), Success(expected));
}
}
+2
View File
@@ -8,6 +8,8 @@ pub struct Span {
} }
impl Span { impl Span {
pub const EOF: Self = Self::new(0, 0);
/// Create a new [Span] from the start and end positions. /// Create a new [Span] from the start and end positions.
pub const fn new(start: usize, end: usize) -> Self { pub const fn new(start: usize, end: usize) -> Self {
debug_assert!(start <= end); debug_assert!(start <= end);