From becc7a2d345b490d0ecf797550ac12aed3adc687 Mon Sep 17 00:00:00 2001
From: Jooris Hadeler
Date: Tue, 10 Mar 2026 17:40:52 +0100
Subject: [PATCH] Add expression AST and Pratt parser with REPL

- ast.rs: Expr/ExprKind with UnaryOp, BinaryOp, StructField
- parser.rs: Pratt expression parser with allow_struct_literals flag,
  error recovery via dummy tokens, and 19 unit tests
- main.rs: interactive expression REPL (prints parsed AST)
---
 fluxc/src/ast.rs    | 113 ++++++++
 fluxc/src/main.rs   |  43 ++-
 fluxc/src/parser.rs | 617 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 767 insertions(+), 6 deletions(-)
 create mode 100644 fluxc/src/ast.rs
 create mode 100644 fluxc/src/parser.rs

diff --git a/fluxc/src/ast.rs b/fluxc/src/ast.rs
new file mode 100644
index 0000000..f790bd3
--- /dev/null
+++ b/fluxc/src/ast.rs
@@ -0,0 +1,113 @@
+use crate::token::Span;
+
+// ── Operators ──────────────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum UnaryOp {
+    Neg,    // `-`
+    Not,    // `!`
+    BitNot, // `~`
+    Deref,  // `*`
+    AddrOf, // `&`
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BinaryOp {
+    // Logical
+    Or,  // `or`
+    And, // `and`
+    // Bitwise
+    BitOr,  // `|`
+    BitXor, // `^`
+    BitAnd, // `&`
+    // Comparison
+    Eq, // `==`
+    Ne, // `!=`
+    Lt, // `<`
+    Gt, // `>`
+    Le, // `<=`
+    Ge, // `>=`
+    // Arithmetic
+    Add, // `+`
+    Sub, // `-`
+    Mul, // `*`
+    Div, // `/`
+    Rem, // `%`
+}
+
+// ── Struct literal field ───────────────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+pub struct StructField {
+    pub name: String,
+    pub name_span: Span,
+    pub value: Expr,
+}
+
+// ── Expression ────────────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+pub struct Expr {
+    pub kind: ExprKind,
+    pub span: Span,
+}
+
+impl Expr {
+    pub fn new(kind: ExprKind, span: Span) -> Self {
+        Self { kind, span }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub enum ExprKind {
+    // Literals
+    IntLit(String),
+    FloatLit(String),
+    StringLit(String),
+    CharLit(String),
+    Bool(bool),
+
+    // Identifier
+    Ident(String),
+
+    // Struct literal: `Foo { x: 1, y: 2 }`
+    StructLit {
+        name: String,
+        name_span: Span,
+        fields: Vec<StructField>,
+    },
+
+    // Operators
+    Unary {
+        op: UnaryOp,
+        op_span: Span,
+        expr: Box<Expr>,
+    },
+    Binary {
+        op: BinaryOp,
+        op_span: Span,
+        lhs: Box<Expr>,
+        rhs: Box<Expr>,
+    },
+
+    // Postfix
+    Field {
+        expr: Box<Expr>,
+        field: String,
+        field_span: Span,
+    },
+    Index {
+        expr: Box<Expr>,
+        index: Box<Expr>,
+    },
+    Call {
+        callee: Box<Expr>,
+        args: Vec<Expr>,
+    },
+
+    // Parenthesised expression
+    Group(Box<Expr>),
+
+    // Placeholder for parse errors — allows parsing to continue
+    Error,
+}
diff --git a/fluxc/src/main.rs b/fluxc/src/main.rs
index 76693ab..fd73427 100644
--- a/fluxc/src/main.rs
+++ b/fluxc/src/main.rs
@@ -1,15 +1,46 @@
-use std::{env::args, fs};
+use std::io::{self, BufRead, Write};
 
-use crate::lexer::Lexer;
+use crate::parser::Parser;
 
+pub mod ast;
 pub mod lexer;
+pub mod parser;
 pub mod token;
 
 fn main() {
-    let path = args().nth(1).expect("usage: fluxc ");
-    let content = fs::read_to_string(&path).expect("error: failed to read file");
+    let stdin = io::stdin();
+    let stdout = io::stdout();
 
-    for token in Lexer::new(&content) {
-        println!("{token:?}");
+    println!("flux expression REPL (ctrl+d to exit)");
+
+    loop {
+        print!("> ");
+        stdout.lock().flush().unwrap();
+
+        let mut line = String::new();
+        match stdin.lock().read_line(&mut line) {
+            Ok(0) => break, // EOF
+            Ok(_) => {}
+            Err(e) => {
+                eprintln!("error: {e}");
+                break;
+            }
+        }
+
+        let src = line.trim();
+        if src.is_empty() {
+            continue;
+        }
+
+        let mut parser = Parser::new(src);
+        let expr = parser.parse_expr(true);
+
+        for err in &parser.errors {
+            eprintln!("parse error: {err}");
+        }
+
+        if parser.errors.is_empty() {
+            println!("{expr:#?}");
+        }
     }
 }
diff --git a/fluxc/src/parser.rs b/fluxc/src/parser.rs
new file mode 100644
index 0000000..7346ec5
--- /dev/null
+++ b/fluxc/src/parser.rs
@@ -0,0 +1,617 @@
+use std::fmt;
+
+use crate::{
+    ast::{BinaryOp, Expr, ExprKind, StructField, UnaryOp},
+    lexer::Lexer,
+    token::{Span, Token, TokenKind},
+};
+
+// ── Parse error ───────────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+pub struct ParseError {
+    pub span: Span,
+    pub message: String,
+}
+
+impl fmt::Display for ParseError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "error at {}: {}", self.span, self.message)
+    }
+}
+
+// ── Binding powers ─────────────────────────────────────────────────────────────
+//
+// Returns `(left_bp, right_bp)` for infix operators.
+//   left_bp < right_bp → left-associative
+//   left_bp > right_bp → right-associative (none here)
+//
+// NOTE: comparison operators (==, !=, <, >, <=, >=) are not listed in the
+// GRAMMAR.ebnf precedence table but appear in examples; placed between
+// bitwise-AND (50) and additive (60) at 55.
+
+fn infix_bp(kind: TokenKind) -> Option<(u8, u8)> {
+    let bp = match kind {
+        TokenKind::Or => (10, 11),
+        TokenKind::And => (20, 21),
+        TokenKind::Pipe => (30, 31),
+        TokenKind::Caret => (40, 41),
+        TokenKind::Amp => (50, 51),
+        TokenKind::EqEq
+        | TokenKind::BangEq
+        | TokenKind::Lt
+        | TokenKind::Gt
+        | TokenKind::LtEq
+        | TokenKind::GtEq => (55, 56),
+        TokenKind::Plus | TokenKind::Minus => (60, 61),
+        TokenKind::Star | TokenKind::Slash | TokenKind::Percent => (70, 71),
+        // Postfix: `.`, `[`, `(` — handled separately in parse_led, bp listed
+        // here only so callers can detect them as infix/postfix operators.
+        TokenKind::Dot | TokenKind::LBracket | TokenKind::LParen => (90, 91),
+        _ => return None,
+    };
+    Some(bp)
+}
+
+// Returns the right binding power for prefix operators.
+fn prefix_bp(kind: TokenKind) -> Option<u8> {
+    match kind {
+        TokenKind::Bang
+        | TokenKind::Tilde
+        | TokenKind::Minus
+        | TokenKind::Star
+        | TokenKind::Amp => Some(80),
+        _ => None,
+    }
+}
+
+fn token_to_unary_op(kind: TokenKind) -> UnaryOp {
+    match kind {
+        TokenKind::Minus => UnaryOp::Neg,
+        TokenKind::Bang => UnaryOp::Not,
+        TokenKind::Tilde => UnaryOp::BitNot,
+        TokenKind::Star => UnaryOp::Deref,
+        TokenKind::Amp => UnaryOp::AddrOf,
+        _ => unreachable!("not a unary op: {:?}", kind),
+    }
+}
+
+fn token_to_binary_op(kind: TokenKind) -> BinaryOp {
+    match kind {
+        TokenKind::Or => BinaryOp::Or,
+        TokenKind::And => BinaryOp::And,
+        TokenKind::Pipe => BinaryOp::BitOr,
+        TokenKind::Caret => BinaryOp::BitXor,
+        TokenKind::Amp => BinaryOp::BitAnd,
+        TokenKind::EqEq => BinaryOp::Eq,
+        TokenKind::BangEq => BinaryOp::Ne,
+        TokenKind::Lt => BinaryOp::Lt,
+        TokenKind::Gt => BinaryOp::Gt,
+        TokenKind::LtEq => BinaryOp::Le,
+        TokenKind::GtEq => BinaryOp::Ge,
+        TokenKind::Plus => BinaryOp::Add,
+        TokenKind::Minus => BinaryOp::Sub,
+        TokenKind::Star => BinaryOp::Mul,
+        TokenKind::Slash => BinaryOp::Div,
+        TokenKind::Percent => BinaryOp::Rem,
+        _ => unreachable!("not a binary op: {:?}", kind),
+    }
+}
+
+// ── Parser ─────────────────────────────────────────────────────────────────────
+
+pub struct Parser<'src> {
+    tokens: Vec<Token<'src>>,
+    pos: usize,
+    pub errors: Vec<ParseError>,
+}
+
+impl<'src> Parser<'src> {
+    pub fn new(src: &'src str) -> Self {
+        let tokens = Lexer::new(src).tokenize();
+        Self {
+            tokens,
+            pos: 0,
+            errors: Vec::new(),
+        }
+    }
+
+    // ── Token access ──────────────────────────────────────────────────────────
+
+    fn current(&self) -> Token<'src> {
+        self.tokens[self.pos]
+    }
+
+    /// Advance past the current token and return it.
+    fn advance(&mut self) -> Token<'src> {
+        let tok = self.current();
+        if tok.kind != TokenKind::Eof {
+            self.pos += 1;
+        }
+        tok
+    }
+
+    /// Consume the current token if it matches `kind`; otherwise record an
+    /// error and return a zero-width dummy token at the current position
+    /// so that parsing can continue.
+    fn expect(&mut self, kind: TokenKind) -> Token<'src> {
+        let tok = self.current();
+        if tok.kind == kind {
+            self.advance()
+        } else {
+            let span = Span::new(tok.span.start, tok.span.start);
+            self.errors.push(ParseError {
+                span,
+                message: format!("expected {}, found {}", kind, tok.kind),
+            });
+            Token::new(kind, span, "")
+        }
+    }
+
+    // ── Public API ────────────────────────────────────────────────────────────
+
+    /// Parse a single expression.
+    ///
+    /// `allow_struct_literals` controls whether a bare `Ident { … }` is
+    /// parsed as a struct literal. Pass `false` in `if`/`while` conditions
+    /// so that `{` is not consumed as a struct body.
+    pub fn parse_expr(&mut self, allow_struct_literals: bool) -> Expr {
+        self.pratt(0, allow_struct_literals)
+    }
+
+    // ── Pratt core ────────────────────────────────────────────────────────────
+
+    fn pratt(&mut self, min_bp: u8, allow_struct_lit: bool) -> Expr {
+        let mut lhs = self.parse_nud(allow_struct_lit);
+
+        loop {
+            let op_tok = self.current();
+
+            // Struct literal: `Ident {` — only when the flag is set, and only
+            // when the lhs is a bare identifier. Not gated on `min_bp`, so a
+            // literal may also be an operand: `a == Foo { x: 1 }`.
+            if allow_struct_lit
+                && op_tok.kind == TokenKind::LCurly
+                && matches!(lhs.kind, ExprKind::Ident(_))
+            {
+                lhs = self.parse_struct_lit(lhs);
+                continue;
+            }
+
+            let (l_bp, r_bp) = match infix_bp(op_tok.kind) {
+                Some(bp) => bp,
+                None => break,
+            };
+
+            if l_bp < min_bp {
+                break;
+            }
+
+            lhs = self.parse_led(lhs, op_tok, r_bp, allow_struct_lit);
+        }
+
+        lhs
+    }
+
+    // ── Null denotation (prefix / primary) ───────────────────────────────────
+
+    fn parse_nud(&mut self, allow_struct_lit: bool) -> Expr {
+        let tok = self.advance();
+        match tok.kind {
+            // Literals
+            TokenKind::IntLit => Expr::new(ExprKind::IntLit(tok.text.to_owned()), tok.span),
+            TokenKind::FloatLit => Expr::new(ExprKind::FloatLit(tok.text.to_owned()), tok.span),
+            TokenKind::StringLit => Expr::new(ExprKind::StringLit(tok.text.to_owned()), tok.span),
+            TokenKind::CharLit => Expr::new(ExprKind::CharLit(tok.text.to_owned()), tok.span),
+            TokenKind::True => Expr::new(ExprKind::Bool(true), tok.span),
+            TokenKind::False => Expr::new(ExprKind::Bool(false), tok.span),
+
+            // Identifier
+            TokenKind::Ident => Expr::new(ExprKind::Ident(tok.text.to_owned()), tok.span),
+
+            // Prefix unary
+            kind if prefix_bp(kind).is_some() => {
+                let r_bp = prefix_bp(kind).unwrap();
+                let op = token_to_unary_op(kind);
+                let operand = self.pratt(r_bp, allow_struct_lit);
+                let span = tok.span.cover(operand.span);
+                Expr::new(
+                    ExprKind::Unary {
+                        op,
+                        op_span: tok.span,
+                        expr: Box::new(operand),
+                    },
+                    span,
+                )
+            }
+
+            // Grouped expression
+            TokenKind::LParen => {
+                // Inside parentheses struct literals are always allowed.
+                let inner = self.pratt(0, true);
+                let close = self.expect(TokenKind::RParen);
+                let span = tok.span.cover(close.span);
+                Expr::new(ExprKind::Group(Box::new(inner)), span)
+            }
+
+            // Error recovery
+            _ => {
+                self.errors.push(ParseError {
+                    span: tok.span,
+                    message: format!("unexpected token {} in expression", tok.kind),
+                });
+                Expr::new(ExprKind::Error, tok.span)
+            }
+        }
+    }
+
+    // ── Left denotation (infix / postfix) ────────────────────────────────────
+
+    fn parse_led(
+        &mut self,
+        lhs: Expr,
+        op_tok: Token<'src>,
+        r_bp: u8,
+        allow_struct_lit: bool,
+    ) -> Expr {
+        // Consume the operator token.
+        self.advance();
+
+        match op_tok.kind {
+            // Field access: `expr.field`
+            TokenKind::Dot => {
+                let field_tok = self.expect(TokenKind::Ident);
+                let span = lhs.span.cover(field_tok.span);
+                Expr::new(
+                    ExprKind::Field {
+                        expr: Box::new(lhs),
+                        field: field_tok.text.to_owned(),
+                        field_span: field_tok.span,
+                    },
+                    span,
+                )
+            }
+
+            // Index: `expr[index]`
+            TokenKind::LBracket => {
+                // Inside brackets struct literals are always allowed.
+                let index = self.pratt(0, true);
+                let close = self.expect(TokenKind::RBracket);
+                let span = lhs.span.cover(close.span);
+                Expr::new(
+                    ExprKind::Index {
+                        expr: Box::new(lhs),
+                        index: Box::new(index),
+                    },
+                    span,
+                )
+            }
+
+            // Call: `expr(args…)`
+            TokenKind::LParen => {
+                let (args, close_span) = self.parse_arg_list();
+                let span = lhs.span.cover(close_span);
+                Expr::new(
+                    ExprKind::Call {
+                        callee: Box::new(lhs),
+                        args,
+                    },
+                    span,
+                )
+            }
+
+            // Binary operator
+            kind => {
+                let op = token_to_binary_op(kind);
+                let rhs = self.pratt(r_bp, allow_struct_lit);
+                let span = lhs.span.cover(rhs.span);
+                Expr::new(
+                    ExprKind::Binary {
+                        op,
+                        op_span: op_tok.span,
+                        lhs: Box::new(lhs),
+                        rhs: Box::new(rhs),
+                    },
+                    span,
+                )
+            }
+        }
+    }
+
+    // ── Struct literal ────────────────────────────────────────────────────────
+
+    /// Called after we have already parsed the leading `Ident` as `lhs` and
+    /// the current token is `{`.
+    fn parse_struct_lit(&mut self, name_expr: Expr) -> Expr {
+        let (name, name_span) = match name_expr.kind {
+            ExprKind::Ident(ref s) => (s.clone(), name_expr.span),
+            _ => unreachable!(),
+        };
+
+        self.advance(); // consume `{`
+
+        let fields = self.parse_struct_field_list();
+
+        let close = self.expect(TokenKind::RCurly);
+        let span = name_span.cover(close.span);
+        Expr::new(
+            ExprKind::StructLit {
+                name,
+                name_span,
+                fields,
+            },
+            span,
+        )
+    }
+
+    fn parse_struct_field_list(&mut self) -> Vec<StructField> {
+        let mut fields = Vec::new();
+        loop {
+            if matches!(self.current().kind, TokenKind::RCurly | TokenKind::Eof) {
+                break;
+            }
+            fields.push(self.parse_struct_field());
+            if self.current().kind == TokenKind::Comma {
+                self.advance();
+            } else {
+                break;
+            }
+        }
+        fields
+    }
+
+    fn parse_struct_field(&mut self) -> StructField {
+        let name_tok = self.expect(TokenKind::Ident);
+        self.expect(TokenKind::Colon);
+        // Struct literals allowed inside field values.
+        let value = self.pratt(0, true);
+        StructField {
+            name: name_tok.text.to_owned(),
+            name_span: name_tok.span,
+            value,
+        }
+    }
+
+    // ── Argument list ─────────────────────────────────────────────────────────
+
+    /// Parse `arg, arg, …` up to `)`. The opening `(` has already been
+    /// consumed by `parse_led`. Returns `(args, close_span)`.
+    fn parse_arg_list(&mut self) -> (Vec<Expr>, Span) {
+        let mut args = Vec::new();
+        loop {
+            if matches!(self.current().kind, TokenKind::RParen | TokenKind::Eof) {
+                break;
+            }
+            // Struct literals allowed inside argument lists.
+            args.push(self.pratt(0, true));
+            if self.current().kind == TokenKind::Comma {
+                self.advance();
+            } else {
+                break;
+            }
+        }
+        let close = self.expect(TokenKind::RParen);
+        (args, close.span)
+    }
+}
+
+// ── Tests ──────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn parse(src: &str) -> Expr {
+        Parser::new(src).parse_expr(true)
+    }
+
+    fn parse_no_struct(src: &str) -> Expr {
+        Parser::new(src).parse_expr(false)
+    }
+
+    #[test]
+    fn int_literal() {
+        let expr = parse("42");
+        assert!(matches!(expr.kind, ExprKind::IntLit(ref s) if s == "42"));
+    }
+
+    #[test]
+    fn float_literal() {
+        let expr = parse("3.14");
+        assert!(matches!(expr.kind, ExprKind::FloatLit(ref s) if s == "3.14"));
+    }
+
+    #[test]
+    fn bool_literals() {
+        assert!(matches!(parse("true").kind, ExprKind::Bool(true)));
+        assert!(matches!(parse("false").kind, ExprKind::Bool(false)));
+    }
+
+    #[test]
+    fn ident() {
+        let expr = parse("foo");
+        assert!(matches!(expr.kind, ExprKind::Ident(ref s) if s == "foo"));
+    }
+
+    #[test]
+    fn unary_neg() {
+        let expr = parse("-42");
+        assert!(matches!(
+            expr.kind,
+            ExprKind::Unary {
+                op: UnaryOp::Neg,
+                ..
+            }
+        ));
+    }
+
+    #[test]
+    fn unary_not() {
+        let expr = parse("!x");
+        assert!(matches!(
+            expr.kind,
+            ExprKind::Unary {
+                op: UnaryOp::Not,
+                ..
+            }
+        ));
+    }
+
+    #[test]
+    fn binary_add() {
+        let expr = parse("a + b");
+        assert!(matches!(
+            expr.kind,
+            ExprKind::Binary {
+                op: BinaryOp::Add,
+                ..
+            }
+        ));
+    }
+
+    #[test]
+    fn binary_precedence() {
+        // `a + b * c` should parse as `a + (b * c)`
+        let expr = parse("a + b * c");
+        match &expr.kind {
+            ExprKind::Binary {
+                op: BinaryOp::Add,
+                lhs,
+                rhs,
+                ..
+            } => {
+                assert!(matches!(lhs.kind, ExprKind::Ident(ref s) if s == "a"));
+                assert!(matches!(
+                    rhs.kind,
+                    ExprKind::Binary {
+                        op: BinaryOp::Mul,
+                        ..
+                    }
+                ));
+            }
+            _ => panic!("expected binary add, got {:?}", expr.kind),
+        }
+    }
+
+    #[test]
+    fn comparison() {
+        let expr = parse("a == b");
+        assert!(matches!(
+            expr.kind,
+            ExprKind::Binary {
+                op: BinaryOp::Eq,
+                ..
+            }
+        ));
+    }
+
+    #[test]
+    fn logical_and_or() {
+        // `a or b and c` → `a or (b and c)` (and binds tighter)
+        let expr = parse("a or b and c");
+        match &expr.kind {
+            ExprKind::Binary {
+                op: BinaryOp::Or,
+                rhs,
+                ..
+            } => {
+                assert!(matches!(
+                    rhs.kind,
+                    ExprKind::Binary {
+                        op: BinaryOp::And,
+                        ..
+                    }
+                ));
+            }
+            _ => panic!("expected or at top level"),
+        }
+    }
+
+    #[test]
+    fn grouped_expr() {
+        let expr = parse("(a + b)");
+        assert!(matches!(expr.kind, ExprKind::Group(_)));
+    }
+
+    #[test]
+    fn field_access() {
+        let expr = parse("foo.bar");
+        assert!(matches!(expr.kind, ExprKind::Field { ref field, .. } if field == "bar"));
+    }
+
+    #[test]
+    fn index_expr() {
+        let expr = parse("arr[0]");
+        assert!(matches!(expr.kind, ExprKind::Index { .. }));
+    }
+
+    #[test]
+    fn call_no_args() {
+        let expr = parse("foo()");
+        match &expr.kind {
+            ExprKind::Call { args, .. } => assert!(args.is_empty()),
+            _ => panic!("expected call"),
+        }
+    }
+
+    #[test]
+    fn call_with_args() {
+        let expr = parse("foo(1, 2, 3)");
+        match &expr.kind {
+            ExprKind::Call { args, .. } => assert_eq!(args.len(), 3),
+            _ => panic!("expected call"),
+        }
+    }
+
+    #[test]
+    fn struct_literal() {
+        let expr = parse("Foo { x: 1, y: 2 }");
+        match &expr.kind {
+            ExprKind::StructLit { name, fields, .. } => {
+                assert_eq!(name, "Foo");
+                assert_eq!(fields.len(), 2);
+            }
+            _ => panic!("expected struct literal, got {:?}", expr.kind),
+        }
+    }
+
+    #[test]
+    fn struct_literal_disabled() {
+        // With allow_struct_literals=false, `Foo { ... }` should NOT be a
+        // struct literal — the Ident is parsed alone and `{` is left unconsumed.
+        let expr = parse_no_struct("Foo { x: 1 }");
+        assert!(matches!(expr.kind, ExprKind::Ident(ref s) if s == "Foo"));
+    }
+
+    #[test]
+    fn chained_field_access() {
+        let expr = parse("a.b.c");
+        match &expr.kind {
+            ExprKind::Field {
+                expr: inner, field, ..
+            } => {
+                assert_eq!(field, "c");
+                assert!(matches!(inner.kind, ExprKind::Field { ref field, .. } if field == "b"));
+            }
+            _ => panic!("expected field access"),
+        }
+    }
+
+    #[test]
+    fn deref_and_addrof() {
+        assert!(matches!(
+            parse("*p").kind,
+            ExprKind::Unary {
+                op: UnaryOp::Deref,
+                ..
+            }
+        ));
+        assert!(matches!(
+            parse("&x").kind,
+            ExprKind::Unary {
+                op: UnaryOp::AddrOf,
+                ..
+            }
+        ));
+    }
+}