Compare commits

..

2 Commits

Author SHA1 Message Date
1107c7d93d feat: Add support for let and expression statements.
This commit implements parsing for `let` statements, anything that
is not a let statement will be assumed to be an expression statement.
2026-03-12 21:23:09 +01:00
bb9cb8d2d1 docs: Add and improve documentation for every module. 2026-03-12 20:44:41 +01:00
8 changed files with 497 additions and 75 deletions

View File

@@ -1,15 +1,34 @@
//! Abstract Syntax Tree (AST) definitions.
//!
//! The AST is parameterised over a [`Phase`] type-state so that the same node
//! types can carry different amounts of information at different compiler
//! stages. Currently only the [`Parsed`] phase exists, which attaches no
//! extra data (`()`) to each node.
//!
//! The primary node families are:
//! - [`Expression`] / [`ExpressionKind`] — value-producing constructs.
//! - [`Type`] / [`TypeKind`] — type annotations.
//! - [`Statement`] / [`StatementKind`] — top-level and block-level statements.
use std::fmt::Debug;
use crate::token::Span;
/// Marker trait that carries phase-specific associated data for AST nodes.
///
/// Each phase defines an [`ExtraData`](Phase::ExtraData) type that is embedded
/// in every node. This allows later compiler passes (e.g. type-checking) to
/// augment the tree without duplicating the node hierarchy.
///
/// Current phases:
/// - [`Parsed`] — produced directly by the parser; no extra data.
pub trait Phase {
    /// Per-node payload for this phase. The `PartialEq + Debug` bounds let
    /// the node types keep their `#[derive(Debug, PartialEq)]` impls.
    type ExtraData: PartialEq + Debug;
}
/// The initial AST phase produced by the parser.
///
/// In this phase [`Phase::ExtraData`] is `()`, meaning nodes carry only
/// syntactic information (kind + source span).
#[derive(Debug)]
pub struct Parsed;
@@ -17,10 +36,15 @@ impl Phase for Parsed {
type ExtraData = ();
}
/// Convenience alias for an [`Expression`] in the [`Parsed`] phase.
pub type ParsedExpression = Expression<Parsed>;
/// This represents an expression in the source code. It holds the
/// [ExpressionKind], the [Span] and extra information according to the [Phase].
/// A value-producing node in the AST.
///
/// Every expression carries:
/// - [`kind`](Expression::kind) — what *kind* of expression it is.
/// - [`span`](Expression::span) — the source location it was parsed from.
/// - [`extra`](Expression::extra) — phase-specific data (see [`Phase`]).
#[derive(Debug, PartialEq)]
pub struct Expression<P: Phase> {
pub kind: ExpressionKind<P>,
@@ -28,109 +52,125 @@ pub struct Expression<P: Phase> {
pub extra: P::ExtraData,
}
/// The concrete variant of an [`Expression`], e.g. a literal, unary or
/// binary expression.
#[derive(Debug, PartialEq)]
pub enum ExpressionKind<P: Phase> {
    /// A bare name, e.g. `foo`.
    Identifier(String),
    /// A string literal, e.g. `"hello"`.
    LitString(String),
    /// An integer literal, e.g. `42`, `0xFF`, `0b1010`. The value is stored
    /// as a `u64` regardless of the source radix.
    LitInteger(u64),
    /// A boolean literal: `true` or `false`.
    LitBool(bool),
    /// A prefix unary expression, e.g. `-x`, `!cond`, `*ptr`.
    Unary {
        /// The operator applied to `operand`.
        op: UnaryOp,
        /// Source span of the operator token itself.
        op_span: Span,
        /// The expression the operator is applied to.
        operand: Box<Expression<P>>,
    },
    /// An infix binary expression, e.g. `a + b`, `x == y`.
    Binary {
        /// The operator between `left` and `right`.
        op: BinaryOp,
        /// Source span of the operator token itself.
        op_span: Span,
        /// Left-hand operand.
        left: Box<Expression<P>>,
        /// Right-hand operand.
        right: Box<Expression<P>>,
    },
    /// A function call, e.g. `f(a, b)`.
    Call {
        /// The callee expression (often an [`Identifier`](ExpressionKind::Identifier)).
        func: Box<Expression<P>>,
        /// Argument expressions in source order.
        args: Vec<Expression<P>>,
    },
    /// An index expression, e.g. `arr[i]`.
    Index {
        /// The collection being indexed.
        expr: Box<Expression<P>>,
        /// The index expression inside the brackets.
        index: Box<Expression<P>>,
    },
    /// A type-cast expression, e.g. `x as u32`.
    Cast {
        /// The value being converted.
        expr: Box<Expression<P>>,
        /// The target type to the right of `as`.
        ty: Box<Type<P>>,
    },
}
/// A prefix unary operator.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnaryOp {
    /// Bitwise complement (`~`)
    BitNot,
    /// Logical negation (`!`)
    Not,
    /// Arithmetic negation (`-`)
    Neg,
    /// Address-of (`&`)
    AddrOf,
    /// Pointer dereference (`*`)
    Deref,
}
/// An infix binary operator.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinaryOp {
    /// Addition (`+`)
    Add,
    /// Subtraction (`-`)
    Sub,
    /// Multiplication (`*`)
    Mul,
    /// Division (`/`)
    Div,
    /// Remainder (`%`)
    Rem,
    /// Bitwise AND (`&`)
    BitAnd,
    /// Bitwise OR (`|`)
    BitOr,
    /// Bitwise XOR (`^`)
    BitXor,
    /// Left shift (`<<`)
    BitShl,
    /// Right shift (`>>`)
    BitShr,
    /// Logical AND (`and`)
    And,
    /// Logical OR (`or`)
    Or,
    /// Equality (`==`)
    Eq,
    /// Inequality (`!=`)
    Ne,
    /// Less-than (`<`)
    Lt,
    /// Less-than-or-equal (`<=`)
    Le,
    /// Greater-than (`>`)
    Gt,
    /// Greater-than-or-equal (`>=`)
    Ge,
    /// Assignment (`=`)
    Assign,
    /// Member access (`.`)
    Dot,
}
/// Convenience alias for a [`Type`] in the [`Parsed`] phase.
pub type ParsedType = Type<Parsed>;
/// A type annotation node in the AST.
#[derive(Debug, PartialEq)]
pub struct Type<P: Phase> {
pub kind: TypeKind,
@@ -138,19 +178,65 @@ pub struct Type<P: Phase> {
pub extra: P::ExtraData,
}
/// The concrete variant of a [`Type`] annotation.
#[derive(Debug, PartialEq)]
pub enum TypeKind {
    /// Signed 8-bit integer (`i8`).
    I8,
    /// Signed 16-bit integer (`i16`).
    I16,
    /// Signed 32-bit integer (`i32`).
    I32,
    /// Signed 64-bit integer (`i64`).
    I64,
    /// Unsigned 8-bit integer (`u8`).
    U8,
    /// Unsigned 16-bit integer (`u16`).
    U16,
    /// Unsigned 32-bit integer (`u32`).
    U32,
    /// Unsigned 64-bit integer (`u64`).
    U64,
    /// Boolean type (`bool`).
    Bool,
    /// A user-defined named type, e.g. `MyStruct`.
    Named(String),
}
/// Convenience alias for a [`Statement`] in the [`Parsed`] phase.
pub type ParsedStatement = Statement<Parsed>;
/// A statement node in the AST.
///
/// Statements are the sequential building blocks of a block body. Like
/// [`Expression`] and [`Type`], a statement is parameterised over a [`Phase`]
/// so that later compiler passes can attach additional information without
/// changing the node layout.
#[derive(Debug, PartialEq)]
pub struct Statement<P: Phase> {
    /// Which kind of statement this is (e.g. `let` binding, expression).
    pub kind: StatementKind<P>,
    /// Source region covered by the whole statement, including the trailing `;`.
    pub span: Span,
    /// Phase-specific payload (see [`Phase`]); `()` in the [`Parsed`] phase.
    pub extra: P::ExtraData,
}
/// The concrete variant of a [`Statement`].
#[derive(Debug, PartialEq)]
pub enum StatementKind<P: Phase> {
    /// A `let` binding, e.g. `let x: i32 = 0;`.
    ///
    /// Both the type annotation and the initialiser are optional at the parse
    /// stage and may be filled in or validated by later passes. The `let`
    /// keyword and the closing `;` are not stored here but are covered by
    /// [`Statement::span`].
    Let {
        /// The name of the binding.
        name: String,
        /// Source span of the name token, used for diagnostics.
        name_span: Span,
        /// Optional explicit type annotation (`let x: T`).
        ty: Option<Type<P>>,
        /// Optional initialiser expression (`= <expr>`).
        value: Option<Expression<P>>,
    },
    /// A bare expression statement, e.g. `f(x);`.
    ///
    /// The trailing `;` is not stored in the node but is included in
    /// [`Statement::span`].
    Expr(Expression<P>),
}

View File

@@ -1,7 +1,18 @@
//! Command-line interface: argument parsing, help/version output, and fatal
//! error reporting.
//!
//! The primary entry point is [`parse_args`], which parses [`std::env::args`]
//! and returns an [`Opts`] struct. If any argument is invalid or required
//! arguments are missing, it calls [`fatal`] which prints an error to `stderr`
//! and exits with code `1`.
use std::path::PathBuf;
use yansi::Paint;
/// Print the help message to `stdout`.
///
/// Describes the compiler's usage, all supported options, and the `<file>`
/// positional argument.
pub fn print_help() {
println!(
"{} {} - the bucky language compiler",
@@ -47,27 +58,52 @@ pub fn print_help() {
);
}
/// Print the compiler version string (`buckyc <version>`) to `stdout`.
pub fn print_version() {
    let version = env!("CARGO_PKG_VERSION");
    println!("buckyc {version}");
}
/// Print a formatted error message to `stderr` and exit with code `1`.
///
/// This function never returns (`-> !`). Use it for unrecoverable CLI errors
/// such as missing arguments or unknown flags, discovered before compilation
/// begins.
pub fn fatal(message: impl ToString) -> ! {
eprintln!("{}: {}", "error".bold().red(), message.to_string().bold());
std::process::exit(1);
}
/// Parsed command-line options returned by [`parse_args`].
#[derive(Debug)]
pub struct Opts {
    /// One or more source files to compile, in the order they were supplied.
    pub files: Vec<PathBuf>,
    /// `-S`: emit IR and stop (implies [`no_link`](Opts::no_link)).
    pub emit_ir: bool,
    /// `-c`: compile to an object file without invoking the linker.
    pub no_link: bool,
    /// `-o <file>`: destination path for the final output. When `None` the
    /// compiler chooses a default output name.
    pub output: Option<PathBuf>,
}
/// Parse [`std::env::args`] and return the resulting [`Opts`].
///
/// Recognised flags:
///
/// | Flag | Effect |
/// |------|--------|
/// | `-h`, `--help` | Print help and exit `0` |
/// | `-V`, `--version` | Print version and exit `0` |
/// | `-S` | Set [`emit_ir`](Opts::emit_ir) and [`no_link`](Opts::no_link) |
/// | `-c` | Set [`no_link`](Opts::no_link) |
/// | `-o <file>` | Set [`output`](Opts::output) |
/// | `<file>` | Append to [`files`](Opts::files) |
///
/// Calls [`fatal`] (and exits) if:
/// - an unknown `-`-prefixed flag is encountered, or
/// - `-o` is supplied without a following argument, or
/// - no source files are provided.
pub fn parse_args() -> Opts {
let mut files = Vec::new();
let mut no_link = false;

View File

@@ -1,14 +1,37 @@
//! Compiler diagnostic reporting with source-location context.
//!
//! This module provides [`Diagnostic`], a structured error/warning message that
//! can optionally include a source span and one or more labelled secondary
//! spans. Diagnostics are rendered to `stderr` in a rustc-inspired format:
//!
//! ```text
//! Error: undeclared variable `x`
//! --> src/main.bky:3:5
//! |
//! 3 | let y = x + 1;
//! | ^ undeclared variable
//! |
//! ```
use std::{fmt::Display, path::Path, process::exit};
use yansi::Paint;
use crate::token::Span;
/// The importance level of a [`Diagnostic`].
///
/// Variants are ordered from least to most severe so that `<` / `>` comparisons
/// work intuitively (e.g. `Severity::Warning < Severity::Error`).
// NOTE: declaration order drives the derived `Ord`:
// Note < Warning < Error < Critical. Do not reorder variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Purely informational; never causes the compiler to stop.
    Note,
    /// Something suspicious that may or may not be a problem.
    Warning,
    /// A recoverable problem that prevents successful compilation.
    Error,
    /// An unrecoverable problem; the process will exit immediately after
    /// reporting this diagnostic.
    Critical,
}
@@ -23,14 +46,29 @@ impl Display for Severity {
}
}
/// A single compiler message with optional source-location information.
///
/// Build a diagnostic with [`Diagnostic::new`], optionally attach a primary
/// source location via [`with_span`](Diagnostic::with_span), attach labelled
/// secondary locations via [`add_label`](Diagnostic::add_label), then call
/// [`report`](Diagnostic::report) to print it.
///
/// If the severity is [`Severity::Critical`], `report` will call
/// [`process::exit`](std::process::exit) after printing.
pub struct Diagnostic {
    /// How important this message is; [`Severity::Critical`] causes
    /// [`report`](Diagnostic::report) to terminate the process.
    pub severity: Severity,
    /// Primary source location, if any.
    pub span: Option<Span>,
    /// The main human-readable message, printed on the first output line.
    pub message: String,
    /// Secondary labelled spans rendered below the primary snippet.
    pub labels: Vec<(Span, String)>,
}
impl Diagnostic {
/// Create a new diagnostic with the given severity and message.
///
/// No source location is attached; use [`with_span`](Self::with_span) to
/// add one.
pub fn new(severity: Severity, message: impl ToString) -> Self {
Self {
severity,
@@ -40,16 +78,29 @@ impl Diagnostic {
}
}
/// Attach a primary source span to this diagnostic.
pub fn with_span(mut self, span: Span) -> Self {
self.span = Some(span);
self
}
/// Attach a labelled secondary span, returning the updated builder.
///
/// A label whose span exactly matches the primary span is merged into the
/// primary underline as inline text; every other label is rendered as its own
/// snippet below the primary one.
pub fn add_label(mut self, span: Span, message: impl ToString) -> Self {
    let entry = (span, message.to_string());
    self.labels.push(entry);
    self
}
/// Print this diagnostic to `stderr` and, if the severity is
/// [`Severity::Critical`], terminate the process.
///
/// # Arguments
/// * `file_name` path shown in the `-->` location line.
/// * `source` full source text of the file, used to extract line/col
/// information and to display the relevant source snippet.
pub fn report(self, file_name: &Path, source: &str) {
eprintln!("{}: {}", self.severity, self.message.bold());
@@ -165,6 +216,7 @@ fn render_snippet(
eprintln!("{pad} {bar} {spaces}{colored_carets}{label_text}");
}
/// Apply severity-appropriate ANSI colour to a string.
fn paint_severity(s: &str, severity: Severity) -> String {
match severity {
Severity::Note => format!("{}", s.bold().bright_cyan()),
@@ -173,6 +225,7 @@ fn paint_severity(s: &str, severity: Severity) -> String {
}
}
/// Returns the number of decimal digits in `n` (minimum 1).
///
/// Uses integer log10 instead of formatting `n` into a temporary `String`,
/// avoiding an allocation per call (this runs for every rendered snippet).
fn count_digits(n: usize) -> usize {
    // `checked_ilog10` is `None` for 0, which still occupies one digit.
    n.checked_ilog10().map_or(1, |log| log as usize + 1)
}
@@ -187,6 +240,10 @@ fn get_line_content(source: &str, position: u32) -> (usize, &str) {
(line_start, &rest[..line_len])
}
/// Returns the 1-based `(line, column)` for a byte `position` within `source`.
///
/// Both line and column are counted from 1. The column is measured in Unicode
/// scalar values (characters), not bytes.
fn get_line_col(source: &str, position: u32) -> (usize, usize) {
let prefix = &source[..position as usize];
let line = prefix.bytes().filter(|&b| b == b'\n').count() + 1;

View File

@@ -1,16 +1,44 @@
//! Lexer (tokeniser) that converts raw source text into a [`Token`] stream.
//!
//! [`Lexer`] implements [`Iterator<Item = Token>`] so it can be used directly
//! in a `for` loop or with iterator adaptors such as `.peekable()`.
//! Whitespace and `#`-line-comments are skipped automatically between tokens.
//!
//! # Character classes
//! - **Identifiers / keywords** — start with a
//! [XID_Start](https://unicode.org/reports/tr31/) character or `_`, continue
//! with XID_Continue characters. Reserved words are mapped to their
//! respective [`TokenKind`] variants; everything else becomes
//! [`TokenKind::Identifier`].
//! - **Integer literals** — decimal by default; `0x` / `0o` / `0b` prefixes
//! select hexadecimal, octal, and binary respectively.
//! - **String literals** — delimited by `"…"`; `\` escapes the next character.
//! - **Operators and punctuation** — single- or double-character tokens
//! dispatched via the `token!` macro with one character of lookahead.
use std::{iter::Peekable, str::Chars};
use unicode_xid::UnicodeXID;
use crate::token::{Span, Token, TokenKind};
/// A lazy iterator over the [`Token`]s of a source string.
///
/// Tokens borrow their text slice directly from the original source, so the
/// lexer lifetime `'src` must outlive any use of the produced tokens.
///
/// Construct with [`Lexer::new`] and consume via the [`Iterator`] impl or by
/// passing it to the parser.
pub struct Lexer<'src> {
    /// One-character look-ahead over the source characters.
    chars: Peekable<Chars<'src>>,
    /// The full source text, kept for slice extraction in [`make`](Self::make).
    source: &'src str,
    /// Current *byte* offset into `source` (not a character count); advanced
    /// by [`advance`](Self::advance) by each character's UTF-8 length.
    position: usize,
}
impl<'src> Lexer<'src> {
/// Creates a new [`Lexer`] positioned at the start of `source`.
pub fn new(source: &'src str) -> Self {
Self {
chars: source.chars().peekable(),
@@ -24,22 +52,29 @@ impl<'src> Lexer<'src> {
self.chars.peek().copied()
}
/// Consume and return the next character, advancing [`position`](Self::position)
/// by the character's UTF-8 byte length.
///
/// # Panics
/// Panics if called at the end of input. Always guard with
/// [`peek`](Self::peek) first.
fn advance(&mut self) -> char {
    let ch = self.chars.next().expect("failed to advance the lexer");
    self.position += ch.len_utf8();
    ch
}
/// Advance while `condition` holds, stopping at the first character for
/// which it returns `false` (or at end of input).
fn advance_while(&mut self, condition: impl FnMut(char) -> bool + Copy) {
    while self.peek().is_some_and(condition) {
        self.advance();
    }
}
/// Build a token from `[start, self.pos)`.
/// Construct a [`Token`] spanning the byte range `[start, self.position)`.
///
/// The token's `text` is a zero-copy slice of the source string.
fn make(&self, kind: TokenKind, start: usize) -> Token<'src> {
Token {
kind,
@@ -48,7 +83,11 @@ impl<'src> Lexer<'src> {
}
}
/// Skip all whitespace and comments.
/// Skip any run of whitespace followed by a `#` line comment, repeating
/// until neither is present.
///
/// Comments begin with `#` and extend to (but do not include) the
/// following `\n`.
fn skip_whitespace_and_comments(&mut self) {
loop {
self.advance_while(char::is_whitespace);
@@ -61,7 +100,12 @@ impl<'src> Lexer<'src> {
}
}
/// Lexes the next identifier token.
/// Lex the next identifier or keyword token.
///
/// Assumes the current peek character satisfies `is_xid_start() || == '_'`.
/// Consumes one XID_Start (or `_`) character followed by any number of
/// XID_Continue characters, then matches the resulting slice against the
/// keyword / type-keyword table.
fn next_identifier(&mut self) -> TokenKind {
let start = self.position;
@@ -72,6 +116,7 @@ impl<'src> Lexer<'src> {
"and" => TokenKind::KwAnd,
"or" => TokenKind::KwOr,
"as" => TokenKind::KwAs,
"let" => TokenKind::KwLet,
"u8" => TokenKind::TyU8,
"u16" => TokenKind::TyU16,
@@ -90,7 +135,12 @@ impl<'src> Lexer<'src> {
}
}
/// Lexes the next number token.
/// Lex the next integer literal token.
///
/// Assumes the current peek character is an ASCII digit. Detects an
/// optional radix prefix (`0x` → 16, `0o` → 8, `0b` → 2) then consumes
/// all subsequent digits valid for that radix. Always returns
/// [`TokenKind::LitInt`].
fn next_number(&mut self) -> TokenKind {
let radix = match self.advance() {
'0' => match self.peek() {
@@ -116,7 +166,15 @@ impl<'src> Lexer<'src> {
TokenKind::LitInt
}
/// Lexes the next string token.
/// Lex the next string literal token.
///
/// Assumes the current peek character is `"`. Consumes characters until
/// a closing (unescaped) `"` is found or input is exhausted. A `\`
/// escapes the immediately following character, preventing it from being
/// treated as a closing delimiter. Always returns [`TokenKind::LitString`].
///
/// Note: escape sequences are not validated here; that is left to a later
/// compiler stage.
fn next_string(&mut self) -> TokenKind {
let mut escaped = false;
@@ -144,11 +202,25 @@ impl<'src> Lexer<'src> {
impl<'src> Iterator for Lexer<'src> {
type Item = Token<'src>;
/// Returns the next [`Token`], or `None` when the source is exhausted.
///
/// Leading whitespace and `#`-comments are skipped before each token.
/// Multi-character operator tokens (`->`, `<<`, `<=`, …) are resolved with
/// a single character of lookahead via the `token!` macro. Unrecognised
/// characters are returned as [`TokenKind::Unknown`].
fn next(&mut self) -> Option<Self::Item> {
self.skip_whitespace_and_comments();
let start = self.position;
/// Builds and evaluates a [`TokenKind`] from the current position.
///
/// Three forms:
/// - `token!($kind)` — single-character token: advance once, yield `$kind`.
/// - `token!($c => $kind, … ; $default)` — multi-character token with
/// lookahead: advance once (consuming the lead character), then
/// check the next character against each `$c => $kind` arm in order,
/// falling back to `$default` if none match.
macro_rules! token {
// Case 1: Simple token (no lookahead)
($default:expr) => {{

View File

@@ -30,7 +30,7 @@ fn main() {
println!("-- {} --", file.display());
let mut parser = Parser::new(&content);
match parser.parse_expression(0) {
match parser.parse_statement() {
Ok(ast) => println!("{ast:#?}"),
Err(diag) => diag.report(file, &content),
}

View File

@@ -1,3 +1,9 @@
//! Recursive-descent / Pratt parser that converts a token stream into an AST.
//!
//! The entry points are [`Parser::parse_statement`], [`Parser::parse_type`],
//! and [`Parser::parse_expression`].
//! Errors are represented as [`Diagnostic`] values; the caller is responsible
//! for reporting them.
use std::iter::Peekable;
use crate::ast;
@@ -5,15 +11,24 @@ use crate::diagnostic::{Diagnostic, Severity};
use crate::lexer::Lexer;
use crate::token::{Token, TokenKind};
/// Consumes the [`Token`] stream produced by the [`Lexer`] and constructs an
/// AST in the [`ast::Parsed`] phase.
///
/// The parser uses a single token of look-ahead (peek) for all decisions.
/// Expression parsing is implemented with the
/// [Pratt / top-down operator-precedence](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html)
/// algorithm; binding-power tables are defined by [`infix_binding_power`],
/// [`prefix_binding_power`], and [`postfix_binding_power`].
pub struct Parser<'src> {
    /// Lazily-lexed token stream with one token of look-ahead.
    tokens: Peekable<Lexer<'src>>,
    /// Diagnostics accumulated during parsing. Non-fatal errors are pushed here
    /// so that the parser can attempt to continue and surface multiple issues
    /// in a single pass.
    errors: Vec<Diagnostic>,
}
impl<'src> Parser<'src> {
/// Constructs a new [Parser] with the given source text.
/// Constructs a new [`Parser`] with the given source text.
pub fn new(source: &'src str) -> Self {
Self {
tokens: Lexer::new(source).peekable(),
@@ -21,35 +36,40 @@ impl<'src> Parser<'src> {
}
}
/// Peek at the next [`Token`] without consuming it.
fn peek(&mut self) -> Option<Token<'src>> {
    self.tokens.peek().copied()
}
/// Peek at the next [`Token`], returning an [`Err`] diagnostic if the
/// token stream is exhausted.
fn peek_no_eof(&mut self) -> Result<Token<'src>, Diagnostic> {
    self.peek()
        .ok_or_else(|| Diagnostic::new(Severity::Error, "unexpected end of input"))
}
/// Returns `true` if the next token has the given [`TokenKind`].
fn is_peek(&mut self, kind: TokenKind) -> bool {
    // `is_some_and` matches the style already used by the lexer and avoids
    // the `map_or(false, ..)` pattern flagged by clippy.
    self.peek().is_some_and(|tok| tok.is(kind))
}
/// Returns `true` if the token stream is exhausted.
fn is_at_eof(&mut self) -> bool {
    self.peek().is_none()
}
/// Consumes and returns the next [`Token`].
///
/// # Panics
/// Panics if called at the end of input. Always check [`is_at_eof`](Self::is_at_eof)
/// or use [`peek_no_eof`](Self::peek_no_eof) / [`expect`](Self::expect) in
/// production code paths.
fn advance(&mut self) -> Token<'src> {
    self.tokens.next().expect("failed to advance the parser")
}
/// Consumes and returns the next [Token], if it is of a given [TokenKind],
/// otherwise returns an [Err].
/// Consumes and returns the next [`Token`] if it matches `kind`; otherwise
/// returns an [`Err`] diagnostic that points at the offending token.
fn expect(&mut self, kind: TokenKind) -> Result<Token<'src>, Diagnostic> {
match self.peek() {
Some(tok) if tok.is(kind) => Ok(self.advance()),
@@ -63,8 +83,11 @@ impl<'src> Parser<'src> {
}
}
/// Skips [Token]s until we reach a neutral statement boundary, so that
/// subsequent statements can still be parsed cleanly.
/// Error-recovery helper: skips tokens until a statement boundary is
/// reached so that subsequent statements can still be parsed cleanly.
///
/// Stops *after* consuming a `;`, or *before* consuming a `}`. This keeps
/// nested blocks intact when recovering inside function bodies.
fn synchronize(&mut self) {
while let Some(peek) = self.peek() {
match peek.kind {
@@ -82,6 +105,76 @@ impl<'src> Parser<'src> {
}
}
/// Parses the next statement.
///
/// Dispatches on the leading token: `let` is handed to
/// [`parse_let_statement`](Self::parse_let_statement); anything else is
/// parsed as an expression followed by a mandatory `;`.
pub fn parse_statement(&mut self) -> Result<ast::ParsedStatement, Diagnostic> {
    if self.peek_no_eof()?.is(TokenKind::KwLet) {
        return self.parse_let_statement();
    }

    // Expression statement: `<expr> ;`
    let expr = self.parse_expression(0)?;
    let semi = self.expect(TokenKind::Semi)?;
    Ok(ast::ParsedStatement {
        // The statement span runs from the expression through the `;`.
        span: expr.span.extend(semi.span),
        kind: ast::StatementKind::Expr(expr),
        extra: (),
    })
}
/// Parses a `let` binding statement: `let <name>[: <type>] [= <expr>];`.
///
/// Both the type annotation and the initialiser are optional. The statement
/// span runs from the `let` keyword through to the closing `;`.
fn parse_let_statement(&mut self) -> Result<ast::ParsedStatement, Diagnostic> {
    let let_token = self.expect(TokenKind::KwLet)?;

    let ident = self.expect(TokenKind::Identifier)?;
    let name = ident.text.to_string();
    let name_span = ident.span;

    // Optional `: <type>` annotation.
    let mut ty = None;
    if self.is_peek(TokenKind::Colon) {
        self.advance();
        ty = Some(self.parse_type()?);
    }

    // Optional `= <expr>` initialiser.
    let mut value = None;
    if self.is_peek(TokenKind::Assign) {
        self.advance();
        value = Some(self.parse_expression(0)?);
    }

    let semi = self.expect(TokenKind::Semi)?;
    Ok(ast::ParsedStatement {
        kind: ast::StatementKind::Let {
            name,
            name_span,
            ty,
            value,
        },
        span: let_token.span.extend(semi.span),
        extra: (),
    })
}
/// Parses a type annotation, e.g. `u8`, `i64`, `bool`, or a user-defined
/// named type.
///
/// Returns an [`Err`] diagnostic if the next token is not a valid type.
pub fn parse_type(&mut self) -> Result<ast::ParsedType, Diagnostic> {
let peek = self.peek_no_eof()?;
@@ -112,7 +205,24 @@ impl<'src> Parser<'src> {
})
}
/// Parses an [ast::Expression] using the pratt parsing algorithm.
/// Parses an expression using the Pratt (top-down operator-precedence)
/// algorithm.
///
/// `min_bp` is the minimum *left* binding power the next infix/postfix
/// operator must have to be incorporated into the current expression. Pass
/// `0` to parse a full expression with no restrictions.
///
/// The precedence hierarchy (low → high) is:
/// - assignment (`=`)
/// - logical `or` / `and`
/// - bitwise `|` / `^` / `&`
/// - equality (`==`, `!=`) and comparison (`<`, `<=`, `>`, `>=`)
/// - addition / subtraction
/// - shifts (`<<`, `>>`)
/// - multiplication / division / remainder
/// - member access (`.`)
/// - postfix: call `()`, index `[]`, cast `as`
/// - prefix: `-`, `&`, `~`, `*`, `!`
pub fn parse_expression(&mut self, min_bp: u8) -> Result<ast::ParsedExpression, Diagnostic> {
let peek_token = self.peek_no_eof()?;
@@ -182,7 +292,11 @@ impl<'src> Parser<'src> {
Ok(left)
}
/// Parses a primary expression, e.g. literals, unary or grouped expression.
/// Parses a primary (non-operator) expression: an identifier, integer
/// literal, boolean literal, or a parenthesised expression.
///
/// Integer literals support `0x` (hex), `0o` (octal), and `0b` (binary)
/// prefixes in addition to plain decimal.
fn parse_primary_expression(&mut self) -> Result<ast::ParsedExpression, Diagnostic> {
let peek_token = self.peek_no_eof()?;
@@ -252,7 +366,10 @@ impl<'src> Parser<'src> {
}
}
/// Parses a [ast::ExpressionKind::Call] expression.
/// Parses a function-call expression `func(arg, …)`.
///
/// The opening `(` is consumed here; `func` is the already-parsed callee
/// expression passed in from the Pratt loop.
fn parse_call_expr(
&mut self,
func: ast::ParsedExpression,
@@ -281,7 +398,10 @@ impl<'src> Parser<'src> {
})
}
/// Parses an [ast::ExpressionKind::Index] expression.
/// Parses an index expression `expr[index]`.
///
/// The opening `[` is consumed here; `expr` is the already-parsed
/// collection expression passed in from the Pratt loop.
fn parse_index_expr(
&mut self,
expr: ast::ParsedExpression,
@@ -303,7 +423,10 @@ impl<'src> Parser<'src> {
})
}
/// Parses an [ast::ExpressionKind::Cast] expression.
/// Parses a cast expression `expr as Type`.
///
/// The `as` keyword is consumed here; `expr` is the already-parsed value
/// expression passed in from the Pratt loop.
fn parse_cast_expr(
&mut self,
expr: ast::ParsedExpression,
@@ -324,6 +447,12 @@ impl<'src> Parser<'src> {
}
}
/// Returns `(left_bp, right_bp, op)` for infix operators, or `None` if `kind`
/// is not an infix operator.
///
/// The two binding-power values implement associativity: `right_bp = left_bp`
/// makes an operator right-associative (currently used for `=`), while a
/// `right_bp` greater than `left_bp` makes it left-associative.
fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> {
Some(match kind {
TokenKind::Assign => (2, 2, ast::BinaryOp::Assign),
@@ -359,6 +488,11 @@ fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> {
})
}
/// Returns `(right_bp, op)` for prefix operators, or `None` if `kind` is not
/// a prefix operator.
///
/// All prefix operators currently share the same binding power (`80`), giving
/// them higher precedence than any binary operator.
fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> {
Some(match kind {
TokenKind::Minus => (80, ast::UnaryOp::Neg),
@@ -371,6 +505,12 @@ fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> {
})
}
/// Returns the *left* binding power for postfix operators, or `None` if `kind`
/// is not a postfix operator.
///
/// Postfix operators (`()`, `[]`, `as`) bind tighter than all binary operators
/// but are checked before prefix operators in the Pratt loop so they always
/// apply to the nearest sub-expression.
fn postfix_binding_power(kind: TokenKind) -> Option<u8> {
Some(match kind {
TokenKind::LParen => 100,

View File

@@ -1,28 +1,50 @@
//! Token definitions used by the [`Lexer`](crate::lexer::Lexer) and
//! [`Parser`](crate::parser::Parser).
//!
//! The two core types are:
//! - [`Span`] — a half-open byte range that marks a location in source text.
//! - [`Token`] — a classified slice of source text together with its span.
//!
//! [`TokenKind`] enumerates every token variant; its [`Display`](std::fmt::Display)
//! impl produces the human-readable representation used in diagnostics.
use std::fmt;
/// A half-open byte range `[start, end)` that marks a location in the source
/// string.
///
/// Positions are stored as [`u32`], which limits supported source files to
/// 4 GiB — more than sufficient for any practical source file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Span {
    /// Inclusive start byte offset.
    pub start: u32,
    /// Exclusive end byte offset.
    pub end: u32,
}
impl Span {
/// Creates a new span covering `[start, end)`.
///
/// A `const fn`, so spans can also be built in constant contexts.
pub const fn new(start: u32, end: u32) -> Self {
    Self { start, end }
}
/// Returns the length of the span in bytes.
///
/// An inverted span (`end < start`) yields `0` rather than underflowing.
pub fn len(&self) -> u32 {
    if self.end > self.start {
        self.end - self.start
    } else {
        0
    }
}
/// Returns `true` if the span covers zero bytes.
///
/// Defined in terms of [`len`](Self::len) so that an inverted span — which
/// `len` saturates to `0` — is also reported as empty, keeping the two
/// methods consistent (previously `start == end` disagreed with `len() == 0`
/// for inverted spans).
pub fn is_empty(&self) -> bool {
    self.len() == 0
}
/// Extend this [Span] to cover `other` as well.
/// Returns the smallest span that covers both `self` and `other`.
///
/// This is the union of the two ranges, useful for computing the span of a
/// parent node from its children.
pub fn extend(self, other: Self) -> Self {
Self {
start: self.start.min(other.start),
@@ -37,8 +59,10 @@ impl fmt::Display for Span {
}
}
/// This macro helps with defining the different kinds of [Token]s. It
/// simultaneously defines a variant and its [fmt::Display] implementation.
/// Simultaneously defines the [`TokenKind`] enum and its [`fmt::Display`] impl.
///
/// Each arm maps a variant name to the human-readable string used in
/// diagnostics (e.g. `` `+` ``, `identifier`).
macro_rules! define_tokens {
($($name:ident => $repr:literal),* $(,)?) => {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -69,6 +93,7 @@ define_tokens! {
KwAnd => "`and`",
KwOr => "`or`",
KwAs => "`as`",
KwLet => "`let`",
// -- Type Keywords --
TyU8 => "`u8`",
@@ -97,7 +122,7 @@ define_tokens! {
Shr => "`>>`",
Bang => "`!`",
// -- Comparision Operators --
// -- Comparison Operators --
Eq => "`==`",
Ne => "`!=`",
Lt => "`<`",
@@ -125,17 +150,23 @@ define_tokens! {
Unknown => "unknown character"
}
/// The smallest contiguous unit of source text, as produced by the
/// [`Lexer`](crate::lexer::Lexer).
///
/// A token borrows its [`text`](Token::text) slice directly from the original
/// source string, so the lifetime `'src` ties every token to that source.
#[derive(Debug, Clone, Copy)]
pub struct Token<'src> {
    /// The syntactic category of this token.
    pub kind: TokenKind,
    /// The byte range in the source string where this token appears.
    pub span: Span,
    /// The raw source text of this token (a zero-copy slice).
    pub text: &'src str,
}
impl<'src> Token<'src> {
/// Returns `true` if this token has the given [`TokenKind`].
pub fn is(&self, kind: TokenKind) -> bool {
    self.kind == kind
}

View File

@@ -1 +1 @@
foo.bar - 5 as i32
let test: i32 = foo.bar - 5 as i32;