docs: Add and improve documentation for every module.

This commit is contained in:
2026-03-12 20:44:41 +01:00
parent 4e2df32e36
commit bb9cb8d2d1
6 changed files with 384 additions and 73 deletions

View File

@@ -1,15 +1,33 @@
//! Abstract Syntax Tree (AST) definitions.
//!
//! The AST is parameterised over a [`Phase`] type-state so that the same node
//! types can carry different amounts of information at different compiler
//! stages. Currently only the [`Parsed`] phase exists, which attaches no
//! extra data (`()`) to each node.
//!
//! The two primary node families are:
//! - [`Expression`] / [`ExpressionKind`] — value-producing constructs.
//! - [`Type`] / [`TypeKind`] — type annotations.
use std::fmt::Debug; use std::fmt::Debug;
use crate::token::Span; use crate::token::Span;
/// The [Phase] trait is used for type state. The AST can be in one of multiple /// Marker trait that carries phase-specific associated data for AST nodes.
/// type states: ///
/// 1. [Parsed] - AST that was produced through parsing. /// Each phase defines an [`ExtraData`](Phase::ExtraData) type that is embedded
/// in every node. This allows later compiler passes (e.g. type-checking) to
/// augment the tree without duplicating the node hierarchy.
///
/// Current phases:
/// - [`Parsed`] — produced directly by the parser; no extra data.
pub trait Phase { pub trait Phase {
type ExtraData: PartialEq + Debug; type ExtraData: PartialEq + Debug;
} }
/// See [Phase] for more information. /// The initial AST phase produced by the parser.
///
/// In this phase [`Phase::ExtraData`] is `()`, meaning nodes carry only
/// syntactic information (kind + source span).
#[derive(Debug)] #[derive(Debug)]
pub struct Parsed; pub struct Parsed;
@@ -17,10 +35,15 @@ impl Phase for Parsed {
type ExtraData = (); type ExtraData = ();
} }
/// Convenience alias for an [`Expression`] in the [`Parsed`] phase.
pub type ParsedExpression = Expression<Parsed>; pub type ParsedExpression = Expression<Parsed>;
/// This represents an expression in the source code. It holds the /// A value-producing node in the AST.
/// [ExpressionKind], the [Span] and extra information according to the [Phase]. ///
/// Every expression carries:
/// - [`kind`](Expression::kind) — what *kind* of expression it is.
/// - [`span`](Expression::span) — the source location it was parsed from.
/// - [`extra`](Expression::extra) — phase-specific data (see [`Phase`]).
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub struct Expression<P: Phase> { pub struct Expression<P: Phase> {
pub kind: ExpressionKind<P>, pub kind: ExpressionKind<P>,
@@ -28,109 +51,125 @@ pub struct Expression<P: Phase> {
pub extra: P::ExtraData, pub extra: P::ExtraData,
} }
/// Represents the different kinds of [Expression]s, e.g. literals, unary or /// The concrete variant of an [`Expression`].
/// binary expressions.
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub enum ExpressionKind<P: Phase> { pub enum ExpressionKind<P: Phase> {
/// A bare name, e.g. `foo`.
Identifier(String), Identifier(String),
/// A string literal, e.g. `"hello"`.
LitString(String), LitString(String),
/// An integer literal, e.g. `42`, `0xFF`, `0b1010`. The value is stored
/// as a `u64` regardless of the source radix.
LitInteger(u64), LitInteger(u64),
/// A boolean literal: `true` or `false`.
LitBool(bool), LitBool(bool),
/// A prefix unary expression, e.g. `-x`, `!cond`, `*ptr`.
Unary { Unary {
op: UnaryOp, op: UnaryOp,
/// Source span of the operator token itself.
op_span: Span, op_span: Span,
operand: Box<Expression<P>>, operand: Box<Expression<P>>,
}, },
/// An infix binary expression, e.g. `a + b`, `x == y`.
Binary { Binary {
op: BinaryOp, op: BinaryOp,
/// Source span of the operator token itself.
op_span: Span, op_span: Span,
left: Box<Expression<P>>, left: Box<Expression<P>>,
right: Box<Expression<P>>, right: Box<Expression<P>>,
}, },
/// A function call, e.g. `f(a, b)`.
Call { Call {
/// The callee expression (often an [`Identifier`](ExpressionKind::Identifier)).
func: Box<Expression<P>>, func: Box<Expression<P>>,
args: Vec<Expression<P>>, args: Vec<Expression<P>>,
}, },
/// An index expression, e.g. `arr[i]`.
Index { Index {
expr: Box<Expression<P>>, expr: Box<Expression<P>>,
index: Box<Expression<P>>, index: Box<Expression<P>>,
}, },
/// A type-cast expression, e.g. `x as u32`.
Cast { Cast {
expr: Box<Expression<P>>, expr: Box<Expression<P>>,
ty: Box<Type<P>>, ty: Box<Type<P>>,
}, },
} }
/// A prefix unary operator.
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnaryOp { pub enum UnaryOp {
/// Bitwise Not /// Bitwise complement (`~`)
BitNot, BitNot,
/// Logical Not /// Logical negation (`!`)
Not, Not,
/// Negate /// Arithmetic negation (`-`)
Neg, Neg,
/// Address Of /// Address-of (`&`)
AddrOf, AddrOf,
/// Deref /// Pointer dereference (`*`)
Deref, Deref,
} }
/// An infix binary operator.
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinaryOp { pub enum BinaryOp {
/// Addition /// Addition (`+`)
Add, Add,
/// Subtraction /// Subtraction (`-`)
Sub, Sub,
/// Multiplication /// Multiplication (`*`)
Mul, Mul,
/// Division /// Division (`/`)
Div, Div,
/// Remainder /// Remainder (`%`)
Rem, Rem,
/// Bitwise And /// Bitwise AND (`&`)
BitAnd, BitAnd,
/// Bitwise Or /// Bitwise OR (`|`)
BitOr, BitOr,
/// Bitwise Xor /// Bitwise XOR (`^`)
BitXor, BitXor,
/// Bitwise Shift Left /// Left shift (`<<`)
BitShl, BitShl,
/// Bitwise Shift Right /// Right shift (`>>`)
BitShr, BitShr,
/// Logical And /// Logical AND (`and`)
And, And,
/// Logical Or /// Logical OR (`or`)
Or, Or,
/// Equal /// Equality (`==`)
Eq, Eq,
/// Not Equal /// Inequality (`!=`)
Ne, Ne,
/// Less than /// Less-than (`<`)
Lt, Lt,
/// Less than or Equal /// Less-than-or-equal (`<=`)
Le, Le,
/// Greater than /// Greater-than (`>`)
Gt, Gt,
/// Greater than or Equal /// Greater-than-or-equal (`>=`)
Ge, Ge,
/// Assign /// Assignment (`=`)
Assign, Assign,
/// Member Access /// Member access (`.`)
Dot, Dot,
} }
/// Convenience alias for a [`Type`] in the [`Parsed`] phase.
pub type ParsedType = Type<Parsed>; pub type ParsedType = Type<Parsed>;
/// A type annotation node in the AST.
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub struct Type<P: Phase> { pub struct Type<P: Phase> {
pub kind: TypeKind, pub kind: TypeKind,
@@ -138,19 +177,24 @@ pub struct Type<P: Phase> {
pub extra: P::ExtraData, pub extra: P::ExtraData,
} }
/// The concrete variant of a [`Type`] annotation.
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub enum TypeKind { pub enum TypeKind {
/// Signed integers
I8, I8,
I16, I16,
I32, I32,
I64, I64,
/// Unsigned integers
U8, U8,
U16, U16,
U32, U32,
U64, U64,
/// Boolean type (`bool`)
Bool, Bool,
/// A user-defined named type, e.g. `MyStruct`.
Named(String), Named(String),
} }

View File

@@ -1,7 +1,18 @@
//! Command-line interface: argument parsing, help/version output, and fatal
//! error reporting.
//!
//! The primary entry point is [`parse_args`], which parses [`std::env::args`]
//! and returns an [`Opts`] struct. If any argument is invalid or required
//! arguments are missing, it calls [`fatal`] which prints an error to `stderr`
//! and exits with code `1`.
use std::path::PathBuf; use std::path::PathBuf;
use yansi::Paint; use yansi::Paint;
/// Print the help message to `stdout`.
///
/// Describes the compiler's usage, all supported options, and the `<file>`
/// positional argument.
pub fn print_help() { pub fn print_help() {
println!( println!(
"{} {} - the bucky language compiler", "{} {} - the bucky language compiler",
@@ -47,27 +58,52 @@ pub fn print_help() {
); );
} }
/// Print the compiler version string (`buckyc <version>`) to `stdout`.
pub fn print_version() { pub fn print_version() {
println!("buckyc {}", env!("CARGO_PKG_VERSION")); println!("buckyc {}", env!("CARGO_PKG_VERSION"));
} }
/// Print a formatted error message to `stderr` and exit with code `1`.
///
/// This function never returns (`-> !`). Use it for unrecoverable CLI errors
/// such as missing arguments or unknown flags, discovered before compilation
/// begins.
pub fn fatal(message: impl ToString) -> ! { pub fn fatal(message: impl ToString) -> ! {
eprintln!("{}: {}", "error".bold().red(), message.to_string().bold()); eprintln!("{}: {}", "error".bold().red(), message.to_string().bold());
std::process::exit(1); std::process::exit(1);
} }
/// Parsed command-line options returned by [`parse_args`].
#[derive(Debug)] #[derive(Debug)]
pub struct Opts { pub struct Opts {
/// The list of files passed to the compiler. /// One or more source files to compile, in the order they were supplied.
pub files: Vec<PathBuf>, pub files: Vec<PathBuf>,
/// `-S`: emit IR and stop (implies `-c`). /// `-S`: emit IR and stop (implies [`no_link`](Opts::no_link)).
pub emit_ir: bool, pub emit_ir: bool,
/// `-c`: compile source to object file without linking. /// `-c`: compile to an object file without invoking the linker.
pub no_link: bool, pub no_link: bool,
/// `-o <file>`: write final output to this path. /// `-o <file>`: destination path for the final output. When `None` the
/// compiler chooses a default output name.
pub output: Option<PathBuf>, pub output: Option<PathBuf>,
} }
/// Parse [`std::env::args`] and return the resulting [`Opts`].
///
/// Recognised flags:
///
/// | Flag | Effect |
/// |------|--------|
/// | `-h`, `--help` | Print help and exit `0` |
/// | `-V`, `--version` | Print version and exit `0` |
/// | `-S` | Set [`emit_ir`](Opts::emit_ir) and [`no_link`](Opts::no_link) |
/// | `-c` | Set [`no_link`](Opts::no_link) |
/// | `-o <file>` | Set [`output`](Opts::output) |
/// | `<file>` | Append to [`files`](Opts::files) |
///
/// Calls [`fatal`] (and exits) if:
/// - an unknown `-`-prefixed flag is encountered, or
/// - `-o` is supplied without a following argument, or
/// - no source files are provided.
pub fn parse_args() -> Opts { pub fn parse_args() -> Opts {
let mut files = Vec::new(); let mut files = Vec::new();
let mut no_link = false; let mut no_link = false;

View File

@@ -1,14 +1,37 @@
//! Compiler diagnostic reporting with source-location context.
//!
//! This module provides [`Diagnostic`], a structured error/warning message that
//! can optionally include a source span and one or more labelled secondary
//! spans. Diagnostics are rendered to `stderr` in a rustc-inspired format:
//!
//! ```text
//! Error: undeclared variable `x`
//! --> src/main.bky:3:5
//! |
//! 3 | let y = x + 1;
//! | ^ undeclared variable
//! |
//! ```
use std::{fmt::Display, path::Path, process::exit}; use std::{fmt::Display, path::Path, process::exit};
use yansi::Paint; use yansi::Paint;
use crate::token::Span; use crate::token::Span;
/// The importance level of a [`Diagnostic`].
///
/// Variants are ordered from least to most severe so that `<` / `>` comparisons
/// work intuitively (e.g. `Severity::Warning < Severity::Error`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity { pub enum Severity {
/// Purely informational; never causes the compiler to stop.
Note, Note,
/// Something suspicious that may or may not be a problem.
Warning, Warning,
/// A recoverable problem that prevents successful compilation.
Error, Error,
/// An unrecoverable problem; the process will exit immediately after
/// reporting this diagnostic.
Critical, Critical,
} }
@@ -23,14 +46,29 @@ impl Display for Severity {
} }
} }
/// A single compiler message with optional source-location information.
///
/// Build a diagnostic with [`Diagnostic::new`], optionally attach a primary
/// source location via [`with_span`](Diagnostic::with_span), attach labelled
/// secondary locations via [`add_label`](Diagnostic::add_label), then call
/// [`report`](Diagnostic::report) to print it.
///
/// If the severity is [`Severity::Critical`], `report` will call
/// [`process::exit`](std::process::exit) after printing.
pub struct Diagnostic { pub struct Diagnostic {
pub severity: Severity, pub severity: Severity,
/// Primary source location, if any.
pub span: Option<Span>, pub span: Option<Span>,
pub message: String, pub message: String,
/// Secondary labelled spans rendered below the primary snippet.
pub labels: Vec<(Span, String)>, pub labels: Vec<(Span, String)>,
} }
impl Diagnostic { impl Diagnostic {
/// Create a new diagnostic with the given severity and message.
///
/// No source location is attached; use [`with_span`](Self::with_span) to
/// add one.
pub fn new(severity: Severity, message: impl ToString) -> Self { pub fn new(severity: Severity, message: impl ToString) -> Self {
Self { Self {
severity, severity,
@@ -40,16 +78,29 @@ impl Diagnostic {
} }
} }
/// Attach a primary source span to this diagnostic.
pub fn with_span(mut self, span: Span) -> Self { pub fn with_span(mut self, span: Span) -> Self {
self.span = Some(span); self.span = Some(span);
self self
} }
/// Attach a labelled secondary span.
///
/// Labels whose span matches the primary span exactly are merged into the
/// primary underline as inline text. All other labels are rendered as
/// separate snippets below the primary one.
pub fn add_label(mut self, span: Span, message: impl ToString) -> Self { pub fn add_label(mut self, span: Span, message: impl ToString) -> Self {
self.labels.push((span, message.to_string())); self.labels.push((span, message.to_string()));
self self
} }
/// Print this diagnostic to `stderr` and, if the severity is
/// [`Severity::Critical`], terminate the process.
///
/// # Arguments
/// * `file_name` - path shown in the `-->` location line.
/// * `source` - full source text of the file, used to extract line/col
/// information and to display the relevant source snippet.
pub fn report(self, file_name: &Path, source: &str) { pub fn report(self, file_name: &Path, source: &str) {
eprintln!("{}: {}", self.severity, self.message.bold()); eprintln!("{}: {}", self.severity, self.message.bold());
@@ -165,6 +216,7 @@ fn render_snippet(
eprintln!("{pad} {bar} {spaces}{colored_carets}{label_text}"); eprintln!("{pad} {bar} {spaces}{colored_carets}{label_text}");
} }
/// Apply severity-appropriate ANSI colour to a string.
fn paint_severity(s: &str, severity: Severity) -> String { fn paint_severity(s: &str, severity: Severity) -> String {
match severity { match severity {
Severity::Note => format!("{}", s.bold().bright_cyan()), Severity::Note => format!("{}", s.bold().bright_cyan()),
@@ -173,6 +225,7 @@ fn paint_severity(s: &str, severity: Severity) -> String {
} }
} }
/// Returns the number of decimal digits in `n` (minimum 1).
fn count_digits(n: usize) -> usize { fn count_digits(n: usize) -> usize {
format!("{n}").len() format!("{n}").len()
} }
@@ -187,6 +240,10 @@ fn get_line_content(source: &str, position: u32) -> (usize, &str) {
(line_start, &rest[..line_len]) (line_start, &rest[..line_len])
} }
/// Returns the 1-based `(line, column)` for a byte `position` within `source`.
///
/// Both line and column are counted from 1. The column is measured in Unicode
/// scalar values (characters), not bytes.
fn get_line_col(source: &str, position: u32) -> (usize, usize) { fn get_line_col(source: &str, position: u32) -> (usize, usize) {
let prefix = &source[..position as usize]; let prefix = &source[..position as usize];
let line = prefix.bytes().filter(|&b| b == b'\n').count() + 1; let line = prefix.bytes().filter(|&b| b == b'\n').count() + 1;

View File

@@ -1,16 +1,44 @@
//! Lexer (tokeniser) that converts raw source text into a [`Token`] stream.
//!
//! [`Lexer`] implements [`Iterator<Item = Token>`] so it can be used directly
//! in a `for` loop or with iterator adaptors such as `.peekable()`.
//! Whitespace and `#`-line-comments are skipped automatically between tokens.
//!
//! # Character classes
//! - **Identifiers / keywords** — start with a
//! [XID_Start](https://unicode.org/reports/tr31/) character or `_`, continue
//! with XID_Continue characters. Reserved words are mapped to their
//! respective [`TokenKind`] variants; everything else becomes
//! [`TokenKind::Identifier`].
//! - **Integer literals** — decimal by default; `0x` / `0o` / `0b` prefixes
//! select hexadecimal, octal, and binary respectively.
//! - **String literals** — delimited by `"…"`; `\` escapes the next character.
//! - **Operators and punctuation** — single- or double-character tokens
//! dispatched via the `token!` macro with one character of lookahead.
use std::{iter::Peekable, str::Chars}; use std::{iter::Peekable, str::Chars};
use unicode_xid::UnicodeXID; use unicode_xid::UnicodeXID;
use crate::token::{Span, Token, TokenKind}; use crate::token::{Span, Token, TokenKind};
/// A lazy iterator over the [`Token`]s of a source string.
///
/// Tokens borrow their text slice directly from the original source, so the
/// source string (lifetime `'src`) must outlive any use of the produced tokens.
///
/// Construct with [`Lexer::new`] and consume via the [`Iterator`] impl or by
/// passing it to the parser.
pub struct Lexer<'src> { pub struct Lexer<'src> {
/// One-character look-ahead over the source characters.
chars: Peekable<Chars<'src>>, chars: Peekable<Chars<'src>>,
/// The full source text, kept for slice extraction in [`make`](Self::make).
source: &'src str, source: &'src str,
/// Current byte offset into `source`. Advanced by [`advance`](Self::advance).
position: usize, position: usize,
} }
impl<'src> Lexer<'src> { impl<'src> Lexer<'src> {
/// Creates a new [`Lexer`] positioned at the start of `source`.
pub fn new(source: &'src str) -> Self { pub fn new(source: &'src str) -> Self {
Self { Self {
chars: source.chars().peekable(), chars: source.chars().peekable(),
@@ -24,22 +52,29 @@ impl<'src> Lexer<'src> {
self.chars.peek().copied() self.chars.peek().copied()
} }
/// Consume and return the next character. /// Consume and return the next character, advancing [`position`](Self::position)
/// This method panics if called at the end of input. /// by the character's UTF-8 byte length.
///
/// # Panics
/// Panics if called at the end of input. Always guard with
/// [`peek`](Self::peek) first.
fn advance(&mut self) -> char { fn advance(&mut self) -> char {
let ch = self.chars.next().expect("failed to advance the lexer"); let ch = self.chars.next().expect("failed to advance the lexer");
self.position += ch.len_utf8(); self.position += ch.len_utf8();
ch ch
} }
/// Advance while `condition` holds. /// Advance while `condition` holds, stopping at the first character for
/// which it returns `false` (or at end of input).
fn advance_while(&mut self, condition: impl FnMut(char) -> bool + Copy) { fn advance_while(&mut self, condition: impl FnMut(char) -> bool + Copy) {
while self.peek().is_some_and(condition) { while self.peek().is_some_and(condition) {
self.advance(); self.advance();
} }
} }
/// Build a token from `[start, self.pos)`. /// Construct a [`Token`] spanning the byte range `[start, self.position)`.
///
/// The token's `text` is a zero-copy slice of the source string.
fn make(&self, kind: TokenKind, start: usize) -> Token<'src> { fn make(&self, kind: TokenKind, start: usize) -> Token<'src> {
Token { Token {
kind, kind,
@@ -48,7 +83,11 @@ impl<'src> Lexer<'src> {
} }
} }
/// Skip all whitespace and comments. /// Skip any run of whitespace followed by a `#` line comment, repeating
/// until neither is present.
///
/// Comments begin with `#` and extend to (but do not include) the
/// following `\n`.
fn skip_whitespace_and_comments(&mut self) { fn skip_whitespace_and_comments(&mut self) {
loop { loop {
self.advance_while(char::is_whitespace); self.advance_while(char::is_whitespace);
@@ -61,7 +100,12 @@ impl<'src> Lexer<'src> {
} }
} }
/// Lexes the next identifier token. /// Lex the next identifier or keyword token.
///
/// Assumes the current peek character is `_` or satisfies `is_xid_start()`.
/// Consumes one XID_Start (or `_`) character followed by any number of
/// XID_Continue characters, then matches the resulting slice against the
/// keyword / type-keyword table.
fn next_identifier(&mut self) -> TokenKind { fn next_identifier(&mut self) -> TokenKind {
let start = self.position; let start = self.position;
@@ -90,7 +134,12 @@ impl<'src> Lexer<'src> {
} }
} }
/// Lexes the next number token. /// Lex the next integer literal token.
///
/// Assumes the current peek character is an ASCII digit. Detects an
/// optional radix prefix (`0x` → 16, `0o` → 8, `0b` → 2) then consumes
/// all subsequent digits valid for that radix. Always returns
/// [`TokenKind::LitInt`].
fn next_number(&mut self) -> TokenKind { fn next_number(&mut self) -> TokenKind {
let radix = match self.advance() { let radix = match self.advance() {
'0' => match self.peek() { '0' => match self.peek() {
@@ -116,7 +165,15 @@ impl<'src> Lexer<'src> {
TokenKind::LitInt TokenKind::LitInt
} }
/// Lexes the next string token. /// Lex the next string literal token.
///
/// Assumes the current peek character is `"`. Consumes characters until
/// a closing (unescaped) `"` is found or input is exhausted. A `\`
/// escapes the immediately following character, preventing it from being
/// treated as a closing delimiter. Always returns [`TokenKind::LitString`].
///
/// Note: escape sequences are not validated here; that is left to a later
/// compiler stage.
fn next_string(&mut self) -> TokenKind { fn next_string(&mut self) -> TokenKind {
let mut escaped = false; let mut escaped = false;
@@ -144,11 +201,25 @@ impl<'src> Lexer<'src> {
impl<'src> Iterator for Lexer<'src> { impl<'src> Iterator for Lexer<'src> {
type Item = Token<'src>; type Item = Token<'src>;
/// Returns the next [`Token`], or `None` when the source is exhausted.
///
/// Leading whitespace and `#`-comments are skipped before each token.
/// Multi-character operator tokens (`->`, `<<`, `<=`, …) are resolved with
/// a single character of lookahead via the `token!` macro. Unrecognised
/// characters are returned as [`TokenKind::Unknown`].
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
self.skip_whitespace_and_comments(); self.skip_whitespace_and_comments();
let start = self.position; let start = self.position;
/// Builds and evaluates a [`TokenKind`] from the current position.
///
/// Invocation forms:
/// - `token!($kind)` — single-character token: advance once, yield `$kind`.
/// - `token!($c => $kind, … ; $default)` — multi-character token with
/// lookahead: advance once (consuming the lead character), then
/// check the next character against each `$c => $kind` arm in order,
/// falling back to `$default` if none match.
macro_rules! token { macro_rules! token {
// Case 1: Simple token (no lookahead) // Case 1: Simple token (no lookahead)
($default:expr) => {{ ($default:expr) => {{

View File

@@ -1,3 +1,8 @@
//! Recursive-descent / Pratt parser that converts a token stream into an AST.
//!
//! The entry points are [`Parser::parse_type`] and [`Parser::parse_expression`].
//! Errors are represented as [`Diagnostic`] values; the caller is responsible
//! for reporting them.
use std::iter::Peekable; use std::iter::Peekable;
use crate::ast; use crate::ast;
@@ -5,15 +10,24 @@ use crate::diagnostic::{Diagnostic, Severity};
use crate::lexer::Lexer; use crate::lexer::Lexer;
use crate::token::{Token, TokenKind}; use crate::token::{Token, TokenKind};
/// The [Parser] consumes the [Token]s produced by the [Lexer] and constructs /// Consumes the [`Token`] stream produced by the [`Lexer`] and constructs an
/// an [ast] in the [ast::Parsed] phase. /// AST in the [`ast::Parsed`] phase.
///
/// The parser uses a single token of look-ahead (peek) for all decisions.
/// Expression parsing is implemented with the
/// [Pratt / top-down operator-precedence](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html)
/// algorithm; binding-power tables are defined by [`infix_binding_power`],
/// [`prefix_binding_power`], and [`postfix_binding_power`].
pub struct Parser<'src> { pub struct Parser<'src> {
tokens: Peekable<Lexer<'src>>, tokens: Peekable<Lexer<'src>>,
/// Diagnostics accumulated during parsing. Non-fatal errors are pushed here
/// so that the parser can attempt to continue and surface multiple issues
/// in a single pass.
errors: Vec<Diagnostic>, errors: Vec<Diagnostic>,
} }
impl<'src> Parser<'src> { impl<'src> Parser<'src> {
/// Constructs a new [Parser] with the given source text. /// Constructs a new [`Parser`] with the given source text.
pub fn new(source: &'src str) -> Self { pub fn new(source: &'src str) -> Self {
Self { Self {
tokens: Lexer::new(source).peekable(), tokens: Lexer::new(source).peekable(),
@@ -21,35 +35,40 @@ impl<'src> Parser<'src> {
} }
} }
/// Peek at the next [Token] without consuming it. /// Peek at the next [`Token`] without consuming it.
fn peek(&mut self) -> Option<Token<'src>> { fn peek(&mut self) -> Option<Token<'src>> {
self.tokens.peek().copied() self.tokens.peek().copied()
} }
/// Peek at the next [Token] and return a [Diagnostic] if we reached the end of input. /// Peek at the next [`Token`], returning an [`Err`] diagnostic if the
/// token stream is exhausted.
fn peek_no_eof(&mut self) -> Result<Token<'src>, Diagnostic> { fn peek_no_eof(&mut self) -> Result<Token<'src>, Diagnostic> {
self.peek() self.peek()
.ok_or_else(|| Diagnostic::new(Severity::Error, "unexpected end of input")) .ok_or_else(|| Diagnostic::new(Severity::Error, "unexpected end of input"))
} }
/// Check if the peek [Token] is of a given [TokenKind]. /// Returns `true` if the next token has the given [`TokenKind`].
fn is_peek(&mut self, kind: TokenKind) -> bool { fn is_peek(&mut self, kind: TokenKind) -> bool {
self.peek().map_or(false, |tok| tok.is(kind)) self.peek().map_or(false, |tok| tok.is(kind))
} }
/// Check if we have reached the end of input. /// Returns `true` if the token stream is exhausted.
fn is_at_eof(&mut self) -> bool { fn is_at_eof(&mut self) -> bool {
self.peek().is_none() self.peek().is_none()
} }
/// Consumes and returns the next [Token]. /// Consumes and returns the next [`Token`].
/// This method panics if called at the end of input. ///
/// # Panics
/// Panics if called at the end of input. Always check [`is_at_eof`](Self::is_at_eof)
/// or use [`peek_no_eof`](Self::peek_no_eof) / [`expect`](Self::expect) in
/// production code paths.
fn advance(&mut self) -> Token<'src> { fn advance(&mut self) -> Token<'src> {
self.tokens.next().expect("failed to advance the parser") self.tokens.next().expect("failed to advance the parser")
} }
/// Consumes and returns the next [Token], if it is of a given [TokenKind], /// Consumes and returns the next [`Token`] if it matches `kind`; otherwise
/// otherwise returns an [Err]. /// returns an [`Err`] diagnostic that points at the offending token.
fn expect(&mut self, kind: TokenKind) -> Result<Token<'src>, Diagnostic> { fn expect(&mut self, kind: TokenKind) -> Result<Token<'src>, Diagnostic> {
match self.peek() { match self.peek() {
Some(tok) if tok.is(kind) => Ok(self.advance()), Some(tok) if tok.is(kind) => Ok(self.advance()),
@@ -63,8 +82,11 @@ impl<'src> Parser<'src> {
} }
} }
/// Skips [Token]s until we reach a neutral statement boundary, so that /// Error-recovery helper: skips tokens until a statement boundary is
/// subsequent statements can still be parsed cleanly. /// reached so that subsequent statements can still be parsed cleanly.
///
/// Stops *after* consuming a `;`, or *before* consuming a `}`. This keeps
/// nested blocks intact when recovering inside function bodies.
fn synchronize(&mut self) { fn synchronize(&mut self) {
while let Some(peek) = self.peek() { while let Some(peek) = self.peek() {
match peek.kind { match peek.kind {
@@ -82,6 +104,10 @@ impl<'src> Parser<'src> {
} }
} }
/// Parses a type annotation, e.g. `u8`, `i64`, `bool`, or a user-defined
/// named type.
///
/// Returns an [`Err`] diagnostic if the next token is not a valid type.
pub fn parse_type(&mut self) -> Result<ast::ParsedType, Diagnostic> { pub fn parse_type(&mut self) -> Result<ast::ParsedType, Diagnostic> {
let peek = self.peek_no_eof()?; let peek = self.peek_no_eof()?;
@@ -112,7 +138,24 @@ impl<'src> Parser<'src> {
}) })
} }
/// Parses an [ast::Expression] using the pratt parsing algorithm. /// Parses an expression using the Pratt (top-down operator-precedence)
/// algorithm.
///
/// `min_bp` is the minimum *left* binding power the next infix/postfix
/// operator must have to be incorporated into the current expression. Pass
/// `0` to parse a full expression with no restrictions.
///
/// The precedence hierarchy (low → high) is:
/// - assignment (`=`)
/// - logical `or` / `and`
/// - bitwise `|` / `^` / `&`
/// - equality (`==`, `!=`) and comparison (`<`, `<=`, `>`, `>=`)
/// - addition / subtraction
/// - shifts (`<<`, `>>`)
/// - multiplication / division / remainder
/// - member access (`.`)
/// - prefix: `-`, `&`, `~`, `*`, `!`
/// - postfix: call `()`, index `[]`, cast `as`
pub fn parse_expression(&mut self, min_bp: u8) -> Result<ast::ParsedExpression, Diagnostic> { pub fn parse_expression(&mut self, min_bp: u8) -> Result<ast::ParsedExpression, Diagnostic> {
let peek_token = self.peek_no_eof()?; let peek_token = self.peek_no_eof()?;
@@ -182,7 +225,11 @@ impl<'src> Parser<'src> {
Ok(left) Ok(left)
} }
/// Parses a primary expression, e.g. literals, unary or grouped expression. /// Parses a primary (non-operator) expression: an identifier, integer
/// literal, boolean literal, or a parenthesised expression.
///
/// Integer literals support `0x` (hex), `0o` (octal), and `0b` (binary)
/// prefixes in addition to plain decimal.
fn parse_primary_expression(&mut self) -> Result<ast::ParsedExpression, Diagnostic> { fn parse_primary_expression(&mut self) -> Result<ast::ParsedExpression, Diagnostic> {
let peek_token = self.peek_no_eof()?; let peek_token = self.peek_no_eof()?;
@@ -252,7 +299,10 @@ impl<'src> Parser<'src> {
} }
} }
/// Parses a [ast::ExpressionKind::Call] expression. /// Parses a function-call expression `func(arg, …)`.
///
/// The opening `(` is consumed here; `func` is the already-parsed callee
/// expression passed in from the Pratt loop.
fn parse_call_expr( fn parse_call_expr(
&mut self, &mut self,
func: ast::ParsedExpression, func: ast::ParsedExpression,
@@ -281,7 +331,10 @@ impl<'src> Parser<'src> {
}) })
} }
/// Parses an [ast::ExpressionKind::Index] expression. /// Parses an index expression `expr[index]`.
///
/// The opening `[` is consumed here; `expr` is the already-parsed
/// collection expression passed in from the Pratt loop.
fn parse_index_expr( fn parse_index_expr(
&mut self, &mut self,
expr: ast::ParsedExpression, expr: ast::ParsedExpression,
@@ -303,7 +356,10 @@ impl<'src> Parser<'src> {
}) })
} }
/// Parses an [ast::ExpressionKind::Cast] expression. /// Parses a cast expression `expr as Type`.
///
/// The `as` keyword is consumed here; `expr` is the already-parsed value
/// expression passed in from the Pratt loop.
fn parse_cast_expr( fn parse_cast_expr(
&mut self, &mut self,
expr: ast::ParsedExpression, expr: ast::ParsedExpression,
@@ -324,6 +380,12 @@ impl<'src> Parser<'src> {
} }
} }
/// Returns `(left_bp, right_bp, op)` for infix operators, or `None` if `kind`
/// is not an infix operator.
///
/// The two binding-power values implement associativity: `right_bp > left_bp`
/// gives left-associativity, and `right_bp = left_bp` gives right-associativity
/// (currently used for `=`).
fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> { fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> {
Some(match kind { Some(match kind {
TokenKind::Assign => (2, 2, ast::BinaryOp::Assign), TokenKind::Assign => (2, 2, ast::BinaryOp::Assign),
@@ -359,6 +421,11 @@ fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> {
}) })
} }
/// Returns `(right_bp, op)` for prefix operators, or `None` if `kind` is not
/// a prefix operator.
///
/// All prefix operators currently share the same binding power (`80`), giving
/// them higher precedence than any binary operator.
fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> { fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> {
Some(match kind { Some(match kind {
TokenKind::Minus => (80, ast::UnaryOp::Neg), TokenKind::Minus => (80, ast::UnaryOp::Neg),
@@ -371,6 +438,12 @@ fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> {
}) })
} }
/// Returns the *left* binding power for postfix operators, or `None` if `kind`
/// is not a postfix operator.
///
/// Postfix operators (`()`, `[]`, `as`) bind tighter than all binary operators
/// but are checked before prefix operators in the Pratt loop so they always
/// apply to the nearest sub-expression.
fn postfix_binding_power(kind: TokenKind) -> Option<u8> { fn postfix_binding_power(kind: TokenKind) -> Option<u8> {
Some(match kind { Some(match kind {
TokenKind::LParen => 100, TokenKind::LParen => 100,

View File

@@ -1,28 +1,50 @@
//! Token definitions used by the [`Lexer`](crate::lexer::Lexer) and
//! [`Parser`](crate::parser::Parser).
//!
//! The two core types are:
//! - [`Span`] — a half-open byte range that marks a location in source text.
//! - [`Token`] — a classified slice of source text together with its span.
//!
//! [`TokenKind`] enumerates every token variant; its [`Display`](std::fmt::Display)
//! impl produces the human-readable representation used in diagnostics.
use std::fmt; use std::fmt;
/// A Span is a half-open byte range `[start, end)` which marks a location in /// A half-open byte range `[start, end)` that marks a location in the source
/// the source string. The start and end positions are stored as a [u32] which /// string.
/// limits us to a maximum source file size of 4 gigabytes. ///
/// Positions are stored as [`u32`], which limits supported source files to
/// 4 GiB — more than sufficient for any practical source file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Span { pub struct Span {
/// Inclusive start byte offset.
pub start: u32, pub start: u32,
/// Exclusive end byte offset.
pub end: u32, pub end: u32,
} }
impl Span { impl Span {
/// Creates a new span covering `[start, end)`.
pub const fn new(start: u32, end: u32) -> Self { pub const fn new(start: u32, end: u32) -> Self {
Self { start, end } Self { start, end }
} }
/// Returns the length of the span in bytes.
///
/// Uses saturating subtraction so an inverted span returns `0` rather than
/// wrapping.
pub fn len(&self) -> u32 { pub fn len(&self) -> u32 {
self.end.saturating_sub(self.start) self.end.saturating_sub(self.start)
} }
/// Returns `true` if the span covers zero bytes (`start == end`).
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.start == self.end self.start == self.end
} }
/// Extend this [Span] to cover `other` as well. /// Returns the smallest span that covers both `self` and `other`.
///
    /// For overlapping or adjacent spans this is their union; for disjoint
    /// spans the result also includes the gap between them. Useful for
    /// computing the span of a parent node from its children.
pub fn extend(self, other: Self) -> Self { pub fn extend(self, other: Self) -> Self {
Self { Self {
start: self.start.min(other.start), start: self.start.min(other.start),
@@ -37,8 +59,10 @@ impl fmt::Display for Span {
} }
} }
/// This macro helps with defining the different kinds of [Token]s. It /// Simultaneously defines the [`TokenKind`] enum and its [`fmt::Display`] impl.
/// simultaneously defines a variant and its [fmt::Display] implementation. ///
/// Each arm maps a variant name to the human-readable string used in
/// diagnostics (e.g. `` `+` ``, `identifier`).
macro_rules! define_tokens { macro_rules! define_tokens {
($($name:ident => $repr:literal),* $(,)?) => { ($($name:ident => $repr:literal),* $(,)?) => {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -97,7 +121,7 @@ define_tokens! {
Shr => "`>>`", Shr => "`>>`",
Bang => "`!`", Bang => "`!`",
// -- Comparision Operators -- // -- Comparison Operators --
Eq => "`==`", Eq => "`==`",
Ne => "`!=`", Ne => "`!=`",
Lt => "`<`", Lt => "`<`",
@@ -125,17 +149,23 @@ define_tokens! {
Unknown => "unknown character" Unknown => "unknown character"
} }
/// A Token represents the smallest continous unit of the source code. It holds /// The smallest contiguous unit of source text, as produced by the
/// its [TokenKind], [Span] and source text. /// [`Lexer`](crate::lexer::Lexer).
///
/// A token borrows its [`text`](Token::text) slice directly from the original
/// source string, so the lifetime `'src` ties every token to that source.
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct Token<'src> { pub struct Token<'src> {
/// The syntactic category of this token.
pub kind: TokenKind, pub kind: TokenKind,
/// The byte range in the source string where this token appears.
pub span: Span, pub span: Span,
/// The raw source text of this token (a zero-copy slice).
pub text: &'src str, pub text: &'src str,
} }
impl<'src> Token<'src> { impl<'src> Token<'src> {
/// Checks if the current [Token] is of given [TokenKind]. /// Returns `true` if this token has the given [`TokenKind`].
pub fn is(&self, kind: TokenKind) -> bool { pub fn is(&self, kind: TokenKind) -> bool {
self.kind == kind self.kind == kind
} }