diff --git a/src/ast.rs b/src/ast.rs index 7449fe5..80aa883 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -1,15 +1,33 @@ +//! Abstract Syntax Tree (AST) definitions. +//! +//! The AST is parameterised over a [`Phase`] type-state so that the same node +//! types can carry different amounts of information at different compiler +//! stages. Currently only the [`Parsed`] phase exists, which attaches no +//! extra data (`()`) to each node. +//! +//! The two primary node families are: +//! - [`Expression`] / [`ExpressionKind`] — value-producing constructs. +//! - [`Type`] / [`TypeKind`] — type annotations. use std::fmt::Debug; use crate::token::Span; -/// The [Phase] trait is used for type state. The AST can be in one of multiple -/// type states: -/// 1. [Parsed] - AST that was produced through parsing. +/// Marker trait that carries phase-specific associated data for AST nodes. +/// +/// Each phase defines an [`ExtraData`](Phase::ExtraData) type that is embedded +/// in every node. This allows later compiler passes (e.g. type-checking) to +/// augment the tree without duplicating the node hierarchy. +/// +/// Current phases: +/// - [`Parsed`] — produced directly by the parser; no extra data. pub trait Phase { type ExtraData: PartialEq + Debug; } -/// See [Phase] for more information. +/// The initial AST phase produced by the parser. +/// +/// In this phase [`Phase::ExtraData`] is `()`, meaning nodes carry only +/// syntactic information (kind + source span). #[derive(Debug)] pub struct Parsed; @@ -17,10 +35,15 @@ impl Phase for Parsed { type ExtraData = (); } +/// Convenience alias for an [`Expression`] in the [`Parsed`] phase. pub type ParsedExpression = Expression; -/// This represents an expression in the source code. It holds the -/// [ExpressionKind], the [Span] and extra information according to the [Phase]. +/// A value-producing node in the AST. +/// +/// Every expression carries: +/// - [`kind`](Expression::kind) — what *kind* of expression it is. 
+/// - [`span`](Expression::span) — the source location it was parsed from. +/// - [`extra`](Expression::extra) — phase-specific data (see [`Phase`]). #[derive(Debug, PartialEq)] pub struct Expression { pub kind: ExpressionKind

, @@ -28,109 +51,125 @@ pub struct Expression { pub extra: P::ExtraData, } -/// Represents the different kinds of [Expression]s, e.g. literals, unary or -/// binary expressions. +/// The concrete variant of an [`Expression`]. #[derive(Debug, PartialEq)] pub enum ExpressionKind { + /// A bare name, e.g. `foo`. Identifier(String), + /// A string literal, e.g. `"hello"`. LitString(String), + /// An integer literal, e.g. `42`, `0xFF`, `0b1010`. The value is stored + /// as a `u64` regardless of the source radix. LitInteger(u64), + /// A boolean literal: `true` or `false`. LitBool(bool), + /// A prefix unary expression, e.g. `-x`, `!cond`, `*ptr`. Unary { op: UnaryOp, + /// Source span of the operator token itself. op_span: Span, operand: Box>, }, + /// An infix binary expression, e.g. `a + b`, `x == y`. Binary { op: BinaryOp, + /// Source span of the operator token itself. op_span: Span, left: Box>, right: Box>, }, + /// A function call, e.g. `f(a, b)`. Call { + /// The callee expression (often an [`Identifier`](ExpressionKind::Identifier)). func: Box>, args: Vec>, }, + /// An index expression, e.g. `arr[i]`. Index { expr: Box>, index: Box>, }, + /// A type-cast expression, e.g. `x as u32`. Cast { expr: Box>, ty: Box>, }, } +/// A prefix unary operator. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum UnaryOp { - /// Bitwise Not + /// Bitwise complement (`~`) BitNot, - /// Logical Not + /// Logical negation (`!`) Not, - /// Negate + /// Arithmetic negation (`-`) Neg, - /// Address Of + /// Address-of (`&`) AddrOf, - /// Deref + /// Pointer dereference (`*`) Deref, } +/// An infix binary operator. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BinaryOp { - /// Addition + /// Addition (`+`) Add, - /// Subtraction + /// Subtraction (`-`) Sub, - /// Multiplication + /// Multiplication (`*`) Mul, - /// Division + /// Division (`/`) Div, - /// Remainder + /// Remainder (`%`) Rem, - /// Bitwise And + /// Bitwise AND (`&`) BitAnd, - /// Bitwise Or + /// Bitwise OR (`|`) BitOr, - /// Bitwise Xor + /// Bitwise XOR (`^`) BitXor, - /// Bitwise Shift Left + /// Left shift (`<<`) BitShl, - /// Bitwise Shift Right + /// Right shift (`>>`) BitShr, - /// Logical And + /// Logical AND (`and`) And, - /// Logical Or + /// Logical OR (`or`) Or, - /// Equal + /// Equality (`==`) Eq, - /// Not Equal + /// Inequality (`!=`) Ne, - /// Less than + /// Less-than (`<`) Lt, - /// Less than or Equal + /// Less-than-or-equal (`<=`) Le, - /// Greater than + /// Greater-than (`>`) Gt, - /// Greater than or Equal + /// Greater-than-or-equal (`>=`) Ge, - /// Assign + /// Assignment (`=`) Assign, - /// Member Access + /// Member access (`.`) Dot, } +/// Convenience alias for a [`Type`] in the [`Parsed`] phase. pub type ParsedType = Type; +/// A type annotation node in the AST. #[derive(Debug, PartialEq)] pub struct Type { pub kind: TypeKind, @@ -138,19 +177,24 @@ pub struct Type { pub extra: P::ExtraData, } +/// The concrete variant of a [`Type`] annotation. #[derive(Debug, PartialEq)] pub enum TypeKind { + /// Signed integers I8, I16, I32, I64, + /// Unsigned integers U8, U16, U32, U64, + /// Boolean type (`bool`) Bool, + /// A user-defined named type, e.g. `MyStruct`. Named(String), } diff --git a/src/cli.rs b/src/cli.rs index d3b25bb..94bf54d 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,7 +1,18 @@ +//! Command-line interface: argument parsing, help/version output, and fatal +//! error reporting. +//! +//! The primary entry point is [`parse_args`], which parses [`std::env::args`] +//! and returns an [`Opts`] struct. If any argument is invalid or required +//! 
arguments are missing, it calls [`fatal`] which prints an error to `stderr` +//! and exits with code `1`. use std::path::PathBuf; use yansi::Paint; +/// Print the help message to `stdout`. +/// +/// Describes the compiler's usage, all supported options, and the `<file>` +/// positional argument. pub fn print_help() { println!( "{} {} - the bucky language compiler", @@ -47,27 +58,52 @@ ); } +/// Print the compiler version string (`buckyc <version>`) to `stdout`. pub fn print_version() { println!("buckyc {}", env!("CARGO_PKG_VERSION")); } +/// Print a formatted error message to `stderr` and exit with code `1`. +/// +/// This function never returns (`-> !`). Use it for unrecoverable CLI errors +/// such as missing arguments or unknown flags, discovered before compilation +/// begins. pub fn fatal(message: impl ToString) -> ! { eprintln!("{}: {}", "error".bold().red(), message.to_string().bold()); std::process::exit(1); } +/// Parsed command-line options returned by [`parse_args`]. #[derive(Debug)] pub struct Opts { - /// The list of files passed to the compiler. + /// One or more source files to compile, in the order they were supplied. pub files: Vec<PathBuf>, - /// `-S`: emit IR and stop (implies `-c`). + /// `-S`: emit IR and stop (implies [`no_link`](Opts::no_link)). pub emit_ir: bool, - /// `-c`: compile source to object file without linking. + /// `-c`: compile to an object file without invoking the linker. pub no_link: bool, - /// `-o <path>`: write final output to this path. + /// `-o <path>`: destination path for the final output. When `None` the + /// compiler chooses a default output name. pub output: Option<PathBuf>, } +/// Parse [`std::env::args`] and return the resulting [`Opts`]. 
+/// +/// Recognised flags: +/// +/// | Flag | Effect | +/// |------|--------| +/// | `-h`, `--help` | Print help and exit `0` | +/// | `-V`, `--version` | Print version and exit `0` | +/// | `-S` | Set [`emit_ir`](Opts::emit_ir) and [`no_link`](Opts::no_link) | +/// | `-c` | Set [`no_link`](Opts::no_link) | +/// | `-o <path>` | Set [`output`](Opts::output) | +/// | `<file>` | Append to [`files`](Opts::files) | +/// +/// Calls [`fatal`] (and exits) if: +/// - an unknown `-`-prefixed flag is encountered, or +/// - `-o` is supplied without a following argument, or +/// - no source files are provided. pub fn parse_args() -> Opts { let mut files = Vec::new(); let mut no_link = false; diff --git a/src/diagnostic.rs b/src/diagnostic.rs index efd7ca8..9e47b95 100644 --- a/src/diagnostic.rs +++ b/src/diagnostic.rs @@ -1,14 +1,37 @@ +//! Compiler diagnostic reporting with source-location context. +//! +//! This module provides [`Diagnostic`], a structured error/warning message that +//! can optionally include a source span and one or more labelled secondary +//! spans. Diagnostics are rendered to `stderr` in a rustc-inspired format: +//! +//! ```text +//! Error: undeclared variable `x` +//! --> src/main.bky:3:5 +//! | +//! 3 | let y = x + 1; +//! | ^ undeclared variable +//! | +//! ``` use std::{fmt::Display, path::Path, process::exit}; use yansi::Paint; use crate::token::Span; +/// The importance level of a [`Diagnostic`]. +/// +/// Variants are ordered from least to most severe so that `<` / `>` comparisons +/// work intuitively (e.g. `Severity::Warning < Severity::Error`). #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum Severity { + /// Purely informational; never causes the compiler to stop. Note, + /// Something suspicious that may or may not be a problem. Warning, + /// A recoverable problem that prevents successful compilation. Error, + /// An unrecoverable problem; the process will exit immediately after + /// reporting this diagnostic. 
Critical, } @@ -23,14 +46,29 @@ impl Display for Severity { } } +/// A single compiler message with optional source-location information. +/// +/// Build a diagnostic with [`Diagnostic::new`], optionally attach a primary +/// source location via [`with_span`](Diagnostic::with_span), attach labelled +/// secondary locations via [`add_label`](Diagnostic::add_label), then call +/// [`report`](Diagnostic::report) to print it. +/// +/// If the severity is [`Severity::Critical`], `report` will call +/// [`process::exit`](std::process::exit) after printing. pub struct Diagnostic { pub severity: Severity, + /// Primary source location, if any. pub span: Option, pub message: String, + /// Secondary labelled spans rendered below the primary snippet. pub labels: Vec<(Span, String)>, } impl Diagnostic { + /// Create a new diagnostic with the given severity and message. + /// + /// No source location is attached; use [`with_span`](Self::with_span) to + /// add one. pub fn new(severity: Severity, message: impl ToString) -> Self { Self { severity, @@ -40,16 +78,29 @@ impl Diagnostic { } } + /// Attach a primary source span to this diagnostic. pub fn with_span(mut self, span: Span) -> Self { self.span = Some(span); self } + /// Attach a labelled secondary span. + /// + /// Labels whose span matches the primary span exactly are merged into the + /// primary underline as inline text. All other labels are rendered as + /// separate snippets below the primary one. pub fn add_label(mut self, span: Span, message: impl ToString) -> Self { self.labels.push((span, message.to_string())); self } + /// Print this diagnostic to `stderr` and, if the severity is + /// [`Severity::Critical`], terminate the process. + /// + /// # Arguments + /// * `file_name` – path shown in the `-->` location line. + /// * `source` – full source text of the file, used to extract line/col + /// information and to display the relevant source snippet. 
pub fn report(self, file_name: &Path, source: &str) { eprintln!("{}: {}", self.severity, self.message.bold()); @@ -165,6 +216,7 @@ fn render_snippet( eprintln!("{pad} {bar} {spaces}{colored_carets}{label_text}"); } +/// Apply severity-appropriate ANSI colour to a string. fn paint_severity(s: &str, severity: Severity) -> String { match severity { Severity::Note => format!("{}", s.bold().bright_cyan()), @@ -173,6 +225,7 @@ fn paint_severity(s: &str, severity: Severity) -> String { } } +/// Returns the number of decimal digits in `n` (minimum 1). fn count_digits(n: usize) -> usize { format!("{n}").len() } @@ -187,6 +240,10 @@ fn get_line_content(source: &str, position: u32) -> (usize, &str) { (line_start, &rest[..line_len]) } +/// Returns the 1-based `(line, column)` for a byte `position` within `source`. +/// +/// Both line and column are counted from 1. The column is measured in Unicode +/// scalar values (characters), not bytes. fn get_line_col(source: &str, position: u32) -> (usize, usize) { let prefix = &source[..position as usize]; let line = prefix.bytes().filter(|&b| b == b'\n').count() + 1; diff --git a/src/lexer.rs b/src/lexer.rs index c5a448e..3bbed7b 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -1,16 +1,44 @@ +//! Lexer (tokeniser) that converts raw source text into a [`Token`] stream. +//! +//! [`Lexer`] implements [`Iterator`] so it can be used directly +//! in a `for` loop or with iterator adaptors such as `.peekable()`. +//! Whitespace and `#`-line-comments are skipped automatically between tokens. +//! +//! # Character classes +//! - **Identifiers / keywords** — start with a +//! [XID_Start](https://unicode.org/reports/tr31/) character or `_`, continue +//! with XID_Continue characters. Reserved words are mapped to their +//! respective [`TokenKind`] variants; everything else becomes +//! [`TokenKind::Identifier`]. +//! - **Integer literals** — decimal by default; `0x` / `0o` / `0b` prefixes +//! select hexadecimal, octal, and binary respectively. 
+//! - **String literals** — delimited by `"…"`; `\` escapes the next character. +//! - **Operators and punctuation** — single- or double-character tokens +//! dispatched via the `token!` macro with one character of lookahead. use std::{iter::Peekable, str::Chars}; use unicode_xid::UnicodeXID; use crate::token::{Span, Token, TokenKind}; +/// A lazy iterator over the [`Token`]s of a source string. +/// +/// Tokens borrow their text slice directly from the original source, so the +/// lexer lifetime `'src` must outlive any use of the produced tokens. +/// +/// Construct with [`Lexer::new`] and consume via the [`Iterator`] impl or by +/// passing it to the parser. pub struct Lexer<'src> { + /// One-character look-ahead over the source characters. chars: Peekable>, + /// The full source text, kept for slice extraction in [`make`](Self::make). source: &'src str, + /// Current byte offset into `source`. Advanced by [`advance`](Self::advance). position: usize, } impl<'src> Lexer<'src> { + /// Creates a new [`Lexer`] positioned at the start of `source`. pub fn new(source: &'src str) -> Self { Self { chars: source.chars().peekable(), @@ -24,22 +52,29 @@ impl<'src> Lexer<'src> { self.chars.peek().copied() } - /// Consume and return the next character. - /// This method panics if called at the end of input. + /// Consume and return the next character, advancing [`position`](Self::position) + /// by the character's UTF-8 byte length. + /// + /// # Panics + /// Panics if called at the end of input. Always guard with + /// [`peek`](Self::peek) first. fn advance(&mut self) -> char { let ch = self.chars.next().expect("failed to advance the lexer"); self.position += ch.len_utf8(); ch } - /// Advance while `condition` holds. + /// Advance while `condition` holds, stopping at the first character for + /// which it returns `false` (or at end of input). 
fn advance_while(&mut self, condition: impl FnMut(char) -> bool + Copy) { while self.peek().is_some_and(condition) { self.advance(); } } - /// Build a token from `[start, self.pos)`. + /// Construct a [`Token`] spanning the byte range `[start, self.position)`. + /// + /// The token's `text` is a zero-copy slice of the source string. fn make(&self, kind: TokenKind, start: usize) -> Token<'src> { Token { kind, @@ -48,7 +83,11 @@ impl<'src> Lexer<'src> { } } - /// Skip all whitespace and comments. + /// Skip any run of whitespace followed by a `#` line comment, repeating + /// until neither is present. + /// + /// Comments begin with `#` and extend to (but do not include) the + /// following `\n`. fn skip_whitespace_and_comments(&mut self) { loop { self.advance_while(char::is_whitespace); @@ -61,7 +100,12 @@ impl<'src> Lexer<'src> { } } - /// Lexes the next identifier token. + /// Lex the next identifier or keyword token. + /// + /// Assumes the current peek character satisfies `is_xid_start() || == '_'`. + /// Consumes one XID_Start (or `_`) character followed by any number of + /// XID_Continue characters, then matches the resulting slice against the + /// keyword / type-keyword table. fn next_identifier(&mut self) -> TokenKind { let start = self.position; @@ -90,7 +134,12 @@ impl<'src> Lexer<'src> { } } - /// Lexes the next number token. + /// Lex the next integer literal token. + /// + /// Assumes the current peek character is an ASCII digit. Detects an + /// optional radix prefix (`0x` → 16, `0o` → 8, `0b` → 2) then consumes + /// all subsequent digits valid for that radix. Always returns + /// [`TokenKind::LitInt`]. fn next_number(&mut self) -> TokenKind { let radix = match self.advance() { '0' => match self.peek() { @@ -116,7 +165,15 @@ impl<'src> Lexer<'src> { TokenKind::LitInt } - /// Lexes the next string token. + /// Lex the next string literal token. + /// + /// Assumes the current peek character is `"`. 
Consumes characters until + /// a closing (unescaped) `"` is found or input is exhausted. A `\` + /// escapes the immediately following character, preventing it from being + /// treated as a closing delimiter. Always returns [`TokenKind::LitString`]. + /// + /// Note: escape sequences are not validated here; that is left to a later + /// compiler stage. fn next_string(&mut self) -> TokenKind { let mut escaped = false; @@ -144,11 +201,25 @@ impl<'src> Lexer<'src> { impl<'src> Iterator for Lexer<'src> { type Item = Token<'src>; + /// Returns the next [`Token`], or `None` when the source is exhausted. + /// + /// Leading whitespace and `#`-comments are skipped before each token. + /// Multi-character operator tokens (`->`, `<<`, `<=`, …) are resolved with + /// a single character of lookahead via the `token!` macro. Unrecognised + /// characters are returned as [`TokenKind::Unknown`]. fn next(&mut self) -> Option { self.skip_whitespace_and_comments(); let start = self.position; + /// Builds and evaluates a [`TokenKind`] from the current position. + /// + /// Three forms: + /// - `token!($kind)` — single-character token: advance once, yield `$kind`. + /// - `token!($c => $kind, … ; $default)` — multi-character token with + /// lookahead: advance once (consuming the lead character), then + /// check the next character against each `$c => $kind` arm in order, + /// falling back to `$default` if none match. macro_rules! token { // Case 1: Simple token (no lookahead) ($default:expr) => {{ diff --git a/src/parser.rs b/src/parser.rs index a789e59..92e14ee 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,3 +1,8 @@ +//! Recursive-descent / Pratt parser that converts a token stream into an AST. +//! +//! The entry points are [`Parser::parse_type`] and [`Parser::parse_expression`]. +//! Errors are represented as [`Diagnostic`] values; the caller is responsible +//! for reporting them. 
use std::iter::Peekable; use crate::ast; @@ -5,15 +10,24 @@ use crate::diagnostic::{Diagnostic, Severity}; use crate::lexer::Lexer; use crate::token::{Token, TokenKind}; -/// The [Parser] consumes the [Token]s produced by the [Lexer] and constructs -/// an [ast] in the [ast::Parsed] phase. +/// Consumes the [`Token`] stream produced by the [`Lexer`] and constructs an +/// AST in the [`ast::Parsed`] phase. +/// +/// The parser uses a single token of look-ahead (peek) for all decisions. +/// Expression parsing is implemented with the +/// [Pratt / top-down operator-precedence](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html) +/// algorithm; binding-power tables are defined by [`infix_binding_power`], +/// [`prefix_binding_power`], and [`postfix_binding_power`]. pub struct Parser<'src> { tokens: Peekable>, + /// Diagnostics accumulated during parsing. Non-fatal errors are pushed here + /// so that the parser can attempt to continue and surface multiple issues + /// in a single pass. errors: Vec, } impl<'src> Parser<'src> { - /// Constructs a new [Parser] with the given source text. + /// Constructs a new [`Parser`] with the given source text. pub fn new(source: &'src str) -> Self { Self { tokens: Lexer::new(source).peekable(), @@ -21,35 +35,40 @@ impl<'src> Parser<'src> { } } - /// Peek at the next [Token] without consuming it. + /// Peek at the next [`Token`] without consuming it. fn peek(&mut self) -> Option> { self.tokens.peek().copied() } - /// Peek at the next [Token] and return a [Diagnostic] if we reached the end of input. + /// Peek at the next [`Token`], returning an [`Err`] diagnostic if the + /// token stream is exhausted. fn peek_no_eof(&mut self) -> Result, Diagnostic> { self.peek() .ok_or_else(|| Diagnostic::new(Severity::Error, "unexpected end of input")) } - /// Check if the peek [Token] is of a given [TokenKind]. + /// Returns `true` if the next token has the given [`TokenKind`]. 
fn is_peek(&mut self, kind: TokenKind) -> bool { self.peek().map_or(false, |tok| tok.is(kind)) } - /// Check if we have reached the end of input. + /// Returns `true` if the token stream is exhausted. fn is_at_eof(&mut self) -> bool { self.peek().is_none() } - /// Consumes and returns the next [Token]. - /// This method panics if called at the end of input. + /// Consumes and returns the next [`Token`]. + /// + /// # Panics + /// Panics if called at the end of input. Always check [`is_at_eof`](Self::is_at_eof) + /// or use [`peek_no_eof`](Self::peek_no_eof) / [`expect`](Self::expect) in + /// production code paths. fn advance(&mut self) -> Token<'src> { self.tokens.next().expect("failed to advance the parser") } - /// Consumes and returns the next [Token], if it is of a given [TokenKind], - /// otherwise returns an [Err]. + /// Consumes and returns the next [`Token`] if it matches `kind`; otherwise + /// returns an [`Err`] diagnostic that points at the offending token. fn expect(&mut self, kind: TokenKind) -> Result, Diagnostic> { match self.peek() { Some(tok) if tok.is(kind) => Ok(self.advance()), @@ -63,8 +82,11 @@ impl<'src> Parser<'src> { } } - /// Skips [Token]s until we reach a neutral statement boundary, so that - /// subsequent statements can still be parsed cleanly. + /// Error-recovery helper: skips tokens until a statement boundary is + /// reached so that subsequent statements can still be parsed cleanly. + /// + /// Stops *after* consuming a `;`, or *before* consuming a `}`. This keeps + /// nested blocks intact when recovering inside function bodies. fn synchronize(&mut self) { while let Some(peek) = self.peek() { match peek.kind { @@ -82,6 +104,10 @@ impl<'src> Parser<'src> { } } + /// Parses a type annotation, e.g. `u8`, `i64`, `bool`, or a user-defined + /// named type. + /// + /// Returns an [`Err`] diagnostic if the next token is not a valid type. 
pub fn parse_type(&mut self) -> Result { let peek = self.peek_no_eof()?; @@ -112,7 +138,24 @@ impl<'src> Parser<'src> { }) } - /// Parses an [ast::Expression] using the pratt parsing algorithm. + /// Parses an expression using the Pratt (top-down operator-precedence) + /// algorithm. + /// + /// `min_bp` is the minimum *left* binding power the next infix/postfix + /// operator must have to be incorporated into the current expression. Pass + /// `0` to parse a full expression with no restrictions. + /// + /// The precedence hierarchy (low → high) is: + /// - assignment (`=`) + /// - logical `or` / `and` + /// - bitwise `|` / `^` / `&` + /// - equality (`==`, `!=`) and comparison (`<`, `<=`, `>`, `>=`) + /// - addition / subtraction + /// - shifts (`<<`, `>>`) + /// - multiplication / division / remainder + /// - member access (`.`) + /// - postfix: call `()`, index `[]`, cast `as` + /// - prefix: `-`, `&`, `~`, `*`, `!` pub fn parse_expression(&mut self, min_bp: u8) -> Result { let peek_token = self.peek_no_eof()?; @@ -182,7 +225,11 @@ impl<'src> Parser<'src> { Ok(left) } - /// Parses a primary expression, e.g. literals, unary or grouped expression. + /// Parses a primary (non-operator) expression: an identifier, integer + /// literal, boolean literal, or a parenthesised expression. + /// + /// Integer literals support `0x` (hex), `0o` (octal), and `0b` (binary) + /// prefixes in addition to plain decimal. fn parse_primary_expression(&mut self) -> Result { let peek_token = self.peek_no_eof()?; @@ -252,7 +299,10 @@ impl<'src> Parser<'src> { } } - /// Parses a [ast::ExpressionKind::Call] expression. + /// Parses a function-call expression `func(arg, …)`. + /// + /// The opening `(` is consumed here; `func` is the already-parsed callee + /// expression passed in from the Pratt loop. fn parse_call_expr( &mut self, func: ast::ParsedExpression, @@ -281,7 +331,10 @@ impl<'src> Parser<'src> { }) } - /// Parses an [ast::ExpressionKind::Index] expression. 
+ /// Parses an index expression `expr[index]`. + /// + /// The opening `[` is consumed here; `expr` is the already-parsed + /// collection expression passed in from the Pratt loop. fn parse_index_expr( &mut self, expr: ast::ParsedExpression, @@ -303,7 +356,10 @@ }) } - /// Parses an [ast::ExpressionKind::Cast] expression. + /// Parses a cast expression `expr as Type`. + /// + /// The `as` keyword is consumed here; `expr` is the already-parsed value + /// expression passed in from the Pratt loop. fn parse_cast_expr( &mut self, expr: ast::ParsedExpression, @@ -324,6 +380,12 @@ } } +/// Returns `(left_bp, right_bp, op)` for infix operators, or `None` if `kind` +/// is not an infix operator. +/// +/// The two binding-power values implement associativity: a `right_bp` greater +/// than `left_bp` gives left-associativity, while `right_bp = left_bp` gives +/// right-associativity (currently used for `=`). fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> { Some(match kind { TokenKind::Assign => (2, 2, ast::BinaryOp::Assign), @@ -359,6 +421,11 @@ }) } +/// Returns `(right_bp, op)` for prefix operators, or `None` if `kind` is not +/// a prefix operator. +/// +/// All prefix operators currently share the same binding power (`80`), giving +/// them higher precedence than any binary operator. fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> { Some(match kind { TokenKind::Minus => (80, ast::UnaryOp::Neg), @@ -371,6 +438,12 @@ }) } +/// Returns the *left* binding power for postfix operators, or `None` if `kind` +/// is not a postfix operator. +/// +/// Postfix operators (`()`, `[]`, `as`) bind tighter than all binary operators +/// but are checked before prefix operators in the Pratt loop so they always +/// apply to the nearest sub-expression. 
fn postfix_binding_power(kind: TokenKind) -> Option { Some(match kind { TokenKind::LParen => 100, diff --git a/src/token.rs b/src/token.rs index cffb4d7..5aa046e 100644 --- a/src/token.rs +++ b/src/token.rs @@ -1,28 +1,50 @@ +//! Token definitions used by the [`Lexer`](crate::lexer::Lexer) and +//! [`Parser`](crate::parser::Parser). +//! +//! The two core types are: +//! - [`Span`] — a half-open byte range that marks a location in source text. +//! - [`Token`] — a classified slice of source text together with its span. +//! +//! [`TokenKind`] enumerates every token variant; its [`Display`](std::fmt::Display) +//! impl produces the human-readable representation used in diagnostics. use std::fmt; -/// A Span is a half-open byte range `[start, end)` which marks a location in -/// the source string. The start and end positions are stored as a [u32] which -/// limits us to a maximum source file size of 4 gigabytes. +/// A half-open byte range `[start, end)` that marks a location in the source +/// string. +/// +/// Positions are stored as [`u32`], which limits supported source files to +/// 4 GiB — more than sufficient for any practical source file. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct Span { + /// Inclusive start byte offset. pub start: u32, + /// Exclusive end byte offset. pub end: u32, } impl Span { + /// Creates a new span covering `[start, end)`. pub const fn new(start: u32, end: u32) -> Self { Self { start, end } } + /// Returns the length of the span in bytes. + /// + /// Uses saturating subtraction so an inverted span returns `0` rather than + /// wrapping. pub fn len(&self) -> u32 { self.end.saturating_sub(self.start) } + /// Returns `true` if the span covers zero bytes (`start == end`). pub fn is_empty(&self) -> bool { self.start == self.end } - /// Extend this [Span] to cover `other` as well. + /// Returns the smallest span that covers both `self` and `other`. 
+ /// + /// This is the union of the two ranges, useful for computing the span of a + /// parent node from its children. pub fn extend(self, other: Self) -> Self { Self { start: self.start.min(other.start), @@ -37,8 +59,10 @@ impl fmt::Display for Span { } } -/// This macro helps with defining the different kinds of [Token]s. It -/// simultaneously defines a variant and its [fmt::Display] implementation. +/// Simultaneously defines the [`TokenKind`] enum and its [`fmt::Display`] impl. +/// +/// Each arm maps a variant name to the human-readable string used in +/// diagnostics (e.g. `` `+` ``, `identifier`). macro_rules! define_tokens { ($($name:ident => $repr:literal),* $(,)?) => { #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -97,7 +121,7 @@ define_tokens! { Shr => "`>>`", Bang => "`!`", - // -- Comparision Operators -- + // -- Comparison Operators -- Eq => "`==`", Ne => "`!=`", Lt => "`<`", @@ -125,17 +149,23 @@ define_tokens! { Unknown => "unknown character" } -/// A Token represents the smallest continous unit of the source code. It holds -/// its [TokenKind], [Span] and source text. +/// The smallest contiguous unit of source text, as produced by the +/// [`Lexer`](crate::lexer::Lexer). +/// +/// A token borrows its [`text`](Token::text) slice directly from the original +/// source string, so the lifetime `'src` ties every token to that source. #[derive(Debug, Clone, Copy)] pub struct Token<'src> { + /// The syntactic category of this token. pub kind: TokenKind, + /// The byte range in the source string where this token appears. pub span: Span, + /// The raw source text of this token (a zero-copy slice). pub text: &'src str, } impl<'src> Token<'src> { - /// Checks if the current [Token] is of given [TokenKind]. + /// Returns `true` if this token has the given [`TokenKind`]. pub fn is(&self, kind: TokenKind) -> bool { self.kind == kind }