Compare commits

..

7 Commits

Author SHA1 Message Date
cde0ff5582 feat: Add support for compound statements.
This commit adds parsing logic for compound statements.
2026-03-12 21:42:40 +01:00
1107c7d93d feat: Add support for let and expression statements.
This commit implements parsing for `let` statements, anything that
is not a let statement will be assumed to be an expression statement.
2026-03-12 21:23:09 +01:00
bb9cb8d2d1 docs: Add and improve documentation for every module. 2026-03-12 20:44:41 +01:00
4e2df32e36 feat: Add support for type parsing.
This commit adds simple primitive type parsing, it also adds
a new type of expression called the cast expression.
2026-03-12 12:50:17 +01:00
93f08d1944 feat: Add parsing for expressions.
This commit adds support for parsing expressions using the Pratt parsing
approach.
2026-03-12 12:14:00 +01:00
9ac8a79151 feat: Add a README.md and logo.svg. 2026-03-11 23:42:43 +01:00
51bd07d313 feat: Add token definitions and lexer logic.
This commit adds the `Token` and `TokenKind` definitions in `src/token.rs`,
in `src/lexer.rs` I've added the `Lexer` logic.
2026-03-11 23:42:39 +01:00
9 changed files with 1259 additions and 33 deletions

View File

@@ -1,3 +1,5 @@
extern puts(text: *char);
fn main() {
puts("Hello, World!");
}

251
src/ast.rs Normal file
View File

@@ -0,0 +1,251 @@
//! Abstract Syntax Tree (AST) definitions.
//!
//! The AST is parameterised over a [`Phase`] type-state so that the same node
//! types can carry different amounts of information at different compiler
//! stages. Currently only the [`Parsed`] phase exists, which attaches no
//! extra data (`()`) to each node.
//!
//! The primary node families are:
//! - [`Expression`] / [`ExpressionKind`] — value-producing constructs.
//! - [`Type`] / [`TypeKind`] — type annotations.
//! - [`Statement`] / [`StatementKind`] — top-level and block-level statements.
use std::fmt::Debug;
use crate::token::Span;
/// Marker trait that carries phase-specific associated data for AST nodes.
///
/// Each phase defines an [`ExtraData`](Phase::ExtraData) type that is embedded
/// in every node. This allows later compiler passes (e.g. type-checking) to
/// augment the tree without duplicating the node hierarchy.
///
/// Current phases:
/// - [`Parsed`] — produced directly by the parser; no extra data.
pub trait Phase {
/// Per-node payload for this phase.
///
/// The `PartialEq + Debug` bounds exist so that the node types
/// ([`Expression`], [`Type`], [`Statement`]) can derive `PartialEq` and
/// `Debug` for any phase.
type ExtraData: PartialEq + Debug;
}
/// The initial AST phase produced by the parser.
///
/// In this phase [`Phase::ExtraData`] is `()`, meaning nodes carry only
/// syntactic information (kind + source span).
#[derive(Debug)]
pub struct Parsed;
impl Phase for Parsed {
type ExtraData = ();
}
/// Convenience alias for an [`Expression`] in the [`Parsed`] phase.
pub type ParsedExpression = Expression<Parsed>;
/// A value-producing node in the AST.
///
/// Every expression carries:
/// - [`kind`](Expression::kind) — what *kind* of expression it is.
/// - [`span`](Expression::span) — the source location it was parsed from.
/// - [`extra`](Expression::extra) — phase-specific data (see [`Phase`]).
#[derive(Debug, PartialEq)]
pub struct Expression<P: Phase> {
/// The concrete expression variant.
pub kind: ExpressionKind<P>,
/// Source location this expression was parsed from.
pub span: Span,
/// Phase-specific payload; `()` in the [`Parsed`] phase.
pub extra: P::ExtraData,
}
/// The concrete variant of an [`Expression`].
#[derive(Debug, PartialEq)]
pub enum ExpressionKind<P: Phase> {
/// A bare name, e.g. `foo`.
Identifier(String),
/// A string literal, e.g. `"hello"`.
LitString(String),
/// An integer literal, e.g. `42`, `0xFF`, `0b1010`. The value is stored
/// as a `u64` regardless of the source radix.
LitInteger(u64),
/// A boolean literal: `true` or `false`.
LitBool(bool),
/// A prefix unary expression, e.g. `-x`, `!cond`, `*ptr`.
Unary {
/// The operator applied to `operand`.
op: UnaryOp,
/// Source span of the operator token itself.
op_span: Span,
/// The expression the operator applies to.
operand: Box<Expression<P>>,
},
/// An infix binary expression, e.g. `a + b`, `x == y`.
Binary {
/// The infix operator.
op: BinaryOp,
/// Source span of the operator token itself.
op_span: Span,
/// Left-hand operand.
left: Box<Expression<P>>,
/// Right-hand operand.
right: Box<Expression<P>>,
},
/// A function call, e.g. `f(a, b)`.
Call {
/// The callee expression (often an [`Identifier`](ExpressionKind::Identifier)).
func: Box<Expression<P>>,
/// The argument expressions.
args: Vec<Expression<P>>,
},
/// An index expression, e.g. `arr[i]`.
Index {
/// The expression being indexed (`arr` in `arr[i]`).
expr: Box<Expression<P>>,
/// The index expression inside the brackets (`i` in `arr[i]`).
index: Box<Expression<P>>,
},
/// A type-cast expression, e.g. `x as u32`.
Cast {
/// The value being cast.
expr: Box<Expression<P>>,
/// The target type of the cast.
ty: Box<Type<P>>,
},
}
/// A prefix unary operator.
///
/// Carried by [`ExpressionKind::Unary`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnaryOp {
/// Bitwise complement (`~`)
BitNot,
/// Logical negation (`!`)
Not,
/// Arithmetic negation (`-`)
Neg,
/// Address-of (`&`)
AddrOf,
/// Pointer dereference (`*`)
Deref,
}
/// An infix binary operator.
///
/// Carried by [`ExpressionKind::Binary`]. Note that assignment (`=`) and
/// member access (`.`) are also modelled as infix operators.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinaryOp {
/// Addition (`+`)
Add,
/// Subtraction (`-`)
Sub,
/// Multiplication (`*`)
Mul,
/// Division (`/`)
Div,
/// Remainder (`%`)
Rem,
/// Bitwise AND (`&`)
BitAnd,
/// Bitwise OR (`|`)
BitOr,
/// Bitwise XOR (`^`)
BitXor,
/// Left shift (`<<`)
BitShl,
/// Right shift (`>>`)
BitShr,
/// Logical AND (`and`)
And,
/// Logical OR (`or`)
Or,
/// Equality (`==`)
Eq,
/// Inequality (`!=`)
Ne,
/// Less-than (`<`)
Lt,
/// Less-than-or-equal (`<=`)
Le,
/// Greater-than (`>`)
Gt,
/// Greater-than-or-equal (`>=`)
Ge,
/// Assignment (`=`)
Assign,
/// Member access (`.`)
Dot,
}
/// Convenience alias for a [`Type`] in the [`Parsed`] phase.
pub type ParsedType = Type<Parsed>;
/// A type annotation node in the AST.
#[derive(Debug, PartialEq)]
pub struct Type<P: Phase> {
/// The concrete type variant. Unlike expressions and statements, the kind
/// itself is not parameterised over the phase.
pub kind: TypeKind,
/// Source location of the annotation.
pub span: Span,
/// Phase-specific payload; `()` in the [`Parsed`] phase.
pub extra: P::ExtraData,
}
/// The concrete variant of a [`Type`] annotation.
#[derive(Debug, PartialEq)]
pub enum TypeKind {
/// Signed 8-bit integer (`i8`).
I8,
/// Signed 16-bit integer (`i16`).
I16,
/// Signed 32-bit integer (`i32`).
I32,
/// Signed 64-bit integer (`i64`).
I64,
/// Unsigned 8-bit integer (`u8`).
U8,
/// Unsigned 16-bit integer (`u16`).
U16,
/// Unsigned 32-bit integer (`u32`).
U32,
/// Unsigned 64-bit integer (`u64`).
U64,
/// Boolean type (`bool`)
Bool,
/// A user-defined named type, e.g. `MyStruct`.
Named(String),
}
/// Convenience alias for a [`Statement`] in the [`Parsed`] phase.
pub type ParsedStatement = Statement<Parsed>;
/// A statement node in the AST.
///
/// Statements are the sequential building blocks of a block body. Like
/// [`Expression`] and [`Type`], a statement is parameterised over a [`Phase`]
/// so that later compiler passes can attach additional information without
/// changing the node layout.
#[derive(Debug, PartialEq)]
pub struct Statement<P: Phase> {
/// The concrete statement variant.
pub kind: StatementKind<P>,
/// Source location this statement was parsed from.
pub span: Span,
/// Phase-specific payload; `()` in the [`Parsed`] phase.
pub extra: P::ExtraData,
}
/// The concrete variant of a [`Statement`].
#[derive(Debug, PartialEq)]
pub enum StatementKind<P: Phase> {
/// A `let` binding, e.g. `let x: i32 = 0;`.
///
/// Both the type annotation and the initialiser are optional at the parse
/// stage and may be filled in or validated by later passes.
Let {
/// The name of the binding.
name: String,
/// Source span of the name token, used for diagnostics.
name_span: Span,
/// Optional explicit type annotation (`let x: T`).
ty: Option<Type<P>>,
/// Optional initialiser expression (`= <expr>`).
value: Option<Expression<P>>,
},
/// A braced block of statements, e.g. `{ let x = 1; f(x); }`.
///
/// Compound statements introduce a new scope and can appear anywhere a
/// statement is expected.
Compound {
/// The statements contained within the block, in source order.
inner: Vec<Statement<P>>,
},
/// A bare expression statement, e.g. `f(x);`.
///
/// The trailing `;` is not stored in the node but is included in
/// [`Statement::span`].
Expr(Expression<P>),
}

View File

@@ -1,7 +1,18 @@
//! Command-line interface: argument parsing, help/version output, and fatal
//! error reporting.
//!
//! The primary entry point is [`parse_args`], which parses [`std::env::args`]
//! and returns an [`Opts`] struct. If any argument is invalid or required
//! arguments are missing, it calls [`fatal`] which prints an error to `stderr`
//! and exits with code `1`.
use std::path::PathBuf;
use yansi::Paint;
/// Print the help message to `stdout`.
///
/// Describes the compiler's usage, all supported options, and the `<file>`
/// positional argument.
pub fn print_help() {
println!(
"{} {} - the bucky language compiler",
@@ -47,27 +58,52 @@ pub fn print_help() {
);
}
/// Print the compiler version string (`buckyc <version>`) to `stdout`.
pub fn print_version() {
    // The version comes from Cargo.toml at compile time.
    let version = env!("CARGO_PKG_VERSION");
    println!("buckyc {version}");
}
/// Print a formatted error message to `stderr` and exit with code `1`.
///
/// This function never returns (`-> !`). Use it for unrecoverable CLI errors
/// such as missing arguments or unknown flags, discovered before compilation
/// begins.
pub fn fatal(message: impl ToString) -> ! {
eprintln!("{}: {}", "error".bold().red(), message.to_string().bold());
std::process::exit(1);
}
/// Parsed command-line options returned by [`parse_args`].
#[derive(Debug)]
pub struct Opts {
/// One or more source files to compile, in the order they were supplied.
pub files: Vec<PathBuf>,
/// `-S`: emit IR and stop (implies [`no_link`](Opts::no_link)).
pub emit_ir: bool,
/// `-c`: compile to an object file without invoking the linker.
pub no_link: bool,
/// `-o <file>`: destination path for the final output. When `None` the
/// compiler chooses a default output name.
pub output: Option<PathBuf>,
}
/// Parse [`std::env::args`] and return the resulting [`Opts`].
///
/// Recognised flags:
///
/// | Flag | Effect |
/// |------|--------|
/// | `-h`, `--help` | Print help and exit `0` |
/// | `-V`, `--version` | Print version and exit `0` |
/// | `-S` | Set [`emit_ir`](Opts::emit_ir) and [`no_link`](Opts::no_link) |
/// | `-c` | Set [`no_link`](Opts::no_link) |
/// | `-o <file>` | Set [`output`](Opts::output) |
/// | `<file>` | Append to [`files`](Opts::files) |
///
/// Calls [`fatal`] (and exits) if:
/// - an unknown `-`-prefixed flag is encountered, or
/// - `-o` is supplied without a following argument, or
/// - no source files are provided.
pub fn parse_args() -> Opts {
let mut files = Vec::new();
let mut no_link = false;

253
src/diagnostic.rs Normal file
View File

@@ -0,0 +1,253 @@
//! Compiler diagnostic reporting with source-location context.
//!
//! This module provides [`Diagnostic`], a structured error/warning message that
//! can optionally include a source span and one or more labelled secondary
//! spans. Diagnostics are rendered to `stderr` in a rustc-inspired format:
//!
//! ```text
//! Error: undeclared variable `x`
//! --> src/main.bky:3:5
//! |
//! 3 | let y = x + 1;
//! | ^ undeclared variable
//! |
//! ```
use std::{fmt::Display, path::Path, process::exit};
use yansi::Paint;
use crate::token::Span;
/// The importance level of a [`Diagnostic`].
///
/// Variants are ordered from least to most severe so that `<` / `>` comparisons
/// work intuitively (e.g. `Severity::Warning < Severity::Error`).
/// The derived `PartialOrd`/`Ord` rely on this declaration order — do not
/// reorder the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
/// Purely informational; never causes the compiler to stop.
Note,
/// Something suspicious that may or may not be a problem.
Warning,
/// A recoverable problem that prevents successful compilation.
Error,
/// An unrecoverable problem; the process will exit immediately after
/// reporting this diagnostic.
Critical,
}
impl Display for Severity {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Severity::Note => write!(f, "{}", "Note".bold().cyan()),
Severity::Warning => write!(f, "{}", "Warning".bold().yellow()),
Severity::Error => write!(f, "{}", "Error".bold().red()),
Severity::Critical => write!(f, "{}", "Critical".bold().magenta()),
}
}
}
/// A single compiler message with optional source-location information.
///
/// Build a diagnostic with [`Diagnostic::new`], optionally attach a primary
/// source location via [`with_span`](Diagnostic::with_span), attach labelled
/// secondary locations via [`add_label`](Diagnostic::add_label), then call
/// [`report`](Diagnostic::report) to print it.
///
/// If the severity is [`Severity::Critical`], `report` will call
/// [`process::exit`](std::process::exit) after printing.
pub struct Diagnostic {
/// How serious this message is; controls colour and whether `report` exits.
pub severity: Severity,
/// Primary source location, if any.
pub span: Option<Span>,
/// Human-readable description printed on the first line of the report.
pub message: String,
/// Secondary labelled spans rendered below the primary snippet.
pub labels: Vec<(Span, String)>,
}
impl Diagnostic {
/// Create a new diagnostic with the given severity and message.
///
/// No source location is attached; use [`with_span`](Self::with_span) to
/// add one.
pub fn new(severity: Severity, message: impl ToString) -> Self {
Self {
severity,
span: None,
message: message.to_string(),
labels: Vec::new(),
}
}
/// Attach a primary source span to this diagnostic (builder style).
pub fn with_span(mut self, span: Span) -> Self {
self.span = Some(span);
self
}
/// Attach a labelled secondary span (builder style).
///
/// Labels whose span matches the primary span exactly are merged into the
/// primary underline as inline text. All other labels are rendered as
/// separate snippets below the primary one.
pub fn add_label(mut self, span: Span, message: impl ToString) -> Self {
self.labels.push((span, message.to_string()));
self
}
/// Print this diagnostic to `stderr` and, if the severity is
/// [`Severity::Critical`], terminate the process.
///
/// # Arguments
/// * `file_name` path shown in the `-->` location line.
/// * `source` full source text of the file, used to extract line/col
/// information and to display the relevant source snippet.
pub fn report(self, file_name: &Path, source: &str) {
// First line: "Severity: message".
eprintln!("{}: {}", self.severity, self.message.bold());
// No primary span: print just the file name and bail out early.
let Some(primary_span) = self.span else {
eprintln!(" {} {}", "-->".bright_black(), file_name.display());
if self.severity == Severity::Critical {
// NOTE(review): exit(-1) is reported by the OS as status 255,
// while cli::fatal exits with 1 — confirm which code is intended.
exit(-1);
}
return;
};
// Guard: no source context available (e.g. critical error before any
// file is read).
if source.is_empty() || primary_span.start as usize >= source.len() {
eprintln!(" {} {}", "-->".bright_black(), file_name.display());
if self.severity == Severity::Critical {
exit(-1);
}
return;
}
let (primary_line, primary_col) = get_line_col(source, primary_span.start);
// Partition labels: those on the *exact same span* as the primary are
// merged into the primary underline as inline text. All others are
// rendered as separate snippets below the primary.
let (same_span, other_span): (Vec<_>, Vec<_>) = self
.labels
.into_iter()
.partition(|(s, _)| *s == primary_span);
// Only the first same-span label is used as the inline underline text.
let primary_label: Option<String> = same_span.into_iter().next().map(|(_, m)| m);
// Gutter must be wide enough for the highest line number we'll print.
let max_line = other_span
.iter()
.filter(|(s, _)| (s.start as usize) < source.len())
.map(|(s, _)| get_line_col(source, s.start).0)
.fold(primary_line, usize::max);
let gutter_w = count_digits(max_line);
let pad = " ".repeat(gutter_w);
// " --> file:line:col"
eprintln!(
"{} {}:{}:{}",
format!("{pad} -->").bright_black(),
file_name.display(),
primary_line,
primary_col,
);
eprintln!("{}", format!("{pad} |").bright_black());
// Primary snippet.
render_snippet(
source,
primary_span,
primary_label.as_deref(),
gutter_w,
self.severity,
);
// Additional-context labels (different locations), always rendered in
// the Note colour regardless of the diagnostic's own severity.
for (span, msg) in &other_span {
if (span.start as usize) < source.len() {
render_snippet(source, *span, Some(msg.as_str()), gutter_w, Severity::Note);
}
}
eprintln!("{}", format!("{pad} |").bright_black());
if self.severity == Severity::Critical {
exit(-1);
}
}
}
/// Render a single source-line snippet: the numbered source line followed by
/// a `^^^` underline. When `label` is `Some`, the text is appended after the
/// carets on the same line.
fn render_snippet(
    source: &str,
    span: Span,
    label: Option<&str>,
    gutter_w: usize,
    severity: Severity,
) {
    let (line_num, _) = get_line_col(source, span.start);
    let (line_start, line_content) = get_line_content(source, span.start);
    let pad = " ".repeat(gutter_w);
    let bar = format!("{}", "|".bright_black());
    let line_num_str = format!("{:>width$}", line_num, width = gutter_w);
    // "N | source text"
    eprintln!("{} {bar} {line_content}", line_num_str.bright_black());
    // Byte offsets of the underlined region, clamped to the current line.
    let col_offset = span.start as usize - line_start;
    let line_end_byte = line_start + line_content.len();
    let underline_bytes = (span.end as usize)
        .min(line_end_byte)
        .saturating_sub(span.start as usize);
    // Convert byte offsets to *character* counts so the caret underline stays
    // aligned when the line contains multi-byte UTF-8 text (the terminal
    // advances one column per character, not per byte, and `get_line_col`
    // likewise reports char columns).
    let lead_chars = line_content[..col_offset].chars().count();
    let caret_chars = line_content[col_offset..col_offset + underline_bytes]
        .chars()
        .count()
        .max(1);
    let spaces = " ".repeat(lead_chars);
    let carets = "^".repeat(caret_chars);
    let colored_carets = paint_severity(&carets, severity);
    let label_text = label
        .map(|l| format!(" {}", paint_severity(l, severity)))
        .unwrap_or_default();
    // " | ^^^label"
    eprintln!("{pad} {bar} {spaces}{colored_carets}{label_text}");
}
/// Apply severity-appropriate ANSI colour to a string.
fn paint_severity(s: &str, severity: Severity) -> String {
match severity {
Severity::Note => format!("{}", s.bold().bright_cyan()),
Severity::Warning => format!("{}", s.bold().bright_yellow()),
Severity::Error | Severity::Critical => format!("{}", s.bold().bright_red()),
}
}
/// Returns the number of decimal digits in `n` (minimum 1).
fn count_digits(mut n: usize) -> usize {
    // Counting by repeated division avoids the intermediate `String`
    // allocation of `format!("{n}").len()`.
    let mut digits = 1;
    while n >= 10 {
        n /= 10;
        digits += 1;
    }
    digits
}
/// Returns `(line_start_byte, line_content)` for the line that contains
/// `position`. The returned content does *not* include the trailing newline.
fn get_line_content(source: &str, position: u32) -> (usize, &str) {
    let pos = position as usize;
    // The line begins just after the previous newline (or at byte 0).
    let start = match source[..pos].rfind('\n') {
        Some(nl) => nl + 1,
        None => 0,
    };
    // The line ends at the next newline (or at end of input).
    let tail = &source[start..];
    match tail.find('\n') {
        Some(end) => (start, &tail[..end]),
        None => (start, tail),
    }
}
/// Returns the 1-based `(line, column)` for a byte `position` within `source`.
///
/// Both line and column are counted from 1. The column is measured in Unicode
/// scalar values (characters), not bytes.
fn get_line_col(source: &str, position: u32) -> (usize, usize) {
    let prefix = &source[..position as usize];
    // Single pass: count newlines and remember where the last line began.
    let mut line = 1;
    let mut line_start = 0;
    for (idx, byte) in prefix.bytes().enumerate() {
        if byte == b'\n' {
            line += 1;
            line_start = idx + 1;
        }
    }
    let col = prefix[line_start..].chars().count() + 1;
    (line, col)
}

View File

@@ -1,16 +1,44 @@
//! Lexer (tokeniser) that converts raw source text into a [`Token`] stream.
//!
//! [`Lexer`] implements [`Iterator<Item = Token>`] so it can be used directly
//! in a `for` loop or with iterator adaptors such as `.peekable()`.
//! Whitespace and `#`-line-comments are skipped automatically between tokens.
//!
//! # Character classes
//! - **Identifiers / keywords** — start with a
//! [XID_Start](https://unicode.org/reports/tr31/) character or `_`, continue
//! with XID_Continue characters. Reserved words are mapped to their
//! respective [`TokenKind`] variants; everything else becomes
//! [`TokenKind::Identifier`].
//! - **Integer literals** — decimal by default; `0x` / `0o` / `0b` prefixes
//! select hexadecimal, octal, and binary respectively.
//! - **String literals** — delimited by `"…"`; `\` escapes the next character.
//! - **Operators and punctuation** — single- or double-character tokens
//! dispatched via the `token!` macro with one character of lookahead.
use std::{iter::Peekable, str::Chars};
use unicode_xid::UnicodeXID;
use crate::token::{Span, Token, TokenKind};
/// A lazy iterator over the [`Token`]s of a source string.
///
/// Tokens borrow their text slice directly from the original source, so the
/// lexer lifetime `'src` must outlive any use of the produced tokens.
///
/// Construct with [`Lexer::new`] and consume via the [`Iterator`] impl or by
/// passing it to the parser.
pub struct Lexer<'src> {
/// One-character look-ahead over the source characters.
chars: Peekable<Chars<'src>>,
/// The full source text, kept for slice extraction in [`make`](Self::make).
source: &'src str,
/// Current *byte* offset into `source` (not a char index). Advanced by
/// [`advance`](Self::advance) using each char's UTF-8 length.
position: usize,
}
impl<'src> Lexer<'src> {
/// Creates a new [`Lexer`] positioned at the start of `source`.
pub fn new(source: &'src str) -> Self {
Self {
chars: source.chars().peekable(),
@@ -24,22 +52,29 @@ impl<'src> Lexer<'src> {
self.chars.peek().copied()
}
/// Consume and return the next character, advancing [`position`](Self::position)
/// by the character's UTF-8 byte length.
///
/// # Panics
/// Panics if called at the end of input. Always guard with
/// [`peek`](Self::peek) first.
fn advance(&mut self) -> char {
let ch = self.chars.next().expect("failed to advance the lexer");
self.position += ch.len_utf8();
ch
}
/// Advance while `condition` holds, stopping at the first character for
/// which it returns `false` (or at end of input).
fn advance_while(&mut self, condition: impl FnMut(char) -> bool + Copy) {
while self.peek().is_some_and(condition) {
self.advance();
}
}
/// Build a token from `[start, self.pos)`.
/// Construct a [`Token`] spanning the byte range `[start, self.position)`.
///
/// The token's `text` is a zero-copy slice of the source string.
fn make(&self, kind: TokenKind, start: usize) -> Token<'src> {
Token {
kind,
@@ -48,7 +83,11 @@ impl<'src> Lexer<'src> {
}
}
/// Skip all whitespace and comments.
/// Skip any run of whitespace followed by a `#` line comment, repeating
/// until neither is present.
///
/// Comments begin with `#` and extend to (but do not include) the
/// following `\n`.
fn skip_whitespace_and_comments(&mut self) {
loop {
self.advance_while(char::is_whitespace);
@@ -61,7 +100,12 @@ impl<'src> Lexer<'src> {
}
}
/// Lexes the next identifier token.
/// Lex the next identifier or keyword token.
///
/// Assumes the current peek character satisfies `is_xid_start() || == '_'`.
/// Consumes one XID_Start (or `_`) character followed by any number of
/// XID_Continue characters, then matches the resulting slice against the
/// keyword / type-keyword table.
fn next_identifier(&mut self) -> TokenKind {
let start = self.position;
@@ -71,20 +115,32 @@ impl<'src> Lexer<'src> {
match &self.source[start..self.position] {
"and" => TokenKind::KwAnd,
"or" => TokenKind::KwOr,
"as" => TokenKind::KwAs,
"let" => TokenKind::KwLet,
"u8" => TokenKind::TyU8,
"u16" => TokenKind::TyU16,
"u32" => TokenKind::TyU32,
"u64" => TokenKind::TyU64,
"i8" => TokenKind::TyI8,
"i16" => TokenKind::TyI16,
"i32" => TokenKind::TyI32,
"i64" => TokenKind::TyI64,
"bool" => TokenKind::TyBool,
"true" | "false" => TokenKind::LitBool,
_ => TokenKind::Identifier,
}
}
/// Lexes the next number token.
/// Lex the next integer literal token.
///
/// Assumes the current peek character is an ASCII digit. Detects an
/// optional radix prefix (`0x` → 16, `0o` → 8, `0b` → 2) then consumes
/// all subsequent digits valid for that radix. Always returns
/// [`TokenKind::LitInt`].
fn next_number(&mut self) -> TokenKind {
let radix = match self.advance() {
'0' => match self.peek() {
@@ -110,7 +166,15 @@ impl<'src> Lexer<'src> {
TokenKind::LitInt
}
/// Lexes the next string token.
/// Lex the next string literal token.
///
/// Assumes the current peek character is `"`. Consumes characters until
/// a closing (unescaped) `"` is found or input is exhausted. A `\`
/// escapes the immediately following character, preventing it from being
/// treated as a closing delimiter. Always returns [`TokenKind::LitString`].
///
/// Note: escape sequences are not validated here; that is left to a later
/// compiler stage.
fn next_string(&mut self) -> TokenKind {
let mut escaped = false;
@@ -138,11 +202,25 @@ impl<'src> Lexer<'src> {
impl<'src> Iterator for Lexer<'src> {
type Item = Token<'src>;
/// Returns the next [`Token`], or `None` when the source is exhausted.
///
/// Leading whitespace and `#`-comments are skipped before each token.
/// Multi-character operator tokens (`->`, `<<`, `<=`, …) are resolved with
/// a single character of lookahead via the `token!` macro. Unrecognised
/// characters are returned as [`TokenKind::Unknown`].
fn next(&mut self) -> Option<Self::Item> {
self.skip_whitespace_and_comments();
let start = self.position;
/// Builds and evaluates a [`TokenKind`] from the current position.
///
/// Three forms:
/// - `token!($kind)` — single-character token: advance once, yield `$kind`.
/// - `token!($c => $kind, … ; $default)` — multi-character token with
/// lookahead: advance once (consuming the lead character), then
/// check the next character against each `$c => $kind` arm in order,
/// falling back to `$default` if none match.
macro_rules! token {
// Case 1: Simple token (no lookahead)
($default:expr) => {{

View File

@@ -2,11 +2,14 @@ use std::fs;
use crate::{
cli::{fatal, parse_args},
lexer::Lexer,
parser::Parser,
};
mod ast;
mod cli;
mod diagnostic;
mod lexer;
mod parser;
mod token;
fn main() {
@@ -25,8 +28,16 @@ fn main() {
};
println!("-- {} --", file.display());
for token in Lexer::new(&content) {
println!("{}", token);
let mut parser = Parser::new(&content);
match parser.parse_statement() {
Ok(ast) => println!("{ast:#?}"),
Err(diag) => diag.report(file, &content),
}
parser
.errors
.into_iter()
.for_each(|diag| diag.report(file, &content));
}
}

558
src/parser.rs Normal file
View File

@@ -0,0 +1,558 @@
//! Recursive-descent / Pratt parser that converts a token stream into an AST.
//!
//! The entry points are [`Parser::parse_statement`], [`Parser::parse_type`],
//! and [`Parser::parse_expression`].
//! Errors are represented as [`Diagnostic`] values; the caller is responsible
//! for reporting them.
use std::iter::Peekable;
use crate::ast;
use crate::diagnostic::{Diagnostic, Severity};
use crate::lexer::Lexer;
use crate::token::{Token, TokenKind};
/// Consumes the [`Token`] stream produced by the [`Lexer`] and constructs an
/// AST in the [`ast::Parsed`] phase.
///
/// The parser uses a single token of look-ahead (peek) for all decisions.
/// Expression parsing is implemented with the
/// [Pratt / top-down operator-precedence](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html)
/// algorithm; binding-power tables are defined by [`infix_binding_power`],
/// [`prefix_binding_power`], and [`postfix_binding_power`].
pub struct Parser<'src> {
/// One-token look-ahead stream over the lexer.
tokens: Peekable<Lexer<'src>>,
/// Diagnostics accumulated during parsing. Non-fatal errors are pushed here
/// so that the parser can attempt to continue and surface multiple issues
/// in a single pass.
pub errors: Vec<Diagnostic>,
}
impl<'src> Parser<'src> {
/// Constructs a new [`Parser`] over the tokens of `source`.
pub fn new(source: &'src str) -> Self {
    let tokens = Lexer::new(source).peekable();
    let errors = Vec::new();
    Self { tokens, errors }
}
/// Peek at the next [`Token`] without consuming it.
///
/// `Token` is `Copy` (its text is a borrowed slice), so this returns a
/// copy rather than a reference into the iterator.
fn peek(&mut self) -> Option<Token<'src>> {
self.tokens.peek().copied()
}
/// Peek at the next [`Token`], returning an [`Err`] diagnostic if the
/// token stream is exhausted.
fn peek_no_eof(&mut self) -> Result<Token<'src>, Diagnostic> {
self.peek()
.ok_or_else(|| Diagnostic::new(Severity::Error, "unexpected end of input"))
}
/// Returns `true` if the next token has the given [`TokenKind`].
fn is_peek(&mut self, kind: TokenKind) -> bool {
    // `is_some_and` over `map_or(false, …)` (clippy idiom; the lexer
    // already uses it).
    self.peek().is_some_and(|tok| tok.is(kind))
}
/// Returns `true` if the token stream is exhausted.
fn is_at_eof(&mut self) -> bool {
    // Peek the underlying stream directly; no need to copy the token out.
    self.tokens.peek().is_none()
}
/// Consumes and returns the next [`Token`].
///
/// # Panics
/// Panics if called at the end of input. Always check [`is_at_eof`](Self::is_at_eof)
/// or use [`peek_no_eof`](Self::peek_no_eof) / [`expect`](Self::expect) in
/// production code paths.
fn advance(&mut self) -> Token<'src> {
// An EOF here is a parser bug, not a user error — hence the panic.
self.tokens.next().expect("failed to advance the parser")
}
/// Consumes and returns the next [`Token`] if it matches `kind`; otherwise
/// returns an [`Err`] diagnostic that points at the offending token.
fn expect(&mut self, kind: TokenKind) -> Result<Token<'src>, Diagnostic> {
match self.peek() {
Some(tok) if tok.is(kind) => Ok(self.advance()),
Some(tok) => Err(Diagnostic::new(Severity::Error, "unexpected token found")
.with_span(tok.span)
.add_label(
tok.span,
format!("expected {} but found {} instead", kind, tok.kind),
)),
None => Err(Diagnostic::new(Severity::Error, "unexpected end of input")),
}
}
/// Error-recovery helper: skips tokens until a statement boundary is
/// reached so that subsequent statements can still be parsed cleanly.
///
/// Stops *after* consuming a `;`, or *before* consuming a `}`. This keeps
/// nested blocks intact when recovering inside function bodies.
fn synchronize(&mut self) {
    loop {
        let Some(tok) = self.peek() else { break };
        // Never consume a `}` — the enclosing block parser needs it.
        if tok.is(TokenKind::RCurly) {
            break;
        }
        let at_boundary = tok.is(TokenKind::Semi);
        self.advance();
        // A `;` ends the broken statement; resume parsing after it.
        if at_boundary {
            break;
        }
    }
}
/// Parses the next statement.
///
/// Dispatches to the appropriate specialised parser based on the leading
/// token:
/// - `let` → [`parse_let_statement`](Self::parse_let_statement)
/// - `{` → [`parse_compound_statement`](Self::parse_compound_statement)
/// - anything else → an expression followed by a mandatory `;`
pub fn parse_statement(&mut self) -> Result<ast::ParsedStatement, Diagnostic> {
    match self.peek_no_eof()?.kind {
        TokenKind::KwLet => self.parse_let_statement(),
        TokenKind::LCurly => self.parse_compound_statement(),
        _ => {
            // Expression statement: `<expr> ;` — span covers both.
            let expr = self.parse_expression(0)?;
            let semi = self.expect(TokenKind::Semi)?;
            let span = expr.span.extend(semi.span);
            Ok(ast::ParsedStatement {
                kind: ast::StatementKind::Expr(expr),
                span,
                extra: (),
            })
        }
    }
}
/// Parses a `let` binding statement: `let <name>[: <type>] [= <expr>];`.
///
/// Both the type annotation and the initialiser are optional. The
/// statement span runs from the `let` keyword through to the closing `;`.
fn parse_let_statement(&mut self) -> Result<ast::ParsedStatement, Diagnostic> {
    let kw = self.expect(TokenKind::KwLet)?;
    let ident = self.expect(TokenKind::Identifier)?;
    let name = ident.text.to_string();
    let name_span = ident.span;
    // Optional `: <type>` annotation.
    let mut ty = None;
    if self.is_peek(TokenKind::Colon) {
        self.advance();
        ty = Some(self.parse_type()?);
    }
    // Optional `= <expr>` initialiser.
    let mut value = None;
    if self.is_peek(TokenKind::Assign) {
        self.advance();
        value = Some(self.parse_expression(0)?);
    }
    let semi = self.expect(TokenKind::Semi)?;
    Ok(ast::ParsedStatement {
        kind: ast::StatementKind::Let {
            name,
            name_span,
            ty,
            value,
        },
        span: kw.span.extend(semi.span),
        extra: (),
    })
}
/// Parses a braced block of statements: `{ <stmt>* }`.
///
/// Each inner statement is parsed with [`parse_statement`](Self::parse_statement).
/// If a statement fails, the diagnostic is pushed onto [`errors`](Parser::errors)
/// and [`synchronize`](Self::synchronize) is called so that parsing can
/// continue with the next statement. The block span runs from `{` to `}`.
fn parse_compound_statement(&mut self) -> Result<ast::ParsedStatement, Diagnostic> {
    let open = self.expect(TokenKind::LCurly)?;
    let mut inner = Vec::new();
    loop {
        if self.is_at_eof() || self.is_peek(TokenKind::RCurly) {
            break;
        }
        match self.parse_statement() {
            Ok(stmt) => inner.push(stmt),
            Err(diag) => {
                // Record the error and skip to a statement boundary so the
                // rest of the block can still be parsed.
                self.errors.push(diag);
                self.synchronize();
            }
        }
    }
    let close = self.expect(TokenKind::RCurly)?;
    Ok(ast::ParsedStatement {
        kind: ast::StatementKind::Compound { inner },
        span: open.span.extend(close.span),
        extra: (),
    })
}
/// Parses a type annotation, e.g. `u8`, `i64`, `bool`, or a user-defined
/// named type.
///
/// Returns an [`Err`] diagnostic if the next token is not a valid type.
pub fn parse_type(&mut self) -> Result<ast::ParsedType, Diagnostic> {
let peek = self.peek_no_eof()?;
let kind = match peek.kind {
TokenKind::TyU8 => ast::TypeKind::U8,
TokenKind::TyU16 => ast::TypeKind::U16,
TokenKind::TyU32 => ast::TypeKind::U32,
TokenKind::TyU64 => ast::TypeKind::U64,
TokenKind::TyI8 => ast::TypeKind::I8,
TokenKind::TyI16 => ast::TypeKind::I16,
TokenKind::TyI32 => ast::TypeKind::I32,
TokenKind::TyI64 => ast::TypeKind::I64,
TokenKind::TyBool => ast::TypeKind::Bool,
TokenKind::Identifier => ast::TypeKind::Named(peek.text.to_string()),
_ => {
return Err(
Diagnostic::new(Severity::Error, "expected a type").with_span(peek.span)
);
}
};
let span = self.advance().span;
Ok(ast::ParsedType {
kind,
span,
extra: (),
})
}
/// Parses an expression using the Pratt (top-down operator-precedence)
/// algorithm.
///
/// `min_bp` is the minimum *left* binding power the next infix/postfix
/// operator must have to be incorporated into the current expression. Pass
/// `0` to parse a full expression with no restrictions.
///
/// The precedence hierarchy (low → high) is:
/// - assignment (`=`)
/// - logical `or` / `and`
/// - bitwise `|` / `^` / `&`
/// - equality (`==`, `!=`) and comparison (`<`, `<=`, `>`, `>=`)
/// - addition / subtraction
/// - shifts (`<<`, `>>`)
/// - multiplication / division / remainder
/// - member access (`.`)
/// - postfix: call `()`, index `[]`, cast `as`
/// - prefix: `-`, `&`, `~`, `*`, `!`
pub fn parse_expression(&mut self, min_bp: u8) -> Result<ast::ParsedExpression, Diagnostic> {
    let first = self.peek_no_eof()?;
    // Prefix position: either a unary operator applied to a sub-expression,
    // or a plain primary expression.
    let mut expr = match prefix_binding_power(first.kind) {
        Some((right_bp, op)) => {
            let op_span = self.advance().span;
            let operand = Box::new(self.parse_expression(right_bp)?);
            let span = op_span.extend(operand.span);
            ast::ParsedExpression {
                kind: ast::ExpressionKind::Unary {
                    op,
                    op_span,
                    operand,
                },
                span,
                extra: (),
            }
        }
        None => self.parse_primary_expression()?,
    };
    // Infix/postfix position: keep folding operators into `expr` for as long
    // as they bind at least as tightly as `min_bp`. Postfix forms are tried
    // before infix forms, mirroring their higher precedence.
    while let Some(next) = self.peek() {
        if let Some(left_bp) = postfix_binding_power(next.kind) {
            if left_bp < min_bp {
                break;
            }
            expr = match next.kind {
                TokenKind::LParen => self.parse_call_expr(expr)?,
                TokenKind::LBracket => self.parse_index_expr(expr)?,
                TokenKind::KwAs => self.parse_cast_expr(expr)?,
                // `postfix_binding_power` only matches the kinds above.
                _ => unreachable!(),
            };
        } else if let Some((left_bp, right_bp, op)) = infix_binding_power(next.kind) {
            if left_bp < min_bp {
                break;
            }
            let op_span = self.advance().span;
            let rhs = Box::new(self.parse_expression(right_bp)?);
            let span = expr.span.extend(rhs.span);
            expr = ast::ParsedExpression {
                kind: ast::ExpressionKind::Binary {
                    op,
                    op_span,
                    left: Box::new(expr),
                    right: rhs,
                },
                span,
                extra: (),
            };
        } else {
            // Not an operator: the expression ends here.
            break;
        }
    }
    Ok(expr)
}
/// Parses a primary (non-operator) expression: an identifier, integer
/// literal, boolean literal, or a parenthesised expression.
///
/// Integer literals support `0x` (hex), `0o` (octal), and `0b` (binary)
/// prefixes in addition to plain decimal.
fn parse_primary_expression(&mut self) -> Result<ast::ParsedExpression, Diagnostic> {
    let peek_token = self.peek_no_eof()?;
    match peek_token.kind {
        TokenKind::Identifier => {
            let name = self.advance().text.to_string();
            Ok(ast::ParsedExpression {
                kind: ast::ExpressionKind::Identifier(name),
                span: peek_token.span,
                extra: (),
            })
        }
        TokenKind::LitInt => {
            let tok = self.advance();
            // Detect a radix prefix; plain decimal when none matches.
            let (radix, src) = [("0x", 16), ("0o", 8), ("0b", 2)]
                .into_iter()
                .find_map(|(prefix, radix)| {
                    tok.text.strip_prefix(prefix).map(|text| (radix, text))
                })
                .unwrap_or((10, tok.text));
            let value = u64::from_str_radix(src, radix).map_err(|_| {
                Diagnostic::new(Severity::Error, "invalid integer literal")
                    .with_span(tok.span)
                    .add_label(tok.span, "this is an invalid integer literal")
            })?;
            Ok(ast::ParsedExpression {
                kind: ast::ExpressionKind::LitInteger(value),
                span: tok.span,
                extra: (),
            })
        }
        TokenKind::LitBool => {
            let value = self.advance().text == "true";
            Ok(ast::ParsedExpression {
                kind: ast::ExpressionKind::LitBool(value),
                span: peek_token.span,
                extra: (),
            })
        }
        TokenKind::LParen => {
            // Grouping: the parens only affect precedence, so reuse the inner
            // node's kind but widen the span to include both parentheses.
            let open_paren = self.advance();
            let inner = self.parse_expression(0)?;
            let close_paren = self.expect(TokenKind::RParen)?;
            Ok(ast::ParsedExpression {
                kind: inner.kind,
                span: open_paren.span.extend(close_paren.span),
                extra: (),
            })
        }
        // Fix: the diagnostic previously omitted `(` even though a
        // parenthesised expression is accepted above; list every valid start.
        _ => Err(Diagnostic::new(
            Severity::Error,
            format!(
                "expected one of {}, {}, {} or {} but found {} instead",
                TokenKind::Identifier,
                TokenKind::LitInt,
                TokenKind::LitBool,
                TokenKind::LParen,
                peek_token.kind
            ),
        )
        .with_span(peek_token.span)),
    }
}
/// Parses a function-call expression `func(arg, …)`.
///
/// The opening `(` is consumed here; `func` is the already-parsed callee
/// expression passed in from the Pratt loop.
fn parse_call_expr(
    &mut self,
    func: ast::ParsedExpression,
) -> Result<ast::ParsedExpression, Diagnostic> {
    self.expect(TokenKind::LParen)?;
    let mut args = Vec::new();
    loop {
        // Stop at `)`, or at EOF so `expect` below reports the missing paren.
        if self.is_at_eof() || self.is_peek(TokenKind::RParen) {
            break;
        }
        // Every argument after the first must be preceded by a comma.
        if !args.is_empty() {
            self.expect(TokenKind::Comma)?;
        }
        args.push(self.parse_expression(0)?);
    }
    let closing = self.expect(TokenKind::RParen)?;
    let call_span = func.span.extend(closing.span);
    Ok(ast::ParsedExpression {
        kind: ast::ExpressionKind::Call {
            func: Box::new(func),
            args,
        },
        span: call_span,
        extra: (),
    })
}
/// Parses an index expression `expr[index]`.
///
/// The opening `[` is consumed here; `expr` is the already-parsed
/// collection expression passed in from the Pratt loop.
fn parse_index_expr(
    &mut self,
    expr: ast::ParsedExpression,
) -> Result<ast::ParsedExpression, Diagnostic> {
    self.expect(TokenKind::LBracket)?;
    let index = Box::new(self.parse_expression(0)?);
    let closing = self.expect(TokenKind::RBracket)?;
    // The node covers everything from the indexed expression to `]`.
    let full_span = expr.span.extend(closing.span);
    Ok(ast::ParsedExpression {
        kind: ast::ExpressionKind::Index {
            expr: Box::new(expr),
            index,
        },
        span: full_span,
        extra: (),
    })
}
/// Parses a cast expression `expr as Type`.
///
/// The `as` keyword is consumed here; `expr` is the already-parsed value
/// expression passed in from the Pratt loop.
fn parse_cast_expr(
    &mut self,
    expr: ast::ParsedExpression,
) -> Result<ast::ParsedExpression, Diagnostic> {
    self.expect(TokenKind::KwAs)?;
    let target = self.parse_type()?;
    // The node covers everything from the value expression to the type.
    let full_span = expr.span.extend(target.span);
    Ok(ast::ParsedExpression {
        kind: ast::ExpressionKind::Cast {
            expr: Box::new(expr),
            ty: Box::new(target),
        },
        span: full_span,
        extra: (),
    })
}
}
/// Returns `(left_bp, right_bp, op)` for infix operators, or `None` if `kind`
/// is not an infix operator.
///
/// The two binding-power values implement associativity: `right_bp = left_bp + 1`
/// makes an operator left-associative, while `right_bp == left_bp` makes it
/// right-associative (currently used for `=`, so `a = b = c` parses as
/// `a = (b = c)`).
fn infix_binding_power(kind: TokenKind) -> Option<(u8, u8, ast::BinaryOp)> {
Some(match kind {
// Assignment: loosest binding, right-associative (equal powers).
TokenKind::Assign => (2, 2, ast::BinaryOp::Assign),
// Logical operators: `or` binds looser than `and`.
TokenKind::KwOr => (10, 11, ast::BinaryOp::Or),
TokenKind::KwAnd => (20, 21, ast::BinaryOp::And),
// Bitwise operators: `|` < `^` < `&`, mirroring C-family precedence.
TokenKind::Pipe => (30, 31, ast::BinaryOp::BitOr),
TokenKind::Caret => (40, 41, ast::BinaryOp::BitXor),
TokenKind::Amp => (50, 51, ast::BinaryOp::BitAnd),
// Equality binds looser than ordering comparisons.
TokenKind::Eq => (55, 56, ast::BinaryOp::Eq),
TokenKind::Ne => (55, 56, ast::BinaryOp::Ne),
TokenKind::Lt => (57, 58, ast::BinaryOp::Lt),
TokenKind::Le => (57, 58, ast::BinaryOp::Le),
TokenKind::Gt => (57, 58, ast::BinaryOp::Gt),
TokenKind::Ge => (57, 58, ast::BinaryOp::Ge),
// Additive < shift < multiplicative.
TokenKind::Plus => (60, 61, ast::BinaryOp::Add),
TokenKind::Minus => (60, 61, ast::BinaryOp::Sub),
TokenKind::Shl => (65, 66, ast::BinaryOp::BitShl),
TokenKind::Shr => (65, 66, ast::BinaryOp::BitShr),
TokenKind::Star => (70, 71, ast::BinaryOp::Mul),
TokenKind::Slash => (70, 71, ast::BinaryOp::Div),
TokenKind::Percent => (70, 71, ast::BinaryOp::Rem),
// Member access binds tightest of all infix operators.
TokenKind::Dot => (100, 101, ast::BinaryOp::Dot),
_ => return None,
})
}
/// Returns `(right_bp, op)` for prefix operators, or `None` if `kind` is not
/// a prefix operator.
///
/// All prefix operators currently share the same binding power (`80`), giving
/// them higher precedence than any binary operator.
fn prefix_binding_power(kind: TokenKind) -> Option<(u8, ast::UnaryOp)> {
    // Every prefix operator binds with the same strength.
    const PREFIX_BP: u8 = 80;
    let op = match kind {
        TokenKind::Minus => ast::UnaryOp::Neg,
        TokenKind::Amp => ast::UnaryOp::AddrOf,
        TokenKind::Tilde => ast::UnaryOp::BitNot,
        TokenKind::Star => ast::UnaryOp::Deref,
        TokenKind::Bang => ast::UnaryOp::Not,
        _ => return None,
    };
    Some((PREFIX_BP, op))
}
/// Returns the *left* binding power for postfix operators, or `None` if `kind`
/// is not a postfix operator.
///
/// Postfix operators (`()`, `[]`, `as`) bind tighter than all binary operators
/// but are checked before prefix operators in the Pratt loop so they always
/// apply to the nearest sub-expression.
fn postfix_binding_power(kind: TokenKind) -> Option<u8> {
    match kind {
        // Calls and indexing bind equally tightly; `as` slightly looser.
        TokenKind::LParen | TokenKind::LBracket => Some(100),
        TokenKind::KwAs => Some(90),
        _ => None,
    }
}

View File

@@ -1,28 +1,50 @@
//! Token definitions used by the [`Lexer`](crate::lexer::Lexer) and
//! [`Parser`](crate::parser::Parser).
//!
//! The two core types are:
//! - [`Span`] — a half-open byte range that marks a location in source text.
//! - [`Token`] — a classified slice of source text together with its span.
//!
//! [`TokenKind`] enumerates every token variant; its [`Display`](std::fmt::Display)
//! impl produces the human-readable representation used in diagnostics.
use std::fmt;
/// A Span is a half-open byte range `[start, end)` which marks a location in
/// the source string. The start and end positions are stored as a [u32] which
/// limits us to a maximum source file size of 4 gigabytes.
/// A half-open byte range `[start, end)` that marks a location in the source
/// string.
///
/// Positions are stored as [`u32`], which limits supported source files to
/// 4 GiB — more than sufficient for any practical source file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Span {
/// Inclusive start byte offset.
pub start: u32,
/// Exclusive end byte offset.
pub end: u32,
}
impl Span {
/// Creates a new span covering `[start, end)`.
pub const fn new(start: u32, end: u32) -> Self {
Self { start, end }
}
/// Returns the length of the span in bytes.
///
/// Uses saturating subtraction so an inverted span returns `0` rather than
/// wrapping.
pub fn len(&self) -> u32 {
self.end.saturating_sub(self.start)
}
/// Returns `true` if the span covers zero bytes (`start == end`).
pub fn is_empty(&self) -> bool {
self.start == self.end
}
/// Extend this [Span] to cover `other` as well.
/// Returns the smallest span that covers both `self` and `other`.
///
/// This is the union of the two ranges, useful for computing the span of a
/// parent node from its children.
pub fn extend(self, other: Self) -> Self {
Self {
start: self.start.min(other.start),
@@ -37,8 +59,10 @@ impl fmt::Display for Span {
}
}
/// This macro helps with defining the different kinds of [Token]s. It
/// simultaneously defines a variant and its [fmt::Display] implementation.
/// Simultaneously defines the [`TokenKind`] enum and its [`fmt::Display`] impl.
///
/// Each arm maps a variant name to the human-readable string used in
/// diagnostics (e.g. `` `+` ``, `identifier`).
macro_rules! define_tokens {
($($name:ident => $repr:literal),* $(,)?) => {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -68,16 +92,19 @@ define_tokens! {
// -- Keywords --
KwAnd => "`and`",
KwOr => "`or`",
KwAs => "`as`",
KwLet => "`let`",
// -- Type Keywords --
TyU8 => "`u8`",
TyU16 => "`u16`",
TyU32 => "`u32`",
TyU64 => "`u64`",
TyI8 => "`i8`",
TyI16 => "`i16`",
TyI32 => "`i32`",
TyI64 => "`i64`",
TyU8 => "`u8`",
TyU16 => "`u16`",
TyU32 => "`u32`",
TyU64 => "`u64`",
TyI8 => "`i8`",
TyI16 => "`i16`",
TyI32 => "`i32`",
TyI64 => "`i64`",
TyBool => "`bool`",
// -- Arithmetic Operators --
Plus => "`+`",
@@ -95,7 +122,7 @@ define_tokens! {
Shr => "`>>`",
Bang => "`!`",
// -- Comparision Operators --
// -- Comparison Operators --
Eq => "`==`",
Ne => "`!=`",
Lt => "`<`",
@@ -123,17 +150,23 @@ define_tokens! {
Unknown => "unknown character"
}
/// A Token represents the smallest continous unit of the source code. It holds
/// its [TokenKind], [Span] and source text.
/// The smallest contiguous unit of source text, as produced by the
/// [`Lexer`](crate::lexer::Lexer).
///
/// A token borrows its [`text`](Token::text) slice directly from the original
/// source string, so the lifetime `'src` ties every token to that source.
#[derive(Debug, Clone, Copy)]
pub struct Token<'src> {
/// The syntactic category of this token.
pub kind: TokenKind,
/// The byte range in the source string where this token appears.
pub span: Span,
/// The raw source text of this token (a zero-copy slice).
pub text: &'src str,
}
impl<'src> Token<'src> {
/// Checks if the current [Token] is of given [TokenKind].
/// Returns `true` if this token has the given [`TokenKind`].
pub fn is(&self, kind: TokenKind) -> bool {
self.kind == kind
}

4
test.bky Normal file
View File

@@ -0,0 +1,4 @@
{
let a : 0;
f();
}