Add fluxc compiler skeleton: token definitions and lexer

Introduces the fluxc Rust crate with the first two compiler stages:

- token.rs: define_tokens! macro generates TokenKind enum and its
  Display impl from a single table covering all Flux tokens
  (literals, keywords, operators, punctuation, Eof/Unknown).
  Span (half-open u32 byte range) and Token<'src> (kind + span +
  zero-copy text slice) round out the module.

- lexer.rs: Lexer<'src> produces Token<'src> from a source &str.
  Skips whitespace, // line comments, and /* */ block comments.
  Handles all integer bases (decimal, hex, octal, binary with _
  separators), floats (fractional + exponent), string/char literals
  with escape sequences, and Unicode identifiers via unicode-xid.
  Implements Iterator<Item = Token> and includes 17 unit tests.

Also adds .gitignore (ignores fluxc/target) and expands
examples/fibonacci.flx with an iterative variant.
This commit is contained in:
2026-03-10 17:20:17 +01:00
parent 0e08640f59
commit 4f80de51b2
7 changed files with 798 additions and 2 deletions

552
fluxc/src/lexer.rs Normal file
View File

@@ -0,0 +1,552 @@
use crate::token::{Span, Token, TokenKind};
use unicode_xid::UnicodeXID;
/// Hand-written lexer for Flux source text.
///
/// Construct with [`Lexer::new`], then pull tokens one at a time via
/// `next_token`, all at once via `tokenize`, or lazily through the
/// `Iterator` impl (which stops before the trailing `Eof`).
/// Tokens are zero-copy: their `text` borrows slices of `src`.
pub struct Lexer<'src> {
    /// The full source text being lexed; never mutated.
    src: &'src str,
    /// Current byte offset into `src`. Always kept on a UTF-8 char boundary.
    pos: usize,
}
impl<'src> Lexer<'src> {
pub fn new(src: &'src str) -> Self {
Self { src, pos: 0 }
}
// ── Low-level cursor primitives ──────────────────────────────────────────
/// Peek at the next character without consuming it.
#[inline]
fn peek(&self) -> Option<char> {
self.src[self.pos..].chars().next()
}
/// Consume and return the next character.
/// Panics if called at end-of-input (always guard with `peek` first).
#[inline]
fn advance(&mut self) -> char {
let c = self.src[self.pos..]
.chars()
.next()
.expect("advance called at end of input");
self.pos += c.len_utf8();
c
}
/// Advance while `pred` holds.
#[inline]
fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) {
while self.peek().is_some_and(|c| pred(c)) {
self.advance();
}
}
/// Check whether the two bytes at the current position equal `[a, b]`.
/// Safe even when the source contains multi-byte chars because we compare
/// raw bytes and `a`/`b` are always ASCII.
#[inline]
fn at_ascii2(&self, a: u8, b: u8) -> bool {
let bytes = self.src.as_bytes();
self.pos + 1 < bytes.len() && bytes[self.pos] == a && bytes[self.pos + 1] == b
}
/// Build a token from `[start, self.pos)`.
#[inline]
fn make(&self, kind: TokenKind, start: usize) -> Token<'src> {
Token::new(
kind,
Span::new(start as u32, self.pos as u32),
&self.src[start..self.pos],
)
}
// ── Trivia skipping ──────────────────────────────────────────────────────
/// Skip all whitespace and comments (`//…` and `/*…*/`).
fn skip_trivia(&mut self) {
loop {
// Whitespace
self.skip_while(|c| c.is_ascii_whitespace());
if self.at_ascii2(b'/', b'/') {
// Line comment — skip everything up to (but not including) '\n'
self.pos += 2;
self.skip_while(|c| c != '\n');
} else if self.at_ascii2(b'/', b'*') {
// Block comment — skip until matching `*/`
self.pos += 2;
loop {
if self.at_ascii2(b'*', b'/') {
self.pos += 2;
break;
}
if self.peek().is_none() {
break; // unterminated block comment — stop at EOF
}
self.advance();
}
} else {
break;
}
}
}
// ── Literal scanners ─────────────────────────────────────────────────────
/// Scan the body and closing `"` of a string literal.
/// The opening `"` has already been consumed.
fn scan_string(&mut self) {
loop {
match self.peek() {
None => break, // unterminated — stop at EOF
Some('"') => {
self.advance();
break;
}
Some('\\') => {
self.advance();
self.scan_escape();
}
_ => {
self.advance();
}
}
}
}
/// Scan the body and closing `'` of a char literal.
/// The opening `'` has already been consumed.
fn scan_char(&mut self) {
match self.peek() {
None => return,
Some('\\') => {
self.advance();
self.scan_escape();
}
_ => {
self.advance();
}
}
if self.peek() == Some('\'') {
self.advance();
}
}
/// Scan the tail of an escape sequence (the leading `\` is already consumed).
fn scan_escape(&mut self) {
match self.peek() {
Some('u') => {
self.advance();
if self.peek() == Some('{') {
self.advance();
self.skip_while(|c| c.is_ascii_hexdigit());
if self.peek() == Some('}') {
self.advance();
}
}
}
Some(_) => {
self.advance();
} // n, t, r, \, ", ', 0, …
None => {} // EOF inside escape — stop
}
}
/// Scan a numeric literal. The first character `first` has already been
/// consumed; `start` is its byte offset.
fn scan_number(&mut self, first: char) -> TokenKind {
// Prefix detection for non-decimal bases (only after a leading `0`)
if first == '0' {
match self.peek() {
Some('x') | Some('X') => {
self.advance();
self.skip_while(|c| c.is_ascii_hexdigit() || c == '_');
return TokenKind::IntLit;
}
Some('o') | Some('O') => {
self.advance();
self.skip_while(|c| matches!(c, '0'..='7') || c == '_');
return TokenKind::IntLit;
}
Some('b') | Some('B') => {
self.advance();
self.skip_while(|c| matches!(c, '0' | '1') || c == '_');
return TokenKind::IntLit;
}
_ => {}
}
}
// Remaining decimal digits (with optional `_` separators)
self.skip_while(|c| c.is_ascii_digit() || c == '_');
// Fractional part: `.` followed by at least one digit.
// We peek at the *byte* after `.` to avoid claiming the `.` in
// member-access expressions like `42.to_string()`.
let mut is_float = false;
let bytes = self.src.as_bytes();
if bytes.get(self.pos) == Some(&b'.') {
if bytes.get(self.pos + 1).is_some_and(|b| b.is_ascii_digit()) {
self.advance(); // consume '.'
self.skip_while(|c| c.is_ascii_digit() || c == '_');
is_float = true;
}
}
// Optional exponent: `e` or `E`, optional sign, digits
if matches!(self.peek(), Some('e') | Some('E')) {
self.advance();
if matches!(self.peek(), Some('+') | Some('-')) {
self.advance();
}
self.skip_while(|c| c.is_ascii_digit() || c == '_');
is_float = true;
}
if is_float {
TokenKind::FloatLit
} else {
TokenKind::IntLit
}
}
/// Scan an identifier and map it to the correct keyword token (if any).
/// The first character has already been consumed; `start` is its byte offset.
fn scan_ident_or_kw(&mut self, start: usize) -> TokenKind {
self.skip_while(|c| UnicodeXID::is_xid_continue(c));
match &self.src[start..self.pos] {
// Control flow
"if" => TokenKind::If,
"else" => TokenKind::Else,
"while" => TokenKind::While,
"loop" => TokenKind::Loop,
"break" => TokenKind::Break,
"continue" => TokenKind::Continue,
"return" => TokenKind::Return,
// Declarations
"fn" => TokenKind::Fn,
"struct" => TokenKind::Struct,
"let" => TokenKind::Let,
"mut" => TokenKind::Mut,
// Operator keywords
"and" => TokenKind::And,
"or" => TokenKind::Or,
// Boolean literals
"true" => TokenKind::True,
"false" => TokenKind::False,
// Primitive types
"u8" => TokenKind::U8,
"u16" => TokenKind::U16,
"u32" => TokenKind::U32,
"u64" => TokenKind::U64,
"i8" => TokenKind::I8,
"i16" => TokenKind::I16,
"i32" => TokenKind::I32,
"i64" => TokenKind::I64,
"f32" => TokenKind::F32,
"f64" => TokenKind::F64,
"bool" => TokenKind::Bool,
"char" => TokenKind::Char,
// Pointer keyword
"opaque" => TokenKind::Opaque,
_ => TokenKind::Ident,
}
}
// ── Public API ───────────────────────────────────────────────────────────
/// Lex and return the next meaningful token.
/// All leading whitespace and comments are silently skipped.
/// Once input is exhausted, every subsequent call returns `Eof`.
pub fn next_token(&mut self) -> Token<'src> {
self.skip_trivia();
let start = self.pos;
let Some(c) = self.peek() else {
return self.make(TokenKind::Eof, start);
};
self.advance();
let kind = match c {
// ── Unambiguous single-character tokens ──────────────────────────
'+' => TokenKind::Plus,
'*' => TokenKind::Star,
'/' => TokenKind::Slash,
'%' => TokenKind::Percent,
'&' => TokenKind::Amp,
'|' => TokenKind::Pipe,
'^' => TokenKind::Caret,
'~' => TokenKind::Tilde,
'.' => TokenKind::Dot,
'(' => TokenKind::LParen,
')' => TokenKind::RParen,
'[' => TokenKind::LBracket,
']' => TokenKind::RBracket,
'{' => TokenKind::LCurly,
'}' => TokenKind::RCurly,
',' => TokenKind::Comma,
';' => TokenKind::Semicolon,
':' => TokenKind::Colon,
// ── Tokens that may be the prefix of a longer token ──────────────
'-' => {
if self.peek() == Some('>') {
self.advance();
TokenKind::Arrow
} else {
TokenKind::Minus
}
}
'!' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::BangEq
} else {
TokenKind::Bang
}
}
'=' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::EqEq
} else {
TokenKind::Eq
}
}
'<' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::LtEq
} else {
TokenKind::Lt
}
}
'>' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::GtEq
} else {
TokenKind::Gt
}
}
// ── Literals ─────────────────────────────────────────────────────
'"' => {
self.scan_string();
TokenKind::StringLit
}
'\'' => {
self.scan_char();
TokenKind::CharLit
}
'0'..='9' => self.scan_number(c),
// ── Identifiers and keywords ─────────────────────────────────────
// `_` is XID_Continue but not XID_Start; Flux allows it as a
// leading character (e.g. `_bar`, `__builtin`).
c if c == '_' || UnicodeXID::is_xid_start(c) => self.scan_ident_or_kw(start),
// ── Anything unrecognised ────────────────────────────────────────
_ => TokenKind::Unknown,
};
self.make(kind, start)
}
/// Collect every token (including the trailing `Eof`) into a `Vec`.
pub fn tokenize(mut self) -> Vec<Token<'src>> {
let mut tokens = Vec::new();
loop {
let tok = self.next_token();
let done = tok.is(TokenKind::Eof);
tokens.push(tok);
if done {
break;
}
}
tokens
}
}
/// `Lexer` can be consumed as an `Iterator`, yielding every token *except*
/// the trailing `Eof` — convenient for `for` loops and iterator adaptors.
impl<'src> Iterator for Lexer<'src> {
    type Item = Token<'src>;

    fn next(&mut self) -> Option<Token<'src>> {
        // `next_token` returns `Eof` forever once input is exhausted, so
        // mapping `Eof` to `None` also makes repeated calls after exhaustion
        // keep returning `None`.
        let tok = self.next_token();
        (!tok.is(TokenKind::Eof)).then_some(tok)
    }
}
// ── Tests ────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    use TokenKind::*;

    /// Lex `src` and collect every token's kind (always ends with `Eof`).
    fn kinds(src: &str) -> Vec<TokenKind> {
        Lexer::new(src)
            .tokenize()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    /// Lex `src` and collect every token's source-text slice.
    fn texts(src: &str) -> Vec<&str> {
        Lexer::new(src)
            .tokenize()
            .into_iter()
            .map(|t| t.text)
            .collect()
    }

    #[test]
    fn empty_input() {
        assert_eq!(kinds(""), vec![Eof]);
    }

    #[test]
    fn whitespace_only() {
        assert_eq!(kinds(" \t\n "), vec![Eof]);
    }

    #[test]
    fn line_comment_skipped() {
        assert_eq!(kinds("// this is a comment\n42"), vec![IntLit, Eof]);
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(kinds("/* hello */ 1 /* world */"), vec![IntLit, Eof]);
    }

    #[test]
    fn block_comment_multiline() {
        assert_eq!(kinds("/*\n ignored\n*/\ntrue"), vec![True, Eof]);
    }

    #[test]
    fn keywords() {
        let src =
            "fn struct let mut return if else while loop break continue and or true false opaque";
        assert_eq!(
            kinds(src),
            vec![
                Fn, Struct, Let, Mut, Return, If, Else, While, Loop, Break, Continue, And, Or,
                True, False, Opaque, Eof
            ]
        );
    }

    #[test]
    fn type_keywords() {
        let src = "u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 bool char";
        assert_eq!(
            kinds(src),
            vec![
                U8, U16, U32, U64, I8, I16, I32, I64, F32, F64, Bool, Char, Eof
            ]
        );
    }

    #[test]
    fn identifier() {
        // `_bar` exercises the leading-underscore special case in next_token.
        let toks = Lexer::new("foo _bar baz42").tokenize();
        assert_eq!(toks[0].kind, Ident);
        assert_eq!(toks[0].text, "foo");
        assert_eq!(toks[1].kind, Ident);
        assert_eq!(toks[1].text, "_bar");
        assert_eq!(toks[2].kind, Ident);
        assert_eq!(toks[2].text, "baz42");
        assert_eq!(toks[3].kind, Eof);
    }

    #[test]
    fn integer_literals() {
        assert_eq!(
            kinds("42 0xFF 0o77 0b1010 1_000_000"),
            vec![IntLit, IntLit, IntLit, IntLit, IntLit, Eof]
        );
        // The trailing "" is the zero-width `Eof` token's text.
        let ts = texts("42 0xFF 0o77 0b1010 1_000_000");
        assert_eq!(ts, vec!["42", "0xFF", "0o77", "0b1010", "1_000_000", ""]);
    }

    #[test]
    fn float_literals() {
        assert_eq!(
            kinds("3.14 1.0e-9 2e4 0.5"),
            vec![FloatLit, FloatLit, FloatLit, FloatLit, Eof]
        );
    }

    #[test]
    fn dot_not_stolen_from_integer() {
        // `0.bar` should lex as IntLit Dot Ident, not FloatLit Ident
        let ts = Lexer::new("0.bar").tokenize();
        assert_eq!(ts[0].kind, IntLit);
        assert_eq!(ts[1].kind, Dot);
        assert_eq!(ts[2].kind, Ident);
    }

    #[test]
    fn string_literal() {
        // Escape sequences stay verbatim in the token text — the lexer does
        // not unescape; that is left to a later stage.
        let toks = Lexer::new(r#""hello\nworld""#).tokenize();
        assert_eq!(toks[0].kind, StringLit);
        assert_eq!(toks[0].text, "\"hello\\nworld\"");
    }

    #[test]
    fn char_literal() {
        let toks = Lexer::new(r"'\u{1F600}'").tokenize();
        assert_eq!(toks[0].kind, CharLit);
    }

    #[test]
    fn operators() {
        let src = "-> == != <= >= < > = + - * / % & | ^ ~ !";
        assert_eq!(
            kinds(src),
            vec![
                Arrow, EqEq, BangEq, LtEq, GtEq, Lt, Gt, Eq, Plus, Minus, Star, Slash, Percent,
                Amp, Pipe, Caret, Tilde, Bang, Eof
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            kinds("( ) [ ] { } , ; : ."),
            vec![
                LParen, RParen, LBracket, RBracket, LCurly, RCurly, Comma, Semicolon, Colon, Dot,
                Eof
            ]
        );
    }

    #[test]
    fn spans_are_correct() {
        // Spans are half-open byte ranges into the original source.
        let toks = Lexer::new("fn foo").tokenize();
        assert_eq!((toks[0].span.start, toks[0].span.end), (0, 2)); // "fn"
        assert_eq!((toks[1].span.start, toks[1].span.end), (3, 6)); // "foo"
    }

    #[test]
    fn small_function() {
        // End-to-end smoke test over a realistic snippet.
        let src = "fn add(a: i32, b: i32) -> i32 { return a + b; }";
        let toks = Lexer::new(src).tokenize();
        let ks: Vec<_> = toks.iter().map(|t| t.kind).collect();
        assert_eq!(
            ks,
            vec![
                Fn, Ident, LParen, Ident, Colon, I32, Comma, Ident, Colon, I32, RParen, Arrow, I32,
                LCurly, Return, Ident, Plus, Ident, Semicolon, RCurly, Eof
            ]
        );
    }
}