diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..970ad3a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/fluxc/target
\ No newline at end of file
diff --git a/examples/fibonacci.flx b/examples/fibonacci.flx
index 308a75b..4e7aa31 100644
--- a/examples/fibonacci.flx
+++ b/examples/fibonacci.flx
@@ -1,7 +1,23 @@
-fn fibonacci(n: u8) -> u64 {
+fn fibonacci_rec(n: u8) -> u64 {
     if n < 2 {
         return n;
     }
-    return fibonacci(n - 1) + fibonacci(n - 2);
+    return fibonacci_rec(n - 1) + fibonacci_rec(n - 2);
+}
+
+fn fibonacci_iter(n: u8) -> u64 {
+    let mut counter = 0;
+    let mut a = 0;
+    let mut b = 1;
+
+    while counter < n {
+        let temp = a + b;
+        a = b;
+        b = temp;
+
+        counter = counter + 1;
+    }
+
+    return a;
 }
\ No newline at end of file
diff --git a/fluxc/Cargo.lock b/fluxc/Cargo.lock
new file mode 100644
index 0000000..d867fb8
--- /dev/null
+++ b/fluxc/Cargo.lock
@@ -0,0 +1,16 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "fluxc"
+version = "0.1.0"
+dependencies = [
+ "unicode-xid",
+]
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
diff --git a/fluxc/Cargo.toml b/fluxc/Cargo.toml
new file mode 100644
index 0000000..a00b335
--- /dev/null
+++ b/fluxc/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "fluxc"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+unicode-xid = "0.2"
diff --git a/fluxc/src/lexer.rs b/fluxc/src/lexer.rs
new file mode 100644
index 0000000..1aae1f9
--- /dev/null
+++ b/fluxc/src/lexer.rs
@@ -0,0 +1,552 @@
+use crate::token::{Span, Token, TokenKind};
+use unicode_xid::UnicodeXID;
+
+pub struct Lexer<'src> {
+    src: &'src str,
+    /// Current byte offset into `src`. Always kept on a UTF-8 char boundary.
+    pos: usize,
+}
+
+impl<'src> Lexer<'src> {
+    pub fn new(src: &'src str) -> Self {
+        Self { src, pos: 0 }
+    }
+
+    // ── Low-level cursor primitives ──────────────────────────────────────────
+
+    /// Peek at the next character without consuming it.
+    #[inline]
+    fn peek(&self) -> Option<char> {
+        self.src[self.pos..].chars().next()
+    }
+
+    /// Consume and return the next character.
+    /// Panics if called at end-of-input (always guard with `peek` first).
+    #[inline]
+    fn advance(&mut self) -> char {
+        let c = self.src[self.pos..]
+            .chars()
+            .next()
+            .expect("advance called at end of input");
+        self.pos += c.len_utf8();
+        c
+    }
+
+    /// Advance while `pred` holds.
+    #[inline]
+    fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) {
+        while self.peek().is_some_and(|c| pred(c)) {
+            self.advance();
+        }
+    }
+
+    /// Check whether the two bytes at the current position equal `[a, b]`.
+    /// Safe even when the source contains multi-byte chars because we compare
+    /// raw bytes and `a`/`b` are always ASCII.
+    #[inline]
+    fn at_ascii2(&self, a: u8, b: u8) -> bool {
+        let bytes = self.src.as_bytes();
+        self.pos + 1 < bytes.len() && bytes[self.pos] == a && bytes[self.pos + 1] == b
+    }
+
+    /// Build a token from `[start, self.pos)`.
+    #[inline]
+    fn make(&self, kind: TokenKind, start: usize) -> Token<'src> {
+        Token::new(
+            kind,
+            Span::new(start as u32, self.pos as u32),
+            &self.src[start..self.pos],
+        )
+    }
+
+    // ── Trivia skipping ──────────────────────────────────────────────────────
+
+    /// Skip all whitespace and comments (`//…` and `/*…*/`).
+    fn skip_trivia(&mut self) {
+        loop {
+            // Whitespace
+            self.skip_while(|c| c.is_ascii_whitespace());
+
+            if self.at_ascii2(b'/', b'/') {
+                // Line comment — skip everything up to (but not including) '\n'
+                self.pos += 2;
+                self.skip_while(|c| c != '\n');
+            } else if self.at_ascii2(b'/', b'*') {
+                // Block comment — skip until matching `*/`
+                // (no nesting: the first `*/` always closes the comment)
+                self.pos += 2;
+                loop {
+                    if self.at_ascii2(b'*', b'/') {
+                        self.pos += 2;
+                        break;
+                    }
+                    if self.peek().is_none() {
+                        break; // unterminated block comment — stop at EOF
+                    }
+                    self.advance();
+                }
+            } else {
+                break;
+            }
+        }
+    }
+
+    // ── Literal scanners ─────────────────────────────────────────────────────
+
+    /// Scan the body and closing `"` of a string literal.
+    /// The opening `"` has already been consumed.
+    fn scan_string(&mut self) {
+        loop {
+            match self.peek() {
+                None => break, // unterminated — stop at EOF
+                Some('"') => {
+                    self.advance();
+                    break;
+                }
+                Some('\\') => {
+                    self.advance();
+                    self.scan_escape();
+                }
+                _ => {
+                    self.advance();
+                }
+            }
+        }
+    }
+
+    /// Scan the body and closing `'` of a char literal.
+    /// The opening `'` has already been consumed.
+    fn scan_char(&mut self) {
+        match self.peek() {
+            None => return,
+            Some('\\') => {
+                self.advance();
+                self.scan_escape();
+            }
+            _ => {
+                self.advance();
+            }
+        }
+        if self.peek() == Some('\'') {
+            self.advance();
+        }
+    }
+
+    /// Scan the tail of an escape sequence (the leading `\` is already consumed).
+    fn scan_escape(&mut self) {
+        match self.peek() {
+            Some('u') => {
+                self.advance();
+                if self.peek() == Some('{') {
+                    self.advance();
+                    self.skip_while(|c| c.is_ascii_hexdigit());
+                    if self.peek() == Some('}') {
+                        self.advance();
+                    }
+                }
+            }
+            Some(_) => {
+                self.advance();
+            } // n, t, r, \, ", ', 0, …
+            None => {} // EOF inside escape — stop
+        }
+    }
+
+    /// Scan a numeric literal. The first character `first` (a decimal digit)
+    /// has already been consumed.
+    fn scan_number(&mut self, first: char) -> TokenKind {
+        // Prefix detection for non-decimal bases (only after a leading `0`)
+        if first == '0' {
+            match self.peek() {
+                Some('x') | Some('X') => {
+                    self.advance();
+                    self.skip_while(|c| c.is_ascii_hexdigit() || c == '_');
+                    return TokenKind::IntLit;
+                }
+                Some('o') | Some('O') => {
+                    self.advance();
+                    self.skip_while(|c| matches!(c, '0'..='7') || c == '_');
+                    return TokenKind::IntLit;
+                }
+                Some('b') | Some('B') => {
+                    self.advance();
+                    self.skip_while(|c| matches!(c, '0' | '1') || c == '_');
+                    return TokenKind::IntLit;
+                }
+                _ => {}
+            }
+        }
+
+        // Remaining decimal digits (with optional `_` separators)
+        self.skip_while(|c| c.is_ascii_digit() || c == '_');
+
+        // Fractional part: `.` followed by at least one digit.
+        // We peek at the *byte* after `.` to avoid claiming the `.` in
+        // member-access expressions like `42.to_string()`.
+        let mut is_float = false;
+        let bytes = self.src.as_bytes();
+        if bytes.get(self.pos) == Some(&b'.') {
+            if bytes.get(self.pos + 1).is_some_and(|b| b.is_ascii_digit()) {
+                self.advance(); // consume '.'
+                self.skip_while(|c| c.is_ascii_digit() || c == '_');
+                is_float = true;
+            }
+        }
+
+        // Optional exponent: `e`/`E`, optional sign, then at least one digit.
+        // Look ahead over raw bytes *before* consuming anything, so inputs
+        // like `1e` or `1e+x` lex as an integer followed by separate tokens
+        // instead of a malformed float that swallows the `e`.
+        if matches!(bytes.get(self.pos), Some(b'e') | Some(b'E')) {
+            let mut digits_at = self.pos + 1;
+            if matches!(bytes.get(digits_at), Some(b'+') | Some(b'-')) {
+                digits_at += 1;
+            }
+            if bytes.get(digits_at).is_some_and(|b| b.is_ascii_digit()) {
+                self.advance(); // consume 'e' / 'E'
+                if matches!(self.peek(), Some('+') | Some('-')) {
+                    self.advance();
+                }
+                self.skip_while(|c| c.is_ascii_digit() || c == '_');
+                is_float = true;
+            }
+        }
+
+        if is_float {
+            TokenKind::FloatLit
+        } else {
+            TokenKind::IntLit
+        }
+    }
+
+    /// Scan an identifier and map it to the correct keyword token (if any).
+    /// The first character has already been consumed; `start` is its byte offset.
+    fn scan_ident_or_kw(&mut self, start: usize) -> TokenKind {
+        self.skip_while(|c| UnicodeXID::is_xid_continue(c));
+        match &self.src[start..self.pos] {
+            // Control flow
+            "if" => TokenKind::If,
+            "else" => TokenKind::Else,
+            "while" => TokenKind::While,
+            "loop" => TokenKind::Loop,
+            "break" => TokenKind::Break,
+            "continue" => TokenKind::Continue,
+            "return" => TokenKind::Return,
+            // Declarations
+            "fn" => TokenKind::Fn,
+            "struct" => TokenKind::Struct,
+            "let" => TokenKind::Let,
+            "mut" => TokenKind::Mut,
+            // Operator keywords
+            "and" => TokenKind::And,
+            "or" => TokenKind::Or,
+            // Boolean literals
+            "true" => TokenKind::True,
+            "false" => TokenKind::False,
+            // Primitive types
+            "u8" => TokenKind::U8,
+            "u16" => TokenKind::U16,
+            "u32" => TokenKind::U32,
+            "u64" => TokenKind::U64,
+            "i8" => TokenKind::I8,
+            "i16" => TokenKind::I16,
+            "i32" => TokenKind::I32,
+            "i64" => TokenKind::I64,
+            "f32" => TokenKind::F32,
+            "f64" => TokenKind::F64,
+            "bool" => TokenKind::Bool,
+            "char" => TokenKind::Char,
+            // Pointer keyword
+            "opaque" => TokenKind::Opaque,
+            _ => TokenKind::Ident,
+        }
+    }
+
+    // ── Public API ───────────────────────────────────────────────────────────
+
+    /// Lex and return the next meaningful token.
+    /// All leading whitespace and comments are silently skipped.
+    /// Once input is exhausted, every subsequent call returns `Eof`.
+    pub fn next_token(&mut self) -> Token<'src> {
+        self.skip_trivia();
+        let start = self.pos;
+
+        let Some(c) = self.peek() else {
+            return self.make(TokenKind::Eof, start);
+        };
+        self.advance();
+
+        let kind = match c {
+            // ── Unambiguous single-character tokens ──────────────────────────
+            '+' => TokenKind::Plus,
+            '*' => TokenKind::Star,
+            '/' => TokenKind::Slash,
+            '%' => TokenKind::Percent,
+            '&' => TokenKind::Amp,
+            '|' => TokenKind::Pipe,
+            '^' => TokenKind::Caret,
+            '~' => TokenKind::Tilde,
+            '.' => TokenKind::Dot,
+            '(' => TokenKind::LParen,
+            ')' => TokenKind::RParen,
+            '[' => TokenKind::LBracket,
+            ']' => TokenKind::RBracket,
+            '{' => TokenKind::LCurly,
+            '}' => TokenKind::RCurly,
+            ',' => TokenKind::Comma,
+            ';' => TokenKind::Semicolon,
+            ':' => TokenKind::Colon,
+
+            // ── Tokens that may be the prefix of a longer token ──────────────
+            '-' => {
+                if self.peek() == Some('>') {
+                    self.advance();
+                    TokenKind::Arrow
+                } else {
+                    TokenKind::Minus
+                }
+            }
+            '!' => {
+                if self.peek() == Some('=') {
+                    self.advance();
+                    TokenKind::BangEq
+                } else {
+                    TokenKind::Bang
+                }
+            }
+            '=' => {
+                if self.peek() == Some('=') {
+                    self.advance();
+                    TokenKind::EqEq
+                } else {
+                    TokenKind::Eq
+                }
+            }
+            '<' => {
+                if self.peek() == Some('=') {
+                    self.advance();
+                    TokenKind::LtEq
+                } else {
+                    TokenKind::Lt
+                }
+            }
+            '>' => {
+                if self.peek() == Some('=') {
+                    self.advance();
+                    TokenKind::GtEq
+                } else {
+                    TokenKind::Gt
+                }
+            }
+
+            // ── Literals ─────────────────────────────────────────────────────
+            '"' => {
+                self.scan_string();
+                TokenKind::StringLit
+            }
+            '\'' => {
+                self.scan_char();
+                TokenKind::CharLit
+            }
+            '0'..='9' => self.scan_number(c),
+
+            // ── Identifiers and keywords ─────────────────────────────────────
+            // `_` is XID_Continue but not XID_Start; Flux allows it as a
+            // leading character (e.g. `_bar`, `__builtin`).
+            c if c == '_' || UnicodeXID::is_xid_start(c) => self.scan_ident_or_kw(start),
+
+            // ── Anything unrecognised ────────────────────────────────────────
+            _ => TokenKind::Unknown,
+        };
+
+        self.make(kind, start)
+    }
+
+    /// Collect every token (including the trailing `Eof`) into a `Vec`.
+    pub fn tokenize(mut self) -> Vec<Token<'src>> {
+        let mut tokens = Vec::new();
+        loop {
+            let tok = self.next_token();
+            let done = tok.is(TokenKind::Eof);
+            tokens.push(tok);
+            if done {
+                break;
+            }
+        }
+        tokens
+    }
+}
+
+/// `Lexer` implements `Iterator` over non-`Eof` tokens, making it easy to use
+/// in `for` loops or with iterator adaptors.
+impl<'src> Iterator for Lexer<'src> {
+    type Item = Token<'src>;
+
+    fn next(&mut self) -> Option<Token<'src>> {
+        let tok = self.next_token();
+        if tok.is(TokenKind::Eof) {
+            None
+        } else {
+            Some(tok)
+        }
+    }
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use TokenKind::*;
+
+    fn kinds(src: &str) -> Vec<TokenKind> {
+        Lexer::new(src)
+            .tokenize()
+            .into_iter()
+            .map(|t| t.kind)
+            .collect()
+    }
+
+    fn texts(src: &str) -> Vec<&str> {
+        Lexer::new(src)
+            .tokenize()
+            .into_iter()
+            .map(|t| t.text)
+            .collect()
+    }
+
+    #[test]
+    fn empty_input() {
+        assert_eq!(kinds(""), vec![Eof]);
+    }
+
+    #[test]
+    fn whitespace_only() {
+        assert_eq!(kinds(" \t\n "), vec![Eof]);
+    }
+
+    #[test]
+    fn line_comment_skipped() {
+        assert_eq!(kinds("// this is a comment\n42"), vec![IntLit, Eof]);
+    }
+
+    #[test]
+    fn block_comment_skipped() {
+        assert_eq!(kinds("/* hello */ 1 /* world */"), vec![IntLit, Eof]);
+    }
+
+    #[test]
+    fn block_comment_multiline() {
+        assert_eq!(kinds("/*\n ignored\n*/\ntrue"), vec![True, Eof]);
+    }
+
+    #[test]
+    fn keywords() {
+        let src =
+            "fn struct let mut return if else while loop break continue and or true false opaque";
+        assert_eq!(
+            kinds(src),
+            vec![
+                Fn, Struct, Let, Mut, Return, If, Else, While, Loop, Break, Continue, And, Or,
+                True, False, Opaque, Eof
+            ]
+        );
+    }
+
+    #[test]
+    fn type_keywords() {
+        let src = "u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 bool char";
+        assert_eq!(
+            kinds(src),
+            vec![
+                U8, U16, U32, U64, I8, I16, I32, I64, F32, F64, Bool, Char, Eof
+            ]
+        );
+    }
+
+    #[test]
+    fn identifier() {
+        let toks = Lexer::new("foo _bar baz42").tokenize();
+        assert_eq!(toks[0].kind, Ident);
+        assert_eq!(toks[0].text, "foo");
+        assert_eq!(toks[1].kind, Ident);
+        assert_eq!(toks[1].text, "_bar");
+        assert_eq!(toks[2].kind, Ident);
+        assert_eq!(toks[2].text, "baz42");
+        assert_eq!(toks[3].kind, Eof);
+    }
+
+    #[test]
+    fn integer_literals() {
+        assert_eq!(
+            kinds("42 0xFF 0o77 0b1010 1_000_000"),
+            vec![IntLit, IntLit, IntLit, IntLit, IntLit, Eof]
+        );
+        let ts = texts("42 0xFF 0o77 0b1010 1_000_000");
+        assert_eq!(ts, vec!["42", "0xFF", "0o77", "0b1010", "1_000_000", ""]);
+    }
+
+    #[test]
+    fn float_literals() {
+        assert_eq!(
+            kinds("3.14 1.0e-9 2e4 0.5"),
+            vec![FloatLit, FloatLit, FloatLit, FloatLit, Eof]
+        );
+    }
+
+    #[test]
+    fn dot_not_stolen_from_integer() {
+        // `0.bar` should lex as IntLit Dot Ident, not FloatLit Ident
+        let ts = Lexer::new("0.bar").tokenize();
+        assert_eq!(ts[0].kind, IntLit);
+        assert_eq!(ts[1].kind, Dot);
+        assert_eq!(ts[2].kind, Ident);
+    }
+
+    #[test]
+    fn string_literal() {
+        let toks = Lexer::new(r#""hello\nworld""#).tokenize();
+        assert_eq!(toks[0].kind, StringLit);
+        assert_eq!(toks[0].text, "\"hello\\nworld\"");
+    }
+
+    #[test]
+    fn char_literal() {
+        let toks = Lexer::new(r"'\u{1F600}'").tokenize();
+        assert_eq!(toks[0].kind, CharLit);
+    }
+
+    #[test]
+    fn operators() {
+        let src = "-> == != <= >= < > = + - * / % & | ^ ~ !";
+        assert_eq!(
+            kinds(src),
+            vec![
+                Arrow, EqEq, BangEq, LtEq, GtEq, Lt, Gt, Eq, Plus, Minus, Star, Slash, Percent,
+                Amp, Pipe, Caret, Tilde, Bang, Eof
+            ]
+        );
+    }
+
+    #[test]
+    fn punctuation() {
+        assert_eq!(
+            kinds("( ) [ ] { } , ; : ."),
+            vec![
+                LParen, RParen, LBracket, RBracket, LCurly, RCurly, Comma, Semicolon, Colon, Dot,
+                Eof
+            ]
+        );
+    }
+
+    #[test]
+    fn spans_are_correct() {
+        let toks = Lexer::new("fn foo").tokenize();
+        assert_eq!((toks[0].span.start, toks[0].span.end), (0, 2)); // "fn"
+        assert_eq!((toks[1].span.start, toks[1].span.end), (3, 6)); // "foo"
+    }
+
+    #[test]
+    fn small_function() {
+        let src = "fn add(a: i32, b: i32) -> i32 { return a + b; }";
+        let toks = Lexer::new(src).tokenize();
+        let ks: Vec<_> = toks.iter().map(|t| t.kind).collect();
+        assert_eq!(
+            ks,
+            vec![
+                Fn, Ident, LParen, Ident, Colon, I32, Comma, Ident, Colon, I32, RParen, Arrow, I32,
+                LCurly, Return, Ident, Plus, Ident, Semicolon, RCurly, Eof
+            ]
+        );
+    }
+}
diff --git
a/fluxc/src/main.rs b/fluxc/src/main.rs
new file mode 100644
index 0000000..76693ab
--- /dev/null
+++ b/fluxc/src/main.rs
@@ -0,0 +1,15 @@
+use std::{env::args, fs};
+
+use crate::lexer::Lexer;
+
+pub mod lexer;
+pub mod token;
+
+fn main() {
+    let path = args().nth(1).expect("usage: fluxc <file>");
+    let content = fs::read_to_string(&path).expect("error: failed to read file");
+
+    for token in Lexer::new(&content) {
+        println!("{token:?}");
+    }
+}
diff --git a/fluxc/src/token.rs b/fluxc/src/token.rs
new file mode 100644
index 0000000..d89b16f
--- /dev/null
+++ b/fluxc/src/token.rs
@@ -0,0 +1,189 @@
+/// Defines all token kinds together with their human-readable display strings.
+///
+/// Usage:
+///   `TokenKind::Fn`       — enum variant
+///   `format!("{}", kind)` — display string, e.g. "`fn`"
+macro_rules! define_tokens {
+    ( $( $variant:ident => $display:literal ),* $(,)? ) => {
+        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+        pub enum TokenKind {
+            $( $variant, )*
+        }
+
+        impl std::fmt::Display for TokenKind {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                let s = match self {
+                    $( TokenKind::$variant => $display, )*
+                };
+                f.write_str(s)
+            }
+        }
+    };
+}
+
+define_tokens! {
+    // ── Literals ─────────────────────────────────────────────────────────────
+    IntLit => "integer literal",
+    FloatLit => "float literal",
+    StringLit => "string literal",
+    CharLit => "char literal",
+
+    // ── Identifier ───────────────────────────────────────────────────────────
+    Ident => "identifier",
+
+    // ── Declaration keywords ─────────────────────────────────────────────────
+    Fn => "`fn`",
+    Struct => "`struct`",
+    Let => "`let`",
+    Mut => "`mut`",
+
+    // ── Control-flow keywords ────────────────────────────────────────────────
+    Return => "`return`",
+    If => "`if`",
+    Else => "`else`",
+    While => "`while`",
+    Loop => "`loop`",
+    Break => "`break`",
+    Continue => "`continue`",
+
+    // ── Operator keywords ────────────────────────────────────────────────────
+    And => "`and`",
+    Or => "`or`",
+
+    // ── Boolean literals ─────────────────────────────────────────────────────
+    True => "`true`",
+    False => "`false`",
+
+    // ── Primitive type keywords ──────────────────────────────────────────────
+    U8 => "`u8`",
+    U16 => "`u16`",
+    U32 => "`u32`",
+    U64 => "`u64`",
+    I8 => "`i8`",
+    I16 => "`i16`",
+    I32 => "`i32`",
+    I64 => "`i64`",
+    F32 => "`f32`",
+    F64 => "`f64`",
+    Bool => "`bool`",
+    Char => "`char`",
+
+    // ── Pointer keyword ──────────────────────────────────────────────────────
+    Opaque => "`opaque`",
+
+    // ── Arithmetic operators ─────────────────────────────────────────────────
+    Plus => "`+`",
+    Minus => "`-`",
+    Star => "`*`",
+    Slash => "`/`",
+    Percent => "`%`",
+
+    // ── Bitwise / address operators ──────────────────────────────────────────
+    Amp => "`&`",
+    Pipe => "`|`",
+    Caret => "`^`",
+    Bang => "`!`",
+    Tilde => "`~`",
+
+    // ── Comparison operators ─────────────────────────────────────────────────
+    EqEq => "`==`",
+    BangEq => "`!=`",
+    Lt => "`<`",
+    Gt => "`>`",
+    LtEq => "`<=`",
+    GtEq => "`>=`",
+
+    // ── Assignment ───────────────────────────────────────────────────────────
+    Eq => "`=`",
+
+    // ── Punctuation ──────────────────────────────────────────────────────────
+    Arrow => "`->`",
+    Dot => "`.`",
+    Colon => "`:`",
+    Semicolon => "`;`",
+    Comma => "`,`",
+
+    // ── Delimiters ───────────────────────────────────────────────────────────
+    LParen => "`(`",
+    RParen => "`)`",
+    LBracket => "`[`",
+    RBracket => "`]`",
+    LCurly => "`{`",
+    RCurly => "`}`",
+
+    // ── Special ──────────────────────────────────────────────────────────────
+    Eof => "end of file",
+    Unknown => "unknown character",
+}
+
+// ── Span ─────────────────────────────────────────────────────────────────────
+
+/// A half-open byte range `[start, end)` into the source string.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Span {
+    pub start: u32,
+    pub end: u32,
+}
+
+impl Span {
+    #[inline]
+    pub fn new(start: u32, end: u32) -> Self {
+        Self { start, end }
+    }
+
+    #[inline]
+    pub fn len(self) -> u32 {
+        self.end - self.start
+    }
+
+    #[inline]
+    pub fn is_empty(self) -> bool {
+        self.start == self.end
+    }
+
+    /// Extend this span to cover `other` as well.
+    #[inline]
+    pub fn cover(self, other: Span) -> Span {
+        Span {
+            start: self.start.min(other.start),
+            end: self.end.max(other.end),
+        }
+    }
+}
+
+impl std::fmt::Display for Span {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}..{}", self.start, self.end)
+    }
+}
+
+// ── Token ────────────────────────────────────────────────────────────────────
+
+/// A single lexed token.
+///
+/// The `text` field is a slice into the original source string — no allocation
+/// is needed. For `Eof` the text is the empty string at the end of input.
+#[derive(Debug, Clone, Copy)]
+pub struct Token<'src> {
+    pub kind: TokenKind,
+    pub span: Span,
+    pub text: &'src str,
+}
+
+impl<'src> Token<'src> {
+    #[inline]
+    pub fn new(kind: TokenKind, span: Span, text: &'src str) -> Self {
+        Self { kind, span, text }
+    }
+
+    /// True when this token's kind equals `kind`.
+    #[inline]
+    pub fn is(&self, kind: TokenKind) -> bool {
+        self.kind == kind
+    }
+}
+
+impl std::fmt::Display for Token<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} `{}` @ {}", self.kind, self.text, self.span)
+    }
+}