Add fluxc compiler skeleton: token definitions and lexer

Introduces the fluxc Rust crate with the first two compiler stages:

- token.rs: define_tokens! macro generates TokenKind enum and its
  Display impl from a single table covering all Flux tokens
  (literals, keywords, operators, punctuation, Eof/Unknown).
  Span (half-open u32 byte range) and Token<'src> (kind + span +
  zero-copy text slice) round out the module.

- lexer.rs: Lexer<'src> produces Token<'src> from a source &str.
  Skips whitespace, // line comments, and /* */ block comments.
  Handles all integer bases (decimal, hex, octal, binary with _
  separators), floats (fractional + exponent), string/char literals
  with escape sequences, and Unicode identifiers via unicode-xid.
  Implements Iterator<Item = Token> and includes 17 unit tests.

Also adds .gitignore (ignores fluxc/target) and expands
examples/fibonacci.flx with an iterative variant.
This commit is contained in:
2026-03-10 17:20:17 +01:00
parent 0e08640f59
commit 4f80de51b2
7 changed files with 798 additions and 2 deletions

552
fluxc/src/lexer.rs Normal file
View File

@@ -0,0 +1,552 @@
use crate::token::{Span, Token, TokenKind};
use unicode_xid::UnicodeXID;
/// Hand-written lexer for Flux source text.
///
/// Construct with [`Lexer::new`], then pull tokens one at a time via
/// `next_token`, all at once via `tokenize`, or lazily through the
/// `Iterator` impl (which stops before the trailing `Eof`).
/// Tokens are zero-copy: their `text` borrows slices of `src`.
pub struct Lexer<'src> {
    /// The full source text being lexed; never mutated.
    src: &'src str,
    /// Current byte offset into `src`. Always kept on a UTF-8 char boundary.
    pos: usize,
}
impl<'src> Lexer<'src> {
pub fn new(src: &'src str) -> Self {
Self { src, pos: 0 }
}
// ── Low-level cursor primitives ──────────────────────────────────────────
/// Peek at the next character without consuming it.
#[inline]
fn peek(&self) -> Option<char> {
self.src[self.pos..].chars().next()
}
/// Consume and return the next character.
/// Panics if called at end-of-input (always guard with `peek` first).
#[inline]
fn advance(&mut self) -> char {
let c = self.src[self.pos..]
.chars()
.next()
.expect("advance called at end of input");
self.pos += c.len_utf8();
c
}
/// Advance while `pred` holds.
#[inline]
fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) {
while self.peek().is_some_and(|c| pred(c)) {
self.advance();
}
}
/// Check whether the two bytes at the current position equal `[a, b]`.
/// Safe even when the source contains multi-byte chars because we compare
/// raw bytes and `a`/`b` are always ASCII.
#[inline]
fn at_ascii2(&self, a: u8, b: u8) -> bool {
let bytes = self.src.as_bytes();
self.pos + 1 < bytes.len() && bytes[self.pos] == a && bytes[self.pos + 1] == b
}
/// Build a token from `[start, self.pos)`.
#[inline]
fn make(&self, kind: TokenKind, start: usize) -> Token<'src> {
Token::new(
kind,
Span::new(start as u32, self.pos as u32),
&self.src[start..self.pos],
)
}
// ── Trivia skipping ──────────────────────────────────────────────────────
/// Skip all whitespace and comments (`//…` and `/*…*/`).
fn skip_trivia(&mut self) {
loop {
// Whitespace
self.skip_while(|c| c.is_ascii_whitespace());
if self.at_ascii2(b'/', b'/') {
// Line comment — skip everything up to (but not including) '\n'
self.pos += 2;
self.skip_while(|c| c != '\n');
} else if self.at_ascii2(b'/', b'*') {
// Block comment — skip until matching `*/`
self.pos += 2;
loop {
if self.at_ascii2(b'*', b'/') {
self.pos += 2;
break;
}
if self.peek().is_none() {
break; // unterminated block comment — stop at EOF
}
self.advance();
}
} else {
break;
}
}
}
// ── Literal scanners ─────────────────────────────────────────────────────
/// Scan the body and closing `"` of a string literal.
/// The opening `"` has already been consumed.
fn scan_string(&mut self) {
loop {
match self.peek() {
None => break, // unterminated — stop at EOF
Some('"') => {
self.advance();
break;
}
Some('\\') => {
self.advance();
self.scan_escape();
}
_ => {
self.advance();
}
}
}
}
/// Scan the body and closing `'` of a char literal.
/// The opening `'` has already been consumed.
fn scan_char(&mut self) {
match self.peek() {
None => return,
Some('\\') => {
self.advance();
self.scan_escape();
}
_ => {
self.advance();
}
}
if self.peek() == Some('\'') {
self.advance();
}
}
/// Scan the tail of an escape sequence (the leading `\` is already consumed).
fn scan_escape(&mut self) {
match self.peek() {
Some('u') => {
self.advance();
if self.peek() == Some('{') {
self.advance();
self.skip_while(|c| c.is_ascii_hexdigit());
if self.peek() == Some('}') {
self.advance();
}
}
}
Some(_) => {
self.advance();
} // n, t, r, \, ", ', 0, …
None => {} // EOF inside escape — stop
}
}
/// Scan a numeric literal. The first character `first` has already been
/// consumed; `start` is its byte offset.
fn scan_number(&mut self, first: char) -> TokenKind {
// Prefix detection for non-decimal bases (only after a leading `0`)
if first == '0' {
match self.peek() {
Some('x') | Some('X') => {
self.advance();
self.skip_while(|c| c.is_ascii_hexdigit() || c == '_');
return TokenKind::IntLit;
}
Some('o') | Some('O') => {
self.advance();
self.skip_while(|c| matches!(c, '0'..='7') || c == '_');
return TokenKind::IntLit;
}
Some('b') | Some('B') => {
self.advance();
self.skip_while(|c| matches!(c, '0' | '1') || c == '_');
return TokenKind::IntLit;
}
_ => {}
}
}
// Remaining decimal digits (with optional `_` separators)
self.skip_while(|c| c.is_ascii_digit() || c == '_');
// Fractional part: `.` followed by at least one digit.
// We peek at the *byte* after `.` to avoid claiming the `.` in
// member-access expressions like `42.to_string()`.
let mut is_float = false;
let bytes = self.src.as_bytes();
if bytes.get(self.pos) == Some(&b'.') {
if bytes.get(self.pos + 1).is_some_and(|b| b.is_ascii_digit()) {
self.advance(); // consume '.'
self.skip_while(|c| c.is_ascii_digit() || c == '_');
is_float = true;
}
}
// Optional exponent: `e` or `E`, optional sign, digits
if matches!(self.peek(), Some('e') | Some('E')) {
self.advance();
if matches!(self.peek(), Some('+') | Some('-')) {
self.advance();
}
self.skip_while(|c| c.is_ascii_digit() || c == '_');
is_float = true;
}
if is_float {
TokenKind::FloatLit
} else {
TokenKind::IntLit
}
}
/// Scan an identifier and map it to the correct keyword token (if any).
/// The first character has already been consumed; `start` is its byte offset.
fn scan_ident_or_kw(&mut self, start: usize) -> TokenKind {
self.skip_while(|c| UnicodeXID::is_xid_continue(c));
match &self.src[start..self.pos] {
// Control flow
"if" => TokenKind::If,
"else" => TokenKind::Else,
"while" => TokenKind::While,
"loop" => TokenKind::Loop,
"break" => TokenKind::Break,
"continue" => TokenKind::Continue,
"return" => TokenKind::Return,
// Declarations
"fn" => TokenKind::Fn,
"struct" => TokenKind::Struct,
"let" => TokenKind::Let,
"mut" => TokenKind::Mut,
// Operator keywords
"and" => TokenKind::And,
"or" => TokenKind::Or,
// Boolean literals
"true" => TokenKind::True,
"false" => TokenKind::False,
// Primitive types
"u8" => TokenKind::U8,
"u16" => TokenKind::U16,
"u32" => TokenKind::U32,
"u64" => TokenKind::U64,
"i8" => TokenKind::I8,
"i16" => TokenKind::I16,
"i32" => TokenKind::I32,
"i64" => TokenKind::I64,
"f32" => TokenKind::F32,
"f64" => TokenKind::F64,
"bool" => TokenKind::Bool,
"char" => TokenKind::Char,
// Pointer keyword
"opaque" => TokenKind::Opaque,
_ => TokenKind::Ident,
}
}
// ── Public API ───────────────────────────────────────────────────────────
/// Lex and return the next meaningful token.
/// All leading whitespace and comments are silently skipped.
/// Once input is exhausted, every subsequent call returns `Eof`.
pub fn next_token(&mut self) -> Token<'src> {
self.skip_trivia();
let start = self.pos;
let Some(c) = self.peek() else {
return self.make(TokenKind::Eof, start);
};
self.advance();
let kind = match c {
// ── Unambiguous single-character tokens ──────────────────────────
'+' => TokenKind::Plus,
'*' => TokenKind::Star,
'/' => TokenKind::Slash,
'%' => TokenKind::Percent,
'&' => TokenKind::Amp,
'|' => TokenKind::Pipe,
'^' => TokenKind::Caret,
'~' => TokenKind::Tilde,
'.' => TokenKind::Dot,
'(' => TokenKind::LParen,
')' => TokenKind::RParen,
'[' => TokenKind::LBracket,
']' => TokenKind::RBracket,
'{' => TokenKind::LCurly,
'}' => TokenKind::RCurly,
',' => TokenKind::Comma,
';' => TokenKind::Semicolon,
':' => TokenKind::Colon,
// ── Tokens that may be the prefix of a longer token ──────────────
'-' => {
if self.peek() == Some('>') {
self.advance();
TokenKind::Arrow
} else {
TokenKind::Minus
}
}
'!' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::BangEq
} else {
TokenKind::Bang
}
}
'=' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::EqEq
} else {
TokenKind::Eq
}
}
'<' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::LtEq
} else {
TokenKind::Lt
}
}
'>' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::GtEq
} else {
TokenKind::Gt
}
}
// ── Literals ─────────────────────────────────────────────────────
'"' => {
self.scan_string();
TokenKind::StringLit
}
'\'' => {
self.scan_char();
TokenKind::CharLit
}
'0'..='9' => self.scan_number(c),
// ── Identifiers and keywords ─────────────────────────────────────
// `_` is XID_Continue but not XID_Start; Flux allows it as a
// leading character (e.g. `_bar`, `__builtin`).
c if c == '_' || UnicodeXID::is_xid_start(c) => self.scan_ident_or_kw(start),
// ── Anything unrecognised ────────────────────────────────────────
_ => TokenKind::Unknown,
};
self.make(kind, start)
}
/// Collect every token (including the trailing `Eof`) into a `Vec`.
pub fn tokenize(mut self) -> Vec<Token<'src>> {
let mut tokens = Vec::new();
loop {
let tok = self.next_token();
let done = tok.is(TokenKind::Eof);
tokens.push(tok);
if done {
break;
}
}
tokens
}
}
/// `Lexer` can be consumed as an `Iterator`, yielding every token *except*
/// the trailing `Eof` — convenient for `for` loops and iterator adaptors.
impl<'src> Iterator for Lexer<'src> {
    type Item = Token<'src>;

    fn next(&mut self) -> Option<Token<'src>> {
        // `next_token` returns `Eof` forever once input is exhausted, so
        // mapping `Eof` to `None` also makes repeated calls after exhaustion
        // keep returning `None`.
        let tok = self.next_token();
        (!tok.is(TokenKind::Eof)).then_some(tok)
    }
}
// ── Tests ────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    use TokenKind::*;

    /// Lex `src` and collect every token's kind (always ends with `Eof`).
    fn kinds(src: &str) -> Vec<TokenKind> {
        Lexer::new(src)
            .tokenize()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    /// Lex `src` and collect every token's source-text slice.
    fn texts(src: &str) -> Vec<&str> {
        Lexer::new(src)
            .tokenize()
            .into_iter()
            .map(|t| t.text)
            .collect()
    }

    #[test]
    fn empty_input() {
        assert_eq!(kinds(""), vec![Eof]);
    }

    #[test]
    fn whitespace_only() {
        assert_eq!(kinds(" \t\n "), vec![Eof]);
    }

    #[test]
    fn line_comment_skipped() {
        assert_eq!(kinds("// this is a comment\n42"), vec![IntLit, Eof]);
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(kinds("/* hello */ 1 /* world */"), vec![IntLit, Eof]);
    }

    #[test]
    fn block_comment_multiline() {
        assert_eq!(kinds("/*\n ignored\n*/\ntrue"), vec![True, Eof]);
    }

    #[test]
    fn keywords() {
        let src =
            "fn struct let mut return if else while loop break continue and or true false opaque";
        assert_eq!(
            kinds(src),
            vec![
                Fn, Struct, Let, Mut, Return, If, Else, While, Loop, Break, Continue, And, Or,
                True, False, Opaque, Eof
            ]
        );
    }

    #[test]
    fn type_keywords() {
        let src = "u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 bool char";
        assert_eq!(
            kinds(src),
            vec![
                U8, U16, U32, U64, I8, I16, I32, I64, F32, F64, Bool, Char, Eof
            ]
        );
    }

    #[test]
    fn identifier() {
        // `_bar` exercises the leading-underscore special case in next_token.
        let toks = Lexer::new("foo _bar baz42").tokenize();
        assert_eq!(toks[0].kind, Ident);
        assert_eq!(toks[0].text, "foo");
        assert_eq!(toks[1].kind, Ident);
        assert_eq!(toks[1].text, "_bar");
        assert_eq!(toks[2].kind, Ident);
        assert_eq!(toks[2].text, "baz42");
        assert_eq!(toks[3].kind, Eof);
    }

    #[test]
    fn integer_literals() {
        assert_eq!(
            kinds("42 0xFF 0o77 0b1010 1_000_000"),
            vec![IntLit, IntLit, IntLit, IntLit, IntLit, Eof]
        );
        // The trailing "" is the zero-width `Eof` token's text.
        let ts = texts("42 0xFF 0o77 0b1010 1_000_000");
        assert_eq!(ts, vec!["42", "0xFF", "0o77", "0b1010", "1_000_000", ""]);
    }

    #[test]
    fn float_literals() {
        assert_eq!(
            kinds("3.14 1.0e-9 2e4 0.5"),
            vec![FloatLit, FloatLit, FloatLit, FloatLit, Eof]
        );
    }

    #[test]
    fn dot_not_stolen_from_integer() {
        // `0.bar` should lex as IntLit Dot Ident, not FloatLit Ident
        let ts = Lexer::new("0.bar").tokenize();
        assert_eq!(ts[0].kind, IntLit);
        assert_eq!(ts[1].kind, Dot);
        assert_eq!(ts[2].kind, Ident);
    }

    #[test]
    fn string_literal() {
        // Escape sequences stay verbatim in the token text — the lexer does
        // not unescape; that is left to a later stage.
        let toks = Lexer::new(r#""hello\nworld""#).tokenize();
        assert_eq!(toks[0].kind, StringLit);
        assert_eq!(toks[0].text, "\"hello\\nworld\"");
    }

    #[test]
    fn char_literal() {
        let toks = Lexer::new(r"'\u{1F600}'").tokenize();
        assert_eq!(toks[0].kind, CharLit);
    }

    #[test]
    fn operators() {
        let src = "-> == != <= >= < > = + - * / % & | ^ ~ !";
        assert_eq!(
            kinds(src),
            vec![
                Arrow, EqEq, BangEq, LtEq, GtEq, Lt, Gt, Eq, Plus, Minus, Star, Slash, Percent,
                Amp, Pipe, Caret, Tilde, Bang, Eof
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            kinds("( ) [ ] { } , ; : ."),
            vec![
                LParen, RParen, LBracket, RBracket, LCurly, RCurly, Comma, Semicolon, Colon, Dot,
                Eof
            ]
        );
    }

    #[test]
    fn spans_are_correct() {
        // Spans are half-open byte ranges into the original source.
        let toks = Lexer::new("fn foo").tokenize();
        assert_eq!((toks[0].span.start, toks[0].span.end), (0, 2)); // "fn"
        assert_eq!((toks[1].span.start, toks[1].span.end), (3, 6)); // "foo"
    }

    #[test]
    fn small_function() {
        // End-to-end smoke test over a realistic snippet.
        let src = "fn add(a: i32, b: i32) -> i32 { return a + b; }";
        let toks = Lexer::new(src).tokenize();
        let ks: Vec<_> = toks.iter().map(|t| t.kind).collect();
        assert_eq!(
            ks,
            vec![
                Fn, Ident, LParen, Ident, Colon, I32, Comma, Ident, Colon, I32, RParen, Arrow, I32,
                LCurly, Return, Ident, Plus, Ident, Semicolon, RCurly, Eof
            ]
        );
    }
}