From 73e36fac711a9ea27264ddf8ef84bccd6e24d6dd Mon Sep 17 00:00:00 2001 From: Jooris Hadeler Date: Tue, 10 Mar 2026 14:41:54 +0100 Subject: [PATCH] Initial Flux language specification Add the LL(1) context-free grammar (GRAMMAR.ebnf), token and syntax reference (SYNTAX.md), LL(1) verification tool (ll1_check.py), and a fibonacci example demonstrating the language. --- GRAMMAR.ebnf | 435 ++++++++++++++++++++++ SYNTAX.md | 803 +++++++++++++++++++++++++++++++++++++++++ examples/fibonacci.flx | 7 + ll1_check.py | 362 +++++++++++++++++++ 4 files changed, 1607 insertions(+) create mode 100644 GRAMMAR.ebnf create mode 100644 SYNTAX.md create mode 100644 examples/fibonacci.flx create mode 100644 ll1_check.py diff --git a/GRAMMAR.ebnf b/GRAMMAR.ebnf new file mode 100644 index 0000000..9690b3d --- /dev/null +++ b/GRAMMAR.ebnf @@ -0,0 +1,435 @@ +(* Flux Language Grammar — Context-Free LL(1) Grammar *) +(* ================================================================ *) +(* *) +(* Notation (ISO/IEC 14977 EBNF): *) +(* rule = definition ; defines a rule (terminated by ;) *) +(* a , b concatenation *) +(* a | b alternation *) +(* { a } zero or more repetitions of a *) +(* [ a ] optional a (zero or one) *) +(* ( a | b ) grouping *) +(* "literal" terminal string *) +(* *) +(* UPPERCASE identifiers are lexical token classes whose value *) +(* cannot be expressed as a single literal (e.g. IDENT, INT_LIT). *) +(* They are NOT defined here — see SYNTAX.md. *) +(* *) +(* Unique/fixed tokens are written as quoted literals directly. *) +(* *) +(* Lowercase identifiers are non-terminals (grammar productions). 
*) + + +(* ================================================================ *) +(* Program (start symbol) *) +(* ================================================================ *) + +program = { top_level_def } ; + +top_level_def = func_def + | struct_def ; + + +(* ================================================================ *) +(* Expressions *) +(* ================================================================ *) + +expr = or_expr ; + + +(* --- Logical OR (lowest-precedence binary operator) --- *) +(* *) +(* Uses keyword `or`; left-associative via iteration. *) + +or_expr = and_expr , { "or" , and_expr } ; + + +(* --- Logical AND --- *) +(* *) +(* Uses keyword `and`; left-associative via iteration. *) + +and_expr = bitor_expr , { "and" , bitor_expr } ; + + +(* --- Bitwise OR --- *) + +bitor_expr = bitxor_expr , { "|" , bitxor_expr } ; + + +(* --- Bitwise XOR --- *) + +bitxor_expr = bitand_expr , { "^" , bitand_expr } ; + + +(* --- Bitwise AND --- *) + +bitand_expr = additive_expr , { "&" , additive_expr } ; + + +(* --- Additive: addition and subtraction --- *) + +additive_expr = multiplicative_expr , + { ( "+" | "-" ) , multiplicative_expr } ; + + +(* --- Multiplicative: multiplication, division, modulo --- *) + +multiplicative_expr = unary_expr , + { ( "*" | "/" | "%" ) , unary_expr } ; + + +(* --- Unary operators (prefix, right-associative by recursion) --- *) +(* *) +(* "!" logical not *) +(* "~" bitwise not *) +(* "-" arithmetic negation *) +(* "*" dereference (pointer indirection) *) +(* "&" address-of *) + +unary_expr = "!" , unary_expr + | "~" , unary_expr + | "-" , unary_expr + | "*" , unary_expr + | "&" , unary_expr + | postfix_expr ; + + +(* --- Postfix operators (left-associative via iteration) --- *) +(* *) +(* Postfix operators bind tighter than any prefix or binary form. *) +(* Multiple postfix operations chain left-to-right. *) + +postfix_expr = primary_expr , { postfix_op } ; + +postfix_op = "." 
, IDENT (* member access *) + | "[" , expr , "]" (* subscript/index *) + | "(" , arg_list , ")" ; (* function call *) + + +(* --- Primary expressions (highest precedence) --- *) +(* *) +(* LL(1) note: after IDENT, peek at the next token. *) +(* "{" → parse struct_lit_body (struct literal) *) +(* other → bare identifier reference *) + +primary_expr = IDENT , [ struct_lit_body ] (* ident or struct lit *) + | INT_LIT + | FLOAT_LIT + | STRING_LIT + | CHAR_LIT + | "true" + | "false" + | "(" , expr , ")" ; (* parenthesised *) + + +(* --- Struct literal --- *) +(* *) +(* A struct literal constructs a value of a named struct type. *) +(* IDENT "{" field: expr, ... "}" *) +(* *) +(* Field order need not match the struct definition order. *) +(* No trailing comma is permitted (consistent with struct_def). *) +(* *) +(* LL(1) notes: *) +(* struct_field_list: "}" → ε; IDENT → first field *) +(* FIRST(struct_field) = {IDENT} *) +(* FOLLOW(struct_field_list) = {"}"} *) +(* Disjoint, so no look-ahead conflict. *) + +struct_lit_body = "{" , struct_field_list , "}" ; + +struct_field_list = [ struct_field , { "," , struct_field } ] ; + +struct_field = IDENT , ":" , expr ; + + +(* ================================================================ *) +(* Argument List *) +(* ================================================================ *) + +arg_list = [ expr , { "," , expr } ] ; + + +(* ================================================================ *) +(* No-Struct Expression Hierarchy (expr_ns) *) +(* ================================================================ *) +(* *) +(* Struct literals create an LL(1) ambiguity in if/while conditions:*) +(* if Point { x: 1 } { ... } *) +(* After "Point", "{" could open a struct literal OR the body block.*) +(* *) +(* Solution: define expr_ns — identical to expr except *) +(* primary_expr_ns disallows the struct_lit_body suffix after IDENT.*) +(* Struct literals ARE still allowed when parenthesised: *) +(* if (Point { x: 1 }).flag { ... 
} *) +(* *) +(* if_stmt and while_stmt use expr_ns for their condition. *) +(* All other expression positions use the full expr. *) + +expr_ns = or_expr_ns ; + +or_expr_ns = and_expr_ns , { "or" , and_expr_ns } ; +and_expr_ns = bitor_expr_ns , { "and" , bitor_expr_ns } ; + +bitor_expr_ns = bitxor_expr_ns , { "|" , bitxor_expr_ns } ; +bitxor_expr_ns = bitand_expr_ns , { "^" , bitand_expr_ns } ; +bitand_expr_ns = additive_expr_ns , { "&" , additive_expr_ns } ; + +additive_expr_ns = multiplicative_expr_ns , + { ( "+" | "-" ) , multiplicative_expr_ns } ; + +multiplicative_expr_ns = unary_expr_ns , + { ( "*" | "/" | "%" ) , unary_expr_ns } ; + +unary_expr_ns = "!" , unary_expr_ns + | "~" , unary_expr_ns + | "-" , unary_expr_ns + | "*" , unary_expr_ns + | "&" , unary_expr_ns + | postfix_expr_ns ; + +postfix_expr_ns = primary_expr_ns , { postfix_op } ; + +(* primary_expr_ns: same as primary_expr but IDENT is never *) +(* followed by struct_lit_body. Note "(" , expr , ")" uses full *) +(* expr, so struct literals are permitted inside parentheses. *) + +primary_expr_ns = IDENT (* bare ident only *) + | INT_LIT + | FLOAT_LIT + | STRING_LIT + | CHAR_LIT + | "true" + | "false" + | "(" , expr , ")" ; (* struct lit OK here *) + + +(* ================================================================ *) +(* Types *) +(* ================================================================ *) + +type = primitive_type + | named_type + | pointer_type + | array_type ; + + +(* --- Primitive types --- *) +(* *) +(* Unsigned integers : u8 u16 u32 u64 *) +(* Signed integers : i8 i16 i32 i64 *) +(* Floating-point : f32 f64 *) +(* Other : bool char *) + +primitive_type = "u8" | "u16" | "u32" | "u64" + | "i8" | "i16" | "i32" | "i64" + | "f32" | "f64" + | "bool" | "char" ; + + +(* --- Named types --- *) +(* *) +(* A user-defined type referenced by its identifier (e.g. a struct *) +(* name). 
The lexer guarantees that all primitive-type keywords are *) +(* reserved, so IDENT never clashes with primitive_type. *) + +named_type = IDENT ; + + +(* --- Pointer types --- *) +(* *) +(* "*" type — typed pointer; the pointee type is known. *) +(* "*opaque" — untyped/opaque pointer (no pointee type info). *) +(* *) +(* LL(1) note: after "*", "opaque" is not in FIRST(type), so the *) +(* two alternatives are always distinguishable with one token. *) + +pointer_type = "*" , ( "opaque" | type ) ; + + +(* --- Array types --- *) +(* *) +(* "[" type ";" INT_LIT "]" *) +(* *) +(* The element type and the fixed size (a non-negative integer *) +(* literal) are separated by ";". Sizes that are constant *) +(* expressions may be introduced in a later grammar revision. *) + +array_type = "[" , type , ";" , INT_LIT , "]" ; + + +(* ================================================================ *) +(* Statements *) +(* ================================================================ *) + +stmt = let_stmt + | return_stmt + | if_stmt + | while_stmt + | loop_stmt + | break_stmt + | continue_stmt + | block_stmt + | expr_stmt ; + + +(* --- Return statement --- *) +(* *) +(* Exits the enclosing function, optionally yielding a value. *) +(* "return ;" is used when the function return type is (). *) +(* *) +(* LL(1): after "return", peek at next token. *) +(* ";" → no expression (unit return) *) +(* other → parse expr, then expect ";" *) +(* ";" is not in FIRST(expr), so the two cases are unambiguous. *) + +return_stmt = "return" , [ expr ] , ";" ; + + +(* --- Expression statement --- *) +(* *) +(* Evaluates an expression for its side effects; the value is *) +(* discarded. The ";" is mandatory. 
*) +(* *) +(* LL(1): at stmt level: *) +(* "let" → let_stmt *) +(* "return" → return_stmt *) +(* "if" → if_stmt *) +(* "while" → while_stmt *) +(* "loop" → loop_stmt *) +(* "break" → break_stmt *) +(* "continue" → continue_stmt *) +(* "{" → block_stmt *) +(* other → expr_stmt *) + +expr_stmt = expr , ";" ; + + +(* --- If statement --- *) +(* *) +(* Conditionally executes a block. An optional "else" branch may *) +(* follow; it is either a plain block or another "if" statement, *) +(* enabling "else if" chains of arbitrary length. *) +(* *) +(* LL(1) notes: *) +(* condition uses expr_ns — struct literals are forbidden at the *) +(* outermost level to avoid ambiguity with the body block's "{". *) +(* [ "else" ... ] — consume "else" iff next token is "else" *) +(* else_branch: "if" → if_stmt (else-if); "{" → block_stmt *) +(* The two else_branch alternatives start with distinct tokens, *) +(* so no look-ahead conflict arises (no dangling-else ambiguity). *) + +if_stmt = "if" , expr_ns , block_stmt , [ "else" , else_branch ] ; + +else_branch = if_stmt (* else if *) + | block_stmt ; (* plain else *) + + +(* --- While loop --- *) +(* *) +(* Repeatedly executes the body as long as the condition is true. *) +(* The condition is re-evaluated before every iteration. *) +(* If the condition is false on the first check, the body never *) +(* executes. *) +(* *) +(* Like if_stmt, the condition uses expr_ns to prevent struct *) +(* literal ambiguity with the body block's opening "{". *) + +while_stmt = "while" , expr_ns , block_stmt ; + + +(* --- Infinite loop --- *) +(* *) +(* Executes the body unconditionally and indefinitely. The only *) +(* ways to exit are "break" or "return" inside the body. *) + +loop_stmt = "loop" , block_stmt ; + + +(* --- Break and continue --- *) +(* *) +(* "break" exits the immediately enclosing "while" or "loop". *) +(* "continue" skips the rest of the current iteration and jumps to *) +(* the next condition check (while) or iteration (loop). 
*) +(* Both are only valid inside a loop body; the compiler enforces *) +(* this as a semantic rule. *) + +break_stmt = "break" , ";" ; +continue_stmt = "continue" , ";" ; + + +(* --- Block statement --- *) +(* *) +(* A block groups zero or more statements into a single statement *) +(* and introduces a new lexical scope. It does not produce a value. *) +(* *) +(* LL(1): at stmt level, "{" unambiguously selects block since no *) +(* other stmt alternative starts with "{". *) + +block_stmt = "{" , { stmt } , "}" ; + + +(* --- Let statement --- *) +(* *) +(* Introduces a named binding in the current scope. *) +(* Bindings are immutable by default; "mut" opts into mutability. *) +(* *) +(* The type annotation and the initialiser are both optional, but *) +(* at least one must be present for the binding to be usable; *) +(* the compiler enforces this as a semantic (not syntactic) rule. *) +(* *) +(* LL(1) notes: *) +(* [ "mut" ] — consume "mut" iff the next token is "mut" *) +(* [ ":" ... ] — consume iff next token is ":" *) +(* [ "=" ... ] — consume iff next token is "=" *) +(* All decision tokens are distinct, so no look-ahead conflict. *) + +let_stmt = "let" , [ "mut" ] , IDENT , + [ ":" , type ] , + [ "=" , expr ] , + ";" ; + + +(* ================================================================ *) +(* Top-Level Definitions *) +(* ================================================================ *) + +(* --- Function definition --- *) +(* *) +(* Defines a named function with a typed parameter list and an *) +(* optional return type. Omitting "->" implies a return type of (). *) +(* *) +(* LL(1) notes: *) +(* param_list: ")" → ε (empty list); else parse first param *) +(* param: "mut" → consume; IDENT → skip (mut absent) *) +(* [ "->" ... 
]: consume iff next token is "->" *) +(* "->" is a two-character token; distinct from all stmt-starting *) +(* tokens, so no look-ahead conflict with block_stmt that follows *) + +func_def = "fn" , IDENT , "(" , param_list , ")" , + [ "->" , type ] , + block_stmt ; + +param_list = [ param , { "," , param } ] ; + +(* Each parameter is an optionally-mutable name with a required *) +(* type annotation. Mutability applies within the function body. *) + +param = [ "mut" ] , IDENT , ":" , type ; + + +(* --- Struct definition --- *) +(* *) +(* Defines a named product type with zero or more typed fields. *) +(* Fields are separated by commas; no trailing comma is permitted. *) +(* *) +(* LL(1) notes: *) +(* field_list: "}" → ε (empty struct); else parse first field *) +(* FIRST(field) = {IDENT}, FOLLOW(field_list) = {"}"} *) +(* Disjoint, so no look-ahead conflict. *) +(* top_level_def: "fn" → func_def; "struct" → struct_def *) + +struct_def = "struct" , IDENT , "{" , field_list , "}" ; + +field_list = [ field , { "," , field } ] ; + +field = IDENT , ":" , type ; diff --git a/SYNTAX.md b/SYNTAX.md new file mode 100644 index 0000000..361d8b0 --- /dev/null +++ b/SYNTAX.md @@ -0,0 +1,803 @@ +# Flux Language Syntax Reference + +## Lexical Tokens + +All tokens listed here are produced by the lexer (lexical analysis phase). Token classes such as `IDENT` and `INT_LIT` +appear as UPPERCASE terminals in `GRAMMAR.ebnf`; fixed tokens (operators, keywords, punctuation) appear there as quoted literals.
+ +### Literals + +| Token | Description | Examples | +| ------------ | ------------------------------------------------------------------- | ------------------------------ | +| `INT_LIT` | Integer literal (decimal, hex `0x`, octal `0o`, binary `0b`) | `42`, `0xFF`, `0o77`, `0b1010` | +| `FLOAT_LIT` | Floating-point literal | `3.14`, `1.0e-9`, `0.5` | +| `STRING_LIT` | Double-quoted UTF-8 string, supports `\n \t \\ \"` escape sequences | `"hello\nworld"` | +| `CHAR_LIT` | Single-quoted Unicode scalar value | `'a'`, `'\n'`, `'\u{1F600}'` | +| `TRUE` | Boolean true literal | `true` | +| `FALSE` | Boolean false literal | `false` | + +### Identifier + +| Token | Description | +| ------- | ------------------------------------------------------------------------------------------------------------ | +| `IDENT` | Identifier: starts with a letter or `_`, followed by letters, digits, or `_`. Unicode letters are permitted. | + +### Operator Tokens + +| Token | Lexeme | Description | +| --------- | ------ | -------------------------------------- | +| `PLUS` | `+` | Addition / unary plus (not in grammar) | +| `MINUS` | `-` | Subtraction / unary negation | +| `STAR` | `*` | Multiplication / pointer dereference | +| `SLASH` | `/` | Division | +| `PERCENT` | `%` | Modulo (remainder) | +| `AMP` | `&` | Bitwise AND / address-of | +| `PIPE` | `\|` | Bitwise OR | +| `CARET` | `^` | Bitwise XOR | +| `BANG` | `!` | Logical NOT | +| `TILDE` | `~` | Bitwise NOT | +| `DOT` | `.` | Member access | + +### Keyword Tokens + +#### Operator Keywords + +| Lexeme | Description | +| ------ | ----------- | +| `and` | Logical AND | +| `or` | Logical OR | + +#### Boolean Literals + +| Lexeme | Description | +| ------- | ------------------- | +| `true` | Boolean true value | +| `false` | Boolean false value | + +#### Primitive Type Keywords + +| Lexeme | Description | +| ------ | ------------------------------ | +| `u8` | Unsigned 8-bit integer | +| `u16` | Unsigned 16-bit integer | +| `u32` | 
Unsigned 32-bit integer | +| `u64` | Unsigned 64-bit integer | +| `i8` | Signed 8-bit integer | +| `i16` | Signed 16-bit integer | +| `i32` | Signed 32-bit integer | +| `i64` | Signed 64-bit integer | +| `f32` | 32-bit IEEE 754 floating-point | +| `f64` | 64-bit IEEE 754 floating-point | +| `bool` | Boolean (`true` or `false`) | +| `char` | Unicode scalar value (32-bit) | + +#### Pointer Keyword + +| Lexeme | Description | +| -------- | ------------------------------------------------------- | +| `opaque` | Used in `*opaque` to denote a pointer with no type info | + +#### Statement Keywords + +| Lexeme | Description | +| ---------- | ------------------------------------- | +| `let` | Introduces a variable binding | +| `mut` | Marks a `let` binding or function parameter as mutable | +| `return` | Exits the enclosing function | +| `if` | Conditional statement | +| `else` | Alternative branch of an `if` | +| `while` | Condition-controlled loop | +| `loop` | Infinite loop | +| `break` | Exit the immediately enclosing loop | +| `continue` | Skip to the next iteration of a loop | + +#### Definition Keywords + +| Lexeme | Description | +| -------- | -------------------------------- | +| `fn` | Introduces a function definition | +| `struct` | Introduces a struct definition | + +> **Lexer note:** All keywords above are reserved and must be recognised before +> the general `IDENT` rule. An identifier may not shadow any keyword.
+ +### Delimiter / Punctuation Tokens + +| Token | Lexeme | Description | +| ----------- | ------ | ------------------------------------------------------ | +| `LPAREN` | `(` | Left parenthesis | +| `RPAREN` | `)` | Right parenthesis | +| `LBRACKET` | `[` | Left square bracket | +| `RBRACKET` | `]` | Right square bracket | +| `COMMA` | `,` | Argument / element separator | +| `SEMICOLON` | `;` | Statement terminator / array size separator (`[T; N]`) | +| `LCURLY` | `{` | Block / struct body open | +| `RCURLY` | `}` | Block / struct body close | +| `ARROW` | `->` | Function return type separator | +| `COLON` | `:` | Type annotation separator | + +--- + +## Expressions + +Expressions produce a value. The grammar defines them through a hierarchy of +precedence levels — a lower level number means lower precedence (binds less +tightly). + +### Operator Precedence Table + +| Level | Operators | Associativity | Description | +| ----- | --------------------------- | -------------- | -------------------------------- | +| 1 | `or` | left | Logical OR (lowest) | +| 2 | `and` | left | Logical AND | +| 3 | `\|` | left | Bitwise OR | +| 4 | `^` | left | Bitwise XOR | +| 5 | `&` | left | Bitwise AND | +| 6 | `+` `-` | left | Addition, subtraction | +| 7 | `*` `/` `%` | left | Multiplication, division, modulo | +| 8 | `!` `~` `-` `*` `&` | right (unary) | Prefix unary operators | +| 9 | `.` `[…]` `(…)` | left (postfix) | Member access, index, call | +| 10 | literals, identifiers, `()` | — | Primary expressions (highest) | + +### Operator Descriptions + +#### Binary Operators + +| Operator | Name | Example | Notes | +| -------- | -------------- | --------- | -------------------------------------------- | +| `or` | Logical OR | `a or b` | Short-circuits; both operands must be `bool` | +| `and` | Logical AND | `a and b` | Short-circuits; both operands must be `bool` | +| `\|` | Bitwise OR | `a \| b` | Integer types | +| `^` | Bitwise XOR | `a ^ b` | Integer types | +| `&`
| Bitwise AND | `a & b` | Integer types (binary context) | +| `+` | Addition | `a + b` | | +| `-` | Subtraction | `a - b` | | +| `*` | Multiplication | `a * b` | Binary context (both operands are values) | +| `/` | Division | `a / b` | Integer division truncates toward zero | +| `%` | Modulo | `a % b` | Sign follows the dividend | + +#### Unary Prefix Operators + +| Operator | Name | Example | Notes | +| -------- | ----------- | ------- | ------------------------------------------------ | +| `!` | Logical NOT | `!cond` | Operand must be `bool` | +| `~` | Bitwise NOT | `~mask` | Bitwise complement; integer types | +| `-` | Negation | `-x` | Arithmetic negation | +| `*` | Dereference | `*ptr` | Unary context; operand must be a pointer type | +| `&` | Address-of | `&x` | Unary context; produces a pointer to the operand | + +#### Postfix Operators + +| Operator | Name | Example | Notes | +| -------- | ------------- | ----------- | ------------------------------------------------- | +| `.` | Member access | `obj.field` | Accesses a named field or method of a struct/type | +| `[…]` | Subscript | `arr[i]` | Indexes into an array, slice, or map | +| `(…)` | Call | `f(a, b)` | Invokes a function or closure | + +> **Disambiguation:** `*` and `&` are context-sensitive. +> When appearing as the first token of a `unary_expr` they are **unary** +> (dereference / address-of). When appearing between two `unary_expr` +> sub-trees inside `multiplicative_expr` or `bitand_expr` they are **binary** +> (multiplication / bitwise AND). The parser resolves this purely from +> grammatical position — no look-ahead beyond 1 token is required. + +### Parenthesised Expressions + +Any expression may be wrapped in parentheses to override default precedence: + +``` +(a + b) * c +``` + +### Function Call Argument List + +Arguments are comma-separated expressions. A trailing comma is **not** +permitted at this grammar level. 
+ +``` +f() +f(x) +f(x, y, z) +``` + +### Examples + +```flux +// Arithmetic +a + b * c - d % 2 + +// Bitwise +flags & MASK | extra ^ toggle + +// Logical +ready and not_done or fallback + +// Mixed unary / postfix +*ptr.field +&arr[i] +!cond + +// Chained postfix +obj.method(arg1, arg2)[0].name + +// Explicit precedence override +(a or b) and c +``` + +--- + +## Types + +Types describe the shape and interpretation of values. All type positions in +the grammar reference the `type` non-terminal. + +### Primitive Types + +Primitive types are single-keyword types built into the language. + +| Type | Kind | Width | Range / Notes | +| ------ | ---------------- | ------ | ------------------------------------------ | +| `u8` | Unsigned integer | 8-bit | 0 … 255 | +| `u16` | Unsigned integer | 16-bit | 0 … 65 535 | +| `u32` | Unsigned integer | 32-bit | 0 … 4 294 967 295 | +| `u64` | Unsigned integer | 64-bit | 0 … 2⁶⁴ − 1 | +| `i8` | Signed integer | 8-bit | −128 … 127 | +| `i16` | Signed integer | 16-bit | −32 768 … 32 767 | +| `i32` | Signed integer | 32-bit | −2 147 483 648 … 2 147 483 647 | +| `i64` | Signed integer | 64-bit | −2⁶³ … 2⁶³ − 1 | +| `f32` | Floating-point | 32-bit | IEEE 754 single precision | +| `f64` | Floating-point | 64-bit | IEEE 754 double precision | +| `bool` | Boolean | 1 byte | `true` or `false` | +| `char` | Unicode scalar | 32-bit | Any Unicode scalar value (not a surrogate) | + +### Named Types + +A named type is any user-defined type referenced by its identifier — typically a struct name. Because all primitive-type keywords (`u8`, `bool`, etc.) are reserved, an `IDENT` in type position is always a named type, never a primitive. + +```flux +Point // struct Point { x: f32, y: f32 } +Node // struct Node { value: i64, next: *Node } +*Point // pointer to a named type +[Node; 8] // array of a named type +``` + +### Pointer Types + +A pointer type is written with a leading `*`. 
+ +| Syntax | Description | +| --------- | ------------------------------------------------------------------------------------- | +| `*T` | Typed pointer — points to a value of type `T` | +| `*opaque` | Opaque pointer — no compile-time pointee type information; equivalent to C's `void *` | + +Pointer types may be nested: `**u8` is a pointer to a pointer to `u8`. + +```flux +*u8 // pointer to u8 +**i32 // pointer to pointer to i32 +*opaque // untyped pointer +**opaque // pointer to untyped pointer +``` + +### Array Types + +Arrays have a fixed size known at compile time. + +``` +[ <type> ; <size> ] +``` + +`<size>` must be a non-negative integer literal (`INT_LIT`). The element type +may itself be any `type`, including pointers or nested arrays. + +```flux +[u8; 256] // array of 256 u8 values +[*u8; 4] // array of 4 pointers to u8 +[[f32; 3]; 3] // 3×3 matrix of f32 (array of arrays) +[*opaque; 8] // array of 8 opaque pointers +``` + +### Type Grammar Summary + +```ebnf +type = primitive_type | named_type | pointer_type | array_type ; +primitive_type = "u8" | "u16" | "u32" | "u64" + | "i8" | "i16" | "i32" | "i64" + | "f32" | "f64" | "bool" | "char" ; +named_type = IDENT ; +pointer_type = "*" , ( "opaque" | type ) ; +array_type = "[" , type , ";" , INT_LIT , "]" ; +``` + +--- + +## Struct Literals + +A struct literal constructs a value of a named struct type by providing values for each field. + +``` +<TypeName> { <field>: <expr>, ... } +``` + +Fields may appear in any order and need not match the declaration order. No trailing comma is permitted. + +### Examples + +```flux +let p = Point { x: 1.0, y: 2.0 }; + +let n = Node { + value: 42, + next: get_next() +}; + +// Nested struct literal +let outer = Rect { + origin: Point { x: 0.0, y: 0.0 }, + size: Point { x: 10.0, y: 5.0 } +}; + +// Empty struct +let u = Unit {}; +``` + +### Struct Literals in Conditions + +Struct literals are **not permitted** in `if` and `while` conditions unless they are enclosed in parentheses, call arguments, or index brackets.
This restriction exists because `{` after the condition is ambiguous — it could start a struct literal body or the statement block. + +```flux +// ERROR — ambiguous: is `{` a struct body or the if block? +if Flags { verbose: true } { ... } + +// OK — parentheses resolve the ambiguity +if (Flags { verbose: true }).verbose { ... } +``` + +The grammar enforces this through the `expr_ns` (no-struct) hierarchy used in condition positions. Struct literals remain valid everywhere else: `let`, `return`, function arguments, field values, etc. + +### Struct Literal Grammar Summary + +```ebnf +primary_expr = IDENT , [ struct_lit_body ] | INT_LIT | FLOAT_LIT + | STRING_LIT | CHAR_LIT | "true" | "false" + | "(" , expr , ")" ; +struct_lit_body = "{" , struct_field_list , "}" ; +struct_field_list = [ struct_field , { "," , struct_field } ] ; +struct_field = IDENT , ":" , expr ; +``` + +### No-Struct Expression (`expr_ns`) + +`expr_ns` is a parallel expression hierarchy identical to `expr` except its primary level (`primary_expr_ns`) does not allow the `struct_lit_body` suffix after an `IDENT`. Struct literals are still permitted when enclosed in parentheses (`"(" , expr , ")"`), because the `(` unambiguously marks the start of a grouped expression. + +`if_stmt` and `while_stmt` use `expr_ns` for their condition; all other expression positions use the full `expr`. + +--- + +## Statements + +Statements perform an action and do not produce a value. Simple statements (`let`, `return`, `break`, `continue`, and expression statements) are +terminated by a semicolon `;`; `if`, `while`, `loop`, and block statements end with their closing `}`. + +### Let Statement + +Introduces a new named binding in the current scope. + +``` +let [mut] <name> [: <type>] [= <expr>] ; +``` + +| Part | Required | Description | +| ---------- | -------- | --------------------------------------------- | +| `mut` | no | Makes the binding mutable; omit for immutable | +| `<name>` | yes | The identifier being bound | +| `: <type>` | no | Explicit type annotation | +| `= <expr>` | no | Initialiser expression | +| `;` | yes | Statement terminator | + +Bindings are **immutable by default**.
Attempting to assign to a binding +declared without `mut` is a compile-time error. + +At least one of the type annotation or the initialiser must be present so the +compiler can determine the binding's type. This is a semantic constraint, not a +syntactic one — the grammar permits bare `let x;` and the type checker rejects +it if no type can be inferred from context. + +#### Examples + +```flux +// Immutable, type inferred from initialiser +let x = 42; + +// Immutable, explicit type +let y: f64 = 3.14; + +// Mutable, type inferred +let mut count = 0; + +// Mutable, explicit type, no initialiser (must be assigned before use) +let mut buf: [u8; 128]; + +// Mutable pointer to u32 +let mut ptr: *u32 = &value; + +// Shadowing a previous binding is allowed +let x = "hello"; // x is now a string, previous x is gone +``` + +### Return Statement + +Exits the enclosing function immediately, optionally producing a return value. + +``` +return [<expr>] ; +``` + +`return;` (no expression) is used when the function's return type is the unit +type `()`. `return <expr>;` returns the value of the expression. + +Blocks do not produce a value, so there is no implicit return: a function +yields a value only through an explicit `return <expr>;`. + +```flux +return; // unit return +return 42; // return an integer +return x * 2 + 1; // return an expression +``` + +### Expression Statement + +Evaluates an expression for its side effects; the resulting value is +discarded. A semicolon is required.
+ +``` +<expr> ; +``` + +```flux +do_something(x); // call for side effects +count + 1; // legal but silly — value discarded +``` + +### Statement Grammar Summary + +```ebnf +stmt = let_stmt | return_stmt | if_stmt + | while_stmt | loop_stmt | break_stmt | continue_stmt + | block_stmt | expr_stmt ; +let_stmt = "let" , [ "mut" ] , IDENT , [ ":" , type ] , [ "=" , expr ] , ";" ; +return_stmt = "return" , [ expr ] , ";" ; +if_stmt = "if" , expr_ns , block_stmt , [ "else" , else_branch ] ; +else_branch = if_stmt | block_stmt ; +while_stmt = "while" , expr_ns , block_stmt ; +loop_stmt = "loop" , block_stmt ; +break_stmt = "break" , ";" ; +continue_stmt = "continue" , ";" ; +block_stmt = "{" , { stmt } , "}" ; +expr_stmt = expr , ";" ; +``` + +--- + +## If Statement + +Conditionally executes a block based on a boolean expression. + +``` +if <cond> <block> [else <else-branch>] +``` + +The condition `<cond>` must be an expression of type `bool`. The body is +always a `block_stmt` — braces are mandatory. + +### Else Branch + +The optional `else` branch is either a plain block or another `if` statement, +enabling `else if` chains of arbitrary length. + +```flux +if x > 0 { + pos(); +} + +if x > 0 { + pos(); +} else { + non_pos(); +} + +if x > 0 { + pos(); +} else if x < 0 { + neg(); +} else { + zero(); +} +``` + +### If Statement Grammar Summary + +```ebnf +if_stmt = "if" , expr_ns , block_stmt , [ "else" , else_branch ] ; +else_branch = if_stmt | block_stmt ; +``` + +--- + +## While Loop + +Repeatedly executes a block as long as a boolean condition holds. The +condition is tested before each iteration; if it is false on entry, the body +never runs. + +``` +while <cond> <block> +``` + +```flux +let mut i = 0; +while i < 10 { + process(i); + i = i + 1; +} +``` + +### While Loop Grammar Summary + +```ebnf +while_stmt = "while" , expr_ns , block_stmt ; +``` + +--- + +## Loop + +Executes a block unconditionally and indefinitely. The loop runs until a +`break` or `return` inside the body transfers control out.
+ +``` +loop <block> +``` + +```flux +loop { + let msg = recv(); + if msg.is_quit() { + break; + } + handle(msg); +} +``` + +### Loop Grammar Summary + +```ebnf +loop_stmt = "loop" , block_stmt ; +``` + +--- + +## Break and Continue + +`break` and `continue` are only valid inside the body of a `while` or `loop`. +The compiler enforces this as a semantic rule. + +| Statement | Effect | +| ------------ | -------------------------------------------------------------- | +| `break ;` | Exits the immediately enclosing loop | +| `continue ;` | Skips the rest of the current iteration; jumps to the next one | + +For `while`, `continue` jumps back to the condition check. For `loop`, +`continue` jumps back to the top of the body. + +```flux +let mut i = 0; +while i < 20 { + i = i + 1; + if i % 2 == 0 { + continue; // skip even numbers + } + if i > 15 { + break; // stop after 15 + } + process(i); +} +``` + +### Break / Continue Grammar Summary + +```ebnf +break_stmt = "break" , ";" ; +continue_stmt = "continue" , ";" ; +``` + +--- + +## Block Statement + +A block groups zero or more statements into a single statement and introduces +a new lexical scope. Blocks do not produce a value. + +``` +{ <stmt>* } +``` + +### Scoping + +Bindings declared inside a block are not visible outside it. A binding in an +inner scope may shadow a name from an outer scope without affecting it. + +```flux +let x = 1; +{ + let x = 2; // shadows outer x inside this block only + f(x); // uses 2 +} +// x is still 1 here +``` + +### Nesting + +Blocks may be nested freely to any depth. + +```flux +{ + let a = compute_a(); + { + let b = compute_b(); + use(a, b); + } + // b is no longer in scope here +} +``` + +### Block Grammar Summary + +```ebnf +block_stmt = "{" , { stmt } , "}" ; +``` + +--- + +## Top-Level Definitions + +A Flux source file is a sequence of top-level definitions.
+ +```ebnf +program = { top_level_def } ; +top_level_def = func_def | struct_def ; +``` + +The leading token unambiguously selects the definition kind: `fn` → function, +`struct` → struct. + +--- + +## Function Definition + +Defines a named, callable function. + +``` +fn <name> ( [<params>] ) [-> <type>] <block> +``` + +| Part | Required | Description | +| ------------------ | -------- | -------------------------------------------------------- | +| `<name>` | yes | The function's identifier | +| `( [<params>] )` | yes | Comma-separated parameter list, may be empty | +| `-> <type>` | no | Return type; omitting it means the function returns `()` | +| `<block>` | yes | Function body — a `block_stmt` | + +### Parameters + +Each parameter is a name with a mandatory type annotation. Parameters are +immutable by default; `mut` makes the local binding mutable within the body. + +``` +[mut] <name> : <type> +``` + +```flux +fn add(a: i32, b: i32) -> i32 { + return a + b; +} + +fn greet(name: *u8) { + print(name); +} + +fn increment(mut x: i32) -> i32 { + x = x + 1; + return x; +} + +fn apply(f: *opaque, mut buf: [u8; 64]) -> bool { + return call(f, &buf); +} +``` + +### Return Type + +If `->` is omitted the return type is implicitly `()` (the unit type). An +explicit `-> ()` is also permitted but redundant. + +```flux +fn do_work() { // returns () + side_effect(); +} + +fn get_value() -> i64 { // returns i64 + return 42; +} +``` + +### Function Definition Grammar Summary + +```ebnf +func_def = "fn" , IDENT , "(" , param_list , ")" , [ "->" , type ] , block_stmt ; +param_list = [ param , { "," , param } ] ; +param = [ "mut" ] , IDENT , ":" , type ; +``` + +--- + +## Struct Definition + +Defines a named product type with zero or more typed fields. + +``` +struct <name> { + <field> : <type> , + ... +} +``` + +Fields are separated by commas. No trailing comma is permitted. An empty +struct (zero fields) is valid. + +### Fields + +Each field is a name and a type. Fields may be of any type including pointers, +arrays, and other structs.
Field names must be unique within the struct. + +```flux +struct Point { + x: f32, + y: f32 +} + +struct Node { + value: i64, + next: *Node +} + +struct Buffer { + data: *u8, + len: u64, + cap: u64 +} + +struct Unit {} +``` + +### Member Access + +Fields of a struct value are accessed with the `.` operator (defined in the +expression grammar). If the value is behind a pointer, dereference it first +with `*`. + +```flux +let p: Point = make_point(); +let x = p.x; + +let ptr: *Point = get_point_ptr(); +let y = (*ptr).y; +``` + +### Struct Definition Grammar Summary + +```ebnf +struct_def = "struct" , IDENT , "{" , field_list , "}" ; +field_list = [ field , { "," , field } ] ; +field = IDENT , ":" , type ; +``` diff --git a/examples/fibonacci.flx b/examples/fibonacci.flx new file mode 100644 index 0000000..308a75b --- /dev/null +++ b/examples/fibonacci.flx @@ -0,0 +1,7 @@ +fn fibonacci(n: u8) -> u64 { + if n < 2 { + return n; + } + + return fibonacci(n - 1) + fibonacci(n - 2); +} \ No newline at end of file diff --git a/ll1_check.py b/ll1_check.py new file mode 100644 index 0000000..507e409 --- /dev/null +++ b/ll1_check.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python3 +""" +ll1_check.py — Parse GRAMMAR.ebnf and verify the LL(1) property. + +Usage: python ll1_check.py [grammar_file] [-v] + +Algorithm +--------- +1. Strip (* … *) comments; tokenise. +2. Parse ISO/IEC 14977 EBNF into an AST. +3. Normalise to plain BNF by introducing fresh helper non-terminals: + { body } → _repN where _repN = body , _repN | ε + [ body ] → _optN where _optN = body | ε + ( body ) → inlined (cross-product inside the parent sequence) +4. Compute FIRST and FOLLOW sets (fixed-point iteration). +5. For each non-terminal compute PREDICT sets; flag pairwise conflicts. 
"""
# The line above closes the module docstring that opens at the top of ll1_check.py: """

import re
import sys
from collections import defaultdict
from itertools import combinations
from itertools import count as _count
from pathlib import Path

EPSILON = 'ε'
START = 'program'  # grammar start symbol

# ═══════════════════════════════════════════════════════════════ 1. Tokenise

_TOK = re.compile(
    r'"(?:[^"\\]|\\.)*"'      # "quoted terminal string"
    r'|[A-Z][A-Z0-9_]*'       # UPPERCASE token class (terminal)
    r'|[a-z][a-z0-9_]*'       # lowercase identifier (non-terminal)
    r'|[=;,|()\[\]{}]'        # single-char punctuation
)


def tokenise(src: str) -> list:
    """Split EBNF source text into a flat list of token strings.

    ``(* … *)`` comments are replaced by a single space first.  Anything
    that matches none of the token patterns (whitespace, but also any stray
    character) is skipped, because ``findall`` only returns matches.
    """
    src = re.sub(r'\(\*.*?\*\)', ' ', src, flags=re.DOTALL)
    return _TOK.findall(src)


# ═══════════════════════════════════════════════════════════════ 2. Parse EBNF → AST
#
# Each AST node is a tuple:
#   ('lit', s)      terminal — quoted string "…" or UPPERCASE token class
#   ('nt', s)       non-terminal reference
#   ('seq', [...])  concatenation  (A , B , C)
#   ('alt', [...])  alternation    (A | B | C)
#   ('opt', node)   optional       [ … ]
#   ('rep', node)   repetition     { … }

class _Parser:
    """Recursive-descent parser that turns a token list into the EBNF AST.

    Every malformed-input condition is reported as ``SyntaxError`` so that
    callers need to handle only one exception type.
    """

    def __init__(self, tokens):
        self._t = tokens
        self._i = 0

    def _peek(self):
        """Return the current token without consuming it, or None at the end."""
        return self._t[self._i] if self._i < len(self._t) else None

    def _eat(self, expected=None):
        """Consume and return the current token.

        Raises SyntaxError if the input is exhausted, or if *expected* is
        given and the current token differs from it.
        """
        if self._i >= len(self._t):
            # Without this guard a truncated grammar (e.g. a missing final
            # ';') would surface as an IndexError instead of a parse error.
            wanted = repr(expected) if expected else 'a token'
            raise SyntaxError(f'expected {wanted}, got end of input')
        v = self._t[self._i]
        self._i += 1
        if expected and v != expected:
            raise SyntaxError(f'expected {expected!r}, got {v!r} '
                              f'(token #{self._i - 1})')
        return v

    def parse_grammar(self) -> dict:
        """Parse ``name = body ;`` rules until the tokens run out.

        Returns a dict mapping each rule name to its AST node, in definition
        order.  Raises SyntaxError on malformed input or when a rule name is
        defined twice (a second definition would otherwise silently replace
        the first and hide its alternatives from the LL(1) check).
        """
        rules = {}
        while self._i < len(self._t):
            name = self._eat()
            self._eat('=')
            if name in rules:
                raise SyntaxError(f'rule {name!r} is defined more than once')
            rules[name] = self._body()
            self._eat(';')
        return rules

    def _body(self):
        """Parse ``seq { "|" seq }``; a single alternative is returned bare."""
        alts = [self._seq()]
        while self._peek() == '|':
            self._eat()
            alts.append(self._seq())
        return alts[0] if len(alts) == 1 else ('alt', alts)

    def _seq(self):
        """Parse ``atom { "," atom }``; a single item is returned bare."""
        items = [self._atom()]
        while self._peek() == ',':
            self._eat()
            items.append(self._atom())
        return items[0] if len(items) == 1 else ('seq', items)

    def _atom(self):
        """Parse one operand: ``[…]``, ``{…}``, ``(…)``, a terminal or a name."""
        t = self._peek()
        if t is None:
            raise SyntaxError('unexpected end of input')
        if t == '[':
            self._eat()
            b = self._body()
            self._eat(']')
            return ('opt', b)
        if t == '{':
            self._eat()
            b = self._body()
            self._eat('}')
            return ('rep', b)
        if t == '(':
            self._eat()
            b = self._body()
            self._eat(')')
            return b  # group — return inner node directly
        if t[0] == '"' or t[0].isupper():
            return ('lit', self._eat())
        if t[0].islower():
            return ('nt', self._eat())
        raise SyntaxError(f'unexpected token {t!r}')


# ═══════════════════════════════════════════════════════════════ 3. Normalise

def normalise(ebnf: dict) -> tuple:
    """
    Convert EBNF AST to plain BNF.

    Returns
    -------
    bnf : dict[name → list[list[str]]]
        Each inner list is one production; [] = ε production.
    origins : dict[helper_name → parent_rule_name]
        Maps generated helper names back to the rule that created them.
    """
    bnf: dict = {}
    origins: dict = {}
    ctr = _count()

    def fresh(tag: str, rule: str) -> str:
        """Create a new helper name ``_<tag><n>`` and record its parent rule."""
        h = f'_{tag}{next(ctr)}'
        origins[h] = rule
        return h

    def expand(node, rule: str, in_seq: bool = False) -> list:
        """
        Return a list of alternative symbol sequences for this AST node.

        in_seq: when True, an 'alt' node is wrapped in a fresh non-terminal
                instead of being inlined.  This prevents the cross-product
                expansion of  A , (B | C) , D  from producing two productions
                that both start with A — a common-prefix false positive that
                would be misreported as an LL(1) conflict.  The grammar is
                already left-factored at the EBNF level; this preserves that.
        """
        tag = node[0]

        if tag in ('lit', 'nt'):
            # Terminals and non-terminal references are single symbols.
            return [[node[1]]]

        if tag == 'seq':
            # Children of a seq are expanded with in_seq=True so that any
            # alt node inside the sequence becomes a fresh non-terminal.
            result = [[]]
            for child in node[1]:
                child_seqs = expand(child, rule, in_seq=True)
                result = [a + b for a in result for b in child_seqs]
            return result

        if tag == 'alt':
            if in_seq:
                # Alt inside a seq: wrap in a fresh non-terminal (_grpN).
                # Each alternative is expanded at top-level (in_seq=False).
                h = fresh('grp', rule)
                bnf[h] = [s for child in node[1]
                          for s in expand(child, rule, in_seq=False)]
                return [[h]]
            # Alt at the top level of a rule body: return alternatives directly.
            return [s for child in node[1]
                    for s in expand(child, rule, in_seq=False)]

        if tag == 'opt':
            # [ body ]  →  _optN = body | ε
            h = fresh('opt', rule)
            bnf[h] = expand(node[1], rule) + [[]]
            return [[h]]

        if tag == 'rep':
            # { body }  →  _repN = body , _repN | ε
            h = fresh('rep', rule)
            body_seqs = expand(node[1], rule)
            bnf[h] = [s + [h] for s in body_seqs] + [[]]
            return [[h]]

        raise ValueError(f'unknown AST tag {tag!r}')

    for name, node in ebnf.items():
        bnf[name] = expand(node, name)

    return bnf, origins


# ═══════════════════════════════════════════════════════════════ 4. FIRST / FOLLOW

def first_of_seq(seq: list, first: dict, bnf: dict) -> set:
    """
    FIRST set of a sequence of grammar symbols.
    Returns a set of terminal strings; includes EPSILON if the whole
    sequence can derive the empty string.

    Any symbol that is not a key of *bnf* is treated as a terminal.
    """
    result = set()
    for sym in seq:
        if sym not in bnf:  # terminal symbol
            result.add(sym)
            return result  # terminals never derive ε
        sym_first = first[sym]
        result |= sym_first - {EPSILON}
        if EPSILON not in sym_first:
            return result  # this symbol is not nullable — stop
    result.add(EPSILON)  # every symbol in seq was nullable
    return result


def compute_first(bnf: dict) -> dict:
    """Compute FIRST for every non-terminal by fixed-point iteration."""
    first = defaultdict(set)
    changed = True
    while changed:
        changed = False
        for name, prods in bnf.items():
            for prod in prods:
                new = first_of_seq(prod, first, bnf)
                if not new <= first[name]:
                    first[name] |= new
                    changed = True
    return first


def compute_follow(bnf: dict, first: dict, start: str) -> dict:
    """Compute FOLLOW for every non-terminal by fixed-point iteration.

    ``'$'`` (end of input) is seeded into FOLLOW(*start*).
    """
    follow = defaultdict(set)
    follow[start].add('$')
    changed = True
    while changed:
        changed = False
        for name, prods in bnf.items():
            for prod in prods:
                for i, sym in enumerate(prod):
                    if sym not in bnf:
                        continue  # skip terminals
                    # FIRST of what comes after sym in this production
                    rest_first = first_of_seq(prod[i + 1:], first, bnf)
                    before = len(follow[sym])
                    follow[sym] |= rest_first - {EPSILON}
                    if EPSILON in rest_first:
                        follow[sym] |= follow[name]
                    if len(follow[sym]) > before:
                        changed = True
    return follow


# ═══════════════════════════════════════════════════════════════ 5. LL(1) check

def predict_set(prod: list, name: str, first: dict, follow: dict, bnf: dict) -> set:
    """
    PREDICT(A → prod) = (FIRST(prod) − {ε}) ∪ (FOLLOW(A) if ε ∈ FIRST(prod))
    """
    f = first_of_seq(prod, first, bnf)
    p = f - {EPSILON}
    if EPSILON in f:
        p |= follow[name]
    return p


def check_ll1(bnf: dict, first: dict, follow: dict) -> list:
    """
    For each non-terminal check that all PREDICT sets are pairwise disjoint.
    Returns a list of conflict dicts with the keys 'rule', 'prod_i',
    'prod_j' and 'conflict' (the sorted list of overlapping tokens).
    """
    errors = []
    for name, prods in bnf.items():
        predicted = [(p, predict_set(p, name, first, follow, bnf)) for p in prods]
        # combinations() yields the pairs in the same i < j order as a
        # nested index loop would.
        for (prod_i, set_i), (prod_j, set_j) in combinations(predicted, 2):
            conflict = set_i & set_j
            if conflict:
                errors.append({
                    'rule': name,
                    'prod_i': prod_i,
                    'prod_j': prod_j,
                    'conflict': sorted(conflict),
                })
    return errors


# ═══════════════════════════════════════════════════════════════ 6. Main

def _fmt_prod(prod: list) -> str:
    """Render a production as space-separated symbols (ε when empty)."""
    return ' '.join(prod) if prod else EPSILON


def main():
    """Command-line entry point: ``ll1_check.py [grammar_file] [-v]``.

    Exits with status 0 when the grammar is LL(1) and with a non-zero status
    when the file cannot be read, cannot be parsed, or contains conflicts.
    """
    argv = sys.argv[1:]
    verbose = '-v' in argv
    positional = [a for a in argv if not a.startswith('-')]
    path = Path(positional[0]) if positional else Path('GRAMMAR.ebnf')

    # ── Load & parse ──────────────────────────────────────────────────────
    print(f'Checking {path} …')
    try:
        src = path.read_text(encoding='utf-8')
    except FileNotFoundError:
        sys.exit(f'error: file not found: {path}')
    except (OSError, UnicodeDecodeError) as exc:
        # Directories, permission problems, invalid UTF-8, …
        sys.exit(f'error: cannot read {path}: {exc}')

    try:
        ebnf = _Parser(tokenise(src)).parse_grammar()
    except SyntaxError as exc:
        sys.exit(f'EBNF parse error: {exc}')

    bnf, origins = normalise(ebnf)

    # A lowercase name that is referenced but never defined is handled as a
    # terminal by the FIRST/FOLLOW code; warn, since it is usually a typo.
    undefined = sorted({sym
                        for prods in bnf.values()
                        for prod in prods
                        for sym in prod
                        if sym not in bnf and sym[0].islower()})
    if undefined:
        print('warning: referenced but never defined '
              f'(treated as terminals): {", ".join(undefined)}',
              file=sys.stderr)
    if START not in bnf:
        print(f'warning: start symbol {START!r} is not defined in {path}',
              file=sys.stderr)

    first = compute_first(bnf)
    follow = compute_follow(bnf, first, START)
    errors = check_ll1(bnf, first, follow)

    # ── Summary line ──────────────────────────────────────────────────────
    named = sorted(n for n in bnf if not n.startswith('_'))
    helpers = sorted(n for n in bnf if n.startswith('_'))
    print(f' {len(named)} named rules, {len(helpers)} generated helper rules\n')

    # ── Optional verbose output ───────────────────────────────────────────
    if verbose:
        col = max((len(n) for n in named), default=0) + 2
        print('── FIRST sets (named rules) ──────────────────────────────')
        for n in named:
            syms = sorted(first[n] - {EPSILON})
            nullable = ' [nullable]' if EPSILON in first[n] else ''
            print(f' FIRST({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}{nullable}')
        print()
        print('── FOLLOW sets (named rules) ─────────────────────────────')
        for n in named:
            syms = sorted(follow[n])
            print(f' FOLLOW({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}')
        print()

    # ── LL(1) result ──────────────────────────────────────────────────────
    named_err = [e for e in errors if not e['rule'].startswith('_')]
    helper_err = [e for e in errors if e['rule'].startswith('_')]

    if not errors:
        print('✓ Grammar is LL(1) — no conflicts detected.')
        return

    print(f'✗ {len(errors)} conflict(s): '
          f'{len(named_err)} in named rules, '
          f'{len(helper_err)} in generated helpers\n')

    for e in named_err:
        print(f' Rule [{e["rule"]}]')
        print(f' alt A : {_fmt_prod(e["prod_i"])}')
        print(f' alt B : {_fmt_prod(e["prod_j"])}')
        print(f' ambiguous token(s): {e["conflict"]}\n')

    if helper_err:
        print(' Conflicts in generated helpers '
              '(each is linked back to its enclosing named rule):')
        for e in helper_err:
            orig = origins.get(e['rule'], '?')
            print(f' [{e["rule"]}] ← from rule [{orig}]')
            print(f' alt A : {_fmt_prod(e["prod_i"])}')
            print(f' alt B : {_fmt_prod(e["prod_j"])}')
            print(f' ambiguous token(s): {e["conflict"]}\n')

    # A verification tool must signal failure to its caller (CI, make, …).
    sys.exit(1)


if __name__ == '__main__':
    main()