Initial Flux language specification

Add the LL(1) context-free grammar (GRAMMAR.ebnf), token and syntax
reference (SYNTAX.md), LL(1) verification tool (ll1_check.py), and a
fibonacci example demonstrating the language.
This commit is contained in:
2026-03-10 14:41:54 +01:00
commit 73e36fac71
4 changed files with 1607 additions and 0 deletions

435
GRAMMAR.ebnf Normal file
View File

@@ -0,0 +1,435 @@
(* Flux Language Grammar — Context-Free LL(1) Grammar *)
(* ================================================================ *)
(* *)
(* Notation (ISO/IEC 14977 EBNF): *)
(* rule = definition ; defines a rule (terminated by ;) *)
(* a , b concatenation *)
(* a | b alternation *)
(* { a } zero or more repetitions of a *)
(* [ a ] optional a (zero or one) *)
(* ( a | b ) grouping *)
(* "literal" terminal string *)
(* *)
(* UPPERCASE identifiers are lexical token classes whose value *)
(* cannot be expressed as a single literal (e.g. IDENT, INT_LIT). *)
(* They are NOT defined here — see SYNTAX.md. *)
(* *)
(* Unique/fixed tokens are written as quoted literals directly. *)
(* *)
(* Lowercase identifiers are non-terminals (grammar productions). *)
(* ================================================================ *)
(* Program (start symbol) *)
(* ================================================================ *)
program = { top_level_def } ;
top_level_def = func_def
| struct_def ;
(* ================================================================ *)
(* Expressions *)
(* ================================================================ *)
expr = or_expr ;
(* --- Logical OR (lowest-precedence binary operator) --- *)
(* *)
(* Uses keyword `or`; left-associative via iteration. *)
or_expr = and_expr , { "or" , and_expr } ;
(* --- Logical AND --- *)
(* *)
(* Uses keyword `and`; left-associative via iteration. *)
and_expr = bitor_expr , { "and" , bitor_expr } ;
(* --- Bitwise OR --- *)
bitor_expr = bitxor_expr , { "|" , bitxor_expr } ;
(* --- Bitwise XOR --- *)
bitxor_expr = bitand_expr , { "^" , bitand_expr } ;
(* --- Bitwise AND --- *)
bitand_expr = additive_expr , { "&" , additive_expr } ;
(* --- Additive: addition and subtraction --- *)
additive_expr = multiplicative_expr ,
{ ( "+" | "-" ) , multiplicative_expr } ;
(* --- Multiplicative: multiplication, division, modulo --- *)
multiplicative_expr = unary_expr ,
{ ( "*" | "/" | "%" ) , unary_expr } ;
(* --- Unary operators (prefix, right-associative by recursion) --- *)
(* *)
(* "!" logical not *)
(* "~" bitwise not *)
(* "-" arithmetic negation *)
(* "*" dereference (pointer indirection) *)
(* "&" address-of *)
unary_expr = "!" , unary_expr
| "~" , unary_expr
| "-" , unary_expr
| "*" , unary_expr
| "&" , unary_expr
| postfix_expr ;
(* --- Postfix operators (left-associative via iteration) --- *)
(* *)
(* Postfix operators bind tighter than any prefix or binary form. *)
(* Multiple postfix operations chain left-to-right. *)
postfix_expr = primary_expr , { postfix_op } ;
postfix_op = "." , IDENT (* member access *)
| "[" , expr , "]" (* subscript/index *)
| "(" , arg_list , ")" ; (* function call *)
(* --- Primary expressions (highest precedence) --- *)
(* *)
(* LL(1) note: after IDENT, peek at the next token. *)
(* "{" → parse struct_lit_body (struct literal) *)
(* other → bare identifier reference *)
primary_expr = IDENT , [ struct_lit_body ] (* ident or struct lit *)
| INT_LIT
| FLOAT_LIT
| STRING_LIT
| CHAR_LIT
| "true"
| "false"
| "(" , expr , ")" ; (* parenthesised *)
(* --- Struct literal --- *)
(* *)
(* A struct literal constructs a value of a named struct type. *)
(* IDENT "{" field: expr, ... "}" *)
(* *)
(* Field order need not match the struct definition order. *)
(* No trailing comma is permitted (consistent with struct_def). *)
(* *)
(* LL(1) notes: *)
(* struct_field_list: "}" → ε; IDENT → first field *)
(* FIRST(struct_field) = {IDENT} *)
(* FOLLOW(struct_field_list) = {"}"} *)
(* Disjoint, so no look-ahead conflict. *)
struct_lit_body = "{" , struct_field_list , "}" ;
struct_field_list = [ struct_field , { "," , struct_field } ] ;
struct_field = IDENT , ":" , expr ;
(* ================================================================ *)
(* Argument List *)
(* ================================================================ *)
arg_list = [ expr , { "," , expr } ] ;
(* ================================================================ *)
(* No-Struct Expression Hierarchy (expr_ns) *)
(* ================================================================ *)
(* *)
(* Struct literals create an LL(1) ambiguity in if/while conditions:*)
(* if Point { x: 1 } { ... } *)
(* After "Point", "{" could open a struct literal OR the body block.*)
(* *)
(* Solution: define expr_ns identical to expr except *)
(* primary_expr_ns disallows the struct_lit_body suffix after IDENT.*)
(* Struct literals ARE still allowed when parenthesised: *)
(* if (Point { x: 1 }).flag { ... } *)
(* *)
(* if_stmt and while_stmt use expr_ns for their condition. *)
(* All other expression positions use the full expr. *)
expr_ns = or_expr_ns ;
or_expr_ns = and_expr_ns , { "or" , and_expr_ns } ;
and_expr_ns = bitor_expr_ns , { "and" , bitor_expr_ns } ;
bitor_expr_ns = bitxor_expr_ns , { "|" , bitxor_expr_ns } ;
bitxor_expr_ns = bitand_expr_ns , { "^" , bitand_expr_ns } ;
bitand_expr_ns = additive_expr_ns , { "&" , additive_expr_ns } ;
additive_expr_ns = multiplicative_expr_ns ,
{ ( "+" | "-" ) , multiplicative_expr_ns } ;
multiplicative_expr_ns = unary_expr_ns ,
{ ( "*" | "/" | "%" ) , unary_expr_ns } ;
unary_expr_ns = "!" , unary_expr_ns
| "~" , unary_expr_ns
| "-" , unary_expr_ns
| "*" , unary_expr_ns
| "&" , unary_expr_ns
| postfix_expr_ns ;
postfix_expr_ns = primary_expr_ns , { postfix_op } ;
(* primary_expr_ns: same as primary_expr but IDENT is never *)
(* followed by struct_lit_body. Note "(" , expr , ")" uses full *)
(* expr, so struct literals are permitted inside parentheses. *)
primary_expr_ns = IDENT (* bare ident only *)
| INT_LIT
| FLOAT_LIT
| STRING_LIT
| CHAR_LIT
| "true"
| "false"
| "(" , expr , ")" ; (* struct lit OK here *)
(* ================================================================ *)
(* Types *)
(* ================================================================ *)
type = primitive_type
| named_type
| pointer_type
| array_type ;
(* --- Primitive types --- *)
(* *)
(* Unsigned integers : u8 u16 u32 u64 *)
(* Signed integers : i8 i16 i32 i64 *)
(* Floating-point : f32 f64 *)
(* Other : bool char *)
primitive_type = "u8" | "u16" | "u32" | "u64"
| "i8" | "i16" | "i32" | "i64"
| "f32" | "f64"
| "bool" | "char" ;
(* --- Named types --- *)
(* *)
(* A user-defined type referenced by its identifier (e.g. a struct *)
(* name). The lexer guarantees that all primitive-type keywords are *)
(* reserved, so IDENT never clashes with primitive_type. *)
named_type = IDENT ;
(* --- Pointer types --- *)
(* *)
(* "*" type — typed pointer; the pointee type is known. *)
(* "*opaque" — untyped/opaque pointer (no pointee type info). *)
(* *)
(* LL(1) note: after "*", "opaque" is not in FIRST(type), so the *)
(* two alternatives are always distinguishable with one token. *)
pointer_type = "*" , ( "opaque" | type ) ;
(* --- Array types --- *)
(* *)
(* "[" type ";" INT_LIT "]" *)
(* *)
(* The element type and the fixed size (a non-negative integer *)
(* literal) are separated by ";". Sizes that are constant *)
(* expressions may be introduced in a later grammar revision. *)
array_type = "[" , type , ";" , INT_LIT , "]" ;
(* ================================================================ *)
(* Statements *)
(* ================================================================ *)
stmt = let_stmt
| return_stmt
| if_stmt
| while_stmt
| loop_stmt
| break_stmt
| continue_stmt
| block_stmt
| expr_stmt ;
(* --- Return statement --- *)
(* *)
(* Exits the enclosing function, optionally yielding a value. *)
(* "return ;" is used when the function return type is (). *)
(* *)
(* LL(1): after "return", peek at next token. *)
(* ";" → no expression (unit return) *)
(* other → parse expr, then expect ";" *)
(* ";" is not in FIRST(expr), so the two cases are unambiguous. *)
return_stmt = "return" , [ expr ] , ";" ;
(* --- Expression statement --- *)
(* *)
(* Evaluates an expression for its side effects; the value is *)
(* discarded. The ";" is mandatory. *)
(* *)
(* LL(1): at stmt level: *)
(* "let" → let_stmt *)
(* "return" → return_stmt *)
(* "if" → if_stmt *)
(* "while" → while_stmt *)
(* "loop" → loop_stmt *)
(* "break" → break_stmt *)
(* "continue" → continue_stmt *)
(* "{" → block_stmt *)
(* other → expr_stmt *)
expr_stmt = expr , ";" ;
(* --- If statement --- *)
(* *)
(* Conditionally executes a block. An optional "else" branch may *)
(* follow; it is either a plain block or another "if" statement, *)
(* enabling "else if" chains of arbitrary length. *)
(* *)
(* LL(1) notes: *)
(* condition uses expr_ns — struct literals are forbidden at the *)
(* outermost level to avoid ambiguity with the body block's "{". *)
(* [ "else" ... ] — consume "else" iff next token is "else" *)
(* else_branch: "if" → if_stmt (else-if); "{" → block_stmt *)
(* The two else_branch alternatives start with distinct tokens, *)
(* so no look-ahead conflict arises (no dangling-else ambiguity). *)
if_stmt = "if" , expr_ns , block_stmt , [ "else" , else_branch ] ;
else_branch = if_stmt (* else if *)
| block_stmt ; (* plain else *)
(* --- While loop --- *)
(* *)
(* Repeatedly executes the body as long as the condition is true. *)
(* The condition is re-evaluated before every iteration. *)
(* If the condition is false on the first check, the body never *)
(* executes. *)
(* *)
(* Like if_stmt, the condition uses expr_ns to prevent struct *)
(* literal ambiguity with the body block's opening "{". *)
while_stmt = "while" , expr_ns , block_stmt ;
(* --- Infinite loop --- *)
(* *)
(* Executes the body unconditionally and indefinitely. The only *)
(* ways to exit are "break" or "return" inside the body. *)
loop_stmt = "loop" , block_stmt ;
(* --- Break and continue --- *)
(* *)
(* "break" exits the immediately enclosing "while" or "loop". *)
(* "continue" skips the rest of the current iteration and jumps to *)
(* the next condition check (while) or iteration (loop). *)
(* Both are only valid inside a loop body; the compiler enforces *)
(* this as a semantic rule. *)
break_stmt = "break" , ";" ;
continue_stmt = "continue" , ";" ;
(* --- Block statement --- *)
(* *)
(* A block groups zero or more statements into a single statement *)
(* and introduces a new lexical scope. It does not produce a value. *)
(* *)
(* LL(1): at stmt level, "{" unambiguously selects block since no *)
(* other stmt alternative starts with "{". *)
block_stmt = "{" , { stmt } , "}" ;
(* --- Let statement --- *)
(* *)
(* Introduces a named binding in the current scope. *)
(* Bindings are immutable by default; "mut" opts into mutability. *)
(* *)
(* The type annotation and the initialiser are both optional, but *)
(* at least one must be present for the binding to be usable; *)
(* the compiler enforces this as a semantic (not syntactic) rule. *)
(* *)
(* LL(1) notes: *)
(* [ "mut" ] consume "mut" iff the next token is "mut" *)
(* [ ":" ... ] consume iff next token is ":" *)
(* [ "=" ... ] consume iff next token is "=" *)
(* All decision tokens are distinct, so no look-ahead conflict. *)
let_stmt = "let" , [ "mut" ] , IDENT ,
[ ":" , type ] ,
[ "=" , expr ] ,
";" ;
(* ================================================================ *)
(* Top-Level Definitions *)
(* ================================================================ *)
(* --- Function definition --- *)
(* *)
(* Defines a named function with a typed parameter list and an *)
(* optional return type. Omitting "->" implies a return type of (). *)
(* *)
(* LL(1) notes: *)
(* param_list: ")" → ε (empty list); else parse first param *)
(* param: "mut" → consume; IDENT → skip (mut absent) *)
(* [ "->" ... ]: consume iff next token is "->" *)
(* "->" is a two-character token; distinct from all stmt-starting *)
(* tokens, so no look-ahead conflict with block_stmt that follows *)
func_def = "fn" , IDENT , "(" , param_list , ")" ,
[ "->" , type ] ,
block_stmt ;
param_list = [ param , { "," , param } ] ;
(* Each parameter is an optionally-mutable name with a required *)
(* type annotation. Mutability applies within the function body. *)
param = [ "mut" ] , IDENT , ":" , type ;
(* --- Struct definition --- *)
(* *)
(* Defines a named product type with zero or more typed fields. *)
(* Fields are separated by commas; no trailing comma is permitted. *)
(* *)
(* LL(1) notes: *)
(* field_list: "}" → ε (empty struct); else parse first field *)
(* FIRST(field) = {IDENT}, FOLLOW(field_list) = {"}"} *)
(* Disjoint, so no look-ahead conflict. *)
(* top_level_def: "fn" → func_def; "struct" → struct_def *)
struct_def = "struct" , IDENT , "{" , field_list , "}" ;
field_list = [ field , { "," , field } ] ;
field = IDENT , ":" , type ;

803
SYNTAX.md Normal file
View File

@@ -0,0 +1,803 @@
# Flux Language Syntax Reference
## Lexical Tokens
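All tokens listed here are produced by the lexer (lexical analysis phase).
Token classes with variable content (`IDENT`, `INT_LIT`, `FLOAT_LIT`,
`STRING_LIT`, `CHAR_LIT`) appear as UPPERCASE terminals in `GRAMMAR.ebnf`;
fixed tokens such as operators, keywords, and punctuation appear there as
quoted literals (e.g. `"+"`, `"fn"`, `"true"`).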
### Literals
| Token | Description | Examples |
| ------------ | ------------------------------------------------------------------- | ------------------------------ |
| `INT_LIT` | Integer literal (decimal, hex `0x`, octal `0o`, binary `0b`) | `42`, `0xFF`, `0o77`, `0b1010` |
| `FLOAT_LIT` | Floating-point literal | `3.14`, `1.0e-9`, `0.5` |
| `STRING_LIT` | Double-quoted UTF-8 string, supports `\n \t \\ \"` escape sequences | `"hello\nworld"` |
| `CHAR_LIT` | Single-quoted Unicode scalar value | `'a'`, `'\n'`, `'\u{1F600}'` |
| `TRUE` | Boolean true literal | `true` |
| `FALSE` | Boolean false literal | `false` |
### Identifier
| Token | Description |
| ------- | ------------------------------------------------------------------------------------------------------------ |
| `IDENT` | Identifier: starts with a letter or `_`, followed by letters, digits, or `_`. Unicode letters are permitted. |
### Operator Tokens
| Token | Lexeme | Description |
| --------- | ------ | -------------------------------------- |
| `PLUS` | `+` | Addition (unary plus is not part of the grammar) |
| `MINUS` | `-` | Subtraction / unary negation |
| `STAR` | `*` | Multiplication / pointer dereference |
| `SLASH` | `/` | Division |
| `PERCENT` | `%` | Modulo (remainder) |
| `AMP` | `&` | Bitwise AND / address-of |
| `PIPE` | `\|` | Bitwise OR |
| `CARET` | `^` | Bitwise XOR |
| `BANG` | `!` | Logical NOT |
| `TILDE` | `~` | Bitwise NOT |
| `DOT` | `.` | Member access |
### Keyword Tokens
#### Operator Keywords
| Lexeme | Description |
| ------ | ----------- |
| `and` | Logical AND |
| `or` | Logical OR |
#### Boolean Literals
| Lexeme | Description |
| ------- | ------------------- |
| `true` | Boolean true value |
| `false` | Boolean false value |
#### Primitive Type Keywords
| Lexeme | Description |
| ------ | ------------------------------ |
| `u8` | Unsigned 8-bit integer |
| `u16` | Unsigned 16-bit integer |
| `u32` | Unsigned 32-bit integer |
| `u64` | Unsigned 64-bit integer |
| `i8` | Signed 8-bit integer |
| `i16` | Signed 16-bit integer |
| `i32` | Signed 32-bit integer |
| `i64` | Signed 64-bit integer |
| `f32` | 32-bit IEEE 754 floating-point |
| `f64` | 64-bit IEEE 754 floating-point |
| `bool` | Boolean (`true` or `false`) |
| `char` | Unicode scalar value (32-bit) |
#### Pointer Keyword
| Lexeme | Description |
| -------- | ------------------------------------------------------- |
| `opaque` | Used in `*opaque` to denote a pointer with no type info |
#### Statement Keywords
| Lexeme | Description |
| ---------- | ------------------------------------- |
| `let` | Introduces a variable binding |
| `mut` | Marks a `let` binding or function parameter as mutable |
| `return` | Exits the enclosing function |
| `if` | Conditional statement |
| `else` | Alternative branch of an `if` |
| `while` | Condition-controlled loop |
| `loop` | Infinite loop |
| `break` | Exit the immediately enclosing loop |
| `continue` | Skip to the next iteration of a loop |
#### Definition Keywords
| Lexeme | Description |
| -------- | -------------------------------- |
| `fn` | Introduces a function definition |
| `struct` | Introduces a struct definition |
> **Lexer note:** All keywords above are reserved and must be recognised before
> the general `IDENT` rule. An identifier may not shadow any keyword.
### Delimiter / Punctuation Tokens
| Token | Lexeme | Description |
| ----------- | ------ | ------------------------------------------------------ |
| `LPAREN` | `(` | Left parenthesis |
| `RPAREN` | `)` | Right parenthesis |
| `LBRACKET` | `[` | Left square bracket |
| `RBRACKET` | `]` | Right square bracket |
| `COMMA` | `,` | Argument / element separator |
| `SEMICOLON` | `;` | Statement terminator / array size separator (`[T; N]`) |
| `LCURLY` | `{` | Block / compound expression open |
| `RCURLY` | `}` | Block / compound expression close |
| `ARROW` | `->` | Function return type separator |
| `COLON` | `:` | Type annotation separator |
| `ASSIGN` | `=` | Initialiser separator in `let` statements |
---
## Expressions
Expressions produce a value. The grammar defines them through a hierarchy of
precedence levels. The table below lists them from lowest precedence (binds
least tightly) at the top to highest precedence (binds most tightly) at the
bottom.
### Operator Precedence Table
| Level | Operators | Associativity | Description |
| ----- | --------------------------- | -------------- | -------------------------------- |
| 1 | `or` | left | Logical OR (lowest) |
| 2 | `and` | left | Logical AND |
| 3 | `\|` | left | Bitwise OR |
| 4 | `^` | left | Bitwise XOR |
| 5 | `&` | left | Bitwise AND |
| 6 | `+` `-` | left | Addition, subtraction |
| 7 | `*` `/` `%` | left | Multiplication, division, modulo |
| 8 | `!` `~` `-` `*` `&` | right (unary) | Prefix unary operators |
| 9 | `.` `[…]` `(…)` | left (postfix) | Member access, index, call |
| 10 | literals, identifiers, `()` | — | Primary expressions (highest) |
### Operator Descriptions
#### Binary Operators
| Operator | Name | Example | Notes |
| -------- | -------------- | --------- | -------------------------------------------- |
| `or` | Logical OR | `a or b` | Short-circuits; both operands must be `bool` |
| `and` | Logical AND | `a and b` | Short-circuits; both operands must be `bool` |
| `\|` | Bitwise OR | `a \| b` | Integer types |
| `^` | Bitwise XOR | `a ^ b` | Integer types |
| `&` | Bitwise AND | `a & b` | Integer types (binary context) |
| `+` | Addition | `a + b` | |
| `-` | Subtraction | `a - b` | |
| `*` | Multiplication | `a * b` | Binary context (both operands are values) |
| `/` | Division | `a / b` | Integer division truncates toward zero |
| `%` | Modulo | `a % b` | Sign follows the dividend |
#### Unary Prefix Operators
| Operator | Name | Example | Notes |
| -------- | ----------- | ------- | ------------------------------------------------ |
| `!` | Logical NOT | `!cond` | Operand must be `bool` |
| `~` | Bitwise NOT | `~mask` | Bitwise complement; integer types |
| `-` | Negation | `-x` | Arithmetic negation |
| `*` | Dereference | `*ptr` | Unary context; operand must be a pointer type |
| `&` | Address-of | `&x` | Unary context; produces a pointer to the operand |
#### Postfix Operators
| Operator | Name | Example | Notes |
| -------- | ------------- | ----------- | ------------------------------------------------- |
| `.` | Member access | `obj.field` | Accesses a named field or method of a struct/type |
| `[…]` | Subscript | `arr[i]` | Indexes into an array, slice, or map |
| `(…)` | Call | `f(a, b)` | Invokes a function or closure |
> **Disambiguation:** `*` and `&` are context-sensitive.
> When appearing as the first token of a `unary_expr` they are **unary**
> (dereference / address-of). When appearing between two `unary_expr`
> sub-trees inside `multiplicative_expr` or `bitand_expr` they are **binary**
> (multiplication / bitwise AND). The parser resolves this purely from
> grammatical position — no look-ahead beyond 1 token is required.
### Parenthesised Expressions
Any expression may be wrapped in parentheses to override default precedence:
```
(a + b) * c
```
### Function Call Argument List
Arguments are comma-separated expressions. A trailing comma is **not**
permitted at this grammar level.
```
f()
f(x)
f(x, y, z)
```
### Examples
```flux
// Arithmetic
a + b * c - d % 2
// Bitwise
flags & MASK | extra ^ toggle
// Logical
ready and not_done or fallback
// Mixed unary / postfix
*ptr.field
&arr[i]
!cond
// Chained postfix
obj.method(arg1, arg2)[0].name
// Explicit precedence override
(a or b) and c
```
---
## Types
Types describe the shape and interpretation of values. All type positions in
the grammar reference the `type` non-terminal.
### Primitive Types
Primitive types are single-keyword types built into the language.
| Type | Kind | Width | Range / Notes |
| ------ | ---------------- | ------ | ------------------------------------------ |
| `u8` | Unsigned integer | 8-bit | 0 … 255 |
| `u16` | Unsigned integer | 16-bit | 0 … 65 535 |
| `u32` | Unsigned integer | 32-bit | 0 … 4 294 967 295 |
| `u64` | Unsigned integer | 64-bit | 0 … 2⁶⁴ − 1 |
| `i8` | Signed integer | 8-bit | −128 … 127 |
| `i16` | Signed integer | 16-bit | −32 768 … 32 767 |
| `i32` | Signed integer | 32-bit | −2 147 483 648 … 2 147 483 647 |
| `i64` | Signed integer | 64-bit | −2⁶³ … 2⁶³ − 1 |
| `f32` | Floating-point | 32-bit | IEEE 754 single precision |
| `f64` | Floating-point | 64-bit | IEEE 754 double precision |
| `bool` | Boolean | 1 byte | `true` or `false` |
| `char` | Unicode scalar | 32-bit | Any Unicode scalar value (not a surrogate) |
### Named Types
A named type is any user-defined type referenced by its identifier — typically a struct name. Because all primitive-type keywords (`u8`, `bool`, etc.) are reserved, an `IDENT` in type position is always a named type, never a primitive.
```flux
Point // struct Point { x: f32, y: f32 }
Node // struct Node { value: i64, next: *Node }
*Point // pointer to a named type
[Node; 8] // array of a named type
```
### Pointer Types
A pointer type is written with a leading `*`.
| Syntax | Description |
| --------- | ------------------------------------------------------------------------------------- |
| `*T` | Typed pointer — points to a value of type `T` |
| `*opaque` | Opaque pointer — no compile-time pointee type information; equivalent to C's `void *` |
Pointer types may be nested: `**u8` is a pointer to a pointer to `u8`.
```flux
*u8 // pointer to u8
**i32 // pointer to pointer to i32
*opaque // untyped pointer
**opaque // pointer to untyped pointer
```
### Array Types
Arrays have a fixed size known at compile time.
```
[ <element-type> ; <size> ]
```
`<size>` must be a non-negative integer literal (`INT_LIT`). The element type
may itself be any `type`, including pointers or nested arrays.
```flux
[u8; 256] // array of 256 u8 values
[*u8; 4] // array of 4 pointers to u8
[[f32; 3]; 3] // 3×3 matrix of f32 (array of arrays)
[*opaque; 8] // array of 8 opaque pointers
```
### Type Grammar Summary
```ebnf
type = primitive_type | named_type | pointer_type | array_type ;
primitive_type = "u8" | "u16" | "u32" | "u64"
| "i8" | "i16" | "i32" | "i64"
| "f32" | "f64" | "bool" | "char" ;
named_type = IDENT ;
pointer_type = "*" , ( "opaque" | type ) ;
array_type = "[" , type , ";" , INT_LIT , "]" ;
```
---
## Struct Literals
A struct literal constructs a value of a named struct type by providing values for each field.
```
<TypeName> { <field>: <expr>, ... }
```
Fields may appear in any order and need not match the declaration order. No trailing comma is permitted.
### Examples
```flux
let p = Point { x: 1.0, y: 2.0 };
let n = Node {
value: 42,
next: get_next()
};
// Nested struct literal
let outer = Rect {
origin: Point { x: 0.0, y: 0.0 },
size: Point { x: 10.0, y: 5.0 }
};
// Empty struct
let u = Unit {};
```
### Struct Literals in Conditions
Struct literals are **not permitted** as the outermost expression in `if` and `while` conditions. This restriction exists because `{` after the condition is ambiguous — it could start a struct literal body or the statement block.
```flux
// ERROR — ambiguous: is `{` a struct body or the if block?
if Flags { verbose: true } { ... }
// OK — parentheses resolve the ambiguity
if (Flags { verbose: true }).verbose { ... }
```
The grammar enforces this through the `expr_ns` (no-struct) hierarchy used in condition positions. Struct literals remain valid everywhere else: `let`, `return`, function arguments, field values, etc.
### Struct Literal Grammar Summary
```ebnf
primary_expr = IDENT , [ struct_lit_body ] | INT_LIT | FLOAT_LIT
| STRING_LIT | CHAR_LIT | "true" | "false"
| "(" , expr , ")" ;
struct_lit_body = "{" , struct_field_list , "}" ;
struct_field_list = [ struct_field , { "," , struct_field } ] ;
struct_field = IDENT , ":" , expr ;
```
### No-Struct Expression (`expr_ns`)
`expr_ns` is a parallel expression hierarchy identical to `expr` except its primary level (`primary_expr_ns`) does not allow the `struct_lit_body` suffix after an `IDENT`. Struct literals are still permitted when enclosed in parentheses (`"(" , expr , ")"`), because the `(` unambiguously marks the start of a grouped expression.
`if_stmt` and `while_stmt` use `expr_ns` for their condition; all other expression positions use the full `expr`.
---
## Statements
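Statements perform an action and do not produce a value. Simple statements
(`let`, `return`, `break`, `continue`, and expression statements) are
terminated by a semicolon `;`. Statements that end in a block (`if`, `while`,
`loop`, and bare blocks) take no trailing semicolon.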
### Let Statement
Introduces a new named binding in the current scope.
```
let [mut] <name> [: <type>] [= <expr>] ;
```
| Part | Required | Description |
| ---------- | -------- | --------------------------------------------- |
| `mut` | no | Makes the binding mutable; omit for immutable |
| `<name>` | yes | The identifier being bound |
| `: <type>` | no | Explicit type annotation |
| `= <expr>` | no | Initialiser expression |
| `;` | yes | Statement terminator |
Bindings are **immutable by default**. Attempting to assign to a binding
declared without `mut` is a compile-time error.
At least one of the type annotation or the initialiser must be present so the
compiler can determine the binding's type. This is a semantic constraint, not a
syntactic one — the grammar permits bare `let x;` and the type checker rejects
it if no type can be inferred from context.
#### Examples
```flux
// Immutable, type inferred from initialiser
let x = 42;
// Immutable, explicit type
let y: f64 = 3.14;
// Mutable, type inferred
let mut count = 0;
// Mutable, explicit type, no initialiser (must be assigned before use)
let mut buf: [u8; 128];
// Mutable binding holding a pointer to u32
let mut ptr: *u32 = &value;
// Shadowing a previous binding is allowed
let x = "hello"; // x is now a string, previous x is gone
```
### Return Statement
Exits the enclosing function immediately, optionally producing a return value.
```
return [<expr>] ;
```
`return;` (no expression) is used when the function's return type is the unit
type `()`. `return <expr>;` returns the value of the expression.
Blocks do not produce a value, so there is no implicit return: a function that
returns a value must do so with an explicit `return <expr>;`.
```flux
return; // unit return
return 42; // return an integer
return x * 2 + 1; // return an expression
```
### Expression Statement
Evaluates an expression for its side effects; the resulting value is
discarded. A semicolon is required.
```
<expr> ;
```
```flux
do_something(x); // call for side effects
count + 1; // legal but silly — value discarded
```
### Statement Grammar Summary
```ebnf
stmt = let_stmt | return_stmt | if_stmt
| while_stmt | loop_stmt | break_stmt | continue_stmt
| block_stmt | expr_stmt ;
let_stmt = "let" , [ "mut" ] , IDENT , [ ":" , type ] , [ "=" , expr ] , ";" ;
return_stmt = "return" , [ expr ] , ";" ;
if_stmt = "if" , expr_ns , block_stmt , [ "else" , else_branch ] ;
else_branch = if_stmt | block_stmt ;
while_stmt = "while" , expr_ns , block_stmt ;
loop_stmt = "loop" , block_stmt ;
break_stmt = "break" , ";" ;
continue_stmt = "continue" , ";" ;
block_stmt = "{" , { stmt } , "}" ;
expr_stmt = expr , ";" ;
```
---
## If Statement
Conditionally executes a block based on a boolean expression.
```
if <cond> <block> [else <else-branch>]
```
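The condition `<cond>` must be an expression of type `bool`. The body is
always a `block_stmt` — braces are mandatory.
Note: the comparison operators used in the examples in this document (`<`,
`>`, `==`) and the assignment form `x = <expr>;` are not yet defined in
`GRAMMAR.ebnf`.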
### Else Branch
The optional `else` branch is either a plain block or another `if` statement,
enabling `else if` chains of arbitrary length.
```flux
if x > 0 {
pos();
}
if x > 0 {
pos();
} else {
non_pos();
}
if x > 0 {
pos();
} else if x < 0 {
neg();
} else {
zero();
}
```
### If Statement Grammar Summary
```ebnf
if_stmt = "if" , expr_ns , block_stmt , [ "else" , else_branch ] ;
else_branch = if_stmt | block_stmt ;
```
---
## While Loop
Repeatedly executes a block as long as a boolean condition holds. The
condition is tested before each iteration; if it is false on entry, the body
never runs.
```
while <cond> <block>
```
```flux
let mut i = 0;
while i < 10 {
process(i);
i = i + 1;
}
```
### While Loop Grammar Summary
```ebnf
while_stmt = "while" , expr_ns , block_stmt ;
```
---
## Loop
Executes a block unconditionally and indefinitely. The loop runs until a
`break` or `return` inside the body transfers control out.
```
loop <block>
```
```flux
loop {
let msg = recv();
if msg.is_quit() {
break;
}
handle(msg);
}
```
### Loop Grammar Summary
```ebnf
loop_stmt = "loop" , block_stmt ;
```
---
## Break and Continue
`break` and `continue` are only valid inside the body of a `while` or `loop`.
The compiler enforces this as a semantic rule.
| Statement | Effect |
| ------------ | -------------------------------------------------------------- |
| `break ;` | Exits the immediately enclosing loop |
| `continue ;` | Skips the rest of the current iteration; jumps to the next one |
For `while`, `continue` jumps back to the condition check. For `loop`,
`continue` jumps back to the top of the body.
```flux
let mut i = 0;
while i < 20 {
i = i + 1;
if i % 2 == 0 {
continue; // skip even numbers
}
if i > 15 {
break; // stop after 15
}
process(i);
}
```
### Break / Continue Grammar Summary
```ebnf
break_stmt = "break" , ";" ;
continue_stmt = "continue" , ";" ;
```
---
## Block Statement
A block groups zero or more statements into a single statement and introduces
a new lexical scope. Blocks do not produce a value.
```
{ <stmt>* }
```
### Scoping
Bindings declared inside a block are not visible outside it. A binding in an
inner scope may shadow a name from an outer scope without affecting it.
```flux
let x = 1;
{
let x = 2; // shadows outer x inside this block only
f(x); // uses 2
}
// x is still 1 here
```
### Nesting
Blocks may be nested freely to any depth.
```flux
{
let a = compute_a();
{
let b = compute_b();
use(a, b);
}
// b is no longer in scope here
}
```
### Block Grammar Summary
```ebnf
block_stmt = "{" , { stmt } , "}" ;
```
---
## Top-Level Definitions
A Flux source file is a sequence of top-level definitions.
```ebnf
program = { top_level_def } ;
top_level_def = func_def | struct_def ;
```
The leading token unambiguously selects the definition kind: `fn` → function,
`struct` → struct.
---
## Function Definition
Defines a named, callable function.
```
fn <name> ( [<params>] ) [-> <return-type>] <block>
```
| Part | Required | Description |
| ------------------ | -------- | -------------------------------------------------------- |
| `<name>` | yes | The function's identifier |
| `( [<params>] )` | yes | Comma-separated parameter list, may be empty |
| `-> <return-type>` | no | Return type; omitting it means the function returns `()` |
| `<block>` | yes | Function body — a `block_stmt` |
### Parameters
Each parameter is a name with a mandatory type annotation. Parameters are
immutable by default; `mut` makes the local binding mutable within the body.
```
[mut] <name> : <type>
```
```flux
fn add(a: i32, b: i32) -> i32 {
return a + b;
}
fn greet(name: *u8) {
print(name);
}
fn increment(mut x: i32) -> i32 {
x = x + 1;
return x;
}
fn apply(f: *opaque, mut buf: [u8; 64]) -> bool {
return call(f, &buf);
}
```
### Return Type
If `->` is omitted the return type is implicitly `()` (the unit type). An
explicit `-> ()` is also permitted but redundant.
```flux
fn do_work() { // returns ()
side_effect();
}
fn get_value() -> i64 { // returns i64
return 42;
}
```
### Function Definition Grammar Summary
```ebnf
func_def = "fn" , IDENT , "(" , param_list , ")" , [ "->" , type ] , block_stmt ;
param_list = [ param , { "," , param } ] ;
param = [ "mut" ] , IDENT , ":" , type ;
```
---
## Struct Definition
Defines a named product type with zero or more typed fields.
```
struct <name> {
<field>: <type>,
...
}
```
Fields are separated by commas. No trailing comma is permitted. An empty
struct (zero fields) is valid.
### Fields
Each field is a name and a type. Fields may be of any type including pointers,
arrays, and other structs. Field names must be unique within the struct.
```flux
struct Point {
x: f32,
y: f32
}
struct Node {
value: i64,
next: *Node
}
struct Buffer {
data: *u8,
len: u64,
cap: u64
}
struct Unit {}
```
### Member Access
Fields of a struct value are accessed with the `.` operator (defined in the
expression grammar). If the value is behind a pointer, dereference it first
with `*`.
```flux
let p: Point = make_point();
let x = p.x;
let ptr: *Point = get_point_ptr();
let y = (*ptr).y;
```
### Struct Definition Grammar Summary
```ebnf
struct_def = "struct" , IDENT , "{" , field_list , "}" ;
field_list = [ field , { "," , field } ] ;
field = IDENT , ":" , type ;
```

7
examples/fibonacci.flx Normal file
View File

@@ -0,0 +1,7 @@
// Returns the n-th Fibonacci number: `n` itself when `n < 2`, otherwise the
// sum of the two preceding values, each obtained through a recursive call.
// NOTE(review): `return n` yields a `u8` where `u64` is declared — confirm
// that Flux permits this widening implicitly.
fn fibonacci(n: u8) -> u64 {
    // Base cases: fibonacci(0) = 0 and fibonacci(1) = 1.
    if n < 2 {
        return n;
    }
    // Recursive case: two self-calls per invocation.
    return fibonacci(n - 1) + fibonacci(n - 2);
}

362
ll1_check.py Normal file
View File

@@ -0,0 +1,362 @@
#!/usr/bin/env python3
"""
ll1_check.py — Parse GRAMMAR.ebnf and verify the LL(1) property.
Usage: python ll1_check.py [grammar_file] [-v]
Algorithm
---------
1. Strip (* … *) comments; tokenise.
2. Parse ISO/IEC 14977 EBNF into an AST.
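3. Normalise to plain BNF by introducing fresh helper non-terminals:
     { body }  → _repN   where  _repN = body , _repN | ε
     [ body ]  → _optN   where  _optN = body | ε
     ( a | b ) → _grpN   where  _grpN = a | b   (alternation nested in a
                 sequence; wrapped instead of cross-multiplied so that no
                 common prefix is duplicated across productions)
     ( body )  → inlined in every other position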
4. Compute FIRST and FOLLOW sets (fixed-point iteration).
5. For each non-terminal compute PREDICT sets; flag pairwise conflicts.
"""
import re
import sys
from collections import defaultdict
from itertools import count as _count
from pathlib import Path
# Sentinel stored in FIRST sets to record that a symbol or sequence can derive
# the empty string; also used when printing an empty production.
EPSILON = 'ε'
# Name of the grammar's start rule; main() passes it to compute_follow(),
# which seeds that rule's FOLLOW set with the end-of-input marker '$'.
START = 'program' # grammar start symbol
# ═══════════════════════════════════════════════════════════════ 1. Tokenise
# One grammar token: a quoted terminal, a token-class name, a rule name, or a
# single punctuation character of the EBNF notation.
_TOK = re.compile(
    r'"(?:[^"\\]|\\.)*"'    # "quoted terminal string"
    r'|[A-Z][A-Z0-9_]*'     # UPPERCASE token class (terminal)
    r'|[a-z][a-z0-9_]*'     # lowercase identifier (non-terminal)
    r'|[=;,|()\[\]{}]'      # single-char punctuation
)

# Comments and tokens are recognised in ONE left-to-right scan.  Stripping
# comments in a separate pass first would treat the characters `(*` inside a
# quoted terminal (e.g. "(*") as the start of a comment and destroy the
# literal; in a single scan whichever construct starts first wins.
_SCAN = re.compile(
    r'(?P<comment>\(\*[\s\S]*?\*\))'
    r'|(?P<tok>' + _TOK.pattern + r')'
)


def tokenise(src: str) -> list:
    """
    Split EBNF source text into a flat list of token strings.

    (* … *) comments (possibly spanning several lines) are discarded; quoted
    terminals keep their surrounding double quotes.  Characters that match
    neither a comment nor a token (whitespace, stray symbols) are skipped.

    Parameters
    ----------
    src : the full text of the grammar file.

    Returns
    -------
    list[str] : tokens in source order.
    """
    return [m.group('tok') for m in _SCAN.finditer(src)
            if m.lastgroup == 'tok']
# ═══════════════════════════════════════════════════════════════ 2. Parse EBNF → AST
#
# Each AST node is a tuple:
# ('lit', s) terminal — quoted string "…" or UPPERCASE token class
# ('nt', s) non-terminal reference
# ('seq', [...]) concatenation (A , B , C)
# ('alt', [...]) alternation (A | B | C)
# ('opt', node) optional [ … ]
# ('rep', node) repetition { … }
class _Parser:
def __init__(self, tokens):
self._t = tokens
self._i = 0
def _peek(self):
return self._t[self._i] if self._i < len(self._t) else None
def _eat(self, expected=None):
v = self._t[self._i]; self._i += 1
if expected and v != expected:
raise SyntaxError(f'expected {expected!r}, got {v!r} '
f'(token #{self._i - 1})')
return v
def parse_grammar(self) -> dict:
rules = {}
while self._i < len(self._t):
name = self._eat()
self._eat('=')
rules[name] = self._body()
self._eat(';')
return rules
def _body(self):
alts = [self._seq()]
while self._peek() == '|':
self._eat()
alts.append(self._seq())
return alts[0] if len(alts) == 1 else ('alt', alts)
def _seq(self):
items = [self._atom()]
while self._peek() == ',':
self._eat()
items.append(self._atom())
return items[0] if len(items) == 1 else ('seq', items)
def _atom(self):
t = self._peek()
if t == '[':
self._eat(); b = self._body(); self._eat(']')
return ('opt', b)
if t == '{':
self._eat(); b = self._body(); self._eat('}')
return ('rep', b)
if t == '(':
self._eat(); b = self._body(); self._eat(')')
return b # group — return inner node directly
if t and (t[0] == '"' or t[0].isupper()):
return ('lit', self._eat())
if t and t[0].islower():
return ('nt', self._eat())
raise SyntaxError(f'unexpected token {t!r}')
# ═══════════════════════════════════════════════════════════════ 3. Normalise
def normalise(ebnf: dict) -> tuple:
    """
    Lower an EBNF AST into plain BNF productions.

    Returns
    -------
    bnf : dict[name → list[list[str]]]
        Every inner list is a single production; [] stands for ε.
    origins : dict[helper_name → parent_rule_name]
        For each generated helper, the named rule whose body produced it.
    """
    productions: dict = {}
    helper_owner: dict = {}
    serial = _count()

    def new_helper(kind: str, owner: str) -> str:
        # Helpers share one running counter, so names are unique file-wide.
        name = '_' + kind + str(next(serial))
        helper_owner[name] = owner
        return name

    def lower_each(children, owner: str) -> list:
        # Concatenate the alternatives of several nodes, each lowered as a
        # stand-alone (non-sequence) body.
        flat = []
        for child in children:
            flat.extend(lower(child, owner, in_seq=False))
        return flat

    def lower(node, owner: str, in_seq: bool = False) -> list:
        """
        Produce the list of alternative symbol sequences for one AST node.

        in_seq is True while lowering a member of a concatenation.  In that
        position an alternation is replaced by a fresh `_grpN` non-terminal
        rather than multiplied out, because multiplying out A , (B | C) , D
        would create two productions sharing the prefix A and trigger a
        spurious LL(1) conflict report.
        """
        kind = node[0]

        if kind in ('lit', 'nt'):
            return [[node[1]]]

        if kind == 'seq':
            # Members are lowered with in_seq=True, then joined pairwise.
            prefixes = [[]]
            for part in node[1]:
                suffixes = lower(part, owner, in_seq=True)
                prefixes = [head + tail
                            for head in prefixes
                            for tail in suffixes]
            return prefixes

        if kind == 'alt':
            if not in_seq:
                # Alternation forming a whole body: hand back its branches.
                return lower_each(node[1], owner)
            # Alternation nested in a sequence: hide it behind a helper.
            # The helper is named before its branches are lowered.
            helper = new_helper('grp', owner)
            productions[helper] = lower_each(node[1], owner)
            return [[helper]]

        if kind == 'opt':
            # [ body ]  becomes  _optN = body | ε
            helper = new_helper('opt', owner)
            productions[helper] = [*lower(node[1], owner), []]
            return [[helper]]

        if kind == 'rep':
            # { body }  becomes  _repN = body , _repN | ε
            helper = new_helper('rep', owner)
            productions[helper] = [[*body, helper]
                                   for body in lower(node[1], owner)] + [[]]
            return [[helper]]

        raise ValueError(f'unknown AST tag {kind!r}')

    for rule_name, rule_body in ebnf.items():
        productions[rule_name] = lower(rule_body, rule_name)
    return productions, helper_owner
# ═══════════════════════════════════════════════════════════════ 4. FIRST / FOLLOW
def first_of_seq(seq: list, first: dict, bnf: dict) -> set:
    """
    Compute FIRST for a sequence of grammar symbols.

    A symbol counts as a non-terminal exactly when it is a key of `bnf`;
    anything else is a terminal.  The result holds terminal strings and
    additionally EPSILON when every symbol of `seq` is nullable (which is
    trivially the case for an empty sequence).
    """
    collected = set()
    for symbol in seq:
        if symbol in bnf:
            candidates = first[symbol]
            collected.update(t for t in candidates if t != EPSILON)
            if EPSILON in candidates:
                continue        # nullable — the next symbol may start too
        else:
            collected.add(symbol)   # a terminal starts the sequence itself
        break                   # first non-nullable symbol ends the scan
    else:
        # Loop ran to completion: nothing in seq blocks the empty string.
        collected.add(EPSILON)
    return collected
def compute_first(bnf: dict) -> dict:
    """
    Build the FIRST set of every non-terminal by fixed-point iteration.

    Sweeps over all productions repeatedly, merging FIRST of each right-hand
    side into the set of its rule, until a full sweep adds nothing.
    Returns a defaultdict(set) keyed by non-terminal name.
    """
    table = defaultdict(set)
    while True:
        grew = False
        for head, alternatives in bnf.items():
            for rhs in alternatives:
                extra = first_of_seq(rhs, table, bnf) - table[head]
                if extra:
                    table[head].update(extra)
                    grew = True
        if not grew:
            return table
def compute_follow(bnf: dict, first: dict, start: str) -> dict:
    """
    Build the FOLLOW set of every non-terminal by fixed-point iteration.

    FOLLOW(start) is seeded with the end-of-input marker '$'.  For each
    occurrence of a non-terminal B in a production A → α B β, FIRST(β)
    without ε is added to FOLLOW(B); if β is nullable (or empty), FOLLOW(A)
    is added as well.  Returns a defaultdict(set).
    """
    table = defaultdict(set)
    table[start].add('$')

    stable = False
    while not stable:
        stable = True
        for head, alternatives in bnf.items():
            for rhs in alternatives:
                for pos, symbol in enumerate(rhs):
                    if symbol in bnf:       # only non-terminals have FOLLOW
                        tail_first = first_of_seq(rhs[pos + 1:], first, bnf)
                        target = table[symbol]
                        gained = tail_first - {EPSILON}
                        if EPSILON in tail_first:
                            # Whatever may follow the rule may follow symbol.
                            gained |= table[head]
                        if not gained <= target:
                            target |= gained
                            stable = False
    return table
# ═══════════════════════════════════════════════════════════════ 5. LL(1) check
def predict_set(prod: list, name: str, first: dict, follow: dict, bnf: dict) -> set:
    """
    Lookahead tokens that select production `name → prod`.

    PREDICT(A → prod) = (FIRST(prod) - {ε}) ∪ (FOLLOW(A) if ε ∈ FIRST(prod))
    """
    starters = first_of_seq(prod, first, bnf)
    if EPSILON not in starters:
        return starters - {EPSILON}
    # Nullable production: it is also chosen on anything that can follow A.
    return (starters - {EPSILON}) | follow[name]
def check_ll1(bnf: dict, first: dict, follow: dict) -> list:
    """
    Verify that the PREDICT sets of each non-terminal are pairwise disjoint.

    Returns one dict per overlapping pair of productions with the keys
    'rule', 'prod_i', 'prod_j' and 'conflict' (the sorted shared tokens).
    An empty list means no conflict was found.
    """
    conflicts = []
    for head, alternatives in bnf.items():
        predicted = [(alt, predict_set(alt, head, first, follow, bnf))
                     for alt in alternatives]
        for idx, (alt_a, look_a) in enumerate(predicted):
            # Compare only with later alternatives so each pair is seen once.
            for alt_b, look_b in predicted[idx + 1:]:
                shared = look_a & look_b
                if not shared:
                    continue
                conflicts.append({
                    'rule': head,
                    'prod_i': alt_a,
                    'prod_j': alt_b,
                    'conflict': sorted(shared),
                })
    return conflicts
# ═══════════════════════════════════════════════════════════════ 6. Main
def _fmt_prod(prod: list) -> str:
return ' '.join(prod) if prod else EPSILON
def main():
    """
    Command-line entry point: check one grammar file for the LL(1) property.

    Arguments (from sys.argv): an optional grammar path (default
    GRAMMAR.ebnf) and an optional -v flag that also prints the FIRST and
    FOLLOW sets of the named rules.

    Exit status
    -----------
    0            the grammar has no LL(1) conflicts (normal return)
    1            conflicts were found (reported on stdout), so that scripts
                 and CI jobs can rely on the status instead of parsing text
    error text   the file could not be read or is not valid EBNF
                 (sys.exit with a message, which also yields status 1)
    """
    argv = sys.argv[1:]
    verbose = '-v' in argv
    positional = [a for a in argv if not a.startswith('-')]
    path = Path(positional[0]) if positional else Path('GRAMMAR.ebnf')

    # ── Load & parse ──────────────────────────────────────────────────────
    print(f'Checking {path}')
    try:
        src = path.read_text(encoding='utf-8')
    except FileNotFoundError:
        sys.exit(f'error: file not found: {path}')
    except OSError as exc:
        # Directory given, permission denied, …: report instead of a traceback.
        sys.exit(f'error: cannot read {path}: {exc}')

    toks = tokenise(src)
    try:
        ebnf = _Parser(toks).parse_grammar()
    except SyntaxError as exc:
        sys.exit(f'EBNF parse error: {exc}')
    except IndexError:
        # A grammar cut off in the middle of a rule can run the parser past
        # the last token; treat that as a parse error as well.
        sys.exit('EBNF parse error: unexpected end of input')

    bnf, origins = normalise(ebnf)
    first = compute_first(bnf)
    follow = compute_follow(bnf, first, START)
    errors = check_ll1(bnf, first, follow)

    # ── Summary line ──────────────────────────────────────────────────────
    # Generated helpers are recognisable by their leading underscore.
    named = sorted(n for n in bnf if not n.startswith('_'))
    helpers = sorted(n for n in bnf if n.startswith('_'))
    print(f' {len(named)} named rules, {len(helpers)} generated helper rules\n')

    # ── Optional verbose output ───────────────────────────────────────────
    if verbose:
        col = max((len(n) for n in named), default=0) + 2
        print('── FIRST sets (named rules) ──────────────────────────────')
        for n in named:
            syms = sorted(first[n] - {EPSILON})
            nullable = ' [nullable]' if EPSILON in first[n] else ''
            print(f' FIRST({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}{nullable}')
        print()
        print('── FOLLOW sets (named rules) ─────────────────────────────')
        for n in named:
            syms = sorted(follow[n])
            print(f' FOLLOW({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}')
        print()

    # ── LL(1) result ──────────────────────────────────────────────────────
    named_err = [e for e in errors if not e['rule'].startswith('_')]
    helper_err = [e for e in errors if e['rule'].startswith('_')]
    if not errors:
        print('✓ Grammar is LL(1) — no conflicts detected.')
        return

    print(f'{len(errors)} conflict(s): '
          f'{len(named_err)} in named rules, '
          f'{len(helper_err)} in generated helpers\n')
    for e in named_err:
        print(f' Rule [{e["rule"]}]')
        print(f' alt A : {_fmt_prod(e["prod_i"])}')
        print(f' alt B : {_fmt_prod(e["prod_j"])}')
        print(f' ambiguous token(s): {e["conflict"]}\n')
    if helper_err:
        print(' Conflicts in generated helpers '
              '(each is linked back to its enclosing named rule):')
        for e in helper_err:
            orig = origins.get(e['rule'], '?')
            print(f' [{e["rule"]}] ← from rule [{orig}]')
            print(f' alt A : {_fmt_prod(e["prod_i"])}')
            print(f' alt B : {_fmt_prod(e["prod_j"])}')
            print(f' ambiguous token(s): {e["conflict"]}\n')

    # A verification tool must fail visibly: signal the conflicts to the caller.
    sys.exit(1)
# Run the checker only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()