Initial Flux language specification

Add the LL(1) context-free grammar (GRAMMAR.ebnf), token and syntax reference (SYNTAX.md), LL(1) verification tool (ll1_check.py), and a Fibonacci example demonstrating the language.
2026-03-10 14:41:54 +01:00
commit 73e36fac71
4 changed files with 1607 additions and 0 deletions
--- a/GRAMMAR.ebnf
+++ b/GRAMMAR.ebnf
@@ -0,0 +1,435 @@
+(* Flux Language Grammar — Context-Free LL(1) Grammar               *)
+(* ================================================================ *)
+(*                                                                  *)
+(* Notation (ISO/IEC 14977 EBNF):                                   *)
+(*   rule = definition ;        defines a rule (terminated by ;)    *)
+(*   a , b                      concatenation                       *)
+(*   a | b                      alternation                         *)
+(*   { a }                      zero or more repetitions of a       *)
+(*   [ a ]                      optional a  (zero or one)           *)
+(*   ( a | b )                  grouping                            *)
+(*   "literal"                  terminal string                     *)
+(*                                                                  *)
+(* UPPERCASE identifiers are lexical token classes whose value      *)
+(* cannot be expressed as a single literal (e.g. IDENT, INT_LIT).   *)
+(* They are NOT defined here — see SYNTAX.md.                       *)
+(*                                                                  *)
+(* Unique/fixed tokens are written as quoted literals directly.     *)
+(*                                                                  *)
+(* Lowercase identifiers are non-terminals (grammar productions).   *)
+
+
+(* ================================================================ *)
+(* Program (start symbol)                                           *)
+(* ================================================================ *)
+
+(* A program is a possibly empty sequence of top-level definitions. *)
+(* LL(1): "fn" selects func_def; "struct" selects struct_def.       *)
+
+program = { top_level_def } ;
+
+top_level_def = func_def
+              | struct_def ;
+
+
+(* ================================================================ *)
+(* Expressions                                                      *)
+(* ================================================================ *)
+
+(* Entry point of the expression grammar. Binary operator levels,   *)
+(* from loosest to tightest binding:                                *)
+(*   or  <  and  <  |  <  ^  <  &  <  + -  <  * / %                 *)
+(* Each level iterates over the next-tighter one, so operators of   *)
+(* equal precedence associate to the left.                          *)
+(*                                                                  *)
+(* NOTE(review): this ladder has no comparison, equality or shift   *)
+(* level. Confirm that this omission is intentional.                *)
+
+expr = or_expr ;
+
+
+(* --- Logical OR (lowest-precedence binary operator) ---           *)
+(*                                                                  *)
+(* Uses keyword `or`; left-associative via iteration.               *)
+
+or_expr = and_expr , { "or" , and_expr } ;
+
+
+(* --- Logical AND ---                                              *)
+(*                                                                  *)
+(* Uses keyword `and`; left-associative via iteration.              *)
+
+and_expr = bitor_expr , { "and" , bitor_expr } ;
+
+
+(* --- Bitwise OR ---                                               *)
+(*                                                                  *)
+(* Operator "|"; left-associative via iteration.                    *)
+
+bitor_expr = bitxor_expr , { "|" , bitxor_expr } ;
+
+
+(* --- Bitwise XOR ---                                              *)
+(*                                                                  *)
+(* Operator "^"; left-associative via iteration.                    *)
+
+bitxor_expr = bitand_expr , { "^" , bitand_expr } ;
+
+
+(* --- Bitwise AND ---                                              *)
+(*                                                                  *)
+(* Operator "&"; left-associative via iteration.                    *)
+
+bitand_expr = additive_expr , { "&" , additive_expr } ;
+
+
+(* --- Additive: addition and subtraction ---                       *)
+(*                                                                  *)
+(* Operators "+" and "-"; left-associative via iteration.           *)
+
+additive_expr = multiplicative_expr ,
+                { ( "+" | "-" ) , multiplicative_expr } ;
+
+
+(* --- Multiplicative: multiplication, division, modulo ---         *)
+(*                                                                  *)
+(* Operators "*", "/" and "%"; left-associative via iteration.      *)
+
+multiplicative_expr = unary_expr ,
+                      { ( "*" | "/" | "%" ) , unary_expr } ;
+
+
+(* --- Unary operators (prefix, right-associative by recursion) --- *)
+(*                                                                  *)
+(* "!"  logical not                                                 *)
+(* "~"  bitwise not                                                 *)
+(* "-"  arithmetic negation                                         *)
+(* "*"  dereference (pointer indirection)                           *)
+(* "&"  address-of                                                  *)
+
+(* "-", "*" and "&" are also binary operators; they are parsed as   *)
+(* prefix operators only at the start of a unary_expr, i.e. where   *)
+(* an operand is expected. Prefix operators nest by recursion, so   *)
+(* "! - x" parses as !(-(x)).                                       *)
+
+unary_expr = "!" , unary_expr
+           | "~" , unary_expr
+           | "-" , unary_expr
+           | "*" , unary_expr
+           | "&" , unary_expr
+           | postfix_expr ;
+
+
+(* --- Postfix operators (left-associative via iteration) ---       *)
+(*                                                                  *)
+(* Postfix operators bind tighter than any prefix or binary form.   *)
+(* Multiple postfix operations chain left-to-right.                 *)
+
+(* LL(1): the loop continues iff the next token is ".", "[" or "(". *)
+(* postfix_op is shared with postfix_expr_ns; its bracketed forms   *)
+(* use the full expr, so struct literals are allowed inside them.   *)
+(* Example: a.b[i](x, y) chains ".b", "[i]" and "(x, y)" in order.  *)
+
+postfix_expr = primary_expr , { postfix_op } ;
+
+postfix_op = "." , IDENT                         (* member access   *)
+           | "[" , expr , "]"                    (* subscript/index *)
+           | "(" , arg_list , ")" ;              (* function call   *)
+
+
+(* --- Primary expressions (highest precedence) ---                 *)
+(*                                                                  *)
+(* LL(1) note: after IDENT, peek at the next token.                 *)
+(*   "{" → parse struct_lit_body (struct literal)                   *)
+(*   other → bare identifier reference                              *)
+
+(* The optional struct_lit_body causes no conflict: every use of    *)
+(* the full expr is followed by ")", "]", ",", "}" or ";", never    *)
+(* by "{". Conditions of if/while use primary_expr_ns instead.      *)
+
+primary_expr = IDENT , [ struct_lit_body ]   (* ident or struct lit *)
+             | INT_LIT
+             | FLOAT_LIT
+             | STRING_LIT
+             | CHAR_LIT
+             | "true"
+             | "false"
+             | "(" , expr , ")" ;             (* parenthesised      *)
+
+
+(* --- Struct literal ---                                           *)
+(*                                                                  *)
+(* A struct literal constructs a value of a named struct type.      *)
+(*   IDENT "{" field: expr, ... "}"                                 *)
+(*                                                                  *)
+(* Field order need not match the struct definition order.          *)
+(* No trailing comma is permitted (consistent with struct_def).     *)
+(*                                                                  *)
+(* LL(1) notes:                                                     *)
+(*   struct_field_list: "}" → ε; IDENT → first field                *)
+(*   FIRST(struct_field) = {IDENT}                                  *)
+(*   FOLLOW(struct_field_list) = {"}"}                              *)
+(*   Disjoint, so no look-ahead conflict.                           *)
+
+(* Examples of a body, written after the IDENT naming the struct:   *)
+(*   { }                 empty field list                           *)
+(*   { x: 1, y: a + b }  two fields, each "IDENT : expr"            *)
+
+struct_lit_body   = "{" , struct_field_list , "}" ;
+
+struct_field_list = [ struct_field , { "," , struct_field } ] ;
+
+struct_field = IDENT , ":" , expr ;
+
+
+(* ================================================================ *)
+(* Argument List                                                    *)
+(* ================================================================ *)
+
+(* Zero or more comma-separated expressions; no trailing comma.     *)
+(* LL(1): arg_list occurs only between "(" and ")" of a call, so    *)
+(* ")" selects the empty list and any other token starts an expr.   *)
+
+arg_list = [ expr , { "," , expr } ] ;
+
+
+(* ================================================================ *)
+(* No-Struct Expression Hierarchy (expr_ns)                         *)
+(* ================================================================ *)
+(*                                                                  *)
+(* Struct literals create an LL(1) ambiguity in if/while conditions:*)
+(*   if Point { x: 1 } { ... }                                      *)
+(* After "Point", "{" could open a struct literal OR the body block.*)
+(*                                                                  *)
+(* Solution: define expr_ns — identical to expr except              *)
+(* primary_expr_ns disallows the struct_lit_body suffix after IDENT.*)
+(* Struct literals ARE still allowed when parenthesised:            *)
+(*   if (Point { x: 1 }).flag { ... }                               *)
+(*                                                                  *)
+(* if_stmt and while_stmt use expr_ns for their condition.          *)
+(* All other expression positions use the full expr.                *)
+
+(* Maintenance: every "_ns" rule mirrors the rule of the same name  *)
+(* without the suffix, so operator or precedence changes must be    *)
+(* made in both ladders. Only primary_expr_ns differs in content;   *)
+(* postfix_op is shared, so index and call arguments use full expr. *)
+
+expr_ns = or_expr_ns ;
+
+or_expr_ns  = and_expr_ns , { "or"  , and_expr_ns } ;
+and_expr_ns = bitor_expr_ns , { "and" , bitor_expr_ns } ;
+
+bitor_expr_ns  = bitxor_expr_ns , { "|" , bitxor_expr_ns } ;
+bitxor_expr_ns = bitand_expr_ns , { "^" , bitand_expr_ns } ;
+bitand_expr_ns = additive_expr_ns , { "&" , additive_expr_ns } ;
+
+additive_expr_ns = multiplicative_expr_ns ,
+                   { ( "+" | "-" ) , multiplicative_expr_ns } ;
+
+multiplicative_expr_ns = unary_expr_ns ,
+                         { ( "*" | "/" | "%" ) , unary_expr_ns } ;
+
+unary_expr_ns = "!" , unary_expr_ns
+              | "~" , unary_expr_ns
+              | "-" , unary_expr_ns
+              | "*" , unary_expr_ns
+              | "&" , unary_expr_ns
+              | postfix_expr_ns ;
+
+postfix_expr_ns = primary_expr_ns , { postfix_op } ;
+
+(* primary_expr_ns: same as primary_expr but IDENT is never         *)
+(* followed by struct_lit_body. Note "(" , expr , ")" uses full     *)
+(* expr, so struct literals are permitted inside parentheses.       *)
+
+primary_expr_ns = IDENT                          (* bare ident only *)
+                | INT_LIT
+                | FLOAT_LIT
+                | STRING_LIT
+                | CHAR_LIT
+                | "true"
+                | "false"
+                | "(" , expr , ")" ;          (* struct lit OK here *)
+
+
+(* ================================================================ *)
+(* Types                                                            *)
+(* ================================================================ *)
+
+(* LL(1): the first token selects the alternative:                  *)
+(*   primitive keyword → primitive_type                             *)
+(*   IDENT             → named_type                                 *)
+(*   "*"               → pointer_type                               *)
+(*   "["               → array_type                                 *)
+(*                                                                  *)
+(* NOTE(review): no alternative spells the unit type "()" which     *)
+(* the return_stmt and func_def comments mention. Confirm that it   *)
+(* is meant to be expressible only by omitting "->".                *)
+
+type = primitive_type
+     | named_type
+     | pointer_type
+     | array_type ;
+
+
+(* --- Primitive types ---                                          *)
+(*                                                                  *)
+(* Unsigned integers : u8  u16  u32  u64                            *)
+(* Signed integers   : i8  i16  i32  i64                            *)
+(* Floating-point    : f32  f64                                     *)
+(* Other             : bool  char                                   *)
+
+primitive_type = "u8"  | "u16" | "u32" | "u64"
+               | "i8"  | "i16" | "i32" | "i64"
+               | "f32" | "f64"
+               | "bool" | "char" ;
+
+
+(* --- Named types ---                                              *)
+(*                                                                  *)
+(* A user-defined type referenced by its identifier (e.g. a struct  *)
+(* name). The lexer guarantees that all primitive-type keywords are *)
+(* reserved, so IDENT never clashes with primitive_type.            *)
+
+named_type = IDENT ;
+
+
+(* --- Pointer types ---                                            *)
+(*                                                                  *)
+(* "*" type    — typed pointer; the pointee type is known.          *)
+(* "*opaque"   — untyped/opaque pointer (no pointee type info).     *)
+(*                                                                  *)
+(* LL(1) note: after "*", "opaque" is not in FIRST(type), so the    *)
+(* two alternatives are always distinguishable with one token.      *)
+(* This relies on "opaque" being reserved, i.e. never an IDENT.     *)
+(* The pointee may itself be any type, including another pointer.   *)
+
+pointer_type = "*" , ( "opaque" | type ) ;
+
+
+(* --- Array types ---                                              *)
+(*                                                                  *)
+(* "[" type ";" INT_LIT "]"                                         *)
+(*                                                                  *)
+(* The element type and the fixed size (a non-negative integer      *)
+(* literal) are separated by ";". Sizes that are constant           *)
+(* expressions may be introduced in a later grammar revision.       *)
+(* Example: [u8; 16]. The element type may itself be an array.      *)
+
+array_type = "[" , type , ";" , INT_LIT , "]" ;
+
+
+(* ================================================================ *)
+(* Statements                                                       *)
+(* ================================================================ *)
+
+(* LL(1): every alternative except expr_stmt starts with its own    *)
+(* keyword or "{"; any token in FIRST(expr) selects expr_stmt.      *)
+(*                                                                  *)
+(* NOTE(review): no rule here provides assignment, so a "let mut"   *)
+(* binding has no visible way to be updated. Confirm whether this   *)
+(* is intentional.                                                  *)
+
+stmt = let_stmt
+     | return_stmt
+     | if_stmt
+     | while_stmt
+     | loop_stmt
+     | break_stmt
+     | continue_stmt
+     | block_stmt
+     | expr_stmt ;
+
+
+(* --- Return statement ---                                         *)
+(*                                                                  *)
+(* Exits the enclosing function, optionally yielding a value.       *)
+(* "return ;" is used when the function return type is ().          *)
+(*                                                                  *)
+(* LL(1): after "return", peek at next token.                       *)
+(*   ";" → no expression (unit return)                              *)
+(*   other → parse expr, then expect ";"                            *)
+(* ";" is not in FIRST(expr), so the two cases are unambiguous.     *)
+
+(* The operand is a full expr: struct literals need no parentheses. *)
+(* Examples:  return ;    return a + b ;    return Point { x: 1 } ; *)
+
+return_stmt = "return" , [ expr ] , ";" ;
+
+
+(* --- Expression statement ---                                     *)
+(*                                                                  *)
+(* Evaluates an expression for its side effects; the value is       *)
+(* discarded. The ";" is mandatory.                                 *)
+(*                                                                  *)
+(* LL(1): at stmt level:                                            *)
+(*   "let"      → let_stmt                                          *)
+(*   "return"   → return_stmt                                       *)
+(*   "if"       → if_stmt                                           *)
+(*   "while"    → while_stmt                                        *)
+(*   "loop"     → loop_stmt                                         *)
+(*   "break"    → break_stmt                                        *)
+(*   "continue" → continue_stmt                                     *)
+(*   "{"        → block_stmt                                        *)
+(*   other      → expr_stmt                                         *)
+
+(* The grammar accepts any expr here, not only calls; for example   *)
+(* both "f(x) ;" and "1 + 2 ;" parse as expression statements.      *)
+
+expr_stmt = expr , ";" ;
+
+
+(* --- If statement ---                                             *)
+(*                                                                  *)
+(* Conditionally executes a block. An optional "else" branch may    *)
+(* follow; it is either a plain block or another "if" statement,    *)
+(* enabling "else if" chains of arbitrary length.                   *)
+(*                                                                  *)
+(* LL(1) notes:                                                     *)
+(*   condition uses expr_ns — struct literals are forbidden at the  *)
+(*   outermost level to avoid ambiguity with the body block's "{".  *)
+(*   [ "else" ... ] — consume "else" iff next token is "else"       *)
+(*   else_branch: "if" → if_stmt (else-if); "{" → block_stmt        *)
+(*   The two else_branch alternatives start with distinct tokens,   *)
+(*   so no look-ahead conflict arises (no dangling-else ambiguity). *)
+
+(* Example:  if a { ... } else if b { ... } else { ... }            *)
+(* Every branch body is a block_stmt, so braces are never optional. *)
+
+if_stmt     = "if" , expr_ns , block_stmt , [ "else" , else_branch ] ;
+
+else_branch = if_stmt       (* else if *)
+            | block_stmt ;  (* plain else *)
+
+
+(* --- While loop ---                                               *)
+(*                                                                  *)
+(* Repeatedly executes the body as long as the condition is true.   *)
+(* The condition is re-evaluated before every iteration.            *)
+(* If the condition is false on the first check, the body never     *)
+(* executes.                                                        *)
+(*                                                                  *)
+(* Like if_stmt, the condition uses expr_ns to prevent struct       *)
+(* literal ambiguity with the body block's opening "{".             *)
+
+(* Example:  while running { tick() ; }                             *)
+(* The body must be a block_stmt; a single statement is not enough. *)
+
+while_stmt = "while" , expr_ns , block_stmt ;
+
+
+(* --- Infinite loop ---                                            *)
+(*                                                                  *)
+(* Executes the body unconditionally and indefinitely. The only     *)
+(* ways to exit are "break" or "return" inside the body.            *)
+
+(* LL(1): "loop" is followed directly by "{"; it has no condition.  *)
+(* Example:  loop { step() ; }                                      *)
+
+loop_stmt = "loop" , block_stmt ;
+
+
+(* --- Break and continue ---                                       *)
+(*                                                                  *)
+(* "break"    exits the immediately enclosing "while" or "loop".    *)
+(* "continue" skips the rest of the current iteration and jumps to  *)
+(*            the next condition check (while) or iteration (loop). *)
+(* Both are only valid inside a loop body; the compiler enforces    *)
+(* this as a semantic rule.                                         *)
+
+(* Neither statement takes an operand: no label and no value.       *)
+
+break_stmt    = "break" , ";" ;
+continue_stmt = "continue" , ";" ;
+
+
+(* --- Block statement ---                                          *)
+(*                                                                  *)
+(* A block groups zero or more statements into a single statement   *)
+(* and introduces a new lexical scope. It does not produce a value. *)
+(*                                                                  *)
+(* LL(1): at stmt level, "{" unambiguously selects block since no   *)
+(* other stmt alternative starts with "{".                          *)
+
+(* LL(1): inside a block, "}" ends the list; else parse a stmt.     *)
+(* An empty block "{ }" is valid.                                   *)
+
+block_stmt = "{" , { stmt } , "}" ;
+
+
+(* --- Let statement ---                                            *)
+(*                                                                  *)
+(* Introduces a named binding in the current scope.                 *)
+(* Bindings are immutable by default; "mut" opts into mutability.   *)
+(*                                                                  *)
+(* The type annotation and the initialiser are both optional, but   *)
+(* at least one must be present for the binding to be usable;       *)
+(* the compiler enforces this as a semantic (not syntactic) rule.   *)
+(*                                                                  *)
+(* LL(1) notes:                                                     *)
+(*   [ "mut" ]   — consume "mut" iff the next token is "mut"        *)
+(*   [ ":" ... ] — consume iff next token is ":"                    *)
+(*   [ "=" ... ] — consume iff next token is "="                    *)
+(*   All decision tokens are distinct, so no look-ahead conflict.   *)
+
+(* Examples:                                                        *)
+(*   let x = 1 ;                        initialiser only            *)
+(*   let mut n : u32 ;                  annotation only, mutable    *)
+(*   let p : Point = Point { x: 1 } ;   both                        *)
+(*   let y ;                            parses; semantic error      *)
+
+let_stmt = "let" , [ "mut" ] , IDENT ,
+           [ ":" , type ] ,
+           [ "=" , expr ] ,
+           ";" ;
+
+
+(* ================================================================ *)
+(* Top-Level Definitions                                            *)
+(* ================================================================ *)
+
+(* --- Function definition ---                                      *)
+(*                                                                  *)
+(* Defines a named function with a typed parameter list and an      *)
+(* optional return type. Omitting "->" implies a return type of (). *)
+(*                                                                  *)
+(* LL(1) notes:                                                     *)
+(*   param_list: ")" → ε (empty list); else parse first param       *)
+(*   param: "mut" → consume; IDENT → skip (mut absent)              *)
+(*   [ "->" ... ]: consume iff next token is "->"                   *)
+(*   "->" is a two-character token distinct from "{", the only      *)
+(*   token that can start the block_stmt that follows, so there is  *)
+(*   no look-ahead conflict.                                        *)
+
+(* Example:  fn add(a: i32, mut b: i32) -> i32 { return a + b ; }   *)
+
+func_def = "fn" , IDENT , "(" , param_list , ")" ,
+           [ "->" , type ] ,
+           block_stmt ;
+
+(* No trailing comma: each "," must be followed by another param.   *)
+
+param_list = [ param , { "," , param } ] ;
+
+(* Each parameter is an optionally-mutable name with a required     *)
+(* type annotation. Mutability applies within the function body.    *)
+
+param = [ "mut" ] , IDENT , ":" , type ;
+
+
+(* --- Struct definition ---                                        *)
+(*                                                                  *)
+(* Defines a named product type with zero or more typed fields.     *)
+(* Fields are separated by commas; no trailing comma is permitted.  *)
+(*                                                                  *)
+(* LL(1) notes:                                                     *)
+(*   field_list: "}" → ε (empty struct); else parse first field     *)
+(*   FIRST(field) = {IDENT}, FOLLOW(field_list) = {"}"}             *)
+(*   Disjoint, so no look-ahead conflict.                           *)
+(*   top_level_def: "fn" → func_def; "struct" → struct_def          *)
+
+(* Example:  struct Point { x: i32, y: i32 }                        *)
+(* A field pairs a name with a type, whereas a struct_field (in a   *)
+(* struct literal) pairs a name with an expr.                       *)
+
+struct_def = "struct" , IDENT , "{" , field_list , "}" ;
+
+field_list = [ field , { "," , field } ] ;
+
+field = IDENT , ":" , type ;