(* flux/GRAMMAR.ebnf                                                *)

(* Flux Language Grammar — Context-Free LL(1) Grammar               *)
(* ================================================================ *)
(*                                                                  *)
(* Notation (ISO/IEC 14977 EBNF):                                   *)
(*   rule = definition ;        defines a rule (terminated by ;)    *)
(*   a , b                      concatenation                       *)
(*   a | b                      alternation                         *)
(*   { a }                      zero or more repetitions of a       *)
(*   [ a ]                      optional a  (zero or one)           *)
(*   ( a | b )                  grouping                            *)
(*   "literal"                  terminal string                     *)
(*                                                                  *)
(* UPPERCASE identifiers are lexical token classes whose value      *)
(* cannot be expressed as a single literal (e.g. IDENT, INT_LIT).   *)
(* They are NOT defined here — see SYNTAX.md.                       *)
(*                                                                  *)
(* Unique/fixed tokens are written as quoted literals directly.     *)
(*                                                                  *)
(* Lowercase identifiers are non-terminals (grammar productions).   *)


(* ================================================================ *)
(* Program (start symbol)                                           *)
(* ================================================================ *)

(* A program is a possibly empty sequence of top-level definitions. *)
(* An empty source file is therefore a valid program.               *)
(*                                                                  *)
(* LL(1): "fn" → func_def; "struct" → struct_def; end of input      *)
(* ends the repetition.                                             *)

program = { top_level_def } ;

(* Only functions and structs may appear at the top level.          *)

top_level_def = func_def
              | struct_def ;


(* ================================================================ *)
(* Expressions                                                      *)
(* ================================================================ *)

(* expr is the entry point of the full expression hierarchy; it     *)
(* starts at the lowest-precedence level (assignment).              *)

expr = assign_expr ;


(* --- Assignment and compound assignment (lowest precedence) ---   *)
(*                                                                  *)
(* assign_op covers `=` and all compound-assignment operators.      *)
(* All have the same precedence and are right-associative:          *)
(*   `a = b = c`   →  `a = (b = c)`                                 *)
(*   `a += b += c` →  `a += (b += c)`  (unusual but syntactically   *)
(*                     valid; semantics checked later)              *)
(*                                                                  *)
(* Compound assignments expand semantically:                        *)
(*   `x += y`  →  `x = x + y`                                       *)
(*   `x -= y`  →  `x = x - y`    etc.                               *)
(*                                                                  *)
(* LL(1): after or_expr, peek at next token.                        *)
(*   assign_op token → consume and recurse into assign_expr         *)
(*   other → return the or_expr as-is                               *)
(* The optional tail is unambiguous because no assign_op token is   *)
(* in FOLLOW(assign_expr) = { ";", ")", "]", ",", "}" }.            *)

assign_expr = or_expr , [ assign_op , assign_expr ] ;

assign_op = "=" | "+=" | "-=" | "*=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" ;


(* --- Logical OR (lowest-precedence binary operator) ---           *)
(*                                                                  *)
(* Uses keyword `or`; left-associative via iteration.               *)
(*   `a or b or c`    →  `(a or b) or c`                            *)
(*   `a or b and c`   →  `a or (b and c)`                           *)

or_expr = and_expr , { "or" , and_expr } ;


(* --- Logical AND ---                                              *)
(*                                                                  *)
(* Uses keyword `and`; left-associative via iteration.              *)
(*   `a and b and c`  →  `(a and b) and c`                          *)

and_expr = bitor_expr , { "and" , bitor_expr } ;


(* --- Bitwise OR ---                                               *)
(*                                                                  *)
(* Precedence among the bitwise operators: `&` binds tighter than   *)
(* `^`, which binds tighter than `|`; all are left-associative.     *)
(*   `a | b ^ c & d`  →  `a | (b ^ (c & d))`                        *)

bitor_expr = bitxor_expr , { "|" , bitxor_expr } ;


(* --- Bitwise XOR ---                                              *)

bitxor_expr = bitand_expr , { "^" , bitand_expr } ;


(* --- Bitwise AND ---                                              *)
(*                                                                  *)
(* NOTE(review): bitand_expr descends directly to additive_expr;    *)
(* this grammar defines no equality or relational operators.        *)
(* Confirm that omission is intended.                               *)

bitand_expr = additive_expr , { "&" , additive_expr } ;


(* --- Additive: addition and subtraction ---                       *)
(*                                                                  *)
(* NOTE(review): shift_expr is nested inside additive_expr, so      *)
(* shifts bind tighter than `+`/`-`:                                *)
(*   `a + b << c`  →  `a + (b << c)`                                *)
(* C-family languages order these the other way round; confirm      *)
(* this precedence is intended.                                     *)

additive_expr = shift_expr ,
                { ( "+" | "-" ) , shift_expr } ;


(* --- Shift: left shift and right shift ---                        *)

shift_expr = multiplicative_expr ,
             { ( "<<" | ">>" ) , multiplicative_expr } ;


(* --- Multiplicative: multiplication, division, modulo ---         *)

multiplicative_expr = unary_expr ,
                      { ( "*" | "/" | "%" ) , unary_expr } ;


(* --- Prefix (unary) operators ---                                 *)
(*                                                                  *)
(* Zero or more prefix operators may precede a postfix_expr. The    *)
(* recursion applies the operator nearest the operand first         *)
(* (right-associative):  `-*p`  →  `-(*p)`                          *)
(*                                                                  *)
(*   "!"  logical not                                               *)
(*   "~"  bitwise not                                               *)
(*   "-"  arithmetic negation                                       *)
(*   "*"  dereference (pointer indirection)                         *)
(*   "&"  address-of                                                *)
(*                                                                  *)
(* LL(1): no prefix operator token is in FIRST(postfix_expr), so    *)
(* a single look-ahead token chooses between the two alternatives.  *)

unary_expr = ( "!" | "~" | "-" | "*" | "&" ) , unary_expr
           | postfix_expr ;


(* --- Postfix operators (left-associative via iteration) ---       *)
(*                                                                  *)
(* Postfix operators bind tighter than any prefix or binary form.   *)
(* Multiple postfix operations chain left-to-right.                 *)
(*   `a.b[0](x)`  →  `((a.b)[0])(x)`                                *)
(*                                                                  *)
(* LL(1): repeat while the next token is ".", "[" or "("; each      *)
(* token selects exactly one postfix_op alternative.                *)

postfix_expr = primary_expr , { postfix_op } ;

postfix_op = "." , IDENT                         (* member access   *)
           | "[" , expr , "]"                    (* subscript/index *)
           | "(" , arg_list , ")" ;              (* function call   *)


(* --- Primary expressions (highest precedence) ---                 *)
(*                                                                  *)
(* LL(1) note: after IDENT, peek at the next token.                 *)
(*   "{" → parse struct_lit_body (struct literal)                   *)
(*   other → bare identifier reference                              *)
(*                                                                  *)
(* The optional struct_lit_body is the only difference from         *)
(* primary_expr_ns, which omits it.                                 *)

primary_expr = IDENT , [ struct_lit_body ]   (* ident or struct lit *)
             | INT_LIT
             | FLOAT_LIT
             | STRING_LIT
             | CHAR_LIT
             | "true"
             | "false"
             | "(" , expr , ")" ;             (* parenthesised      *)


(* --- Struct literal ---                                           *)
(*                                                                  *)
(* A struct literal constructs a value of a named struct type.      *)
(*   IDENT "{" field: expr, ... "}"                                 *)
(*                                                                  *)
(* Field order need not match the struct definition order.          *)
(* No trailing comma is permitted (consistent with struct_def).     *)
(* An empty literal such as `Point {}` is syntactically valid.      *)
(*                                                                  *)
(* LL(1) notes:                                                     *)
(*   struct_field_list: "}" → ε; IDENT → first field                *)
(*   FIRST(struct_field) = {IDENT}                                  *)
(*   FOLLOW(struct_field_list) = {"}"}                              *)
(*   Disjoint, so no look-ahead conflict.                           *)
(*   after a struct_field: "," → another field; "}" → end of list   *)

struct_lit_body   = "{" , struct_field_list , "}" ;

struct_field_list = [ struct_field , { "," , struct_field } ] ;

struct_field = IDENT , ":" , expr ;


(* ================================================================ *)
(* Argument List                                                    *)
(* ================================================================ *)

(* Zero or more comma-separated argument expressions; a trailing    *)
(* comma is not permitted. Referenced by the call form of           *)
(* postfix_op.                                                      *)
(*                                                                  *)
(* LL(1): ")" → ε (no arguments); otherwise parse the first expr,   *)
(* then "," → another argument, ")" → end of list.                  *)

arg_list = [ expr , { "," , expr } ] ;


(* ================================================================ *)
(* No-Struct Expression Hierarchy (expr_ns)                         *)
(* ================================================================ *)
(*                                                                  *)
(* Struct literals create an LL(1) ambiguity in if/while conditions:*)
(*   if Point { x: 1 } { ... }                                      *)
(* After "Point", "{" could open a struct literal OR the body block.*)
(*                                                                  *)
(* Solution: define expr_ns — identical to expr except              *)
(* primary_expr_ns disallows the struct_lit_body suffix after IDENT.*)
(* Struct literals ARE still allowed when parenthesised:            *)
(*   if (Point { x: 1 }).flag { ... }                               *)
(*                                                                  *)
(* if_stmt and while_stmt use expr_ns for their condition.          *)
(* All other expression positions use the full expr.                *)

(* Each rule below mirrors the expr-hierarchy rule of the same name *)
(* (minus the _ns suffix): same operators, precedence and           *)
(* associativity. Only the leaf, primary_expr_ns, differs.          *)

expr_ns = assign_expr_ns ;

assign_expr_ns = or_expr_ns , [ assign_op , assign_expr_ns ] ;

or_expr_ns  = and_expr_ns , { "or"  , and_expr_ns } ;
and_expr_ns = bitor_expr_ns , { "and" , bitor_expr_ns } ;

bitor_expr_ns  = bitxor_expr_ns , { "|" , bitxor_expr_ns } ;
bitxor_expr_ns = bitand_expr_ns , { "^" , bitand_expr_ns } ;
bitand_expr_ns = additive_expr_ns , { "&" , additive_expr_ns } ;

additive_expr_ns = shift_expr_ns ,
                   { ( "+" | "-" ) , shift_expr_ns } ;

shift_expr_ns = multiplicative_expr_ns ,
                { ( "<<" | ">>" ) , multiplicative_expr_ns } ;

multiplicative_expr_ns = unary_expr_ns ,
                         { ( "*" | "/" | "%" ) , unary_expr_ns } ;

(* Prefix operators, as in unary_expr, applied to postfix_expr_ns.  *)

unary_expr_ns = ( "!" | "~" | "-" | "*" | "&" ) , unary_expr_ns
              | postfix_expr_ns ;

(* postfix_op is shared with postfix_expr: the expr and arg_list    *)
(* inside it are enclosed by brackets, so full expr is safe there.  *)

postfix_expr_ns = primary_expr_ns , { postfix_op } ;

(* primary_expr_ns: same as primary_expr but IDENT is never         *)
(* followed by struct_lit_body. Note "(" , expr , ")" uses full     *)
(* expr, so struct literals are permitted inside parentheses.       *)

primary_expr_ns = IDENT                          (* bare ident only *)
                | INT_LIT
                | FLOAT_LIT
                | STRING_LIT
                | CHAR_LIT
                | "true"
                | "false"
                | "(" , expr , ")" ;          (* struct lit OK here *)


(* ================================================================ *)
(* Types                                                            *)
(* ================================================================ *)

(* LL(1): the first token selects the alternative.                  *)
(*   primitive keyword → primitive_type                             *)
(*   IDENT             → named_type                                 *)
(*   "*"               → pointer_type                               *)
(*   "["               → array_type                                 *)
(*                                                                  *)
(* No alternative denotes the unit type (); a function without a    *)
(* result simply omits its "->" clause (see func_def).              *)

type = primitive_type
     | named_type
     | pointer_type
     | array_type ;


(* --- Primitive types ---                                          *)
(*                                                                  *)
(* Unsigned integers : u8  u16  u32  u64                            *)
(* Signed integers   : i8  i16  i32  i64                            *)
(* Floating-point    : f32  f64                                     *)
(* Other             : bool  char                                   *)

primitive_type = "u8"  | "u16" | "u32" | "u64"
               | "i8"  | "i16" | "i32" | "i64"
               | "f32" | "f64"
               | "bool" | "char" ;


(* --- Named types ---                                              *)
(*                                                                  *)
(* A user-defined type referenced by its identifier (e.g. a struct  *)
(* name). The lexer guarantees that all primitive-type keywords are *)
(* reserved, so IDENT never clashes with primitive_type.            *)

named_type = IDENT ;


(* --- Pointer types ---                                            *)
(*                                                                  *)
(* "*" type    — typed pointer; the pointee type is known.          *)
(* "*opaque"   — untyped/opaque pointer (no pointee type info).     *)
(*                                                                  *)
(* Because type is recursive, pointers nest: `* *u8` is a pointer   *)
(* to a pointer to u8.                                              *)
(*                                                                  *)
(* LL(1) note: after "*", "opaque" is not in FIRST(type), so the    *)
(* two alternatives are always distinguishable with one token.      *)

pointer_type = "*" , ( "opaque" | type ) ;


(* --- Array types ---                                              *)
(*                                                                  *)
(* "[" type ";" INT_LIT "]"                                         *)
(*                                                                  *)
(* The element type and the fixed size (a non-negative integer      *)
(* literal) are separated by ";". Sizes that are constant           *)
(* expressions may be introduced in a later grammar revision.       *)
(* The element type may itself be an array: `[[u8; 2]; 3]`.         *)

array_type = "[" , type , ";" , INT_LIT , "]" ;


(* ================================================================ *)
(* Statements                                                       *)
(* ================================================================ *)

(* Every statement form. The parser picks the alternative from the  *)
(* first token; the dispatch table is given under expr_stmt.        *)
(* There is no empty statement: a lone ";" is not a stmt.           *)

stmt = let_stmt
     | return_stmt
     | if_stmt
     | while_stmt
     | loop_stmt
     | break_stmt
     | continue_stmt
     | block_stmt
     | expr_stmt ;


(* --- Return statement ---                                         *)
(*                                                                  *)
(* Exits the enclosing function, optionally yielding a value.       *)
(* "return ;" is used when the function return type is ().          *)
(* The value is a full expr, so a struct literal may be returned    *)
(* directly:  `return Point { x: 1 };`                              *)
(*                                                                  *)
(* LL(1): after "return", peek at next token.                       *)
(*   ";" → no expression (unit return)                              *)
(*   other → parse expr, then expect ";"                            *)
(* ";" is not in FIRST(expr), so the two cases are unambiguous.     *)

return_stmt = "return" , [ expr ] , ";" ;


(* --- Expression statement ---                                     *)
(*                                                                  *)
(* Evaluates an expression for its side effects; the value is       *)
(* discarded. The ";" is mandatory.                                 *)
(*                                                                  *)
(* LL(1): at stmt level:                                            *)
(*   "let"      → let_stmt                                          *)
(*   "return"   → return_stmt                                       *)
(*   "if"       → if_stmt                                           *)
(*   "while"    → while_stmt                                        *)
(*   "loop"     → loop_stmt                                         *)
(*   "break"    → break_stmt                                        *)
(*   "continue" → continue_stmt                                     *)
(*   "{"        → block_stmt                                        *)
(*   other      → expr_stmt                                         *)
(* "other" is any token in FIRST(expr): a prefix operator, IDENT,   *)
(* a literal, "true", "false" or "(".                               *)

expr_stmt = expr , ";" ;


(* --- If statement ---                                             *)
(*                                                                  *)
(* Conditionally executes a block. An optional "else" branch may    *)
(* follow; it is either a plain block or another "if" statement,    *)
(* enabling "else if" chains of arbitrary length.                   *)
(* Bodies are always block_stmt: braces are mandatory even for a    *)
(* single statement.                                                *)
(*                                                                  *)
(* LL(1) notes:                                                     *)
(*   condition uses expr_ns — struct literals are forbidden at the  *)
(*   outermost level to avoid ambiguity with the body block's "{".  *)
(*   [ "else" ... ] — consume "else" iff next token is "else"       *)
(*   else_branch: "if" → if_stmt (else-if); "{" → block_stmt        *)
(*   The two else_branch alternatives start with distinct tokens,   *)
(*   so no look-ahead conflict arises (no dangling-else ambiguity). *)

if_stmt     = "if" , expr_ns , block_stmt , [ "else" , else_branch ] ;

else_branch = if_stmt       (* else if *)
            | block_stmt ;  (* plain else *)


(* --- While loop ---                                               *)
(*                                                                  *)
(* Repeatedly executes the body as long as the condition is true.   *)
(* The condition is re-evaluated before every iteration.            *)
(* If the condition is false on the first check, the body never     *)
(* executes.                                                        *)
(* The body is a block_stmt, so its braces are mandatory.           *)
(*                                                                  *)
(* Like if_stmt, the condition uses expr_ns to prevent struct       *)
(* literal ambiguity with the body block's opening "{".             *)

while_stmt = "while" , expr_ns , block_stmt ;


(* --- Infinite loop ---                                            *)
(*                                                                  *)
(* Executes the body unconditionally and indefinitely. The only     *)
(* ways to exit are "break" or "return" inside the body.            *)
(*                                                                  *)
(* LL(1): no decision point; "loop" is always followed by a block.  *)

loop_stmt = "loop" , block_stmt ;


(* --- Break and continue ---                                       *)
(*                                                                  *)
(* "break"    exits the immediately enclosing "while" or "loop".    *)
(* "continue" skips the rest of the current iteration and jumps to  *)
(*            the next condition check (while) or iteration (loop). *)
(* Both are only valid inside a loop body; the compiler enforces    *)
(* this as a semantic rule.                                         *)
(* Neither statement takes a label or a value.                      *)

break_stmt    = "break" , ";" ;
continue_stmt = "continue" , ";" ;


(* --- Block statement ---                                          *)
(*                                                                  *)
(* A block groups zero or more statements into a single statement   *)
(* and introduces a new lexical scope. It does not produce a value. *)
(*                                                                  *)
(* LL(1): at stmt level, "{" unambiguously selects block since no   *)
(* other stmt alternative starts with "{".                          *)
(* LL(1): inside the braces, "}" ends the block; any other token    *)
(* starts another stmt.                                             *)

block_stmt = "{" , { stmt } , "}" ;


(* --- Let statement ---                                            *)
(*                                                                  *)
(* Introduces a named binding in the current scope.                 *)
(* Bindings are immutable by default; "mut" opts into mutability.   *)
(*                                                                  *)
(* The type annotation and the initialiser are both optional, but   *)
(* at least one must be present for the binding to be usable;       *)
(* the compiler enforces this as a semantic (not syntactic) rule.   *)
(*                                                                  *)
(* Examples accepted by this rule:                                  *)
(*   `let x: i32;`   `let x = 1;`   `let mut x: i32 = 1;`           *)
(*                                                                  *)
(* LL(1) notes:                                                     *)
(*   [ "mut" ]   — consume "mut" iff the next token is "mut"        *)
(*   [ ":" ... ] — consume iff next token is ":"                    *)
(*   [ "=" ... ] — consume iff next token is "="                    *)
(*   All decision tokens are distinct, so no look-ahead conflict.   *)

let_stmt = "let" , [ "mut" ] , IDENT ,
           [ ":" , type ] ,
           [ "=" , expr ] ,
           ";" ;


(* ================================================================ *)
(* Top-Level Definitions                                            *)
(* ================================================================ *)

(* --- Function definition ---                                      *)
(*                                                                  *)
(* Defines a named function with a typed parameter list and an      *)
(* optional return type. Omitting "->" implies a return type of (). *)
(*                                                                  *)
(* LL(1) notes:                                                     *)
(*   param_list: ")" → ε (empty list); else parse first param       *)
(*   after a param: "," → another param; ")" → end of list          *)
(*   param: "mut" → consume; IDENT → skip (mut absent)              *)
(*   [ "->" ... ]: consume iff next token is "->"                   *)
(*   "->" differs from "{", the only token that can start the       *)
(*   block_stmt that follows, so the option is unambiguous.         *)

func_def = "fn" , IDENT , "(" , param_list , ")" ,
           [ "->" , type ] ,
           block_stmt ;

param_list = [ param , { "," , param } ] ;

(* Each parameter is an optionally-mutable name with a required     *)
(* type annotation. Mutability applies within the function body.    *)

param = [ "mut" ] , IDENT , ":" , type ;


(* --- Struct definition ---                                        *)
(*                                                                  *)
(* Defines a named product type with zero or more typed fields.     *)
(* Fields are separated by commas; no trailing comma is permitted.  *)
(*                                                                  *)
(* LL(1) notes:                                                     *)
(*   field_list: "}" → ε (empty struct); else parse first field     *)
(*   FIRST(field) = {IDENT}, FOLLOW(field_list) = {"}"}             *)
(*   Disjoint, so no look-ahead conflict.                           *)
(*   after a field: "," → another field; "}" → end of list          *)
(*   top_level_def: "fn" → func_def; "struct" → struct_def          *)

struct_def = "struct" , IDENT , "{" , field_list , "}" ;

field_list = [ field , { "," , field } ] ;

field = IDENT , ":" , type ;