Initial Flux language specification

Add the LL(1) context-free grammar (GRAMMAR.ebnf), token and syntax
reference (SYNTAX.md), LL(1) verification tool (ll1_check.py), and a
Fibonacci example demonstrating the language.
This commit is contained in:
2026-03-10 14:41:54 +01:00
commit 73e36fac71
4 changed files with 1607 additions and 0 deletions

362
ll1_check.py Normal file
View File

@@ -0,0 +1,362 @@
#!/usr/bin/env python3
"""
ll1_check.py — Parse GRAMMAR.ebnf and verify the LL(1) property.
Usage: python ll1_check.py [grammar_file] [-v]
Algorithm
---------
1. Strip (* … *) comments; tokenise.
2. Parse ISO/IEC 14977 EBNF into an AST.
3. Normalise to plain BNF by introducing fresh helper non-terminals:
{ body } → _repN where _repN = body , _repN | ε
[ body ] → _optN where _optN = body | ε
( body ) → inlined (cross-product inside the parent sequence)
4. Compute FIRST and FOLLOW sets (fixed-point iteration).
5. For each non-terminal compute PREDICT sets; flag pairwise conflicts.
"""
import re
import sys
from collections import defaultdict
from itertools import count as _count
from pathlib import Path
EPSILON = 'ε'  # sentinel "terminal" meaning the empty string (nullable marker)
START = 'program' # grammar start symbol — FOLLOW(START) is seeded with '$'
# ═══════════════════════════════════════════════════════════════ 1. Tokenise
# The four token shapes of the EBNF meta-language, tried in this order.
# NOTE(review): findall silently skips characters that match no alternative,
# so stray punctuation in a grammar file disappears rather than erroring.
_TOK = re.compile('|'.join((
    r'"(?:[^"\\]|\\.)*"',   # "quoted terminal string"
    r'[A-Z][A-Z0-9_]*',     # UPPERCASE token class (terminal)
    r'[a-z][a-z0-9_]*',     # lowercase identifier (non-terminal)
    r'[=;,|()\[\]{}]',      # single-char punctuation
)))


def tokenise(src: str) -> list:
    """Split EBNF source into a flat token list, discarding (* … *) comments."""
    stripped = re.sub(r'\(\*.*?\*\)', ' ', src, flags=re.DOTALL)
    return _TOK.findall(stripped)
# ═══════════════════════════════════════════════════════════════ 2. Parse EBNF → AST
#
# Each AST node is a tuple:
# ('lit', s) terminal — quoted string "…" or UPPERCASE token class
# ('nt', s) non-terminal reference
# ('seq', [...]) concatenation (A , B , C)
# ('alt', [...]) alternation (A | B | C)
# ('opt', node) optional [ … ]
# ('rep', node) repetition { … }
class _Parser:
    """Recursive-descent parser turning an EBNF token list into the AST
    described above ('lit'/'nt'/'seq'/'alt'/'opt'/'rep' tuples).

    All parse failures are reported as SyntaxError so that main() can
    catch them and print a friendly "EBNF parse error" message.
    """

    def __init__(self, tokens):
        self._t = tokens
        self._i = 0   # cursor into self._t

    def _peek(self):
        # Next token without consuming it; None at end of input.
        return self._t[self._i] if self._i < len(self._t) else None

    def _eat(self, expected=None):
        # Consume and return the next token, optionally verifying it.
        # Bug fix: previously this indexed past the end of the token list on
        # truncated input, raising IndexError — which main() does not catch.
        # Raise SyntaxError instead so truncation is reported as a parse error.
        if self._i >= len(self._t):
            suffix = f' (expected {expected!r})' if expected else ''
            raise SyntaxError(f'unexpected end of input{suffix}')
        v = self._t[self._i]; self._i += 1
        if expected and v != expected:
            raise SyntaxError(f'expected {expected!r}, got {v!r} '
                              f'(token #{self._i - 1})')
        return v

    def parse_grammar(self) -> dict:
        """Parse the whole stream as a series of `name = body ;` rules."""
        rules = {}
        while self._i < len(self._t):
            name = self._eat()
            self._eat('=')
            rules[name] = self._body()
            self._eat(';')
        return rules

    def _body(self):
        # body = seq { "|" seq }
        alts = [self._seq()]
        while self._peek() == '|':
            self._eat()
            alts.append(self._seq())
        return alts[0] if len(alts) == 1 else ('alt', alts)

    def _seq(self):
        # seq = atom { "," atom }
        items = [self._atom()]
        while self._peek() == ',':
            self._eat()
            items.append(self._atom())
        return items[0] if len(items) == 1 else ('seq', items)

    def _atom(self):
        # atom = "[" body "]" | "{" body "}" | "(" body ")" | terminal | nt
        t = self._peek()
        if t == '[':
            self._eat(); b = self._body(); self._eat(']')
            return ('opt', b)
        if t == '{':
            self._eat(); b = self._body(); self._eat('}')
            return ('rep', b)
        if t == '(':
            self._eat(); b = self._body(); self._eat(')')
            return b  # group — return inner node directly
        if t and (t[0] == '"' or t[0].isupper()):
            return ('lit', self._eat())
        if t and t[0].islower():
            return ('nt', self._eat())
        raise SyntaxError(f'unexpected token {t!r}')
# ═══════════════════════════════════════════════════════════════ 3. Normalise
def normalise(ebnf: dict) -> tuple:
    """
    Convert an EBNF AST to plain BNF.

    Returns
    -------
    bnf : dict[name → list[list[str]]]
        Each inner list is one production; [] = ε production.
    origins : dict[helper_name → parent_rule_name]
        Maps generated helper names back to the rule that created them.
    """
    bnf: dict = {}
    origins: dict = {}
    serial = _count()   # one shared counter → globally unique helper names

    def make_helper(tag: str, parent: str) -> str:
        # Allocate a fresh helper non-terminal and record its parent rule.
        name = f'_{tag}{next(serial)}'
        origins[name] = parent
        return name

    def expand(node, rule: str, in_seq: bool = False) -> list:
        """
        Return the list of alternative symbol sequences for this AST node.

        in_seq: when True, an 'alt' node is wrapped in a fresh non-terminal
        instead of being inlined.  Inlining A , (B | C) , D via cross-product
        would yield two productions both starting with A — a common-prefix
        false positive that would be misreported as an LL(1) conflict.  The
        grammar is already left-factored at the EBNF level; wrapping the
        group preserves that factoring.
        """
        kind, payload = node
        if kind in ('lit', 'nt'):
            # Both terminals and non-terminal references become a bare symbol.
            return [[payload]]
        if kind == 'seq':
            # Cross-product of the children's alternatives, left to right.
            # Children are expanded with in_seq=True (see docstring).
            seqs = [[]]
            for part in payload:
                branches = expand(part, rule, in_seq=True)
                seqs = [prefix + tail for prefix in seqs for tail in branches]
            return seqs
        if kind == 'alt':
            if in_seq:
                # Alt inside a seq → fresh _grpN non-terminal; each branch
                # is then expanded at top level (in_seq=False).
                helper = make_helper('grp', rule)
                bnf[helper] = [seq for branch in payload
                               for seq in expand(branch, rule)]
                return [[helper]]
            # Top-level alternation: contribute the branches directly.
            return [seq for branch in payload for seq in expand(branch, rule)]
        if kind == 'opt':
            # [ body ] → _optN = body | ε
            helper = make_helper('opt', rule)
            bnf[helper] = expand(payload, rule) + [[]]
            return [[helper]]
        if kind == 'rep':
            # { body } → _repN = body , _repN | ε   (right recursion)
            helper = make_helper('rep', rule)
            bnf[helper] = [seq + [helper] for seq in expand(payload, rule)]
            bnf[helper].append([])
            return [[helper]]
        raise ValueError(f'unknown AST tag {kind!r}')

    for rule_name, body in ebnf.items():
        bnf[rule_name] = expand(body, rule_name)
    return bnf, origins
# ═══════════════════════════════════════════════════════════════ 4. FIRST / FOLLOW
def first_of_seq(seq: list, first: dict, bnf: dict) -> set:
    """
    FIRST set of a sequence of grammar symbols.

    Returns a set of terminal strings; EPSILON is included only when every
    symbol in the sequence is nullable (the whole sequence derives ε).
    """
    acc = set()
    for symbol in seq:
        # Any symbol without productions is treated as a terminal: it
        # contributes itself, and a terminal can never derive ε, so stop.
        if symbol not in bnf:
            acc.add(symbol)
            return acc
        symbol_first = first[symbol]
        acc.update(symbol_first - {EPSILON})
        if EPSILON not in symbol_first:
            return acc          # non-nullable symbol — nothing beyond it matters
    acc.add(EPSILON)            # fell off the end: every symbol was nullable
    return acc


def compute_first(bnf: dict) -> dict:
    """Compute FIRST for every non-terminal by fixed-point iteration."""
    first = defaultdict(set)
    while True:
        grew = False
        for name, prods in bnf.items():
            for prod in prods:
                contribution = first_of_seq(prod, first, bnf)
                if not contribution <= first[name]:
                    first[name].update(contribution)
                    grew = True
        if not grew:
            return first


def compute_follow(bnf: dict, first: dict, start: str) -> dict:
    """Compute FOLLOW for every non-terminal; '$' marks end of input."""
    follow = defaultdict(set)
    follow[start].add('$')
    while True:
        grew = False
        for name, prods in bnf.items():
            for prod in prods:
                for i, sym in enumerate(prod):
                    if sym not in bnf:
                        continue          # terminals have no FOLLOW set
                    # FIRST of everything after sym in this production.
                    tail_first = first_of_seq(prod[i + 1:], first, bnf)
                    size_before = len(follow[sym])
                    follow[sym].update(tail_first - {EPSILON})
                    if EPSILON in tail_first:
                        # Nullable tail: FOLLOW(name) flows into FOLLOW(sym).
                        follow[sym].update(follow[name])
                    if len(follow[sym]) > size_before:
                        grew = True
        if not grew:
            return follow
# ═══════════════════════════════════════════════════════════════ 5. LL(1) check
def predict_set(prod: list, name: str, first: dict, follow: dict, bnf: dict) -> set:
    """
    PREDICT(A → prod) = (FIRST(prod) {ε}) (FOLLOW(A) if ε ∈ FIRST(prod))
    """
    seq_first = first_of_seq(prod, first, bnf)
    predict = seq_first - {EPSILON}
    if EPSILON in seq_first:
        # Nullable production: it can be chosen on anything that may follow A.
        predict = predict | follow[name]
    return predict
def check_ll1(bnf: dict, first: dict, follow: dict) -> list:
    """
    Verify that each non-terminal's PREDICT sets are pairwise disjoint.
    Returns a list of conflict dicts; an empty list means the grammar is LL(1).
    """
    conflicts = []
    for name, prods in bnf.items():
        predicts = [predict_set(p, name, first, follow, bnf) for p in prods]
        for i, pred_i in enumerate(predicts):
            for j in range(i + 1, len(predicts)):
                overlap = pred_i & predicts[j]
                if overlap:
                    conflicts.append({
                        'rule': name,
                        'prod_i': prods[i],
                        'prod_j': prods[j],
                        'conflict': sorted(overlap),
                    })
    return conflicts
# ═══════════════════════════════════════════════════════════════ 6. Main
def _fmt_prod(prod: list) -> str:
    """Render a production as space-joined symbols; ε for the empty production."""
    if not prod:
        return EPSILON
    return ' '.join(prod)
def main():
    """CLI entry point: load a grammar file, run the LL(1) check, print a report.

    Usage: python ll1_check.py [grammar_file] [-v]
    Exits via sys.exit() with a message on missing file or EBNF parse error.
    """
    argv = sys.argv[1:]
    verbose = '-v' in argv
    # Any argument not starting with '-' is treated as the grammar path;
    # only the first one is used, defaulting to GRAMMAR.ebnf.
    positional = [a for a in argv if not a.startswith('-')]
    path = Path(positional[0]) if positional else Path('GRAMMAR.ebnf')
    # ── Load & parse ──────────────────────────────────────────────────────
    print(f'Checking {path}')
    try:
        src = path.read_text(encoding='utf-8')
    except FileNotFoundError:
        sys.exit(f'error: file not found: {path}')
    toks = tokenise(src)
    try:
        ebnf = _Parser(toks).parse_grammar()
    except SyntaxError as exc:
        sys.exit(f'EBNF parse error: {exc}')
    bnf, origins = normalise(ebnf)
    first = compute_first(bnf)
    follow = compute_follow(bnf, first, START)
    errors = check_ll1(bnf, first, follow)
    # ── Summary line ──────────────────────────────────────────────────────
    # Helper rules generated by normalise() all start with '_'.
    named = sorted(n for n in bnf if not n.startswith('_'))
    helpers = sorted(n for n in bnf if n.startswith('_'))
    print(f' {len(named)} named rules, {len(helpers)} generated helper rules\n')
    # ── Optional verbose output ───────────────────────────────────────────
    if verbose:
        # Column width for aligned '=' signs; default=0 guards an empty grammar.
        col = max((len(n) for n in named), default=0) + 2
        print('── FIRST sets (named rules) ──────────────────────────────')
        for n in named:
            syms = sorted(first[n] - {EPSILON})
            nullable = ' [nullable]' if EPSILON in first[n] else ''
            print(f' FIRST({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}{nullable}')
        print()
        print('── FOLLOW sets (named rules) ─────────────────────────────')
        for n in named:
            syms = sorted(follow[n])
            print(f' FOLLOW({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}')
        print()
    # ── LL(1) result ──────────────────────────────────────────────────────
    # Conflicts are reported separately for hand-written rules and for the
    # helpers generated during EBNF→BNF normalisation.
    named_err = [e for e in errors if not e['rule'].startswith('_')]
    helper_err = [e for e in errors if e['rule'].startswith('_')]
    if not errors:
        print('✓ Grammar is LL(1) — no conflicts detected.')
        return
    print(f'{len(errors)} conflict(s): '
          f'{len(named_err)} in named rules, '
          f'{len(helper_err)} in generated helpers\n')
    for e in named_err:
        print(f' Rule [{e["rule"]}]')
        print(f' alt A : {_fmt_prod(e["prod_i"])}')
        print(f' alt B : {_fmt_prod(e["prod_j"])}')
        print(f' ambiguous token(s): {e["conflict"]}\n')
    if helper_err:
        # Each helper conflict is traced back to the named rule that spawned
        # the helper, via the origins map returned by normalise().
        print(' Conflicts in generated helpers '
              '(each is linked back to its enclosing named rule):')
        for e in helper_err:
            orig = origins.get(e['rule'], '?')
            print(f' [{e["rule"]}] ← from rule [{orig}]')
            print(f' alt A : {_fmt_prod(e["prod_i"])}')
            print(f' alt B : {_fmt_prod(e["prod_j"])}')
            print(f' ambiguous token(s): {e["conflict"]}\n')


if __name__ == '__main__':
    main()