Files
flux/ll1_check.py
Jooris Hadeler 73e36fac71 Initial Flux language specification
Add the LL(1) context-free grammar (GRAMMAR.ebnf), token and syntax
reference (SYNTAX.md), LL(1) verification tool (ll1_check.py), and a
fibonacci example demonstrating the language.
2026-03-10 14:41:54 +01:00

363 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
ll1_check.py — Parse GRAMMAR.ebnf and verify the LL(1) property.
Usage: python ll1_check.py [grammar_file] [-v]
Algorithm
---------
1. Strip (* … *) comments; tokenise.
2. Parse ISO/IEC 14977 EBNF into an AST.
3. Normalise to plain BNF by introducing fresh helper non-terminals:
{ body } → _repN where _repN = body , _repN | ε
[ body ] → _optN where _optN = body | ε
( body ) → inlined (cross-product inside the parent sequence)
4. Compute FIRST and FOLLOW sets (fixed-point iteration).
5. For each non-terminal compute PREDICT sets; flag pairwise conflicts.
"""
import re
import sys
from collections import defaultdict
from itertools import count as _count
from pathlib import Path
# Sentinel "terminal" standing for the empty string in FIRST/PREDICT sets.
EPSILON = 'ε'
START = 'program'  # grammar start symbol
# ═══════════════════════════════════════════════════════════════ 1. Tokenise
# One alternation covering every lexeme class of the EBNF notation; the
# quoted-string branch comes first so quotes are never split by the
# punctuation branch.
_TOK = re.compile(
    r'"(?:[^"\\]|\\.)*"'   # "quoted terminal string" (backslash escapes)
    r'|[A-Z][A-Z0-9_]*'    # UPPERCASE token class (terminal)
    r'|[a-z][a-z0-9_]*'    # lowercase identifier (non-terminal)
    r'|[=;,|()\[\]{}]'     # single-char punctuation
)


def tokenise(src: str) -> list:
    """Split EBNF source into a flat token list, dropping (* ... *) comments."""
    without_comments = re.sub(r'\(\*.*?\*\)', ' ', src, flags=re.DOTALL)
    return _TOK.findall(without_comments)
# ═══════════════════════════════════════════════════════════════ 2. Parse EBNF → AST
#
# Each AST node is a tuple:
# ('lit', s) terminal — quoted string "…" or UPPERCASE token class
# ('nt', s) non-terminal reference
# ('seq', [...]) concatenation (A , B , C)
# ('alt', [...]) alternation (A | B | C)
# ('opt', node) optional [ … ]
# ('rep', node) repetition { … }
class _Parser:
def __init__(self, tokens):
self._t = tokens
self._i = 0
def _peek(self):
return self._t[self._i] if self._i < len(self._t) else None
def _eat(self, expected=None):
v = self._t[self._i]; self._i += 1
if expected and v != expected:
raise SyntaxError(f'expected {expected!r}, got {v!r} '
f'(token #{self._i - 1})')
return v
def parse_grammar(self) -> dict:
rules = {}
while self._i < len(self._t):
name = self._eat()
self._eat('=')
rules[name] = self._body()
self._eat(';')
return rules
def _body(self):
alts = [self._seq()]
while self._peek() == '|':
self._eat()
alts.append(self._seq())
return alts[0] if len(alts) == 1 else ('alt', alts)
def _seq(self):
items = [self._atom()]
while self._peek() == ',':
self._eat()
items.append(self._atom())
return items[0] if len(items) == 1 else ('seq', items)
def _atom(self):
t = self._peek()
if t == '[':
self._eat(); b = self._body(); self._eat(']')
return ('opt', b)
if t == '{':
self._eat(); b = self._body(); self._eat('}')
return ('rep', b)
if t == '(':
self._eat(); b = self._body(); self._eat(')')
return b # group — return inner node directly
if t and (t[0] == '"' or t[0].isupper()):
return ('lit', self._eat())
if t and t[0].islower():
return ('nt', self._eat())
raise SyntaxError(f'unexpected token {t!r}')
# ═══════════════════════════════════════════════════════════════ 3. Normalise
def normalise(ebnf: dict) -> tuple:
    """
    Lower the EBNF AST to plain BNF.

    Returns
    -------
    bnf : dict[name → list[list[str]]]
        Each inner list is one production; [] encodes the ε production.
    origins : dict[helper_name → parent_rule_name]
        Links every generated helper back to the named rule it came from.
    """
    bnf: dict = {}
    origins: dict = {}
    serial = _count()

    def fresh(tag: str, rule: str) -> str:
        # Mint a unique helper name and remember which rule spawned it.
        helper = f'_{tag}{next(serial)}'
        origins[helper] = rule
        return helper

    def expand(node, rule: str, in_seq: bool = False) -> list:
        """
        Expand one AST node into a list of alternative symbol sequences.

        When in_seq is True, an 'alt' child is wrapped in a fresh helper
        non-terminal rather than inlined — otherwise the cross-product of
        A , (B | C) , D would yield two productions sharing the prefix A,
        a common-prefix false positive that would be misreported as an
        LL(1) conflict.  The grammar is left-factored at the EBNF level,
        and wrapping preserves that property.
        """
        tag = node[0]
        if tag in ('lit', 'nt'):
            # Leaf: a single one-symbol alternative.
            return [[node[1]]]
        if tag == 'seq':
            # Cross-product of the children's alternatives, left to right.
            # in_seq=True so nested alts become helper non-terminals.
            seqs = [[]]
            for child in node[1]:
                child_alts = expand(child, rule, in_seq=True)
                seqs = [prefix + alt for prefix in seqs for alt in child_alts]
            return seqs
        if tag == 'alt':
            if not in_seq:
                # Top level of a rule body: flatten alternatives directly.
                return [alt for child in node[1]
                        for alt in expand(child, rule, in_seq=False)]
            # Alt inside a seq: introduce _grpN = alt1 | alt2 | …
            helper = fresh('grp', rule)
            bnf[helper] = [alt for child in node[1]
                           for alt in expand(child, rule, in_seq=False)]
            return [[helper]]
        if tag == 'opt':
            # [ body ]  →  _optN = body | ε
            helper = fresh('opt', rule)
            bnf[helper] = expand(node[1], rule) + [[]]
            return [[helper]]
        if tag == 'rep':
            # { body }  →  _repN = body , _repN | ε   (right recursion)
            helper = fresh('rep', rule)
            bnf[helper] = [alt + [helper] for alt in expand(node[1], rule)] + [[]]
            return [[helper]]
        raise ValueError(f'unknown AST tag {tag!r}')

    for rule_name, ast in ebnf.items():
        bnf[rule_name] = expand(ast, rule_name)
    return bnf, origins
# ═══════════════════════════════════════════════════════════════ 4. FIRST / FOLLOW
def first_of_seq(seq: list, first: dict, bnf: dict) -> set:
    """
    FIRST set of a sequence of grammar symbols.

    A symbol is a terminal exactly when it has no entry in `bnf`.  The
    result contains EPSILON only if every symbol of `seq` is nullable
    (vacuously true for the empty sequence).
    """
    acc = set()
    for symbol in seq:
        if symbol not in bnf:
            # Terminal: contributes itself; terminals never derive ε,
            # so nothing after it is reachable.
            acc.add(symbol)
            return acc
        sym_first = first[symbol]
        acc |= sym_first - {EPSILON}
        if EPSILON not in sym_first:
            return acc  # not nullable — stop scanning
    acc.add(EPSILON)  # fell through: every symbol was nullable
    return acc
def compute_first(bnf: dict) -> dict:
    """Compute FIRST for every non-terminal by fixed-point iteration."""
    first = defaultdict(set)
    dirty = True
    while dirty:
        dirty = False
        for nt, productions in bnf.items():
            for production in productions:
                contribution = first_of_seq(production, first, bnf)
                # Anything not yet recorded for this rule forces another pass.
                if contribution - first[nt]:
                    first[nt] |= contribution
                    dirty = True
    return first
def compute_follow(bnf: dict, first: dict, start: str) -> dict:
    """Compute FOLLOW by fixed point; '$' marks end-of-input after `start`."""
    follow = defaultdict(set)
    follow[start].add('$')
    dirty = True
    while dirty:
        dirty = False
        for head, productions in bnf.items():
            for production in productions:
                for pos, symbol in enumerate(production):
                    if symbol not in bnf:
                        continue  # terminals have no FOLLOW set
                    # FIRST of the suffix following `symbol` in this production.
                    tail_first = first_of_seq(production[pos + 1:], first, bnf)
                    size_before = len(follow[symbol])
                    follow[symbol] |= tail_first - {EPSILON}
                    if EPSILON in tail_first:
                        # Nullable suffix: symbol inherits FOLLOW of the head.
                        follow[symbol] |= follow[head]
                    if len(follow[symbol]) != size_before:
                        dirty = True
    return follow
# ═══════════════════════════════════════════════════════════════ 5. LL(1) check
def predict_set(prod: list, name: str, first: dict, follow: dict, bnf: dict) -> set:
    """
    PREDICT(name → prod): FIRST(prod) without ε, extended with FOLLOW(name)
    whenever the production is nullable.
    """
    seq_first = first_of_seq(prod, first, bnf)
    predict = seq_first - {EPSILON}
    if EPSILON in seq_first:
        predict |= follow[name]
    return predict
def check_ll1(bnf: dict, first: dict, follow: dict) -> list:
    """
    Verify that the PREDICT sets of each rule are pairwise disjoint.

    Returns a list of conflict records (empty ⇒ the grammar is LL(1)),
    each carrying the rule name, the two clashing productions and the
    sorted set of tokens on which they overlap.
    """
    errors = []
    for name, prods in bnf.items():
        predicts = [predict_set(p, name, first, follow, bnf) for p in prods]
        for i, left in enumerate(predicts):
            for j in range(i + 1, len(predicts)):
                overlap = left & predicts[j]
                if overlap:
                    errors.append({
                        'rule': name,
                        'prod_i': prods[i],
                        'prod_j': prods[j],
                        'conflict': sorted(overlap),
                    })
    return errors
# ═══════════════════════════════════════════════════════════════ 6. Main
def _fmt_prod(prod: list) -> str:
    """Render a production for display; the empty production prints as ε."""
    if not prod:
        return EPSILON
    return ' '.join(prod)
def main():
    """CLI entry point: load the grammar, run the LL(1) check, print a report.

    Usage: python ll1_check.py [grammar_file] [-v]
    Exits via sys.exit with a message on missing file or EBNF parse error.
    """
    argv = sys.argv[1:]
    verbose = '-v' in argv
    # Anything not starting with '-' is treated as the grammar path.
    positional = [a for a in argv if not a.startswith('-')]
    path = Path(positional[0]) if positional else Path('GRAMMAR.ebnf')
    # ── Load & parse ──────────────────────────────────────────────────────
    print(f'Checking {path}')
    try:
        src = path.read_text(encoding='utf-8')
    except FileNotFoundError:
        sys.exit(f'error: file not found: {path}')
    toks = tokenise(src)
    try:
        ebnf = _Parser(toks).parse_grammar()
    except SyntaxError as exc:
        sys.exit(f'EBNF parse error: {exc}')
    bnf, origins = normalise(ebnf)
    first = compute_first(bnf)
    follow = compute_follow(bnf, first, START)
    errors = check_ll1(bnf, first, follow)
    # ── Summary line ──────────────────────────────────────────────────────
    # Helper rules generated by normalise() all start with '_'.
    named = sorted(n for n in bnf if not n.startswith('_'))
    helpers = sorted(n for n in bnf if n.startswith('_'))
    print(f' {len(named)} named rules, {len(helpers)} generated helper rules\n')
    # ── Optional verbose output ───────────────────────────────────────────
    if verbose:
        # Column width for aligning the '=' across all rule names.
        col = max((len(n) for n in named), default=0) + 2
        print('── FIRST sets (named rules) ──────────────────────────────')
        for n in named:
            syms = sorted(first[n] - {EPSILON})
            nullable = ' [nullable]' if EPSILON in first[n] else ''
            print(f' FIRST({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}{nullable}')
        print()
        print('── FOLLOW sets (named rules) ─────────────────────────────')
        for n in named:
            syms = sorted(follow[n])
            print(f' FOLLOW({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}')
        print()
    # ── LL(1) result ──────────────────────────────────────────────────────
    # Report conflicts in named rules first, then those inside helpers.
    named_err = [e for e in errors if not e['rule'].startswith('_')]
    helper_err = [e for e in errors if e['rule'].startswith('_')]
    if not errors:
        print('✓ Grammar is LL(1) — no conflicts detected.')
        return
    print(f'{len(errors)} conflict(s): '
          f'{len(named_err)} in named rules, '
          f'{len(helper_err)} in generated helpers\n')
    for e in named_err:
        print(f' Rule [{e["rule"]}]')
        print(f' alt A : {_fmt_prod(e["prod_i"])}')
        print(f' alt B : {_fmt_prod(e["prod_j"])}')
        print(f' ambiguous token(s): {e["conflict"]}\n')
    if helper_err:
        print(' Conflicts in generated helpers '
              '(each is linked back to its enclosing named rule):')
        for e in helper_err:
            # origins maps helper names back to their parent rule.
            orig = origins.get(e['rule'], '?')
            print(f' [{e["rule"]}] ← from rule [{orig}]')
            print(f' alt A : {_fmt_prod(e["prod_i"])}')
            print(f' alt B : {_fmt_prod(e["prod_j"])}')
            print(f' ambiguous token(s): {e["conflict"]}\n')


if __name__ == '__main__':
    main()