Add the LL(1) context-free grammar (GRAMMAR.ebnf), token and syntax reference (SYNTAX.md), LL(1) verification tool (ll1_check.py), and a fibonacci example demonstrating the language.
363 lines · 14 KiB · Python
#!/usr/bin/env python3
"""
ll1_check.py — Parse GRAMMAR.ebnf and verify the LL(1) property.

Usage: python ll1_check.py [grammar_file] [-v]

Algorithm
---------
1. Strip (* … *) comments; tokenise.
2. Parse ISO/IEC 14977 EBNF into an AST.
3. Normalise to plain BNF by introducing fresh helper non-terminals:
     { body }  →  _repN  where  _repN = body , _repN | ε
     [ body ]  →  _optN  where  _optN = body | ε
     ( body )  →  inlined, except that a grouped alternation appearing
                  inside a sequence becomes a fresh _grpN rule
                  (see normalise() for why)
4. Compute FIRST and FOLLOW sets (fixed-point iteration).
5. For each non-terminal compute PREDICT sets; flag pairwise conflicts.
"""

import re
import sys
from collections import defaultdict
from itertools import count as _count
from pathlib import Path

# Sentinel pseudo-terminal: marks "derives the empty string" in FIRST sets;
# the empty production itself is represented as [] in the normalised BNF.
EPSILON = 'ε'
START = 'program' # grammar start symbol
# ═══════════════════════════════════════════════════════════════ 1. Tokenise

# One alternative per lexical category. Quoted strings come first so that
# punctuation characters inside quotes are never matched on their own.
_TOK = re.compile('|'.join((
    r'"(?:[^"\\]|\\.)*"',   # "quoted terminal string"
    r'[A-Z][A-Z0-9_]*',     # UPPERCASE token class (terminal)
    r'[a-z][a-z0-9_]*',     # lowercase identifier (non-terminal)
    r'[=;,|()\[\]{}]',      # single-char punctuation
)))


def tokenise(src: str) -> list:
    """Split EBNF source into a flat token list, discarding (* … *) comments."""
    without_comments = re.sub(r'\(\*.*?\*\)', ' ', src, flags=re.DOTALL)
    return _TOK.findall(without_comments)
# ═══════════════════════════════════════════════════════════════ 2. Parse EBNF → AST
|
||
#
|
||
# Each AST node is a tuple:
|
||
# ('lit', s) terminal — quoted string "…" or UPPERCASE token class
|
||
# ('nt', s) non-terminal reference
|
||
# ('seq', [...]) concatenation (A , B , C)
|
||
# ('alt', [...]) alternation (A | B | C)
|
||
# ('opt', node) optional [ … ]
|
||
# ('rep', node) repetition { … }
|
||
|
||
class _Parser:
|
||
def __init__(self, tokens):
|
||
self._t = tokens
|
||
self._i = 0
|
||
|
||
def _peek(self):
|
||
return self._t[self._i] if self._i < len(self._t) else None
|
||
|
||
def _eat(self, expected=None):
|
||
v = self._t[self._i]; self._i += 1
|
||
if expected and v != expected:
|
||
raise SyntaxError(f'expected {expected!r}, got {v!r} '
|
||
f'(token #{self._i - 1})')
|
||
return v
|
||
|
||
def parse_grammar(self) -> dict:
|
||
rules = {}
|
||
while self._i < len(self._t):
|
||
name = self._eat()
|
||
self._eat('=')
|
||
rules[name] = self._body()
|
||
self._eat(';')
|
||
return rules
|
||
|
||
def _body(self):
|
||
alts = [self._seq()]
|
||
while self._peek() == '|':
|
||
self._eat()
|
||
alts.append(self._seq())
|
||
return alts[0] if len(alts) == 1 else ('alt', alts)
|
||
|
||
def _seq(self):
|
||
items = [self._atom()]
|
||
while self._peek() == ',':
|
||
self._eat()
|
||
items.append(self._atom())
|
||
return items[0] if len(items) == 1 else ('seq', items)
|
||
|
||
def _atom(self):
|
||
t = self._peek()
|
||
if t == '[':
|
||
self._eat(); b = self._body(); self._eat(']')
|
||
return ('opt', b)
|
||
if t == '{':
|
||
self._eat(); b = self._body(); self._eat('}')
|
||
return ('rep', b)
|
||
if t == '(':
|
||
self._eat(); b = self._body(); self._eat(')')
|
||
return b # group — return inner node directly
|
||
if t and (t[0] == '"' or t[0].isupper()):
|
||
return ('lit', self._eat())
|
||
if t and t[0].islower():
|
||
return ('nt', self._eat())
|
||
raise SyntaxError(f'unexpected token {t!r}')
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════ 3. Normalise

def normalise(ebnf: dict) -> tuple:
    """
    Convert EBNF AST to plain BNF.

    Returns
    -------
    bnf : dict[name → list[list[str]]]
        Each inner list is one production; [] = ε production.
    origins : dict[helper_name → parent_rule_name]
        Maps generated helper names back to the rule that created them.
    """
    bnf: dict = {}
    origins: dict = {}
    serial = _count()

    def fresh(tag: str, rule: str) -> str:
        # Mint a unique helper name and remember which rule spawned it.
        helper = f'_{tag}{next(serial)}'
        origins[helper] = rule
        return helper

    def expand(node, rule: str, in_seq: bool = False) -> list:
        """
        Return a list of alternative symbol sequences for this AST node.

        in_seq: when True, an 'alt' node is wrapped in a fresh non-terminal
                instead of being inlined. This prevents the cross-product
                expansion of A , (B | C) , D from producing two productions
                that both start with A — a common-prefix false positive that
                would be misreported as an LL(1) conflict. The grammar is
                already left-factored at the EBNF level; this preserves that.
        """
        tag = node[0]

        # Leaves: a single one-symbol sequence.
        if tag in ('lit', 'nt'):
            return [[node[1]]]

        if tag == 'seq':
            # Left-to-right cross-product. Children are expanded with
            # in_seq=True so embedded alternations become _grpN helpers.
            seqs = [[]]
            for child in node[1]:
                expanded = expand(child, rule, in_seq=True)
                seqs = [prefix + tail for prefix in seqs for tail in expanded]
            return seqs

        if tag == 'alt':
            if in_seq:
                # Alt inside a seq: wrap in a fresh non-terminal (_grpN),
                # each alternative expanded at top level (in_seq=False).
                helper = fresh('grp', rule)
                wrapped = []
                for child in node[1]:
                    wrapped.extend(expand(child, rule, in_seq=False))
                bnf[helper] = wrapped
                return [[helper]]
            # Alt at the top level of a rule body: inline the alternatives.
            inlined = []
            for child in node[1]:
                inlined.extend(expand(child, rule, in_seq=False))
            return inlined

        if tag == 'opt':
            # [ body ] → _optN = body | ε
            helper = fresh('opt', rule)
            bnf[helper] = expand(node[1], rule) + [[]]
            return [[helper]]

        if tag == 'rep':
            # { body } → _repN = body , _repN | ε  (right recursion)
            helper = fresh('rep', rule)
            bnf[helper] = [seq + [helper] for seq in expand(node[1], rule)] + [[]]
            return [[helper]]

        raise ValueError(f'unknown AST tag {tag!r}')

    for name, node in ebnf.items():
        bnf[name] = expand(node, name)

    return bnf, origins
# ═══════════════════════════════════════════════════════════════ 4. FIRST / FOLLOW

def first_of_seq(seq: list, first: dict, bnf: dict) -> set:
    """
    FIRST set of a sequence of grammar symbols.

    Returns a set of terminal strings; includes EPSILON if the whole
    sequence can derive the empty string.
    """
    out = set()
    for sym in seq:
        if sym not in bnf:
            # Anything without productions is a terminal; terminals never
            # derive ε, so the scan stops here.
            out.add(sym)
            return out
        out.update(first[sym] - {EPSILON})
        if EPSILON not in first[sym]:
            return out  # this symbol is not nullable — stop
    # Fell off the end: every symbol (or the empty sequence) was nullable.
    out.add(EPSILON)
    return out
def compute_first(bnf: dict) -> dict:
    """Fixed-point computation of FIRST for every non-terminal in *bnf*."""
    first = defaultdict(set)
    while True:
        grew = False
        for name, prods in bnf.items():
            for prod in prods:
                addition = first_of_seq(prod, first, bnf)
                # Union in anything new; iterate again until stable.
                if not addition <= first[name]:
                    first[name] |= addition
                    grew = True
        if not grew:
            return first
def compute_follow(bnf: dict, first: dict, start: str) -> dict:
    """
    Fixed-point computation of FOLLOW for every non-terminal.

    '$' (end of input) is seeded into FOLLOW(start).
    """
    follow = defaultdict(set)
    follow[start].add('$')
    grew = True
    while grew:
        grew = False
        for name, prods in bnf.items():
            for prod in prods:
                for i, sym in enumerate(prod):
                    if sym not in bnf:
                        continue  # terminals have no FOLLOW set
                    # FIRST of whatever follows sym in this production.
                    tail_first = first_of_seq(prod[i + 1:], first, bnf)
                    size = len(follow[sym])
                    follow[sym].update(tail_first - {EPSILON})
                    if EPSILON in tail_first:
                        # sym can end the production — inherit FOLLOW(name).
                        follow[sym].update(follow[name])
                    if len(follow[sym]) != size:
                        grew = True
    return follow
# ═══════════════════════════════════════════════════════════════ 5. LL(1) check

def predict_set(prod: list, name: str, first: dict, follow: dict, bnf: dict) -> set:
    """
    PREDICT(A → prod) = (FIRST(prod) − {ε}) ∪ (FOLLOW(A) if ε ∈ FIRST(prod))
    """
    fset = first_of_seq(prod, first, bnf)
    pred = fset - {EPSILON}
    if EPSILON in fset:
        # Nullable production: it is predicted by whatever may follow A.
        return pred | follow[name]
    return pred
def check_ll1(bnf: dict, first: dict, follow: dict) -> list:
    """
    For each non-terminal check that all PREDICT sets are pairwise disjoint.
    Returns a list of conflict dicts.
    """
    conflicts = []
    for name, prods in bnf.items():
        predicts = [predict_set(p, name, first, follow, bnf) for p in prods]
        for i, pred_i in enumerate(predicts):
            for j in range(i + 1, len(predicts)):
                overlap = pred_i & predicts[j]
                if not overlap:
                    continue
                conflicts.append({
                    'rule': name,
                    'prod_i': prods[i],
                    'prod_j': prods[j],
                    'conflict': sorted(overlap),
                })
    return conflicts
# ═══════════════════════════════════════════════════════════════ 6. Main
|
||
|
||
def _fmt_prod(prod: list) -> str:
|
||
return ' '.join(prod) if prod else EPSILON
|
||
|
||
|
||
def main():
    """CLI entry point: load the grammar, run the LL(1) check, report.

    Accepts an optional positional grammar path (default GRAMMAR.ebnf)
    and a -v flag for verbose FIRST/FOLLOW output. Exits non-zero via
    sys.exit() on a missing file or an EBNF parse error.
    """
    argv = sys.argv[1:]
    verbose = '-v' in argv
    # Anything not starting with '-' is treated as the grammar path.
    positional = [a for a in argv if not a.startswith('-')]
    path = Path(positional[0]) if positional else Path('GRAMMAR.ebnf')

    # ── Load & parse ──────────────────────────────────────────────────────
    print(f'Checking {path} …')
    try:
        src = path.read_text(encoding='utf-8')
    except FileNotFoundError:
        sys.exit(f'error: file not found: {path}')

    toks = tokenise(src)
    try:
        ebnf = _Parser(toks).parse_grammar()
    except SyntaxError as exc:
        sys.exit(f'EBNF parse error: {exc}')

    # Pipeline: EBNF AST → plain BNF → FIRST/FOLLOW → PREDICT conflicts.
    bnf, origins = normalise(ebnf)
    first = compute_first(bnf)
    follow = compute_follow(bnf, first, START)
    errors = check_ll1(bnf, first, follow)

    # ── Summary line ──────────────────────────────────────────────────────
    # Helper rules generated by normalise() all start with '_'.
    named = sorted(n for n in bnf if not n.startswith('_'))
    helpers = sorted(n for n in bnf if n.startswith('_'))
    print(f' {len(named)} named rules, {len(helpers)} generated helper rules\n')

    # ── Optional verbose output ───────────────────────────────────────────
    if verbose:
        # Column width aligns the '=' across rule names (default=0 guards
        # the empty-grammar case).
        col = max((len(n) for n in named), default=0) + 2
        print('── FIRST sets (named rules) ──────────────────────────────')
        for n in named:
            syms = sorted(first[n] - {EPSILON})
            nullable = ' [nullable]' if EPSILON in first[n] else ''
            print(f' FIRST({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}{nullable}')
        print()
        print('── FOLLOW sets (named rules) ─────────────────────────────')
        for n in named:
            syms = sorted(follow[n])
            print(f' FOLLOW({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}')
        print()

    # ── LL(1) result ──────────────────────────────────────────────────────
    named_err = [e for e in errors if not e['rule'].startswith('_')]
    helper_err = [e for e in errors if e['rule'].startswith('_')]

    if not errors:
        print('✓ Grammar is LL(1) — no conflicts detected.')
        return

    print(f'✗ {len(errors)} conflict(s): '
          f'{len(named_err)} in named rules, '
          f'{len(helper_err)} in generated helpers\n')

    for e in named_err:
        print(f' Rule [{e["rule"]}]')
        print(f' alt A : {_fmt_prod(e["prod_i"])}')
        print(f' alt B : {_fmt_prod(e["prod_j"])}')
        print(f' ambiguous token(s): {e["conflict"]}\n')

    if helper_err:
        print(' Conflicts in generated helpers '
              '(each is linked back to its enclosing named rule):')
        for e in helper_err:
            # origins maps _optN/_repN/_grpN back to its parent rule.
            orig = origins.get(e['rule'], '?')
            print(f' [{e["rule"]}] ← from rule [{orig}]')
            print(f' alt A : {_fmt_prod(e["prod_i"])}')
            print(f' alt B : {_fmt_prod(e["prod_j"])}')
            print(f' ambiguous token(s): {e["conflict"]}\n')
# Script entry point — run the checker only when executed directly.
if __name__ == '__main__':
    main()