#!/usr/bin/env python3
"""
ll1_check.py — Parse GRAMMAR.ebnf and verify the LL(1) property.

Usage:
    python ll1_check.py [grammar_file] [-v]

Algorithm
---------
1. Strip (* … *) comments; tokenise.
2. Parse ISO/IEC 14977 EBNF into an AST.
3. Normalise to plain BNF by introducing fresh helper non-terminals:
       { body }  →  _repN  where  _repN = body , _repN | ε
       [ body ]  →  _optN  where  _optN = body | ε
       ( body )  →  inlined (cross-product inside the parent sequence)
4. Compute FIRST and FOLLOW sets (fixed-point iteration).
5. For each non-terminal compute PREDICT sets; flag pairwise conflicts.
"""

import re
import sys
from collections import defaultdict
from itertools import count as _count
from pathlib import Path

EPSILON = 'ε'
START = 'program'     # grammar start symbol

# ═══════════════════════════════════════════════════════════════ 1. Tokenise

# (* … *) comments may span lines, hence DOTALL.
_COMMENT = re.compile(r'\(\*.*?\*\)', re.DOTALL)

# One alternative per token category; alternation is ordered, so the
# quoted-string branch is tried before the bare-identifier branches.
_TOK = re.compile(
    '|'.join((
        r'"(?:[^"\\]|\\.)*"',    # "quoted terminal string"
        r'[A-Z][A-Z0-9_]*',      # UPPERCASE token class (terminal)
        r'[a-z][a-z0-9_]*',      # lowercase identifier (non-terminal)
        r'[=;,|()\[\]{}]',       # single-char punctuation
    ))
)


def tokenise(src: str) -> list:
    """Split EBNF source into a flat token list.

    Comments are blanked out first; characters matched by no alternative
    (whitespace and anything unexpected) are silently skipped.
    """
    uncommented = _COMMENT.sub(' ', src)
    return [m.group(0) for m in _TOK.finditer(uncommented)]
Parse EBNF → AST # # Each AST node is a tuple: # ('lit', s) terminal — quoted string "…" or UPPERCASE token class # ('nt', s) non-terminal reference # ('seq', [...]) concatenation (A , B , C) # ('alt', [...]) alternation (A | B | C) # ('opt', node) optional [ … ] # ('rep', node) repetition { … } class _Parser: def __init__(self, tokens): self._t = tokens self._i = 0 def _peek(self): return self._t[self._i] if self._i < len(self._t) else None def _eat(self, expected=None): v = self._t[self._i]; self._i += 1 if expected and v != expected: raise SyntaxError(f'expected {expected!r}, got {v!r} ' f'(token #{self._i - 1})') return v def parse_grammar(self) -> dict: rules = {} while self._i < len(self._t): name = self._eat() self._eat('=') rules[name] = self._body() self._eat(';') return rules def _body(self): alts = [self._seq()] while self._peek() == '|': self._eat() alts.append(self._seq()) return alts[0] if len(alts) == 1 else ('alt', alts) def _seq(self): items = [self._atom()] while self._peek() == ',': self._eat() items.append(self._atom()) return items[0] if len(items) == 1 else ('seq', items) def _atom(self): t = self._peek() if t == '[': self._eat(); b = self._body(); self._eat(']') return ('opt', b) if t == '{': self._eat(); b = self._body(); self._eat('}') return ('rep', b) if t == '(': self._eat(); b = self._body(); self._eat(')') return b # group — return inner node directly if t and (t[0] == '"' or t[0].isupper()): return ('lit', self._eat()) if t and t[0].islower(): return ('nt', self._eat()) raise SyntaxError(f'unexpected token {t!r}') # ═══════════════════════════════════════════════════════════════ 3. Normalise def normalise(ebnf: dict) -> tuple: """ Convert EBNF AST to plain BNF. Returns ------- bnf : dict[name → list[list[str]]] Each inner list is one production; [] = ε production. origins : dict[helper_name → parent_rule_name] Maps generated helper names back to the rule that created them. 
""" bnf: dict = {} origins: dict = {} ctr = _count() def fresh(tag: str, rule: str) -> str: h = f'_{tag}{next(ctr)}' origins[h] = rule return h def expand(node, rule: str, in_seq: bool = False) -> list: """ Return a list of alternative symbol sequences for this AST node. in_seq: when True, an 'alt' node is wrapped in a fresh non-terminal instead of being inlined. This prevents the cross-product expansion of A , (B | C) , D from producing two productions that both start with A — a common-prefix false positive that would be misreported as an LL(1) conflict. The grammar is already left-factored at the EBNF level; this preserves that. """ tag = node[0] if tag == 'lit': return [[node[1]]] if tag == 'nt': return [[node[1]]] if tag == 'seq': # Children of a seq are expanded with in_seq=True so that any # alt node inside the sequence becomes a fresh non-terminal. result = [[]] for child in node[1]: child_seqs = expand(child, rule, in_seq=True) result = [a + b for a in result for b in child_seqs] return result if tag == 'alt': if in_seq: # Alt inside a seq: wrap in a fresh non-terminal (_grpN). # Each alternative is expanded at top-level (in_seq=False). h = fresh('grp', rule) bnf[h] = [s for child in node[1] for s in expand(child, rule, in_seq=False)] return [[h]] # Alt at the top level of a rule body: return alternatives directly. return [s for child in node[1] for s in expand(child, rule, in_seq=False)] if tag == 'opt': # [ body ] → _optN = body | ε h = fresh('opt', rule) bnf[h] = expand(node[1], rule) + [[]] return [[h]] if tag == 'rep': # { body } → _repN = body , _repN | ε h = fresh('rep', rule) body_seqs = expand(node[1], rule) bnf[h] = [s + [h] for s in body_seqs] + [[]] return [[h]] raise ValueError(f'unknown AST tag {tag!r}') for name, node in ebnf.items(): bnf[name] = expand(node, name) return bnf, origins # ═══════════════════════════════════════════════════════════════ 4. 
# ═══════════════════════════════════════════════════════════════ 4. FIRST / FOLLOW

def first_of_seq(seq: list, first: dict, bnf: dict) -> set:
    """
    FIRST set of a sequence of grammar symbols.

    Returns a set of terminal strings; includes EPSILON if the whole
    sequence can derive the empty string.
    """
    result = set()
    for sym in seq:
        if sym not in bnf:  # terminal symbol
            result.add(sym)
            return result  # terminals never derive ε
        sym_first = first[sym]
        result |= sym_first - {EPSILON}
        if EPSILON not in sym_first:
            return result  # this symbol is not nullable — stop
    result.add(EPSILON)  # every symbol in seq was nullable
    return result


def compute_first(bnf: dict) -> dict:
    """FIRST set of every non-terminal, by fixed-point iteration to stability."""
    first = defaultdict(set)
    changed = True
    while changed:
        changed = False
        for name, prods in bnf.items():
            for prod in prods:
                new = first_of_seq(prod, first, bnf)
                # Grow FIRST(name) monotonically; iterate until no set changes.
                if not new <= first[name]:
                    first[name] |= new
                    changed = True
    return first


def compute_follow(bnf: dict, first: dict, start: str) -> dict:
    """FOLLOW set of every non-terminal; '$' marks end-of-input after *start*."""
    follow = defaultdict(set)
    follow[start].add('$')
    changed = True
    while changed:
        changed = False
        for name, prods in bnf.items():
            for prod in prods:
                for i, sym in enumerate(prod):
                    if sym not in bnf:
                        continue  # skip terminals
                    # FIRST of what comes after sym in this production
                    rest_first = first_of_seq(prod[i + 1:], first, bnf)
                    before = len(follow[sym])
                    follow[sym] |= rest_first - {EPSILON}
                    if EPSILON in rest_first:
                        # Everything after sym is nullable, so whatever can
                        # follow the rule itself can follow sym too.
                        follow[sym] |= follow[name]
                    if len(follow[sym]) > before:
                        changed = True
    return follow


# ═══════════════════════════════════════════════════════════════ 5. LL(1) check

def predict_set(prod: list, name: str, first: dict, follow: dict,
                bnf: dict) -> set:
    """
    PREDICT(A → prod) = (FIRST(prod) − {ε}) ∪ (FOLLOW(A) if ε ∈ FIRST(prod))
    """
    f = first_of_seq(prod, first, bnf)
    p = f - {EPSILON}
    if EPSILON in f:
        p |= follow[name]
    return p


def check_ll1(bnf: dict, first: dict, follow: dict) -> list:
    """
    For each non-terminal check that all PREDICT sets are pairwise disjoint.
    Returns a list of conflict dicts.
    """
    errors = []
    for name, prods in bnf.items():
        sets = [predict_set(p, name, first, follow, bnf) for p in prods]
        # Pairwise intersection of the PREDICT sets of this rule's productions.
        for i in range(len(sets)):
            for j in range(i + 1, len(sets)):
                conflict = sets[i] & sets[j]
                if conflict:
                    errors.append({
                        'rule': name,
                        'prod_i': prods[i],
                        'prod_j': prods[j],
                        'conflict': sorted(conflict),
                    })
    return errors


# ═══════════════════════════════════════════════════════════════ 6. Main

def _fmt_prod(prod: list) -> str:
    """Render a production for display; the empty production prints as ε."""
    return ' '.join(prod) if prod else EPSILON


def main():
    """CLI entry point: load the grammar, run the pipeline, report conflicts."""
    argv = sys.argv[1:]
    verbose = '-v' in argv
    # First non-flag argument is the grammar file; default to GRAMMAR.ebnf.
    positional = [a for a in argv if not a.startswith('-')]
    path = Path(positional[0]) if positional else Path('GRAMMAR.ebnf')

    # ── Load & parse ──────────────────────────────────────────────────────
    print(f'Checking {path} …')
    try:
        src = path.read_text(encoding='utf-8')
    except FileNotFoundError:
        sys.exit(f'error: file not found: {path}')
    toks = tokenise(src)
    try:
        ebnf = _Parser(toks).parse_grammar()
    except SyntaxError as exc:
        sys.exit(f'EBNF parse error: {exc}')
    bnf, origins = normalise(ebnf)
    first = compute_first(bnf)
    follow = compute_follow(bnf, first, START)
    errors = check_ll1(bnf, first, follow)

    # ── Summary line ──────────────────────────────────────────────────────
    # Helper rules generated by normalise() all start with '_'.
    named = sorted(n for n in bnf if not n.startswith('_'))
    helpers = sorted(n for n in bnf if n.startswith('_'))
    print(f' {len(named)} named rules, {len(helpers)} generated helper rules\n')

    # ── Optional verbose output ───────────────────────────────────────────
    if verbose:
        # col pads rule names so the '=' columns line up.
        col = max((len(n) for n in named), default=0) + 2
        print('── FIRST sets (named rules) ──────────────────────────────')
        for n in named:
            syms = sorted(first[n] - {EPSILON})
            nullable = ' [nullable]' if EPSILON in first[n] else ''
            print(f' FIRST({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}{nullable}')
        print()
        print('── FOLLOW sets (named rules) ─────────────────────────────')
        for n in named:
            syms = sorted(follow[n])
            print(f' FOLLOW({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}')
        print()

    # ── LL(1) result ──────────────────────────────────────────────────────
    named_err = [e for e in errors if not e['rule'].startswith('_')]
    helper_err = [e for e in errors if e['rule'].startswith('_')]
    if not errors:
        print('✓ Grammar is LL(1) — no conflicts detected.')
        return
    print(f'✗ {len(errors)} conflict(s): '
          f'{len(named_err)} in named rules, '
          f'{len(helper_err)} in generated helpers\n')
    for e in named_err:
        print(f' Rule [{e["rule"]}]')
        print(f' alt A : {_fmt_prod(e["prod_i"])}')
        print(f' alt B : {_fmt_prod(e["prod_j"])}')
        print(f' ambiguous token(s): {e["conflict"]}\n')
    if helper_err:
        print(' Conflicts in generated helpers '
              '(each is linked back to its enclosing named rule):')
        for e in helper_err:
            # origins maps a helper back to the named rule that spawned it.
            orig = origins.get(e['rule'], '?')
            print(f' [{e["rule"]}] ← from rule [{orig}]')
            print(f' alt A : {_fmt_prod(e["prod_i"])}')
            print(f' alt B : {_fmt_prod(e["prod_j"])}')
            print(f' ambiguous token(s): {e["conflict"]}\n')


if __name__ == '__main__':
    main()