Add the LL(1) context-free grammar (GRAMMAR.ebnf), token and syntax reference (SYNTAX.md), LL(1) verification tool (ll1_check.py), and a fibonacci example demonstrating the language.
363 lines · 14 KiB · Python
#!/usr/bin/env python3
"""
ll1_check.py — Parse GRAMMAR.ebnf and verify the LL(1) property.

Usage: python ll1_check.py [grammar_file] [-v]

Algorithm
---------
1. Strip (* … *) comments; tokenise.
2. Parse ISO/IEC 14977 EBNF into an AST.
3. Normalise to plain BNF by introducing fresh helper non-terminals:
     { body }  →  _repN  where  _repN = body , _repN | ε
     [ body ]  →  _optN  where  _optN = body | ε
     ( body )  →  inlined, except that a grouped alternation appearing
                  inside a sequence becomes a fresh _grpN rule
                  (see normalise() for why)
4. Compute FIRST and FOLLOW sets (fixed-point iteration).
5. For each non-terminal compute PREDICT sets; flag pairwise conflicts.
"""

import re
import sys
from collections import defaultdict
from itertools import count as _count
from pathlib import Path

# Sentinel pseudo-terminal: marks "derives the empty string" in FIRST sets;
# the empty production itself is represented as [] in the normalised BNF.
EPSILON = 'ε'
START = 'program' # grammar start symbol
# ═══════════════════════════════════════════════════════════════ 1. Tokenise

# One alternative per lexical category. Quoted strings come first so that
# punctuation characters inside quotes are never matched on their own.
_TOK = re.compile('|'.join((
    r'"(?:[^"\\]|\\.)*"',   # "quoted terminal string"
    r'[A-Z][A-Z0-9_]*',     # UPPERCASE token class (terminal)
    r'[a-z][a-z0-9_]*',     # lowercase identifier (non-terminal)
    r'[=;,|()\[\]{}]',      # single-char punctuation
)))


def tokenise(src: str) -> list:
    """Split EBNF source into a flat token list, discarding (* … *) comments."""
    without_comments = re.sub(r'\(\*.*?\*\)', ' ', src, flags=re.DOTALL)
    return _TOK.findall(without_comments)
# ═══════════════════════════════════════════════════════════════ 2. Parse EBNF → AST
|
||
#
|
||
# Each AST node is a tuple:
|
||
# ('lit', s) terminal — quoted string "…" or UPPERCASE token class
|
||
# ('nt', s) non-terminal reference
|
||
# ('seq', [...]) concatenation (A , B , C)
|
||
# ('alt', [...]) alternation (A | B | C)
|
||
# ('opt', node) optional [ … ]
|
||
# ('rep', node) repetition { … }
|
||
|
||
class _Parser:
|
||
def __init__(self, tokens):
|
||
self._t = tokens
|
||
self._i = 0
|
||
|
||
def _peek(self):
|
||
return self._t[self._i] if self._i < len(self._t) else None
|
||
|
||
def _eat(self, expected=None):
|
||
v = self._t[self._i]; self._i += 1
|
||
if expected and v != expected:
|
||
raise SyntaxError(f'expected {expected!r}, got {v!r} '
|
||
f'(token #{self._i - 1})')
|
||
return v
|
||
|
||
def parse_grammar(self) -> dict:
|
||
rules = {}
|
||
while self._i < len(self._t):
|
||
name = self._eat()
|
||
self._eat('=')
|
||
rules[name] = self._body()
|
||
self._eat(';')
|
||
return rules
|
||
|
||
def _body(self):
|
||
alts = [self._seq()]
|
||
while self._peek() == '|':
|
||
self._eat()
|
||
alts.append(self._seq())
|
||
return alts[0] if len(alts) == 1 else ('alt', alts)
|
||
|
||
def _seq(self):
|
||
items = [self._atom()]
|
||
while self._peek() == ',':
|
||
self._eat()
|
||
items.append(self._atom())
|
||
return items[0] if len(items) == 1 else ('seq', items)
|
||
|
||
def _atom(self):
|
||
t = self._peek()
|
||
if t == '[':
|
||
self._eat(); b = self._body(); self._eat(']')
|
||
return ('opt', b)
|
||
if t == '{':
|
||
self._eat(); b = self._body(); self._eat('}')
|
||
return ('rep', b)
|
||
if t == '(':
|
||
self._eat(); b = self._body(); self._eat(')')
|
||
return b # group — return inner node directly
|
||
if t and (t[0] == '"' or t[0].isupper()):
|
||
return ('lit', self._eat())
|
||
if t and t[0].islower():
|
||
return ('nt', self._eat())
|
||
raise SyntaxError(f'unexpected token {t!r}')
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════ 3. Normalise

def normalise(ebnf: dict) -> tuple:
    """
    Convert EBNF AST to plain BNF.

    Returns
    -------
    bnf : dict[name → list[list[str]]]
        Each inner list is one production; [] = ε production.
    origins : dict[helper_name → parent_rule_name]
        Maps generated helper names back to the rule that created them.
    """
    bnf: dict = {}
    origins: dict = {}
    serial = _count()

    def fresh(tag: str, rule: str) -> str:
        # Mint a unique helper name and remember which rule spawned it.
        helper = f'_{tag}{next(serial)}'
        origins[helper] = rule
        return helper

    def expand(node, rule: str, in_seq: bool = False) -> list:
        """
        Return a list of alternative symbol sequences for this AST node.

        in_seq: when True, an 'alt' node is wrapped in a fresh non-terminal
                instead of being inlined. This prevents the cross-product
                expansion of A , (B | C) , D from producing two productions
                that both start with A — a common-prefix false positive that
                would be misreported as an LL(1) conflict. The grammar is
                already left-factored at the EBNF level; this preserves that.
        """
        tag = node[0]

        # Leaves: a single one-symbol sequence.
        if tag in ('lit', 'nt'):
            return [[node[1]]]

        if tag == 'seq':
            # Left-to-right cross-product. Children are expanded with
            # in_seq=True so embedded alternations become _grpN helpers.
            seqs = [[]]
            for child in node[1]:
                expanded = expand(child, rule, in_seq=True)
                seqs = [prefix + tail for prefix in seqs for tail in expanded]
            return seqs

        if tag == 'alt':
            if in_seq:
                # Alt inside a seq: wrap in a fresh non-terminal (_grpN),
                # each alternative expanded at top level (in_seq=False).
                helper = fresh('grp', rule)
                wrapped = []
                for child in node[1]:
                    wrapped.extend(expand(child, rule, in_seq=False))
                bnf[helper] = wrapped
                return [[helper]]
            # Alt at the top level of a rule body: inline the alternatives.
            inlined = []
            for child in node[1]:
                inlined.extend(expand(child, rule, in_seq=False))
            return inlined

        if tag == 'opt':
            # [ body ] → _optN = body | ε
            helper = fresh('opt', rule)
            bnf[helper] = expand(node[1], rule) + [[]]
            return [[helper]]

        if tag == 'rep':
            # { body } → _repN = body , _repN | ε  (right recursion)
            helper = fresh('rep', rule)
            bnf[helper] = [seq + [helper] for seq in expand(node[1], rule)] + [[]]
            return [[helper]]

        raise ValueError(f'unknown AST tag {tag!r}')

    for name, node in ebnf.items():
        bnf[name] = expand(node, name)

    return bnf, origins
# ═══════════════════════════════════════════════════════════════ 4. FIRST / FOLLOW

def first_of_seq(seq: list, first: dict, bnf: dict) -> set:
    """
    FIRST set of a sequence of grammar symbols.

    Returns a set of terminal strings; includes EPSILON if the whole
    sequence can derive the empty string.
    """
    out = set()
    for sym in seq:
        if sym not in bnf:
            # Anything without productions is a terminal; terminals never
            # derive ε, so the scan stops here.
            out.add(sym)
            return out
        out.update(first[sym] - {EPSILON})
        if EPSILON not in first[sym]:
            return out  # this symbol is not nullable — stop
    # Fell off the end: every symbol (or the empty sequence) was nullable.
    out.add(EPSILON)
    return out
def compute_first(bnf: dict) -> dict:
    """Fixed-point computation of FIRST for every non-terminal in *bnf*."""
    first = defaultdict(set)
    while True:
        grew = False
        for name, prods in bnf.items():
            for prod in prods:
                addition = first_of_seq(prod, first, bnf)
                # Union in anything new; iterate again until stable.
                if not addition <= first[name]:
                    first[name] |= addition
                    grew = True
        if not grew:
            return first
def compute_follow(bnf: dict, first: dict, start: str) -> dict:
    """
    Fixed-point computation of FOLLOW for every non-terminal.

    '$' (end of input) is seeded into FOLLOW(start).
    """
    follow = defaultdict(set)
    follow[start].add('$')
    grew = True
    while grew:
        grew = False
        for name, prods in bnf.items():
            for prod in prods:
                for i, sym in enumerate(prod):
                    if sym not in bnf:
                        continue  # terminals have no FOLLOW set
                    # FIRST of whatever follows sym in this production.
                    tail_first = first_of_seq(prod[i + 1:], first, bnf)
                    size = len(follow[sym])
                    follow[sym].update(tail_first - {EPSILON})
                    if EPSILON in tail_first:
                        # sym can end the production — inherit FOLLOW(name).
                        follow[sym].update(follow[name])
                    if len(follow[sym]) != size:
                        grew = True
    return follow
# ═══════════════════════════════════════════════════════════════ 5. LL(1) check

def predict_set(prod: list, name: str, first: dict, follow: dict, bnf: dict) -> set:
    """
    PREDICT(A → prod) = (FIRST(prod) − {ε}) ∪ (FOLLOW(A) if ε ∈ FIRST(prod))
    """
    fset = first_of_seq(prod, first, bnf)
    pred = fset - {EPSILON}
    if EPSILON in fset:
        # Nullable production: it is predicted by whatever may follow A.
        return pred | follow[name]
    return pred
def check_ll1(bnf: dict, first: dict, follow: dict) -> list:
    """
    For each non-terminal check that all PREDICT sets are pairwise disjoint.
    Returns a list of conflict dicts.
    """
    conflicts = []
    for name, prods in bnf.items():
        predicts = [predict_set(p, name, first, follow, bnf) for p in prods]
        for i, pred_i in enumerate(predicts):
            for j in range(i + 1, len(predicts)):
                overlap = pred_i & predicts[j]
                if not overlap:
                    continue
                conflicts.append({
                    'rule': name,
                    'prod_i': prods[i],
                    'prod_j': prods[j],
                    'conflict': sorted(overlap),
                })
    return conflicts
# ═══════════════════════════════════════════════════════════════ 6. Main
|
||
|
||
def _fmt_prod(prod: list) -> str:
|
||
return ' '.join(prod) if prod else EPSILON
|
||
|
||
|
||
def main():
    """CLI entry point: load the grammar, run the LL(1) check, report.

    Accepts an optional positional grammar path (default GRAMMAR.ebnf)
    and a -v flag for verbose FIRST/FOLLOW output. Exits non-zero via
    sys.exit() on a missing file or an EBNF parse error.
    """
    argv = sys.argv[1:]
    verbose = '-v' in argv
    # Anything not starting with '-' is treated as the grammar path.
    positional = [a for a in argv if not a.startswith('-')]
    path = Path(positional[0]) if positional else Path('GRAMMAR.ebnf')

    # ── Load & parse ──────────────────────────────────────────────────────
    print(f'Checking {path} …')
    try:
        src = path.read_text(encoding='utf-8')
    except FileNotFoundError:
        sys.exit(f'error: file not found: {path}')

    toks = tokenise(src)
    try:
        ebnf = _Parser(toks).parse_grammar()
    except SyntaxError as exc:
        sys.exit(f'EBNF parse error: {exc}')

    # Pipeline: EBNF AST → plain BNF → FIRST/FOLLOW → PREDICT conflicts.
    bnf, origins = normalise(ebnf)
    first = compute_first(bnf)
    follow = compute_follow(bnf, first, START)
    errors = check_ll1(bnf, first, follow)

    # ── Summary line ──────────────────────────────────────────────────────
    # Helper rules generated by normalise() all start with '_'.
    named = sorted(n for n in bnf if not n.startswith('_'))
    helpers = sorted(n for n in bnf if n.startswith('_'))
    print(f' {len(named)} named rules, {len(helpers)} generated helper rules\n')

    # ── Optional verbose output ───────────────────────────────────────────
    if verbose:
        # Column width aligns the '=' across rule names (default=0 guards
        # the empty-grammar case).
        col = max((len(n) for n in named), default=0) + 2
        print('── FIRST sets (named rules) ──────────────────────────────')
        for n in named:
            syms = sorted(first[n] - {EPSILON})
            nullable = ' [nullable]' if EPSILON in first[n] else ''
            print(f' FIRST({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}{nullable}')
        print()
        print('── FOLLOW sets (named rules) ─────────────────────────────')
        for n in named:
            syms = sorted(follow[n])
            print(f' FOLLOW({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}')
        print()

    # ── LL(1) result ──────────────────────────────────────────────────────
    named_err = [e for e in errors if not e['rule'].startswith('_')]
    helper_err = [e for e in errors if e['rule'].startswith('_')]

    if not errors:
        print('✓ Grammar is LL(1) — no conflicts detected.')
        return

    print(f'✗ {len(errors)} conflict(s): '
          f'{len(named_err)} in named rules, '
          f'{len(helper_err)} in generated helpers\n')

    for e in named_err:
        print(f' Rule [{e["rule"]}]')
        print(f' alt A : {_fmt_prod(e["prod_i"])}')
        print(f' alt B : {_fmt_prod(e["prod_j"])}')
        print(f' ambiguous token(s): {e["conflict"]}\n')

    if helper_err:
        print(' Conflicts in generated helpers '
              '(each is linked back to its enclosing named rule):')
        for e in helper_err:
            # origins maps _optN/_repN/_grpN back to its parent rule.
            orig = origins.get(e['rule'], '?')
            print(f' [{e["rule"]}] ← from rule [{orig}]')
            print(f' alt A : {_fmt_prod(e["prod_i"])}')
            print(f' alt B : {_fmt_prod(e["prod_j"])}')
            print(f' ambiguous token(s): {e["conflict"]}\n')
# Script entry point — run the checker only when executed directly.
if __name__ == '__main__':
    main()