Initial Flux language specification

Add the LL(1) context-free grammar (GRAMMAR.ebnf), token and syntax
reference (SYNTAX.md), LL(1) verification tool (ll1_check.py), and a
Fibonacci example demonstrating the language.
This commit is contained in:
2026-03-10 14:41:54 +01:00
commit 73e36fac71
4 changed files with 1607 additions and 0 deletions

362
ll1_check.py Normal file
View File

@@ -0,0 +1,362 @@
#!/usr/bin/env python3
"""
ll1_check.py — Parse GRAMMAR.ebnf and verify the LL(1) property.
Usage: python ll1_check.py [grammar_file] [-v]
Algorithm
---------
1. Strip (* … *) comments; tokenise.
2. Parse ISO/IEC 14977 EBNF into an AST.
3. Normalise to plain BNF by introducing fresh helper non-terminals:
{ body } → _repN where _repN = body , _repN | ε
[ body ] → _optN where _optN = body | ε
( body ) → inlined (cross-product inside the parent sequence)
4. Compute FIRST and FOLLOW sets (fixed-point iteration).
5. For each non-terminal compute PREDICT sets; flag pairwise conflicts.
"""
import re
import sys
from collections import defaultdict
from itertools import count as _count
from pathlib import Path
EPSILON = 'ε'  # sentinel "terminal" meaning the empty string (nullable marker)
START = 'program' # grammar start symbol — FOLLOW(START) is seeded with '$'
# ═══════════════════════════════════════════════════════════════ 1. Tokenise
# The four token shapes of the EBNF meta-language, tried in this order.
# NOTE(review): findall silently skips characters that match no alternative,
# so stray punctuation in a grammar file disappears rather than erroring.
_TOK = re.compile('|'.join((
    r'"(?:[^"\\]|\\.)*"',   # "quoted terminal string"
    r'[A-Z][A-Z0-9_]*',     # UPPERCASE token class (terminal)
    r'[a-z][a-z0-9_]*',     # lowercase identifier (non-terminal)
    r'[=;,|()\[\]{}]',      # single-char punctuation
)))


def tokenise(src: str) -> list:
    """Split EBNF source into a flat token list, discarding (* … *) comments."""
    stripped = re.sub(r'\(\*.*?\*\)', ' ', src, flags=re.DOTALL)
    return _TOK.findall(stripped)
# ═══════════════════════════════════════════════════════════════ 2. Parse EBNF → AST
#
# Each AST node is a tuple:
# ('lit', s) terminal — quoted string "…" or UPPERCASE token class
# ('nt', s) non-terminal reference
# ('seq', [...]) concatenation (A , B , C)
# ('alt', [...]) alternation (A | B | C)
# ('opt', node) optional [ … ]
# ('rep', node) repetition { … }
class _Parser:
    """Recursive-descent parser turning an EBNF token list into the AST
    described above ('lit'/'nt'/'seq'/'alt'/'opt'/'rep' tuples).

    All parse failures are reported as SyntaxError so that main() can
    catch them and print a friendly "EBNF parse error" message.
    """

    def __init__(self, tokens):
        self._t = tokens
        self._i = 0   # cursor into self._t

    def _peek(self):
        # Next token without consuming it; None at end of input.
        return self._t[self._i] if self._i < len(self._t) else None

    def _eat(self, expected=None):
        # Consume and return the next token, optionally verifying it.
        # Bug fix: previously this indexed past the end of the token list on
        # truncated input, raising IndexError — which main() does not catch.
        # Raise SyntaxError instead so truncation is reported as a parse error.
        if self._i >= len(self._t):
            suffix = f' (expected {expected!r})' if expected else ''
            raise SyntaxError(f'unexpected end of input{suffix}')
        v = self._t[self._i]; self._i += 1
        if expected and v != expected:
            raise SyntaxError(f'expected {expected!r}, got {v!r} '
                              f'(token #{self._i - 1})')
        return v

    def parse_grammar(self) -> dict:
        """Parse the whole stream as a series of `name = body ;` rules."""
        rules = {}
        while self._i < len(self._t):
            name = self._eat()
            self._eat('=')
            rules[name] = self._body()
            self._eat(';')
        return rules

    def _body(self):
        # body = seq { "|" seq }
        alts = [self._seq()]
        while self._peek() == '|':
            self._eat()
            alts.append(self._seq())
        return alts[0] if len(alts) == 1 else ('alt', alts)

    def _seq(self):
        # seq = atom { "," atom }
        items = [self._atom()]
        while self._peek() == ',':
            self._eat()
            items.append(self._atom())
        return items[0] if len(items) == 1 else ('seq', items)

    def _atom(self):
        # atom = "[" body "]" | "{" body "}" | "(" body ")" | terminal | nt
        t = self._peek()
        if t == '[':
            self._eat(); b = self._body(); self._eat(']')
            return ('opt', b)
        if t == '{':
            self._eat(); b = self._body(); self._eat('}')
            return ('rep', b)
        if t == '(':
            self._eat(); b = self._body(); self._eat(')')
            return b  # group — return inner node directly
        if t and (t[0] == '"' or t[0].isupper()):
            return ('lit', self._eat())
        if t and t[0].islower():
            return ('nt', self._eat())
        raise SyntaxError(f'unexpected token {t!r}')
# ═══════════════════════════════════════════════════════════════ 3. Normalise
def normalise(ebnf: dict) -> tuple:
    """
    Convert an EBNF AST to plain BNF.

    Returns
    -------
    bnf : dict[name → list[list[str]]]
        Each inner list is one production; [] = ε production.
    origins : dict[helper_name → parent_rule_name]
        Maps generated helper names back to the rule that created them.
    """
    bnf: dict = {}
    origins: dict = {}
    serial = _count()   # one shared counter → globally unique helper names

    def make_helper(tag: str, parent: str) -> str:
        # Allocate a fresh helper non-terminal and record its parent rule.
        name = f'_{tag}{next(serial)}'
        origins[name] = parent
        return name

    def expand(node, rule: str, in_seq: bool = False) -> list:
        """
        Return the list of alternative symbol sequences for this AST node.

        in_seq: when True, an 'alt' node is wrapped in a fresh non-terminal
        instead of being inlined.  Inlining A , (B | C) , D via cross-product
        would yield two productions both starting with A — a common-prefix
        false positive that would be misreported as an LL(1) conflict.  The
        grammar is already left-factored at the EBNF level; wrapping the
        group preserves that factoring.
        """
        kind, payload = node
        if kind in ('lit', 'nt'):
            # Both terminals and non-terminal references become a bare symbol.
            return [[payload]]
        if kind == 'seq':
            # Cross-product of the children's alternatives, left to right.
            # Children are expanded with in_seq=True (see docstring).
            seqs = [[]]
            for part in payload:
                branches = expand(part, rule, in_seq=True)
                seqs = [prefix + tail for prefix in seqs for tail in branches]
            return seqs
        if kind == 'alt':
            if in_seq:
                # Alt inside a seq → fresh _grpN non-terminal; each branch
                # is then expanded at top level (in_seq=False).
                helper = make_helper('grp', rule)
                bnf[helper] = [seq for branch in payload
                               for seq in expand(branch, rule)]
                return [[helper]]
            # Top-level alternation: contribute the branches directly.
            return [seq for branch in payload for seq in expand(branch, rule)]
        if kind == 'opt':
            # [ body ] → _optN = body | ε
            helper = make_helper('opt', rule)
            bnf[helper] = expand(payload, rule) + [[]]
            return [[helper]]
        if kind == 'rep':
            # { body } → _repN = body , _repN | ε   (right recursion)
            helper = make_helper('rep', rule)
            bnf[helper] = [seq + [helper] for seq in expand(payload, rule)]
            bnf[helper].append([])
            return [[helper]]
        raise ValueError(f'unknown AST tag {kind!r}')

    for rule_name, body in ebnf.items():
        bnf[rule_name] = expand(body, rule_name)
    return bnf, origins
# ═══════════════════════════════════════════════════════════════ 4. FIRST / FOLLOW
def first_of_seq(seq: list, first: dict, bnf: dict) -> set:
    """
    FIRST set of a sequence of grammar symbols.

    Returns a set of terminal strings; EPSILON is included only when every
    symbol in the sequence is nullable (the whole sequence derives ε).
    """
    acc = set()
    for symbol in seq:
        # Any symbol without productions is treated as a terminal: it
        # contributes itself, and a terminal can never derive ε, so stop.
        if symbol not in bnf:
            acc.add(symbol)
            return acc
        symbol_first = first[symbol]
        acc.update(symbol_first - {EPSILON})
        if EPSILON not in symbol_first:
            return acc          # non-nullable symbol — nothing beyond it matters
    acc.add(EPSILON)            # fell off the end: every symbol was nullable
    return acc


def compute_first(bnf: dict) -> dict:
    """Compute FIRST for every non-terminal by fixed-point iteration."""
    first = defaultdict(set)
    while True:
        grew = False
        for name, prods in bnf.items():
            for prod in prods:
                contribution = first_of_seq(prod, first, bnf)
                if not contribution <= first[name]:
                    first[name].update(contribution)
                    grew = True
        if not grew:
            return first


def compute_follow(bnf: dict, first: dict, start: str) -> dict:
    """Compute FOLLOW for every non-terminal; '$' marks end of input."""
    follow = defaultdict(set)
    follow[start].add('$')
    while True:
        grew = False
        for name, prods in bnf.items():
            for prod in prods:
                for i, sym in enumerate(prod):
                    if sym not in bnf:
                        continue          # terminals have no FOLLOW set
                    # FIRST of everything after sym in this production.
                    tail_first = first_of_seq(prod[i + 1:], first, bnf)
                    size_before = len(follow[sym])
                    follow[sym].update(tail_first - {EPSILON})
                    if EPSILON in tail_first:
                        # Nullable tail: FOLLOW(name) flows into FOLLOW(sym).
                        follow[sym].update(follow[name])
                    if len(follow[sym]) > size_before:
                        grew = True
        if not grew:
            return follow
# ═══════════════════════════════════════════════════════════════ 5. LL(1) check
def predict_set(prod: list, name: str, first: dict, follow: dict, bnf: dict) -> set:
    """
    PREDICT(A → prod) = (FIRST(prod) {ε}) (FOLLOW(A) if ε ∈ FIRST(prod))
    """
    seq_first = first_of_seq(prod, first, bnf)
    predict = seq_first - {EPSILON}
    if EPSILON in seq_first:
        # Nullable production: it can be chosen on anything that may follow A.
        predict = predict | follow[name]
    return predict
def check_ll1(bnf: dict, first: dict, follow: dict) -> list:
    """
    Verify that each non-terminal's PREDICT sets are pairwise disjoint.
    Returns a list of conflict dicts; an empty list means the grammar is LL(1).
    """
    conflicts = []
    for name, prods in bnf.items():
        predicts = [predict_set(p, name, first, follow, bnf) for p in prods]
        for i, pred_i in enumerate(predicts):
            for j in range(i + 1, len(predicts)):
                overlap = pred_i & predicts[j]
                if overlap:
                    conflicts.append({
                        'rule': name,
                        'prod_i': prods[i],
                        'prod_j': prods[j],
                        'conflict': sorted(overlap),
                    })
    return conflicts
# ═══════════════════════════════════════════════════════════════ 6. Main
def _fmt_prod(prod: list) -> str:
    """Render a production as space-joined symbols; ε for the empty production."""
    if not prod:
        return EPSILON
    return ' '.join(prod)
def main():
    """CLI entry point: load a grammar file, run the LL(1) check, print a report.

    Usage: python ll1_check.py [grammar_file] [-v]
    Exits via sys.exit() with a message on missing file or EBNF parse error.
    """
    argv = sys.argv[1:]
    verbose = '-v' in argv
    # Any argument not starting with '-' is treated as the grammar path;
    # only the first one is used, defaulting to GRAMMAR.ebnf.
    positional = [a for a in argv if not a.startswith('-')]
    path = Path(positional[0]) if positional else Path('GRAMMAR.ebnf')
    # ── Load & parse ──────────────────────────────────────────────────────
    print(f'Checking {path}')
    try:
        src = path.read_text(encoding='utf-8')
    except FileNotFoundError:
        sys.exit(f'error: file not found: {path}')
    toks = tokenise(src)
    try:
        ebnf = _Parser(toks).parse_grammar()
    except SyntaxError as exc:
        sys.exit(f'EBNF parse error: {exc}')
    bnf, origins = normalise(ebnf)
    first = compute_first(bnf)
    follow = compute_follow(bnf, first, START)
    errors = check_ll1(bnf, first, follow)
    # ── Summary line ──────────────────────────────────────────────────────
    # Helper rules generated by normalise() all start with '_'.
    named = sorted(n for n in bnf if not n.startswith('_'))
    helpers = sorted(n for n in bnf if n.startswith('_'))
    print(f' {len(named)} named rules, {len(helpers)} generated helper rules\n')
    # ── Optional verbose output ───────────────────────────────────────────
    if verbose:
        # Column width for aligned '=' signs; default=0 guards an empty grammar.
        col = max((len(n) for n in named), default=0) + 2
        print('── FIRST sets (named rules) ──────────────────────────────')
        for n in named:
            syms = sorted(first[n] - {EPSILON})
            nullable = ' [nullable]' if EPSILON in first[n] else ''
            print(f' FIRST({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}{nullable}')
        print()
        print('── FOLLOW sets (named rules) ─────────────────────────────')
        for n in named:
            syms = sorted(follow[n])
            print(f' FOLLOW({n}){"":<{col - len(n)}}= {{ {", ".join(syms)} }}')
        print()
    # ── LL(1) result ──────────────────────────────────────────────────────
    # Conflicts are reported separately for hand-written rules and for the
    # helpers generated during EBNF→BNF normalisation.
    named_err = [e for e in errors if not e['rule'].startswith('_')]
    helper_err = [e for e in errors if e['rule'].startswith('_')]
    if not errors:
        print('✓ Grammar is LL(1) — no conflicts detected.')
        return
    print(f'{len(errors)} conflict(s): '
          f'{len(named_err)} in named rules, '
          f'{len(helper_err)} in generated helpers\n')
    for e in named_err:
        print(f' Rule [{e["rule"]}]')
        print(f' alt A : {_fmt_prod(e["prod_i"])}')
        print(f' alt B : {_fmt_prod(e["prod_j"])}')
        print(f' ambiguous token(s): {e["conflict"]}\n')
    if helper_err:
        # Each helper conflict is traced back to the named rule that spawned
        # the helper, via the origins map returned by normalise().
        print(' Conflicts in generated helpers '
              '(each is linked back to its enclosing named rule):')
        for e in helper_err:
            orig = origins.get(e['rule'], '?')
            print(f' [{e["rule"]}] ← from rule [{orig}]')
            print(f' alt A : {_fmt_prod(e["prod_i"])}')
            print(f' alt B : {_fmt_prod(e["prod_j"])}')
            print(f' ambiguous token(s): {e["conflict"]}\n')


if __name__ == '__main__':
    main()