Python API Reference¶

Auto-generated documentation from source code docstrings.

Service Layer¶

ModelStore¶

`orionbelt.service.model_store.ModelStore` ¶

In-memory model registry. Thread-safe via threading.Lock.

Models are keyed by a 16-char hex id: content-derived for shared models (see service/model_cache.py) or random for private ones. All parsing, validation, and compilation infrastructure is instantiated internally, following the same singleton pattern as api/deps.py.

Source code in src/orionbelt/service/model_store.py

class ModelStore:
    """In-memory model registry.  Thread-safe via ``threading.Lock``.

    Models are keyed by a 16-char hex id: content-derived for shared models
    (see ``service/model_cache.py``) or random for private ones. All parsing,
    validation, and compilation infrastructure is instantiated internally,
    following the same singleton pattern as ``api/deps.py``.
    """

    def __init__(self, max_models: int = 10, shared_cache: ModelCache | None = None) -> None:
        """Create an empty store.

        Args:
            max_models: Cap on the number of models this store may hold at once.
            shared_cache: Optional process-wide, content-addressed cache shared
                across sessions. With ``None`` (CLI, stateless helpers, bare
                unit tests) every model stays private to this store and gets a
                random id, exactly as before the cache existed.
        """
        self._lock = threading.Lock()
        self._max_models = max_models

        # Pipeline components are stateless, so one instance of each is shared
        # by every load/validate/compile call made through this store.
        self._loader = TrackedLoader()
        self._resolver = ReferenceResolver()
        self._validator = SemanticValidator()
        self._merger = ExtendsMerger()
        self._pipeline = CompilationPipeline()

        # --- shared-cache wiring ---------------------------------------------
        self._shared_cache = shared_cache
        # Ids held by this store that are backed by ``_shared_cache``;
        # remove_model/close use this to know which shared references to release.
        self._shared_ids: set[str] = set()

        # --- per-model state, all keyed by model_id ---------------------------
        self._models: dict[str, SemanticModel] = {}
        # The *merged* raw YAML dict of each loaded model, kept next to the
        # model so an inheriting child re-merges against exactly what the
        # parent was built from. Before this existed (v2.7.5) inheritance went
        # through ``_model_to_raw``, which drops most non-essential fields
        # (numClass, primaryKey, expression on computed columns, measure
        # dataType / filters / grain / delimiter / withinGroup, most metric
        # subtype config, …). Children therefore inherited a stripped parent
        # and silently compiled invalid SQL such as ``SUM("T"."")`` for any
        # parent computed column whose ``code:`` the resolver had derived from
        # its ``expression``.
        self._raws: dict[str, dict[str, object]] = {}
        self._graphs: dict[str, GraphArtifact] = {}
        # Cached counts (data_objects/dimensions/measures/metrics) so a dedup
        # hit can answer without walking the model again.
        self._summaries: dict[str, ModelSummary] = {}
        # content_hash → model_id. Written on every successful load and checked
        # before parsing on the next one. See design/PLAN_model_load_dedup.md.
        self._content_hash_index: dict[str, str] = {}

    # -- helpers -------------------------------------------------------------

    @staticmethod
    def _new_id() -> str:
        return uuid.uuid4().hex[:16]

    @staticmethod
    def _content_id(content_hash: str) -> str:
        """Stable content-derived id for a shared model.

        A 64-bit prefix of the content hash: identical OBML always yields the
        same id (so sessions loading matching bytes collapse to one shared
        compiled model and one result-cache key), while the full 64-hex
        ``content_hash`` remains the sharing key, so an id-label collision can
        never cause a wrong-model share.
        """
        return content_hash[:16]

    def _register_shared(self, entry: CompiledModel) -> None:
        """Register a shared cache entry into this store's local view.

        Caller must hold ``self._lock``. The store's read-through getters
        (get_model/get_raw/get_graph/describe/list_models/...) then serve the
        shared entry with no further changes.
        """
        self._models[entry.model_id] = entry.model
        self._raws[entry.model_id] = entry.raw
        self._graphs[entry.model_id] = entry.graph
        self._summaries[entry.model_id] = entry.summary
        self._content_hash_index[entry.content_hash] = entry.model_id
        self._shared_ids.add(entry.model_id)

    def _adopt_shared(self, entry: CompiledModel) -> None:
        """Take ownership of one reference to ``entry`` for this store.

        ``entry``'s refcount was already incremented by the caller (via
        ``acquire`` / ``insert_or_acquire``). This store holds exactly one
        reference per shared model_id it exposes, so a surplus reference (the
        store already references this content, e.g. concurrent identical loads)
        is released. Raises ``ModelCapacityError`` (releasing the reference
        first) when the store is full.
        """
        assert self._shared_cache is not None  # only called on shared paths
        with self._lock:
            if entry.model_id in self._shared_ids:
                surplus, over_cap = True, False
            else:
                surplus = False
                over_cap = len(self._models) >= self._max_models
            if not surplus and not over_cap:
                self._register_shared(entry)
        if surplus or over_cap:
            # Release outside the store lock — the cache has its own lock.
            self._shared_cache.release(entry.model_id)
        if over_cap:
            raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")

    @staticmethod
    def _health_for(model: SemanticModel) -> ModelHealthSummary:
        """Build the structural-health summary for a loaded model.

        Runs ``compute_health`` and copies its fields into a
        ``ModelHealthSummary``, converting every fan-trap risk into a
        ``FanTrapRiskInfo``.
        """
        report = compute_health(model)
        risks: list[FanTrapRiskInfo] = []
        for risk in report.fan_trap_risks:
            risks.append(
                FanTrapRiskInfo(
                    tables=risk.tables,
                    reason=risk.reason,
                    suggested_pattern=risk.suggested_pattern,
                )
            )
        return ModelHealthSummary(
            status=report.status,
            data_objects=report.data_objects,
            joins=report.joins,
            orphan_data_objects=report.orphan_data_objects,
            fan_trap_risks=risks,
            unreachable_dimensions=report.unreachable_dimensions,
            warnings_count=report.warnings_count,
        )

    @staticmethod
    def _content_hash(yaml_str: str) -> str:
        """SHA-256 of the OBML body, with surrounding whitespace stripped.

        Stripping at the boundary makes a trailing newline difference
        invisible to dedup; everything else (key order, comments, internal
        whitespace) still produces a different hash.
        """
        return hashlib.sha256(yaml_str.strip().encode("utf-8")).hexdigest()

    def _parse_and_validate(
        self,
        yaml_str: str | None = None,
        *,
        raw_dict: dict[str, object] | None = None,
        extends_yaml: list[str] | None = None,
        inherits_model_id: str | None = None,
    ) -> tuple[SemanticModel, dict[str, object], list[ErrorInfo], list[ErrorInfo]]:
        """Parse YAML (or accept pre-parsed dict), resolve references, validate.

        Returns ``(model, merged_raw, errors, warnings)``.
        Provide either ``yaml_str`` or ``raw_dict``, not both.

        ``merged_raw`` is the fully-merged raw dict the resolver consumed
        (after extends/inherits processing) — callers store it so future
        inherits-from-this-model loads can re-merge against the exact
        content rather than going through a lossy ``_model_to_raw``
        round-trip.

        When parsing, the parent lookup, or the merge fails, the method
        returns early with an empty ``SemanticModel()``, an empty raw dict and
        the error recorded in ``errors``; nothing is raised for those cases.

        Args:
            yaml_str: OBML document as text. Ignored when ``raw_dict`` is given.
            raw_dict: Already-parsed document; skips the YAML stage (and so
                carries no source map).
            extends_yaml: Extra YAML documents handed to the merger.
            inherits_model_id: Id of an already-loaded parent model in this
                store to inherit from.
        """
        errors: list[ErrorInfo] = []
        warnings: list[ErrorInfo] = []

        # 1. Parse YAML or use pre-parsed dict
        if raw_dict is not None:
            raw = raw_dict
            source_map = None
        elif yaml_str is not None:
            try:
                raw, source_map = self._loader.load_string(yaml_str)
            except YAMLSafetyError as exc:
                errors.append(ErrorInfo(code="YAML_SAFETY_ERROR", message=str(exc)))
                return SemanticModel(), {}, errors, warnings
            except Exception as exc:
                errors.append(ErrorInfo(code="YAML_PARSE_ERROR", message=str(exc)))
                return SemanticModel(), {}, errors, warnings
        else:
            errors.append(
                ErrorInfo(
                    code="NO_MODEL_INPUT",
                    message="Provide either model_yaml or model_json",
                )
            )
            return SemanticModel(), {}, errors, warnings

        # 1b. Locate the parent's raw dict when inheriting.
        inherits_raw: dict[str, object] | None = None
        if inherits_model_id is not None:
            # Prefer the parent's stored raw dict — captured at load
            # time so every field round-trips intact. Fall back to
            # the lossy ``_model_to_raw`` only when no raw is on
            # record (legacy / programmatically-constructed models).
            with self._lock:
                stored_parent_raw = self._raws.get(inherits_model_id)
            if stored_parent_raw is not None:
                # Hand the merger a private copy: the stored dict is the
                # store's (and possibly the shared cache's) own record, and
                # must not be mutated or aliased into the child's raw.
                inherits_raw = copy.deepcopy(stored_parent_raw)
            else:
                # Only the lookup itself may report "parent not found"; a
                # KeyError raised anywhere else must not be mislabelled.
                try:
                    parent_model = self.get_model(inherits_model_id)
                except KeyError:
                    errors.append(
                        ErrorInfo(
                            code="PARENT_MODEL_NOT_FOUND",
                            message=f"Parent model '{inherits_model_id}' not found in session",
                        )
                    )
                    return SemanticModel(), {}, errors, warnings
                inherits_raw = self._model_to_raw(parent_model)

        # 1c. Merge extends/inherits if provided
        if extends_yaml or inherits_raw is not None:
            try:
                raw, merge_warnings = self._merger.merge_from_strings(
                    raw,
                    extend_yamls=extends_yaml,
                    inherits_raw=inherits_raw,
                )
            except MergeError as exc:
                errors.append(ErrorInfo(code=exc.code, message=exc.message))
                return SemanticModel(), {}, errors, warnings
            for mw in merge_warnings:
                warnings.append(
                    ErrorInfo(
                        code=WarningCode.MERGE_WARNING,
                        message=mw,
                        severity="warning",
                    )
                )
            # The merged dict no longer corresponds to the parsed source text.
            source_map = None

        # 2. Resolve references
        model, resolution = self._resolver.resolve(raw, source_map)
        for e in resolution.errors:
            errors.append(
                ErrorInfo(
                    code=e.code,
                    message=e.message,
                    path=e.path,
                    suggestions=list(e.suggestions),
                    severity=e.severity,
                    hint=e.hint,
                    context=e.context,
                )
            )
        for w in resolution.warnings:
            warnings.append(
                ErrorInfo(
                    code=w.code,
                    message=w.message,
                    path=w.path,
                    suggestions=list(w.suggestions),
                    severity=w.severity or "warning",
                    hint=w.hint,
                    context=w.context,
                )
            )

        # 3. Semantic validation — the validator returns errors and warnings
        # in one list, told apart by ``severity``.
        sem_errors = self._validator.validate(model)
        for e in sem_errors:
            info = ErrorInfo(
                code=e.code,
                message=e.message,
                path=e.path,
                suggestions=list(e.suggestions),
                severity=e.severity,
                hint=e.hint,
                context=e.context,
            )
            if e.severity == "warning":
                warnings.append(info)
            else:
                errors.append(info)

        # 4. Cross-dataObject refresh contract consistency check.
        from orionbelt.cache.contracts import collect_table_contracts

        _, refresh_warnings = collect_table_contracts(model)
        for w in refresh_warnings:
            warnings.append(
                ErrorInfo(
                    code=w.code,
                    message=w.message,
                    path=w.path,
                    suggestions=list(w.suggestions),
                    severity=w.severity or "warning",
                    hint=w.hint,
                    context=w.context,
                )
            )

        return model, raw, errors, warnings

    @staticmethod
    def _model_to_raw(model: SemanticModel) -> dict[str, object]:
        """Rebuild a raw OBML dict from a ``SemanticModel`` for inherits merging.

        .. deprecated:: v2.7.5
            Lossy fallback only — most non-essential fields are dropped. New
            code stores and reuses the merged raw dict captured at load time
            (see ``ModelStore._raws``). The method is kept for the edge case
            of a parent model that was constructed programmatically and never
            went through ``load_model``.
        """
        out: dict[str, object] = {"version": model.version}
        if model.description:
            out["description"] = model.description
        if not model.expose_counts:
            out["exposeCounts"] = False
        if model.count_label_pattern != DEFAULT_COUNT_PATTERN:
            out["countLabelPattern"] = model.count_label_pattern

        if model.data_objects:
            data_objects: dict[str, object] = {}
            for obj_name, data_obj in model.data_objects.items():
                entry: dict[str, object] = {
                    "code": data_obj.code,
                    "database": data_obj.database,
                    "schema": data_obj.schema_name,
                }
                if data_obj.columns:
                    entry["columns"] = {
                        col_name: {
                            "code": column.code,
                            "abstractType": column.abstract_type.value,
                        }
                        for col_name, column in data_obj.columns.items()
                    }
                if data_obj.joins:
                    entry["joins"] = [
                        {
                            "joinType": join.join_type.value,
                            "joinTo": join.join_to,
                            "columnsFrom": list(join.columns_from),
                            "columnsTo": list(join.columns_to),
                            **(
                                {"secondary": True, "pathName": join.path_name}
                                if join.secondary
                                else {}
                            ),
                        }
                        for join in data_obj.joins
                    ]
                # Count-synthesis knobs are carried only when non-default, so
                # that inheritance cannot silently re-enable a suppressed count.
                if not data_obj.countable:
                    entry["countable"] = False
                if data_obj.count_label is not None:
                    entry["countLabel"] = data_obj.count_label
                policy = data_obj.refresh
                if policy is not None:
                    refresh_raw: dict[str, object] = {"mode": policy.mode}
                    optional_fields = (
                        ("interval", policy.interval),
                        ("anchor", policy.anchor),
                        ("timezone", policy.timezone),
                        ("maxStaleness", policy.max_staleness),
                    )
                    for key, value in optional_fields:
                        if value:
                            refresh_raw[key] = value
                    entry["refresh"] = refresh_raw
                data_objects[obj_name] = entry
            out["dataObjects"] = data_objects

        if model.dimensions:
            out["dimensions"] = {
                dim_name: {
                    "dataObject": dim.view,
                    "column": dim.column,
                    "resultType": dim.result_type.value,
                    **({"timeGrain": dim.time_grain.value} if dim.time_grain else {}),
                }
                for dim_name, dim in model.dimensions.items()
            }

        if model.measures:
            measures: dict[str, object] = {}
            for measure_name, measure in model.measures.items():
                spec: dict[str, object] = {
                    "aggregation": measure.aggregation,
                    "resultType": measure.result_type.value,
                }
                if measure.expression:
                    spec["expression"] = measure.expression
                if measure.columns:
                    column_refs = []
                    for ref in measure.columns:
                        column_refs.append(
                            {"dataObject": ref.view or "", "column": ref.column or ""}
                        )
                    spec["columns"] = column_refs
                if measure.total:
                    spec["total"] = True
                measures[measure_name] = spec
            out["measures"] = measures

        if model.metrics:
            out["metrics"] = {
                metric_name: {
                    "type": metric.type.value,
                    **({"expression": metric.expression} if metric.expression else {}),
                    **({"measure": metric.measure} if metric.measure else {}),
                    **(
                        {"timeDimension": metric.time_dimension}
                        if metric.time_dimension
                        else {}
                    ),
                }
                for metric_name, metric in model.metrics.items()
            }

        if model.filters:
            filters: list[dict[str, object]] = []
            for flt in model.filters:
                item: dict[str, object] = {
                    "dataObject": flt.data_object,
                    "column": flt.column,
                    "operator": flt.operator,
                }
                if flt.value is not None:
                    item["value"] = flt.value
                if flt.values:
                    item["values"] = flt.values
                filters.append(item)
            out["filters"] = filters

        return out

    # -- public API ----------------------------------------------------------

    def load_model(
        self,
        yaml_str: str | None = None,
        *,
        raw_dict: dict[str, object] | None = None,
        extends_yaml: list[str] | None = None,
        inherits_model_id: str | None = None,
        dedup: bool = True,
    ) -> LoadResult:
        """Parse, validate, and store a model.  Returns id + summary.

        Provide either ``yaml_str`` or ``raw_dict``.
        Raises ``ModelValidationError`` if the model has validation errors.
        Raises ``ModelCapacityError`` if the session's model cap is reached.

        When ``dedup`` is True (default) and the same OBML bytes have already
        been loaded into this store, the existing ``model_id`` is returned
        and ``model_load`` is set to ``"reused"``. Dedup only applies to
        plain ``yaml_str`` loads — when ``raw_dict``, ``extends_yaml``, or
        ``inherits_model_id`` is supplied the load always runs fresh, since
        the effective content depends on inputs not captured by the YAML
        bytes alone.

        The load runs through these stages, returning at the first that
        applies:

        1. Local dedup hit — the content hash is already indexed in this
           store: return the existing id with ``model_load="reused"`` and an
           empty warnings list.
        2. Shared-cache hit — a shared cache is configured and already holds
           this content: adopt that entry without parsing and return
           ``model_load="fresh"`` with an empty warnings list.
        3. Parse and validate, then export the OBSL graph and build the
           summary.
        4. Register the result — through the shared cache (content-derived id)
           for dedup-eligible loads when a cache is configured, otherwise
           privately in this store (random id).

        Args:
            yaml_str: OBML document as text.
            raw_dict: Already-parsed OBML document.
            extends_yaml: Extra YAML documents merged into the model.
            inherits_model_id: Id of a parent model loaded in this store.
            dedup: Set to False to force a distinct model even for identical
                YAML bytes.

        Returns:
            A ``LoadResult`` carrying the model id, the object counts, the
            warnings collected during a fresh parse, the ``model_load`` marker
            and the structural health summary.
        """
        # Dedup is meaningful only for a stand-alone YAML body. The other
        # input shapes either skip the YAML stage (raw_dict) or fold in
        # additional state (extends/inherits) that the bytes don't capture.
        dedup_eligible = (
            dedup
            and yaml_str is not None
            and raw_dict is None
            and not extends_yaml
            and inherits_model_id is None
        )
        content_hash: str | None = None
        if dedup_eligible:
            content_hash = self._content_hash(yaml_str or "")
            with self._lock:
                existing_id = self._content_hash_index.get(content_hash)
                if existing_id is not None and existing_id in self._models:
                    summary = self._summaries.get(existing_id)
                    if summary is not None:
                        existing_model = self._models[existing_id]
                        # Health is recomputed on every reuse (while the store
                        # lock is still held); only the counts come from the
                        # cached summary.
                        existing_health = self._health_for(existing_model)
                        return LoadResult(
                            model_id=existing_id,
                            data_objects=summary.data_objects,
                            dimensions=summary.dimensions,
                            measures=summary.measures,
                            metrics=summary.metrics,
                            warnings=[],
                            model_load="reused",
                            health=existing_health,
                        )
                # Stale index entry — drop it and fall through to a fresh load.
                if existing_id is not None:
                    self._content_hash_index.pop(content_hash, None)

        # Cross-session hit: another session already compiled these exact
        # bytes. Adopt the shared compiled model with no recompile. From this
        # store's perspective the load is still "fresh" (a new reference for
        # this session); the compile-skip is a transparent optimisation.
        if dedup_eligible and self._shared_cache is not None and content_hash is not None:
            shared = self._shared_cache.acquire(content_hash)
            if shared is not None:
                # May raise ModelCapacityError; _adopt_shared releases the
                # acquired reference before raising.
                self._adopt_shared(shared)
                return LoadResult(
                    model_id=shared.model_id,
                    data_objects=shared.summary.data_objects,
                    dimensions=shared.summary.dimensions,
                    measures=shared.summary.measures,
                    metrics=shared.summary.metrics,
                    warnings=[],
                    model_load="fresh",
                    health=self._health_for(shared.model),
                )

        # Early capacity check so a full store rejects the load before paying
        # for parsing and graph export. It is repeated at registration time.
        with self._lock:
            if len(self._models) >= self._max_models:
                raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")

        model, merged_raw, errors, warnings = self._parse_and_validate(
            yaml_str,
            raw_dict=raw_dict,
            extends_yaml=extends_yaml,
            inherits_model_id=inherits_model_id,
        )
        if errors:
            raise ModelValidationError(errors, warnings)

        shared_load = dedup_eligible and self._shared_cache is not None and content_hash is not None
        # Content-derived id for shared models so identical bytes collapse to
        # one id (and one result-cache key) across sessions; random id
        # otherwise, preserving the ``dedup=False`` "distinct model" contract.
        model_id = (
            self._content_id(content_hash)
            if content_hash is not None and shared_load
            else self._new_id()
        )

        # Eagerly export OBSL-Core graph (Option C: at model load time).
        graph = export_obsl(model, model_id)
        turtle = graph.serialize(format="turtle")
        artifact = GraphArtifact(graph=graph, turtle=turtle, generated_at=time.monotonic())

        summary = ModelSummary(
            model_id=model_id,
            data_objects=len(model.data_objects),
            dimensions=len(model.dimensions),
            measures=len(model.measures),
            metrics=len(model.metrics),
        )

        if shared_load:
            assert self._shared_cache is not None and content_hash is not None
            compiled = CompiledModel(
                model_id=model_id,
                content_hash=content_hash,
                model=model,
                raw=merged_raw,
                graph=artifact,
                summary=summary,
            )
            # A concurrent load of the same content may have won the race;
            # insert_or_acquire returns the surviving entry either way.
            entry = self._shared_cache.insert_or_acquire(compiled)
            # Capacity for the shared path is enforced inside _adopt_shared.
            self._adopt_shared(entry)
            # Report from ``entry`` (the surviving cache entry), but keep the
            # warnings produced by this call's own parse.
            return LoadResult(
                model_id=entry.model_id,
                data_objects=entry.summary.data_objects,
                dimensions=entry.summary.dimensions,
                measures=entry.summary.measures,
                metrics=entry.summary.metrics,
                warnings=warnings,
                model_load="fresh",
                health=self._health_for(entry.model),
            )

        with self._lock:
            # Re-check capacity under lock — the first check (above) ran
            # outside the lock while parsing/exporting, so a concurrent
            # request may have filled the slot in the meantime.
            if len(self._models) >= self._max_models:
                raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")
            self._models[model_id] = model
            self._raws[model_id] = merged_raw
            self._graphs[model_id] = artifact
            self._summaries[model_id] = summary
            if content_hash is not None:
                # If a concurrent request beat us to it, the last writer wins;
                # the race is benign (both models work, the older one is just
                # not reachable via the index). See PLAN_model_load_dedup.md §6.3.
                self._content_hash_index[content_hash] = model_id

        return LoadResult(
            model_id=model_id,
            data_objects=summary.data_objects,
            dimensions=summary.dimensions,
            measures=summary.measures,
            metrics=summary.metrics,
            warnings=warnings,
            model_load="fresh",
            health=self._health_for(model),
        )

    def get_model(self, model_id: str) -> SemanticModel:
        """Look up a loaded model.  Raises ``KeyError`` if not found."""
        with self._lock:
            try:
                return self._models[model_id]
            except KeyError:
                raise KeyError(f"No model loaded with id '{model_id}'") from None

    def get_raw(self, model_id: str) -> dict[str, object]:
        """Return the raw OBML dict for a loaded model.

        Prefers the merged raw dict captured verbatim at load time (so
        every field round-trips intact). Falls back to the lossy
        ``_model_to_raw`` reconstruction only for models that never passed
        through :meth:`load_model` (e.g. programmatically constructed).

        Raises ``KeyError`` if no model is loaded with the given id.

        Returns a deep copy so callers (and downstream converters) cannot
        mutate the store's internal ``_raws`` entry, which would corrupt
        later exports or ``inherits`` merges.
        """
        with self._lock:
            if model_id not in self._models:
                raise KeyError(f"No model loaded with id '{model_id}'")
            raw = self._raws.get(model_id)
            model = self._models[model_id]
        if raw is None:
            return self._model_to_raw(model)
        return copy.deepcopy(raw)

    def describe(self, model_id: str) -> ModelDescription:
        """Build a structured description of a model, intended for LLM consumption.

        Lists every data object, dimension, measure and metric of the model
        as flat info records. Raises ``KeyError`` (from ``get_model``) when the
        id is unknown.
        """
        model = self.get_model(model_id)

        object_infos: list[DataObjectInfo] = []
        for data_obj in model.data_objects.values():
            object_infos.append(
                DataObjectInfo(
                    label=data_obj.label,
                    code=data_obj.qualified_code,
                    columns=list(data_obj.columns.keys()),
                    join_targets=[join.join_to for join in data_obj.joins],
                    synonyms=data_obj.synonyms,
                    owner=data_obj.owner,
                )
            )

        dimension_infos: list[DimensionInfo] = []
        for dimension in model.dimensions.values():
            grain = dimension.time_grain.value if dimension.time_grain else None
            dimension_infos.append(
                DimensionInfo(
                    name=dimension.label,
                    result_type=dimension.result_type.value,
                    data_object=dimension.view,
                    column=dimension.column,
                    time_grain=grain,
                    synonyms=dimension.synonyms,
                    owner=dimension.owner,
                )
            )

        measure_infos: list[MeasureInfo] = []
        for measure in model.measures.values():
            measure_infos.append(
                MeasureInfo(
                    name=measure.label,
                    result_type=measure.result_type.value,
                    aggregation=measure.aggregation,
                    expression=measure.expression,
                    synonyms=measure.synonyms,
                    owner=measure.owner,
                )
            )

        metric_infos: list[MetricInfo] = []
        for metric in model.metrics.values():
            metric_infos.append(
                MetricInfo(
                    name=metric.label,
                    expression=metric.expression,
                    synonyms=metric.synonyms,
                    type=metric.type.value,
                    measure=metric.measure,
                    time_dimension=metric.time_dimension,
                    owner=metric.owner,
                )
            )

        return ModelDescription(
            model_id=model_id,
            data_objects=object_infos,
            dimensions=dimension_infos,
            measures=measure_infos,
            metrics=metric_infos,
        )

    def list_models(self) -> list[ModelSummary]:
        """Return a short summary for every loaded model."""
        with self._lock:
            return list(self._summaries.values())

    def remove_model(self, model_id: str) -> None:
        """Unload a model and its cached OBSL graph.  Raises ``KeyError`` if not found.

        Also removes the model's entry from the dedup index so the next load
        of the same OBML content runs fresh. PLAN_model_load_dedup.md §6.2.
        """
        with self._lock:
            try:
                del self._models[model_id]
            except KeyError:
                raise KeyError(f"No model loaded with id '{model_id}'") from None
            self._raws.pop(model_id, None)
            self._graphs.pop(model_id, None)
            self._summaries.pop(model_id, None)
            stale_hashes = [h for h, mid in self._content_hash_index.items() if mid == model_id]
            for h in stale_hashes:
                del self._content_hash_index[h]
            was_shared = model_id in self._shared_ids
            self._shared_ids.discard(model_id)
        # Release the shared reference outside the store lock. The shared model
        # stays compiled while any other session still references it.
        if was_shared and self._shared_cache is not None:
            self._shared_cache.release(model_id)

    def close(self) -> None:
        """Drop this store, releasing every shared reference it holds.

        Called by :class:`SessionManager` when a session is purged/expired/
        closed, so shared models are refcount-released and can be evicted once
        no live session references them. Private (non-shared) models are simply
        discarded with the store.
        """
        with self._lock:
            shared_ids = list(self._shared_ids)
            self._shared_ids.clear()
            self._models.clear()
            self._raws.clear()
            self._graphs.clear()
            self._summaries.clear()
            self._content_hash_index.clear()
        if self._shared_cache is not None:
            for mid in shared_ids:
                self._shared_cache.release(mid)

    def compile_query(
        self,
        model_id: str,
        query: QueryObject,
        dialect: str,
    ) -> CompilationResult:
        """Compile a query against a loaded model."""
        model = self.get_model(model_id)
        return self._pipeline.compile(query, model, dialect)

    def refresh_contracts(self, model_id: str) -> dict[str, RefreshContract]:
        """Return the per-physical-table freshness contracts of a model.

        The result cache uses these to derive an effective TTL for a query
        from the dataObjects it touched. Warnings produced while collecting
        the contracts are discarded here.

        Raises ``KeyError`` (from ``get_model``) when the model is not loaded.
        """
        from orionbelt.cache.contracts import collect_table_contracts

        table_contracts, _ignored_warnings = collect_table_contracts(self.get_model(model_id))
        return table_contracts

    def validate(
        self,
        yaml_str: str | None = None,
        *,
        raw_dict: dict[str, object] | None = None,
        extends_yaml: list[str] | None = None,
        inherits_model_id: str | None = None,
    ) -> ValidationSummary:
        """Validate a model without storing it.  Accepts YAML string or raw dict.

        Reports JSON Schema violations alongside semantic errors so that the
        ``/validate`` endpoints match what the schema-guarded load/query
        endpoints enforce — a model that fails the schema is reported invalid
        here rather than being silently coerced.

        Args:
            yaml_str: OBML document as text.
            raw_dict: Already-parsed OBML document.
            extends_yaml: Extra YAML documents merged into the model.
            inherits_model_id: Id of a parent model loaded in this store.

        Returns:
            A ``ValidationSummary`` whose ``valid`` flag is True only when no
            errors (schema or semantic) were found; schema errors are listed
            before the others.
        """
        _model, raw, errors, warnings = self._parse_and_validate(
            yaml_str,
            raw_dict=raw_dict,
            extends_yaml=extends_yaml,
            inherits_model_id=inherits_model_id,
        )
        # Schema-validate the already-safely-parsed dict (``_parse_and_validate``
        # loads via the safety-checked TrackedLoader). Skip whenever
        # ``_parse_and_validate`` bailed out before producing a document — it
        # then returns an empty dict, and schema-checking that placeholder
        # would report violations the caller's input never had. The real
        # cause is already in ``errors``.
        # NOTE(review): a MergeError also aborts with an empty raw dict, but
        # its codes are not enumerable from here — confirm and add them.
        no_document_codes = {
            "YAML_SAFETY_ERROR",
            "YAML_PARSE_ERROR",
            "NO_MODEL_INPUT",
            "PARENT_MODEL_NOT_FOUND",
        }
        if not any(e.code in no_document_codes for e in errors):
            errors = self._schema_errors(raw) + errors
        return ValidationSummary(
            valid=len(errors) == 0,
            errors=errors,
            warnings=warnings,
        )

    @staticmethod
    def _schema_errors(raw: dict[str, object]) -> list[ErrorInfo]:
        """Run JSON Schema validation on a parsed document and return ``ErrorInfo`` items.

        Top-level keys whose name starts with an underscore are left out of the
        document handed to the schema validator.
        """
        visible: dict[object, object] = {}
        for key, value in raw.items():
            if str(key).startswith("_"):
                continue
            visible[key] = value

        found: list[ErrorInfo] = []
        for issue in validate_obml_document(visible):
            found.append(
                ErrorInfo(
                    code=issue.code,
                    message=issue.message,
                    path=issue.path,
                    severity=issue.severity,
                )
            )
        return found

    # -- OBSL graph ---------------------------------------------------------

    def get_graph(self, model_id: str) -> GraphArtifact:
        """Return the cached OBSL graph for a model.  Raises ``KeyError`` if not found."""
        with self._lock:
            try:
                return self._graphs[model_id]
            except KeyError:
                raise KeyError(f"No graph for model '{model_id}'") from None

    def query_graph(self, model_id: str, sparql: str) -> SPARQLResult:
        """Run a read-only SPARQL query over the OBSL graph of ``model_id``.

        Raises ``KeyError`` (from ``get_graph``) when the model has no graph.
        """
        obsl_graph = self.get_graph(model_id).graph
        return execute_sparql(obsl_graph, sparql)

`load_model(yaml_str=None, *, raw_dict=None, extends_yaml=None, inherits_model_id=None, dedup=True)` ¶

Parse, validate, and store a model. Returns id + summary.

Provide either yaml_str or raw_dict. Raises ModelValidationError if the model has validation errors. Raises ModelCapacityError if the session's model cap is reached.

When dedup is True (default) and the same OBML bytes have already been loaded into this store, the existing model_id is returned and model_load is set to "reused". Dedup only applies to plain yaml_str loads — when raw_dict, extends_yaml, or inherits_model_id is supplied the load always runs fresh, since the effective content depends on inputs not captured by the YAML bytes alone.

Source code in src/orionbelt/service/model_store.py

def load_model(
    self,
    yaml_str: str | None = None,
    *,
    raw_dict: dict[str, object] | None = None,
    extends_yaml: list[str] | None = None,
    inherits_model_id: str | None = None,
    dedup: bool = True,
) -> LoadResult:
    """Parse, validate, and store a model.  Returns id + summary.

    Provide either ``yaml_str`` or ``raw_dict``.
    Raises ``ModelValidationError`` if the model has validation errors.
    Raises ``ModelCapacityError`` if the session's model cap is reached.

    When ``dedup`` is True (default) and the same OBML bytes have already
    been loaded into this store, the existing ``model_id`` is returned
    and ``model_load`` is set to ``"reused"``. Dedup only applies to
    plain ``yaml_str`` loads — when ``raw_dict``, ``extends_yaml``, or
    ``inherits_model_id`` is supplied the load always runs fresh, since
    the effective content depends on inputs not captured by the YAML
    bytes alone.

    The call resolves through one of three paths, tried in this order:

    1. In-store reuse: the content hash is already indexed in this store
       and the model is still loaded (``model_load="reused"``).
    2. Shared-cache hit: the shared cache already holds an entry for the
       content hash; it is adopted without parsing (``model_load="fresh"``).
    3. Full load: parse/validate, export the OBSL graph, then either
       publish the result through the shared cache or store it privately
       in this store (``model_load="fresh"``).

    Paths 1 and 2 return an empty ``warnings`` list; only path 3 runs
    ``_parse_and_validate`` and reports its warnings.
    """
    # Dedup is meaningful only for a stand-alone YAML body. The other
    # input shapes either skip the YAML stage (raw_dict) or fold in
    # additional state (extends/inherits) that the bytes don't capture.
    dedup_eligible = (
        dedup
        and yaml_str is not None
        and raw_dict is None
        and not extends_yaml
        and inherits_model_id is None
    )
    content_hash: str | None = None
    if dedup_eligible:
        content_hash = self._content_hash(yaml_str or "")
        with self._lock:
            existing_id = self._content_hash_index.get(content_hash)
            if existing_id is not None and existing_id in self._models:
                summary = self._summaries.get(existing_id)
                if summary is not None:
                    existing_model = self._models[existing_id]
                    existing_health = self._health_for(existing_model)
                    return LoadResult(
                        model_id=existing_id,
                        data_objects=summary.data_objects,
                        dimensions=summary.dimensions,
                        measures=summary.measures,
                        metrics=summary.metrics,
                        warnings=[],
                        model_load="reused",
                        health=existing_health,
                    )
            # Stale index entry — drop it and fall through to a fresh load.
            if existing_id is not None:
                self._content_hash_index.pop(content_hash, None)

    # Cross-session hit: another session already compiled these exact
    # bytes. Adopt the shared compiled model with no recompile. From this
    # store's perspective the load is still "fresh" (a new reference for
    # this session); the compile-skip is a transparent optimisation.
    # NOTE(review): this path returns before the capacity check below;
    # confirm whether ``_adopt_shared`` enforces ``max_models`` itself.
    if dedup_eligible and self._shared_cache is not None and content_hash is not None:
        shared = self._shared_cache.acquire(content_hash)
        if shared is not None:
            self._adopt_shared(shared)
            return LoadResult(
                model_id=shared.model_id,
                data_objects=shared.summary.data_objects,
                dimensions=shared.summary.dimensions,
                measures=shared.summary.measures,
                metrics=shared.summary.metrics,
                warnings=[],
                model_load="fresh",
                health=self._health_for(shared.model),
            )

    # Early capacity check so a store that is already full is rejected
    # before the parse/validate/export work below is started.
    with self._lock:
        if len(self._models) >= self._max_models:
            raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")

    model, merged_raw, errors, warnings = self._parse_and_validate(
        yaml_str,
        raw_dict=raw_dict,
        extends_yaml=extends_yaml,
        inherits_model_id=inherits_model_id,
    )
    if errors:
        raise ModelValidationError(errors, warnings)

    # Same condition as the cross-session lookup above; when it holds, the
    # freshly built model is published through the shared cache.
    shared_load = dedup_eligible and self._shared_cache is not None and content_hash is not None
    # Content-derived id for shared models so identical bytes collapse to
    # one id (and one result-cache key) across sessions; random id
    # otherwise, preserving the ``dedup=False`` "distinct model" contract.
    model_id = (
        self._content_id(content_hash)
        if content_hash is not None and shared_load
        else self._new_id()
    )

    # Eagerly export OBSL-Core graph (Option C: at model load time).
    graph = export_obsl(model, model_id)
    turtle = graph.serialize(format="turtle")
    artifact = GraphArtifact(graph=graph, turtle=turtle, generated_at=time.monotonic())

    summary = ModelSummary(
        model_id=model_id,
        data_objects=len(model.data_objects),
        dimensions=len(model.dimensions),
        measures=len(model.measures),
        metrics=len(model.metrics),
    )

    if shared_load:
        # Both facts are already implied by ``shared_load``; presumably the
        # assert is here to narrow the Optional types for the type checker.
        assert self._shared_cache is not None and content_hash is not None
        compiled = CompiledModel(
            model_id=model_id,
            content_hash=content_hash,
            model=model,
            raw=merged_raw,
            graph=artifact,
            summary=summary,
        )
        # A concurrent load of the same content may have won the race;
        # insert_or_acquire returns the surviving entry either way.
        entry = self._shared_cache.insert_or_acquire(compiled)
        self._adopt_shared(entry)
        # Report from ``entry`` rather than the locals built above, since
        # the surviving entry may be the one another load inserted.
        return LoadResult(
            model_id=entry.model_id,
            data_objects=entry.summary.data_objects,
            dimensions=entry.summary.dimensions,
            measures=entry.summary.measures,
            metrics=entry.summary.metrics,
            warnings=warnings,
            model_load="fresh",
            health=self._health_for(entry.model),
        )

    # Private (non-shared) load: register the model in this store only.
    with self._lock:
        # Re-check capacity under lock — the first check (above) ran
        # outside the lock while parsing/exporting, so a concurrent
        # request may have filled the slot in the meantime.
        if len(self._models) >= self._max_models:
            raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")
        self._models[model_id] = model
        self._raws[model_id] = merged_raw
        self._graphs[model_id] = artifact
        self._summaries[model_id] = summary
        if content_hash is not None:
            # If a concurrent request beat us to it, the last writer wins;
            # the race is benign (both models work, the older one is just
            # not reachable via the index). See PLAN_model_load_dedup.md §6.3.
            self._content_hash_index[content_hash] = model_id

    return LoadResult(
        model_id=model_id,
        data_objects=summary.data_objects,
        dimensions=summary.dimensions,
        measures=summary.measures,
        metrics=summary.metrics,
        warnings=warnings,
        model_load="fresh",
        health=self._health_for(model),
    )

`get_model(model_id)` ¶

Look up a loaded model. Raises KeyError if not found.

Source code in src/orionbelt/service/model_store.py

def get_model(self, model_id: str) -> SemanticModel:
    """Return the model stored under ``model_id``.

    The lookup runs while holding the store lock.  Raises ``KeyError``
    when no model is loaded under that id.
    """
    with self._lock:
        try:
            found = self._models[model_id]
        except KeyError:
            message = f"No model loaded with id '{model_id}'"
            raise KeyError(message) from None
        return found

`describe(model_id)` ¶

Return a structured summary suitable for LLM consumption.

Source code in src/orionbelt/service/model_store.py

def describe(self, model_id: str) -> ModelDescription:
    """Summarise a loaded model in a structured form suitable for LLM consumption.

    Builds one info record per data object, dimension, measure and metric
    of the model and bundles them into a ``ModelDescription``.  The model
    is resolved through :meth:`get_model`, so an unknown ``model_id``
    surfaces as the ``KeyError`` raised there.
    """
    loaded = self.get_model(model_id)

    object_infos = []
    for data_object in loaded.data_objects.values():
        object_infos.append(
            DataObjectInfo(
                label=data_object.label,
                code=data_object.qualified_code,
                columns=list(data_object.columns.keys()),
                join_targets=[join.join_to for join in data_object.joins],
                synonyms=data_object.synonyms,
                owner=data_object.owner,
            )
        )

    dimension_infos = []
    for dimension in loaded.dimensions.values():
        dimension_infos.append(
            DimensionInfo(
                name=dimension.label,
                result_type=dimension.result_type.value,
                data_object=dimension.view,
                column=dimension.column,
                # A dimension without a time grain is reported with None.
                time_grain=dimension.time_grain.value if dimension.time_grain else None,
                synonyms=dimension.synonyms,
                owner=dimension.owner,
            )
        )

    measure_infos = []
    for measure in loaded.measures.values():
        measure_infos.append(
            MeasureInfo(
                name=measure.label,
                result_type=measure.result_type.value,
                aggregation=measure.aggregation,
                expression=measure.expression,
                synonyms=measure.synonyms,
                owner=measure.owner,
            )
        )

    metric_infos = []
    for metric in loaded.metrics.values():
        metric_infos.append(
            MetricInfo(
                name=metric.label,
                expression=metric.expression,
                synonyms=metric.synonyms,
                type=metric.type.value,
                measure=metric.measure,
                time_dimension=metric.time_dimension,
                owner=metric.owner,
            )
        )

    return ModelDescription(
        model_id=model_id,
        data_objects=object_infos,
        dimensions=dimension_infos,
        measures=measure_infos,
        metrics=metric_infos,
    )

`list_models()` ¶

Return a short summary for every loaded model.

Source code in src/orionbelt/service/model_store.py

def list_models(self) -> list[ModelSummary]:
    """Return the stored summaries of all loaded models as a new list.

    The list is built while holding the store lock, so callers get a
    snapshot that later loads or removals do not change.
    """
    with self._lock:
        snapshot = [*self._summaries.values()]
    return snapshot

`remove_model(model_id)` ¶

Unload a model and its cached OBSL graph. Raises KeyError if not found.

Also removes the model's entry from the dedup index so the next load of the same OBML content runs fresh. PLAN_model_load_dedup.md §6.2.

Source code in src/orionbelt/service/model_store.py

def remove_model(self, model_id: str) -> None:
    """Unload a model together with its raw dict, OBSL graph and summary.

    Raises ``KeyError`` if no model is loaded under ``model_id``.

    Every dedup-index entry that points at the model is dropped as well,
    so the next load of the same OBML content runs fresh
    (PLAN_model_load_dedup.md §6.2).  If the model was backed by the
    shared cache, this store's reference to it is released.
    """
    with self._lock:
        try:
            self._models.pop(model_id)
        except KeyError:
            raise KeyError(f"No model loaded with id '{model_id}'") from None

        # Per-model side tables: entries may be absent, so pop with a default.
        for side_table in (self._raws, self._graphs, self._summaries):
            side_table.pop(model_id, None)

        # Iterate over a copy of the keys because entries are deleted in place.
        index = self._content_hash_index
        for digest in tuple(index):
            if index[digest] == model_id:
                del index[digest]

        release_needed = model_id in self._shared_ids
        self._shared_ids.discard(model_id)

    # The shared reference is released only after the store lock is dropped.
    # The shared model stays compiled while any other session references it.
    if not release_needed or self._shared_cache is None:
        return
    self._shared_cache.release(model_id)

`compile_query(model_id, query, dialect)` ¶

Compile a query against a loaded model.

Source code in src/orionbelt/service/model_store.py

def compile_query(
    self,
    model_id: str,
    query: QueryObject,
    dialect: str,
) -> CompilationResult:
    """Compile ``query`` for ``dialect`` against the model loaded as ``model_id``.

    The model is resolved through :meth:`get_model` and handed to the
    store's compilation pipeline together with the query and dialect.
    """
    return self._pipeline.compile(query, self.get_model(model_id), dialect)

`validate(yaml_str=None, *, raw_dict=None, extends_yaml=None, inherits_model_id=None)` ¶

Validate a model without storing it. Accepts YAML string or raw dict.

Reports JSON Schema violations alongside semantic errors so that the /validate endpoints match what the schema-guarded load/query endpoints enforce — a model that fails the schema is reported invalid here rather than being silently coerced.

Source code in src/orionbelt/service/model_store.py

def validate(
    self,
    yaml_str: str | None = None,
    *,
    raw_dict: dict[str, object] | None = None,
    extends_yaml: list[str] | None = None,
    inherits_model_id: str | None = None,
) -> ValidationSummary:
    """Check a model (YAML string or raw dict) without storing it.

    JSON Schema violations are reported in front of the semantic errors,
    so the ``/validate`` endpoints agree with what the schema-guarded
    load/query endpoints enforce: a model that fails the schema is
    reported invalid here instead of being silently coerced.
    """
    _unused_model, parsed_doc, findings, notes = self._parse_and_validate(
        yaml_str,
        raw_dict=raw_dict,
        extends_yaml=extends_yaml,
        inherits_model_id=inherits_model_id,
    )

    # ``_parse_and_validate`` loads through the safety-checked TrackedLoader.
    # If the YAML never parsed safely, the fatal parse/safety error is
    # already in ``findings`` and there is no document to schema-validate.
    unparseable_codes = {"YAML_SAFETY_ERROR", "YAML_PARSE_ERROR"}
    parse_failed = False
    for finding in findings:
        if finding.code in unparseable_codes:
            parse_failed = True
            break

    if not parse_failed:
        findings = self._schema_errors(parsed_doc) + findings

    return ValidationSummary(valid=not findings, errors=findings, warnings=notes)

SessionManager¶

`orionbelt.service.session_manager.SessionManager` ¶

Manages TTL-scoped sessions, each holding its own ModelStore.

Thread-safe. Call `start()` to begin the background cleanup thread and `stop()` to shut it down.

Parameters¶

- `ttl_seconds`: Sliding idle timeout — sessions expire after this many seconds of inactivity.
- `max_age_seconds`: Absolute maximum session lifetime regardless of activity.
- `max_sessions`: Global cap on concurrent sessions. `create_session` raises `SessionCapacityError` when at capacity.
- `max_models_per_session`: Maximum models a single session may hold. Passed through to each `ModelStore` instance.
- `cleanup_interval`: Seconds between background purge sweeps.
- `is_single_model_mode`: Flag retained for backwards compatibility — historically set when a `MODEL_FILE` preloaded the `__default__` session. With `MODEL_FILES` (admin-curated named sessions), the flag is True and the `__default__` session — still created on demand by MCP stdio — is kept alive and excluded from purge. False otherwise, in which case `__default__` is treated like any other session and subject to TTL/max-age expiry.

Source code in src/orionbelt/service/session_manager.py

class SessionManager:
    """Manages TTL-scoped sessions, each holding its own ``ModelStore``.

    Thread-safe.  Call :meth:`start` to begin the background cleanup thread
    and :meth:`stop` to shut it down.

    Parameters
    ----------
    ttl_seconds:
        Sliding idle timeout — sessions expire after this many seconds of
        inactivity.
    max_age_seconds:
        Absolute maximum session lifetime regardless of activity.
    max_sessions:
        Global cap on concurrent sessions.  ``create_session`` raises
        :class:`SessionCapacityError` when at capacity.
    max_models_per_session:
        Maximum models a single session may hold.  Passed through to each
        ``ModelStore`` instance.
    cleanup_interval:
        Seconds between background purge sweeps.
    is_single_model_mode:
        Flag retained for backwards compatibility — historically set when
        a ``MODEL_FILE`` preloaded the ``__default__`` session. With
        ``MODEL_FILES`` (admin-curated named sessions), the flag is True
        and the ``__default__`` session — still created on demand by MCP
        stdio — is kept alive and excluded from purge. False otherwise,
        in which case ``__default__`` is treated like any other session
        and subject to TTL/max-age expiry.
    """

    def __init__(
        self,
        ttl_seconds: int = 1800,
        max_age_seconds: int = 86400,
        max_sessions: int = 500,
        max_models_per_session: int = 10,
        cleanup_interval: int = 60,
        is_single_model_mode: bool = False,
    ) -> None:
        """Store the configuration and start with an empty session table.

        No thread is created here; the cleanup thread only exists after
        :meth:`start` has been called.
        """
        self._ttl = ttl_seconds
        self._max_age = max_age_seconds
        self._max_sessions = max_sessions
        self._max_models = max_models_per_session
        self._cleanup_interval = cleanup_interval
        self._is_single_model_mode = is_single_model_mode
        # Guards ``_sessions`` and the per-session access timestamps.
        self._lock = threading.Lock()
        # One process-wide content-addressed model cache shared by every
        # per-session store, so identical OBML compiles once across sessions.
        self._model_cache = ModelCache()
        self._sessions: dict[str, _Session] = {}
        # Set by ``stop()`` to end the cleanup loop; cleared again by ``start()``.
        self._stop_event = threading.Event()
        self._cleanup_thread: threading.Thread | None = None

    @property
    def ttl(self) -> int:
        """Session TTL in seconds."""
        return self._ttl

    @property
    def max_age(self) -> int:
        """Absolute max session lifetime in seconds."""
        return self._max_age

    @property
    def max_sessions(self) -> int:
        """Global concurrent session cap."""
        return self._max_sessions

    @property
    def max_models_per_session(self) -> int:
        """Maximum models a single session may hold."""
        return self._max_models

    @property
    def is_single_model_mode(self) -> bool:
        """True when admin-curated (``MODEL_FILES``) mode is active.

        In this mode the BI-facing catalog should expose only the curated
        (protected) models, not transient user/scratch sessions.
        """
        return self._is_single_model_mode

    # -- lifecycle -----------------------------------------------------------

    def start(self) -> None:
        """Start the background cleanup daemon thread."""
        # A second call while a thread handle is registered is a no-op.
        if self._cleanup_thread is not None:
            return
        self._stop_event.clear()
        self._cleanup_thread = threading.Thread(
            target=self._cleanup_loop, daemon=True, name="session-cleanup"
        )
        self._cleanup_thread.start()

    def stop(self) -> None:
        """Signal the cleanup thread to stop and wait for it."""
        self._stop_event.set()
        if self._cleanup_thread is not None:
            # The wait is bounded to 5 seconds; the handle is cleared
            # afterwards whether or not the thread finished in time.
            self._cleanup_thread.join(timeout=5)
            self._cleanup_thread = None

    # -- public API ----------------------------------------------------------

    def create_session(self, metadata: dict[str, str] | None = None) -> SessionInfo:
        """Create a new session and return its info.

        Raises :class:`SessionCapacityError` when the global session cap
        is reached.
        """
        now_mono = time.monotonic()
        now_wall = datetime.now(UTC)
        session_id = secrets.token_hex(16)  # 32-char hex (128-bit)
        # The session (and its store) is built before the lock is taken; it
        # is only registered once the cap check below has passed.
        session = _Session(
            session_id=session_id,
            store=ModelStore(max_models=self._max_models, shared_cache=self._model_cache),
            created_at=now_wall,
            created_at_mono=now_mono,
            last_accessed=now_mono,
            metadata=metadata or {},
            created_at_wall=now_wall,
            last_accessed_wall=now_wall,
        )
        with self._lock:
            # Count only non-default, non-expired sessions toward the cap.
            active = sum(
                1
                for s in self._sessions.values()
                if s.session_id != _DEFAULT_SESSION_ID and not self._is_expired(s, now_mono)
            )
            if active >= self._max_sessions:
                logger.warning(
                    "Session cap reached (%d/%d), rejecting create",
                    active,
                    self._max_sessions,
                )
                raise SessionCapacityError(
                    f"Maximum number of concurrent sessions reached ({self._max_sessions})"
                )
            self._sessions[session_id] = session
        logger.info("Session created: %s", session_id)
        return self._session_info(session)

    def get_store(self, session_id: str) -> ModelStore:
        """Get the ModelStore for a session, updating its last-accessed time.

        Raises :class:`SessionExpiredError` if the session has expired.
        Raises :class:`SessionNotFoundError` if the session ID is unknown.
        """
        now_mono = time.monotonic()
        with self._lock:
            session = self._sessions.get(session_id)
            if session is None:
                raise SessionNotFoundError(f"Session '{session_id}' not found")
            if self._is_expired(session, now_mono):
                # Expired sessions are torn down here on access instead of
                # waiting for the next background sweep.
                reason = self._expiry_reason(session, now_mono)
                session.store.close()
                del self._sessions[session_id]
                logger.info("Session expired on access: %s (%s)", session_id, reason)
                raise SessionExpiredError(f"Session '{session_id}' has expired ({reason})")
            session.last_accessed = now_mono
            session.last_accessed_wall = datetime.now(UTC)
            return session.store

    def get_session(self, session_id: str) -> SessionInfo:
        """Get session info (also refreshes last-accessed).

        Raises :class:`SessionExpiredError` if the session has expired.
        Raises :class:`SessionNotFoundError` if the session ID is unknown.
        """
        now_mono = time.monotonic()
        with self._lock:
            session = self._sessions.get(session_id)
            if session is None:
                raise SessionNotFoundError(f"Session '{session_id}' not found")
            if self._is_expired(session, now_mono):
                # Same expire-on-access handling as ``get_store``.
                reason = self._expiry_reason(session, now_mono)
                session.store.close()
                del self._sessions[session_id]
                logger.info("Session expired on access: %s (%s)", session_id, reason)
                raise SessionExpiredError(f"Session '{session_id}' has expired ({reason})")
            session.last_accessed = now_mono
            session.last_accessed_wall = datetime.now(UTC)
            return self._session_info(session)

    def close_session(self, session_id: str) -> None:
        """Explicitly close a session.

        Raises :class:`SessionNotFoundError` if the session ID is unknown.
        """
        with self._lock:
            session = self._sessions.get(session_id)
            if session is None:
                raise SessionNotFoundError(f"Session '{session_id}' not found")
            session.store.close()
            del self._sessions[session_id]
        logger.info("Session closed: %s", session_id)

    def list_sessions(self) -> list[SessionInfo]:
        """Return info for all non-expired user sessions.

        Excludes the default session and any admin-managed (protected)
        sessions created by the multi-model startup loader.
        """
        now_mono = time.monotonic()
        result: list[SessionInfo] = []
        with self._lock:
            for session in self._sessions.values():
                if session.session_id == _DEFAULT_SESSION_ID:
                    continue
                if session.protected:
                    continue
                if not self._is_expired(session, now_mono):
                    result.append(self._session_info(session))
        return result

    @property
    def active_count(self) -> int:
        """Number of active (non-expired) sessions."""
        now_mono = time.monotonic()
        with self._lock:
            return sum(1 for s in self._sessions.values() if not self._is_expired(s, now_mono))

    def get_or_create_default(self) -> ModelStore:
        """Get (or lazily create) the legacy ``__default__`` session.

        Unlike :meth:`get_or_create_named`, the default session is NOT
        marked protected — its lifecycle is controlled by the
        ``is_single_model_mode`` flag, not by the protected mechanism.
        This preserves backward compatibility with the v2.3.x model-
        upload semantics where each new user session inherits the
        preloaded YAML.
        """
        with self._lock:
            session = self._sessions.get(_DEFAULT_SESSION_ID)
            if session is not None:
                # No expiry check on this path: an existing default session
                # is refreshed and returned as-is.
                session.last_accessed = time.monotonic()
                session.last_accessed_wall = datetime.now(UTC)
                return session.store
            now_mono = time.monotonic()
            now_wall = datetime.now(UTC)
            session = _Session(
                session_id=_DEFAULT_SESSION_ID,
                store=ModelStore(max_models=self._max_models, shared_cache=self._model_cache),
                created_at=now_wall,
                created_at_mono=now_mono,
                last_accessed=now_mono,
                created_at_wall=now_wall,
                last_accessed_wall=now_wall,
            )
            self._sessions[_DEFAULT_SESSION_ID] = session
            return session.store

    def get_or_create_named(self, session_id: str) -> ModelStore:
        """Get (or lazily create) a session with a caller-chosen id.

        Used by the multi-model startup loader to register each pre-loaded
        model as its own internal session whose id is the resolved model
        name. The created session is marked ``protected`` — exempt from
        idle TTL eviction and not listed by :meth:`list_sessions`. Admin-
        managed.
        """
        with self._lock:
            session = self._sessions.get(session_id)
            if session is not None:
                # An existing session is returned unchanged apart from the
                # refreshed timestamps; its ``protected`` flag is not touched.
                session.last_accessed = time.monotonic()
                session.last_accessed_wall = datetime.now(UTC)
                return session.store
            now_mono = time.monotonic()
            now_wall = datetime.now(UTC)
            session = _Session(
                session_id=session_id,
                store=ModelStore(max_models=self._max_models, shared_cache=self._model_cache),
                created_at=now_wall,
                created_at_mono=now_mono,
                last_accessed=now_mono,
                created_at_wall=now_wall,
                last_accessed_wall=now_wall,
                protected=True,
            )
            self._sessions[session_id] = session
            return session.store

    def list_protected_session_ids(self) -> list[str]:
        """Return the ids of all admin-managed (protected) sessions.

        Used by multi-model discovery (``GET /v1/models``) and by Flight
        routing to enumerate which model names are available.
        """
        with self._lock:
            return [s.session_id for s in self._sessions.values() if s.protected]

    # -- internal ------------------------------------------------------------

    def _is_expired(self, session: _Session, now_mono: float) -> bool:
        """Check if a session has exceeded idle TTL or absolute max-age.

        Protected sessions (admin-loaded models via ``MODEL_FILES``) never
        expire — they're owned by the process lifecycle, not by client
        activity. Without this guard, ``get_store()`` would delete them on
        access past TTL, even though ``_purge_expired`` correctly skips
        them.
        """
        if session.protected:
            return False
        # Both limits are strict: a session is expired only once the elapsed
        # time is greater than the configured value.
        idle = now_mono - session.last_accessed > self._ttl
        aged = now_mono - session.created_at_mono > self._max_age
        return idle or aged

    def _expiry_reason(self, session: _Session, now_mono: float) -> str:
        """Return a human-readable reason why a session expired."""
        idle_elapsed = now_mono - session.last_accessed
        age_elapsed = now_mono - session.created_at_mono
        # Max-age is reported in preference to idle when both are exceeded.
        if age_elapsed > self._max_age:
            return f"max-age {self._max_age}s exceeded after {age_elapsed:.0f}s"
        return f"idle {self._ttl}s exceeded after {idle_elapsed:.0f}s"

    def _session_info(self, session: _Session) -> SessionInfo:
        """Build the public ``SessionInfo`` snapshot for ``session``.

        Both deadlines are derived from the monotonic clock and projected
        onto the current wall-clock time.  Remaining time is clamped at
        zero, so a deadline that has already passed is reported as "now".
        """
        now_wall = datetime.now(UTC)
        idle_remaining = self._ttl - (time.monotonic() - session.last_accessed)
        age_remaining = self._max_age - (time.monotonic() - session.created_at_mono)

        # expires_at = when the idle TTL would fire (from last access)
        expires_at = now_wall + timedelta(seconds=max(0.0, idle_remaining))
        # max_expires_at = absolute deadline (from creation)
        max_expires_at = now_wall + timedelta(seconds=max(0.0, age_remaining))

        return SessionInfo(
            session_id=session.session_id,
            created_at=session.created_at_wall,
            last_accessed_at=session.last_accessed_wall,
            model_count=len(session.store.list_models()),
            metadata=session.metadata,
            expires_at=expires_at,
            max_expires_at=max_expires_at,
        )

    def _purge_expired(self) -> None:
        """Remove all expired sessions (called by cleanup thread).

        Protected sessions (admin-managed pre-loads) are never purged.
        The legacy ``__default__`` session is kept alive when
        ``is_single_model_mode`` is set.
        """
        now_mono = time.monotonic()
        with self._lock:
            skip_default = self._is_single_model_mode
            # Collect the ids first; the table is only mutated in the second loop.
            expired = [
                sid
                for sid, s in self._sessions.items()
                if not s.protected
                and (not skip_default or sid != _DEFAULT_SESSION_ID)
                and self._is_expired(s, now_mono)
            ]
            for sid in expired:
                reason = self._expiry_reason(self._sessions[sid], now_mono)
                self._sessions[sid].store.close()
                del self._sessions[sid]
                logger.info("Session purged: %s (%s)", sid, reason)
        if expired:
            # NOTE(review): ``len(self._sessions)`` is read after the lock has
            # been released, so the "remaining" figure may already reflect
            # concurrent changes — confirm that is acceptable for this log line.
            logger.info(
                "Purge sweep: removed %d session(s), %d remaining",
                len(expired),
                len(self._sessions),
            )

    def _cleanup_loop(self) -> None:
        """Background loop that periodically purges expired sessions."""
        # ``Event.wait`` returns True once the stop event is set, which ends
        # the loop; a timeout returns False and triggers one purge sweep.
        while not self._stop_event.wait(timeout=self._cleanup_interval):
            self._purge_expired()

`active_count` `property` ¶

Number of active (non-expired) sessions.

`start()` ¶

Start the background cleanup daemon thread.

Source code in src/orionbelt/service/session_manager.py

def start(self) -> None:
    """Launch the ``session-cleanup`` daemon thread.

    Does nothing when a cleanup thread handle is already registered.
    Otherwise the stop event is cleared before the new thread is started.
    """
    if self._cleanup_thread is None:
        self._stop_event.clear()
        worker = threading.Thread(
            name="session-cleanup",
            target=self._cleanup_loop,
            daemon=True,
        )
        self._cleanup_thread = worker
        worker.start()

`stop()` ¶

Signal the cleanup thread to stop and wait for it.

Source code in src/orionbelt/service/session_manager.py

def stop(self) -> None:
    """Set the stop event and wait for the cleanup thread, if there is one.

    The wait is bounded to 5 seconds; afterwards the thread handle is
    cleared.
    """
    self._stop_event.set()
    if self._cleanup_thread is None:
        return
    self._cleanup_thread.join(timeout=5)
    self._cleanup_thread = None

`create_session(metadata=None)` ¶

Create a new session and return its info.

Raises `SessionCapacityError` when the global session cap is reached.

Source code in src/orionbelt/service/session_manager.py

def create_session(self, metadata: dict[str, str] | None = None) -> SessionInfo:
    """Register a brand-new session and return its ``SessionInfo``.

    The session gets a random 32-character hex id and its own
    ``ModelStore`` wired to the manager's shared model cache.

    Raises :class:`SessionCapacityError` when the global session cap
    is reached.
    """
    mono_now = time.monotonic()
    wall_now = datetime.now(UTC)
    new_id = secrets.token_hex(16)  # 32-char hex (128-bit)
    fresh = _Session(
        session_id=new_id,
        store=ModelStore(max_models=self._max_models, shared_cache=self._model_cache),
        created_at=wall_now,
        created_at_mono=mono_now,
        last_accessed=mono_now,
        metadata=metadata or {},
        created_at_wall=wall_now,
        last_accessed_wall=wall_now,
    )
    with self._lock:
        # The default session and sessions that are already expired do not
        # count toward the cap.
        in_use = 0
        for existing in self._sessions.values():
            if existing.session_id == _DEFAULT_SESSION_ID:
                continue
            if self._is_expired(existing, mono_now):
                continue
            in_use += 1

        if in_use >= self._max_sessions:
            logger.warning(
                "Session cap reached (%d/%d), rejecting create",
                in_use,
                self._max_sessions,
            )
            raise SessionCapacityError(
                f"Maximum number of concurrent sessions reached ({self._max_sessions})"
            )
        self._sessions[new_id] = fresh
    logger.info("Session created: %s", new_id)
    return self._session_info(fresh)

`get_store(session_id)` ¶

Get the ModelStore for a session, updating its last-accessed time.

Raises `SessionExpiredError` if the session has expired. Raises `SessionNotFoundError` if the session ID is unknown.

Source code in src/orionbelt/service/session_manager.py

def get_store(self, session_id: str) -> ModelStore:
    """Get the ModelStore for a session, updating its last-accessed time.

    Raises :class:`SessionExpiredError` if the session has expired.
    Raises :class:`SessionNotFoundError` if the session ID is unknown.
    """
    now_mono = time.monotonic()
    with self._lock:
        session = self._sessions.get(session_id)
        if session is None:
            raise SessionNotFoundError(f"Session '{session_id}' not found")
        if self._is_expired(session, now_mono):
            reason = self._expiry_reason(session, now_mono)
            session.store.close()
            del self._sessions[session_id]
            logger.info("Session expired on access: %s (%s)", session_id, reason)
            raise SessionExpiredError(f"Session '{session_id}' has expired ({reason})")
        session.last_accessed = now_mono
        session.last_accessed_wall = datetime.now(UTC)
        return session.store

`get_session(session_id)` ¶

Get session info (also refreshes last-accessed).

Source code in src/orionbelt/service/session_manager.py

def get_session(self, session_id: str) -> SessionInfo:
    """Get session info (also refreshes last-accessed)."""
    now_mono = time.monotonic()
    with self._lock:
        session = self._sessions.get(session_id)
        if session is None:
            raise SessionNotFoundError(f"Session '{session_id}' not found")
        if self._is_expired(session, now_mono):
            reason = self._expiry_reason(session, now_mono)
            session.store.close()
            del self._sessions[session_id]
            logger.info("Session expired on access: %s (%s)", session_id, reason)
            raise SessionExpiredError(f"Session '{session_id}' has expired ({reason})")
        session.last_accessed = now_mono
        session.last_accessed_wall = datetime.now(UTC)
        return self._session_info(session)

`close_session(session_id)` ¶

Explicitly close a session.

Source code in src/orionbelt/service/session_manager.py

def close_session(self, session_id: str) -> None:
    """Close a session on request and drop it from the registry.

    Raises ``SessionNotFoundError`` when no session with that ID exists.
    """
    with self._lock:
        if (doomed := self._sessions.get(session_id)) is None:
            raise SessionNotFoundError(f"Session '{session_id}' not found")
        # Close the store first; the entry is only removed once that succeeded.
        doomed.store.close()
        del self._sessions[session_id]
    logger.info("Session closed: %s", session_id)

`list_sessions()` ¶

Return info for all non-expired user sessions.

Excludes the default session and any admin-managed (protected) sessions created by the multi-model startup loader.

Source code in src/orionbelt/service/session_manager.py

def list_sessions(self) -> list[SessionInfo]:
    """Return info for every user session that has not expired.

    The default session and sessions flagged ``protected`` are left out.
    """
    mono_now = time.monotonic()
    with self._lock:
        return [
            self._session_info(candidate)
            for candidate in self._sessions.values()
            if candidate.session_id != _DEFAULT_SESSION_ID
            and not candidate.protected
            and not self._is_expired(candidate, mono_now)
        ]

`get_or_create_default()` ¶

Get (or lazily create) the legacy `__default__` session.

Unlike `get_or_create_named`, the default session is NOT marked protected — its lifecycle is controlled by the `is_single_model_mode` flag, not by the protected mechanism. This preserves backward compatibility with the v2.3.x model-upload semantics where each new user session inherits the preloaded YAML.

Source code in src/orionbelt/service/session_manager.py

def get_or_create_default(self) -> ModelStore:
    """Return the store of the legacy ``__default__`` session, creating it lazily.

    An existing default session only has its last-accessed stamps
    refreshed. Unlike :meth:`get_or_create_named`, the default session is
    NOT marked protected — its lifecycle is controlled by the
    ``is_single_model_mode`` flag, not by the protected mechanism. This
    preserves backward compatibility with the v2.3.x model-upload
    semantics where each new user session inherits the preloaded YAML.
    """
    with self._lock:
        default = self._sessions.get(_DEFAULT_SESSION_ID)
        mono_now = time.monotonic()
        wall_now = datetime.now(UTC)
        if default is None:
            # First use: build the session and register it.
            default = _Session(
                session_id=_DEFAULT_SESSION_ID,
                store=ModelStore(max_models=self._max_models, shared_cache=self._model_cache),
                created_at=wall_now,
                created_at_mono=mono_now,
                last_accessed=mono_now,
                created_at_wall=wall_now,
                last_accessed_wall=wall_now,
            )
            self._sessions[_DEFAULT_SESSION_ID] = default
        else:
            default.last_accessed = mono_now
            default.last_accessed_wall = wall_now
        return default.store

SessionInfo¶

`orionbelt.service.session_manager.SessionInfo` `dataclass` ¶

Public session metadata (returned by list/get).

Source code in src/orionbelt/service/session_manager.py

@dataclass
class SessionInfo:
    """Public session metadata (returned by list/get).

    A plain record of fields with no behaviour of its own. Instances are
    what ``create_session``, ``get_session`` and ``list_sessions`` hand
    back to callers.
    """

    session_id: str  # identifier of the session this record describes
    created_at: datetime
    last_accessed_at: datetime
    model_count: int  # presumably the number of models held by the session's store — confirm
    metadata: dict[str, str]  # presumably the metadata given at session creation — confirm
    # NOTE(review): the two fields below look like an idle-timeout deadline and
    # an absolute lifetime limit respectively — confirm against the code that
    # builds this record.
    expires_at: datetime
    max_expires_at: datetime

Compiler Pipeline¶

`orionbelt.compiler.pipeline.CompilationPipeline` ¶

Orchestrates: Query → Resolution → Planning → AST → SQL.

Source code in src/orionbelt/compiler/pipeline.py

class CompilationPipeline:
    """Drive a query through every compilation stage.

    Orchestrates: Query → Resolution → Planning → AST → SQL.
    """

    def __init__(self) -> None:
        # One resolver plus one planner per planning strategy.
        self._resolver = QueryResolver()
        self._star_planner = StarSchemaPlanner()
        self._cfl_planner = CFLPlanner()
        self._raw_planner = RawPlanner()

    def compile(
        self,
        query: QueryObject,
        model: SemanticModel,
        dialect_name: str,
    ) -> CompilationResult:
        """Compile a query to SQL for the specified dialect.

        SQL validation failures do not abort compilation: they are appended
        to the result's warnings and reflected in ``sql_valid``.
        """
        # The dialect comes first: resolution and planning must share a single
        # ``qualify_table`` — the EXISTS filter operator needs it during
        # resolution to render the correlated subquery's FROM clause.
        dialect = DialectRegistry.get(dialect_name)
        qualify_table = lambda obj: dialect.format_table_ref(  # noqa: E731
            obj.database, obj.schema_name, obj.code
        )

        # Phase 1: resolution.
        resolved = self._resolver.resolve(query, model, qualify_table=qualify_table)

        # Phase 1.5: fanout detection (skipped for CFL — each fact is queried
        # independently).
        if not resolved.requires_cfl:
            detect_fanout(resolved, model)

        # Phase 2: planning (raw / CFL / star schema).
        use_cfl = resolved.requires_cfl or resolved.dimensions_exclude
        if resolved.is_raw:
            plan = self._raw_planner.plan(
                resolved,
                model,
                qualify_table=qualify_table,
                dialect=dialect,
                union_by_name=dialect.capabilities.supports_union_all_by_name,
            )
        elif use_cfl:
            plan = self._cfl_planner.plan(
                resolved,
                model,
                qualify_table=qualify_table,
                union_by_name=dialect.capabilities.supports_union_all_by_name,
                dialect=dialect,
            )
        else:
            plan = self._star_planner.plan(
                resolved,
                model,
                qualify_table=qualify_table,
                dialect=dialect,
            )

        # Phase 2.3 – 2.6: aggregate-mode passes (filter context, PoP, totals,
        # cumulative, window) plus HAVING projection cleanup. Raw mode has no
        # measures, so the passes are skipped entirely. Pass ordering and the
        # feature-compatibility rules live in ``compiler/passes.py``.
        final_ast = plan.ast
        if not resolved.is_raw:
            final_ast = apply_aggregate_passes(
                final_ast,
                CompileContext(
                    resolved=resolved,
                    model=model,
                    dialect=dialect,
                    qualify_table=qualify_table,
                ),
            )

        # Phase 3: dialect-specific SQL rendering.
        sql = CodeGenerator(dialect).generate(final_ast)

        # Phase 4: SQL validation (non-blocking) — problems become warnings.
        validation_errors = validate_sql(sql, dialect_name)
        sql_valid = len(validation_errors) == 0
        sql_warnings = [
            warning(
                code=WarningCode.SQL_VALIDATION,
                message=f"SQL validation: {problem}",
            )
            for problem in validation_errors
        ]
        if sql_warnings:
            warnings = resolved.warnings + sql_warnings
        else:
            warnings = resolved.warnings

        # Explain plan, then the deduplicated physical tables the query touches.
        explain = self._build_explain(resolved, model, use_cfl, plan)
        physical_tables = _compute_physical_tables(resolved, query, model)

        resolved_info = ResolvedInfo(
            fact_tables=resolved.fact_tables,
            dimensions=[dim.name for dim in resolved.dimensions],
            measures=[meas.name for meas in resolved.measures],
        )
        return CompilationResult(
            sql=sql,
            dialect=dialect_name,
            physical_tables=physical_tables,
            resolved=resolved_info,
            warnings=warnings,
            sql_valid=sql_valid,
            explain=explain,
        )

    @staticmethod
    def _q(name: str) -> str:
        """Wrap an identifier in double quotes for explain output."""
        return '"{}"'.format(name)

    def _build_explain(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        use_cfl: bool,
        plan: QueryPlan,
    ) -> ExplainPlan:
        """Assemble the explain plan from the resolution result and the plan.

        ``use_cfl`` is the planner decision taken in :meth:`compile`; the CFL
        leg details are read from ``plan.cfl_legs``.
        """
        q = self._q

        # --- Which planner ran, and why -----------------------------------
        if resolved.is_raw:
            planner = "Raw"
            distinct_note = " with DISTINCT" if resolved.distinct else ""
            planner_reason = (
                f"Raw-mode projection of physical columns{distinct_note} — "
                f"no aggregation, no GROUP BY"
            )
        elif not use_cfl:
            planner = "Star Schema"
            planner_reason = (
                "All requested objects are reachable from a single base via directed joins"
            )
        else:
            planner = "CFL"
            if resolved.dimensions_exclude:
                planner_reason = (
                    "dimensionsExclude anti-join — "
                    "CROSS JOIN of distinct values EXCEPT existing combinations"
                )
            else:
                sources = ", ".join(q(s) for s in sorted(resolved.measure_source_objects))
                planner_reason = (
                    f"Measures reference independent fact tables ({sources}) — "
                    f"Composite Fact Layer merges them via UNION ALL"
                )

        # --- Base object: the wording mirrors the actual selection logic ---
        base = resolved.base_object
        source_count = len(resolved.measure_source_objects)
        if source_count > 1 and use_cfl:
            base_reason = "Not applicable — each CFL leg uses its own common root (see cfl_legs)"
        elif source_count > 1:
            sources = ", ".join(q(s) for s in sorted(resolved.measure_source_objects))
            base_reason = (
                f"{q(base)} selected as base — most connected fact table "
                f"among measure sources ({sources})"
            )
        elif source_count == 1:
            base_reason = f"{q(base)} selected as base — sole measure source object"
        elif len(resolved.required_objects) > 1:
            base_reason = (
                f"{q(base)} selected as base — common root that can reach "
                f"all required objects via directed joins"
            )
        else:
            base_reason = f"{q(base)} selected as base for single-object query"

        # --- Joins ----------------------------------------------------------
        # For CFL queries the per-leg joins are more informative, so the
        # resolution-level joins are only listed when CFL is not in use.
        explain_joins: list[ExplainJoin] = []
        steps = () if use_cfl else resolved.join_steps
        for step in steps:
            column_pairs = [
                f"{left} = {right}"
                for left, right in zip(step.from_columns, step.to_columns, strict=True)
            ]
            reason = (
                (
                    f"Reversed join from {q(step.from_object)} to {q(step.to_object)} — "
                    f"original join was defined in the opposite direction"
                )
                if step.reversed
                else (
                    f"Join {q(step.from_object)} → {q(step.to_object)} to include "
                    f"columns needed by the query"
                )
            )
            explain_joins.append(
                ExplainJoin(
                    from_object=step.from_object,
                    to_object=step.to_object,
                    join_columns=column_pairs,
                    reason=reason,
                    cardinality=step.cardinality.value,
                )
            )

        # --- CFL leg details -------------------------------------------------
        cfl_leg_explains: list[ExplainCflLeg] = [
            ExplainCflLeg(
                measure_source=leg.measure_source,
                common_root=leg.common_root,
                reason=leg.reason,
                measures=leg.measures,
                joins=leg.joins,
            )
            for leg in plan.cfl_legs
        ]

        return ExplainPlan(
            planner=planner,
            planner_reason=planner_reason,
            base_object=base,
            base_object_reason=base_reason,
            joins=explain_joins,
            where_filter_count=len(resolved.where_filters),
            having_filter_count=len(resolved.having_filters),
            has_totals=resolved.has_totals,
            has_grain_overrides=resolved.has_grain_overrides,
            has_filter_context=resolved.has_filter_context,
            has_cumulative=resolved.has_cumulative,
            has_pop=resolved.has_pop,
            has_window=resolved.has_window,
            cfl_legs=cfl_leg_explains,
        )

`compile(query, model, dialect_name)` ¶

Compile a query to SQL for the specified dialect.

Source code in src/orionbelt/compiler/pipeline.py

def compile(
    self,
    query: QueryObject,
    model: SemanticModel,
    dialect_name: str,
) -> CompilationResult:
    """Compile a query to SQL for the specified dialect.

    SQL validation failures do not abort compilation: they are appended
    to the result's warnings and reflected in ``sql_valid``.
    """
    # The dialect comes first: resolution and planning must share a single
    # ``qualify_table`` — the EXISTS filter operator needs it during
    # resolution to render the correlated subquery's FROM clause.
    dialect = DialectRegistry.get(dialect_name)
    qualify_table = lambda obj: dialect.format_table_ref(  # noqa: E731
        obj.database, obj.schema_name, obj.code
    )

    # Phase 1: resolution.
    resolved = self._resolver.resolve(query, model, qualify_table=qualify_table)

    # Phase 1.5: fanout detection (skipped for CFL — each fact is queried
    # independently).
    if not resolved.requires_cfl:
        detect_fanout(resolved, model)

    # Phase 2: planning (raw / CFL / star schema).
    use_cfl = resolved.requires_cfl or resolved.dimensions_exclude
    if resolved.is_raw:
        plan = self._raw_planner.plan(
            resolved,
            model,
            qualify_table=qualify_table,
            dialect=dialect,
            union_by_name=dialect.capabilities.supports_union_all_by_name,
        )
    elif use_cfl:
        plan = self._cfl_planner.plan(
            resolved,
            model,
            qualify_table=qualify_table,
            union_by_name=dialect.capabilities.supports_union_all_by_name,
            dialect=dialect,
        )
    else:
        plan = self._star_planner.plan(
            resolved,
            model,
            qualify_table=qualify_table,
            dialect=dialect,
        )

    # Phase 2.3 – 2.6: aggregate-mode passes (filter context, PoP, totals,
    # cumulative, window) plus HAVING projection cleanup. Raw mode has no
    # measures, so the passes are skipped entirely. Pass ordering and the
    # feature-compatibility rules live in ``compiler/passes.py``.
    final_ast = plan.ast
    if not resolved.is_raw:
        final_ast = apply_aggregate_passes(
            final_ast,
            CompileContext(
                resolved=resolved,
                model=model,
                dialect=dialect,
                qualify_table=qualify_table,
            ),
        )

    # Phase 3: dialect-specific SQL rendering.
    sql = CodeGenerator(dialect).generate(final_ast)

    # Phase 4: SQL validation (non-blocking) — problems become warnings.
    validation_errors = validate_sql(sql, dialect_name)
    sql_valid = len(validation_errors) == 0
    sql_warnings = [
        warning(
            code=WarningCode.SQL_VALIDATION,
            message=f"SQL validation: {problem}",
        )
        for problem in validation_errors
    ]
    if sql_warnings:
        warnings = resolved.warnings + sql_warnings
    else:
        warnings = resolved.warnings

    # Explain plan, then the deduplicated physical tables the query touches.
    explain = self._build_explain(resolved, model, use_cfl, plan)
    physical_tables = _compute_physical_tables(resolved, query, model)

    resolved_info = ResolvedInfo(
        fact_tables=resolved.fact_tables,
        dimensions=[dim.name for dim in resolved.dimensions],
        measures=[meas.name for meas in resolved.measures],
    )
    return CompilationResult(
        sql=sql,
        dialect=dialect_name,
        physical_tables=physical_tables,
        resolved=resolved_info,
        warnings=warnings,
        sql_valid=sql_valid,
        explain=explain,
    )

Query Resolution¶

`orionbelt.compiler.resolution.QueryResolver` ¶

Resolves a QueryObject + SemanticModel into a ResolvedQuery.

Source code in src/orionbelt/compiler/resolution.py

class QueryResolver:
    """Resolves a QueryObject + SemanticModel into a ResolvedQuery."""

    def resolve(
        self,
        query: QueryObject,
        model: SemanticModel,
        qualify_table: Callable[[DataObject], str] | None = None,
    ) -> ResolvedQuery:
        """Resolve ``query`` against ``model`` into a :class:`ResolvedQuery`.

        The steps run in a fixed order:

        * seed the result with the query's limit, offset, path names,
          raw/distinct flags and grouping;
        * resolve raw fields (raw mode) or dimensions, measures and any
          HAVING-only measures (aggregate mode);
        * pick the base object and set ``requires_cfl`` when measure sources
          (or, in raw mode, field objects) are not reachable from it;
        * validate ``dimensionsExclude`` and ``usePathNames``, resolve the
          join path and record required objects that cannot be reached;
        * add static model filters, then the query's WHERE / HAVING filters;
        * resolve ORDER BY, backfill NULLS FIRST for grouped queries and add
          an automatic ORDER BY when LIMIT or grouping is set without one.

        Errors are collected in the resolution context along the way and
        reported together at the end.

        Args:
            query: The query to resolve.
            model: The semantic model the query refers to.
            qualify_table: Optional callable stored on the resolution
                context; it is not called directly in this method.

        Returns:
            The populated ``ResolvedQuery``.

        Raises:
            ResolutionError: If any error was accumulated during resolution.
        """
        ctx = _ResolutionContext(
            model=model,
            result=ResolvedQuery(
                limit=query.limit,
                offset=query.offset,
                use_path_names=list(query.use_path_names),
                is_raw=query.select.is_raw,
                distinct=query.select.distinct,
                grouping=query.grouping,
            ),
            qualify_table=qualify_table,
        )

        # Build global column lookup: col_name → (object_name, source_column)
        for obj_name, obj in model.data_objects.items():
            for col_name, col_obj in obj.columns.items():
                ctx.global_columns[col_name] = (obj_name, col_obj.code)

        if query.select.is_raw:
            # Raw mode: project physical columns, no aggregation.
            for ref in query.select.fields:
                self._resolve_raw_field(ctx, ref)
        else:
            # Aggregate mode (default).
            # 1. Resolve dimensions (string or coalesce group).
            # Coalesce groups expand into their constituent dimensions, each
            # tagged with the same coalesce_alias so the CFL outer wrapper can
            # emit COALESCE(d1, d2, ...) AS <alias>.
            for dim_entry in query.select.dimensions:
                if isinstance(dim_entry, CoalesceDimension):
                    self._resolve_coalesce_dimension(ctx, dim_entry, ctx.result.coalesce_aliases)
                else:
                    self._append_resolved_dimension(ctx, dim_entry)

            # 2. Resolve measures and track their source objects
            for measure_name in query.select.measures:
                resolved_meas = self._resolve_measure(ctx, measure_name)
                if resolved_meas:
                    ctx.result.measures.append(resolved_meas)
                    source_objs = self._get_measure_source_objects(ctx, measure_name)
                    ctx.result.measure_source_objects.update(source_objs)
                    ctx.result.required_objects.update(source_objs)

            # 2.5. Auto-include measures referenced by HAVING but not by SELECT.
            # Without this, codegen emits a HAVING clause that references an
            # alias for a column the SELECT doesn't project — every database
            # rejects the SQL with a "must appear in GROUP BY" binder error.
            # Routing this through the regular measure-resolution path also
            # updates ``measure_source_objects`` so the multi-fact CFL trigger
            # below sees the HAVING-only measure's source.
            existing_measure_names = {m.name for m in ctx.result.measures}
            for ref in self._collect_having_measure_refs(query, model):
                if ref in existing_measure_names:
                    continue
                resolved_meas = self._resolve_measure(ctx, ref)
                if resolved_meas is None:
                    continue
                ctx.result.measures.append(resolved_meas)
                ctx.result.having_only_measures.add(ref)
                existing_measure_names.add(ref)
                source_objs = self._get_measure_source_objects(ctx, ref)
                ctx.result.measure_source_objects.update(source_objs)
                ctx.result.required_objects.update(source_objs)

        # 3. Determine base object (the one with most joins / most measures)
        ctx.result.base_object = self._select_base_object(ctx)
        if ctx.result.base_object:
            ctx.result.required_objects.add(ctx.result.base_object)

        # Detect multi-fact: CFL is needed only when measure source objects
        # span multiple independent fact tables.
        if len(ctx.result.measure_source_objects) > 1:
            graph = JoinGraph(model, use_path_names=query.use_path_names or None)
            reachable = graph.descendants(ctx.result.base_object)
            unreachable = ctx.result.measure_source_objects - reachable - {ctx.result.base_object}
            if unreachable:
                ctx.result.requires_cfl = True

        # Dimension-only queries: when dimensions span independent branches,
        # join through intermediate bridge/fact tables (no CFL needed).
        # Add intermediate tables from the join steps to required_objects
        # so the star schema planner includes them.
        if not ctx.result.measure_source_objects and ctx.result.dimensions:
            dim_objects = {d.object_name for d in ctx.result.dimensions}
            if not dim_objects <= {ctx.result.base_object}:
                graph = JoinGraph(model, use_path_names=query.use_path_names or None)
                steps = graph.find_join_path(
                    {ctx.result.base_object},
                    dim_objects,
                    via_constraints=ctx.result.via_constraints or None,
                )
                for step in steps:
                    ctx.result.required_objects.add(step.from_object)
                    ctx.result.required_objects.add(step.to_object)

        # Raw mode: detect multi-fact (fields span objects unreachable from
        # the base via directed joins). The pipeline rejects this case for
        # now — raw CFL is a planned follow-up.
        if ctx.result.is_raw and ctx.result.base_object:
            field_objects = {f.object_name for f in ctx.result.fields}
            if len(field_objects) > 1:
                graph = JoinGraph(model, use_path_names=query.use_path_names or None)
                reachable = graph.descendants(ctx.result.base_object)
                unreachable = field_objects - reachable - {ctx.result.base_object}
                if unreachable:
                    ctx.result.requires_cfl = True

        # Validate dimensionsExclude constraints
        if query.dimensions_exclude:
            if query.select.measures:
                ctx.errors.append(
                    SemanticError(
                        code="DIMENSIONS_EXCLUDE_WITH_MEASURES",
                        message="dimensionsExclude cannot be combined with measures",
                        path="select",
                    )
                )
            elif len(ctx.result.dimensions) < 2:
                ctx.errors.append(
                    SemanticError(
                        code="DIMENSIONS_EXCLUDE_INSUFFICIENT",
                        message="dimensionsExclude requires at least 2 dimensions",
                        path="select.dimensions",
                    )
                )
            else:
                ctx.result.dimensions_exclude = True

        # 4. Validate usePathNames before building join graph
        self._validate_use_path_names(ctx, query.use_path_names)

        # 5. Resolve join paths
        ctx.graph = JoinGraph(model, use_path_names=query.use_path_names or None)
        if ctx.result.base_object and len(ctx.result.required_objects) > 1:
            ctx.result.join_steps = ctx.graph.find_join_path(
                {ctx.result.base_object},
                ctx.result.required_objects,
                via_constraints=ctx.result.via_constraints or None,
            )

        # Build set of all objects present in the query's join graph
        if ctx.result.base_object:
            ctx.joined_objects.add(ctx.result.base_object)
        for step in ctx.result.join_steps:
            ctx.joined_objects.add(step.to_object)

        # Detect required objects that the star-schema planner cannot reach.
        # Many-to-one joins are forward-only (reverse traversal would inflate
        # the base table), so a required object that's only reachable via a
        # reverse m-to-1 hop is unreachable.  Raise a clear error rather than
        # silently producing wrong SQL.  CFL legs are validated separately.
        if ctx.result.base_object and not ctx.result.requires_cfl:
            unreachable = ctx.result.required_objects - ctx.joined_objects
            for unreachable_name in sorted(unreachable):
                ctx.errors.append(
                    SemanticError(
                        code="UNREACHABLE_REQUIRED_OBJECT",
                        message=(
                            f"Data object '{unreachable_name}' is required by the query but "
                            f"cannot be reached from base '{ctx.result.base_object}' via "
                            f"directed joins. Many-to-one joins are forward-only; reverse "
                            f"traversal would inflate row counts. Add an explicit join from "
                            f"'{ctx.result.base_object}' (or an intermediate object) to "
                            f"'{unreachable_name}', or split the query so each fact is "
                            f"queried independently."
                        ),
                        path="select",
                    )
                )

        # 5b. Inject static model filters — always applied as WHERE conditions
        static_exprs: list[Expr] = []
        for mf in model.filters:
            static_filter = self._resolve_static_filter(ctx, mf)
            if static_filter:
                ctx.result.where_filters.append(static_filter)
                static_exprs.append(static_filter.expression)

        # 6. Classify filters — skip query-time duplicates of static filters
        for qfi in query.where:
            resolved_filter = self._resolve_filter_item(ctx, qfi, is_having=False)
            if resolved_filter and resolved_filter.expression not in static_exprs:
                ctx.result.where_filters.append(resolved_filter)

        for qfi in query.having:
            resolved_filter = self._resolve_filter_item(ctx, qfi, is_having=True)
            if resolved_filter:
                ctx.result.having_filters.append(resolved_filter)

        # 7. Resolve order by — must reference a dimension or measure in SELECT
        select_count = len(ctx.result.dimensions) + len(ctx.result.measures)
        for ob in query.order_by:
            expr = self._resolve_order_by_field(ctx, ob.field, select_count)
            if expr:
                ctx.result.order_by_exprs.append((expr, ob.direction == "desc", ob.nulls))

        # 8. ROLLUP / CUBE: backfill NULLS FIRST on any explicit ORDER BY entry
        # that didn't specify a NULLs position. Subtotal and grand-total rows
        # carry NULLs in the rolled-up group-by columns, and BI tools expect
        # those totals at the top of the result — not interleaved with details.
        if ctx.result.grouping is not None and ctx.result.order_by_exprs:
            ctx.result.order_by_exprs = [
                (expr, desc, NullsPosition.FIRST if nulls is None else nulls)
                for expr, desc, nulls in ctx.result.order_by_exprs
            ]

        # 9. Auto-order — when no explicit ORDER BY, append ORDER BY over all
        # SELECT dimensions (or raw fields) under two conditions:
        #   (a) LIMIT is set: cache hashes on compiled SQL; without ORDER BY
        #       ``LIMIT N`` returns any N rows, freezing one arbitrary slice.
        #   (b) ROLLUP / CUBE: subtotal layout is otherwise unpredictable.
        # ROLLUP / CUBE defaults to NULLS FIRST (totals at the top).
        # Aggregate-only queries (no dims, no fields) are already single-row
        # deterministic — skip.
        needs_auto_order = not ctx.result.order_by_exprs and (
            ctx.result.limit is not None or ctx.result.grouping is not None
        )
        if needs_auto_order:
            nulls_default = NullsPosition.FIRST if ctx.result.grouping is not None else None
            if ctx.result.is_raw and ctx.result.fields:
                for f in ctx.result.fields:
                    ctx.result.order_by_exprs.append(
                        (ColumnRef(name=f.alias), False, nulls_default)
                    )
            elif ctx.result.dimensions:
                for dim in ctx.result.dimensions:
                    ctx.result.order_by_exprs.append(
                        (ColumnRef(name=dim.name), False, nulls_default)
                    )

        if ctx.errors:
            raise ResolutionError(ctx.errors)

        return ctx.result

    # -- raw mode fields -----------------------------------------------------

    def _resolve_raw_field(self, ctx: _ResolutionContext, ref: str) -> None:
        """Resolve a ``DataObject.Column`` reference for raw-mode projection.

        Thin delegate: the work is done by ``raw_resolution.resolve_raw_field``,
        which receives this resolver, the context and the reference unchanged.

        Errors are accumulated in the resolution context (raised at the end).
        """
        raw_resolution.resolve_raw_field(self, ctx, ref)

    # -- dimensions ----------------------------------------------------------

    def _append_resolved_dimension(
        self,
        ctx: _ResolutionContext,
        dim_str: str,
        coalesce_alias: str | None = None,
    ) -> ResolvedDimension | None:
        """Parse and resolve one dimension string, recording it on the result.

        Returns the resolved dimension, or ``None`` when resolution yields
        nothing (in which case the result is left untouched). A ``via``
        declared on the model's dimension definition is copied onto the
        resolved dimension and registered both as a required object and as a
        via constraint for the dimension's object.
        """
        parsed = DimensionRef.parse(dim_str)
        dimension = self._resolve_dimension(ctx, parsed)
        if dimension is None:
            return None

        outcome = ctx.result
        definition = ctx.model.dimensions.get(parsed.name)
        via = definition.via if definition else None
        if via:
            dimension.via = via
            outcome.required_objects.add(via)
            outcome.via_constraints[dimension.object_name] = via

        if coalesce_alias is not None:
            dimension.coalesce_alias = coalesce_alias

        outcome.dimensions.append(dimension)
        outcome.required_objects.add(dimension.object_name)
        return dimension

    def _resolve_coalesce_dimension(
        self,
        ctx: _ResolutionContext,
        coalesce: CoalesceDimension,
        seen_aliases: set[str],
    ) -> None:
        """Expand a coalesce group into its constituent resolved dimensions.

        Validates: at least 2 members, alias is unique within the query and
        does not collide with an existing dimension/measure name, all members
        resolve to the same abstract column type.
        """
        alias = coalesce.alias
        if not alias:
            ctx.errors.append(
                SemanticError(
                    code="COALESCE_MISSING_ALIAS",
                    message="Coalesce dimension requires a non-empty 'as' alias",
                    path="select.dimensions",
                )
            )
            return
        if alias in seen_aliases:
            ctx.errors.append(
                SemanticError(
                    code="DUPLICATE_COALESCE_ALIAS",
                    message=f"Duplicate coalesce alias '{alias}' in this query",
                    path="select.dimensions",
                )
            )
            return
        if alias in ctx.model.dimensions or alias in ctx.model.effective_measures:
            ctx.errors.append(
                SemanticError(
                    code="COALESCE_ALIAS_COLLISION",
                    message=(
                        f"Coalesce alias '{alias}' collides with an existing "
                        f"model dimension or measure name"
                    ),
                    path="select.dimensions",
                )
            )
            return
        if len(coalesce.coalesce) < 2:
            ctx.errors.append(
                SemanticError(
                    code="COALESCE_TOO_FEW_MEMBERS",
                    message=(
                        f"Coalesce '{alias}' requires at least 2 dimensions "
                        f"(got {len(coalesce.coalesce)})"
                    ),
                    path="select.dimensions",
                )
            )
            return
        seen_aliases.add(alias)

        # Resolve each member with the alias tag; verify type compatibility.
        member_types: set[str] = set()
        for member in coalesce.coalesce:
            resolved = self._append_resolved_dimension(ctx, member, coalesce_alias=alias)
            if resolved:
                dim_def = ctx.model.dimensions.get(member)
                if dim_def:
                    member_types.add(dim_def.result_type.value)
        if len(member_types) > 1:
            ctx.errors.append(
                SemanticError(
                    code="COALESCE_TYPE_MISMATCH",
                    message=(
                        f"Coalesce '{alias}' members have incompatible result types: "
                        f"{sorted(member_types)}"
                    ),
                    path="select.dimensions",
                )
            )

    def _resolve_dimension(
        self, ctx: _ResolutionContext, ref: DimensionRef
    ) -> ResolvedDimension | None:
        """Map a dimension reference onto the physical column backing it.

        On failure an ``UNKNOWN_DIMENSION`` or ``UNKNOWN_DATA_OBJECT`` error is
        appended to ``ctx.errors`` and ``None`` is returned.  The grain on the
        reference takes precedence over the dimension's own ``time_grain``.
        """
        model = ctx.model
        definition = model.dimensions.get(ref.name)
        if definition is None:
            unknown_dimension = SemanticError(
                code="UNKNOWN_DIMENSION",
                message=f"Unknown dimension '{ref.name}'",
                path="select.dimensions",
            )
            ctx.errors.append(unknown_dimension)
            return None

        object_name, column_name = definition.view, definition.column
        data_object = model.data_objects.get(object_name)
        if data_object is None:
            unknown_object = SemanticError(
                code="UNKNOWN_DATA_OBJECT",
                message=f"Dimension '{ref.name}' references unknown data object '{object_name}'",
            )
            ctx.errors.append(unknown_object)
            return None

        # Fall back to the logical column name when the object has no such
        # column entry to take a ``code`` from.
        column = data_object.columns.get(column_name)
        if column:
            physical_column = column.code
        else:
            physical_column = column_name

        return ResolvedDimension(
            name=ref.name,
            object_name=object_name,
            column_name=column_name,
            source_column=physical_column,
            grain=ref.grain or definition.time_grain,
        )

    # -- measures & metrics --------------------------------------------------

    def _resolve_measure(self, ctx: _ResolutionContext, name: str) -> ResolvedMeasure | None:
        """Turn a measure name into a resolved aggregate expression.

        A name that is not a measure but matches a metric is handed to
        ``_resolve_metric``.  An unknown name appends ``UNKNOWN_MEASURE`` and
        yields ``None``.  A measure with a grain override whose effective grain
        is not covered by the query's dimensions appends ``GRAIN_NOT_SUBSET``
        (the measure is still returned).
        """
        definition = ctx.model.effective_measures.get(name)
        if definition is None:
            fallback_metric = ctx.model.metrics.get(name)
            if fallback_metric:
                return self._resolve_metric(ctx, name, fallback_metric)
            ctx.errors.append(
                SemanticError(
                    code="UNKNOWN_MEASURE",
                    message=f"Unknown measure '{name}'",
                    path="select.measures",
                )
            )
            return None

        aggregate = self._build_measure_expr(ctx, definition)
        declared_grain = definition.grain
        resolved_grain: list[str] | None = None
        if declared_grain is not None:
            query_dim_names = [d.name for d in ctx.result.dimensions]
            resolved_grain = _resolve_effective_grain(declared_grain, query_dim_names)
            if resolved_grain is not None:
                # Grain entries that the query does not select as dimensions.
                outside = set(resolved_grain) - set(query_dim_names)
                if outside:
                    bad = sorted(outside)
                    ctx.errors.append(
                        SemanticError(
                            code="GRAIN_NOT_SUBSET",
                            message=(
                                f"Measure '{name}' grain {bad} is not a subset of "
                                f"query dimensions {query_dim_names}. "
                                f"This would cause row multiplication."
                            ),
                            path="select.measures",
                        )
                    )

        return ResolvedMeasure(
            name=name,
            aggregation=definition.aggregation,
            expression=aggregate,
            is_expression=definition.expression is not None,
            total=definition.total,
            grain_override=declared_grain,
            effective_grain=resolved_grain,
            filter_context=definition.filter_context,
        )

    def _build_measure_expr(self, ctx: _ResolutionContext, measure: Measure) -> Expr:
        """Construct the aggregate ``FunctionCall`` for *measure*.

        Order of precedence: engine-delegated ``MEASURE`` aggregation, then
        expression-based measures (via ``_expand_expression``), then plain
        column-based aggregation.  The column-based result is passed through
        ``_apply_measure_filters`` before being returned.
        """
        # Engine-delegated aggregation (Databricks Metric View): emit
        # ``MEASURE("<label>")`` literally — there is no source column to read;
        # the engine resolves the aggregation by name.  Dialect support is
        # enforced downstream by ``_check_aggregation_supported``.
        if measure.aggregation == AggregationType.MEASURE:
            label_ref = ColumnRef(name=measure.label, table=None)
            return FunctionCall(name="MEASURE", args=[label_ref])
        if measure.expression:
            return self._expand_expression(ctx, measure)

        def column_expr(object_name: str, column_name: str) -> Expr:
            # Known columns route through ``make_column_expr`` so a computed
            # (``expression:``) column inlines its template body — otherwise
            # e.g. ``count_distinct`` over such a column would emit
            # ``COUNT(DISTINCT "obj"."")`` (zero-length identifier, DB error).
            owner = ctx.model.data_objects.get(object_name)
            if owner and column_name in owner.columns:
                return make_column_expr(ctx.model, object_name, column_name)
            return ColumnRef(name=column_name, table=object_name)

        # A column-less ref (``dataObject`` set, ``column`` empty) anchors the
        # measure on the object without naming a column — used by the
        # synthesized row-count measure to emit ``COUNT(*)`` while still
        # contributing the anchor to source-object resolution.  Such refs
        # produce no argument here.
        operands: list[Expr] = [
            column_expr(ref.view or "", ref.column)
            for ref in (measure.columns or ())
            if ref.column
        ]
        if not operands:
            operands = [Literal.number(1)]

        func_name = measure.aggregation.upper()
        use_distinct = measure.distinct
        if func_name == "COUNT_DISTINCT":
            func_name, use_distinct = "COUNT", True

        # LISTAGG: attach separator and optional ordering.
        delimiter: str | None = None
        ordering: list[OrderByItem] = []
        if func_name == "LISTAGG":
            delimiter = "," if measure.delimiter is None else measure.delimiter
            within = measure.within_group
            if within:
                sort_expr = column_expr(within.column.view or "", within.column.column or "")
                ordering = [
                    OrderByItem(expr=sort_expr, desc=within.order.upper() == "DESC"),
                ]

        aggregate = FunctionCall(
            name=func_name,
            args=operands,
            distinct=use_distinct,
            order_by=ordering,
            separator=delimiter,
        )
        return self._apply_measure_filters(ctx, measure, aggregate)

    def _expand_expression(self, ctx: _ResolutionContext, measure: Measure) -> Expr:
        """Parse a measure's ``{[DataObject].[Column]}`` formula and aggregate it.

        The formula is tokenized and parsed into an AST, wrapped in the
        measure's aggregate function (``COUNT_DISTINCT`` becomes
        ``COUNT(DISTINCT …)``), and finally passed through
        ``_apply_measure_filters``.
        """
        formula = measure.expression or ""
        agg_name = measure.aggregation.upper()

        body = parse_expression(tokenize_measure_expression(formula, ctx.model))

        is_count_distinct = agg_name == "COUNT_DISTINCT"
        aggregate = FunctionCall(
            name="COUNT" if is_count_distinct else agg_name,
            args=[body],
            distinct=True if is_count_distinct else measure.distinct,
        )
        return self._apply_measure_filters(ctx, measure, aggregate)

    @staticmethod
    def _apply_measure_filters(
        ctx: _ResolutionContext, measure: Measure, func: FunctionCall
    ) -> FunctionCall:
        """Wrap aggregate args with CASE WHEN if the measure has filters."""
        if not measure.filters:
            return func
        condition = build_measure_filter_condition(measure.filters, ctx.model, ctx.errors)
        if condition is None:
            return func
        wrapped_args: list[Expr] = [CaseExpr(when_clauses=[(condition, arg)]) for arg in func.args]
        return FunctionCall(
            name=func.name,
            args=wrapped_args,
            distinct=func.distinct,
            order_by=func.order_by,
            separator=func.separator,
        )

    def _resolve_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Resolve a metric to its combined expression.

        Thin wrapper: forwards to ``metric_resolution.resolve_metric`` with this
        resolver as the first argument and returns its result unchanged.
        """
        return metric_resolution.resolve_metric(self, ctx, name, metric)

    def _validate_partition_dimensions(
        self,
        ctx: _ResolutionContext,
        metric_name: str,
        partition_by: list[str],
        path_template: str,
    ) -> bool:
        """Validate a metric's ``partition_by`` dimensions.

        Thin wrapper: forwards all arguments, with this resolver first, to
        ``metric_resolution.validate_partition_dimensions`` and returns its
        boolean result unchanged.
        """
        return metric_resolution.validate_partition_dimensions(
            self, ctx, metric_name, partition_by, path_template
        )

    def _resolve_window_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Resolve a window metric (rank/lag/lead/ntile/first_value/last_value).

        Thin wrapper: forwards to ``metric_resolution.resolve_window_metric``
        with this resolver as the first argument.
        """
        return metric_resolution.resolve_window_metric(self, ctx, name, metric)

    def _resolve_derived_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Resolve a derived metric to its combined expression.

        Thin wrapper: forwards to ``metric_resolution.resolve_derived_metric``
        with this resolver as the first argument.
        """
        return metric_resolution.resolve_derived_metric(self, ctx, name, metric)

    def _resolve_cumulative_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Resolve a cumulative metric referencing an existing measure.

        Thin wrapper: forwards to ``metric_resolution.resolve_cumulative_metric``
        with this resolver as the first argument.
        """
        return metric_resolution.resolve_cumulative_metric(self, ctx, name, metric)

    def _resolve_pop_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Resolve a period-over-period metric.

        Thin wrapper: forwards to ``metric_resolution.resolve_pop_metric`` with
        this resolver as the first argument.
        """
        return metric_resolution.resolve_pop_metric(self, ctx, name, metric)

    def _collect_having_measure_refs(self, query: QueryObject, model: SemanticModel) -> list[str]:
        """Collect measure/metric names referenced in any HAVING filter.

        Walks ``query.having`` recursively (each entry is a
        ``QueryFilter`` or a ``QueryFilterGroup``) and returns the
        ordered, de-duplicated list of ``field`` values that name a
        known measure or metric in the model. Order is preserved for
        deterministic resolution; duplicates are dropped on first sight.
        """

        seen: set[str] = set()
        out: list[str] = []
        measure_names = model.effective_measures

        def _visit(item: QueryFilterItem) -> None:
            if isinstance(item, QueryFilterGroup):
                for child in item.filters:
                    _visit(child)
                return
            field = item.field
            if field in seen:
                return
            if field in measure_names or field in model.metrics:
                seen.add(field)
                out.append(field)

        for entry in query.having:
            _visit(entry)
        return out

    def _get_measure_source_objects(self, ctx: _ResolutionContext, name: str) -> set[str]:
        """Extract all source data objects for a measure or metric."""
        result: set[str] = set()

        measure = ctx.model.effective_measures.get(name)
        if measure:
            for cref in measure.columns:
                if cref.view:
                    result.add(cref.view)
            if measure.expression:
                col_refs = re.findall(r"\{\[([^\]]+)\]\.\[([^\]]+)\]\}", measure.expression)
                for obj_name, _col_name in col_refs:
                    result.add(obj_name)
            for fi in measure.filters:
                collect_measure_filter_objects(fi, result)
            return result

        metric = ctx.model.metrics.get(name)
        if metric:
            if metric.type == MetricType.CUMULATIVE and metric.measure:
                # Cumulative metric: source objects come from the referenced measure
                result.update(self._get_measure_source_objects(ctx, metric.measure))
            elif metric.type == MetricType.WINDOW and metric.measure:
                # Window metric: source objects come from the referenced measure
                result.update(self._get_measure_source_objects(ctx, metric.measure))
            elif metric.expression:
                # Derived or PoP metric: parse expression for measure references
                measure_refs = re.findall(r"\{\[([^\]]+)\]\}", metric.expression)
                for ref_name in measure_refs:
                    result.update(self._get_measure_source_objects(ctx, ref_name))

        return result

    # -- base object selection -----------------------------------------------

    def _select_base_object(self, ctx: _ResolutionContext) -> str:
        """Select the base (fact) object — prefer measure source objects with most joins."""
        if ctx.result.measure_source_objects:
            best = ""
            best_joins = -1
            for obj_name in sorted(ctx.result.measure_source_objects):
                obj = ctx.model.data_objects.get(obj_name)
                n = len(obj.joins) if obj else 0
                if n > best_joins:
                    best = obj_name
                    best_joins = n
            if best:
                return best

        # Dimension-only: use JoinGraph to find the deepest ancestor
        # (possibly an intermediate fact/bridge table) that can reach
        # all required dimension objects via directed join paths.
        if len(ctx.result.required_objects) > 1:
            graph = JoinGraph(ctx.model, use_path_names=ctx.result.use_path_names or None)
            root = graph.find_common_root(ctx.result.required_objects)
            if root:
                return root

        for obj_name in sorted(ctx.result.required_objects):
            obj = ctx.model.data_objects.get(obj_name)
            if obj and obj.joins:
                return obj_name

        if ctx.result.required_objects:
            return next(iter(sorted(ctx.result.required_objects)))
        if ctx.model.data_objects:
            return next(iter(ctx.model.data_objects))
        return ""

    # -- usePathNames validation ---------------------------------------------

    def _validate_use_path_names(
        self, ctx: _ResolutionContext, use_path_names: list[UsePathName]
    ) -> None:
        """Validate usePathNames references."""
        for upn in use_path_names:
            if upn.source not in ctx.model.data_objects:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=f"usePathNames references unknown data object '{upn.source}'",
                        path="usePathNames",
                    )
                )
                continue
            if upn.target not in ctx.model.data_objects:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=f"usePathNames references unknown data object '{upn.target}'",
                        path="usePathNames",
                    )
                )
                continue
            source_obj = ctx.model.data_objects[upn.source]
            found = any(
                j.join_to == upn.target and j.secondary and j.path_name == upn.path_name
                for j in source_obj.joins
            )
            if not found:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_PATH_NAME",
                        message=(
                            f"No secondary join with pathName '{upn.path_name}' "
                            f"from '{upn.source}' to '{upn.target}'"
                        ),
                        path="usePathNames",
                    )
                )

    # -- static model filters ------------------------------------------------

    def _resolve_static_filter(
        self, ctx: _ResolutionContext, mf: ModelFilter
    ) -> ResolvedFilter | None:
        """Resolve a static model filter to a physical WHERE expression.

        Thin wrapper: forwards to ``filter_resolution.resolve_static_filter``
        with this resolver as the first argument.

        Silently skips filters on data objects that are unreachable from the
        query's join graph — they are simply irrelevant to the current query.
        """
        return filter_resolution.resolve_static_filter(self, ctx, mf)

    # -- filters -------------------------------------------------------------

    def _resolve_filter_object(
        self,
        ctx: _ResolutionContext,
        obj_name: str,
        filter_path: str,
        _field_label: str,
    ) -> bool:
        """Ensure *obj_name* is joined; auto-extend if reachable.

        Thin wrapper: forwards all arguments, with this resolver first, to
        ``filter_resolution.resolve_filter_object`` and returns its boolean
        result unchanged.

        Silently skips filters on unreachable data objects — they are
        irrelevant to the current query.
        """
        return filter_resolution.resolve_filter_object(
            self, ctx, obj_name, filter_path, _field_label
        )

    def _resolve_filter_item(
        self, ctx: _ResolutionContext, item: QueryFilterItem, *, is_having: bool
    ) -> ResolvedFilter | None:
        """Resolve a filter item (leaf or group) to a physical expression.

        Thin wrapper: forwards to ``filter_resolution.resolve_filter_item`` with
        this resolver as the first argument; ``is_having`` is passed through as
        a keyword.
        """
        return filter_resolution.resolve_filter_item(self, ctx, item, is_having=is_having)

    def _resolve_filter_group(
        self, ctx: _ResolutionContext, group: QueryFilterGroup, *, is_having: bool
    ) -> ResolvedFilter | None:
        """Resolve a filter group recursively, combining with AND/OR.

        Thin wrapper: forwards to ``filter_resolution.resolve_filter_group``
        with this resolver as the first argument; ``is_having`` is passed
        through as a keyword.
        """
        return filter_resolution.resolve_filter_group(self, ctx, group, is_having=is_having)

    def _resolve_filter(
        self, ctx: _ResolutionContext, qf: QueryFilter, *, is_having: bool
    ) -> ResolvedFilter | None:
        """Resolve a query filter to a physical expression.

        Thin wrapper: forwards to ``filter_resolution.resolve_filter`` with this
        resolver as the first argument; ``is_having`` is passed through as a
        keyword.

        Filter fields can reference:
        1. A dimension name (e.g. ``"Order Priority"``)
        2. A qualified column ``"DataObject.Column"`` (e.g. ``"Orders.Order Priority"``)
        3. For HAVING filters, a measure name (e.g. ``"Revenue"``)

        If the referenced data object is reachable but not yet joined, the
        join path is auto-extended.
        """
        return filter_resolution.resolve_filter(self, ctx, qf, is_having=is_having)

    # -- order by ------------------------------------------------------------

    def _resolve_order_by_field(
        self, ctx: _ResolutionContext, field_name: str, select_count: int
    ) -> Expr | None:
        """Resolve an order-by field to its expression.

        Thin wrapper: forwards to ``filter_resolution.resolve_order_by_field``
        with this resolver as the first argument.  The caller in ``resolve``
        passes the number of resolved dimensions plus measures as
        ``select_count``.
        """
        return filter_resolution.resolve_order_by_field(self, ctx, field_name, select_count)

`resolve(query, model, qualify_table=None)` ¶

Source code in src/orionbelt/compiler/resolution.py

def resolve(
    self,
    query: QueryObject,
    model: SemanticModel,
    qualify_table: Callable[[DataObject], str] | None = None,
) -> ResolvedQuery:
    """Resolve *query* against *model* into a ``ResolvedQuery``.

    Runs the steps below in order, accumulating problems in ``ctx.errors``
    rather than failing fast:

    1. Resolve raw fields (raw mode) or dimensions and measures (aggregate
       mode), including measures referenced only by HAVING.
    2. Select the base object and flag multi-fact queries (``requires_cfl``).
    3. Validate ``dimensionsExclude`` and ``usePathNames``.
    4. Compute join steps and report required objects that stay unreachable.
    5. Resolve static model filters, WHERE / HAVING filters and ORDER BY.
    6. Apply ROLLUP / CUBE null ordering and the automatic ORDER BY.

    Args:
        query: The query to resolve.
        model: The semantic model the query refers to.
        qualify_table: Optional callable stored on the resolution context
            (not called directly in this function).

    Returns:
        The populated ``ResolvedQuery``.

    Raises:
        ResolutionError: If any error was collected during resolution.
    """
    ctx = _ResolutionContext(
        model=model,
        result=ResolvedQuery(
            limit=query.limit,
            offset=query.offset,
            use_path_names=list(query.use_path_names),
            is_raw=query.select.is_raw,
            distinct=query.select.distinct,
            grouping=query.grouping,
        ),
        qualify_table=qualify_table,
    )

    # Build global column lookup: col_name → (object_name, source_column)
    # NOTE(review): entries are keyed by column name only, so a name shared by
    # several data objects is overwritten by the object iterated last —
    # confirm that this is the intended precedence.
    for obj_name, obj in model.data_objects.items():
        for col_name, col_obj in obj.columns.items():
            ctx.global_columns[col_name] = (obj_name, col_obj.code)

    if query.select.is_raw:
        # Raw mode: project physical columns, no aggregation.
        for ref in query.select.fields:
            self._resolve_raw_field(ctx, ref)
    else:
        # Aggregate mode (default).
        # 1. Resolve dimensions (string or coalesce group).
        # Coalesce groups expand into their constituent dimensions, each
        # tagged with the same coalesce_alias so the CFL outer wrapper can
        # emit COALESCE(d1, d2, ...) AS <alias>.
        for dim_entry in query.select.dimensions:
            if isinstance(dim_entry, CoalesceDimension):
                self._resolve_coalesce_dimension(ctx, dim_entry, ctx.result.coalesce_aliases)
            else:
                self._append_resolved_dimension(ctx, dim_entry)

        # 2. Resolve measures and track their source objects
        for measure_name in query.select.measures:
            resolved_meas = self._resolve_measure(ctx, measure_name)
            if resolved_meas:
                ctx.result.measures.append(resolved_meas)
                source_objs = self._get_measure_source_objects(ctx, measure_name)
                ctx.result.measure_source_objects.update(source_objs)
                ctx.result.required_objects.update(source_objs)

        # 2.5. Auto-include measures referenced by HAVING but not by SELECT.
        # Without this, codegen emits a HAVING clause that references an
        # alias for a column the SELECT doesn't project — every database
        # rejects the SQL with a "must appear in GROUP BY" binder error.
        # Routing this through the regular measure-resolution path also
        # updates ``measure_source_objects`` so the multi-fact CFL trigger
        # below sees the HAVING-only measure's source.
        existing_measure_names = {m.name for m in ctx.result.measures}
        for ref in self._collect_having_measure_refs(query, model):
            if ref in existing_measure_names:
                continue
            resolved_meas = self._resolve_measure(ctx, ref)
            if resolved_meas is None:
                continue
            ctx.result.measures.append(resolved_meas)
            ctx.result.having_only_measures.add(ref)
            existing_measure_names.add(ref)
            source_objs = self._get_measure_source_objects(ctx, ref)
            ctx.result.measure_source_objects.update(source_objs)
            ctx.result.required_objects.update(source_objs)

    # 3. Determine base object (the one with most joins / most measures)
    ctx.result.base_object = self._select_base_object(ctx)
    if ctx.result.base_object:
        ctx.result.required_objects.add(ctx.result.base_object)

    # Detect multi-fact: CFL is needed only when measure source objects
    # span multiple independent fact tables.
    if len(ctx.result.measure_source_objects) > 1:
        graph = JoinGraph(model, use_path_names=query.use_path_names or None)
        reachable = graph.descendants(ctx.result.base_object)
        unreachable = ctx.result.measure_source_objects - reachable - {ctx.result.base_object}
        if unreachable:
            ctx.result.requires_cfl = True

    # Dimension-only queries: when dimensions span independent branches,
    # join through intermediate bridge/fact tables (no CFL needed).
    # Add intermediate tables from the join steps to required_objects
    # so the star schema planner includes them.
    if not ctx.result.measure_source_objects and ctx.result.dimensions:
        dim_objects = {d.object_name for d in ctx.result.dimensions}
        if not dim_objects <= {ctx.result.base_object}:
            graph = JoinGraph(model, use_path_names=query.use_path_names or None)
            steps = graph.find_join_path(
                {ctx.result.base_object},
                dim_objects,
                via_constraints=ctx.result.via_constraints or None,
            )
            for step in steps:
                ctx.result.required_objects.add(step.from_object)
                ctx.result.required_objects.add(step.to_object)

    # Raw mode: detect multi-fact (fields span objects unreachable from
    # the base via directed joins). The pipeline rejects this case for
    # now — raw CFL is a planned follow-up.
    if ctx.result.is_raw and ctx.result.base_object:
        field_objects = {f.object_name for f in ctx.result.fields}
        if len(field_objects) > 1:
            graph = JoinGraph(model, use_path_names=query.use_path_names or None)
            reachable = graph.descendants(ctx.result.base_object)
            unreachable = field_objects - reachable - {ctx.result.base_object}
            if unreachable:
                ctx.result.requires_cfl = True

    # Validate dimensionsExclude constraints
    if query.dimensions_exclude:
        if query.select.measures:
            ctx.errors.append(
                SemanticError(
                    code="DIMENSIONS_EXCLUDE_WITH_MEASURES",
                    message="dimensionsExclude cannot be combined with measures",
                    path="select",
                )
            )
        elif len(ctx.result.dimensions) < 2:
            ctx.errors.append(
                SemanticError(
                    code="DIMENSIONS_EXCLUDE_INSUFFICIENT",
                    message="dimensionsExclude requires at least 2 dimensions",
                    path="select.dimensions",
                )
            )
        else:
            ctx.result.dimensions_exclude = True

    # 4. Validate usePathNames before building join graph
    # NOTE(review): JoinGraph instances are already built above (multi-fact
    # and dimension-only detection) with the same, not-yet-validated
    # ``use_path_names``; this validation only precedes the graph stored on
    # ``ctx.graph``.
    self._validate_use_path_names(ctx, query.use_path_names)

    # 5. Resolve join paths
    ctx.graph = JoinGraph(model, use_path_names=query.use_path_names or None)
    if ctx.result.base_object and len(ctx.result.required_objects) > 1:
        ctx.result.join_steps = ctx.graph.find_join_path(
            {ctx.result.base_object},
            ctx.result.required_objects,
            via_constraints=ctx.result.via_constraints or None,
        )

    # Build set of all objects present in the query's join graph
    if ctx.result.base_object:
        ctx.joined_objects.add(ctx.result.base_object)
    for step in ctx.result.join_steps:
        ctx.joined_objects.add(step.to_object)

    # Detect required objects that the star-schema planner cannot reach.
    # Many-to-one joins are forward-only (reverse traversal would inflate
    # the base table), so a required object that's only reachable via a
    # reverse m-to-1 hop is unreachable.  Raise a clear error rather than
    # silently producing wrong SQL.  CFL legs are validated separately.
    if ctx.result.base_object and not ctx.result.requires_cfl:
        unreachable = ctx.result.required_objects - ctx.joined_objects
        for unreachable_name in sorted(unreachable):
            ctx.errors.append(
                SemanticError(
                    code="UNREACHABLE_REQUIRED_OBJECT",
                    message=(
                        f"Data object '{unreachable_name}' is required by the query but "
                        f"cannot be reached from base '{ctx.result.base_object}' via "
                        f"directed joins. Many-to-one joins are forward-only; reverse "
                        f"traversal would inflate row counts. Add an explicit join from "
                        f"'{ctx.result.base_object}' (or an intermediate object) to "
                        f"'{unreachable_name}', or split the query so each fact is "
                        f"queried independently."
                    ),
                    path="select",
                )
            )

    # 5b. Inject static model filters — always applied as WHERE conditions
    static_exprs: list[Expr] = []
    for mf in model.filters:
        static_filter = self._resolve_static_filter(ctx, mf)
        if static_filter:
            ctx.result.where_filters.append(static_filter)
            static_exprs.append(static_filter.expression)

    # 6. Classify filters — skip query-time duplicates of static filters
    for qfi in query.where:
        resolved_filter = self._resolve_filter_item(ctx, qfi, is_having=False)
        if resolved_filter and resolved_filter.expression not in static_exprs:
            ctx.result.where_filters.append(resolved_filter)

    for qfi in query.having:
        resolved_filter = self._resolve_filter_item(ctx, qfi, is_having=True)
        if resolved_filter:
            ctx.result.having_filters.append(resolved_filter)

    # 7. Resolve order by — must reference a dimension or measure in SELECT
    # NOTE(review): ``ctx.result.measures`` also contains the HAVING-only
    # measures appended in step 2.5, so they are counted here — confirm
    # that ``select_count`` is meant to include them.
    select_count = len(ctx.result.dimensions) + len(ctx.result.measures)
    for ob in query.order_by:
        expr = self._resolve_order_by_field(ctx, ob.field, select_count)
        if expr:
            ctx.result.order_by_exprs.append((expr, ob.direction == "desc", ob.nulls))

    # 8. ROLLUP / CUBE: backfill NULLS FIRST on any explicit ORDER BY entry
    # that didn't specify a NULLs position. Subtotal and grand-total rows
    # carry NULLs in the rolled-up group-by columns, and BI tools expect
    # those totals at the top of the result — not interleaved with details.
    if ctx.result.grouping is not None and ctx.result.order_by_exprs:
        ctx.result.order_by_exprs = [
            (expr, desc, NullsPosition.FIRST if nulls is None else nulls)
            for expr, desc, nulls in ctx.result.order_by_exprs
        ]

    # 9. Auto-order — when no explicit ORDER BY, append ORDER BY over all
    # SELECT dimensions (or raw fields) under two conditions:
    #   (a) LIMIT is set: cache hashes on compiled SQL; without ORDER BY
    #       ``LIMIT N`` returns any N rows, freezing one arbitrary slice.
    #   (b) ROLLUP / CUBE: subtotal layout is otherwise unpredictable.
    # ROLLUP / CUBE defaults to NULLS FIRST (totals at the top).
    # Aggregate-only queries (no dims, no fields) are already single-row
    # deterministic — skip.
    needs_auto_order = not ctx.result.order_by_exprs and (
        ctx.result.limit is not None or ctx.result.grouping is not None
    )
    if needs_auto_order:
        nulls_default = NullsPosition.FIRST if ctx.result.grouping is not None else None
        if ctx.result.is_raw and ctx.result.fields:
            for f in ctx.result.fields:
                ctx.result.order_by_exprs.append(
                    (ColumnRef(name=f.alias), False, nulls_default)
                )
        elif ctx.result.dimensions:
            for dim in ctx.result.dimensions:
                ctx.result.order_by_exprs.append(
                    (ColumnRef(name=dim.name), False, nulls_default)
                )

    # All problems collected above are reported together.
    if ctx.errors:
        raise ResolutionError(ctx.errors)

    return ctx.result

Star Schema Planner¶

`orionbelt.compiler.star.StarSchemaPlanner` ¶

Plans star-schema queries: single fact base with dimension joins.

Source code in src/orionbelt/compiler/star.py

class StarSchemaPlanner:
    """Plans star-schema queries: single fact base with dimension joins."""

    def plan(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        qualify_table: Callable[[DataObject], str] | None = None,
        dialect: Dialect | None = None,
    ) -> QueryPlan:
        """Assemble the query plan for a single-fact (star schema) query.

        Args:
            resolved: The resolved query (dimensions, measures, join steps,
                filters, ordering, limit/offset, optional grouping modifier).
            model: The semantic model supplying data objects, measures,
                metrics and settings.
            qualify_table: Optional callback that renders the table reference
                for a data object; ``obj.qualified_code`` is used when omitted.
            dialect: Optional SQL dialect. Without one, time-grain rendering
                and data-type casts are skipped.

        Returns:
            A ``QueryPlan`` wrapping the built AST. When the base object is
            not present in ``model.data_objects`` the AST comes from an
            otherwise untouched builder.
        """
        builder = QueryBuilder()
        graph = JoinGraph(model, use_path_names=resolved.use_path_names or None)

        def table_ref(obj: DataObject) -> str:
            if qualify_table:
                return qualify_table(obj)
            return obj.qualified_code

        fact = model.data_objects.get(resolved.base_object)
        if not fact:
            return QueryPlan(ast=builder.build())

        fact_alias = resolved.base_object

        # SELECT: one aliased column per dimension, truncated to its time
        # grain when both a grain and a dialect are available.
        for dim in resolved.dimensions:
            select_col: Expr = make_column_expr(model, dim.object_name, dim.column_name)
            if dim.grain and dialect:
                select_col = dialect.render_time_grain(select_col, dim.grain)
            builder.select(AliasedExpr(expr=select_col, alias=dim.name))

        # Dimensions that get a GROUPING() flag column (rollup/cube only).
        flagged_aliases: list[str] = (
            [dim.name for dim in resolved.dimensions] if resolved.grouping is not None else []
        )

        # SELECT: measures and metrics. Metrics (measures with components)
        # have their component references substituted first; both kinds are
        # CAST to their resolved data type when a dialect is given.
        settings = model.settings
        measure_exprs: dict[str, Expr] = {}
        for measure in resolved.measures:
            projected: Expr
            if measure.component_measures:
                projected = _substitute_measure_refs(
                    measure.expression, resolved.metric_components
                )
                metric = model.metrics.get(measure.name)
                target_type = (
                    resolve_metric_data_type(metric, settings) if metric and dialect else None
                )
            else:
                projected = measure.expression
                declared = model.effective_measures.get(measure.name)
                target_type = (
                    resolve_measure_data_type(declared, settings)
                    if declared and dialect
                    else None
                )
            if dialect and target_type:
                projected = dialect.cast_to_obml_type(projected, target_type)
            builder.select(AliasedExpr(expr=projected, alias=measure.name))
            measure_exprs[measure.name] = projected

        # FROM: the base fact table, aliased by its object name.
        builder.from_(table_ref(fact), alias=fact_alias)

        # JOINs: for each step attach whichever endpoint is not joined yet
        # (the target side is preferred); steps with both ends present and
        # endpoints unknown to the model are skipped.
        attached = {fact_alias}
        for step in resolved.join_steps:
            if step.to_object in attached and step.from_object in attached:
                continue
            incoming = step.to_object if step.to_object not in attached else step.from_object
            incoming_obj = model.data_objects.get(incoming)
            if not incoming_obj:
                continue
            condition = graph.build_join_condition(step)
            builder.join(
                table=table_ref(incoming_obj),
                on=condition,
                join_type=step.join_type,
                alias=incoming,
            )
            attached.add(incoming)

        # WHERE
        for where_filter in resolved.where_filters:
            builder.where(where_filter.expression)

        # GROUP BY every dimension column (time-grained where applicable).
        # The key expressions are remembered per alias because GROUPING()
        # must receive the group-key expression itself — Postgres rejects
        # GROUPING(<alias>) with "column does not exist".
        group_keys: dict[str, Expr] = {}
        for dim in resolved.dimensions:
            group_key: Expr = make_column_expr(model, dim.object_name, dim.column_name)
            if dim.grain and dialect:
                group_key = dialect.render_time_grain(group_key, dim.grain)
            builder.group_by(group_key)
            group_keys[dim.name] = group_key

        # Grouping modifier (rollup/cube) plus one GROUPING() flag per dimension.
        if resolved.grouping is not None and flagged_aliases:
            builder.grouping(resolved.grouping.value)
            for alias in flagged_aliases:
                grouping_arg = group_keys.get(alias) or ColumnRef(name=alias)
                builder.select(
                    AliasedExpr(
                        expr=FunctionCall(name="GROUPING", args=[grouping_arg]),
                        alias=_grouping_flag_alias(alias),
                    )
                )

        # HAVING: alias references are expanded to the projected (possibly
        # CAST) aggregate expressions recorded above.
        for having_filter in resolved.having_filters:
            builder.having(_expand_measure_refs(having_filter.expression, measure_exprs))

        # ORDER BY: a reference to a time-grained dimension's source column
        # is redirected to that dimension's SELECT alias.
        alias_by_grained_col: dict[tuple[str, str | None], str] = {
            (d.source_column, d.object_name): d.name for d in resolved.dimensions if d.grain
        }
        for order_expr, descending, nulls in resolved.order_by_exprs:
            if (
                isinstance(order_expr, ColumnRef)
                and (order_expr.name, order_expr.table) in alias_by_grained_col
            ):
                order_expr = ColumnRef(
                    name=alias_by_grained_col[(order_expr.name, order_expr.table)]
                )
            builder.order_by(order_expr, desc=descending, nulls_last=_nulls_last(nulls))

        # LIMIT / OFFSET
        if resolved.limit is not None:
            builder.limit(resolved.limit)
        if resolved.offset is not None:
            builder.offset(resolved.offset)

        return QueryPlan(ast=builder.build())

`plan(resolved, model, qualify_table=None, dialect=None)` ¶

Source code in src/orionbelt/compiler/star.py

def plan(
    self,
    resolved: ResolvedQuery,
    model: SemanticModel,
    qualify_table: Callable[[DataObject], str] | None = None,
    dialect: Dialect | None = None,
) -> QueryPlan:
    """Assemble the query plan for a single-fact (star schema) query.

    Args:
        resolved: The resolved query (dimensions, measures, join steps,
            filters, ordering, limit/offset, optional grouping modifier).
        model: The semantic model supplying data objects, measures,
            metrics and settings.
        qualify_table: Optional callback that renders the table reference
            for a data object; ``obj.qualified_code`` is used when omitted.
        dialect: Optional SQL dialect. Without one, time-grain rendering
            and data-type casts are skipped.

    Returns:
        A ``QueryPlan`` wrapping the built AST. When the base object is
        not present in ``model.data_objects`` the AST comes from an
        otherwise untouched builder.
    """
    builder = QueryBuilder()
    graph = JoinGraph(model, use_path_names=resolved.use_path_names or None)

    def table_ref(obj: DataObject) -> str:
        if qualify_table:
            return qualify_table(obj)
        return obj.qualified_code

    fact = model.data_objects.get(resolved.base_object)
    if not fact:
        return QueryPlan(ast=builder.build())

    fact_alias = resolved.base_object

    # SELECT: one aliased column per dimension, truncated to its time
    # grain when both a grain and a dialect are available.
    for dim in resolved.dimensions:
        select_col: Expr = make_column_expr(model, dim.object_name, dim.column_name)
        if dim.grain and dialect:
            select_col = dialect.render_time_grain(select_col, dim.grain)
        builder.select(AliasedExpr(expr=select_col, alias=dim.name))

    # Dimensions that get a GROUPING() flag column (rollup/cube only).
    flagged_aliases: list[str] = (
        [dim.name for dim in resolved.dimensions] if resolved.grouping is not None else []
    )

    # SELECT: measures and metrics. Metrics (measures with components)
    # have their component references substituted first; both kinds are
    # CAST to their resolved data type when a dialect is given.
    settings = model.settings
    measure_exprs: dict[str, Expr] = {}
    for measure in resolved.measures:
        projected: Expr
        if measure.component_measures:
            projected = _substitute_measure_refs(
                measure.expression, resolved.metric_components
            )
            metric = model.metrics.get(measure.name)
            target_type = (
                resolve_metric_data_type(metric, settings) if metric and dialect else None
            )
        else:
            projected = measure.expression
            declared = model.effective_measures.get(measure.name)
            target_type = (
                resolve_measure_data_type(declared, settings)
                if declared and dialect
                else None
            )
        if dialect and target_type:
            projected = dialect.cast_to_obml_type(projected, target_type)
        builder.select(AliasedExpr(expr=projected, alias=measure.name))
        measure_exprs[measure.name] = projected

    # FROM: the base fact table, aliased by its object name.
    builder.from_(table_ref(fact), alias=fact_alias)

    # JOINs: for each step attach whichever endpoint is not joined yet
    # (the target side is preferred); steps with both ends present and
    # endpoints unknown to the model are skipped.
    attached = {fact_alias}
    for step in resolved.join_steps:
        if step.to_object in attached and step.from_object in attached:
            continue
        incoming = step.to_object if step.to_object not in attached else step.from_object
        incoming_obj = model.data_objects.get(incoming)
        if not incoming_obj:
            continue
        condition = graph.build_join_condition(step)
        builder.join(
            table=table_ref(incoming_obj),
            on=condition,
            join_type=step.join_type,
            alias=incoming,
        )
        attached.add(incoming)

    # WHERE
    for where_filter in resolved.where_filters:
        builder.where(where_filter.expression)

    # GROUP BY every dimension column (time-grained where applicable).
    # The key expressions are remembered per alias because GROUPING()
    # must receive the group-key expression itself — Postgres rejects
    # GROUPING(<alias>) with "column does not exist".
    group_keys: dict[str, Expr] = {}
    for dim in resolved.dimensions:
        group_key: Expr = make_column_expr(model, dim.object_name, dim.column_name)
        if dim.grain and dialect:
            group_key = dialect.render_time_grain(group_key, dim.grain)
        builder.group_by(group_key)
        group_keys[dim.name] = group_key

    # Grouping modifier (rollup/cube) plus one GROUPING() flag per dimension.
    if resolved.grouping is not None and flagged_aliases:
        builder.grouping(resolved.grouping.value)
        for alias in flagged_aliases:
            grouping_arg = group_keys.get(alias) or ColumnRef(name=alias)
            builder.select(
                AliasedExpr(
                    expr=FunctionCall(name="GROUPING", args=[grouping_arg]),
                    alias=_grouping_flag_alias(alias),
                )
            )

    # HAVING: alias references are expanded to the projected (possibly
    # CAST) aggregate expressions recorded above.
    for having_filter in resolved.having_filters:
        builder.having(_expand_measure_refs(having_filter.expression, measure_exprs))

    # ORDER BY: a reference to a time-grained dimension's source column
    # is redirected to that dimension's SELECT alias.
    alias_by_grained_col: dict[tuple[str, str | None], str] = {
        (d.source_column, d.object_name): d.name for d in resolved.dimensions if d.grain
    }
    for order_expr, descending, nulls in resolved.order_by_exprs:
        if (
            isinstance(order_expr, ColumnRef)
            and (order_expr.name, order_expr.table) in alias_by_grained_col
        ):
            order_expr = ColumnRef(
                name=alias_by_grained_col[(order_expr.name, order_expr.table)]
            )
        builder.order_by(order_expr, desc=descending, nulls_last=_nulls_last(nulls))

    # LIMIT / OFFSET
    if resolved.limit is not None:
        builder.limit(resolved.limit)
    if resolved.offset is not None:
        builder.offset(resolved.offset)

    return QueryPlan(ast=builder.build())

CFL Planner¶

`orionbelt.compiler.cfl.CFLPlanner` ¶

Plans Composite Fact Layer queries: conformed dimensions + fact stitching.

Uses a UNION ALL strategy:

1. Each fact leg SELECTs the conformed dimensions plus its own measures (NULL for the other legs' measures).
2. UNION ALL combines the legs into a single CTE.
3. The outer query aggregates over the union, grouping by the conformed dimensions.

Source code in src/orionbelt/compiler/cfl.py

class CFLPlanner:
    """Plans Composite Fact Layer queries: conformed dimensions + fact stitching.

    Uses a UNION ALL strategy:
    1. Each fact leg SELECTs conformed dimensions + its own measures (NULL for others)
    2. UNION ALL combines the legs into a single CTE
    3. Outer query aggregates over the union, grouping by conformed dimensions
    """

    def plan(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        qualify_table: Callable[[DataObject], str] | None = None,
        union_by_name: bool = False,
        dialect: Dialect | None = None,
    ) -> QueryPlan:
        """Plan a CFL query.

        After fanout validation the request is routed, in this order, to:
        the ``dimensionsExclude`` planner, ``StarSchemaPlanner`` (at most one
        leg and no cross-fact measures), or the UNION ALL multi-fact planner.

        Raises:
            FanoutError: Propagated from ``_validate_fanout``.
            UnsupportedAggregationForCFLError: A multi-fact query contains a
                measure whose aggregation is in ``TWO_COLUMN_AGGREGATIONS``.
        """
        self._validate_fanout(resolved, model)

        if resolved.dimensions_exclude:
            # dimensionsExclude: EXCEPT-based anti-join pattern.
            return self._plan_dimensions_exclude(resolved, model, qualify_table)

        # One entry per source object, plus the measures spanning facts.
        legs, cross_fact = self._group_measures_by_object(resolved, model)

        # Dimension-only CFL: no measures at all, but the dimensions sit on
        # independent branches, so legs are derived from the connecting
        # fact tables instead.
        nothing_to_measure = not (legs or cross_fact)
        if nothing_to_measure and resolved.requires_cfl:
            legs = self._group_dimensions_into_legs(resolved, model)

        if not cross_fact and len(legs) <= 1:
            # Single fact — the star-schema planner handles it.
            from orionbelt.compiler.star import StarSchemaPlanner

            star_planner = StarSchemaPlanner()
            return star_planner.plan(
                resolved, model, qualify_table=qualify_table, dialect=dialect
            )

        # Two-column statistical aggregates (CORR/COVAR_*/REGR_*) need
        # paired-row semantics that the UNION ALL + concat-count multi-fact
        # path cannot express. Without this guard the planner emits
        # ``CORR(CAST(f0 AS VARCHAR) || '|' || CAST(f1 AS VARCHAR))`` — one
        # argument, wrong type. Failing fast gives the caller a clear error
        # instead of an opaque execution-time one.
        for measure in resolved.measures:
            agg_name = (measure.aggregation or "").lower()
            if agg_name in TWO_COLUMN_AGGREGATIONS:
                raise UnsupportedAggregationForCFLError(measure.name, agg_name)

        # Multi-fact: UNION ALL strategy.
        return self._plan_union_all(
            resolved,
            model,
            legs,
            cross_fact,
            qualify_table=qualify_table,
            union_by_name=union_by_name,
            dialect=dialect,
        )

    def _validate_fanout(self, resolved: ResolvedQuery, model: SemanticModel) -> None:
        """Validate that grain is compatible and no fanout will occur."""
        errors: list[str] = []

        for dim in resolved.dimensions:
            if dim.object_name not in model.data_objects:
                errors.append(
                    f"Dimension '{dim.name}' references unknown data object '{dim.object_name}'"
                )

        if errors:
            raise FanoutError("; ".join(errors))

    def _group_measures_by_object(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
    ) -> tuple[dict[str, list[ResolvedMeasure]], list[ResolvedMeasure]]:
        """Group measures by their primary source object.

        Forwards to ``cfl_projection.group_measures_by_object`` with this
        planner as the first argument. ``plan`` unpacks the result as
        ``(measures_by_object, cross_fact)``.
        """
        grouped = cfl_projection.group_measures_by_object(self, resolved, model)
        return grouped

    @staticmethod
    def _group_dimensions_into_legs(
        resolved: ResolvedQuery,
        model: SemanticModel,
    ) -> dict[str, list[ResolvedMeasure]]:
        """Group dimensions into CFL legs for dimension-only queries.

        Forwards to ``cfl_projection.group_dimensions_into_legs``.
        """
        legs = cfl_projection.group_dimensions_into_legs(resolved, model)
        return legs

    @staticmethod
    def _is_multi_field(measure: ResolvedMeasure) -> bool:
        """Tell whether a measure has multiple field args (e.g. COUNT(a, b)).

        Forwards to ``cfl_projection.is_multi_field``.
        """
        multi_field = cfl_projection.is_multi_field(measure)
        return multi_field

    @staticmethod
    def _resolve_null_type_for_field(
        measure: ResolvedMeasure,
        field_idx: int,
        model: SemanticModel,
        dialect: Dialect | None = None,
    ) -> str | None:
        """Resolve the SQL type for NULL padding in CFL UNION ALL legs.

        Forwards all four arguments, in order, to
        ``cfl_projection.resolve_null_type_for_field``.
        """
        type_name = cfl_projection.resolve_null_type_for_field(
            measure, field_idx, model, dialect
        )
        return type_name

    @staticmethod
    def _multi_field_cte_alias(measure_name: str, idx: int) -> str:
        """CTE column name for the *idx*-th field of a multi-field measure.

        Forwards to ``cfl_projection.multi_field_cte_alias``.
        """
        column_alias = cfl_projection.multi_field_cte_alias(measure_name, idx)
        return column_alias

    @staticmethod
    def _unwrap_aggregation(measure: ResolvedMeasure) -> Expr:
        """Extract the inner expression from an aggregated measure.

        Forwards to ``cfl_projection.unwrap_aggregation``.
        """
        inner_expr = cfl_projection.unwrap_aggregation(measure)
        return inner_expr

    def _build_outer_metric_expr(
        self,
        metric: ResolvedMeasure,
        resolved: ResolvedQuery,
        cte_name: str,
    ) -> Expr:
        """Build the outer query expression for a metric.

        Forwards to ``cfl_projection.build_outer_metric_expr`` with this
        planner as the first argument.
        """
        outer_expr = cfl_projection.build_outer_metric_expr(self, metric, resolved, cte_name)
        return outer_expr

    def _substitute_outer_refs(self, expr: Expr, resolved: ResolvedQuery, cte_name: str) -> Expr:
        """Recursively substitute measure refs with outer aggregations.

        Forwards to ``cfl_projection.substitute_outer_refs`` with this
        planner as the first argument.
        """
        substituted = cfl_projection.substitute_outer_refs(self, expr, resolved, cte_name)
        return substituted

    @staticmethod
    def _collect_table_refs(expr: Expr, tables: set[str]) -> None:
        """Recursively collect table names from ColumnRef nodes.

        Forwards to ``cfl_projection.collect_table_refs``; nothing is
        returned, callers read the result from *tables*.
        """
        collect = cfl_projection.collect_table_refs
        collect(expr, tables)

    @staticmethod
    def _remap_cfl_order_by(expr: Expr, resolved: ResolvedQuery, model: SemanticModel) -> Expr:
        """Remap ORDER BY expressions to use CTE aliases for the outer query.

        Forwards to ``cfl_projection.remap_cfl_order_by``.
        """
        remapped = cfl_projection.remap_cfl_order_by(expr, resolved, model)
        return remapped

    def _build_outer_concat_count(
        self,
        measure_name: str,
        n_fields: int,
        agg: str,
        distinct: bool,
        cte_name: str,
    ) -> Expr:
        """Build ``COUNT(DISTINCT CAST(f0 AS VARCHAR) || '|' || ...)`` for the outer query.

        Forwards to ``cfl_projection.build_outer_concat_count`` with this
        planner as the first argument.
        """
        forwarded = (self, measure_name, n_fields, agg, distinct, cte_name)
        return cfl_projection.build_outer_concat_count(*forwarded)

    def _plan_union_all(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        measures_by_object: dict[str, list[ResolvedMeasure]],
        cross_fact: list[ResolvedMeasure] | None = None,
        qualify_table: Callable[[DataObject], str] | None = None,
        union_by_name: bool = False,
        dialect: Dialect | None = None,
    ) -> QueryPlan:
        """UNION ALL strategy: stack fact legs with NULL padding, aggregate outside.

        When *union_by_name* is True (DuckDB, Snowflake) each leg only emits
        the columns it actually has — the database fills missing columns with
        NULL automatically via ``UNION ALL BY NAME``.
        """
        graph = JoinGraph(model, use_path_names=resolved.use_path_names or None)

        def qualify(obj: DataObject) -> str:
            return qualify_table(obj) if qualify_table else obj.qualified_code

        # Collect all measures across all objects + cross-fact measures
        all_measures: list[ResolvedMeasure] = []
        for measures in measures_by_object.values():
            all_measures.extend(measures)
        if cross_fact:
            all_measures.extend(cross_fact)

        # Collect data objects referenced by WHERE filters — each leg
        # must join these tables so the filter predicates are valid.
        filter_objects: set[str] = set()
        for wf in resolved.where_filters:
            self._collect_table_refs(wf.expression, filter_objects)

        # Build one SELECT per fact object group.
        # Each leg computes its own LCA (least common ancestor) as the lead
        # table — the graph-central node that can reach all dimension objects
        # and the measure's source object with minimal hops.
        union_legs: list[Select] = []
        leg_infos: list[CflLegInfo] = []
        for obj_name, measures in measures_by_object.items():
            leg_builder = QueryBuilder()
            this_measure_names = {m.name for m in measures}

            # Compute reachability from this leg's fact object upfront
            reachable = graph.descendants(obj_name) | {obj_name}

            # Collect table references from this leg's own-measure
            # expressions. A measure like ``Electronics Sales`` is
            # defined as ``SUM(CASE WHEN Products.productcat = …
            # THEN Sales.salesamount END)`` — the CASE condition
            # references Products, which must be joined into this
            # leg's FROM. Without this, the generated SQL emits
            # ``"Products"."productcat"`` against a FROM clause that
            # only has Sales + Clients, and the database raises
            # "missing FROM-clause entry for table Products".
            measure_expr_objects: set[str] = set()
            for m in measures:
                self._collect_table_refs(m.expression, measure_expr_objects)
            if cross_fact:
                for m in cross_fact:
                    if m.name in this_measure_names:
                        self._collect_table_refs(m.expression, measure_expr_objects)

            # SELECT conformed dimensions — only emit real column refs for
            # dimensions reachable from this leg's fact AND whose `via:`
            # waypoint (if any) is also reachable from this leg's fact.
            # Role-playing dimensions tied to a different fact via `via:`
            # are NULL-padded so each leg only projects the values that
            # belong to its own fact.
            for dim in resolved.dimensions:
                via_ok = dim.via is None or dim.via in reachable
                if dim.object_name in reachable and via_ok:
                    col: Expr = make_column_expr(model, dim.object_name, dim.column_name)
                    if dim.grain and dialect:
                        col = dialect.render_time_grain(col, dim.grain)
                    leg_builder.select(AliasedExpr(expr=col, alias=dim.name))
                elif not union_by_name:
                    model_dim = model.dimensions.get(dim.name)
                    dim_type = model_dim.result_type.value if model_dim else None
                    col = Cast(Literal.null(), type_name=dim_type) if dim_type else Literal.null()
                    leg_builder.select(AliasedExpr(expr=col, alias=dim.name))

            # SELECT this fact's measures (raw expressions, no aggregation).
            # When union_by_name is True, skip NULL padding for other facts'
            # measures — the database fills them automatically.
            for m in all_measures:
                if self._is_multi_field(m):
                    assert isinstance(m.expression, FunctionCall)
                    for i, arg in enumerate(m.expression.args):
                        alias = self._multi_field_cte_alias(m.name, i)
                        arg_table = arg.table if isinstance(arg, ColumnRef) else None
                        if arg_table == obj_name:
                            leg_builder.select(AliasedExpr(expr=arg, alias=alias))
                        elif not union_by_name:
                            null_type = self._resolve_null_type_for_field(m, i, model)
                            null_expr: Expr = (
                                Cast(Literal.null(), type_name=null_type)
                                if null_type
                                else Literal.null()
                            )
                            leg_builder.select(AliasedExpr(expr=null_expr, alias=alias))
                elif m.name in this_measure_names:
                    # Cast the own-measure column to the same type used for
                    # NULL padding in sibling legs, so every leg's column
                    # agrees on a single type. Without this, strict-typed
                    # engines (ClickHouse with UNION ALL) produce a Variant
                    # type that SUM can't aggregate ("ILLEGAL_TYPE_OF_ARGUMENT
                    # Variant(Decimal, Float64)").
                    own_expr: Expr = self._unwrap_aggregation(m)
                    own_type_name = self._resolve_null_type_for_field(m, 0, model, dialect)
                    if own_type_name:
                        own_expr = Cast(expr=own_expr, type_name=own_type_name)
                    leg_builder.select(AliasedExpr(expr=own_expr, alias=m.name))
                elif not union_by_name:
                    model_measure = model.measures.get(m.name)
                    null_type_name = self._resolve_null_type_for_field(m, 0, model, dialect)
                    if null_type_name is None and model_measure:
                        null_type_name = model_measure.result_type.value
                    null_expr = (
                        Cast(Literal.null(), type_name=null_type_name)
                        if null_type_name
                        else Literal.null()
                    )
                    leg_builder.select(AliasedExpr(expr=null_expr, alias=m.name))

            # Determine the common root for this leg:
            # the deepest directed ancestor that can reach all dimension
            # objects, measure's source object, filter-referenced objects,
            # and any objects referenced by this leg's measure expressions.
            # Only include dimensions reachable from this leg's fact object.
            leg_required = {
                dim.object_name for dim in resolved.dimensions if dim.object_name in reachable
            }
            leg_required.add(obj_name)
            leg_required.update(filter_objects)
            # Include objects referenced by measure expressions, but only
            # those reachable from this leg's fact — cross-fact filter
            # tables would otherwise pull unrelated facts into the leg.
            leg_required.update(measure_expr_objects & reachable)
            lead = graph.find_common_root(leg_required)
            lead_obj = model.data_objects.get(lead)

            # FROM: the lead (LCA) table
            if lead_obj:
                leg_builder.from_(qualify(lead_obj), alias=lead)

            # JOINs: all required objects reachable from the lead
            join_targets = leg_required - {lead}
            steps: list[JoinStep] = []
            if join_targets:
                steps = graph.find_join_path(
                    {lead},
                    leg_required,
                    via_constraints=resolved.via_constraints or None,
                )
                # Dedupe by alias so a dim reachable through multiple
                # paths within one leg emits only one JOIN — postgres
                # rejects "table specified more than once" when two
                # role-played dims resolve to the same target object.
                joined_aliases: set[str] = {lead}
                for step in steps:
                    if step.to_object in joined_aliases:
                        continue
                    target_object = model.data_objects.get(step.to_object)
                    if target_object:
                        on_expr = graph.build_join_condition(step)
                        leg_builder.join(
                            table=qualify(target_object),
                            on=on_expr,
                            join_type=step.join_type,
                            alias=step.to_object,
                        )
                        joined_aliases.add(step.to_object)

            # Capture leg info for explain
            leg_join_strs = (
                [f"{s.from_object} → {s.to_object}" for s in steps] if join_targets else []
            )
            if lead == obj_name:
                leg_reason = (
                    f'"{lead}" is the measure source — '
                    f"all required dimension objects are reachable from it"
                )
            else:
                leg_reason = (
                    f'"{lead}" is the deepest common root that can reach '
                    f'measure source "{obj_name}" and all reachable dimension objects'
                )
            leg_infos.append(
                CflLegInfo(
                    measure_source=obj_name,
                    common_root=lead,
                    reason=leg_reason,
                    measures=[m.name for m in measures],
                    joins=leg_join_strs,
                )
            )

            # Apply WHERE filters to each leg
            for wf in resolved.where_filters:
                leg_builder.where(wf.expression)

            union_legs.append(leg_builder.build())

        # Create the UNION ALL CTE
        cte_name = "composite_01"
        union_cte = CTE(name=cte_name, query=UnionAll(queries=union_legs))
        # All ColumnRefs that resolve to raw CTE columns inside outer-query
        # aggregate functions are qualified with *cte_name*. ClickHouse otherwise
        # resolves bare identifiers to sibling SELECT aliases first — when those
        # aliases are themselves aggregates (the case for measures and metrics
        # in the outer SELECT), it rejects the resulting nested aggregate as
        # ``ILLEGAL_AGGREGATION``. The qualification is harmless on dialects
        # that resolve column-first.

        # Build outer query: aggregate over the composite CTE
        outer_builder = QueryBuilder()

        # SELECT dimensions.  Coalesce groups emit COALESCE(d1, d2, ...) once
        # under the alias; plain dims keep their original column reference.
        emitted_coalesce_aliases: set[str] = set()
        coalesce_groups: dict[str, list[str]] = {}
        for d in resolved.dimensions:
            if d.coalesce_alias:
                coalesce_groups.setdefault(d.coalesce_alias, []).append(d.name)
        for dim in resolved.dimensions:
            if dim.coalesce_alias:
                if dim.coalesce_alias in emitted_coalesce_aliases:
                    continue
                emitted_coalesce_aliases.add(dim.coalesce_alias)
                outer_builder.select(
                    AliasedExpr(
                        expr=FunctionCall(
                            name="COALESCE",
                            args=[
                                ColumnRef(name=member)
                                for member in coalesce_groups[dim.coalesce_alias]
                            ],
                        ),
                        alias=dim.coalesce_alias,
                    )
                )
            else:
                outer_builder.select(
                    AliasedExpr(
                        expr=ColumnRef(name=dim.name),
                        alias=dim.name,
                    )
                )

        # SELECT aggregated measures and metrics
        # First, aggregate every measure from the UNION ALL legs. This
        # includes component measures pulled in only to feed a metric
        # (e.g. Total Returns / Total Purchases behind Return Rate /
        # Gross Margin). We still compute their aggregate expression and
        # record it in ``outer_measure_exprs`` so HAVING can reference any
        # measure, but we only PROJECT the measures the caller actually
        # requested — otherwise the result carries extra columns the
        # consumer never asked for, which Postgres-federation clients
        # (Dremio) reject as an unexpected dataset shape.
        settings = model.settings
        requested_measure_names = {rm.name for rm in resolved.measures}
        seen_measure_names: set[str] = set()
        outer_measure_exprs: dict[str, Expr] = {}
        for m in all_measures:
            seen_measure_names.add(m.name)
            agg = m.aggregation.upper()
            distinct = False
            if agg == "COUNT_DISTINCT":
                agg = "COUNT"
                distinct = True
            if isinstance(m.expression, FunctionCall) and m.expression.distinct:
                distinct = True

            if self._is_multi_field(m):
                # Multi-field: concat CTE columns in outer query
                assert isinstance(m.expression, FunctionCall)
                n_fields = len(m.expression.args)
                agg_expr: Expr = self._build_outer_concat_count(
                    m.name, n_fields, agg, distinct, cte_name
                )
            else:
                agg_expr = FunctionCall(
                    name=agg,
                    args=[ColumnRef(name=m.name, table=cte_name)],
                    distinct=distinct,
                )
            # Apply CAST for resolved data_type (effective_measures so
            # multi-fact synthesized counts get the same integer CAST as
            # declared count measures).
            model_measure = model.effective_measures.get(m.name)
            if model_measure and dialect:
                resolved_type = resolve_measure_data_type(model_measure, settings)
                if resolved_type:
                    agg_expr = dialect.cast_to_obml_type(agg_expr, resolved_type)
            if m.name in requested_measure_names:
                outer_builder.select(AliasedExpr(expr=agg_expr, alias=m.name))
            outer_measure_exprs[m.name] = agg_expr

        # Then, add metric expressions that combine component measures
        for m in resolved.measures:
            if m.component_measures and m.name not in seen_measure_names:
                metric_expr: Expr = self._build_outer_metric_expr(m, resolved, cte_name)
                metric = model.metrics.get(m.name)
                if metric and dialect:
                    resolved_type = resolve_metric_data_type(metric, settings)
                    if resolved_type:
                        metric_expr = dialect.cast_to_obml_type(metric_expr, resolved_type)
                outer_builder.select(AliasedExpr(expr=metric_expr, alias=m.name))
                outer_measure_exprs[m.name] = metric_expr

        outer_builder.from_(cte_name, alias=cte_name)

        # GROUP BY dimensions.  Coalesce groups group by the COALESCE expression
        # itself (most dialects accept either the alias or the expression; the
        # expression is portable across all eight supported dialects).
        grouped_coalesce_aliases: set[str] = set()
        for dim in resolved.dimensions:
            if dim.coalesce_alias:
                if dim.coalesce_alias in grouped_coalesce_aliases:
                    continue
                grouped_coalesce_aliases.add(dim.coalesce_alias)
                outer_builder.group_by(
                    FunctionCall(
                        name="COALESCE",
                        args=[
                            ColumnRef(name=member) for member in coalesce_groups[dim.coalesce_alias]
                        ],
                    )
                )
            else:
                outer_builder.group_by(ColumnRef(name=dim.name))

        # GROUPING() flag columns + grouping modifier (rollup/cube) — outer query only
        # so subtotal rows compose correctly over the unioned facts (the
        # individual UNION ALL legs stay at detail grain).
        if resolved.grouping is not None and resolved.dimensions:
            outer_builder.grouping(resolved.grouping.value)
            flag_aliases: list[str] = []
            for dim in resolved.dimensions:
                alias_name = dim.coalesce_alias or dim.name
                if alias_name in flag_aliases:
                    continue
                flag_aliases.append(alias_name)
            for alias in flag_aliases:
                flag_col = FunctionCall(name="GROUPING", args=[ColumnRef(name=alias)])
                outer_builder.select(AliasedExpr(expr=flag_col, alias=_grouping_flag_alias(alias)))

        # HAVING — expand alias references to actual CAST'd aggregate expressions
        for hf in resolved.having_filters:
            outer_builder.having(_expand_cfl_measure_refs(hf.expression, outer_measure_exprs))

        # ORDER BY and LIMIT — remap to CTE aliases
        for expr, desc, nulls in resolved.order_by_exprs:
            outer_builder.order_by(
                self._remap_cfl_order_by(expr, resolved, model),
                desc=desc,
                nulls_last=_nulls_last(nulls),
            )
        if resolved.limit is not None:
            outer_builder.limit(resolved.limit)
        if resolved.offset is not None:
            outer_builder.offset(resolved.offset)

        outer_select = outer_builder.build()

        # Attach CTE
        final = Select(
            columns=outer_select.columns,
            from_=outer_select.from_,
            joins=outer_select.joins,
            where=outer_select.where,
            group_by=outer_select.group_by,
            having=outer_select.having,
            order_by=outer_select.order_by,
            limit=outer_select.limit,
            offset=outer_select.offset,
            ctes=[union_cte],
            grouping=outer_select.grouping,
        )

        return QueryPlan(ast=final, cfl_legs=leg_infos)

    # -- dimensionsExclude: EXCEPT-based anti-join ----------------------------

    def _plan_dimensions_exclude(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        qualify_table: Callable[[DataObject], str] | None = None,
    ) -> QueryPlan:
        """Plan a dimensionsExclude query using EXCEPT pattern.

        Thin method wrapper: the implementation lives in
        ``cfl_exclude.plan_dimensions_exclude``, which receives this planner
        instance as its first argument followed by *resolved*, *model* and
        *qualify_table* unchanged. Its result is returned as-is.
        """
        return cfl_exclude.plan_dimensions_exclude(self, resolved, model, qualify_table)

    @staticmethod
    def _partition_dimensions(
        resolved: ResolvedQuery,
        graph: JoinGraph,
    ) -> list[list[ResolvedDimension]]:
        """Partition dimensions into groups on independent branches.

        Static wrapper that forwards *resolved* and *graph* unchanged to
        ``cfl_exclude.partition_dimensions`` and returns its result.
        """
        return cfl_exclude.partition_dimensions(resolved, graph)

    @staticmethod
    def _build_group_distinct_select(
        dims: list[ResolvedDimension],
        model: SemanticModel,
        graph: JoinGraph,
        qualify: Callable[[DataObject], str],
        via_constraints: dict[str, str] | None = None,
    ) -> Select:
        """Build SELECT DISTINCT (via GROUP BY) for a group of dimensions.

        Static wrapper around ``cfl_exclude.build_group_distinct_select``:
        the positional arguments are forwarded in order and
        *via_constraints* is passed through by keyword.
        """
        return cfl_exclude.build_group_distinct_select(
            dims, model, graph, qualify, via_constraints=via_constraints
        )

    def _build_existing_pairs_select(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        graph: JoinGraph,
        qualify: Callable[[DataObject], str],
    ) -> Select:
        """Build SELECT for existing dimension combinations via fact-table joins.

        Thin method wrapper: delegates to
        ``cfl_exclude.build_existing_pairs_select``, passing this planner
        instance first and the remaining arguments unchanged.
        """
        return cfl_exclude.build_existing_pairs_select(self, resolved, model, graph, qualify)

`plan(resolved, model, qualify_table=None, union_by_name=False, dialect=None)` ¶

Plan a CFL query.

Source code in src/orionbelt/compiler/cfl.py

def plan(
    self,
    resolved: ResolvedQuery,
    model: SemanticModel,
    qualify_table: Callable[[DataObject], str] | None = None,
    union_by_name: bool = False,
    dialect: Dialect | None = None,
) -> QueryPlan:
    """Plan a CFL query.

    The request is routed in this order:

    1. Fan-out validation (``_validate_fanout``).
    2. ``dimensionsExclude`` queries go to the EXCEPT-based planner.
    3. Queries with at most one measure source and no cross-fact measures
       are handed to ``StarSchemaPlanner``.
    4. Everything else is planned with the UNION ALL multi-fact strategy,
       after rejecting two-column statistical aggregates.

    Raises:
        UnsupportedAggregationForCFLError: a requested measure uses an
            aggregation listed in ``TWO_COLUMN_AGGREGATIONS`` on the
            multi-fact path.
    """
    self._validate_fanout(resolved, model)

    if resolved.dimensions_exclude:
        # dimensionsExclude: EXCEPT-based anti-join pattern
        return self._plan_dimensions_exclude(resolved, model, qualify_table)

    # Bucket the requested measures by the object they are sourced from.
    by_object, cross_fact = self._group_measures_by_object(resolved, model)

    # Dimension-only CFL: no measures at all, yet the dimensions sit on
    # independent branches. Derive the legs from the connecting fact tables.
    nothing_to_measure = not by_object and not cross_fact
    if nothing_to_measure and resolved.requires_cfl:
        by_object = self._group_dimensions_into_legs(resolved, model)

    single_fact = len(by_object) <= 1 and not cross_fact
    if single_fact:
        # One fact source only — the star schema planner handles it.
        from orionbelt.compiler.star import StarSchemaPlanner

        star_planner = StarSchemaPlanner()
        return star_planner.plan(resolved, model, qualify_table=qualify_table, dialect=dialect)

    # Two-column statistical aggregates (CORR/COVAR_*/REGR_*) require
    # paired-row semantics, which the UNION ALL + concat-count multi-fact
    # path cannot express: it would emit
    # ``CORR(CAST(f0 AS VARCHAR) || '|' || CAST(f1 AS VARCHAR))`` — a single
    # argument of the wrong type. Reject up front with a clear error so the
    # caller can restructure the model or restrict the query to one fact
    # source, rather than hitting an opaque execution-time failure.
    for measure in resolved.measures:
        agg = (measure.aggregation or "").lower()
        if agg in TWO_COLUMN_AGGREGATIONS:
            raise UnsupportedAggregationForCFLError(measure.name, agg)

    # Multi-fact: UNION ALL strategy
    union_options = {
        "qualify_table": qualify_table,
        "union_by_name": union_by_name,
        "dialect": dialect,
    }
    return self._plan_union_all(resolved, model, by_object, cross_fact, **union_options)

Join Graph¶

`orionbelt.compiler.graph.JoinGraph` ¶

Graph of data objects (nodes) and relationships (edges) for join path resolution.

Source code in src/orionbelt/compiler/graph.py

class JoinGraph:
    """Graph of data objects (nodes) and relationships (edges) for join path resolution.

    Three parallel ``networkx`` graphs share the same node set:

    * ``_graph`` — undirected; each edge stores the join columns, the
      cardinality and the ``source_object`` that declared the join.
    * ``_directed`` — one ``source → joinTo`` edge per join; used for
      descendant, common-root and cycle queries.
    * ``_traversable`` — directed graph used by :meth:`find_join_path`;
      many-to-one joins are forward-only, other cardinalities are
      bidirectional.
    """

    def __init__(
        self,
        model: SemanticModel,
        use_path_names: list[UsePathName] | None = None,
    ) -> None:
        self._graph: nx.Graph[str] = nx.Graph()
        self._directed: nx.DiGraph[str] = nx.DiGraph()
        # Path-finding graph: many-to-one is forward-only (would cause fanout
        # in reverse); one-to-one and many-to-many are bidirectional.
        self._traversable: nx.DiGraph[str] = nx.DiGraph()
        self._model = model
        self._build(model, use_path_names)

    def _build(
        self,
        model: SemanticModel,
        use_path_names: list[UsePathName] | None = None,
    ) -> None:
        """Build the graph from the semantic model.

        Secondary joins are only included when their pathName is requested
        via *use_path_names*.  When a secondary override is active for a
        ``(source, target)`` pair, the primary join for that pair is excluded.
        Joins whose ``join_to`` is not a data object of *model* are ignored.
        """
        for name in model.data_objects:
            self._graph.add_node(name)
            self._directed.add_node(name)
            self._traversable.add_node(name)

        # Build a lookup: (source, target) → pathName for active overrides
        active_overrides: dict[tuple[str, str], str] = {}
        if use_path_names:
            for upn in use_path_names:
                active_overrides[(upn.source, upn.target)] = upn.path_name

        for obj_name, obj in model.data_objects.items():
            for join in obj.joins:
                if join.join_to not in model.data_objects:
                    continue
                pair = (obj_name, join.join_to)

                if join.secondary:
                    # Only include if this secondary join's pathName is active
                    if pair in active_overrides and active_overrides[pair] == join.path_name:
                        self._add_edge(obj_name, join)
                else:
                    # Primary join: skip if an active override exists for this pair
                    if pair not in active_overrides:
                        self._add_edge(obj_name, join)

    def _add_edge(self, obj_name: str, join: object) -> None:
        """Add an edge to the undirected, directed, and traversable graphs.

        The traversable graph is used by :meth:`find_join_path` to enforce
        the rule "many-to-one is never bidirectional": walking such a join
        backwards would multiply rows of the source table, so only forward
        traversal is allowed.  One-to-one and many-to-many joins remain
        bidirectional in the traversable graph.
        """
        from orionbelt.models.semantic import DataObjectJoin

        # Narrows the loosely typed *join* parameter for the attribute reads below.
        assert isinstance(join, DataObjectJoin)
        self._graph.add_edge(
            obj_name,
            join.join_to,
            columns_from=join.columns_from,
            columns_to=join.columns_to,
            cardinality=join.join_type,
            # Remembered so path walkers can tell forward from reverse traversal.
            source_object=obj_name,
        )
        self._directed.add_edge(
            obj_name,
            join.join_to,
            columns_from=join.columns_from,
            columns_to=join.columns_to,
            cardinality=join.join_type,
        )
        self._traversable.add_edge(obj_name, join.join_to)
        if join.join_type != Cardinality.MANY_TO_ONE:
            # Safe to walk backwards: row count is preserved.
            self._traversable.add_edge(join.join_to, obj_name)

    def descendants(self, node: str) -> set[str]:
        """Return all nodes reachable from *node* via directed join paths.

        An unknown *node* yields an empty set.
        """
        if node not in self._directed:
            return set()
        return nx.descendants(self._directed, node)

    def find_common_root(self, required_objects: set[str]) -> str:
        """Find the common root for a set of required objects.

        The join graph is a DAG (joins define direction: source → joinTo).
        The common root is the **deepest** node that can reach ALL
        *required_objects* via directed join paths.  "Deepest" = smallest
        descendant set (most specific ancestor, closest to the required nodes).

        In ``returns → sales → customer``, with required ``{customer, item}``,
        the common root is ``sales`` (it can reach both).  With required
        ``{customer, item, returns}``, the common root is ``returns`` (the
        only node that can reach all three).

        Names that are not nodes of the graph are ignored.  With no known
        name left the result is ``""``; with exactly one, that name.
        """
        required = required_objects & set(self._directed.nodes)
        if len(required) <= 1:
            return next(iter(sorted(required))) if required else ""

        # Find all nodes that can reach ALL required nodes via directed paths
        candidates: list[tuple[str, int]] = []
        for node in self._directed.nodes:
            reachable = nx.descendants(self._directed, node) | {node}
            if required <= reachable:
                candidates.append((node, len(reachable)))

        if not candidates:
            # Fallback: no single directed ancestor covers all —
            # use undirected shortest-path center
            return self._find_center_undirected(required)

        # Pick the deepest ancestor: smallest reachable set that still covers
        # all; ties are broken by name so the result is deterministic.
        candidates.sort(key=lambda x: (x[1], x[0]))
        return candidates[0][0]

    def _find_center_undirected(self, required: set[str]) -> str:
        """Fallback: center of the Steiner tree in the undirected graph.

        Collects the nodes on the shortest undirected path between every
        pair of *required* nodes, then returns the collected node whose
        greatest distance to any required node is smallest (first by name
        on ties).  When no collected node can reach every required node —
        the required set spans several disconnected components — the
        alphabetically first required node is returned, matching the
        behaviour for a fully disconnected set.
        """
        nodes = sorted(required)
        if len(nodes) <= 1:
            return nodes[0] if nodes else ""

        steiner: set[str] = set()
        for i, left in enumerate(nodes):
            for right in nodes[i + 1 :]:
                try:
                    path: list[str] = nx.shortest_path(self._graph, left, right)
                except nx.NetworkXNoPath:
                    continue
                steiner.update(path)

        if not steiner:
            return nodes[0]

        best: str = nodes[0]
        best_max: int | float = len(self._graph.nodes) + 1
        for node in sorted(steiner):
            try:
                max_dist = max(nx.shortest_path_length(self._graph, node, r) for r in nodes)
            except nx.NetworkXNoPath:
                # This candidate cannot reach some required node, so it cannot
                # be a center.  Previously this exception escaped even though
                # the pairwise loop above tolerates disconnected pairs.
                continue
            if max_dist < best_max:
                best_max = max_dist
                best = node
        return best

    def find_join_path(
        self,
        from_objects: set[str],
        to_objects: set[str],
        via_constraints: dict[str, str] | None = None,
    ) -> list[JoinStep]:
        """Find a minimal join path connecting all required data objects.

        Uses shortest path for each target object from the set of source objects.

        *via_constraints* maps ``target → via``: for constrained targets, only
        the ``via`` object is used as the source so the path is forced through it.

        Targets that cannot be reached — no traversable path, or a name that
        is not a node of the graph — contribute no steps.  Sources are tried
        in sorted order so that ties between equally short paths are broken
        the same way on every run.
        """
        steps: list[JoinStep] = []
        visited_edges: set[tuple[str, str]] = set()
        via = via_constraints or {}

        # Process via waypoints first so they are in source_list when their
        # constrained targets are processed.
        all_targets = to_objects - from_objects
        via_targets = {t for t in all_targets if t in via}
        non_via_targets = all_targets - via_targets
        via_waypoints = {via[t] for t in via_targets} - from_objects - via_targets
        ordered_targets = sorted(via_waypoints) + sorted(non_via_targets) + sorted(via_targets)

        # Sorted rather than raw set order: the first source wins a tie on
        # path length, and set iteration order of strings varies per process.
        source_list = sorted(from_objects)

        for target in ordered_targets:
            best_path: list[str] | None = None
            sources = [via[target]] if target in via and via[target] in source_list else source_list
            for source in sources:
                try:
                    path = nx.shortest_path(self._traversable, source, target)
                except (nx.NetworkXNoPath, nx.NodeNotFound):
                    # NodeNotFound: source or target is not a graph node (for
                    # example a via waypoint naming an unknown object).  Treat
                    # it like "no path", as find_join_path_undirected does.
                    continue
                if best_path is None or len(path) < len(best_path):
                    best_path = path

            if best_path is None:
                continue

            for i in range(len(best_path) - 1):
                edge = (best_path[i], best_path[i + 1])
                rev_edge = (best_path[i + 1], best_path[i])
                if edge in visited_edges or rev_edge in visited_edges:
                    # Already emitted by an earlier target's path.
                    continue
                visited_edges.add(edge)

                edge_data = self._graph.edges[edge]
                source_object = edge_data.get("source_object", edge[0])

                if source_object == edge[0]:
                    step = JoinStep(
                        from_object=edge[0],
                        to_object=edge[1],
                        from_columns=edge_data["columns_from"],
                        to_columns=edge_data["columns_to"],
                        join_type=ASTJoinType.LEFT,
                        cardinality=edge_data["cardinality"],
                    )
                else:
                    # Path traverses edge in reverse direction.
                    # from_object/to_object are swapped, so columns must be
                    # swapped too to keep the ON clause correctly oriented.
                    step = JoinStep(
                        from_object=edge[1],
                        to_object=edge[0],
                        from_columns=edge_data["columns_to"],
                        to_columns=edge_data["columns_from"],
                        join_type=ASTJoinType.LEFT,
                        cardinality=edge_data["cardinality"],
                        reversed=True,
                    )
                steps.append(step)

            # Add target to sources for subsequent lookups
            if target not in source_list:
                source_list.append(target)

        return steps

    def find_join_path_undirected(
        self,
        from_object: str,
        to_object: str,
    ) -> list[JoinStep]:
        """Find a join path ignoring cardinality direction.

        Unlike :meth:`find_join_path` (which forbids walking many-to-one
        joins backwards to prevent fanout in the outer query), this walker
        considers the join graph as undirected.  It's intended for
        correlated subqueries — EXISTS / NOT EXISTS — where row counts on
        the outer side are unaffected by how many rows the subquery scans.

        Each emitted :class:`JoinStep` is oriented so ``from_object`` is the
        step's predecessor on the path and ``to_object`` is its successor;
        ``from_columns`` / ``to_columns`` are swapped when the underlying
        join edge is traversed against its declared direction.

        Returns an empty list when both names are equal, when either is not
        a node of the graph, or when no path exists.
        """
        if from_object == to_object:
            return []
        if from_object not in self._graph or to_object not in self._graph:
            return []
        try:
            path: list[str] = nx.shortest_path(self._graph, from_object, to_object)
        except nx.NetworkXNoPath:
            return []

        steps: list[JoinStep] = []
        for i in range(len(path) - 1):
            pred, succ = path[i], path[i + 1]
            edge_data = self._graph.edges[(pred, succ)]
            source_object = edge_data.get("source_object", pred)
            if source_object == pred:
                from_cols = edge_data["columns_from"]
                to_cols = edge_data["columns_to"]
                reversed_ = False
            else:
                from_cols = edge_data["columns_to"]
                to_cols = edge_data["columns_from"]
                reversed_ = True
            steps.append(
                JoinStep(
                    from_object=pred,
                    to_object=succ,
                    from_columns=from_cols,
                    to_columns=to_cols,
                    join_type=ASTJoinType.LEFT,
                    cardinality=edge_data["cardinality"],
                    reversed=reversed_,
                )
            )
        return steps

    def build_join_condition(self, step: JoinStep) -> Expr:
        """Build the ON clause expression for a join step.

        Routes both sides through ``make_column_expr`` so a computed
        join key (``expression:`` instead of ``code:`` on the column)
        inlines its template body. Without this, a join on a computed
        key would render ``"obj"."" = "other"."key"`` and the database
        would error on the zero-length identifier.

        Column pairs are compared with ``=`` and chained with ``AND``.

        Raises:
            ValueError: the step has no join columns, or (from the strict
                ``zip``) its two column lists differ in length.
        """
        from orionbelt.compiler.resolution import make_column_expr

        # Both objects are fixed for the whole step; look them up once.
        from_obj = self._model.data_objects.get(step.from_object)
        to_obj = self._model.data_objects.get(step.to_object)

        conditions: list[Expr] = []
        for from_c, to_c in zip(step.from_columns, step.to_columns, strict=True):
            if from_obj and from_c in from_obj.columns:
                left_expr: Expr = make_column_expr(self._model, step.from_object, from_c)
            else:
                # Column unknown to the model: fall back to a plain reference.
                left_expr = ColumnRef(name=from_c, table=step.from_object)
            if to_obj and to_c in to_obj.columns:
                right_expr: Expr = make_column_expr(self._model, step.to_object, to_c)
            else:
                right_expr = ColumnRef(name=to_c, table=step.to_object)
            conditions.append(BinaryOp(left=left_expr, op="=", right=right_expr))

        if not conditions:
            msg = f"Join from '{step.from_object}' to '{step.to_object}' has no join columns"
            raise ValueError(msg)
        result: Expr = conditions[0]
        for cond in conditions[1:]:
            result = BinaryOp(left=result, op="AND", right=cond)
        return result

    def detect_cycles(self) -> list[list[str]]:
        """Detect cyclic join paths.

        Returns the simple cycles of the directed graph, or an empty list
        when networkx raises ``NetworkXError``.
        """
        try:
            return list(nx.simple_cycles(self._directed))
        except nx.NetworkXError:
            return []

    def validate_deterministic(self) -> list[SemanticError]:
        """Ensure join paths are deterministic (no ambiguity).

        Returns one ``AMBIGUOUS_JOIN`` error per node pair that has more
        than one edge in the undirected graph.
        """
        errors: list[SemanticError] = []
        # Check for multiple edges between the same pair of nodes
        # NOTE(review): ``_graph`` is created as a plain ``nx.Graph`` (not a
        # ``MultiGraph``), which keeps at most one edge per node pair, so this
        # condition presumably never fires as written — confirm.
        for u, v in self._graph.edges():
            if self._graph.number_of_edges(u, v) > 1:
                errors.append(
                    SemanticError(
                        code="AMBIGUOUS_JOIN",
                        message=f"Multiple join paths between '{u}' and '{v}'",
                        path=f"dataObjects.{u}.joins",
                    )
                )
        return errors

`find_join_path(from_objects, to_objects, via_constraints=None)` ¶

Find a minimal join path connecting all required data objects.

Uses shortest path for each target object from the set of source objects.

via_constraints maps target → via: for constrained targets, only the via object is used as the source so the path is forced through it.

Source code in src/orionbelt/compiler/graph.py

def find_join_path(
    self,
    from_objects: set[str],
    to_objects: set[str],
    via_constraints: dict[str, str] | None = None,
) -> list[JoinStep]:
    """Find a minimal join path connecting all required data objects.

    Uses shortest path for each target object from the set of source objects.

    *via_constraints* maps ``target → via``: for constrained targets, only
    the ``via`` object is used as the source so the path is forced through it.

    Targets that cannot be reached — no traversable path, or a name that is
    not a node of the graph — contribute no steps.  Sources are tried in
    sorted order so that ties between equally short paths are broken the
    same way on every run.
    """
    steps: list[JoinStep] = []
    visited_edges: set[tuple[str, str]] = set()
    via = via_constraints or {}

    # Process via waypoints first so they are in source_list when their
    # constrained targets are processed.
    all_targets = to_objects - from_objects
    via_targets = {t for t in all_targets if t in via}
    non_via_targets = all_targets - via_targets
    via_waypoints = {via[t] for t in via_targets} - from_objects - via_targets
    ordered_targets = sorted(via_waypoints) + sorted(non_via_targets) + sorted(via_targets)

    # Sorted rather than raw set order: the first source wins a tie on path
    # length, and set iteration order of strings varies per process.
    source_list = sorted(from_objects)

    for target in ordered_targets:
        best_path: list[str] | None = None
        sources = [via[target]] if target in via and via[target] in source_list else source_list
        for source in sources:
            try:
                path = nx.shortest_path(self._traversable, source, target)
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                # NodeNotFound: source or target is not a graph node (for
                # example a via waypoint naming an unknown object).  Treat it
                # like "no path" instead of letting the exception escape.
                continue
            if best_path is None or len(path) < len(best_path):
                best_path = path

        if best_path is None:
            continue

        for i in range(len(best_path) - 1):
            edge = (best_path[i], best_path[i + 1])
            rev_edge = (best_path[i + 1], best_path[i])
            if edge in visited_edges or rev_edge in visited_edges:
                # Already emitted by an earlier target's path.
                continue
            visited_edges.add(edge)

            edge_data = self._graph.edges[edge]
            source_object = edge_data.get("source_object", edge[0])

            if source_object == edge[0]:
                step = JoinStep(
                    from_object=edge[0],
                    to_object=edge[1],
                    from_columns=edge_data["columns_from"],
                    to_columns=edge_data["columns_to"],
                    join_type=ASTJoinType.LEFT,
                    cardinality=edge_data["cardinality"],
                )
            else:
                # Path traverses edge in reverse direction.
                # from_object/to_object are swapped, so columns must be
                # swapped too to keep the ON clause correctly oriented.
                step = JoinStep(
                    from_object=edge[1],
                    to_object=edge[0],
                    from_columns=edge_data["columns_to"],
                    to_columns=edge_data["columns_from"],
                    join_type=ASTJoinType.LEFT,
                    cardinality=edge_data["cardinality"],
                    reversed=True,
                )
            steps.append(step)

        # Add target to sources for subsequent lookups
        if target not in source_list:
            source_list.append(target)

    return steps

`build_join_condition(step)` ¶

Build the ON clause expression for a join step.

Routes both sides through make_column_expr so a computed join key (expression: instead of code: on the column) inlines its template body. Without this, a join on a computed key would render "obj"."" = "other"."key" and the database would error on the zero-length identifier.

Source code in src/orionbelt/compiler/graph.py

def build_join_condition(self, step: JoinStep) -> Expr:
    """Build the ON clause expression for a join step.

    Each side of every column pair goes through ``make_column_expr`` when
    the column is declared on its data object, so a computed join key
    (``expression:`` rather than ``code:``) is inlined from its template
    body. Otherwise a join on a computed key would render
    ``"obj"."" = "other"."key"`` and the database would reject the
    zero-length identifier. Columns not declared on the object fall back
    to a plain table-qualified reference.

    The per-pair equalities are chained left to right with ``AND``.

    Raises:
        ValueError: the step carries no join columns.
    """
    from orionbelt.compiler.resolution import make_column_expr

    def _side(object_name: str, column: str) -> Expr:
        # Resolve one side of an equality for the given object/column.
        owner = self._model.data_objects.get(object_name)
        if owner and column in owner.columns:
            return make_column_expr(self._model, object_name, column)
        return ColumnRef(name=column, table=object_name)

    equalities: list[Expr] = [
        BinaryOp(
            left=_side(step.from_object, lhs),
            op="=",
            right=_side(step.to_object, rhs),
        )
        for lhs, rhs in zip(step.from_columns, step.to_columns, strict=True)
    ]

    if not equalities:
        msg = f"Join from '{step.from_object}' to '{step.to_object}' has no join columns"
        raise ValueError(msg)

    combined: Expr = equalities[0]
    for extra in equalities[1:]:
        combined = BinaryOp(left=combined, op="AND", right=extra)
    return combined

`detect_cycles()` ¶

Detect cyclic join paths.

Source code in src/orionbelt/compiler/graph.py

def detect_cycles(self) -> list[list[str]]:
    """Return the simple cycles found in the directed join graph.

    The result is empty when there are no cycles, and also when networkx
    raises ``NetworkXError`` while enumerating them.
    """
    found: list[list[str]] = []
    try:
        found.extend(nx.simple_cycles(self._directed))
    except nx.NetworkXError:
        return []
    return found

Code Generator¶

`orionbelt.compiler.codegen.CodeGenerator` ¶

Generates SQL from AST using a dialect.

Source code in src/orionbelt/compiler/codegen.py

class CodeGenerator:
    """Turns a ``Select`` AST into SQL text through one fixed dialect."""

    def __init__(self, dialect: Dialect) -> None:
        # Kept private; callers read it back through the ``dialect`` property.
        self._dialect = dialect

    @property
    def dialect(self) -> Dialect:
        """The dialect this generator was constructed with."""
        return self._dialect

    def generate(self, ast: Select) -> str:
        """Return the SQL string the configured dialect compiles *ast* to."""
        sql = self._dialect.compile(ast)
        return sql

`generate(ast)` ¶

Generate SQL string from AST using the configured dialect.

Source code in src/orionbelt/compiler/codegen.py

def generate(self, ast: Select) -> str:
    """Return the SQL string the configured dialect compiles *ast* to."""
    compile_ast = self._dialect.compile
    return compile_ast(ast)

Dialect Base¶

`orionbelt.dialect.base.Dialect` ¶

Bases: ABC

Abstract base for all SQL dialects.

Provides default SQL compilation; dialects override specific methods.

Source code in src/orionbelt/dialect/base.py

class Dialect(ABC):
    """Abstract base for all SQL dialects.

    Provides default SQL compilation; dialects override specific methods.
    """

    # Abstract (dialect-neutral) type name -> SQL type, consulted by
    # ``_resolve_type_name``. Names missing from this map are passed through
    # unchanged by that method.
    _ABSTRACT_TYPE_MAP: dict[str, str] = {
        "string": "VARCHAR",
        "json": "VARCHAR",
        "int": "INTEGER",
        "float": "FLOAT",
        "date": "DATE",
        "time": "TIME",
        "time_tz": "TIME",
        "timestamp": "TIMESTAMP",
        "timestamp_tz": "TIMESTAMP",
        "boolean": "BOOLEAN",
    }

    # Upper bound applied to decimal precision by ``render_obml_type``.
    _MAX_DECIMAL_PRECISION: int = 38

    # OBML type name -> SQL type for non-decimal types. ``render_obml_type``
    # falls back to the upper-cased OBML name for anything not listed here.
    _OBML_SIMPLE_TYPE_MAP: dict[str, str] = {
        "bigint": "BIGINT",
        "integer": "INTEGER",
        "double": "DOUBLE",
        "date": "DATE",
        "timestamp": "TIMESTAMP",
        "time": "TIME",
        "string": "VARCHAR",
        "boolean": "BOOLEAN",
    }

    def render_obml_type(self, obml_type: OBMLType) -> str:
        """Return the SQL type name this dialect uses for ``obml_type``.

        Decimal types render as ``DECIMAL(p, s)`` with the precision capped at
        ``_MAX_DECIMAL_PRECISION`` and the scale capped at the resulting
        precision. Any other type is looked up by name in
        ``_OBML_SIMPLE_TYPE_MAP``; unmapped names are emitted upper-cased.
        """
        if not isinstance(obml_type, DecimalType):
            type_name = obml_type.name
            return self._OBML_SIMPLE_TYPE_MAP.get(type_name, type_name.upper())
        precision = min(obml_type.precision, self._MAX_DECIMAL_PRECISION)
        scale = min(obml_type.scale, precision)
        return f"DECIMAL({precision}, {scale})"

    def cast_to_obml_type(self, expr: Expr, obml_type: OBMLType) -> Expr:
        """Return an AST node that coerces ``expr`` to ``obml_type``.

        The base implementation builds a ``Cast`` node whose type name comes
        from ``render_obml_type``, i.e. a plain ``CAST(expr AS <type>)``.
        Dialects whose ``CAST`` rejects parameterized decimals (notably
        BigQuery — "Parameterized types are not allowed in CAST expressions")
        can override this to wrap the cast in a ROUND that honours the
        user-specified scale.
        """
        sql_type = self.render_obml_type(obml_type)
        return Cast(expr=expr, type_name=sql_type)

    def _resolve_type_name(self, type_name: str) -> str:
        """Map an abstract type name to a dialect-specific SQL type.

        Looks up ``_ABSTRACT_TYPE_MAP`` first; if *type_name* is not found
        (e.g. already a concrete SQL type like ``VARCHAR``), returns it as-is.
        """
        return self._ABSTRACT_TYPE_MAP.get(type_name, type_name)

    def format_table_ref(self, database: str, schema: str, code: str) -> str:
        """Format a fully-qualified table reference.

        Default: three-part ``database.schema.code`` (Snowflake/Databricks/Dremio).
        Postgres and ClickHouse override to two-part naming.
        All components are quoted to prevent SQL injection.
        """
        return (
            f"{self.quote_identifier(database)}"
            f".{self.quote_identifier(schema)}"
            f".{self.quote_identifier(code)}"
        )

    # Dialect identifier. The base class passes it as the first argument to
    # ``UnsupportedAggregationError`` in ``_check_aggregation_supported``.
    @property
    @abstractmethod
    def name(self) -> str: ...

    # Feature description for this dialect. The base class reads
    # ``unsupported_aggregations`` (aggregation gate) and
    # ``supports_group_by_all`` (GROUP BY rendering) from it.
    @property
    @abstractmethod
    def capabilities(self) -> DialectCapabilities: ...

    @abstractmethod
    def quote_identifier(self, name: str) -> str:
        """Quote an identifier per dialect rules.

        The base class routes table-reference parts, CTE names, aliases, and
        the column / table names of column references through this method.
        """

    @abstractmethod
    def render_time_grain(self, column: Expr, grain: TimeGrain) -> Expr:
        """Wrap a column expression for the given time grain.

        Works at the AST level: takes an ``Expr`` and returns an ``Expr``,
        not a SQL string.
        """

    @abstractmethod
    def render_cast(self, expr: Expr, target_type: str) -> Expr:
        """Render a CAST expression.

        Works at the AST level: takes an ``Expr`` and returns an ``Expr``,
        not a SQL string.
        """

    @abstractmethod
    def current_date_sql(self) -> str:
        """Return SQL for the current date.

        ``compile_relative_date_range`` uses the result as its anchor date.
        """

    @abstractmethod
    def date_add_sql(self, date_sql: str, unit: str, count: int) -> str:
        """Return SQL that adds count units to date_sql.

        ``count`` may be negative: ``compile_relative_date_range`` passes
        ``-count`` to step backwards, and also calls this with unit ``"day"``.
        """

    @abstractmethod
    def render_date_trunc_sql(self, column_sql: str, grain: str) -> str:
        """Return SQL string that truncates a date/timestamp to the given grain.

        String-level helper (not AST) for use in raw SQL CTEs like date_range:
        both ``column_sql`` and the return value are SQL text.
        """

    @abstractmethod
    def render_date_spine_cte_sql(
        self,
        min_date: str,
        max_date: str,
        grain: str,
        offset: int,
        offset_grain: str,
    ) -> str:
        """Return the SQL body for a date spine CTE.

        Must produce two columns: ``spine_date`` and ``spine_date_prev``.
        ``spine_date_prev`` is NULL when the offset date falls before min_date.

        Parameters
        ----------
        min_date : str
            SQL expression referencing the minimum date (e.g. ``date_range.min_date``).
        max_date : str
            SQL expression referencing the maximum date.
        grain : str
            Time grain string: ``day``, ``week``, ``month``, ``quarter``, ``year``.
        offset : int
            Signed period offset (e.g. ``-1`` for previous period).
        offset_grain : str
            Grain of the offset (e.g. ``year`` for YoY).

        Returns
        -------
        str
            SQL text for the CTE body.
        """

    def render_string_contains(self, column: Expr, pattern: Expr) -> Expr:
        """Build the default "contains" predicate.

        Produces the AST for ``column LIKE '%' || pattern || '%'``.
        """
        with_prefix = BinaryOp(left=Literal.string("%"), op="||", right=pattern)
        wildcarded = BinaryOp(left=with_prefix, op="||", right=Literal.string("%"))
        return BinaryOp(left=column, op="LIKE", right=wildcarded)

    def _map_function_name(self, name: str) -> str:
        """Map a function name to the dialect-specific equivalent.

        Override in subclasses to remap names (e.g. ANY_VALUE → any in ClickHouse).
        """
        return name

    def _check_aggregation_supported(self, name: str) -> None:
        """Raise ``UnsupportedAggregationError`` when the dialect doesn't support
        the given aggregation. Matches case-insensitively against
        ``capabilities.unsupported_aggregations`` (lowercase OBML names).

        Existing per-function compile overrides (``_compile_mode``,
        ``_compile_median``) still raise directly — this generic gate is a
        catch-all for purely-standard aggregations like ``REGR_SLOPE`` where
        no special compile path exists.
        """
        if name.lower() in {a.lower() for a in self.capabilities.unsupported_aggregations}:
            raise UnsupportedAggregationError(self.name, name.lower())

    def _compile_median(self, args: list[Expr]) -> str:
        """Compile MEDIAN — default uses MEDIAN(col).

        Works for Snowflake, ClickHouse, Databricks, and Dremio. Postgres overrides.
        """
        col_sql = self.compile_expr(args[0]) if args else "NULL"
        return f"MEDIAN({col_sql})"

    def _compile_mode(self, args: list[Expr]) -> str:
        """Compile MODE — default uses MODE(col).

        Works for Snowflake and Databricks. Postgres, ClickHouse, and Dremio override.
        """
        col_sql = self.compile_expr(args[0]) if args else "NULL"
        return f"MODE({col_sql})"

    def _compile_listagg(
        self,
        args: list[Expr],
        distinct: bool,
        order_by: list[OrderByItem],
        separator: str | None,
    ) -> str:
        """Compile LISTAGG — default uses LISTAGG(col, sep) WITHIN GROUP (ORDER BY ...).

        Works for Snowflake and Dremio. Postgres, ClickHouse, and Databricks override.
        """
        sep = separator if separator is not None else ","
        col_sql = self.compile_expr(args[0]) if args else "''"
        distinct_sql = "DISTINCT " if distinct else ""
        escaped_sep = sep.replace("'", "''")
        result = f"LISTAGG({distinct_sql}{col_sql}, '{escaped_sep}')"
        if order_by:
            ob = ", ".join(self.compile_order_by(o) for o in order_by)
            result += f" WITHIN GROUP (ORDER BY {ob})"
        return result

    def _compile_cast(self, inner: Expr, type_name: str) -> str:
        """Render ``CAST(expr AS type)``. Dialects override to handle nullability."""
        resolved_type = self._resolve_type_name(type_name)
        return f"CAST({self.compile_expr(inner)} AS {resolved_type})"

    # SQL operator precedence levels (higher = binds tighter). The
    # precedence-aware emitter in ``compile_expr`` compares a node's own level
    # with the level required by its parent and only adds ``(...)`` when the
    # node's level is strictly lower (see ``_wrap_if_lower``).
    # Pre-v2.7.4 the emitter wrapped *every* operator unconditionally,
    # producing deeply-nested unreadable SQL (issue #79).
    _CLAUSE_ROOT_PREC = 0  # no surrounding context → no wrap
    _PREC_OR = 1
    _PREC_AND = 2
    _PREC_NOT = 3
    _PREC_CMP = 4  # =, <>, <, <=, >, >=, IS NULL, IN, BETWEEN, LIKE
    _PREC_ADD = 5  # +, -, ||
    _PREC_MUL = 6  # *, /, %
    _PREC_UNARY = 7  # unary -, +
    _PREC_ATOM = 100  # literals, column refs, function calls, CAST(...), CASE...END

    @staticmethod
    def _wrap_if_lower(sql: str, self_prec: int, parent_prec: int) -> str:
        """Wrap ``sql`` in ``(...)`` only when it would bind weaker than
        its parent — i.e. its precedence is strictly less than the
        parent's required level. ``parent_prec = 0`` (clause root) is
        always satisfied so the outermost expression never gets a
        redundant outer wrap.
        """
        if self_prec < parent_prec:
            return f"({sql})"
        return sql

    @classmethod
    def _binary_op_precedence(cls, op: str) -> int:
        """Return the precedence of a ``BinaryOp.op`` value."""
        up = op.upper().strip()
        if up == "OR":
            return cls._PREC_OR
        if up == "AND":
            return cls._PREC_AND
        if up in ("=", "<>", "!=", "<", "<=", ">", ">=", "LIKE", "NOT LIKE"):
            return cls._PREC_CMP
        if up in ("+", "-", "||"):
            return cls._PREC_ADD
        if up in ("*", "/", "%"):
            return cls._PREC_MUL
        # Unknown operator — wrap defensively (treat as lowest precedence).
        return cls._CLAUSE_ROOT_PREC

    # Non-associative operators: a child at the same precedence must be
    # wrapped on BOTH sides, because SQL forbids chained comparisons
    # (``a >= b = c`` is a syntax error in every dialect we support).
    # All other binary operators are treated as left-associative by
    # ``_compile_binary_op``: only the right operand is wrapped at equal
    # precedence, so ``a - (b - c)`` keeps the parentheses that distinguish
    # it from ``a - b - c``.
    _NON_ASSOCIATIVE_OPS: frozenset[str] = frozenset(
        {"=", "<>", "!=", "<", "<=", ">", ">=", "LIKE", "NOT LIKE"}
    )

    def _compile_binary_op(self, left: Expr, op: str, right: Expr) -> str:
        """Render an infix binary expression *without* an outer wrap.

        The dispatcher in ``compile_expr`` decides whether to add an outer
        ``(...)`` wrap based on the parent's precedence. Dialects override
        to widen operand precision (e.g. ClickHouse decimal division) or
        special-case operators that don't translate one-to-one (e.g. MySQL
        string concat).
        """
        self_prec = self._binary_op_precedence(op)
        # Comparison + LIKE forbid chaining — wrap any equal-precedence
        # child on either side. Other ops are left-associative: left at
        # self_prec, right at self_prec + 1 so ``a - (b - c)`` keeps its
        # required parens.
        op_upper = op.upper().strip()
        if op_upper in self._NON_ASSOCIATIVE_OPS:
            left_sql = self.compile_expr(left, _parent_prec=self_prec + 1)
            right_sql = self.compile_expr(right, _parent_prec=self_prec + 1)
        else:
            left_sql = self.compile_expr(left, _parent_prec=self_prec)
            right_sql = self.compile_expr(right, _parent_prec=self_prec + 1)
        return f"{left_sql} {op} {right_sql}"

    def render_decimal_division_sql(self, left_sql: str, right_sql: str) -> str:
        """Render ``left / right`` for decimal-typed operands, given raw SQL.

        Used by code paths that build division as string SQL (e.g. PoP
        comparison CTEs) rather than as ``BinaryOp`` AST nodes. Default
        is plain SQL division; ClickHouse overrides to widen both sides
        to ``Decimal(38, 10)`` first so ratio precision survives.
        """
        return f"{left_sql} / {right_sql}"

    def _compile_multi_field_count(self, args: list[Expr], distinct: bool) -> str:
        """Compile COUNT with multiple fields by concatenating with ``||``.

        Default (non-Snowflake) strategy: cast each field to VARCHAR and
        join with ``'|'`` separator so the database sees a single expression.
        Snowflake overrides this to emit native ``COUNT(col1, col2)``.
        """
        parts = [f"CAST({self.compile_expr(a)} AS VARCHAR)" for a in args]
        concat = " || '|' || ".join(parts)
        if distinct:
            return f"COUNT(DISTINCT {concat})"
        return f"COUNT({concat})"

    def compile(self, ast: Select) -> str:
        """Render a complete SQL AST to a dialect-specific string."""
        return self.compile_select(ast)

    def compile_select(self, node: Select) -> str:
        """Compile a SELECT statement."""
        parts: list[str] = []

        # CTEs
        if node.ctes:
            cte_parts = []
            for cte in node.ctes:
                if isinstance(cte.query, RawSQL):
                    cte_sql = cte.query.sql
                elif isinstance(cte.query, UnionAll):
                    cte_sql = self.compile_union_all(cte.query)
                elif isinstance(cte.query, Except):
                    cte_sql = self.compile_except(cte.query)
                else:
                    cte_sql = self.compile_select(cte.query)
                cte_parts.append(f"{self.quote_identifier(cte.name)} AS (\n{cte_sql}\n)")
            parts.append("WITH " + ",\n".join(cte_parts))

        # SELECT
        keyword = "SELECT DISTINCT" if node.distinct else "SELECT"
        if node.columns:
            cols = ", ".join(self.compile_expr(c) for c in node.columns)
            parts.append(f"{keyword} {cols}")
        else:
            parts.append(f"{keyword} *")

        # FROM
        if node.from_:
            parts.append(f"FROM {self.compile_from(node.from_)}")

        # JOINs
        for join in node.joins:
            parts.append(self.compile_join(join))

        # WHERE
        if node.where:
            parts.append(f"WHERE {self.compile_expr(node.where)}")

        # GROUP BY
        if node.group_by:
            parts.append(self.compile_group_by(node.group_by, node.grouping))

        # HAVING
        if node.having:
            parts.append(f"HAVING {self.compile_expr(node.having)}")

        # ORDER BY
        if node.order_by:
            orders = ", ".join(self.compile_order_by(o) for o in node.order_by)
            parts.append(f"ORDER BY {orders}")

        # LIMIT
        if node.limit is not None:
            parts.append(f"LIMIT {node.limit}")

        # OFFSET
        if node.offset is not None:
            parts.append(f"OFFSET {node.offset}")

        return "\n".join(parts)

    def compile_group_by(self, group_by: list[Expr], grouping: str | None) -> str:
        """Render the GROUP BY clause.

        Default ANSI form (Postgres, Snowflake, DuckDB, BigQuery, Databricks,
        Dremio, MySQL): ``GROUP BY ROLLUP(a, b)`` / ``GROUP BY CUBE(a, b)``.
        ClickHouse overrides to the trailing-modifier form
        (``GROUP BY a, b WITH ROLLUP``).

        When ``capabilities.supports_group_by_all`` is set and no grouping
        modifier is requested, emits ``GROUP BY ALL`` — the engine
        auto-derives the grouping list from the SELECT. Equivalent SQL
        with a much shorter and more idiomatic form on modern OLAP
        engines, especially for queries with computed dimensions.
        """
        if grouping == "rollup":
            groups = ", ".join(self.compile_expr(g) for g in group_by)
            return f"GROUP BY ROLLUP({groups})"
        if grouping == "cube":
            groups = ", ".join(self.compile_expr(g) for g in group_by)
            return f"GROUP BY CUBE({groups})"
        if self.capabilities.supports_group_by_all:
            return "GROUP BY ALL"
        groups = ", ".join(self.compile_expr(g) for g in group_by)
        return f"GROUP BY {groups}"

    def compile_from(self, node: From) -> str:
        """Render a FROM source: a parenthesised subquery or a table / CTE name.

        ``AS <alias>`` is appended when the node carries an alias.
        """
        source = node.source
        if isinstance(source, Select):
            rendered = f"(\n{self.compile_select(source)}\n)"
        else:
            rendered = self._render_source_string(source)
        if not node.alias:
            return rendered
        return f"{rendered} AS {self.quote_identifier(node.alias)}"

    def compile_join(self, node: Join) -> str:
        """Render one JOIN clause: ``<type> JOIN <source> [AS alias] [ON cond]``.

        The source is a parenthesised subquery when it is a ``Select`` node,
        otherwise a table / CTE name string.
        """
        source = node.source
        if isinstance(source, Select):
            target = f"(\n{self.compile_select(source)}\n)"
        else:
            target = self._render_source_string(source)
        if node.alias:
            target = f"{target} AS {self.quote_identifier(node.alias)}"

        join_sql = f"{node.join_type.value} JOIN {target}"
        if not node.on:
            return join_sql
        return f"{join_sql} ON {self.compile_expr(node.on)}"

    def _render_source_string(self, source: str) -> str:
        """Render a ``From``/``Join`` string source.

        Wrap modules emit bare CTE names (e.g. ``base``); the star/CFL
        planners emit pre-quoted qualified table strings (e.g.
        ``"DB"."SCHEMA"."TABLE"``). Quote the former so case-sensitive
        dialects like Snowflake match the CTE declaration; pass the latter
        through unchanged.
        """
        if source.isidentifier():
            return self.quote_identifier(source)
        return source

    def compile_order_by(self, node: OrderByItem) -> str:
        result = self.compile_expr(node.expr)
        if node.desc:
            result += " DESC"
        else:
            result += " ASC"
        if node.nulls_last is True:
            result += " NULLS LAST"
        elif node.nulls_last is False:
            result += " NULLS FIRST"
        return result

    def compile_union_all(self, node: UnionAll) -> str:
        """Compile a UNION ALL of multiple SELECT statements."""
        return "\nUNION ALL\n".join(self.compile_select(q) for q in node.queries)

    def compile_except(self, node: Except) -> str:
        """Compile an EXCEPT of two SELECT statements."""
        return self.compile_select(node.left) + "\nEXCEPT\n" + self.compile_select(node.right)

    def compile_expr(self, expr: Expr, _parent_prec: int = 0) -> str:
        """Compile an expression node to SQL string.

        ``_parent_prec`` is the precedence of the surrounding operator
        (or ``_CLAUSE_ROOT_PREC = 0`` when called at the root of a SELECT
        projection, ON / WHERE / HAVING clause, GROUP BY / ORDER BY item,
        or function argument). Each operator branch wraps its own SQL in
        ``(...)`` only when its precedence is strictly less than the
        parent's required level; atoms (literals, column refs, function
        calls, CAST, CASE) are at ``_PREC_ATOM`` and never wrap.

        Pre-v2.7.4 every ``BinaryOp`` / ``IsNull`` / ``InList`` /
        ``Between`` / ``UnaryOp`` wrapped itself unconditionally,
        producing deeply-nested unreadable SQL — issue #79.
        """
        match expr:
            case Literal(value=None):
                return "NULL"
            case Literal(value=True):
                return "TRUE"
            case Literal(value=False):
                return "FALSE"
            case Literal(value=v) if isinstance(v, str):
                escaped = v.replace("'", "''")
                return f"'{escaped}'"
            case Literal(value=v):
                return str(v)
            case Star(table=None):
                return "*"
            case Star(table=t) if t is not None:
                return f"{self.quote_identifier(t)}.*"
            case ColumnRef(name=name, table=None):
                return self.quote_identifier(name)
            case ColumnRef(name=name, table=table) if table is not None:
                return f"{self.quote_identifier(table)}.{self.quote_identifier(name)}"
            case AliasedExpr(expr=inner, alias=alias):
                return f"{self.compile_expr(inner)} AS {self.quote_identifier(alias)}"
            case FunctionCall(
                name=fname,
                args=args,
                distinct=distinct,
                order_by=order_by,
                separator=separator,
            ):
                # Reject aggregations explicitly listed as unsupported by the dialect.
                # Per-function overrides (_compile_mode etc.) still apply for cases
                # that have a special compile path; this catches plain aggregates
                # like REGR_SLOPE that have no override.
                self._check_aggregation_supported(fname)
                # LISTAGG: dialect-specific rendering
                if fname.upper() == "LISTAGG":
                    return self._compile_listagg(args, distinct, order_by, separator)
                # MODE: dialect-specific rendering
                if fname.upper() == "MODE":
                    return self._compile_mode(args)
                # MEDIAN: dialect-specific rendering
                if fname.upper() == "MEDIAN":
                    return self._compile_median(args)
                # Multi-field COUNT: concatenate fields for portability
                # (Snowflake overrides to use native multi-arg syntax)
                if fname.upper() == "COUNT" and len(args) > 1:
                    return self._compile_multi_field_count(args, distinct)
                fname = self._map_function_name(fname)
                args_sql = ", ".join(self.compile_expr(a) for a in args)
                if distinct:
                    return f"{fname}(DISTINCT {args_sql})"
                return f"{fname}({args_sql})"
            case BinaryOp(left=left, op=op, right=right):
                self_prec = self._binary_op_precedence(op)
                sql = self._compile_binary_op(left, op, right)
                return self._wrap_if_lower(sql, self_prec, _parent_prec)
            case UnaryOp(op=op, operand=operand):
                self_prec = self._PREC_NOT if op.upper() == "NOT" else self._PREC_UNARY
                sql = f"{op} {self.compile_expr(operand, _parent_prec=self_prec)}"
                return self._wrap_if_lower(sql, self_prec, _parent_prec)
            case IsNull(expr=inner, negated=False):
                sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} IS NULL"
                return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
            case IsNull(expr=inner, negated=True):
                sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} IS NOT NULL"
                return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
            case InList(expr=inner, values=values, negated=negated):
                vals = ", ".join(self.compile_expr(v) for v in values)
                op = "NOT IN" if negated else "IN"
                sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} {op} ({vals})"
                return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
            case CaseExpr(when_clauses=whens, else_clause=else_):
                parts = ["CASE"]
                for when_cond, then_val in whens:
                    parts.append(
                        f"WHEN {self.compile_expr(when_cond)} THEN {self.compile_expr(then_val)}"
                    )
                if else_ is not None:
                    parts.append(f"ELSE {self.compile_expr(else_)}")
                parts.append("END")
                return " ".join(parts)
            case Cast(expr=inner, type_name=type_name):
                return self._compile_cast(inner, type_name)
            case SubqueryExpr(query=query):
                return f"(\n{self.compile_select(query)}\n)"
            case Exists(subquery=subq, negated=False):
                return f"EXISTS (\n{self.compile_select(subq)}\n)"
            case Exists(subquery=subq, negated=True):
                return f"NOT EXISTS (\n{self.compile_select(subq)}\n)"
            case RawSQL(sql=sql):
                return sql
            case Between(expr=inner, low=low, high=high, negated=negated):
                op = "NOT BETWEEN" if negated else "BETWEEN"
                inner_sql = self.compile_expr(inner, _parent_prec=self._PREC_CMP)
                low_sql = self.compile_expr(low, _parent_prec=self._PREC_CMP)
                high_sql = self.compile_expr(high, _parent_prec=self._PREC_CMP)
                sql = f"{inner_sql} {op} {low_sql} AND {high_sql}"
                return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
            case RegexMatch(column=column, pattern=pattern, negated=negated):
                return self.compile_regex_match(column, pattern, negated=negated)
            case RelativeDateRange(
                column=column,
                unit=unit,
                count=count,
                direction=direction,
                include_current=include_current,
            ):
                return self.compile_relative_date_range(
                    column=column,
                    unit=unit,
                    count=count,
                    direction=direction,
                    include_current=include_current,
                )
            case WindowFunction(
                func_name=fname,
                args=args,
                partition_by=partition_by,
                order_by=order_by,
                frame=frame,
                distinct=distinct,
            ):
                args_sql = ", ".join(self.compile_expr(a) for a in args)
                func_sql = f"{fname}(DISTINCT {args_sql})" if distinct else f"{fname}({args_sql})"
                over_parts: list[str] = []
                if partition_by:
                    pb = ", ".join(self.compile_expr(p) for p in partition_by)
                    over_parts.append(f"PARTITION BY {pb}")
                if order_by:
                    ob = ", ".join(self.compile_order_by(o) for o in order_by)
                    over_parts.append(f"ORDER BY {ob}")
                if frame is not None:
                    over_parts.append(f"{frame.mode} BETWEEN {frame.start} AND {frame.end}")
                over_clause = " ".join(over_parts)
                return f"{func_sql} OVER ({over_clause})"
            case _:
                raise ValueError(f"Unknown AST node type: {type(expr).__name__}")

    def compile_regex_match(self, column: Expr, pattern: str, *, negated: bool) -> str:
        """Render a regex predicate, ``REGEXP_LIKE(col, 'pattern')`` by default.

        Dialects with a different syntax override this (Postgres ``~``, MySQL
        ``REGEXP``, ClickHouse ``match`` etc.).

        ``pattern`` is a plain Python ``str`` (``RegexMatch.pattern``) and is
        rendered as a SQL string literal. When ``negated`` is true the
        predicate is prefixed with ``NOT``.
        """
        column_sql = self.compile_expr(column)
        pattern_sql = self.compile_expr(Literal.string(pattern))
        predicate = f"REGEXP_LIKE({column_sql}, {pattern_sql})"
        if negated:
            return f"NOT {predicate}"
        return predicate

    def compile_relative_date_range(
        self,
        column: Expr,
        unit: str,
        count: int,
        direction: str,
        include_current: bool,
    ) -> str:
        """Compile a relative date range predicate to SQL."""
        col_sql = self.compile_expr(column)
        base = self.current_date_sql()

        if direction == "future":
            start = base if include_current else self.date_add_sql(base, "day", 1)
            end = self.date_add_sql(start, unit, count)
        else:
            end = self.date_add_sql(base, "day", 1) if include_current else base
            start = self.date_add_sql(end, unit, -count)

        return f"({col_sql} >= {start} AND {col_sql} < {end})"

`render_obml_type(obml_type)` ¶

Render an OBMLType to a dialect-specific SQL type string.

Handles precision clamping for decimal types.

Source code in src/orionbelt/dialect/base.py

def render_obml_type(self, obml_type: OBMLType) -> str:
    """Return the SQL type name this dialect uses for ``obml_type``.

    Decimal types render as ``DECIMAL(p, s)`` with the precision capped at
    ``_MAX_DECIMAL_PRECISION`` and the scale capped at the resulting
    precision. Any other type is looked up by name in
    ``_OBML_SIMPLE_TYPE_MAP``; unmapped names are emitted upper-cased.
    """
    if not isinstance(obml_type, DecimalType):
        type_name = obml_type.name
        return self._OBML_SIMPLE_TYPE_MAP.get(type_name, type_name.upper())
    precision = min(obml_type.precision, self._MAX_DECIMAL_PRECISION)
    scale = min(obml_type.scale, precision)
    return f"DECIMAL({precision}, {scale})"

`cast_to_obml_type(expr, obml_type)` ¶

Build an Expr that coerces expr to the given OBML type.

Default form is a plain CAST(expr AS <type>). Dialects whose CAST doesn't accept a parameterized decimal (notably BigQuery — "Parameterized types are not allowed in CAST expressions") can override to wrap the cast with a ROUND to honour the user-specified scale.

Source code in src/orionbelt/dialect/base.py

def cast_to_obml_type(self, expr: Expr, obml_type: OBMLType) -> Expr:
    """Return an AST node that coerces ``expr`` to ``obml_type``.

    The base implementation builds a ``Cast`` node whose type name comes
    from ``render_obml_type``, i.e. a plain ``CAST(expr AS <type>)``.
    Dialects whose ``CAST`` rejects parameterized decimals (notably
    BigQuery — "Parameterized types are not allowed in CAST expressions")
    can override this to wrap the cast in a ROUND that honours the
    user-specified scale.
    """
    sql_type = self.render_obml_type(obml_type)
    return Cast(expr=expr, type_name=sql_type)

`format_table_ref(database, schema, code)` ¶

Format a fully-qualified table reference.

Default: three-part database.schema.code (Snowflake/Databricks/Dremio). Postgres and ClickHouse override to two-part naming. All components are quoted to prevent SQL injection.

Source code in src/orionbelt/dialect/base.py

def format_table_ref(self, database: str, schema: str, code: str) -> str:
    """Build a fully-qualified, quoted table reference.

    The default is the three-part form ``database.schema.code``
    (Snowflake/Databricks/Dremio); Postgres and ClickHouse override this
    with two-part naming. Every part goes through ``quote_identifier`` so
    that no component can inject SQL.
    """
    database_sql = self.quote_identifier(database)
    schema_sql = self.quote_identifier(schema)
    table_sql = self.quote_identifier(code)
    return f"{database_sql}.{schema_sql}.{table_sql}"

`quote_identifier(name)` `abstractmethod` ¶

Quote an identifier per dialect rules.

Source code in src/orionbelt/dialect/base.py

@abstractmethod
def quote_identifier(self, name: str) -> str:
    """Quote an identifier per dialect rules.

    The base class routes table-reference parts, CTE names, aliases, and
    the column / table names of column references through this method.
    """

`render_time_grain(column, grain)` `abstractmethod` ¶

Wrap a column expression for the given time grain.

Source code in src/orionbelt/dialect/base.py

@abstractmethod
def render_time_grain(self, column: Expr, grain: TimeGrain) -> Expr:
    """Wrap a column expression for the given time grain.

    Works at the AST level: takes an ``Expr`` and returns an ``Expr``,
    not a SQL string.
    """

`render_cast(expr, target_type)` `abstractmethod` ¶

Render a CAST expression.

Source code in src/orionbelt/dialect/base.py

@abstractmethod
def render_cast(self, expr: Expr, target_type: str) -> Expr:
    """Render a CAST expression.

    Works at the AST level: takes an ``Expr`` and returns an ``Expr``,
    not a SQL string.
    """

`current_date_sql()` `abstractmethod` ¶

Return SQL for the current date.

Source code in src/orionbelt/dialect/base.py

@abstractmethod
def current_date_sql(self) -> str:
    """Return SQL for the current date.

    ``compile_relative_date_range`` uses the result as its anchor date.
    """

`date_add_sql(date_sql, unit, count)` `abstractmethod` ¶

Return SQL that adds count units to date_sql.

Source code in src/orionbelt/dialect/base.py

@abstractmethod
def date_add_sql(self, date_sql: str, unit: str, count: int) -> str:
    """Return SQL that adds count units to date_sql.

    ``count`` may be negative: ``compile_relative_date_range`` passes
    ``-count`` to step backwards, and also calls this with unit ``"day"``.
    """

`render_date_trunc_sql(column_sql, grain)` `abstractmethod` ¶

Return SQL string that truncates a date/timestamp to the given grain.

String-level helper (not AST) for use in raw SQL CTEs like date_range.

Source code in src/orionbelt/dialect/base.py

@abstractmethod
def render_date_trunc_sql(self, column_sql: str, grain: str) -> str:
    """Return SQL string that truncates a date/timestamp to the given grain.

    String-level helper (not AST) for use in raw SQL CTEs like date_range:
    both ``column_sql`` and the return value are SQL text.
    """

`render_date_spine_cte_sql(min_date, max_date, grain, offset, offset_grain)` `abstractmethod` ¶

Return the SQL body for a date spine CTE.

Must produce two columns: spine_date and spine_date_prev. spine_date_prev is NULL when the offset date falls before min_date.

Parameters¶

- `min_date` (str): SQL expression referencing the minimum date (e.g. `date_range.min_date`).
- `max_date` (str): SQL expression referencing the maximum date.
- `grain` (str): Time grain string: `day`, `week`, `month`, `quarter`, `year`.
- `offset` (int): Signed period offset (e.g. `-1` for previous period).
- `offset_grain` (str): Grain of the offset (e.g. `year` for YoY).

Source code in src/orionbelt/dialect/base.py

@abstractmethod
def render_date_spine_cte_sql(
    self,
    min_date: str,
    max_date: str,
    grain: str,
    offset: int,
    offset_grain: str,
) -> str:
    """Return the SQL body for a date spine CTE.

    Must produce two columns: ``spine_date`` and ``spine_date_prev``.
    ``spine_date_prev`` is NULL when the offset date falls before min_date.

    Parameters
    ----------
    min_date : str
        SQL expression referencing the minimum date (e.g. ``date_range.min_date``).
    max_date : str
        SQL expression referencing the maximum date.
    grain : str
        Time grain string: ``day``, ``week``, ``month``, ``quarter``, ``year``.
    offset : int
        Signed period offset (e.g. ``-1`` for previous period).
    offset_grain : str
        Grain of the offset (e.g. ``year`` for YoY).

    Returns
    -------
    str
        SQL text of the CTE body (string-level SQL, not an AST node).
    """

`render_string_contains(column, pattern)` ¶

Default: column LIKE '%' || pattern || '%'.

Source code in src/orionbelt/dialect/base.py

def render_string_contains(self, column: Expr, pattern: Expr) -> Expr:
    """Build the default "contains" predicate.

    Produces the AST for ``column LIKE '%' || pattern || '%'``: the pattern
    is wrapped in ``%`` wildcards through two left-nested ``||``
    concatenations, and the result becomes the right operand of ``LIKE``.
    """
    leading_wildcard = Literal.string("%")
    prefixed_pattern = BinaryOp(left=leading_wildcard, op="||", right=pattern)
    trailing_wildcard = Literal.string("%")
    like_operand = BinaryOp(left=prefixed_pattern, op="||", right=trailing_wildcard)
    return BinaryOp(left=column, op="LIKE", right=like_operand)

`render_decimal_division_sql(left_sql, right_sql)` ¶

Render left / right for decimal-typed operands, given raw SQL.

Used by code paths that build division as string SQL (e.g. PoP comparison CTEs) rather than as BinaryOp AST nodes. Default is plain SQL division; ClickHouse overrides to widen both sides to Decimal(38, 10) first so ratio precision survives.

Source code in src/orionbelt/dialect/base.py

def render_decimal_division_sql(self, left_sql: str, right_sql: str) -> str:
    """Render ``left / right`` from two raw SQL operand strings.

    Intended for code paths that assemble division as text (e.g. PoP
    comparison CTEs) instead of ``BinaryOp`` AST nodes. This default emits
    plain SQL division with the operands inserted as given; ClickHouse
    overrides it to widen both sides to ``Decimal(38, 10)`` first so ratio
    precision survives.
    """
    numerator, denominator = left_sql, right_sql
    return f"{numerator} / {denominator}"

`compile(ast)` ¶

Render a complete SQL AST to a dialect-specific string.

Source code in src/orionbelt/dialect/base.py

def compile(self, ast: Select) -> str:
    """Turn a complete SQL AST into this dialect's SQL text.

    Thin entry point: the work is done by ``compile_select``.
    """
    rendered_sql = self.compile_select(ast)
    return rendered_sql

`compile_select(node)` ¶

Compile a SELECT statement.

Source code in src/orionbelt/dialect/base.py

def compile_select(self, node: Select) -> str:
    """Compile a SELECT statement.

    Clauses are rendered in SQL order (WITH, SELECT, FROM, JOINs, WHERE,
    GROUP BY, HAVING, ORDER BY, LIMIT, OFFSET), each on its own line, and
    clauses that are absent on ``node`` are omitted.
    """
    clauses: list[str] = []

    # WITH: each CTE body is rendered according to the type of its query node.
    if node.ctes:
        rendered_ctes = []
        for cte in node.ctes:
            query = cte.query
            if isinstance(query, RawSQL):
                body = query.sql
            elif isinstance(query, UnionAll):
                body = self.compile_union_all(query)
            elif isinstance(query, Except):
                body = self.compile_except(query)
            else:
                body = self.compile_select(query)
            rendered_ctes.append(f"{self.quote_identifier(cte.name)} AS (\n{body}\n)")
        clauses.append("WITH " + ",\n".join(rendered_ctes))

    # SELECT: an empty projection list is rendered as ``*``.
    select_keyword = "SELECT DISTINCT" if node.distinct else "SELECT"
    if node.columns:
        projection = ", ".join([self.compile_expr(column) for column in node.columns])
    else:
        projection = "*"
    clauses.append(f"{select_keyword} {projection}")

    # FROM
    if node.from_:
        clauses.append(f"FROM {self.compile_from(node.from_)}")

    # JOINs, in declaration order
    clauses.extend(self.compile_join(join) for join in node.joins)

    # WHERE
    if node.where:
        clauses.append(f"WHERE {self.compile_expr(node.where)}")

    # GROUP BY (the grouping modifier is handled by compile_group_by)
    if node.group_by:
        clauses.append(self.compile_group_by(node.group_by, node.grouping))

    # HAVING
    if node.having:
        clauses.append(f"HAVING {self.compile_expr(node.having)}")

    # ORDER BY
    if node.order_by:
        ordering = ", ".join([self.compile_order_by(item) for item in node.order_by])
        clauses.append(f"ORDER BY {ordering}")

    # LIMIT / OFFSET: compared against None so that 0 is still emitted.
    if node.limit is not None:
        clauses.append(f"LIMIT {node.limit}")
    if node.offset is not None:
        clauses.append(f"OFFSET {node.offset}")

    return "\n".join(clauses)

`compile_group_by(group_by, grouping)` ¶

Render the GROUP BY clause.

Default ANSI form (Postgres, Snowflake, DuckDB, BigQuery, Databricks, Dremio, MySQL): GROUP BY ROLLUP(a, b) / GROUP BY CUBE(a, b). ClickHouse overrides to the trailing-modifier form (GROUP BY a, b WITH ROLLUP).

When capabilities.supports_group_by_all is set and no grouping modifier is requested, emits GROUP BY ALL — the engine auto-derives the grouping list from the SELECT. Equivalent SQL with a much shorter and more idiomatic form on modern OLAP engines, especially for queries with computed dimensions.

Source code in src/orionbelt/dialect/base.py

def compile_group_by(self, group_by: list[Expr], grouping: str | None) -> str:
    """Render the GROUP BY clause.

    ``grouping`` selects the form:

    * ``"rollup"`` / ``"cube"`` use the ANSI wrapper form
      ``GROUP BY ROLLUP(a, b)`` / ``GROUP BY CUBE(a, b)`` (Postgres,
      Snowflake, DuckDB, BigQuery, Databricks, Dremio, MySQL). ClickHouse
      overrides this method to emit the trailing-modifier form
      (``GROUP BY a, b WITH ROLLUP``).
    * Any other value with ``capabilities.supports_group_by_all`` set emits
      ``GROUP BY ALL``, letting the engine derive the grouping list from the
      SELECT. It is equivalent SQL in a shorter, more idiomatic form on
      modern OLAP engines, especially with computed dimensions.
    * Otherwise the explicit list ``GROUP BY a, b`` is emitted.
    """
    if grouping == "rollup":
        modifier = "ROLLUP"
    elif grouping == "cube":
        modifier = "CUBE"
    else:
        modifier = None

    # GROUP BY ALL never needs the individual expressions, so decide on it
    # before compiling any of them.
    if modifier is None and self.capabilities.supports_group_by_all:
        return "GROUP BY ALL"

    rendered = ", ".join([self.compile_expr(item) for item in group_by])
    if modifier is None:
        return f"GROUP BY {rendered}"
    return f"GROUP BY {modifier}({rendered})"

`compile_union_all(node)` ¶

Compile a UNION ALL of multiple SELECT statements.

Source code in src/orionbelt/dialect/base.py

def compile_union_all(self, node: UnionAll) -> str:
    """Compile a UNION ALL node.

    Every query in ``node.queries`` is compiled as a SELECT and the results
    are joined, in order, with ``UNION ALL`` on a line of its own.
    """
    compiled_queries = [self.compile_select(query) for query in node.queries]
    return "\nUNION ALL\n".join(compiled_queries)

`compile_except(node)` ¶

Compile an EXCEPT of two SELECT statements.

Source code in src/orionbelt/dialect/base.py

def compile_except(self, node: Except) -> str:
    """Compile an EXCEPT node.

    The left and right SELECTs are compiled in that order and separated by
    ``EXCEPT`` on a line of its own.
    """
    left_sql = self.compile_select(node.left)
    right_sql = self.compile_select(node.right)
    return "\nEXCEPT\n".join((left_sql, right_sql))

`compile_expr(expr, _parent_prec=0)` ¶

Compile an expression node to SQL string.

_parent_prec is the precedence of the surrounding operator (or _CLAUSE_ROOT_PREC = 0 when called at the root of a SELECT projection, ON / WHERE / HAVING clause, GROUP BY / ORDER BY item, or function argument). Each operator branch wraps its own SQL in (...) only when its precedence is strictly less than the parent's required level; atoms (literals, column refs, function calls, CAST, CASE) are at _PREC_ATOM and never wrap.

Pre-v2.7.4 every BinaryOp / IsNull / InList / Between / UnaryOp wrapped itself unconditionally, producing deeply-nested unreadable SQL — issue #79.

Source code in src/orionbelt/dialect/base.py

def compile_expr(self, expr: Expr, _parent_prec: int = 0) -> str:
    """Compile an expression node to SQL string.

    ``_parent_prec`` is the precedence of the surrounding operator
    (or ``_CLAUSE_ROOT_PREC = 0`` when called at the root of a SELECT
    projection, ON / WHERE / HAVING clause, GROUP BY / ORDER BY item,
    or function argument). Each operator branch wraps its own SQL in
    ``(...)`` only when its precedence is strictly less than the
    parent's required level; atoms (literals, column refs, function
    calls, CAST, CASE) are at ``_PREC_ATOM`` and never wrap.

    Pre-v2.7.4 every ``BinaryOp`` / ``IsNull`` / ``InList`` /
    ``Between`` / ``UnaryOp`` wrapped itself unconditionally,
    producing deeply-nested unreadable SQL — issue #79.

    Raises
    ------
    ValueError
        If ``expr`` is not one of the AST node types handled below.
    """
    match expr:
        # --- Literals. ``None`` / ``True`` / ``False`` patterns match by
        # identity, so they must precede the generic ``Literal`` cases.
        case Literal(value=None):
            return "NULL"
        case Literal(value=True):
            return "TRUE"
        case Literal(value=False):
            return "FALSE"
        case Literal(value=v) if isinstance(v, str):
            # Embedded single quotes are escaped by doubling them.
            escaped = v.replace("'", "''")
            return f"'{escaped}'"
        case Literal(value=v):
            # Any remaining literal value is rendered through ``str``.
            return str(v)
        # --- Column-like atoms.
        case Star(table=None):
            return "*"
        case Star(table=t) if t is not None:
            return f"{self.quote_identifier(t)}.*"
        case ColumnRef(name=name, table=None):
            return self.quote_identifier(name)
        case ColumnRef(name=name, table=table) if table is not None:
            return f"{self.quote_identifier(table)}.{self.quote_identifier(name)}"
        case AliasedExpr(expr=inner, alias=alias):
            # The aliased expression is compiled at root precedence (default 0).
            return f"{self.compile_expr(inner)} AS {self.quote_identifier(alias)}"
        case FunctionCall(
            name=fname,
            args=args,
            distinct=distinct,
            order_by=order_by,
            separator=separator,
        ):
            # Reject aggregations explicitly listed as unsupported by the dialect.
            # Per-function overrides (_compile_mode etc.) still apply for cases
            # that have a special compile path; this catches plain aggregates
            # like REGR_SLOPE that have no override.
            self._check_aggregation_supported(fname)
            # LISTAGG: dialect-specific rendering
            if fname.upper() == "LISTAGG":
                return self._compile_listagg(args, distinct, order_by, separator)
            # MODE: dialect-specific rendering
            if fname.upper() == "MODE":
                return self._compile_mode(args)
            # MEDIAN: dialect-specific rendering
            if fname.upper() == "MEDIAN":
                return self._compile_median(args)
            # Multi-field COUNT: concatenate fields for portability
            # (Snowflake overrides to use native multi-arg syntax)
            if fname.upper() == "COUNT" and len(args) > 1:
                return self._compile_multi_field_count(args, distinct)
            # Generic call: map the name for the dialect, then render the
            # arguments at root precedence.
            fname = self._map_function_name(fname)
            args_sql = ", ".join(self.compile_expr(a) for a in args)
            if distinct:
                return f"{fname}(DISTINCT {args_sql})"
            return f"{fname}({args_sql})"
        # --- Operators: each computes its own precedence and is wrapped in
        # parentheses by ``_wrap_if_lower`` only when the parent requires it.
        case BinaryOp(left=left, op=op, right=right):
            self_prec = self._binary_op_precedence(op)
            sql = self._compile_binary_op(left, op, right)
            return self._wrap_if_lower(sql, self_prec, _parent_prec)
        case UnaryOp(op=op, operand=operand):
            # NOT has its own precedence level; other unary operators share one.
            self_prec = self._PREC_NOT if op.upper() == "NOT" else self._PREC_UNARY
            sql = f"{op} {self.compile_expr(operand, _parent_prec=self_prec)}"
            return self._wrap_if_lower(sql, self_prec, _parent_prec)
        case IsNull(expr=inner, negated=False):
            sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} IS NULL"
            return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
        case IsNull(expr=inner, negated=True):
            sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} IS NOT NULL"
            return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
        case InList(expr=inner, values=values, negated=negated):
            # List members are compiled at root precedence; only the tested
            # expression is compiled at comparison precedence.
            vals = ", ".join(self.compile_expr(v) for v in values)
            op = "NOT IN" if negated else "IN"
            sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} {op} ({vals})"
            return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
        case CaseExpr(when_clauses=whens, else_clause=else_):
            # CASE ... END delimits itself, so its parts compile at root
            # precedence and the result is never wrapped.
            parts = ["CASE"]
            for when_cond, then_val in whens:
                parts.append(
                    f"WHEN {self.compile_expr(when_cond)} THEN {self.compile_expr(then_val)}"
                )
            if else_ is not None:
                parts.append(f"ELSE {self.compile_expr(else_)}")
            parts.append("END")
            return " ".join(parts)
        case Cast(expr=inner, type_name=type_name):
            return self._compile_cast(inner, type_name)
        case SubqueryExpr(query=query):
            return f"(\n{self.compile_select(query)}\n)"
        case Exists(subquery=subq, negated=False):
            return f"EXISTS (\n{self.compile_select(subq)}\n)"
        case Exists(subquery=subq, negated=True):
            return f"NOT EXISTS (\n{self.compile_select(subq)}\n)"
        case RawSQL(sql=sql):
            # Raw SQL is emitted verbatim: no quoting, escaping, or wrapping.
            return sql
        case Between(expr=inner, low=low, high=high, negated=negated):
            op = "NOT BETWEEN" if negated else "BETWEEN"
            inner_sql = self.compile_expr(inner, _parent_prec=self._PREC_CMP)
            low_sql = self.compile_expr(low, _parent_prec=self._PREC_CMP)
            high_sql = self.compile_expr(high, _parent_prec=self._PREC_CMP)
            sql = f"{inner_sql} {op} {low_sql} AND {high_sql}"
            return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
        case RegexMatch(column=column, pattern=pattern, negated=negated):
            # Returned as-is; ``_parent_prec`` is not consulted for this node.
            return self.compile_regex_match(column, pattern, negated=negated)
        case RelativeDateRange(
            column=column,
            unit=unit,
            count=count,
            direction=direction,
            include_current=include_current,
        ):
            return self.compile_relative_date_range(
                column=column,
                unit=unit,
                count=count,
                direction=direction,
                include_current=include_current,
            )
        case WindowFunction(
            func_name=fname,
            args=args,
            partition_by=partition_by,
            order_by=order_by,
            frame=frame,
            distinct=distinct,
        ):
            # NOTE(review): unlike FunctionCall, the function name here is not
            # passed through _check_aggregation_supported or
            # _map_function_name — confirm that this is intentional.
            args_sql = ", ".join(self.compile_expr(a) for a in args)
            func_sql = f"{fname}(DISTINCT {args_sql})" if distinct else f"{fname}({args_sql})"
            over_parts: list[str] = []
            if partition_by:
                pb = ", ".join(self.compile_expr(p) for p in partition_by)
                over_parts.append(f"PARTITION BY {pb}")
            if order_by:
                ob = ", ".join(self.compile_order_by(o) for o in order_by)
                over_parts.append(f"ORDER BY {ob}")
            if frame is not None:
                over_parts.append(f"{frame.mode} BETWEEN {frame.start} AND {frame.end}")
            # With no partition, ordering, or frame this yields ``OVER ()``.
            over_clause = " ".join(over_parts)
            return f"{func_sql} OVER ({over_clause})"
        case _:
            raise ValueError(f"Unknown AST node type: {type(expr).__name__}")

`compile_regex_match(column, pattern, *, negated)` ¶

Compile a regex predicate. Default uses REGEXP_LIKE — overridden per dialect that needs a different syntax (Postgres ~, MySQL REGEXP, ClickHouse match etc.).

The pattern is rendered as a SQL string literal; callers pass it as RegexMatch.pattern (already a Python str).

Source code in src/orionbelt/dialect/base.py

def compile_regex_match(self, column: Expr, pattern: str, *, negated: bool) -> str:
    """Compile a regular-expression predicate.

    This default renders ``REGEXP_LIKE(column, 'pattern')``, prefixed with
    ``NOT`` when ``negated`` is true. Dialects that need a different syntax
    (Postgres ``~``, MySQL ``REGEXP``, ClickHouse ``match`` etc.) override
    this method.

    ``pattern`` is the Python ``str`` carried by ``RegexMatch.pattern``; it
    is rendered as a SQL string literal.
    """
    column_sql = self.compile_expr(column)
    pattern_sql = self.compile_expr(Literal.string(pattern))
    predicate = f"REGEXP_LIKE({column_sql}, {pattern_sql})"
    if negated:
        return f"NOT {predicate}"
    return predicate

`compile_relative_date_range(column, unit, count, direction, include_current)` ¶

Compile a relative date range predicate to SQL.

Source code in src/orionbelt/dialect/base.py

def compile_relative_date_range(
    self,
    column: Expr,
    unit: str,
    count: int,
    direction: str,
    include_current: bool,
) -> str:
    """Compile a relative date range predicate to SQL.

    The result is a parenthesised half-open interval,
    ``(col >= start AND col < end)``. For ``direction == "future"`` the
    interval starts at the anchor and extends ``count`` units forward; for
    any other direction it ends at the anchor and reaches ``count`` units
    back. The anchor is the current date or the day after it, depending on
    ``include_current``.
    """
    column_sql = self.compile_expr(column)
    today = self.current_date_sql()

    looking_forward = direction == "future"
    # A future range that excludes today starts tomorrow; a past range that
    # includes today ends tomorrow (the upper bound is exclusive).
    anchor_is_tomorrow = bool(include_current) != looking_forward
    anchor = self.date_add_sql(today, "day", 1) if anchor_is_tomorrow else today

    if looking_forward:
        lower = anchor
        upper = self.date_add_sql(anchor, unit, count)
    else:
        upper = anchor
        lower = self.date_add_sql(anchor, unit, -count)

    return f"({column_sql} >= {lower} AND {column_sql} < {upper})"

`orionbelt.dialect.base.DialectCapabilities` `dataclass` ¶

Flags indicating what SQL features a dialect supports.

Source code in src/orionbelt/dialect/base.py

@dataclass
class DialectCapabilities:
    """Flags indicating what SQL features a dialect supports.

    Every flag except ``supports_cte`` defaults to ``False``, so a dialect
    only has to opt in to the features it provides.
    ``unsupported_aggregations`` defaults to a fresh empty list per instance.
    """

    supports_cte: bool = True
    supports_qualify: bool = False
    supports_arrays: bool = False
    supports_window_filters: bool = False
    supports_ilike: bool = False
    supports_time_travel: bool = False
    supports_semi_structured: bool = False
    supports_union_all_by_name: bool = False
    # ``GROUP BY ALL`` (Snowflake 2022+, Databricks/Spark 3.4+, DuckDB 0.7+,
    # BigQuery, ClickHouse 22.6+) auto-derives the grouping list from the
    # SELECT clause. Functionally equivalent to the explicit list but much
    # shorter on queries with computed dimensions, where the explicit form
    # repeats the full expression. Postgres, MySQL, Dremio do not support it.
    # Consulted by ``compile_group_by`` when no grouping modifier is requested.
    supports_group_by_all: bool = False
    # Aggregations the dialect explicitly lists as unsupported; ``compile_expr``
    # runs ``_check_aggregation_supported`` on every ``FunctionCall`` name.
    unsupported_aggregations: list[str] = field(default_factory=list)

Dialect Registry¶

`orionbelt.dialect.registry.DialectRegistry` ¶

Registry for SQL dialect plugins.

Source code in src/orionbelt/dialect/registry.py

class DialectRegistry:
    """Name-indexed catalogue of SQL dialect plugin classes.

    The mapping is stored on the class itself, so registrations are shared
    by everything that uses the registry.
    """

    _dialects: dict[str, type[Dialect]] = {}

    @classmethod
    def register(cls, dialect_class: type[Dialect]) -> type[Dialect]:
        """Add ``dialect_class`` to the registry and return it unchanged.

        Returning the class makes the method usable as a class decorator.
        """
        # The dialect name is read from an instance, so build a throwaway one.
        dialect_name = dialect_class().name
        cls._dialects[dialect_name] = dialect_class
        return dialect_class

    @classmethod
    def get(cls, name: str) -> Dialect:
        """Return a new instance of the dialect registered under ``name``.

        Raises ``UnsupportedDialectError`` (carrying the available names)
        when no dialect is registered under that name.
        """
        if name in cls._dialects:
            return cls._dialects[name]()
        raise UnsupportedDialectError(name, available=cls.available())

    @classmethod
    def available(cls) -> list[str]:
        """Return the registered dialect names in sorted order."""
        return sorted(cls._dialects)

    @classmethod
    def reset(cls) -> None:
        """Forget every registered dialect (for testing)."""
        cls._dialects.clear()

`get(name)` `classmethod` ¶

Get an instance of the named dialect.

Source code in src/orionbelt/dialect/registry.py

@classmethod
def get(cls, name: str) -> Dialect:
    """Return a new instance of the dialect registered under ``name``.

    Raises ``UnsupportedDialectError`` (carrying the available names) when
    no dialect is registered under that name.
    """
    if name in cls._dialects:
        return cls._dialects[name]()
    raise UnsupportedDialectError(name, available=cls.available())

`available()` `classmethod` ¶

List registered dialect names.

Source code in src/orionbelt/dialect/registry.py

@classmethod
def available(cls) -> list[str]:
    """Return the registered dialect names in sorted order."""
    return sorted(cls._dialects)

`register(dialect_class)` `classmethod` ¶

Register a dialect class. Can be used as a decorator.

Source code in src/orionbelt/dialect/registry.py

@classmethod
def register(cls, dialect_class: type[Dialect]) -> type[Dialect]:
    """Add ``dialect_class`` to the registry and return it unchanged.

    Returning the class makes the method usable as a class decorator.
    """
    # The dialect name is read from an instance, so build a throwaway one.
    dialect_name = dialect_class().name
    cls._dialects[dialect_name] = dialect_class
    return dialect_class

YAML Parser¶

`orionbelt.parser.loader.TrackedLoader` ¶

YAML loader that tracks source positions for error reporting.

Uses ruamel.yaml which preserves line/column info on every parsed node.

Source code in src/orionbelt/parser/loader.py

class TrackedLoader:
    """Position-aware YAML loader used for error reporting.

    Parsing is delegated to ruamel.yaml, which preserves line/column info on
    every parsed node; those positions are collected into a ``SourceMap``
    next to the plain-Python form of the document.
    """

    def __init__(self) -> None:
        parser = YAML()
        parser.preserve_quotes = True
        # Duplicate keys (e.g. two columns with the same name) are rejected;
        # otherwise ruamel.yaml silently keeps only the last value.
        parser.allow_duplicate_keys = False
        # Cap nesting depth to mitigate stack-based DoS; ruamel.yaml raises
        # an error when nesting exceeds this limit.
        parser.max_depth = _MAX_DEPTH
        self._yaml = parser

    # -- safety checks -------------------------------------------------------

    @staticmethod
    def _check_yaml_safety(content: str) -> None:
        """Validate raw YAML text before it is parsed.

        Raises ``YAMLSafetyError`` when the text is longer than the maximum
        document size or contains anchors/aliases (not used in OBML).
        """
        size = len(content)
        if size > _MAX_DOCUMENT_SIZE:
            raise YAMLSafetyError(
                f"YAML document exceeds maximum size "
                f"({size:,} chars > {_MAX_DOCUMENT_SIZE:,} limit)"
            )
        # Full-line comments are removed first so that "&name" inside a
        # comment (e.g. "# see R&D notes") is not mistaken for an anchor.
        without_comments = _COMMENT_LINE_RE.sub("", content)
        if _ANCHOR_RE.search(without_comments):
            raise YAMLSafetyError("YAML anchors/aliases are not supported in OBML")

    @staticmethod
    def _check_node_count(data: Any, limit: int = _MAX_NODE_COUNT) -> None:
        """Post-parse defense-in-depth: reject documents with too many nodes.

        Every visited value counts as one node (mapping keys are not counted);
        ``YAMLSafetyError`` is raised as soon as the count exceeds ``limit``.
        """
        pending: list[Any] = [data]
        visited = 0
        while pending:
            current = pending.pop()
            visited += 1
            if visited > limit:
                raise YAMLSafetyError(f"YAML document exceeds maximum node count ({limit:,})")
            if isinstance(current, dict):
                pending.extend(current.values())
            elif isinstance(current, list):
                pending.extend(current)

    # -- public loading API --------------------------------------------------

    def load(self, path: Path) -> tuple[dict[str, Any], SourceMap]:
        """Load a YAML file and return parsed dict + source position map."""
        with path.open("r", encoding="utf-8") as stream:
            text = stream.read()
        # Same pipeline as load_string, with the file path as the source name.
        return self.load_string(text, str(path))

    def load_string(
        self, content: str, filename: str = "<string>"
    ) -> tuple[dict[str, Any], SourceMap]:
        """Load YAML from a string.

        The text is safety-checked, parsed, checked for node count, and then
        converted to plain Python containers. An empty document yields an
        empty dict and an empty ``SourceMap``.
        """
        self._check_yaml_safety(content)
        parsed = self._yaml.load(content)
        if parsed is None:
            return {}, SourceMap()
        self._check_node_count(parsed)
        positions = SourceMap()
        self._extract_positions(parsed, filename, "", positions)
        plain = self._to_plain_dict(parsed)
        return plain, positions

    def _extract_positions(
        self,
        data: Any,
        filename: str,
        prefix: str,
        source_map: SourceMap,
    ) -> None:
        """Walk ruamel.yaml containers and record a span for each key/item path.

        Mapping keys extend the path as ``prefix.key`` and sequence items as
        ``prefix[i]``. Missing position info is tolerated (best effort).
        """
        if isinstance(data, CommentedMap):
            for key in data:
                child_path = f"{prefix}.{key}" if prefix else str(key)
                try:
                    # Position of this key, taken from the mapping's ``lc`` object.
                    key_position = data.lc.key(key)
                    if key_position:
                        row, column = key_position
                        source_map.add(
                            child_path,
                            SourceSpan(file=filename, line=row + 1, column=column + 1),
                        )
                except (AttributeError, KeyError, TypeError):
                    # No per-key position: fall back to the mapping's own position.
                    try:
                        map_lc = data.lc
                        source_map.add(
                            child_path,
                            SourceSpan(
                                file=filename,
                                line=map_lc.line + 1,
                                column=map_lc.col + 1,
                            ),
                        )
                    except (AttributeError, TypeError):
                        pass
                self._extract_positions(data[key], filename, child_path, source_map)
        elif isinstance(data, CommentedSeq):
            for index, element in enumerate(data):
                child_path = f"{prefix}[{index}]"
                try:
                    element_position = data.lc.item(index)
                    if element_position:
                        row, column = element_position
                        source_map.add(
                            child_path,
                            SourceSpan(file=filename, line=row + 1, column=column + 1),
                        )
                except (AttributeError, KeyError, TypeError):
                    pass
                self._extract_positions(element, filename, child_path, source_map)

    def _to_plain_dict(self, data: Any) -> dict[str, Any]:
        """Convert the document root to a plain dict with string keys.

        A root that is not a mapping yields an empty dict.
        """
        if not isinstance(data, (CommentedMap, dict)):
            return {}
        return {str(key): self._to_plain_value(value) for key, value in data.items()}

    def _to_plain_value(self, data: Any) -> Any:
        """Recursively replace ruamel.yaml containers with plain dict/list."""
        if isinstance(data, (CommentedMap, dict)):
            return {str(key): self._to_plain_value(value) for key, value in data.items()}
        if isinstance(data, (CommentedSeq, list)):
            return [self._to_plain_value(element) for element in data]
        return data

`load(path)` ¶

Load a YAML file and return parsed dict + source position map.

Source code in src/orionbelt/parser/loader.py

def load(self, path: Path) -> tuple[dict[str, Any], SourceMap]:
    """Load a YAML file and return parsed dict + source position map.

    The file is read as UTF-8 and handed to ``load_string``, which runs the
    same safety checks and parsing steps; the file path is used as the
    source name in recorded positions.
    """
    with path.open("r", encoding="utf-8") as stream:
        text = stream.read()
    return self.load_string(text, str(path))

`load_string(content, filename='<string>')` ¶

Load YAML from a string.

Source code in src/orionbelt/parser/loader.py

def load_string(
    self, content: str, filename: str = "<string>"
) -> tuple[dict[str, Any], SourceMap]:
    """Load YAML from a string.

    The text is safety-checked, parsed, checked for node count, and then
    converted to plain Python containers. ``filename`` labels the recorded
    source positions. An empty document yields an empty dict and an empty
    ``SourceMap``.
    """
    self._check_yaml_safety(content)
    parsed = self._yaml.load(content)
    if parsed is None:
        return {}, SourceMap()
    self._check_node_count(parsed)
    positions = SourceMap()
    self._extract_positions(parsed, filename, "", positions)
    plain = self._to_plain_dict(parsed)
    return plain, positions

Reference Resolver¶

`orionbelt.parser.resolver.ReferenceResolver` ¶

Resolves all references in a raw YAML model to a fully-typed SemanticModel.

Source code in src/orionbelt/parser/resolver.py

class ReferenceResolver:
    """Resolves all references in a raw YAML model to a fully-typed SemanticModel."""

    def resolve(
        self,
        raw: dict[str, Any],
        source_map: SourceMap | None = None,
    ) -> tuple[SemanticModel, ValidationResult]:
        """Resolve raw YAML dict into a validated SemanticModel.

        Returns (model, validation_result). If there are errors,
        the model may be partially populated.
        """
        errors: list[SemanticError] = []
        warnings: list[SemanticError] = []

        # Strict OBML: reject unknown top-level keys (catches typos like
        # ``dataObjekt:`` that would silently be dropped by ``raw.get(...)``).
        _check_unknown_keys(raw, _TOP_LEVEL_KEYS, "", errors, source_map)

        # Parse data objects
        data_objects: dict[str, DataObject] = {}
        raw_objects = raw.get("dataObjects", {})
        if not isinstance(raw_objects, dict):
            errors.append(
                SemanticError(
                    code="DATA_OBJECT_PARSE_ERROR",
                    message="'dataObjects' must be a YAML mapping, not a list or scalar",
                    path="dataObjects",
                )
            )
            raw_objects = {}
        for name, raw_obj in raw_objects.items():
            try:
                _check_unknown_keys(
                    raw_obj, _DATA_OBJECT_KEYS, f"dataObjects.{name}", errors, source_map
                )
                obj_columns: dict[str, DataObjectColumn] = {}
                for fname, fdata in raw_obj.get("columns", {}).items():
                    _check_unknown_keys(
                        fdata,
                        _DATA_OBJECT_COLUMN_KEYS,
                        f"dataObjects.{name}.columns.{fname}",
                        errors,
                        source_map,
                    )
                    obj_columns[fname] = DataObjectColumn(
                        label=fname,
                        code=fdata.get("code", fname if not fdata.get("expression") else ""),
                        abstract_type=fdata.get("abstractType", "string"),
                        sql_type=fdata.get("sqlType"),
                        sql_precision=fdata.get("sqlPrecision"),
                        sql_scale=fdata.get("sqlScale"),
                        num_class=fdata.get("numClass"),
                        primary_key=bool(fdata.get("primaryKey", False)),
                        description=fdata.get("description"),
                        comment=fdata.get("comment"),
                        owner=fdata.get("owner"),
                        expression=fdata.get("expression"),
                        synonyms=fdata.get("synonyms", []),
                        custom_extensions=_parse_extensions(fdata),
                    )

                obj_joins: list[DataObjectJoin] = []
                for ji, jdata in enumerate(raw_obj.get("joins", [])):
                    _check_unknown_keys(
                        jdata,
                        _DATA_OBJECT_JOIN_KEYS,
                        f"dataObjects.{name}.joins[{ji}]",
                        errors,
                        source_map,
                    )
                    obj_joins.append(
                        DataObjectJoin(
                            join_type=jdata["joinType"],
                            join_to=jdata["joinTo"],
                            columns_from=jdata["columnsFrom"],
                            columns_to=jdata["columnsTo"],
                            secondary=jdata.get("secondary", False),
                            path_name=jdata.get("pathName"),
                        )
                    )

                data_objects[name] = DataObject(
                    label=name,
                    code=raw_obj.get("code", ""),
                    database=raw_obj.get("database", ""),
                    schema_name=raw_obj.get("schema", ""),
                    columns=obj_columns,
                    joins=obj_joins,
                    description=raw_obj.get("description"),
                    comment=raw_obj.get("comment"),
                    owner=raw_obj.get("owner"),
                    countable=raw_obj.get("countable", True),
                    count_label=raw_obj.get("countLabel"),
                    synonyms=raw_obj.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_obj),
                    refresh=_parse_refresh(raw_obj.get("refresh"), name, errors),
                )
            except Exception as e:
                span = source_map.get(f"dataObjects.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="DATA_OBJECT_PARSE_ERROR",
                        message=f"Failed to parse data object '{name}': {e}",
                        path=f"dataObjects.{name}",
                        span=span,
                    )
                )

        # Parse dimensions
        dimensions: dict[str, Dimension] = {}
        raw_dims = raw.get("dimensions", {})
        if not isinstance(raw_dims, dict):
            errors.append(
                SemanticError(
                    code="DIMENSION_PARSE_ERROR",
                    message="'dimensions' must be a YAML mapping, not a list or scalar",
                    path="dimensions",
                )
            )
            raw_dims = {}
        for name, raw_dim in raw_dims.items():
            try:
                _check_unknown_keys(
                    raw_dim, _DIMENSION_KEYS, f"dimensions.{name}", errors, source_map
                )
                data_object = raw_dim.get("dataObject")
                column = raw_dim.get("column")

                # Validate the data object exists
                if data_object and data_object not in data_objects:
                    span = source_map.get(f"dimensions.{name}") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_DATA_OBJECT",
                            message=(
                                f"Dimension '{name}' references unknown data object '{data_object}'"
                            ),
                            path=f"dimensions.{name}",
                            span=span,
                            suggestions=_suggest_similar(data_object, list(data_objects.keys())),
                        )
                    )

                # Validate the column exists in the data object
                if (
                    data_object
                    and column
                    and data_object in data_objects
                    and column not in data_objects[data_object].columns
                ):
                    span = source_map.get(f"dimensions.{name}") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_COLUMN",
                            message=(
                                f"Dimension '{name}' references unknown column "
                                f"'{column}' in data object '{data_object}'"
                            ),
                            path=f"dimensions.{name}",
                            span=span,
                            suggestions=_suggest_similar(
                                column, list(data_objects[data_object].columns.keys())
                            ),
                        )
                    )

                via = raw_dim.get("via")
                if via and via not in data_objects:
                    span = source_map.get(f"dimensions.{name}") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_DATA_OBJECT",
                            message=(
                                f"Dimension '{name}' via references unknown data object '{via}'"
                            ),
                            path=f"dimensions.{name}",
                            span=span,
                            suggestions=_suggest_similar(via, list(data_objects.keys())),
                        )
                    )

                dimensions[name] = Dimension(
                    label=name,
                    view=data_object or "",
                    column=column or "",
                    result_type=raw_dim.get("resultType", "string"),
                    time_grain=raw_dim.get("timeGrain"),
                    via=via,
                    description=raw_dim.get("description"),
                    format=raw_dim.get("format"),
                    owner=raw_dim.get("owner"),
                    synonyms=raw_dim.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_dim),
                )
            except Exception as e:
                span = source_map.get(f"dimensions.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="DIMENSION_PARSE_ERROR",
                        message=f"Failed to parse dimension '{name}': {e}",
                        path=f"dimensions.{name}",
                        span=span,
                    )
                )

        # Parse measures
        measures: dict[str, Measure] = {}
        raw_measures = raw.get("measures", {})
        if not isinstance(raw_measures, dict):
            errors.append(
                SemanticError(
                    code="MEASURE_PARSE_ERROR",
                    message="'measures' must be a YAML mapping, not a list or scalar",
                    path="measures",
                )
            )
            raw_measures = {}
        for name, raw_meas in raw_measures.items():
            try:
                _check_unknown_keys(raw_meas, _MEASURE_KEYS, f"measures.{name}", errors, source_map)
                measure_columns: list[DataColumnRef] = []
                for ci, fdata in enumerate(raw_meas.get("columns", [])):
                    _check_unknown_keys(
                        fdata,
                        _DATA_COLUMN_REF_KEYS,
                        f"measures.{name}.columns[{ci}]",
                        errors,
                        source_map,
                    )
                    measure_columns.append(
                        DataColumnRef(
                            view=fdata.get("dataObject"),
                            column=fdata.get("column"),
                        )
                    )

                # Resolve expression field references
                expression = raw_meas.get("expression")
                if expression:
                    self._validate_expression_refs(
                        name, expression, data_objects, errors, source_map
                    )

                # Parse measure filters (new `filters:` list or legacy `filter:` single)
                measure_filters: list[MeasureFilterItem] = []
                raw_filters = raw_meas.get("filters")
                if raw_filters and isinstance(raw_filters, list):
                    for fi, rf in enumerate(raw_filters):
                        measure_filters.append(
                            _parse_measure_filter_item(
                                rf,
                                f"measures.{name}.filters[{fi}]",
                                errors,
                                source_map,
                            )
                        )
                else:
                    # Backward compat: single `filter:` key → [filter]
                    raw_filter = raw_meas.get("filter")
                    if raw_filter:
                        measure_filters.append(
                            _parse_measure_filter_item(
                                raw_filter, f"measures.{name}.filter", errors, source_map
                            )
                        )

                # Parse grain override
                grain_override: GrainOverride | None = None
                raw_grain = raw_meas.get("grain")
                if raw_grain and isinstance(raw_grain, dict):
                    _check_unknown_keys(
                        raw_grain,
                        _GRAIN_OVERRIDE_KEYS,
                        f"measures.{name}.grain",
                        errors,
                        source_map,
                    )
                    grain_override = GrainOverride(
                        mode=raw_grain.get("mode", "RELATIVE"),
                        exclude=raw_grain.get("exclude", []),
                        include=raw_grain.get("include", []),
                        keep_only=raw_grain.get("keepOnly", []),
                    )
                    # Validate dimension references in grain
                    for dim_name in (
                        grain_override.include + grain_override.exclude + grain_override.keep_only
                    ):
                        if dim_name not in dimensions:
                            span = source_map.get(f"measures.{name}.grain") if source_map else None
                            errors.append(
                                SemanticError(
                                    code="UNKNOWN_GRAIN_DIMENSION",
                                    message=(
                                        f"Measure '{name}' grain references "
                                        f"unknown dimension '{dim_name}'"
                                    ),
                                    path=f"measures.{name}.grain",
                                    span=span,
                                    suggestions=_suggest_similar(dim_name, list(dimensions.keys())),
                                )
                            )

                # Parse filter context
                filter_ctx: FilterContext | None = None
                raw_fc = raw_meas.get("filterContext")
                if raw_fc and isinstance(raw_fc, dict):
                    _check_unknown_keys(
                        raw_fc,
                        _FILTER_CONTEXT_KEYS,
                        f"measures.{name}.filterContext",
                        errors,
                        source_map,
                    )
                    include_filters: list[FilterContextFilter] = []
                    for inc_i, raw_incl in enumerate(raw_fc.get("include", [])):
                        if isinstance(raw_incl, dict):
                            _check_unknown_keys(
                                raw_incl,
                                _FILTER_CONTEXT_FILTER_KEYS,
                                f"measures.{name}.filterContext.include[{inc_i}]",
                                errors,
                                source_map,
                            )
                            include_filters.append(
                                FilterContextFilter(
                                    field=raw_incl.get("field", ""),
                                    op=raw_incl.get("op", "equals"),
                                    value=raw_incl.get("value"),
                                )
                            )
                    filter_ctx = FilterContext(
                        mode=raw_fc.get("mode", "RELATIVE"),
                        exclude=raw_fc.get("exclude", []),
                        include=include_filters,
                        keep_only=raw_fc.get("keepOnly", []),
                    )
                    # Validate field references in exclude/keepOnly
                    all_dim_names = set(dimensions.keys())
                    all_col_refs: set[str] = set()
                    for obj_name, obj_def in data_objects.items():
                        for col_name in obj_def.columns:
                            all_col_refs.add(f"{obj_name}.{col_name}")
                    for field_name in filter_ctx.exclude + filter_ctx.keep_only:
                        if field_name not in all_dim_names and field_name not in all_col_refs:
                            span = (
                                source_map.get(f"measures.{name}.filterContext")
                                if source_map
                                else None
                            )
                            errors.append(
                                SemanticError(
                                    code="UNKNOWN_FILTER_CONTEXT_FIELD",
                                    message=(
                                        f"Measure '{name}' filterContext references "
                                        f"unknown field '{field_name}'"
                                    ),
                                    path=f"measures.{name}.filterContext",
                                    span=span,
                                    suggestions=_suggest_similar(field_name, list(all_dim_names)),
                                )
                            )
                    for incl in filter_ctx.include:
                        if incl.field not in all_dim_names and incl.field not in all_col_refs:
                            span = (
                                source_map.get(f"measures.{name}.filterContext")
                                if source_map
                                else None
                            )
                            errors.append(
                                SemanticError(
                                    code="UNKNOWN_FILTER_CONTEXT_FIELD",
                                    message=(
                                        f"Measure '{name}' filterContext.include "
                                        f"references unknown field '{incl.field}'"
                                    ),
                                    path=f"measures.{name}.filterContext.include",
                                    span=span,
                                    suggestions=_suggest_similar(incl.field, list(all_dim_names)),
                                )
                            )

                measures[name] = Measure(
                    label=name,
                    columns=measure_columns,
                    result_type=raw_meas.get("resultType", "float"),
                    aggregation=raw_meas.get("aggregation", "sum"),
                    expression=expression,
                    distinct=raw_meas.get("distinct", False),
                    total=raw_meas.get("total", False),
                    grain=grain_override,
                    filter_context=filter_ctx,
                    filters=measure_filters,
                    data_type=raw_meas.get("dataType"),
                    description=raw_meas.get("description"),
                    format=raw_meas.get("format"),
                    allow_fan_out=raw_meas.get("allowFanOut", False),
                    delimiter=raw_meas.get("delimiter"),
                    within_group=raw_meas.get("withinGroup"),
                    owner=raw_meas.get("owner"),
                    synonyms=raw_meas.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_meas),
                )
            except Exception as e:
                span = source_map.get(f"measures.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="MEASURE_PARSE_ERROR",
                        message=f"Failed to parse measure '{name}': {e}",
                        path=f"measures.{name}",
                        span=span,
                    )
                )

        # Validate the count-synthesis knobs here so a bad value becomes a
        # structured SemanticError rather than a raw AttributeError (list
        # pattern) or an uncaught Pydantic ValidationError (invalid token) at
        # model construction below. Fall back to safe values so resolution can
        # continue collecting errors.
        _count_pattern = raw.get("countLabelPattern", DEFAULT_COUNT_PATTERN)
        _pattern_err = count_pattern_error(_count_pattern)
        if _pattern_err is not None:
            span = source_map.get("countLabelPattern") if source_map else None
            errors.append(
                SemanticError(
                    code="INVALID_COUNT_LABEL_PATTERN",
                    message=_pattern_err,
                    path="countLabelPattern",
                    span=span,
                )
            )
            _count_pattern = DEFAULT_COUNT_PATTERN
        _expose_counts = raw.get("exposeCounts", True)
        if not isinstance(_expose_counts, bool):
            span = source_map.get("exposeCounts") if source_map else None
            errors.append(
                SemanticError(
                    code="INVALID_EXPOSE_COUNTS",
                    message="exposeCounts must be a boolean (true/false)",
                    path="exposeCounts",
                    span=span,
                )
            )
            _expose_counts = True

        # Names of synthesized count measures (name == resolved count label,
        # e.g. "Sales Count"). These are valid measure references (metrics may
        # target them) even though they are not declared — they are materialized
        # on read via ``effective_measures`` (see models/synthesis.py). Declared
        # measures already sit in ``measures``; a declared count of the same
        # name overrides synthesis, so unioning is safe either way.
        synthesized_measure_names: set[str] = (
            {
                count_label(key, obj, _count_pattern)
                for key, obj in data_objects.items()
                if obj.countable
            }
            if _expose_counts
            else set()
        )

        # Parse metrics
        metrics: dict[str, Metric] = {}
        raw_metrics = raw.get("metrics", {})
        if not isinstance(raw_metrics, dict):
            errors.append(
                SemanticError(
                    code="METRIC_PARSE_ERROR",
                    message="'metrics' must be a YAML mapping, not a list or scalar",
                    path="metrics",
                )
            )
            raw_metrics = {}
        for name, raw_metric in raw_metrics.items():
            try:
                _check_unknown_keys(raw_metric, _METRIC_KEYS, f"metrics.{name}", errors, source_map)
                raw_pop_block = raw_metric.get("periodOverPeriod")
                if isinstance(raw_pop_block, dict):
                    _check_unknown_keys(
                        raw_pop_block,
                        _PERIOD_OVER_PERIOD_KEYS,
                        f"metrics.{name}.periodOverPeriod",
                        errors,
                        source_map,
                    )
                metric_type = raw_metric.get("type", "derived")

                if metric_type == MetricType.CUMULATIVE:
                    # Cumulative metric: validate measure reference exists
                    ref_measure = raw_metric.get("measure", "")
                    if (
                        ref_measure
                        and ref_measure not in measures
                        and ref_measure not in synthesized_measure_names
                    ):
                        span = source_map.get(f"metrics.{name}.measure") if source_map else None
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_MEASURE",
                                message=(
                                    f"Cumulative metric '{name}' references "
                                    f"unknown measure '{ref_measure}'"
                                ),
                                path=f"metrics.{name}.measure",
                                span=span,
                            )
                        )

                    # Validate timeDimension references a known dimension
                    cum_time_dim = raw_metric.get("timeDimension", "")
                    if cum_time_dim and cum_time_dim not in dimensions:
                        span = (
                            source_map.get(f"metrics.{name}.timeDimension") if source_map else None
                        )
                        errors.append(
                            SemanticError(
                                code="CUMULATIVE_UNKNOWN_TIME_DIMENSION",
                                message=(
                                    f"Cumulative metric '{name}' references "
                                    f"unknown time dimension '{cum_time_dim}'"
                                ),
                                path=f"metrics.{name}.timeDimension",
                                span=span,
                                suggestions=_suggest_similar(cum_time_dim, list(dimensions.keys())),
                            )
                        )

                    metrics[name] = Metric(
                        label=name,
                        type=MetricType.CUMULATIVE,
                        measure=raw_metric.get("measure"),
                        time_dimension=raw_metric.get("timeDimension"),
                        cumulative_type=raw_metric.get("cumulativeType", "sum"),
                        window=raw_metric.get("window"),
                        grain_to_date=raw_metric.get("grainToDate"),
                        partition_by=list(raw_metric.get("partitionBy", []) or []),
                        data_type=raw_metric.get("dataType"),
                        description=raw_metric.get("description"),
                        format=raw_metric.get("format"),
                        owner=raw_metric.get("owner"),
                        synonyms=raw_metric.get("synonyms", []),
                        custom_extensions=_parse_extensions(raw_metric),
                    )
                elif metric_type == MetricType.PERIOD_OVER_PERIOD:
                    # Period-over-period metric: validate expression + PoP config
                    expression = raw_metric.get("expression", "")
                    self._validate_metric_expression_refs(
                        name,
                        expression,
                        measures,
                        errors,
                        source_map,
                        metrics,
                        synthesized_measure_names,
                    )

                    raw_pop = raw_metric.get("periodOverPeriod")
                    if not raw_pop:
                        span = source_map.get(f"metrics.{name}") if source_map else None
                        errors.append(
                            SemanticError(
                                code="METRIC_PARSE_ERROR",
                                message=(
                                    f"Period-over-period metric '{name}' "
                                    f"requires 'periodOverPeriod' configuration"
                                ),
                                path=f"metrics.{name}",
                                span=span,
                            )
                        )
                        raw_pop = {}

                    # Validate time dimension reference
                    pop_time_dim = raw_pop.get("timeDimension", "")
                    if pop_time_dim and pop_time_dim not in dimensions:
                        span = (
                            source_map.get(f"metrics.{name}.periodOverPeriod")
                            if source_map
                            else None
                        )
                        errors.append(
                            SemanticError(
                                code="POP_UNKNOWN_TIME_DIMENSION",
                                message=(
                                    f"Period-over-period metric '{name}' references "
                                    f"unknown time dimension '{pop_time_dim}'"
                                ),
                                path=f"metrics.{name}.periodOverPeriod.timeDimension",
                                span=span,
                                suggestions=_suggest_similar(pop_time_dim, list(dimensions.keys())),
                            )
                        )

                    pop_config = PeriodOverPeriod(
                        time_dimension=raw_pop.get("timeDimension", ""),
                        grain=raw_pop.get("grain", "month"),
                        offset=raw_pop.get("offset", -1),
                        offset_grain=raw_pop.get("offsetGrain", "year"),
                        comparison=raw_pop.get("comparison", "percentChange"),
                    )

                    metrics[name] = Metric(
                        label=name,
                        type=MetricType.PERIOD_OVER_PERIOD,
                        expression=expression,
                        period_over_period=pop_config,
                        data_type=raw_metric.get("dataType"),
                        description=raw_metric.get("description"),
                        format=raw_metric.get("format"),
                        owner=raw_metric.get("owner"),
                        synonyms=raw_metric.get("synonyms", []),
                        custom_extensions=_parse_extensions(raw_metric),
                    )
                elif metric_type == MetricType.WINDOW:
                    # Window metric (rank/lag/lead/ntile/first_value/last_value)
                    ref_measure = raw_metric.get("measure")
                    if (
                        ref_measure
                        and ref_measure not in measures
                        and ref_measure not in synthesized_measure_names
                    ):
                        span = source_map.get(f"metrics.{name}.measure") if source_map else None
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_MEASURE",
                                message=(
                                    f"Window metric '{name}' references "
                                    f"unknown measure '{ref_measure}'"
                                ),
                                path=f"metrics.{name}.measure",
                                span=span,
                            )
                        )

                    win_time_dim = raw_metric.get("timeDimension", "")
                    if win_time_dim and win_time_dim not in dimensions:
                        span = (
                            source_map.get(f"metrics.{name}.timeDimension") if source_map else None
                        )
                        errors.append(
                            SemanticError(
                                code="WINDOW_UNKNOWN_TIME_DIMENSION",
                                message=(
                                    f"Window metric '{name}' references "
                                    f"unknown time dimension '{win_time_dim}'"
                                ),
                                path=f"metrics.{name}.timeDimension",
                                span=span,
                                suggestions=_suggest_similar(win_time_dim, list(dimensions.keys())),
                            )
                        )

                    metrics[name] = Metric(
                        label=name,
                        type=MetricType.WINDOW,
                        measure=ref_measure,
                        time_dimension=raw_metric.get("timeDimension"),
                        window_function=raw_metric.get("windowFunction"),
                        offset=raw_metric.get("offset"),
                        buckets=raw_metric.get("buckets"),
                        order_direction=raw_metric.get("orderDirection", "desc"),
                        default_value=raw_metric.get("defaultValue"),
                        partition_by=list(raw_metric.get("partitionBy", []) or []),
                        data_type=raw_metric.get("dataType"),
                        description=raw_metric.get("description"),
                        format=raw_metric.get("format"),
                        owner=raw_metric.get("owner"),
                        synonyms=raw_metric.get("synonyms", []),
                        custom_extensions=_parse_extensions(raw_metric),
                    )
                else:
                    # Derived metric (default)
                    expression = raw_metric.get("expression", "")
                    self._validate_metric_expression_refs(
                        name,
                        expression,
                        measures,
                        errors,
                        source_map,
                        metrics,
                        synthesized_measure_names,
                    )

                    metrics[name] = Metric(
                        label=name,
                        expression=expression,
                        data_type=raw_metric.get("dataType"),
                        description=raw_metric.get("description"),
                        format=raw_metric.get("format"),
                        owner=raw_metric.get("owner"),
                        synonyms=raw_metric.get("synonyms", []),
                        custom_extensions=_parse_extensions(raw_metric),
                    )
            except Exception as e:
                span = source_map.get(f"metrics.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="METRIC_PARSE_ERROR",
                        message=f"Failed to parse metric '{name}': {e}",
                        path=f"metrics.{name}",
                        span=span,
                    )
                )

        # Parse static model filters
        model_filters: list[ModelFilter] = []
        raw_filters = raw.get("filters", [])
        if not isinstance(raw_filters, list):
            errors.append(
                SemanticError(
                    code="FILTER_PARSE_ERROR",
                    message="'filters' must be a YAML list, not a mapping or scalar",
                    path="filters",
                )
            )
            raw_filters = []
        for i, rf in enumerate(raw_filters):
            try:
                _check_unknown_keys(rf, _MODEL_FILTER_KEYS, f"filters[{i}]", errors, source_map)
                obj_name = rf.get("dataObject", "")
                col_name = rf.get("column", "")
                if obj_name and obj_name not in data_objects:
                    span = source_map.get(f"filters[{i}]") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_FILTER_DATA_OBJECT",
                            message=(
                                f"Static filter[{i}] references unknown data object '{obj_name}'"
                            ),
                            path=f"filters[{i}]",
                            span=span,
                        )
                    )
                elif obj_name and col_name and col_name not in data_objects[obj_name].columns:
                    span = source_map.get(f"filters[{i}]") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_FILTER_COLUMN",
                            message=(
                                f"Static filter[{i}] references unknown column "
                                f"'{col_name}' in data object '{obj_name}'"
                            ),
                            path=f"filters[{i}]",
                            span=span,
                        )
                    )
                raw_val = rf.get("value")
                raw_vals = rf.get("values", [])
                model_filters.append(
                    ModelFilter(
                        data_object=obj_name,
                        column=col_name,
                        operator=rf.get("operator", "equals"),
                        value=_coerce_filter_value(raw_val),
                        values=[_coerce_filter_value(v) for v in raw_vals],
                    )
                )
            except Exception as e:
                span = source_map.get(f"filters[{i}]") if source_map else None
                errors.append(
                    SemanticError(
                        code="FILTER_PARSE_ERROR",
                        message=f"Failed to parse static filter[{i}]: {e}",
                        path=f"filters[{i}]",
                        span=span,
                    )
                )

        settings = _parse_settings(raw.get("settings"), errors, source_map)

        # Parse examples block (PLAN_agent_api_improvements §5)
        examples = self._parse_examples(raw.get("examples"), errors)

        model = SemanticModel(
            version=raw.get("version", 1.0),
            name=raw.get("name"),
            description=raw.get("description"),
            data_objects=data_objects,
            dimensions=dimensions,
            measures=measures,
            metrics=metrics,
            filters=model_filters,
            examples=examples,
            extends_sources=raw.get("_extends_sources", []),
            inherits_source=raw.get("_inherits_source"),
            owner=raw.get("owner"),
            # Sanitized above so an invalid value is a structured error, not a
            # ValidationError raised here.
            expose_counts=_expose_counts,
            count_label_pattern=_count_pattern,
            custom_extensions=_parse_extensions(raw, "", errors, source_map),
            settings=settings,
        )

        result = ValidationResult(
            valid=len(errors) == 0,
            errors=errors,
            warnings=warnings,
        )

        return model, result

    def _parse_examples(self, raw: object, errors: list[SemanticError]) -> list[ModelExample]:
        """Turn the model-level ``examples:`` block into ``ModelExample`` objects.

        ``raw`` is expected to be a list of mappings. Every mapping needs a
        non-empty string ``name``, a string ``description`` and a mapping
        ``query``; ``intent_tags`` (or its alias ``intentTags``) is optional
        and must be a list when present. A name may be used only once.

        Problems are appended to ``errors`` instead of being raised. An entry
        that fails a check is skipped, so the returned list contains only the
        entries that passed every check, in their original order. ``None``
        and non-list input both yield an empty list (the latter with an
        ``EXAMPLES_PARSE_ERROR``).
        """

        def _reject(code: str, message: str, path: str) -> None:
            # Single place where example-level errors are recorded.
            errors.append(SemanticError(code=code, message=message, path=path))

        if raw is None:
            return []
        if not isinstance(raw, list):
            _reject(
                "EXAMPLES_PARSE_ERROR",
                "'examples' must be a YAML list of example entries",
                "examples",
            )
            return []

        parsed: list[ModelExample] = []
        accepted_names: set[str] = set()
        for i, entry in enumerate(raw):
            if not isinstance(entry, dict):
                _reject(
                    "EXAMPLES_PARSE_ERROR",
                    f"examples[{i}] must be a mapping",
                    f"examples[{i}]",
                )
                continue

            _check_unknown_keys(entry, _MODEL_EXAMPLE_KEYS, f"examples[{i}]", errors)

            name = entry.get("name")
            description = entry.get("description")
            query = entry.get("query")
            # The snake_case key wins over the camelCase alias; a falsy value
            # under either key falls through to the next candidate.
            intent_tags = entry.get("intent_tags") or entry.get("intentTags") or []

            # Checks run in a fixed order and only the first failure is reported.
            problem: tuple[str, str, str] | None
            if not (isinstance(name, str) and name):
                problem = (
                    "EXAMPLES_PARSE_ERROR",
                    f"examples[{i}].name is required and must be a string",
                    f"examples[{i}].name",
                )
            elif name in accepted_names:
                problem = (
                    "DUPLICATE_EXAMPLE_NAME",
                    f"Duplicate example name '{name}'",
                    f"examples[{i}].name",
                )
            elif not isinstance(description, str):
                problem = (
                    "EXAMPLES_PARSE_ERROR",
                    f"examples[{i}].description is required",
                    f"examples[{i}].description",
                )
            elif not isinstance(query, dict):
                problem = (
                    "EXAMPLES_PARSE_ERROR",
                    f"examples[{i}].query must be a mapping (QueryObject payload)",
                    f"examples[{i}].query",
                )
            elif not isinstance(intent_tags, list):
                problem = (
                    "EXAMPLES_PARSE_ERROR",
                    f"examples[{i}].intent_tags must be a list",
                    f"examples[{i}].intent_tags",
                )
            else:
                problem = None

            if problem is not None:
                _reject(*problem)
                continue

            # A name is only reserved once its entry has been fully accepted.
            accepted_names.add(name)
            parsed.append(
                ModelExample(
                    name=name,
                    description=description,
                    intent_tags=[str(t) for t in intent_tags],
                    query=dict(query),
                )
            )
        return parsed

    def _validate_expression_refs(
        self,
        measure_name: str,
        expression: str,
        data_objects: dict[str, DataObject],
        errors: list[SemanticError],
        source_map: SourceMap | None,
    ) -> None:
        """Validate ``{[DataObject].[Column]}`` references in a measure expression.

        The check runs in two passes:

        1. Every well-formed ``{[Obj].[Col]}`` reference is looked up in
           ``data_objects``. An unknown object is reported as
           ``UNKNOWN_DATA_OBJECT_IN_EXPRESSION``; an unknown column on a known
           object as ``UNKNOWN_COLUMN_IN_EXPRESSION``.
        2. The well-formed references are removed and the remaining text is
           scanned for common malformed spellings (missing dot, bracket or
           brace). Each hit is reported as ``MALFORMED_EXPRESSION_REF`` and the
           offending text is echoed back in the message.

        Nothing is returned or raised; findings are appended to ``errors``.
        All findings share the path ``measures.<measure_name>.expression`` and,
        when ``source_map`` is given, the span registered under that path.
        """
        path = f"measures.{measure_name}.expression"
        span = source_map.get(path) if source_map else None

        # Pass 1: resolve well-formed references against the known data objects.
        named_refs = re.findall(r"\{\[([^\]{}\[]+)\]\.\[([^\]{}\[]+)\]\}", expression)
        for obj_name, col_name in named_refs:
            if obj_name not in data_objects:
                errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT_IN_EXPRESSION",
                        message=(
                            f"Measure '{measure_name}' expression references unknown "
                            f"data object '{obj_name}'"
                        ),
                        path=path,
                        span=span,
                    )
                )
            elif col_name not in data_objects[obj_name].columns:
                errors.append(
                    SemanticError(
                        code="UNKNOWN_COLUMN_IN_EXPRESSION",
                        message=(
                            f"Measure '{measure_name}' expression references unknown column "
                            f"'{col_name}' in data object '{obj_name}'"
                        ),
                        path=path,
                        span=span,
                    )
                )

        # Pass 2: strip valid refs, scan remainder for malformed attempts.
        remainder = re.sub(r"\{\[[^\]{}\[]+\]\.\[[^\]{}\[]+\]\}", "", expression)

        def _merr(msg: str) -> None:
            errors.append(
                SemanticError(code="MALFORMED_EXPRESSION_REF", message=msg, path=path, span=span)
            )

        # {[Obj][Col]} — missing dot separator
        for o, c in re.findall(r"\{\[([^\]{}\[]+)\]\[([^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}][{c}]}}' — missing '.' separator"
            )

        # {[Obj.Col]} — dot inside single bracket pair
        for bad in re.findall(r"\{\[([^\]{}\[]+\.[^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{bad}]}}' — use '{{[Obj].[Col]}}' syntax"
            )

        # {Obj.Col} — missing all inner brackets
        for bad in re.findall(r"\{([A-Za-z][^\[{}\]]*\.[A-Za-z][^\[{}\]]*)\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{{bad}}}' — missing '[' and ']', use '{{[Obj].[Col]}}' syntax"
            )

        # {[Obj].[Col] — missing closing }
        for o, c in re.findall(r"\{\[([^\]{}\[]+)\]\.\[([^\]{}\[]+)\](?!\})", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}].[{c}]' — missing closing '}}'"
            )

        # [Obj].[Col]} — missing opening {
        for o, c in re.findall(r"(?<!\{)\[([^\]{}\[]+)\]\.\[([^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '[{o}].[{c}]}}' — missing opening '{{'"
            )

        # {[Obj].[Col} — missing ] on column
        for o, c in re.findall(r"\{\[([^\]{}\[]+)\]\.\[([^\]{}\[]*)\}(?!\])", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}].[{c}}}' — missing closing ']' on column"
            )

        # {[Obj.[Col]} — missing ] on data object.
        # The character class admits '.', so the first group captures everything
        # between '{[' and the next '[' — including a trailing separator dot when
        # one was typed. Echo it verbatim: re-inserting a '.' here would report
        # '{[Obj..[Col]}' for the input '{[Obj.[Col]}'.
        for o, c in re.findall(r"\{\[([^\]{}\[]*)\[([^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}[{c}]}}' — missing closing ']' on data object"
            )

        # {Obj].[Col]} — missing [ on data object
        for o, c in re.findall(r"\{([^\[{}\]]+)\]\.\[([^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{{o}].[{c}]}}' — missing opening '[' on data object"
            )

        # {[Obj].Col]} — missing [ on column
        for o, c in re.findall(r"\{\[([^\]{}\[]+)\]\.([^\[{}\]]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}].{c}]}}' — missing opening '[' on column"
            )

    def _validate_metric_expression_refs(
        self,
        metric_name: str,
        expression: str,
        measures: dict[str, Measure],
        errors: list[SemanticError],
        source_map: SourceMap | None,
        metrics: dict[str, Metric] | None = None,
        synthesized_measures: set[str] | None = None,
    ) -> None:
        """Check the ``{[Name]}`` references used by a metric expression.

        Malformed reference attempts (a missing bracket or brace) are reported
        first as ``MALFORMED_EXPRESSION_REF``. Each well-formed reference must
        then name an entry of ``measures``, of ``metrics`` (the in-progress
        metrics dict, which enables cross-metric composition) or of
        ``synthesized_measures`` (the auto-generated ``<object>.count``
        measures, valid even though they are not in ``measures``); anything
        else is reported as ``UNKNOWN_MEASURE_REF`` together with similar-name
        suggestions.

        ``metrics`` and ``synthesized_measures`` default to ``None`` and are
        then treated as empty, so callers that pass neither keep working.
        Findings are appended to ``errors``; nothing is returned or raised.
        """
        path = f"metrics.{metric_name}.expression"
        span = source_map.get(path) if source_map else None

        def _malformed(message: str) -> None:
            # Every malformed-reference finding differs only in its message.
            errors.append(
                SemanticError(
                    code="MALFORMED_EXPRESSION_REF",
                    message=message,
                    path=path,
                    span=span,
                )
            )

        well_formed = re.findall(r"\{\[([^\]{}\[]+)\]\}", expression)

        # With the valid {[Name]} refs removed, whatever still looks like a
        # reference attempt is malformed.
        leftover = re.sub(r"\{\[[^\]{}\[]+\]\}", "", expression)

        # {[Name} — missing closing ]
        for bad in re.findall(r"\{\[([^\]{}]*)\}", leftover):
            _malformed(
                f"Metric '{metric_name}' has malformed reference"
                f" '{{[{bad}}}' — missing closing ']'"
            )

        # {[Name] — missing closing }
        for bad in re.findall(r"\{\[([^\]{}]+)\](?!\})", leftover):
            _malformed(
                f"Metric '{metric_name}' has malformed reference"
                f" '{{[{bad}]' — missing closing '}}'"
            )

        # {Name]} — missing opening [
        for bad in re.findall(r"\{([^\[{}\]]+)\]\}", leftover):
            _malformed(
                f"Metric '{metric_name}' has malformed reference"
                f" '{{{bad}]}}' — missing opening '['"
            )

        # {Name} — missing both [ and ]
        for bad in re.findall(r"\{([^\[{\]}\s]+)\}", leftover):
            _malformed(
                f"Metric '{metric_name}' has malformed reference"
                f" '{{{bad}}}' — missing '[' and ']'"
            )

        # [Name]} — missing opening {
        for bad in re.findall(r"(?<!\{)\[([^\]{}\[]+)\]\}", leftover):
            _malformed(
                f"Metric '{metric_name}' has malformed reference"
                f" '[{bad}]}}' — missing opening '{{'"
            )

        known_metrics = metrics if metrics else {}
        known_counts = synthesized_measures if synthesized_measures else set()
        for ref_name in well_formed:
            if ref_name in measures or ref_name in known_metrics or ref_name in known_counts:
                continue
            errors.append(
                SemanticError(
                    code="UNKNOWN_MEASURE_REF",
                    message=(f"Metric '{metric_name}' references unknown measure '{ref_name}'"),
                    path=path,
                    span=span,
                    suggestions=_suggest_similar(
                        ref_name,
                        [*measures.keys(), *known_metrics.keys(), *sorted(known_counts)],
                    ),
                )
            )

`resolve(raw, source_map=None)` ¶

Resolve a raw YAML dict into a validated SemanticModel.

Returns a (model, validation_result) tuple. If there are errors, the model may be only partially populated.

Source code in src/orionbelt/parser/resolver.py

def resolve(
    self,
    raw: dict[str, Any],
    source_map: SourceMap | None = None,
) -> tuple[SemanticModel, ValidationResult]:
    """Resolve raw YAML dict into a validated SemanticModel.

    Returns (model, validation_result). If there are errors,
    the model may be partially populated.
    """
    errors: list[SemanticError] = []
    warnings: list[SemanticError] = []

    # Strict OBML: reject unknown top-level keys (catches typos like
    # ``dataObjekt:`` that would silently be dropped by ``raw.get(...)``).
    _check_unknown_keys(raw, _TOP_LEVEL_KEYS, "", errors, source_map)

    # Parse data objects
    data_objects: dict[str, DataObject] = {}
    raw_objects = raw.get("dataObjects", {})
    if not isinstance(raw_objects, dict):
        errors.append(
            SemanticError(
                code="DATA_OBJECT_PARSE_ERROR",
                message="'dataObjects' must be a YAML mapping, not a list or scalar",
                path="dataObjects",
            )
        )
        raw_objects = {}
    for name, raw_obj in raw_objects.items():
        try:
            _check_unknown_keys(
                raw_obj, _DATA_OBJECT_KEYS, f"dataObjects.{name}", errors, source_map
            )
            obj_columns: dict[str, DataObjectColumn] = {}
            for fname, fdata in raw_obj.get("columns", {}).items():
                _check_unknown_keys(
                    fdata,
                    _DATA_OBJECT_COLUMN_KEYS,
                    f"dataObjects.{name}.columns.{fname}",
                    errors,
                    source_map,
                )
                obj_columns[fname] = DataObjectColumn(
                    label=fname,
                    code=fdata.get("code", fname if not fdata.get("expression") else ""),
                    abstract_type=fdata.get("abstractType", "string"),
                    sql_type=fdata.get("sqlType"),
                    sql_precision=fdata.get("sqlPrecision"),
                    sql_scale=fdata.get("sqlScale"),
                    num_class=fdata.get("numClass"),
                    primary_key=bool(fdata.get("primaryKey", False)),
                    description=fdata.get("description"),
                    comment=fdata.get("comment"),
                    owner=fdata.get("owner"),
                    expression=fdata.get("expression"),
                    synonyms=fdata.get("synonyms", []),
                    custom_extensions=_parse_extensions(fdata),
                )

            obj_joins: list[DataObjectJoin] = []
            for ji, jdata in enumerate(raw_obj.get("joins", [])):
                _check_unknown_keys(
                    jdata,
                    _DATA_OBJECT_JOIN_KEYS,
                    f"dataObjects.{name}.joins[{ji}]",
                    errors,
                    source_map,
                )
                obj_joins.append(
                    DataObjectJoin(
                        join_type=jdata["joinType"],
                        join_to=jdata["joinTo"],
                        columns_from=jdata["columnsFrom"],
                        columns_to=jdata["columnsTo"],
                        secondary=jdata.get("secondary", False),
                        path_name=jdata.get("pathName"),
                    )
                )

            data_objects[name] = DataObject(
                label=name,
                code=raw_obj.get("code", ""),
                database=raw_obj.get("database", ""),
                schema_name=raw_obj.get("schema", ""),
                columns=obj_columns,
                joins=obj_joins,
                description=raw_obj.get("description"),
                comment=raw_obj.get("comment"),
                owner=raw_obj.get("owner"),
                countable=raw_obj.get("countable", True),
                count_label=raw_obj.get("countLabel"),
                synonyms=raw_obj.get("synonyms", []),
                custom_extensions=_parse_extensions(raw_obj),
                refresh=_parse_refresh(raw_obj.get("refresh"), name, errors),
            )
        except Exception as e:
            span = source_map.get(f"dataObjects.{name}") if source_map else None
            errors.append(
                SemanticError(
                    code="DATA_OBJECT_PARSE_ERROR",
                    message=f"Failed to parse data object '{name}': {e}",
                    path=f"dataObjects.{name}",
                    span=span,
                )
            )

    # Parse dimensions
    dimensions: dict[str, Dimension] = {}
    raw_dims = raw.get("dimensions", {})
    if not isinstance(raw_dims, dict):
        errors.append(
            SemanticError(
                code="DIMENSION_PARSE_ERROR",
                message="'dimensions' must be a YAML mapping, not a list or scalar",
                path="dimensions",
            )
        )
        raw_dims = {}
    for name, raw_dim in raw_dims.items():
        try:
            _check_unknown_keys(
                raw_dim, _DIMENSION_KEYS, f"dimensions.{name}", errors, source_map
            )
            data_object = raw_dim.get("dataObject")
            column = raw_dim.get("column")

            # Validate the data object exists
            if data_object and data_object not in data_objects:
                span = source_map.get(f"dimensions.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=(
                            f"Dimension '{name}' references unknown data object '{data_object}'"
                        ),
                        path=f"dimensions.{name}",
                        span=span,
                        suggestions=_suggest_similar(data_object, list(data_objects.keys())),
                    )
                )

            # Validate the column exists in the data object
            if (
                data_object
                and column
                and data_object in data_objects
                and column not in data_objects[data_object].columns
            ):
                span = source_map.get(f"dimensions.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_COLUMN",
                        message=(
                            f"Dimension '{name}' references unknown column "
                            f"'{column}' in data object '{data_object}'"
                        ),
                        path=f"dimensions.{name}",
                        span=span,
                        suggestions=_suggest_similar(
                            column, list(data_objects[data_object].columns.keys())
                        ),
                    )
                )

            via = raw_dim.get("via")
            if via and via not in data_objects:
                span = source_map.get(f"dimensions.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=(
                            f"Dimension '{name}' via references unknown data object '{via}'"
                        ),
                        path=f"dimensions.{name}",
                        span=span,
                        suggestions=_suggest_similar(via, list(data_objects.keys())),
                    )
                )

            dimensions[name] = Dimension(
                label=name,
                view=data_object or "",
                column=column or "",
                result_type=raw_dim.get("resultType", "string"),
                time_grain=raw_dim.get("timeGrain"),
                via=via,
                description=raw_dim.get("description"),
                format=raw_dim.get("format"),
                owner=raw_dim.get("owner"),
                synonyms=raw_dim.get("synonyms", []),
                custom_extensions=_parse_extensions(raw_dim),
            )
        except Exception as e:
            span = source_map.get(f"dimensions.{name}") if source_map else None
            errors.append(
                SemanticError(
                    code="DIMENSION_PARSE_ERROR",
                    message=f"Failed to parse dimension '{name}': {e}",
                    path=f"dimensions.{name}",
                    span=span,
                )
            )

    # Parse measures
    measures: dict[str, Measure] = {}
    raw_measures = raw.get("measures", {})
    if not isinstance(raw_measures, dict):
        errors.append(
            SemanticError(
                code="MEASURE_PARSE_ERROR",
                message="'measures' must be a YAML mapping, not a list or scalar",
                path="measures",
            )
        )
        raw_measures = {}
    for name, raw_meas in raw_measures.items():
        try:
            _check_unknown_keys(raw_meas, _MEASURE_KEYS, f"measures.{name}", errors, source_map)
            measure_columns: list[DataColumnRef] = []
            for ci, fdata in enumerate(raw_meas.get("columns", [])):
                _check_unknown_keys(
                    fdata,
                    _DATA_COLUMN_REF_KEYS,
                    f"measures.{name}.columns[{ci}]",
                    errors,
                    source_map,
                )
                measure_columns.append(
                    DataColumnRef(
                        view=fdata.get("dataObject"),
                        column=fdata.get("column"),
                    )
                )

            # Resolve expression field references
            expression = raw_meas.get("expression")
            if expression:
                self._validate_expression_refs(
                    name, expression, data_objects, errors, source_map
                )

            # Parse measure filters (new `filters:` list or legacy `filter:` single)
            measure_filters: list[MeasureFilterItem] = []
            raw_filters = raw_meas.get("filters")
            if raw_filters and isinstance(raw_filters, list):
                for fi, rf in enumerate(raw_filters):
                    measure_filters.append(
                        _parse_measure_filter_item(
                            rf,
                            f"measures.{name}.filters[{fi}]",
                            errors,
                            source_map,
                        )
                    )
            else:
                # Backward compat: single `filter:` key → [filter]
                raw_filter = raw_meas.get("filter")
                if raw_filter:
                    measure_filters.append(
                        _parse_measure_filter_item(
                            raw_filter, f"measures.{name}.filter", errors, source_map
                        )
                    )

            # Parse grain override
            grain_override: GrainOverride | None = None
            raw_grain = raw_meas.get("grain")
            if raw_grain and isinstance(raw_grain, dict):
                _check_unknown_keys(
                    raw_grain,
                    _GRAIN_OVERRIDE_KEYS,
                    f"measures.{name}.grain",
                    errors,
                    source_map,
                )
                grain_override = GrainOverride(
                    mode=raw_grain.get("mode", "RELATIVE"),
                    exclude=raw_grain.get("exclude", []),
                    include=raw_grain.get("include", []),
                    keep_only=raw_grain.get("keepOnly", []),
                )
                # Validate dimension references in grain
                for dim_name in (
                    grain_override.include + grain_override.exclude + grain_override.keep_only
                ):
                    if dim_name not in dimensions:
                        span = source_map.get(f"measures.{name}.grain") if source_map else None
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_GRAIN_DIMENSION",
                                message=(
                                    f"Measure '{name}' grain references "
                                    f"unknown dimension '{dim_name}'"
                                ),
                                path=f"measures.{name}.grain",
                                span=span,
                                suggestions=_suggest_similar(dim_name, list(dimensions.keys())),
                            )
                        )

            # Parse filter context
            filter_ctx: FilterContext | None = None
            raw_fc = raw_meas.get("filterContext")
            if raw_fc and isinstance(raw_fc, dict):
                _check_unknown_keys(
                    raw_fc,
                    _FILTER_CONTEXT_KEYS,
                    f"measures.{name}.filterContext",
                    errors,
                    source_map,
                )
                include_filters: list[FilterContextFilter] = []
                for inc_i, raw_incl in enumerate(raw_fc.get("include", [])):
                    if isinstance(raw_incl, dict):
                        _check_unknown_keys(
                            raw_incl,
                            _FILTER_CONTEXT_FILTER_KEYS,
                            f"measures.{name}.filterContext.include[{inc_i}]",
                            errors,
                            source_map,
                        )
                        include_filters.append(
                            FilterContextFilter(
                                field=raw_incl.get("field", ""),
                                op=raw_incl.get("op", "equals"),
                                value=raw_incl.get("value"),
                            )
                        )
                filter_ctx = FilterContext(
                    mode=raw_fc.get("mode", "RELATIVE"),
                    exclude=raw_fc.get("exclude", []),
                    include=include_filters,
                    keep_only=raw_fc.get("keepOnly", []),
                )
                # Validate field references in exclude/keepOnly
                all_dim_names = set(dimensions.keys())
                all_col_refs: set[str] = set()
                for obj_name, obj_def in data_objects.items():
                    for col_name in obj_def.columns:
                        all_col_refs.add(f"{obj_name}.{col_name}")
                for field_name in filter_ctx.exclude + filter_ctx.keep_only:
                    if field_name not in all_dim_names and field_name not in all_col_refs:
                        span = (
                            source_map.get(f"measures.{name}.filterContext")
                            if source_map
                            else None
                        )
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_FILTER_CONTEXT_FIELD",
                                message=(
                                    f"Measure '{name}' filterContext references "
                                    f"unknown field '{field_name}'"
                                ),
                                path=f"measures.{name}.filterContext",
                                span=span,
                                suggestions=_suggest_similar(field_name, list(all_dim_names)),
                            )
                        )
                for incl in filter_ctx.include:
                    if incl.field not in all_dim_names and incl.field not in all_col_refs:
                        span = (
                            source_map.get(f"measures.{name}.filterContext")
                            if source_map
                            else None
                        )
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_FILTER_CONTEXT_FIELD",
                                message=(
                                    f"Measure '{name}' filterContext.include "
                                    f"references unknown field '{incl.field}'"
                                ),
                                path=f"measures.{name}.filterContext.include",
                                span=span,
                                suggestions=_suggest_similar(incl.field, list(all_dim_names)),
                            )
                        )

            measures[name] = Measure(
                label=name,
                columns=measure_columns,
                result_type=raw_meas.get("resultType", "float"),
                aggregation=raw_meas.get("aggregation", "sum"),
                expression=expression,
                distinct=raw_meas.get("distinct", False),
                total=raw_meas.get("total", False),
                grain=grain_override,
                filter_context=filter_ctx,
                filters=measure_filters,
                data_type=raw_meas.get("dataType"),
                description=raw_meas.get("description"),
                format=raw_meas.get("format"),
                allow_fan_out=raw_meas.get("allowFanOut", False),
                delimiter=raw_meas.get("delimiter"),
                within_group=raw_meas.get("withinGroup"),
                owner=raw_meas.get("owner"),
                synonyms=raw_meas.get("synonyms", []),
                custom_extensions=_parse_extensions(raw_meas),
            )
        except Exception as e:
            span = source_map.get(f"measures.{name}") if source_map else None
            errors.append(
                SemanticError(
                    code="MEASURE_PARSE_ERROR",
                    message=f"Failed to parse measure '{name}': {e}",
                    path=f"measures.{name}",
                    span=span,
                )
            )

    # Validate the count-synthesis knobs here so a bad value becomes a
    # structured SemanticError rather than a raw AttributeError (list
    # pattern) or an uncaught Pydantic ValidationError (invalid token) at
    # model construction below. Fall back to safe values so resolution can
    # continue collecting errors.
    _count_pattern = raw.get("countLabelPattern", DEFAULT_COUNT_PATTERN)
    _pattern_err = count_pattern_error(_count_pattern)
    if _pattern_err is not None:
        span = source_map.get("countLabelPattern") if source_map else None
        errors.append(
            SemanticError(
                code="INVALID_COUNT_LABEL_PATTERN",
                message=_pattern_err,
                path="countLabelPattern",
                span=span,
            )
        )
        _count_pattern = DEFAULT_COUNT_PATTERN
    _expose_counts = raw.get("exposeCounts", True)
    if not isinstance(_expose_counts, bool):
        span = source_map.get("exposeCounts") if source_map else None
        errors.append(
            SemanticError(
                code="INVALID_EXPOSE_COUNTS",
                message="exposeCounts must be a boolean (true/false)",
                path="exposeCounts",
                span=span,
            )
        )
        _expose_counts = True

    # Names of synthesized count measures (name == resolved count label,
    # e.g. "Sales Count"). These are valid measure references (metrics may
    # target them) even though they are not declared — they are materialized
    # on read via ``effective_measures`` (see models/synthesis.py). Declared
    # measures already sit in ``measures``; a declared count of the same
    # name overrides synthesis, so unioning is safe either way.
    synthesized_measure_names: set[str] = (
        {
            count_label(key, obj, _count_pattern)
            for key, obj in data_objects.items()
            if obj.countable
        }
        if _expose_counts
        else set()
    )

    # Parse metrics
    metrics: dict[str, Metric] = {}
    raw_metrics = raw.get("metrics", {})
    if not isinstance(raw_metrics, dict):
        errors.append(
            SemanticError(
                code="METRIC_PARSE_ERROR",
                message="'metrics' must be a YAML mapping, not a list or scalar",
                path="metrics",
            )
        )
        raw_metrics = {}
    for name, raw_metric in raw_metrics.items():
        try:
            _check_unknown_keys(raw_metric, _METRIC_KEYS, f"metrics.{name}", errors, source_map)
            raw_pop_block = raw_metric.get("periodOverPeriod")
            if isinstance(raw_pop_block, dict):
                _check_unknown_keys(
                    raw_pop_block,
                    _PERIOD_OVER_PERIOD_KEYS,
                    f"metrics.{name}.periodOverPeriod",
                    errors,
                    source_map,
                )
            metric_type = raw_metric.get("type", "derived")

            if metric_type == MetricType.CUMULATIVE:
                # Cumulative metric: validate measure reference exists
                ref_measure = raw_metric.get("measure", "")
                if (
                    ref_measure
                    and ref_measure not in measures
                    and ref_measure not in synthesized_measure_names
                ):
                    span = source_map.get(f"metrics.{name}.measure") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_MEASURE",
                            message=(
                                f"Cumulative metric '{name}' references "
                                f"unknown measure '{ref_measure}'"
                            ),
                            path=f"metrics.{name}.measure",
                            span=span,
                        )
                    )

                # Validate timeDimension references a known dimension
                cum_time_dim = raw_metric.get("timeDimension", "")
                if cum_time_dim and cum_time_dim not in dimensions:
                    span = (
                        source_map.get(f"metrics.{name}.timeDimension") if source_map else None
                    )
                    errors.append(
                        SemanticError(
                            code="CUMULATIVE_UNKNOWN_TIME_DIMENSION",
                            message=(
                                f"Cumulative metric '{name}' references "
                                f"unknown time dimension '{cum_time_dim}'"
                            ),
                            path=f"metrics.{name}.timeDimension",
                            span=span,
                            suggestions=_suggest_similar(cum_time_dim, list(dimensions.keys())),
                        )
                    )

                metrics[name] = Metric(
                    label=name,
                    type=MetricType.CUMULATIVE,
                    measure=raw_metric.get("measure"),
                    time_dimension=raw_metric.get("timeDimension"),
                    cumulative_type=raw_metric.get("cumulativeType", "sum"),
                    window=raw_metric.get("window"),
                    grain_to_date=raw_metric.get("grainToDate"),
                    partition_by=list(raw_metric.get("partitionBy", []) or []),
                    data_type=raw_metric.get("dataType"),
                    description=raw_metric.get("description"),
                    format=raw_metric.get("format"),
                    owner=raw_metric.get("owner"),
                    synonyms=raw_metric.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_metric),
                )
            elif metric_type == MetricType.PERIOD_OVER_PERIOD:
                # Period-over-period metric: validate expression + PoP config
                expression = raw_metric.get("expression", "")
                self._validate_metric_expression_refs(
                    name,
                    expression,
                    measures,
                    errors,
                    source_map,
                    metrics,
                    synthesized_measure_names,
                )

                raw_pop = raw_metric.get("periodOverPeriod")
                if not raw_pop:
                    span = source_map.get(f"metrics.{name}") if source_map else None
                    errors.append(
                        SemanticError(
                            code="METRIC_PARSE_ERROR",
                            message=(
                                f"Period-over-period metric '{name}' "
                                f"requires 'periodOverPeriod' configuration"
                            ),
                            path=f"metrics.{name}",
                            span=span,
                        )
                    )
                    raw_pop = {}

                # Validate time dimension reference
                pop_time_dim = raw_pop.get("timeDimension", "")
                if pop_time_dim and pop_time_dim not in dimensions:
                    span = (
                        source_map.get(f"metrics.{name}.periodOverPeriod")
                        if source_map
                        else None
                    )
                    errors.append(
                        SemanticError(
                            code="POP_UNKNOWN_TIME_DIMENSION",
                            message=(
                                f"Period-over-period metric '{name}' references "
                                f"unknown time dimension '{pop_time_dim}'"
                            ),
                            path=f"metrics.{name}.periodOverPeriod.timeDimension",
                            span=span,
                            suggestions=_suggest_similar(pop_time_dim, list(dimensions.keys())),
                        )
                    )

                pop_config = PeriodOverPeriod(
                    time_dimension=raw_pop.get("timeDimension", ""),
                    grain=raw_pop.get("grain", "month"),
                    offset=raw_pop.get("offset", -1),
                    offset_grain=raw_pop.get("offsetGrain", "year"),
                    comparison=raw_pop.get("comparison", "percentChange"),
                )

                metrics[name] = Metric(
                    label=name,
                    type=MetricType.PERIOD_OVER_PERIOD,
                    expression=expression,
                    period_over_period=pop_config,
                    data_type=raw_metric.get("dataType"),
                    description=raw_metric.get("description"),
                    format=raw_metric.get("format"),
                    owner=raw_metric.get("owner"),
                    synonyms=raw_metric.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_metric),
                )
            elif metric_type == MetricType.WINDOW:
                # Window metric (rank/lag/lead/ntile/first_value/last_value)
                ref_measure = raw_metric.get("measure")
                if (
                    ref_measure
                    and ref_measure not in measures
                    and ref_measure not in synthesized_measure_names
                ):
                    span = source_map.get(f"metrics.{name}.measure") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_MEASURE",
                            message=(
                                f"Window metric '{name}' references "
                                f"unknown measure '{ref_measure}'"
                            ),
                            path=f"metrics.{name}.measure",
                            span=span,
                        )
                    )

                win_time_dim = raw_metric.get("timeDimension", "")
                if win_time_dim and win_time_dim not in dimensions:
                    span = (
                        source_map.get(f"metrics.{name}.timeDimension") if source_map else None
                    )
                    errors.append(
                        SemanticError(
                            code="WINDOW_UNKNOWN_TIME_DIMENSION",
                            message=(
                                f"Window metric '{name}' references "
                                f"unknown time dimension '{win_time_dim}'"
                            ),
                            path=f"metrics.{name}.timeDimension",
                            span=span,
                            suggestions=_suggest_similar(win_time_dim, list(dimensions.keys())),
                        )
                    )

                metrics[name] = Metric(
                    label=name,
                    type=MetricType.WINDOW,
                    measure=ref_measure,
                    time_dimension=raw_metric.get("timeDimension"),
                    window_function=raw_metric.get("windowFunction"),
                    offset=raw_metric.get("offset"),
                    buckets=raw_metric.get("buckets"),
                    order_direction=raw_metric.get("orderDirection", "desc"),
                    default_value=raw_metric.get("defaultValue"),
                    partition_by=list(raw_metric.get("partitionBy", []) or []),
                    data_type=raw_metric.get("dataType"),
                    description=raw_metric.get("description"),
                    format=raw_metric.get("format"),
                    owner=raw_metric.get("owner"),
                    synonyms=raw_metric.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_metric),
                )
            else:
                # Derived metric (default)
                expression = raw_metric.get("expression", "")
                self._validate_metric_expression_refs(
                    name,
                    expression,
                    measures,
                    errors,
                    source_map,
                    metrics,
                    synthesized_measure_names,
                )

                metrics[name] = Metric(
                    label=name,
                    expression=expression,
                    data_type=raw_metric.get("dataType"),
                    description=raw_metric.get("description"),
                    format=raw_metric.get("format"),
                    owner=raw_metric.get("owner"),
                    synonyms=raw_metric.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_metric),
                )
        except Exception as e:
            span = source_map.get(f"metrics.{name}") if source_map else None
            errors.append(
                SemanticError(
                    code="METRIC_PARSE_ERROR",
                    message=f"Failed to parse metric '{name}': {e}",
                    path=f"metrics.{name}",
                    span=span,
                )
            )

    # Parse static model filters
    model_filters: list[ModelFilter] = []
    raw_filters = raw.get("filters", [])
    if not isinstance(raw_filters, list):
        errors.append(
            SemanticError(
                code="FILTER_PARSE_ERROR",
                message="'filters' must be a YAML list, not a mapping or scalar",
                path="filters",
            )
        )
        raw_filters = []
    for i, rf in enumerate(raw_filters):
        try:
            _check_unknown_keys(rf, _MODEL_FILTER_KEYS, f"filters[{i}]", errors, source_map)
            obj_name = rf.get("dataObject", "")
            col_name = rf.get("column", "")
            if obj_name and obj_name not in data_objects:
                span = source_map.get(f"filters[{i}]") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_FILTER_DATA_OBJECT",
                        message=(
                            f"Static filter[{i}] references unknown data object '{obj_name}'"
                        ),
                        path=f"filters[{i}]",
                        span=span,
                    )
                )
            elif obj_name and col_name and col_name not in data_objects[obj_name].columns:
                span = source_map.get(f"filters[{i}]") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_FILTER_COLUMN",
                        message=(
                            f"Static filter[{i}] references unknown column "
                            f"'{col_name}' in data object '{obj_name}'"
                        ),
                        path=f"filters[{i}]",
                        span=span,
                    )
                )
            raw_val = rf.get("value")
            raw_vals = rf.get("values", [])
            model_filters.append(
                ModelFilter(
                    data_object=obj_name,
                    column=col_name,
                    operator=rf.get("operator", "equals"),
                    value=_coerce_filter_value(raw_val),
                    values=[_coerce_filter_value(v) for v in raw_vals],
                )
            )
        except Exception as e:
            span = source_map.get(f"filters[{i}]") if source_map else None
            errors.append(
                SemanticError(
                    code="FILTER_PARSE_ERROR",
                    message=f"Failed to parse static filter[{i}]: {e}",
                    path=f"filters[{i}]",
                    span=span,
                )
            )

    settings = _parse_settings(raw.get("settings"), errors, source_map)

    # Parse examples block (PLAN_agent_api_improvements §5)
    examples = self._parse_examples(raw.get("examples"), errors)

    model = SemanticModel(
        version=raw.get("version", 1.0),
        name=raw.get("name"),
        description=raw.get("description"),
        data_objects=data_objects,
        dimensions=dimensions,
        measures=measures,
        metrics=metrics,
        filters=model_filters,
        examples=examples,
        extends_sources=raw.get("_extends_sources", []),
        inherits_source=raw.get("_inherits_source"),
        owner=raw.get("owner"),
        # Sanitized above so an invalid value is a structured error, not a
        # ValidationError raised here.
        expose_counts=_expose_counts,
        count_label_pattern=_count_pattern,
        custom_extensions=_parse_extensions(raw, "", errors, source_map),
        settings=settings,
    )

    result = ValidationResult(
        valid=len(errors) == 0,
        errors=errors,
        warnings=warnings,
    )

    return model, result

Semantic Validator¶

`orionbelt.parser.validator.SemanticValidator` ¶

Validates semantic rules from spec §3.8.

Source code in src/orionbelt/parser/validator.py

class SemanticValidator:
    """Validates semantic rules from spec §3.8."""

    def validate(self, model: SemanticModel) -> list[SemanticError]:
        """Run every semantic check on ``model`` and return all errors found.

        The checks run in the fixed order listed below, and each check's
        errors are appended to the result in that same order. An empty list
        means no check reported a problem.
        """
        checks = (
            self._check_unique_identifiers,
            self._check_unique_column_names,
            self._check_secondary_joins,
            self._check_no_cyclic_joins,
            self._check_no_multipath_joins,
            self._check_measures_resolve,
            self._check_join_targets_exist,
            self._check_references_resolve,
            self._check_num_class_on_numeric_columns,
            self._check_time_grain_on_temporal_columns,
            self._check_measure_filter_refs,
            self._check_via_reachability,
            self._check_missing_via,
        )
        collected: list[SemanticError] = []
        for check in checks:
            collected.extend(check(model))
        return collected

    def _check_unique_identifiers(self, model: SemanticModel) -> list[SemanticError]:
        """Ensure no duplicate names across dimensions, measures, and metrics.

        Data object names live in a separate namespace — a dimension may share
        its name with a data object (e.g. dimension "Region" on data object "Region").
        """
        errors: list[SemanticError] = []
        all_names: dict[str, str] = {}  # name -> type

        def _register(name: str, kind: str, path: str) -> None:
            existing = all_names.get(name)
            if existing is not None:
                errors.append(
                    SemanticError(
                        code="DUPLICATE_IDENTIFIER",
                        message=(
                            f"{kind.title()} '{name}' conflicts with existing {existing} '{name}'"
                        ),
                        path=path,
                    )
                )
            all_names[name] = kind

        for name in model.dimensions:
            _register(name, "dimension", f"dimensions.{name}")

        for name in model.measures:
            _register(name, "measure", f"measures.{name}")

        for name in model.metrics:
            _register(name, "metric", f"metrics.{name}")

        # Synthesized count measures (name == resolved count label, e.g.
        # "Sales Count") occupy the measure namespace too (models/synthesis.py).
        # A declared measure of the same name is the intended override (D4) and
        # is fine; but a dimension or metric with that name would be shadowed by
        # the synthesized measure at query time, so reject the collision. Two
        # countable objects that resolve to the same count name also collide.
        if getattr(model, "expose_counts", True):
            pattern = model_count_pattern(model)
            seen_counts: dict[str, str] = {}  # count name -> data object key
            for obj_key, obj in model.data_objects.items():
                if not obj.countable:
                    continue
                cid = count_label(obj_key, obj, pattern)
                clashing = all_names.get(cid)
                if clashing in ("dimension", "metric"):
                    errors.append(
                        SemanticError(
                            code="DUPLICATE_IDENTIFIER",
                            message=(
                                f"{str(clashing).title()} '{cid}' conflicts with the synthesized "
                                f"count measure for data object '{obj_key}'. Rename it, set "
                                f"'countLabel'/'countLabelPattern', or 'countable: false'."
                            ),
                            path=f"{clashing}s.{cid}",
                        )
                    )
                elif cid in seen_counts:
                    errors.append(
                        SemanticError(
                            code="DUPLICATE_IDENTIFIER",
                            message=(
                                f"Data objects '{seen_counts[cid]}' and '{obj_key}' both "
                                f"synthesize a count measure named '{cid}'. Give one a distinct "
                                f"'countLabel' or set 'countable: false'."
                            ),
                            path=f"dataObjects.{obj_key}.countLabel",
                        )
                    )
                else:
                    seen_counts[cid] = obj_key

        return errors

    def _check_unique_column_names(self, model: SemanticModel) -> list[SemanticError]:
        """Column names must be unique within each data object.

        Duplicate YAML keys are now rejected at parse time by TrackedLoader
        (``allow_duplicate_keys = False``). This validator is retained as a
        structural hook in case models are constructed programmatically.
        """
        return []

    def _check_secondary_joins(self, model: SemanticModel) -> list[SemanticError]:
        """Validate secondary join constraints.

        - Every secondary join MUST have a pathName.
        - pathName must be unique per (source, target) pair.
        """
        errors: list[SemanticError] = []
        # Track pathName per (source, target) pair
        path_names: dict[tuple[str, str], set[str]] = {}

        for obj_name, obj in model.data_objects.items():
            for i, join in enumerate(obj.joins):
                if join.secondary and not join.path_name:
                    errors.append(
                        SemanticError(
                            code="SECONDARY_JOIN_MISSING_PATH_NAME",
                            message=(
                                f"Data object '{obj_name}' join[{i}] is secondary "
                                f"but has no pathName"
                            ),
                            path=f"dataObjects.{obj_name}.joins[{i}]",
                        )
                    )
                if join.path_name:
                    pair = (obj_name, join.join_to)
                    if pair not in path_names:
                        path_names[pair] = set()
                    if join.path_name in path_names[pair]:
                        errors.append(
                            SemanticError(
                                code="DUPLICATE_JOIN_PATH_NAME",
                                message=(
                                    f"Data object '{obj_name}' join[{i}] has duplicate "
                                    f"pathName '{join.path_name}' for target '{join.join_to}'"
                                ),
                                path=f"dataObjects.{obj_name}.joins[{i}]",
                            )
                        )
                    else:
                        path_names[pair].add(join.path_name)

        return errors

    def _check_no_cyclic_joins(self, model: SemanticModel) -> list[SemanticError]:
        """Detect cyclic join paths."""
        errors: list[SemanticError] = []

        # Build adjacency list from joins (skip secondary joins)
        adj: dict[str, set[str]] = {}
        for obj_name, obj in model.data_objects.items():
            if obj_name not in adj:
                adj[obj_name] = set()
            for join in obj.joins:
                if not join.secondary:
                    adj[obj_name].add(join.join_to)

        # Iterative DFS cycle detection (avoids RecursionError on large models)
        visited: set[str] = set()
        rec_stack: set[str] = set()

        for start in adj:
            if start in visited:
                continue
            stack: list[tuple[str, list[str]]] = [(start, iter(adj.get(start, set())))]  # type: ignore[list-item]
            path: list[str] = [start]
            visited.add(start)
            rec_stack.add(start)

            while stack:
                node, neighbors = stack[-1]
                advanced = False
                for neighbor in neighbors:
                    if neighbor not in visited:
                        visited.add(neighbor)
                        rec_stack.add(neighbor)
                        path.append(neighbor)
                        stack.append((neighbor, iter(adj.get(neighbor, set()))))  # type: ignore[arg-type]
                        advanced = True
                        break
                    elif neighbor in rec_stack:
                        if neighbor in path:
                            cycle = path[path.index(neighbor) :] + [neighbor]
                        else:
                            cycle = [node, neighbor]
                        errors.append(
                            SemanticError(
                                code="CYCLIC_JOIN",
                                message=f"Cyclic join detected: {' -> '.join(cycle)}",
                                path=f"dataObjects.{node}.joins",
                            )
                        )
                if not advanced:
                    stack.pop()
                    rec_stack.discard(node)
                    if path:
                        path.pop()

        return errors

    def _check_no_multipath_joins(self, model: SemanticModel) -> list[SemanticError]:
        """Report node pairs that are connected by more than one primary join path.

        For every data object with outgoing primary (non-secondary) joins a
        breadth-first walk records which predecessor first reached each node.
        Reaching an already-seen node from a *different* predecessor means two
        distinct paths exist, and a ``MULTIPATH_JOIN`` error is emitted once
        per (start, target) pair.

        A target that the start joins to directly is never reported: the
        direct join is treated as the canonical path, so an additional
        indirect route (e.g. Purchases→Suppliers direct plus
        Purchases→Products→Suppliers) is not considered ambiguous.
        """
        # Outgoing primary-join targets per data object, in declaration order.
        outgoing: dict[str, list[str]] = {
            name: [join.join_to for join in obj.joins if not join.secondary]
            for name, obj in model.data_objects.items()
        }

        problems: list[SemanticError] = []
        for origin, first_hops in outgoing.items():
            if not first_hops:
                continue

            # Seed the walk with the origin's direct neighbours (self-joins ignored).
            reached_via: dict[str, str] = {}
            frontier: deque[str] = deque()
            for hop in first_hops:
                if hop != origin and hop not in reached_via:
                    reached_via[hop] = origin
                    frontier.append(hop)
            direct = set(reached_via)

            # Targets already reported for this origin (one error per pair).
            flagged: set[str] = set()
            while frontier:
                current = frontier.popleft()
                for nxt in outgoing.get(current, ()):
                    if nxt == origin:
                        continue
                    if nxt not in reached_via:
                        reached_via[nxt] = current
                        frontier.append(nxt)
                        continue
                    # Same predecessor again is just a duplicate edge; a direct
                    # neighbour of the origin has a canonical path already.
                    if reached_via[nxt] == current or nxt in direct or nxt in flagged:
                        continue
                    flagged.add(nxt)
                    problems.append(
                        SemanticError(
                            code="MULTIPATH_JOIN",
                            message=(
                                f"Multiple join paths from '{origin}' to "
                                f"'{nxt}' (via '{reached_via[nxt]}' "
                                f"and '{current}'). "
                                f"Join paths must be unambiguous."
                            ),
                            path=f"dataObjects.{origin}.joins",
                        )
                    )

        return problems

    def _check_measures_resolve(self, model: SemanticModel) -> list[SemanticError]:
        """Check that every measure column reference points at a real column.

        A reference without a data object name is skipped. An unknown data
        object yields ``UNKNOWN_DATA_OBJECT``; a known data object with a
        non-empty but unknown column name yields ``UNKNOWN_COLUMN``.
        """
        found: list[SemanticError] = []
        objects = model.data_objects
        for measure_name, measure in model.measures.items():
            for idx, ref in enumerate(measure.columns):
                view, column = ref.view, ref.column
                if not view:
                    continue
                location = f"measures.{measure_name}.columns[{idx}]"
                if view not in objects:
                    found.append(
                        SemanticError(
                            code="UNKNOWN_DATA_OBJECT",
                            message=(
                                f"Measure '{measure_name}' column[{idx}] references "
                                f"unknown data object '{view}'"
                            ),
                            path=location,
                        )
                    )
                    continue
                # Nothing more to verify when no column is named or it exists.
                if not column or column in objects[view].columns:
                    continue
                found.append(
                    SemanticError(
                        code="UNKNOWN_COLUMN",
                        message=(
                            f"Measure '{measure_name}' column[{idx}] references "
                            f"unknown column '{column}' in data object '{view}'"
                        ),
                        path=location,
                    )
                )
        return found

    def _check_join_targets_exist(self, model: SemanticModel) -> list[SemanticError]:
        """Check each join's column lists and its target data object.

        Per join, in this order:

        * ``EMPTY_JOIN_COLUMNS`` when either column list is empty, otherwise
          ``JOIN_COLUMN_COUNT_MISMATCH`` when the two lists differ in length;
        * ``UNKNOWN_JOIN_TARGET`` when ``join_to`` is not a data object;
        * otherwise one ``UNKNOWN_JOIN_COLUMN`` per ``columnsFrom`` entry
          missing on the source object, then one per ``columnsTo`` entry
          missing on the target object.
        """
        errors: list[SemanticError] = []
        for obj_name, obj in model.data_objects.items():
            for i, join in enumerate(obj.joins):
                join_path = f"dataObjects.{obj_name}.joins[{i}]"

                # Shape of the column lists (independent of target existence).
                if not (join.columns_from and join.columns_to):
                    errors.append(
                        SemanticError(
                            code="EMPTY_JOIN_COLUMNS",
                            message=(
                                f"Data object '{obj_name}' join[{i}] to "
                                f"'{join.join_to}' has empty join columns"
                            ),
                            path=join_path,
                        )
                    )
                elif len(join.columns_from) != len(join.columns_to):
                    errors.append(
                        SemanticError(
                            code="JOIN_COLUMN_COUNT_MISMATCH",
                            message=(
                                f"Data object '{obj_name}' join[{i}] has "
                                f"{len(join.columns_from)} columnsFrom and "
                                f"{len(join.columns_to)} columnsTo"
                            ),
                            path=join_path,
                        )
                    )

                # Without a known target the column checks cannot run.
                if join.join_to not in model.data_objects:
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_JOIN_TARGET",
                            message=(
                                f"Data object '{obj_name}' join[{i}] references "
                                f"unknown data object '{join.join_to}'"
                            ),
                            path=join_path,
                        )
                    )
                    continue

                errors.extend(
                    SemanticError(
                        code="UNKNOWN_JOIN_COLUMN",
                        message=(
                            f"Data object '{obj_name}' join[{i}] columnsFrom "
                            f"references unknown column '{missing}'"
                        ),
                        path=f"dataObjects.{obj_name}.joins[{i}].columnsFrom",
                    )
                    for missing in join.columns_from
                    if missing not in obj.columns
                )
                target_columns = model.data_objects[join.join_to].columns
                errors.extend(
                    SemanticError(
                        code="UNKNOWN_JOIN_COLUMN",
                        message=(
                            f"Data object '{obj_name}' join[{i}] columnsTo "
                            f"references unknown column '{missing}' "
                            f"in data object '{join.join_to}'"
                        ),
                        path=f"dataObjects.{obj_name}.joins[{i}].columnsTo",
                    )
                    for missing in join.columns_to
                    if missing not in target_columns
                )
        return errors

    def _check_references_resolve(self, model: SemanticModel) -> list[SemanticError]:
        """Check that each dimension points at an existing data object column.

        Dimensions without a data object name are skipped. An unknown data
        object yields ``UNKNOWN_DATA_OBJECT``; a known one with a non-empty
        but unknown column yields ``UNKNOWN_COLUMN``.
        """
        found: list[SemanticError] = []
        objects = model.data_objects
        for dim_name, dim in model.dimensions.items():
            view, column = dim.view, dim.column
            if not view:
                continue
            if view not in objects:
                found.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=f"Dimension '{dim_name}' references unknown data object '{view}'",
                        path=f"dimensions.{dim_name}",
                    )
                )
                continue
            # An empty column name or an existing column needs no report.
            if not column or column in objects[view].columns:
                continue
            found.append(
                SemanticError(
                    code="UNKNOWN_COLUMN",
                    message=(
                        f"Dimension '{dim_name}' references unknown column "
                        f"'{column}' in data object '{view}'"
                    ),
                    path=f"dimensions.{dim_name}",
                )
            )
        return found

    # Abstract column types on which ``numClass`` may be declared; consulted by
    # ``_check_num_class_on_numeric_columns``.
    _NUMERIC_TYPES = {DataType.INT, DataType.FLOAT}
    # Abstract column types a dimension's ``timeGrain`` may be applied to;
    # consulted by ``_check_time_grain_on_temporal_columns``.
    _TIME_GRAIN_TYPES = {DataType.DATE, DataType.TIMESTAMP, DataType.TIMESTAMP_TZ}

    def _check_time_grain_on_temporal_columns(self, model: SemanticModel) -> list[SemanticError]:
        """Reject ``timeGrain`` on dimensions whose column is not temporal.

        ``timeGrain`` compiles to ``date_trunc(grain, column)``, which fails at
        runtime unless the column's abstractType is date, timestamp, or
        timestamp_tz. Flagging it here surfaces the problem at model-load time
        instead of on the first query.

        Dimensions whose data object or column cannot be resolved are skipped;
        ``_check_references_resolve`` reports those.
        """
        errors: list[SemanticError] = []
        for name, dim in model.dimensions.items():
            grain = dim.time_grain
            if grain is None:
                continue
            view, column = dim.view, dim.column
            if not (view and column):
                continue
            owner = model.data_objects.get(view)
            if owner is None or column not in owner.columns:
                # Unresolvable reference: reported by _check_references_resolve.
                continue
            abstract = owner.columns[column].abstract_type
            if abstract in self._TIME_GRAIN_TYPES:
                continue
            errors.append(
                SemanticError(
                    code="TIME_GRAIN_ON_NON_TEMPORAL",
                    message=(
                        f"Dimension '{name}' has timeGrain "
                        f"'{grain.value}' but underlying column "
                        f"'{view}.{column}' has abstractType "
                        f"'{abstract.value}'. timeGrain requires "
                        f"the column to be date, timestamp, or timestamp_tz. "
                        f"Drop timeGrain, fix the column's abstractType, or "
                        f"define a computed column with to_date()."
                    ),
                    path=f"dimensions.{name}",
                )
            )
        return errors

    def _check_num_class_on_numeric_columns(self, model: SemanticModel) -> list[SemanticError]:
        """Ensure numClass is only set on numeric columns (int or float).

        Every column of every data object is inspected. A column with a truthy
        ``num_class`` whose ``abstract_type`` is not in ``_NUMERIC_TYPES``
        produces one ``NUM_CLASS_ON_NON_NUMERIC`` error located at
        ``dataObjects.<object>.columns.<column>``. Returns the collected
        errors (empty when all columns are consistent).
        """
        errors: list[SemanticError] = []
        for obj_name, obj in model.data_objects.items():
            for col_name, col in obj.columns.items():
                if col.num_class and col.abstract_type not in self._NUMERIC_TYPES:
                    errors.append(
                        SemanticError(
                            code="NUM_CLASS_ON_NON_NUMERIC",
                            message=(
                                f"Column '{col_name}' in data object '{obj_name}' "
                                f"has numClass '{col.num_class}' but abstractType "
                                # Render the enum's value explicitly: f-string
                                # formatting of a mixed-in Enum member changed in
                                # Python 3.11 and may print ``DataType.X``. This
                                # matches _check_time_grain_on_temporal_columns.
                                f"'{col.abstract_type.value}' is not numeric (int or float)"
                            ),
                            path=f"dataObjects.{obj_name}.columns.{col_name}",
                        )
                    )
        return errors

    def _check_measure_filter_refs(self, model: SemanticModel) -> list[SemanticError]:
        """Validate the data object / column references of every measure filter.

        Each top-level filter item of each measure is handed to
        ``_validate_filter_item``, which appends its findings to the shared
        list returned here.
        """
        collected: list[SemanticError] = []
        for measure_name, measure in model.measures.items():
            for filter_item in measure.filters:
                self._validate_filter_item(filter_item, model, measure_name, collected)
        return collected

    def _validate_filter_item(
        self,
        item: MeasureFilterItem,
        model: SemanticModel,
        meas_name: str,
        errors: list[SemanticError],
    ) -> None:
        """Validate one measure filter item, descending into filter groups.

        Findings are appended to ``errors`` in place; nothing is returned.
        A leaf filter without a column reference or without a data object name
        is ignored. An unknown data object yields ``UNKNOWN_FILTER_DATA_OBJECT``
        and a named but unknown column yields ``UNKNOWN_FILTER_COLUMN``.
        """
        if not isinstance(item, MeasureFilter):
            # Groups carry no reference of their own; validate their children.
            if isinstance(item, MeasureFilterGroup):
                for nested in item.filters:
                    self._validate_filter_item(nested, model, meas_name, errors)
            return

        ref = item.column
        if not ref or not ref.view:
            return

        owner = model.data_objects.get(ref.view)
        if not owner:
            errors.append(
                SemanticError(
                    code="UNKNOWN_FILTER_DATA_OBJECT",
                    message=(
                        f"Measure '{meas_name}' filter references unknown "
                        f"data object '{ref.view}'"
                    ),
                    path=f"measures.{meas_name}.filters",
                )
            )
            return

        if ref.column and ref.column not in owner.columns:
            errors.append(
                SemanticError(
                    code="UNKNOWN_FILTER_COLUMN",
                    message=(
                        f"Measure '{meas_name}' filter references unknown "
                        f"column '{ref.column}' in '{ref.view}'"
                    ),
                    path=f"measures.{meas_name}.filters",
                )
            )

    def _build_directed_graph(self, model: SemanticModel) -> nx.DiGraph[str]:
        """Return the directed join graph of the model's data objects.

        Every data object becomes a node. An edge ``source -> target`` is added
        for each non-secondary join whose target is itself a data object of the
        model; secondary joins and joins to unknown targets contribute nothing.
        """
        objects = model.data_objects
        graph: nx.DiGraph[str] = nx.DiGraph()
        graph.add_nodes_from(objects)
        graph.add_edges_from(
            (source, join.join_to)
            for source, obj in objects.items()
            for join in obj.joins
            if not join.secondary and join.join_to in objects
        )
        return graph

    def _check_via_reachability(self, model: SemanticModel) -> list[SemanticError]:
        """Check that every dimension with ``via`` can reach its data object from it.

        Only dimensions that set ``via`` are examined. ``INVALID_VIA_DATA_OBJECT``
        is reported when ``via`` names an unknown data object, or when the
        dimension's data object is not a descendant of ``via`` in the primary
        join graph. A ``via`` equal to the dimension's own data object is
        accepted without a graph lookup.
        """
        errors: list[SemanticError] = []
        via_dims = {name: dim for name, dim in model.dimensions.items() if dim.via}
        if not via_dims:
            return errors

        graph = self._build_directed_graph(model)
        for name, dim in via_dims.items():
            anchor = dim.via
            if anchor not in model.data_objects:
                errors.append(
                    SemanticError(
                        code="INVALID_VIA_DATA_OBJECT",
                        message=(
                            f"Dimension '{name}': via references unknown data object '{dim.via}'"
                        ),
                        path=f"dimensions.{name}",
                    )
                )
                continue
            if anchor == dim.view:
                continue
            downstream = nx.descendants(graph, anchor) if anchor in graph else set()
            if dim.view in downstream:
                continue
            errors.append(
                SemanticError(
                    code="INVALID_VIA_DATA_OBJECT",
                    message=(
                        f"Dimension '{name}': data object '{dim.view}' is not "
                        f"reachable from via data object '{dim.via}'"
                    ),
                    path=f"dimensions.{name}",
                )
            )
        return errors

    def _check_missing_via(self, model: SemanticModel) -> list[SemanticError]:
        """Warn about dimensions whose target is joined directly by several fact tables.

        A fact table is any data object referenced by at least one measure
        column. Only one-hop (direct) primary joins from a fact table to the
        dimension's target count; transitive reachability through other fact
        tables is not treated as ambiguity. Dimensions that already set
        ``via``, or whose target is itself a fact table, are skipped.

        Path-invariance heuristic: when every reaching fact joins to the
        target on the target's primary key (see ``_is_path_invariant``), the
        attribute value is the same whichever fact drives the join, so no
        warning is raised. Role-playing semantics remain a modelling choice
        made with an explicit ``via:`` per dimension.

        Findings are ``MISSING_VIA`` entries with ``severity="warning"``.
        """
        findings: list[SemanticError] = []

        measure_sources: set[str] = {
            ref.view
            for measure in model.measures.values()
            for ref in measure.columns
            if ref.view
        }
        # With fewer than two fact tables there is nothing to disambiguate.
        if len(measure_sources) < 2:
            return findings

        graph = self._build_directed_graph(model)
        fact_tables = sorted(measure_sources & set(graph.nodes))
        direct_children: dict[str, set[str]] = {
            fact: set(graph.successors(fact)) for fact in fact_tables
        }

        for dim_name, dim in model.dimensions.items():
            target = dim.view
            if dim.via or not target or target not in graph or target in measure_sources:
                continue
            reaching_facts = [fact for fact in fact_tables if target in direct_children[fact]]
            if len(reaching_facts) <= 1 or self._is_path_invariant(model, target, reaching_facts):
                continue

            findings.append(
                SemanticError(
                    code="MISSING_VIA",
                    message=(
                        f"Dimension '{dim_name}' on '{target}' has direct "
                        f"joins from multiple fact tables "
                        f"({', '.join(reaching_facts)}). "
                        f"Consider adding role-playing dimensions with 'via' "
                        f"to disambiguate join paths."
                    ),
                    path=f"dimensions.{dim_name}",
                    severity="warning",
                )
            )
        return findings

    @staticmethod
    def _is_path_invariant(model: SemanticModel, target: str, reaching_facts: list[str]) -> bool:
        """Tell whether all reaching facts join to ``target`` on its primary key only.

        When that holds, the same key value from any fact resolves to the same
        target row, so a dimension attribute has one value regardless of the
        fact driving the join. Joins that touch non-PK target columns can
        resolve to different rows and therefore return False.

        Returns False as well when the target or a fact is not a data object
        of the model, when the target declares no primary-key column, or when
        a fact has no join to the target.
        """
        target_obj = model.data_objects.get(target)
        if target_obj is None:
            return False

        pk_cols = {name for name, column in target_obj.columns.items() if column.primary_key}
        if not pk_cols:
            return False

        for fact_name in reaching_facts:
            fact_obj = model.data_objects.get(fact_name)
            if fact_obj is None:
                return False
            joins_to_target = [join for join in fact_obj.joins if join.join_to == target]
            if not joins_to_target:
                return False
            # Each join needs a non-empty target-side column list made up
            # exclusively of primary-key columns.
            if not all(
                join.columns_to and pk_cols.issuperset(join.columns_to)
                for join in joins_to_target
            ):
                return False

        return True

`validate(model)` ¶

Source code in src/orionbelt/parser/validator.py

def validate(self, model: SemanticModel) -> list[SemanticError]:
    """Run every semantic check on ``model`` and return the combined findings.

    The checks run in a fixed order and their results are concatenated in
    that order. The list can contain warning-severity entries as well as
    errors (``_check_missing_via`` emits ``severity="warning"``).
    """
    checks = (
        self._check_unique_identifiers,
        self._check_unique_column_names,
        self._check_secondary_joins,
        self._check_no_cyclic_joins,
        self._check_no_multipath_joins,
        self._check_measures_resolve,
        self._check_join_targets_exist,
        self._check_references_resolve,
        self._check_num_class_on_numeric_columns,
        self._check_time_grain_on_temporal_columns,
        self._check_measure_filter_refs,
        self._check_via_reachability,
        self._check_missing_via,
    )
    findings: list[SemanticError] = []
    for check in checks:
        findings.extend(check(model))
    return findings

Semantic Model¶

`orionbelt.models.semantic.SemanticModel` ¶

Bases: BaseModel

Complete semantic model parsed from OBML YAML.

Source code in src/orionbelt/models/semantic.py

class SemanticModel(BaseModel):
    """Complete semantic model parsed from OBML YAML.

    Holds the model's data objects, dimensions, measures, metrics and
    model-level settings. Fields with an ``alias`` are populated from the
    camelCase OBML key; ``populate_by_name`` also accepts the Python name, and
    unknown keys are rejected (``extra="forbid"``).
    """

    version: float = 1.0
    name: str | None = Field(
        default=None,
        description=(
            "Optional addressing identifier for multi-model mode (v2.4.0+). "
            "When unset, the multi-model loader uses the filename stem. "
            "After normalization (lowercase + spaces/dots/dashes → "
            "underscores + trim) must match ``^[a-z][a-z0-9_]{0,62}$``. "
            "BI tools select this model via the Flight `database` catalog "
            "or pgwire `database=` URL parameter."
        ),
    )
    description: str | None = None
    settings: ModelSettings | None = None
    data_objects: dict[str, DataObject] = Field(default={}, alias="dataObjects")
    dimensions: dict[str, Dimension] = {}
    measures: dict[str, Measure] = {}
    metrics: dict[str, Metric] = {}
    filters: list[ModelFilter] = Field(default_factory=list)
    examples: list[ModelExample] = Field(default_factory=list)
    extends_sources: list[str] = Field(default_factory=list)
    inherits_source: str | None = None
    owner: str | None = None
    expose_counts: bool = Field(
        True,
        alias="exposeCounts",
        description=(
            "When true (default), synthesize a row-count measure for every countable "
            "data object. Set false to suppress all synthesized counts (e.g. wide "
            "models where N facts would balloon the measure list). Declared measures "
            "are unaffected."
        ),
    )
    count_label_pattern: str = Field(
        "{object} Count",
        alias="countLabelPattern",
        description=(
            "Name/label template for synthesized count measures (the count's id is "
            "its label). The only valid token is ``{object}``, which interpolates each "
            "object's display label (e.g. 'Sales' -> 'Sales Count'). A per-object "
            "``countLabel`` overrides it."
        ),
    )
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @field_validator("count_label_pattern", mode="before")
    @classmethod
    def _validate_count_label_pattern(cls, v: object) -> object:
        """The pattern may reference only the ``{object}`` token.

        Reject any other field access (``{name}``, ``{object.__class__}``, ...) —
        cheap insurance even though OBML is author-controlled. Bare/escaped braces
        and positional ``{}`` are rejected too; only the named ``{object}`` field
        is allowed. Delegates to the shared ``count_pattern_error`` so the OBML
        resolver reports the same rule as a structured error.

        Non-string input is returned unchanged so pydantic's own type
        validation reports it. Raises ``ValueError`` with the message from
        ``count_pattern_error`` when the pattern is rejected.
        """
        from orionbelt.models.synthesis import count_pattern_error

        if not isinstance(v, str):
            return v
        msg = count_pattern_error(v)
        if msg is not None:
            raise ValueError(msg)
        return v

    @property
    def effective_measures(self) -> dict[str, Measure]:
        """Declared measures plus synthesized row-count measures (declared win).

        The single source of truth for the model's queryable measure namespace.
        Synthesized counts are *not* persisted on ``measures`` (they never
        roundtrip through YAML/OSI) — they are computed on demand here so every
        read/resolve surface sees them as ordinary named measures.

        Returns a new dict on every access: declared measures first, followed
        by the synthesized counts whose names are not already declared.
        """
        from orionbelt.models.synthesis import synthesize_count_measures

        merged = dict(self.measures)
        # ``setdefault`` rather than ``update``: on a name collision the
        # declared measure must be kept, as the contract above promises.
        for count_name, count_measure in dict(synthesize_count_measures(self)).items():
            merged.setdefault(count_name, count_measure)
        return merged

    @field_validator("name", mode="before")
    @classmethod
    def _validate_name(cls, v: str | None) -> str | None:
        """Reject invalid names early. Pydantic validators raise ValueError
        which the loader turns into a model-validation error.

        Empty / whitespace-only strings are treated as ``None`` rather than
        passed through, so an empty ``name:`` in YAML falls back to the
        filename stem at startup.

        Returns the normalized name, or ``None`` when no usable name was
        given. Raises ``ValueError`` for non-string input or a name that
        ``normalize_model_name`` rejects.
        """
        if v is None:
            return None
        if not isinstance(v, str):
            raise ValueError("name must be a string")
        if not v.strip():
            return None
        # Use the same normalization pipeline the loader uses, so an OBML
        # `name:` that's invalid surfaces during parse-time rather than
        # only at startup. The normalized value is stored on the model.
        from orionbelt.models.identifiers import (
            ModelNameError,
            normalize_model_name,
        )

        try:
            return normalize_model_name(v, source="OBML `name:` field")
        except ModelNameError as exc:
            # ``from None``: surface only the normalized-name message.
            raise ValueError(str(exc)) from None

`effective_measures` `property` ¶

Declared measures plus synthesized row-count measures (declared win).

The single source of truth for the model's queryable measure namespace. Synthesized counts are not persisted on measures (they never roundtrip through YAML/OSI) — they are computed on demand here so every read/resolve surface sees them as ordinary named measures.

`orionbelt.models.semantic.DataObject` ¶

Bases: BaseModel

A database table or view with its columns and joins.

Source code in src/orionbelt/models/semantic.py

class DataObject(BaseModel):
    """A database table or view with its columns and joins.

    ``database``, ``schema_name`` and ``code`` together form the physical
    reference returned by ``qualified_code``. Fields with an ``alias`` are
    populated from the camelCase OBML key; ``populate_by_name`` also accepts
    the Python name, and unknown keys are rejected (``extra="forbid"``).
    """

    label: str
    code: str
    database: str
    # Populated from the OBML key ``schema``.
    schema_name: str = Field(alias="schema")
    # Column definitions keyed by column name.
    columns: dict[str, DataObjectColumn] = {}
    # Outgoing joins from this data object to other data objects.
    joins: list[DataObjectJoin] = []
    description: str | None = None
    comment: str | None = None
    owner: str | None = None
    countable: bool = Field(
        True,
        description=(
            "When true (default), the model synthesizes a grain-anchored row-count "
            "measure for this data object (name == label, e.g. 'Sales Count'). Set "
            "false to opt out (no count measure is added to the model's measure list)."
        ),
    )
    count_label: str | None = Field(
        None,
        alias="countLabel",
        description=(
            "Optional name/label for this object's synthesized count measure (the "
            "count's id is its label). Overrides the model-level ``countLabelPattern``. "
            "The ``{object}`` token interpolates the object's display label. Ignored "
            "when ``countable`` is false."
        ),
    )
    synonyms: list[str] = Field(default_factory=list)
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")
    refresh: RefreshPolicy | None = Field(
        default=None,
        description=(
            "Optional freshness contract for the physical table this dataObject maps to. "
            "Drives result-cache TTL composition. PLAN_freshness_driven_cache.md §5."
        ),
    )

    @property
    def qualified_code(self) -> str:
        """Full qualified table reference: database.schema.code."""
        return f"{self.database}.{self.schema_name}.{self.code}"

    @model_validator(mode="after")
    def _validate_count_label(self) -> DataObject:
        """``countLabel`` only has an effect when the object is countable.

        Warn (do not error) so a stray override on a non-countable object is
        surfaced without breaking model load. Always returns ``self``
        unchanged.
        """
        if self.count_label is not None and not self.countable:
            # Imported lazily: only needed on this rarely-taken path.
            import warnings

            warnings.warn(
                f"Data object '{self.label}' sets 'countLabel' but 'countable' is false; "
                "the label is ignored because no count measure is synthesized.",
                stacklevel=2,
            )
        return self

    model_config = {"populate_by_name": True, "extra": "forbid"}

`qualified_code` `property` ¶

Full qualified table reference: database.schema.code.

`orionbelt.models.semantic.Dimension` ¶

Bases: BaseModel

A named dimension referencing a data object column.

Source code in src/orionbelt/models/semantic.py

class Dimension(BaseModel):
    """A named dimension referencing a data object column.

    Fields with an ``alias`` are populated from the camelCase OBML key;
    ``populate_by_name`` also accepts the Python name, and unknown keys are
    rejected (``extra="forbid"``).
    """

    label: str
    # Name of the data object the dimension reads from (OBML key ``dataObject``).
    view: str = Field(alias="dataObject")
    # Column on ``view``; defaults to an empty string when not given.
    column: str = ""
    result_type: DataType = Field(DataType.STRING, alias="resultType")
    # Optional truncation grain (OBML key ``timeGrain``).
    time_grain: TimeGrain | None = Field(None, alias="timeGrain")
    description: str | None = None
    format: str | None = None
    # Optional name of the data object through which ``view`` is reached.
    via: str | None = None
    owner: str | None = None
    synonyms: list[str] = Field(default_factory=list)
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")

    model_config = {"populate_by_name": True, "extra": "forbid"}

`orionbelt.models.semantic.Measure` ¶

Bases: BaseModel

An aggregation measure with optional expression template.

Source code in src/orionbelt/models/semantic.py

class Measure(BaseModel):
    """An aggregation measure with optional expression template."""

    # Accept both field names and aliases on input; reject unknown keys.
    model_config = {"populate_by_name": True, "extra": "forbid"}

    label: str
    columns: list[DataColumnRef] = []
    result_type: DataType = Field(default=DataType.FLOAT, alias="resultType")
    aggregation: AggregationType
    expression: str | None = None
    distinct: bool = False
    total: bool = False
    grain: GrainOverride | None = None
    filter_context: FilterContext | None = Field(default=None, alias="filterContext")
    filters: list[MeasureFilterItem] = []
    data_type: str | None = Field(default=None, alias="dataType")
    description: str | None = None
    format: str | None = None
    allow_fan_out: bool = Field(default=False, alias="allowFanOut")
    delimiter: str | None = None
    within_group: WithinGroup | None = Field(default=None, alias="withinGroup")
    owner: str | None = None
    synonyms: list[str] = Field(default_factory=list)
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")

    @field_validator("aggregation", mode="before")
    @classmethod
    def _normalize_aggregation(cls, v: object) -> object:
        """Canonicalise the aggregation name before enum validation.

        String input is lowercased so ``SUM`` / ``Sum`` / ``sum`` all resolve
        to the same member, and the spellings ``agg`` / ``aggregate`` are
        mapped to ``measure``. Non-string input is passed through untouched
        for the enum validation to handle.
        """
        if not isinstance(v, str):
            return v
        canonical = v.lower()
        return "measure" if canonical in ("agg", "aggregate") else canonical

    @field_validator("data_type", mode="before")
    @classmethod
    def _validate_data_type(cls, v: str | None) -> str | None:
        """Run ``parse_data_type`` on a provided ``dataType``; return it unchanged."""
        if v is None:
            return v
        # Called for its validation side effect only; the result is discarded.
        parse_data_type(v)
        return v

    @model_validator(mode="after")
    def _validate_total_grain_exclusion(self) -> Measure:
        """Reject measures that set both ``total: true`` and ``grain``."""
        grain_given = self.grain is not None
        if self.total and grain_given:
            raise ValueError("'total: true' and 'grain' are mutually exclusive")
        return self

    @model_validator(mode="after")
    def _validate_measure_delegation(self) -> Measure:
        """Enforce the restrictions that come with ``aggregation: measure``.

        A delegated measure must not carry ``columns``, ``expression``,
        ``filters`` or ``total: true``. Each is rejected with its own message,
        checked in that order.
        """
        if self.aggregation != AggregationType.MEASURE:
            return self

        delegated = (
            "aggregation: measure delegates resolution to the engine (Databricks Metric View); "
        )
        resolved_by_label = "The engine resolves the measure by its OBML label."

        if self.columns:
            raise ValueError(delegated + "'columns:' must be omitted. " + resolved_by_label)
        if self.expression is not None:
            raise ValueError(delegated + "'expression:' must be omitted. " + resolved_by_label)
        if self.filters:
            raise ValueError(
                delegated
                + "'filters:' is not applicable. "
                + "Define the filter inside the metric view itself."
            )
        if self.total:
            raise ValueError(
                "aggregation: measure cannot be combined with 'total: true' "
                "(OBSL cannot wrap the engine-resolved aggregation in "
                "a window function — define the total at the metric-view level)."
            )
        return self

    @model_validator(mode="after")
    def _validate_statistical_aggregation_arity(self) -> Measure:
        """Check column counts for statistical aggregates at model-load time.

        * Aggregations in ``TWO_COLUMN_AGGREGATIONS`` need exactly two entries
          in ``columns`` and may not use ``expression:``.
        * Aggregations in ``SINGLE_COLUMN_STATISTICAL_AGGREGATIONS`` need
          exactly one entry in ``columns``, unless ``expression:`` is given,
          in which case the column count is not checked.
        """
        agg = self.aggregation.lower()
        needs_pair = agg in TWO_COLUMN_AGGREGATIONS

        if self.expression is not None:
            # With an expression only the two-column family is invalid;
            # everything else skips the arity checks below.
            if needs_pair:
                raise ValueError(
                    f"Aggregation '{agg}' requires exactly 2 columns and cannot be "
                    "combined with 'expression:'. Use the 'columns:' list with two "
                    "entries (define computed columns on the data object if you need "
                    "per-argument transformations) so the aggregate's argument order "
                    "is explicit."
                )
            return self

        column_count = len(self.columns)
        if needs_pair and column_count != 2:
            raise ValueError(
                f"Aggregation '{agg}' requires exactly 2 columns, got {column_count}"
            )
        if agg in SINGLE_COLUMN_STATISTICAL_AGGREGATIONS and column_count != 1:
            raise ValueError(
                f"Aggregation '{agg}' requires exactly 1 column, got {column_count}"
            )
        return self

`orionbelt.models.semantic.Metric` ¶

Bases: BaseModel

A metric: derived expression, cumulative window, or period-over-period comparison.

Derived (default): references measures by name using {[Measure Name]} syntax. Cumulative: applies a window function to an existing measure, ordered by a time dimension. Supports running totals, rolling windows, and grain-to-date resets. Period-over-Period: compares a measure's value against a prior time period using a synthetic date spine. Supports ratio, difference, previous value, and percent change.

Source code in src/orionbelt/models/semantic.py

class Metric(BaseModel):
    """A metric: derived expression, cumulative window, or period-over-period comparison.

    **Derived** (default): references measures by name using ``{[Measure Name]}`` syntax.
    **Cumulative**: applies a window function to an existing measure, ordered by a time
    dimension.  Supports running totals, rolling windows, and grain-to-date resets.
    **Period-over-Period**: compares a measure's value against a prior time period using
    a synthetic date spine.  Supports ratio, difference, previous value, and percent change.

    A fourth kind, **Window** (``MetricType.WINDOW``), is configured through
    ``windowFunction`` and validated in ``_validate_metric_type``.
    """

    label: str
    type: MetricType = MetricType.DERIVED
    # Derived metrics
    expression: str | None = None
    # Cumulative metrics
    measure: str | None = None
    time_dimension: str | None = Field(None, alias="timeDimension")
    cumulative_type: CumulativeAggType = Field(CumulativeAggType.SUM, alias="cumulativeType")
    window: int | None = None
    grain_to_date: GrainToDate | None = Field(None, alias="grainToDate")
    # Per-dimension partitioning for cumulative + window metrics. Each entry
    # must be a model dimension reachable from the measure's source object.
    partition_by: list[str] = Field(default_factory=list, alias="partitionBy")
    # Period-over-Period metrics
    period_over_period: PeriodOverPeriod | None = Field(None, alias="periodOverPeriod")
    # Window metrics (rank / lag / lead / ntile / first_value / last_value)
    window_function: WindowFunctionKind | None = Field(None, alias="windowFunction")
    offset: int | None = None
    buckets: int | None = None
    order_direction: str = Field("desc", alias="orderDirection")
    default_value: str | int | float | bool | None = Field(None, alias="defaultValue")
    # Common
    data_type: str | None = Field(None, alias="dataType")
    description: str | None = None
    format: str | None = None
    owner: str | None = None
    synonyms: list[str] = Field(default_factory=list)
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")

    # Accept both field names and aliases on input; reject unknown keys.
    model_config = {"populate_by_name": True, "extra": "forbid"}

    @field_validator("data_type", mode="before")
    @classmethod
    def _validate_data_type(cls, v: str | None) -> str | None:
        """Run ``parse_data_type`` on a provided ``dataType``; return it unchanged."""
        if v is not None:
            # Called for validation only; the parsed result is discarded.
            parse_data_type(v)
        return v

    @model_validator(mode="after")
    def _validate_metric_type(self) -> Metric:
        """Enforce the field combinations allowed for each ``type``.

        Only the branch matching ``self.type`` runs, and within a branch the
        first failing check raises ``ValueError``, so the order of the checks
        determines which message a caller sees. Fields belonging to other
        metric kinds are rejected only where a branch checks them explicitly.
        """
        if self.type == MetricType.DERIVED:
            if not self.expression:
                raise ValueError("Derived metrics require 'expression'")
            if self.partition_by:
                raise ValueError("Derived metrics must not have 'partitionBy'")
        elif self.type == MetricType.CUMULATIVE:
            if not self.measure:
                raise ValueError("Cumulative metrics require 'measure'")
            if not self.time_dimension:
                raise ValueError("Cumulative metrics require 'timeDimension'")
            if self.expression:
                raise ValueError("Cumulative metrics must not have 'expression'")
            # A cumulative metric uses a rolling window or a grain-to-date
            # reset, never both.
            if self.window is not None and self.grain_to_date is not None:
                raise ValueError("'window' and 'grainToDate' are mutually exclusive")
            if self.window is not None and self.window < 1:
                raise ValueError("'window' must be >= 1")
        elif self.type == MetricType.PERIOD_OVER_PERIOD:
            if not self.expression:
                raise ValueError("Period-over-period metrics require 'expression'")
            if not self.period_over_period:
                raise ValueError("Period-over-period metrics require 'periodOverPeriod'")
            if self.measure:
                raise ValueError(
                    "Period-over-period metrics must not have 'measure' "
                    "(use 'expression' to reference measures)"
                )
            if self.window is not None or self.grain_to_date is not None:
                raise ValueError(
                    "Period-over-period metrics must not have 'window' or 'grainToDate'"
                )
            if self.partition_by:
                raise ValueError("Period-over-period metrics must not have 'partitionBy'")
        elif self.type == MetricType.WINDOW:
            if self.window_function is None:
                raise ValueError("Window metrics require 'windowFunction'")
            if not self.measure and self.window_function not in {
                WindowFunctionKind.ROW_NUMBER,
                WindowFunctionKind.NTILE,
            }:
                # row_number / ntile can rank without an explicit measure, falling back
                # to ordering on the time dimension. All other window functions take
                # the measure as their argument or ORDER BY input.
                raise ValueError(
                    f"Window metric with function '{self.window_function.value}' requires 'measure'"
                )
            if self.expression:
                raise ValueError("Window metrics must not have 'expression'")
            if self.window is not None or self.grain_to_date is not None:
                raise ValueError("Window metrics must not have 'window' or 'grainToDate'")
            # lag / lead need a positive row offset and a time dimension.
            if self.window_function in {WindowFunctionKind.LAG, WindowFunctionKind.LEAD}:
                if self.offset is None or self.offset < 1:
                    raise ValueError(
                        f"Window metric with function '{self.window_function.value}' "
                        f"requires positive 'offset'"
                    )
                if not self.time_dimension:
                    raise ValueError(
                        f"Window metric with function '{self.window_function.value}' "
                        f"requires 'timeDimension'"
                    )
            if self.window_function == WindowFunctionKind.NTILE and (
                self.buckets is None or self.buckets < 2
            ):
                raise ValueError("Window metric with function 'ntile' requires 'buckets' >= 2")
            # Compared case-insensitively; the stored value is not normalised,
            # and the check only runs for window metrics.
            if self.order_direction.lower() not in {"asc", "desc"}:
                raise ValueError("'orderDirection' must be 'asc' or 'desc'")
        return self

Query Models¶

`orionbelt.models.query.QueryObject` ¶

Bases: BaseModel

A complete YAML analytical query.

Source code in src/orionbelt/models/query.py

class QueryObject(BaseModel):
    """A complete YAML analytical query."""

    # Accept both field names and aliases on input; reject unknown keys.
    model_config = {"populate_by_name": True, "extra": "forbid"}

    select: QuerySelect
    where: list[QueryFilterItem] = []
    having: list[QueryFilterItem] = []
    order_by: list[QueryOrderBy] = Field(default=[], alias="orderBy")
    limit: int | None = None
    offset: int | None = None
    use_path_names: list[UsePathName] = Field(default=[], alias="usePathNames")
    dimensions_exclude: bool = Field(default=False, alias="dimensionsExclude")
    grouping: Grouping | None = Field(
        default=None,
        description=(
            "Hierarchical grouping modifier. 'rollup' emits GROUP BY ROLLUP(...) "
            "for hierarchical subtotals + grand total. 'cube' emits GROUP BY CUBE(...) "
            "for the full cross-tab. Adds one GROUPING(dim) AS _g_<dim> column per "
            "selected dimension so callers can distinguish subtotal/grand-total rows."
        ),
    )

    @model_validator(mode="after")
    def _validate_grouping(self) -> QueryObject:
        """Require at least one dimension, and aggregate mode, when grouping is set."""
        if self.grouping is not None:
            selection = self.select
            if selection.is_raw:
                raise ValueError(
                    "select.fields (raw mode) cannot be combined with grouping (rollup/cube)"
                )
            if not selection.dimensions:
                raise ValueError(
                    "grouping (rollup/cube) requires at least one dimension in select.dimensions"
                )
        return self

    @model_validator(mode="after")
    def _validate_raw_mode_exclusivity(self) -> QueryObject:
        """Keep raw mode (``select.fields``) and aggregate features apart.

        In raw mode, ``select.dimensions``, ``select.measures``, ``having`` and
        ``dimensionsExclude`` are rejected. Outside raw mode,
        ``select.distinct`` is rejected.
        """
        selection = self.select

        if not selection.is_raw:
            # Aggregate mode: DISTINCT is the only raw-only switch to police.
            if selection.distinct:
                raise ValueError("select.distinct is only valid in raw mode (with select.fields)")
            return self

        if selection.dimensions:
            raise ValueError("select.fields (raw mode) cannot be combined with select.dimensions")
        if selection.measures:
            raise ValueError("select.fields (raw mode) cannot be combined with select.measures")
        if self.having:
            raise ValueError("select.fields (raw mode) cannot be combined with having")
        if self.dimensions_exclude:
            raise ValueError("select.fields (raw mode) cannot be combined with dimensionsExclude")
        return self

`orionbelt.models.query.QuerySelect` ¶

Bases: BaseModel

The SELECT part of a query.

Two mutually exclusive modes:

Aggregate mode (default): dimensions + measures produce a grouped, aggregated result (GROUP BY dimensions, aggregate measures).
Raw mode: fields returns un-aggregated rows from one or more data objects joined per the model. Set distinct: true for SELECT DISTINCT. Raw mode rejects dimensions, measures, metrics, and HAVING.

Source code in src/orionbelt/models/query.py

class QuerySelect(BaseModel):
    """The SELECT part of a query.

    Two mutually exclusive modes:

    * **Aggregate mode** (default): ``dimensions`` + ``measures`` produce a
      grouped, aggregated result (GROUP BY dimensions, aggregate measures).
    * **Raw mode**: ``fields`` returns un-aggregated rows from one or more
      data objects joined per the model. Set ``distinct: true`` for
      ``SELECT DISTINCT``. Raw mode rejects ``dimensions``, ``measures``,
      ``metrics``, and ``HAVING``.
    """

    # Accept both field names and aliases on input; reject unknown keys.
    model_config = {"populate_by_name": True, "extra": "forbid"}

    dimensions: list[str | CoalesceDimension] = []
    measures: list[str] = []
    fields: list[str] = []
    distinct: bool = False

    @property
    def is_raw(self) -> bool:
        """True when this select is in raw mode (fields-based projection)."""
        # Raw mode is signalled purely by a non-empty ``fields`` list.
        return len(self.fields) > 0

`is_raw` `property` ¶

True when this select is in raw mode (fields-based projection).

`orionbelt.models.query.QueryFilter` ¶

Bases: BaseModel

A filter condition in a query.

Source code in src/orionbelt/models/query.py

class QueryFilter(BaseModel):
    """A filter condition in a query."""

    # Accept both field names and aliases on input; reject unknown keys.
    model_config = {"populate_by_name": True, "extra": "forbid"}

    field: str
    op: FilterOperator
    value: Any = None
    subquery: Subquery | None = None

    @field_validator("value", mode="before")
    @classmethod
    def _validate_filter_value(cls, v: Any) -> Any:
        """Restrict filter values to a small set of shapes.

        Accepted: ``None``, scalars (``str`` / ``int`` / ``float`` / ``bool``),
        lists of scalars, and dicts whose keys are all strings (RELATIVE
        filters use ``{unit, count, direction}`` objects). ``date`` and
        ``datetime`` values, bare or inside a list, are converted to ISO
        strings. Anything else raises ``ValueError``.
        """
        scalar_types = (str, int, float, bool)
        temporal_types = (date, datetime)

        if v is None or isinstance(v, scalar_types):
            return v
        if isinstance(v, temporal_types):
            return v.isoformat()
        if isinstance(v, list):
            normalized = [
                entry.isoformat() if isinstance(entry, temporal_types) else entry for entry in v
            ]
            if all(isinstance(entry, scalar_types) for entry in normalized):
                return normalized
        elif isinstance(v, dict) and all(isinstance(key, str) for key in v):
            return v
        raise ValueError("Filter value must be a scalar, list of scalars, or object")

    @model_validator(mode="after")
    def _validate_subquery_exclusivity(self) -> QueryFilter:
        """Tie ``subquery`` to the ``exists`` / ``nonexists`` operators.

        Those two operators require ``subquery`` and reject ``value``. Every
        other operator rejects ``subquery`` rather than silently ignoring it.
        """
        subquery_given = self.subquery is not None

        if self.op not in (FilterOperator.EXISTS, FilterOperator.NONEXISTS):
            if subquery_given:
                raise ValueError(
                    f"Operator '{self.op}' does not accept 'subquery' — use 'exists' or 'nonexists'"
                )
            return self

        if not subquery_given:
            raise ValueError(
                f"Operator '{self.op}' requires a 'subquery' object with 'dataObject'"
            )
        if self.value is not None:
            raise ValueError(f"Operator '{self.op}' takes 'subquery', not 'value' / 'values'")
        return self

`orionbelt.models.query.UsePathName` ¶

Bases: BaseModel

Selects a named secondary join path for a specific (source, target) pair.

Source code in src/orionbelt/models/query.py

class UsePathName(BaseModel):
    """Selects a named secondary join path for a specific (source, target) pair."""

    # The (source, target) pair this path selection applies to.
    source: str
    target: str
    # Required; supplied as ``pathName`` (or ``path_name``, since
    # ``populate_by_name`` is enabled below).
    path_name: str = Field(alias="pathName")

    # Accept both field names and aliases on input; reject unknown keys.
    model_config = {"populate_by_name": True, "extra": "forbid"}

`orionbelt.models.query.DimensionRef` ¶

Bases: BaseModel

Reference to a dimension, optionally with time grain.

Supports notation like "customer.country" or "order.order_date:month".

Source code in src/orionbelt/models/query.py

class DimensionRef(BaseModel):
    """Reference to a dimension, optionally with time grain.

    Supports notation like "customer.country" or "order.order_date:month".
    """

    # Accept both field names and aliases on input; reject unknown keys.
    model_config = {"populate_by_name": True, "extra": "forbid"}

    name: str
    grain: TimeGrain | None = None

    @classmethod
    def parse(cls, raw: str) -> DimensionRef:
        """Parse 'name:grain' notation."""
        # Split on the LAST colon: everything before it is the name and the
        # remainder is handed to ``TimeGrain``.
        name, colon, grain_str = raw.rpartition(":")
        if not colon:
            return cls(name=raw)
        return cls(name=name, grain=TimeGrain(grain_str))

`parse(raw)` `classmethod` ¶

Parse 'name:grain' notation.

Source code in src/orionbelt/models/query.py

@classmethod
def parse(cls, raw: str) -> DimensionRef:
    """Parse 'name:grain' notation."""
    # Split on the LAST colon: everything before it is the name and the
    # remainder is handed to ``TimeGrain``.
    name, colon, grain_str = raw.rpartition(":")
    if not colon:
        return cls(name=raw)
    return cls(name=name, grain=TimeGrain(grain_str))

Error Models¶

`orionbelt.models.errors.SemanticError` ¶

Bases: BaseModel

A structured error or warning with optional source position and remediation.

Used uniformly for errors (severity="error") and warnings (severity="warning"). See models/warnings.py for the stable warning code taxonomy.

Source code in src/orionbelt/models/errors.py

class SemanticError(BaseModel):
    """A structured error or warning with optional source position and remediation.

    Used uniformly for errors (``severity="error"``) and warnings (``severity="warning"``).
    See ``models/warnings.py`` for the stable warning code taxonomy.
    """

    # ``code`` and ``message`` are the only required fields.
    code: str
    message: str
    path: str | None = None
    # Optional source location; see ``SourceSpan``.
    span: SourceSpan | None = None
    suggestions: list[str] = Field(default_factory=list)
    # Plain string, not an enum; defaults to "error".
    severity: str = "error"
    hint: str | None = Field(
        default=None,
        description="Optional remediation suggestion (single sentence)",
    )
    context: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Optional structured detail (e.g. which measure / dataObject / column) so "
            "agents can branch on the data without parsing the message."
        ),
    )

`orionbelt.models.errors.ValidationResult` ¶

Bases: BaseModel

Result of semantic model validation.

Source code in src/orionbelt/models/errors.py

class ValidationResult(BaseModel):
    """Result of semantic model validation."""

    # Required and set by the producer; this class does not derive it
    # from ``errors``.
    valid: bool
    # Both lists hold ``SemanticError`` items and default to empty.
    errors: list[SemanticError] = Field(default_factory=list)
    warnings: list[SemanticError] = Field(default_factory=list)

`orionbelt.models.errors.SourceSpan` ¶

Bases: BaseModel

Points to exact location in YAML source for error reporting.

Source code in src/orionbelt/models/errors.py

class SourceSpan(BaseModel):
    """Points to exact location in YAML source for error reporting."""

    # Start position: all three fields are required.
    file: str
    line: int
    column: int
    # Optional end position; None when not provided.
    end_line: int | None = None
    end_column: int | None = None

SQL AST Nodes¶

`orionbelt.ast.nodes.Select` `dataclass` ¶

A complete SELECT statement.

Source code in src/orionbelt/ast/nodes.py

@dataclass(frozen=True)
class Select:
    """A complete SELECT statement.

    Every field has a default, so ``Select()`` is constructible. ``frozen=True``
    blocks attribute rebinding only; the list-valued fields are ordinary
    mutable lists.
    """

    columns: list[Expr] = field(default_factory=list)
    # Trailing underscore because ``from`` is a Python keyword.
    from_: From | None = None
    joins: list[Join] = field(default_factory=list)
    where: Expr | None = None
    group_by: list[Expr] = field(default_factory=list)
    having: Expr | None = None
    order_by: list[OrderByItem] = field(default_factory=list)
    limit: int | None = None
    offset: int | None = None
    ctes: list[CTE] = field(default_factory=list)
    distinct: bool = False
    grouping: str | None = None
    """Hierarchical grouping modifier: 'rollup' or 'cube'.

    When set, the dialect emits ``GROUP BY ROLLUP(...)`` / ``GROUP BY CUBE(...)``
    (or ClickHouse-style ``GROUP BY ... WITH ROLLUP``) instead of plain
    ``GROUP BY``. The planner is responsible for appending the
    ``GROUPING(dim) AS _g_<dim>`` columns to the SELECT projection."""

`grouping = None` `class-attribute` `instance-attribute` ¶

Hierarchical grouping modifier: 'rollup' or 'cube'.

When set, the dialect emits GROUP BY ROLLUP(...) / GROUP BY CUBE(...) (or ClickHouse-style GROUP BY ... WITH ROLLUP) instead of plain GROUP BY. The planner is responsible for appending the GROUPING(dim) AS _g_<dim> columns to the SELECT projection.

`orionbelt.ast.nodes.ColumnRef` `dataclass` ¶

Reference to a column, optionally qualified by table/alias.

Source code in src/orionbelt/ast/nodes.py

@dataclass(frozen=True)
class ColumnRef:
    """Reference to a column, optionally qualified by table/alias."""

    name: str
    # Qualifying table or alias; None leaves the reference unqualified.
    table: str | None = None

`orionbelt.ast.nodes.FunctionCall` `dataclass` ¶

SQL function call, e.g. SUM(col), DATE_TRUNC('month', col).

Source code in src/orionbelt/ast/nodes.py

@dataclass(frozen=True)
class FunctionCall:
    """SQL function call, e.g. SUM(col), DATE_TRUNC('month', col)."""

    name: str
    args: list[Expr] = field(default_factory=list)
    distinct: bool = False
    # Optional ordering and separator carried on the call; how they are
    # rendered is up to the consumer of this node and is not defined here.
    order_by: list[OrderByItem] = field(default_factory=list)
    separator: str | None = None

`orionbelt.ast.nodes.BinaryOp` `dataclass` ¶

Binary operation: left op right.

Source code in src/orionbelt/ast/nodes.py

@dataclass(frozen=True)
class BinaryOp:
    """Binary operation: left op right."""

    left: Expr
    # Operator token stored as a plain string; it is not validated here.
    op: str  # +, -, *, /, =, <>, AND, OR, LIKE, etc.
    right: Expr

`orionbelt.ast.nodes.Literal` `dataclass` ¶

A literal value: number, string, boolean, or NULL.

Source code in src/orionbelt/ast/nodes.py

@dataclass(frozen=True)
class Literal:
    """A literal value: number, string, boolean, or NULL."""

    # ``None`` stands for SQL NULL.
    value: str | int | float | bool | None

    # The named constructors below all wrap their argument unchanged; they
    # exist so call sites state which kind of literal is intended.

    @classmethod
    def string(cls, v: str) -> Literal:
        """Wrap a string value."""
        return cls(v)

    @classmethod
    def number(cls, v: int | float) -> Literal:
        """Wrap an int or float value."""
        return cls(v)

    @classmethod
    def null(cls) -> Literal:
        """Build the NULL literal (``value`` is ``None``)."""
        return cls(None)

    @classmethod
    def boolean(cls, v: bool) -> Literal:
        """Wrap a boolean value."""
        return cls(v)

AST Builder¶

`orionbelt.ast.builder.QueryBuilder` ¶

Fluent builder for ergonomic AST construction.

Source code in src/orionbelt/ast/builder.py

class QueryBuilder:
    """Fluent builder for ergonomic AST construction.

    Every mutator returns ``self`` so calls can be chained; ``build()``
    produces the ``Select`` node. The builder can keep being used after
    ``build()``: each built ``Select`` receives its own copies of the
    accumulated lists, so later builder calls do not alter nodes that were
    already built.
    """

    def __init__(self) -> None:
        self._columns: list[Expr] = []
        self._from: From | None = None
        self._joins: list[Join] = []
        self._where: Expr | None = None
        self._group_by: list[Expr] = []
        self._having: Expr | None = None
        self._order_by: list[OrderByItem] = []
        self._limit: int | None = None
        self._offset: int | None = None
        self._ctes: list[CTE] = []
        self._distinct: bool = False
        self._grouping: str | None = None

    def select(self, *columns: Expr) -> Self:
        """Append expressions to the projection."""
        self._columns.extend(columns)
        return self

    def select_aliased(self, expr: Expr, alias: str) -> Self:
        """Append ``expr`` to the projection wrapped in an ``AliasedExpr``."""
        self._columns.append(AliasedExpr(expr=expr, alias=alias))
        return self

    def from_(self, table: str, alias: str | None = None) -> Self:
        """Set the FROM source to a table name, replacing any previous source."""
        self._from = From(source=table, alias=alias)
        return self

    def from_subquery(self, subquery: Select, alias: str) -> Self:
        """Set the FROM source to a sub-select, replacing any previous source."""
        self._from = From(source=subquery, alias=alias)
        return self

    def join(
        self,
        table: str,
        on: Expr,
        join_type: JoinType = JoinType.LEFT,
        alias: str | None = None,
    ) -> Self:
        """Append a join; the join type defaults to ``JoinType.LEFT``."""
        self._joins.append(Join(join_type=join_type, source=table, alias=alias, on=on))
        return self

    def where(self, condition: Expr) -> Self:
        """Add a WHERE condition, AND-ing it onto any existing one."""
        if self._where is None:
            self._where = condition
        else:
            self._where = BinaryOp(left=self._where, op="AND", right=condition)
        return self

    def group_by(self, *exprs: Expr) -> Self:
        """Append GROUP BY expressions."""
        self._group_by.extend(exprs)
        return self

    def having(self, condition: Expr) -> Self:
        """Add a HAVING condition, AND-ing it onto any existing one."""
        if self._having is None:
            self._having = condition
        else:
            self._having = BinaryOp(left=self._having, op="AND", right=condition)
        return self

    def order_by(self, expr: Expr, desc: bool = False, nulls_last: bool | None = None) -> Self:
        """Append an ORDER BY item."""
        self._order_by.append(OrderByItem(expr=expr, desc=desc, nulls_last=nulls_last))
        return self

    def limit(self, n: int) -> Self:
        """Set the LIMIT, replacing any previous value."""
        self._limit = n
        return self

    def offset(self, n: int) -> Self:
        """Set the OFFSET, replacing any previous value."""
        self._offset = n
        return self

    def with_cte(self, name: str, query: Select | UnionAll | Except | RawSQL) -> Self:
        """Append a named CTE."""
        self._ctes.append(CTE(name=name, query=query))
        return self

    def distinct(self, value: bool = True) -> Self:
        """Turn SELECT DISTINCT on (default) or off."""
        self._distinct = value
        return self

    def grouping(self, mode: str | None) -> Self:
        """Set the hierarchical grouping modifier ('rollup' or 'cube')."""
        self._grouping = mode
        return self

    def build(self) -> Select:
        """Return a ``Select`` holding a snapshot of the builder's current state.

        The list-valued fields are shallow-copied. ``Select`` is a frozen
        dataclass, but its lists are still mutable, so handing over the
        builder's own lists would let a later ``select()`` / ``join()`` /
        ``order_by()`` call change a node that was already built.
        """
        return Select(
            columns=list(self._columns),
            from_=self._from,
            joins=list(self._joins),
            where=self._where,
            group_by=list(self._group_by),
            having=self._having,
            order_by=list(self._order_by),
            limit=self._limit,
            offset=self._offset,
            ctes=list(self._ctes),
            distinct=self._distinct,
            grouping=self._grouping,
        )

`grouping(mode)` ¶

Set the hierarchical grouping modifier ('rollup' or 'cube').

Source code in src/orionbelt/ast/builder.py

def grouping(self, mode: str | None) -> Self:
    """Set the hierarchical grouping modifier ('rollup' or 'cube').

    The value is stored as given (it is not validated here) and ``None`` is
    accepted. Returns ``self`` so the call can be chained.
    """
    self._grouping = mode
    return self

API Schemas¶

`orionbelt.api.schemas` ¶

API request/response Pydantic schemas.

`SessionCreateRequest` ¶

Bases: BaseModel

Request body for POST /sessions.

Source code in src/orionbelt/api/schemas.py

class SessionCreateRequest(BaseModel):
    """Request body for POST /sessions."""

    # Optional string-to-string metadata; defaults to an empty dict.
    metadata: dict[str, str] = Field(default_factory=dict)

`SessionResponse` ¶

Bases: BaseModel

Single session info.

Source code in src/orionbelt/api/schemas.py

class SessionResponse(BaseModel):
    """Single session info."""

    session_id: str
    created_at: datetime
    last_accessed_at: datetime
    model_count: int
    metadata: dict[str, str] = Field(default_factory=dict)
    # Two separate deadlines, both required: an idle one that moves with
    # each access and an absolute one fixed at creation.
    expires_at: datetime = Field(description="Idle TTL deadline (refreshed on each access)")
    max_expires_at: datetime = Field(description="Absolute lifetime deadline (fixed at creation)")

`SessionListResponse` ¶

Bases: BaseModel

Response for GET /sessions.

Source code in src/orionbelt/api/schemas.py

class SessionListResponse(BaseModel):
    """Response for GET /sessions."""

    # Required; one ``SessionResponse`` entry per session.
    sessions: list[SessionResponse]

`ModelLoadRequest` ¶

Bases: BaseModel

Request body for POST /sessions/{session_id}/models.

Source code in src/orionbelt/api/schemas.py

class ModelLoadRequest(BaseModel):
    """Request body for POST /sessions/{session_id}/models."""

    # The model content arrives either as YAML text or as JSON (object or
    # JSON-encoded string). Both fields are optional at the schema level.
    model_yaml: str | None = Field(
        default=None,
        description="OBML model as YAML string (provide model_yaml OR model_json)",
        max_length=5_000_000,
    )
    model_json: dict[str, object] | str | None = Field(
        default=None,
        description="OBML model as JSON object or JSON string (auto-parsed)",
    )
    extends: list[str] | None = Field(
        default=None,
        description="Optional inline YAML strings of analytical fragments to merge",
    )
    inherits: str | None = Field(
        default=None,
        description="Optional model ID of an already-loaded parent model in the session",
    )
    dedup: bool = Field(
        default=True,
        description=(
            "When True (default), identical OBML content already loaded in this session "
            "reuses the existing model_id (response.model_load == 'reused'). "
            "When False, always loads fresh."
        ),
    )

    @model_validator(mode="after")
    def _parse_model_json_string(self) -> ModelLoadRequest:
        # Decode a JSON-encoded string into its object form so that, after
        # validation, ``model_json`` is always a dict or None.
        #
        # Raises ValueError (reported by Pydantic as a validation error) when
        # the string is not valid JSON (``json.JSONDecodeError`` is a
        # ``ValueError`` subclass) or when it decodes to something other than
        # an object or null, e.g. a list or a number. Plain assignment is not
        # re-validated against the field type, so the check must happen here.
        if isinstance(self.model_json, str):
            parsed = json.loads(self.model_json)
            if parsed is not None and not isinstance(parsed, dict):
                raise ValueError(
                    "model_json string must decode to a JSON object, "
                    f"got {type(parsed).__name__}"
                )
            self.model_json = parsed
        return self

`ModelLoadResponse` ¶

Bases: BaseModel

Response for POST /sessions/{session_id}/models.

Source code in src/orionbelt/api/schemas.py

class ModelLoadResponse(BaseModel):
    """Response for POST /sessions/{session_id}/models."""

    # Identifier under which the model can be addressed afterwards.
    model_id: str
    # NOTE(review): presumably the number of elements of each kind in the
    # loaded model — confirm against the route handler.
    data_objects: int
    dimensions: int
    measures: int
    metrics: int
    # Structured warnings; defaults to an empty list.
    warnings: list[StructuredWarning] = Field(default_factory=list)
    # Plain string rather than an enum; the documented values are listed in
    # the description below.
    model_load: str = Field(
        default="fresh",
        description=(
            "Whether the load parsed a fresh model or reused an existing one. "
            "Values: 'fresh' | 'reused'."
        ),
    )
    # Optional at the schema level (defaults to None).
    health: ModelHealth | None = Field(
        default=None,
        description=(
            "Structural health of the model's join graph: orphan dataObjects, "
            "fan-trap risks, unreachable dimensions. Always present on a fresh load."
        ),
    )

`ModelSummaryResponse` ¶

Bases: BaseModel

Short model summary for listing.

Source code in src/orionbelt/api/schemas.py

class ModelSummaryResponse(BaseModel):
    """Short model summary for listing."""

    # Same id and count fields as ``ModelLoadResponse``, without warnings,
    # load status or health. All fields are required.
    model_id: str
    # NOTE(review): presumably per-kind element counts — confirm against the
    # code that builds this summary.
    data_objects: int
    dimensions: int
    measures: int
    metrics: int

`SessionQueryRequest` ¶

Bases: BaseModel

Request body for POST /sessions/{session_id}/query/sql.

Source code in src/orionbelt/api/schemas.py

class SessionQueryRequest(BaseModel):
    """Request body for POST /sessions/{session_id}/query/sql."""

    # NOTE(review): presumably the id of a model loaded in the session named
    # in the URL — confirm against the route handler.
    model_id: str
    # The query to compile, in the ``QueryObject`` shape.
    query: QueryObject
    # Optional; when omitted the fallback chain in the description applies.
    dialect: str | None = Field(
        default=None,
        description=(
            "SQL dialect. Resolution: explicit value → model.settings.defaultDialect → "
            "DB_VENDOR env → 'postgres'."
        ),
    )

`QueryCompileResponse` ¶

Bases: BaseModel

Response body for POST /query/sql.

Source code in src/orionbelt/api/schemas.py

class QueryCompileResponse(BaseModel):
    """Response body for POST /query/sql."""

    # Compiled SQL text and the dialect reported alongside it.
    sql: str
    dialect: str
    # Resolution details in the ``ResolvedInfoResponse`` shape.
    resolved: ResolvedInfoResponse
    # Structured warnings; defaults to an empty list.
    warnings: list[StructuredWarning] = Field(default_factory=list)
    # Defaults to True when not set explicitly.
    sql_valid: bool = True
    # Optional explain plan; None when not provided.
    explain: ExplainPlanResponse | None = None
    physical_tables: list[str] = Field(
        default_factory=list,
        description=(
            "Deduplicated DATABASE.SCHEMA.CODE strings the query touches. "
            "Drives freshness-cache TTL composition and heartbeat invalidation."
        ),
    )

`ValidateRequest` ¶

Bases: BaseModel

Request body for POST /validate.

Source code in src/orionbelt/api/schemas.py

class ValidateRequest(BaseModel):
    """Request body for POST /validate."""

    # The model content arrives either as YAML text or as JSON (object or
    # JSON-encoded string). Both fields are optional at the schema level.
    model_yaml: str | None = Field(
        default=None,
        description="OBML model as YAML string (provide model_yaml OR model_json)",
        max_length=5_000_000,
    )
    model_json: dict[str, object] | str | None = Field(
        default=None,
        description="OBML model as JSON object or JSON string (auto-parsed)",
    )
    extends: list[str] | None = Field(
        default=None,
        description="Optional inline YAML strings of analytical fragments to merge",
    )
    inherits: str | None = Field(
        default=None,
        description="Optional model ID of an already-loaded parent model in the session",
    )

    @model_validator(mode="after")
    def _parse_model_json_string(self) -> ValidateRequest:
        # Decode a JSON-encoded string into its object form so that, after
        # validation, ``model_json`` is always a dict or None.
        #
        # Raises ValueError (reported by Pydantic as a validation error) when
        # the string is not valid JSON (``json.JSONDecodeError`` is a
        # ``ValueError`` subclass) or when it decodes to something other than
        # an object or null, e.g. a list or a number. Plain assignment is not
        # re-validated against the field type, so the check must happen here.
        if isinstance(self.model_json, str):
            parsed = json.loads(self.model_json)
            if parsed is not None and not isinstance(parsed, dict):
                raise ValueError(
                    "model_json string must decode to a JSON object, "
                    f"got {type(parsed).__name__}"
                )
            self.model_json = parsed
        return self

`ValidateResponse` ¶

Bases: BaseModel

Response body for POST /validate.

Source code in src/orionbelt/api/schemas.py

class ValidateResponse(BaseModel):
    """Response body for POST /validate."""

    # Overall verdict; required.
    valid: bool
    # Errors and warnings share the ``ErrorDetail`` shape and both default to
    # an empty list.
    errors: list[ErrorDetail] = Field(default_factory=list)
    warnings: list[ErrorDetail] = Field(default_factory=list)

`DialectListResponse` ¶

Bases: BaseModel

Response for GET /dialects.

Source code in src/orionbelt/api/schemas.py

class DialectListResponse(BaseModel):
    """Response for GET /dialects."""

    # One ``DialectInfo`` entry per listed dialect; defaults to an empty list.
    dialects: list[DialectInfo] = Field(default_factory=list)

`HealthResponse` ¶

Bases: BaseModel

Health check response.

Source code in src/orionbelt/api/schemas.py

class HealthResponse(BaseModel):
    """Health check response."""

    # Every field has a default, so the model can be built with no arguments.
    status: str = "ok"
    version: str = ""
    # Plain string rather than an enum; the documented values are listed in
    # the description.
    auth_mode: str = Field(
        default="none",
        description=(
            "Effective auth mode: 'none', 'api_key', or 'oidc'. "
            "Clients check this to know whether a credential is required."
        ),
    )

Settings¶

`orionbelt.settings.Settings` ¶

Bases: BaseSettings

Configuration for OrionBelt REST API server.

Values are read from environment variables and from a .env file in the working directory. See .env.template for all options.

Source code in src/orionbelt/settings.py

class Settings(BaseSettings):
    """Configuration for OrionBelt REST API server.

    Values are read from environment variables and from a ``.env`` file
    in the working directory.  See ``.env.template`` for all options.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # ------------------------------------------------------------------
    # Shared
    # ------------------------------------------------------------------
    log_level: str = "INFO"
    # Log format:
    #   "console"  — pretty-printed for local dev (default)
    #   "json"     — structured JSON for log aggregators (ELK, Datadog, etc.)
    #   "cloudrun" — JSON + disables uvicorn access logs (Cloud Run provides its own)
    log_format: str = "console"

    # ------------------------------------------------------------------
    # REST API
    # ------------------------------------------------------------------
    api_server_host: str = "localhost"
    api_server_port: int = 8000
    port: int | None = None  # Cloud Run injects PORT; takes precedence over api_server_port

    # ------------------------------------------------------------------
    # Authentication
    # ------------------------------------------------------------------
    # Single AUTH_MODE selector drives every direct surface
    # (REST now; Flight + pgwire in Phase 2). Off by default to preserve the
    # public-demo / local-dev behaviour. See design/PLAN_authentication.md §1.
    #   "none"    — no auth (default)
    #   "api_key" — validate API_KEYS against the shared key store
    #   "oidc"    — Phase 4 (not implemented; rejected loudly at startup)
    auth_mode: str = "none"
    api_keys: str = ""  # comma-separated; required when auth_mode=api_key (>=16 chars each)
    api_key_header: str = "X-API-Key"  # REST header name; Bearer is always accepted as fallback
    # Legacy alias for auth_mode=api_key. Deprecated; honoured one release with
    # a startup warning. Only takes effect when AUTH_MODE is left at "none".
    auth_enabled: bool = False

    # ------------------------------------------------------------------
    # Public-doc surfaces
    # ------------------------------------------------------------------
    # Default True preserves current public-demo behaviour.
    # Set EXPOSE_API_DOCS=false on non-demo deployments to disable Swagger UI,
    # ReDoc, and the OpenAPI schema endpoint. EXPOSE_OPENAPI_SCHEMA can be
    # toggled independently to keep /openapi.json live (e.g. for client codegen)
    # while hiding the human-facing /docs and /redoc pages.
    expose_api_docs: bool = True
    expose_openapi_schema: bool = True

    # ------------------------------------------------------------------
    # Sessions
    # ------------------------------------------------------------------
    session_ttl_seconds: int = 1800  # 30 min inactivity
    session_max_age_seconds: int = 86400  # 24 h absolute max lifetime
    session_cleanup_interval: int = 60  # seconds between cleanup sweeps
    max_sessions: int = 500  # global concurrent session cap (429 when full)
    max_models_per_session: int = 10  # max models a single session may hold
    disable_session_list: bool = False  # hide GET /sessions endpoint
    session_rate_limit: int = 10  # max POST /sessions per IP per minute
    trusted_proxy_count: int = 0  # number of trusted reverse proxies in front of the app

    # ------------------------------------------------------------------
    # Admin-curated model pre-loading
    # ------------------------------------------------------------------
    # When MODEL_FILES is set, REST POST
    # /models returns 403 (the catalog is admin-managed) and the models are
    # loaded into named protected sessions at startup.
    #
    # MODEL_FILES (comma-separated paths):
    #     Each OBML YAML loads into its own internal session, addressable
    #     by the OBML `name:` field (fallback: filename stem, normalized to
    #     a valid identifier). BI tools select via the Flight `database`
    #     catalog or pgwire `database=` URL parameter. A single path is
    #     fine — it just means one named protected session.
    #     See design/PLAN_flight_natural_sql.md §3.x multi-model.
    model_dir: str | None = None  # base directory (set by Docker)
    model_files: str | None = None  # comma-separated paths

    # ------------------------------------------------------------------
    # Query execution
    # ------------------------------------------------------------------
    query_execute: bool = False  # enable POST /v1/query/execute
    query_default_limit: int = 1000  # max rows when query has no LIMIT
    db_pool_size: int = 5  # connection pool size per dialect

    # Default locale for /v1/query/execute?format_values=true (and TSV output).
    # Used when the request omits the ``locale`` query param. BCP-47 tag
    # (e.g. "de", "en-US"). Empty → en-style separators ("," / ".").
    default_locale: str = ""

    # ------------------------------------------------------------------
    # Arrow Flight SQL server (requires ob-flight-extension)
    # ------------------------------------------------------------------
    flight_enabled: bool = False  # start gRPC Flight server on FLIGHT_PORT (implies query_execute)
    flight_port: int = 8815
    flight_auth_mode: str = "none"  # "none" or "token"
    flight_api_token: str | None = None
    db_vendor: str = "duckdb"  # default vendor driver for Flight query execution

    # Flight Semantic QL governance. See design/PLAN_flight_natural_sql.md.
    # Semantic QL / OBSQL (SELECT dim, measure FROM <model>) is always enabled.
    # Raw SQL pass-through and write operations are **not** configurable —
    # OBSL is a semantic layer, not a JDBC proxy. There are no env flags
    # that allow arbitrary SQL through to the warehouse.

    # ------------------------------------------------------------------
    # Postgres wire surface
    # ------------------------------------------------------------------
    # Today: trust auth only, simple-query protocol.
    # Auth modes "password" / "scram-sha-256" land in Phase 2 alongside the
    # shared auth subsystem (see design/PLAN_authentication.md §3.3).
    pgwire_enabled: bool = False
    pgwire_host: str = "0.0.0.0"  # noqa: S104 — server bind address
    pgwire_port: int = 5432
    pgwire_auth_mode: str = "trust"  # "trust" (Step 1) | "password" | "scram-sha-256" (Step 6)
    pgwire_max_connections: int = 64
    pgwire_query_timeout_seconds: int = 60
    # Hard deadline for the pre-auth handshake (startup + password/SCRAM
    # exchange). Bounds how long an unauthenticated client can hold a
    # connection slot, preventing slot-exhaustion DoS.
    pgwire_auth_timeout_seconds: int = 10

    # ------------------------------------------------------------------
    # One-shot batch endpoint (POST /v1/oneshot/batch). See PLAN_oneshot_batch.md.
    # ------------------------------------------------------------------
    oneshot_batch_max_queries: int = 50
    oneshot_batch_max_parallelism: int = 8
    oneshot_batch_default_timeout_ms: int = 30000  # per-query
    oneshot_batch_batch_timeout_ms: int = 120000  # whole batch

    # ------------------------------------------------------------------
    # Freshness-driven result cache. See design/PLAN_freshness_driven_cache.md.
    # ------------------------------------------------------------------
    cache_backend: str = "noop"  # "noop" or "file"
    cache_dir: str = "./cache"
    cache_max_ttl_seconds: int = 86400
    cache_min_ttl_seconds: int = 5
    cache_max_value_bytes: int = 10 * 1024 * 1024  # 10 MB
    cache_max_disk_bytes: int = 5 * 1024 * 1024 * 1024  # 5 GB
    cache_sweep_interval_seconds: int = 86400
    cache_unknown_freshness_policy: str = "no_cache"  # or "default_ttl"
    cache_unknown_freshness_default_ttl: int = 300
    heartbeat_auth_token: str | None = None  # endpoint disabled (404) when unset

    # ------------------------------------------------------------------
    # Derived values (not settings fields)
    # ------------------------------------------------------------------
    @property
    def effective_port(self) -> int:
        """Return the port to listen on (Cloud Run PORT takes precedence)."""
        # Fall back to the configured API port only when PORT was not injected.
        if self.port is None:
            return self.api_server_port
        return self.port

`effective_port` `property` ¶

Return the port to listen on (Cloud Run PORT takes precedence).

Python API Reference¶

Service Layer¶

ModelStore¶

orionbelt.service.model_store.ModelStore ¶

load_model(yaml_str=None, *, raw_dict=None, extends_yaml=None, inherits_model_id=None, dedup=True) ¶

get_model(model_id) ¶

describe(model_id) ¶

list_models() ¶

remove_model(model_id) ¶

compile_query(model_id, query, dialect) ¶

validate(yaml_str=None, *, raw_dict=None, extends_yaml=None, inherits_model_id=None) ¶

SessionManager¶

orionbelt.service.session_manager.SessionManager ¶

Parameters¶

active_count property ¶

start() ¶

stop() ¶

create_session(metadata=None) ¶

get_store(session_id) ¶

get_session(session_id) ¶

close_session(session_id) ¶

list_sessions() ¶

get_or_create_default() ¶

SessionInfo¶

orionbelt.service.session_manager.SessionInfo dataclass ¶

Compiler Pipeline¶

orionbelt.compiler.pipeline.CompilationPipeline ¶

compile(query, model, dialect_name) ¶

Query Resolution¶

orionbelt.compiler.resolution.QueryResolver ¶

resolve(query, model, qualify_table=None) ¶

Star Schema Planner¶

orionbelt.compiler.star.StarSchemaPlanner ¶

plan(resolved, model, qualify_table=None, dialect=None) ¶

CFL Planner¶

orionbelt.compiler.cfl.CFLPlanner ¶

plan(resolved, model, qualify_table=None, union_by_name=False, dialect=None) ¶

Join Graph¶

orionbelt.compiler.graph.JoinGraph ¶

find_join_path(from_objects, to_objects, via_constraints=None) ¶

build_join_condition(step) ¶

detect_cycles() ¶

Code Generator¶

orionbelt.compiler.codegen.CodeGenerator ¶

generate(ast) ¶

Dialect Base¶

orionbelt.dialect.base.Dialect ¶

render_obml_type(obml_type) ¶

cast_to_obml_type(expr, obml_type) ¶

format_table_ref(database, schema, code) ¶

quote_identifier(name) abstractmethod ¶

render_time_grain(column, grain) abstractmethod ¶

render_cast(expr, target_type) abstractmethod ¶

current_date_sql() abstractmethod ¶

date_add_sql(date_sql, unit, count) abstractmethod ¶

render_date_trunc_sql(column_sql, grain) abstractmethod ¶

render_date_spine_cte_sql(min_date, max_date, grain, offset, offset_grain) abstractmethod ¶

Parameters¶

render_string_contains(column, pattern) ¶

render_decimal_division_sql(left_sql, right_sql) ¶

compile(ast) ¶

compile_select(node) ¶

compile_group_by(group_by, grouping) ¶

compile_union_all(node) ¶

compile_except(node) ¶

compile_expr(expr, _parent_prec=0) ¶

compile_regex_match(column, pattern, *, negated) ¶

compile_relative_date_range(column, unit, count, direction, include_current) ¶

orionbelt.dialect.base.DialectCapabilities dataclass ¶

Dialect Registry¶

orionbelt.dialect.registry.DialectRegistry ¶

get(name) classmethod ¶

available() classmethod ¶

register(dialect_class) classmethod ¶

YAML Parser¶

orionbelt.parser.loader.TrackedLoader ¶

load(path) ¶

load_string(content, filename='<string>') ¶

Reference Resolver¶

orionbelt.parser.resolver.ReferenceResolver ¶

`orionbelt.service.model_store.ModelStore` ¶

`load_model(yaml_str=None, *, raw_dict=None, extends_yaml=None, inherits_model_id=None, dedup=True)` ¶

`get_model(model_id)` ¶

`describe(model_id)` ¶

`list_models()` ¶

`remove_model(model_id)` ¶

`compile_query(model_id, query, dialect)` ¶

`validate(yaml_str=None, *, raw_dict=None, extends_yaml=None, inherits_model_id=None)` ¶

`orionbelt.service.session_manager.SessionManager` ¶

`active_count` `property` ¶

`start()` ¶

`stop()` ¶

`create_session(metadata=None)` ¶

`get_store(session_id)` ¶

`get_session(session_id)` ¶

`close_session(session_id)` ¶

`list_sessions()` ¶

`get_or_create_default()` ¶

`orionbelt.service.session_manager.SessionInfo` `dataclass` ¶

`orionbelt.compiler.pipeline.CompilationPipeline` ¶

`compile(query, model, dialect_name)` ¶

`orionbelt.compiler.resolution.QueryResolver` ¶

`resolve(query, model, qualify_table=None)` ¶

`orionbelt.compiler.star.StarSchemaPlanner` ¶

`plan(resolved, model, qualify_table=None, dialect=None)` ¶

`orionbelt.compiler.cfl.CFLPlanner` ¶

`plan(resolved, model, qualify_table=None, union_by_name=False, dialect=None)` ¶

`orionbelt.compiler.graph.JoinGraph` ¶

`find_join_path(from_objects, to_objects, via_constraints=None)` ¶

`build_join_condition(step)` ¶

`detect_cycles()` ¶

`orionbelt.compiler.codegen.CodeGenerator` ¶

`generate(ast)` ¶

`orionbelt.dialect.base.Dialect` ¶

`render_obml_type(obml_type)` ¶

`cast_to_obml_type(expr, obml_type)` ¶

`format_table_ref(database, schema, code)` ¶

`quote_identifier(name)` `abstractmethod` ¶

`render_time_grain(column, grain)` `abstractmethod` ¶

`render_cast(expr, target_type)` `abstractmethod` ¶

`current_date_sql()` `abstractmethod` ¶

`date_add_sql(date_sql, unit, count)` `abstractmethod` ¶

`render_date_trunc_sql(column_sql, grain)` `abstractmethod` ¶

`render_date_spine_cte_sql(min_date, max_date, grain, offset, offset_grain)` `abstractmethod` ¶

`render_string_contains(column, pattern)` ¶

`render_decimal_division_sql(left_sql, right_sql)` ¶

`compile(ast)` ¶

`compile_select(node)` ¶

`compile_group_by(group_by, grouping)` ¶

`compile_union_all(node)` ¶

`compile_except(node)` ¶

`compile_expr(expr, _parent_prec=0)` ¶

`compile_regex_match(column, pattern, *, negated)` ¶

`compile_relative_date_range(column, unit, count, direction, include_current)` ¶

`orionbelt.dialect.base.DialectCapabilities` `dataclass` ¶

`orionbelt.dialect.registry.DialectRegistry` ¶

`get(name)` `classmethod` ¶

`available()` `classmethod` ¶

`register(dialect_class)` `classmethod` ¶

`orionbelt.parser.loader.TrackedLoader` ¶

`load(path)` ¶

`load_string(content, filename='<string>')` ¶

`orionbelt.parser.resolver.ReferenceResolver` ¶

`resolve(raw, source_map=None)` ¶

`orionbelt.parser.validator.SemanticValidator` ¶

`validate(model)` ¶

`orionbelt.models.semantic.SemanticModel` ¶

`effective_measures` `property` ¶

`orionbelt.models.semantic.DataObject` ¶

`qualified_code` `property` ¶

`orionbelt.models.semantic.Dimension` ¶

`orionbelt.models.semantic.Measure` ¶

`orionbelt.models.semantic.Metric` ¶

`orionbelt.models.query.QueryObject` ¶

`orionbelt.models.query.QuerySelect` ¶

`is_raw` `property` ¶

`orionbelt.models.query.QueryFilter` ¶

`orionbelt.models.query.UsePathName` ¶

`orionbelt.models.query.DimensionRef` ¶

`parse(raw)` `classmethod` ¶