Skip to content
OBML v1.0
OrionBelt v2.7.9

Python API Reference

Auto-generated documentation from source code docstrings.

Service Layer

ModelStore

orionbelt.service.model_store.ModelStore

In-memory model registry. Thread-safe via threading.Lock.

Models are keyed by short UUID (8-char hex). All parsing, validation, and compilation infrastructure is instantiated internally, following the same singleton pattern as api/deps.py.

Source code in src/orionbelt/service/model_store.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
class ModelStore:
    """In-memory model registry.  Thread-safe via ``threading.Lock``.

    Models are keyed by short UUID (8-char hex).  All parsing, validation,
    and compilation infrastructure is instantiated internally, following the
    same singleton pattern as ``api/deps.py``.
    """

    def __init__(self, max_models: int = 10) -> None:
        # Upper bound on simultaneously stored models; ``load_model`` raises
        # ``ModelCapacityError`` once this many models are registered.
        self._max_models = max_models
        # Guards the per-model dictionaries and the dedup index below.
        self._lock = threading.Lock()

        # model_id -> resolved semantic model.
        self._models: dict[str, SemanticModel] = {}
        # model_id -> the *merged* raw YAML dict the model was built from.
        # Kept so a child that inherits from this model re-merges against
        # the exact parent content. Before v2.7.5 inheritance went through
        # ``_model_to_raw``, which dropped most non-essential fields
        # (numClass, primaryKey, expression on computed columns, measure
        # dataType / filters / grain / delimiter / withinGroup, most metric
        # subtype config, …); children then inherited a stripped parent and
        # silently compiled invalid SQL such as ``SUM("T"."")`` for any
        # parent computed column whose ``code:`` the resolver had derived
        # from its ``expression``.
        self._raws: dict[str, dict[str, object]] = {}
        # model_id -> OBSL graph artifact exported at load time.
        self._graphs: dict[str, GraphArtifact] = {}
        # model_id -> summary counts (data_objects / dimensions / measures /
        # metrics), so a dedup hit can answer without re-walking the model.
        self._summaries: dict[str, ModelSummary] = {}
        # content_hash -> model_id. Written on every successful dedup-eligible
        # load and consulted before parsing on the next one. See
        # design/PLAN_model_load_dedup.md.
        self._content_hash_index: dict[str, str] = {}

        # Pipeline components shared by every call on this store
        # (stateless, safe to share).
        self._loader = TrackedLoader()
        self._resolver = ReferenceResolver()
        self._validator = SemanticValidator()
        self._merger = ExtendsMerger()
        self._pipeline = CompilationPipeline()

    # -- helpers -------------------------------------------------------------

    @staticmethod
    def _new_id() -> str:
        return uuid.uuid4().hex[:8]

    @staticmethod
    def _health_for(model: SemanticModel) -> ModelHealthSummary:
        """Run ``compute_health`` on ``model`` and repackage it as a ``ModelHealthSummary``."""
        report = compute_health(model)

        # Each fan-trap risk is copied field by field into its API shape.
        risks = []
        for risk in report.fan_trap_risks:
            risks.append(
                FanTrapRiskInfo(
                    tables=risk.tables,
                    reason=risk.reason,
                    suggested_pattern=risk.suggested_pattern,
                )
            )

        return ModelHealthSummary(
            status=report.status,
            data_objects=report.data_objects,
            joins=report.joins,
            orphan_data_objects=report.orphan_data_objects,
            fan_trap_risks=risks,
            unreachable_dimensions=report.unreachable_dimensions,
            warnings_count=report.warnings_count,
        )

    @staticmethod
    def _content_hash(yaml_str: str) -> str:
        """SHA-256 of the OBML body, with surrounding whitespace stripped.

        Stripping at the boundary makes a trailing newline difference
        invisible to dedup; everything else (key order, comments, internal
        whitespace) still produces a different hash.
        """
        return hashlib.sha256(yaml_str.strip().encode("utf-8")).hexdigest()

    def _parse_and_validate(
        self,
        yaml_str: str | None = None,
        *,
        raw_dict: dict[str, object] | None = None,
        extends_yaml: list[str] | None = None,
        inherits_model_id: str | None = None,
    ) -> tuple[SemanticModel, dict[str, object], list[ErrorInfo], list[ErrorInfo]]:
        """Turn YAML text (or an already-parsed dict) into a validated model.

        Stages, in order: parse, merge extends/inherits, resolve references,
        semantic validation, refresh-contract consistency check.

        Returns ``(model, merged_raw, errors, warnings)``. Supply either
        ``yaml_str`` or ``raw_dict``, not both; ``raw_dict`` wins when both
        are given. When parsing or merging fails, the model is an empty
        ``SemanticModel()`` and ``merged_raw`` is ``{}``.

        ``merged_raw`` is the raw dict handed to the resolver (after any
        extends/inherits merge). Callers keep it so a later load that
        inherits from this model can merge against the exact content
        instead of the lossy ``_model_to_raw`` reconstruction.
        """
        errors: list[ErrorInfo] = []
        warnings: list[ErrorInfo] = []

        def _failed():
            # Early-exit result: empty model and raw dict, findings so far.
            return SemanticModel(), {}, errors, warnings

        def _as_info(issue, severity):
            # Copy a resolver/validator finding into the API error shape.
            return ErrorInfo(
                code=issue.code,
                message=issue.message,
                path=issue.path,
                suggestions=list(issue.suggestions),
                severity=severity,
                hint=issue.hint,
                context=issue.context,
            )

        # 1. Obtain the raw dict. A pre-parsed dict carries no source map.
        source_map = None
        if raw_dict is not None:
            raw = raw_dict
        elif yaml_str is None:
            errors.append(
                ErrorInfo(
                    code="NO_MODEL_INPUT",
                    message="Provide either model_yaml or model_json",
                )
            )
            return _failed()
        else:
            try:
                raw, source_map = self._loader.load_string(yaml_str)
            except YAMLSafetyError as exc:
                errors.append(ErrorInfo(code="YAML_SAFETY_ERROR", message=str(exc)))
                return _failed()
            except Exception as exc:
                errors.append(ErrorInfo(code="YAML_PARSE_ERROR", message=str(exc)))
                return _failed()

        # 1b. Fold in extends fragments and/or an inherited parent model.
        try:
            inherits_raw: dict[str, object] | None = None
            if inherits_model_id is not None:
                # The raw dict stored when the parent was loaded round-trips
                # every field, so it is preferred. ``_model_to_raw`` is the
                # lossy fallback for parents with no raw on record
                # (legacy / programmatically-constructed models).
                with self._lock:
                    inherits_raw = self._raws.get(inherits_model_id)
                if inherits_raw is None:
                    inherits_raw = self._model_to_raw(self.get_model(inherits_model_id))

            needs_merge = bool(extends_yaml) or inherits_raw is not None
            if needs_merge:
                raw, merge_notes = self._merger.merge_from_strings(
                    raw,
                    extend_yamls=extends_yaml,
                    inherits_raw=inherits_raw,
                )
                warnings.extend(
                    ErrorInfo(
                        code=WarningCode.MERGE_WARNING,
                        message=note,
                        severity="warning",
                    )
                    for note in merge_notes
                )
                # The source map is discarded once the raw dict is merged.
                source_map = None
        except MergeError as exc:
            errors.append(ErrorInfo(code=exc.code, message=exc.message))
            return _failed()
        except KeyError:
            errors.append(
                ErrorInfo(
                    code="PARENT_MODEL_NOT_FOUND",
                    message=f"Parent model '{inherits_model_id}' not found in session",
                )
            )
            return _failed()

        # 2. Resolve references. Resolver warnings default to "warning"
        #    severity when the resolver left it unset.
        model, resolution = self._resolver.resolve(raw, source_map)
        errors.extend(_as_info(err, err.severity) for err in resolution.errors)
        warnings.extend(
            _as_info(warn, warn.severity or "warning") for warn in resolution.warnings
        )

        # 3. Semantic validation. Findings are routed by their own severity.
        for finding in self._validator.validate(model):
            bucket = warnings if finding.severity == "warning" else errors
            bucket.append(_as_info(finding, finding.severity))

        # 4. Cross-dataObject refresh contract consistency check.
        from orionbelt.cache.contracts import collect_table_contracts

        _, refresh_warnings = collect_table_contracts(model)
        warnings.extend(
            _as_info(warn, warn.severity or "warning") for warn in refresh_warnings
        )

        return model, raw, errors, warnings

    @staticmethod
    def _model_to_raw(model: SemanticModel) -> dict[str, object]:
        """Convert a SemanticModel back to a raw dict for inherits merging.

        .. deprecated:: v2.7.5
            Lossy fallback only — drops most non-essential fields. New
            code stores and reuses the merged raw dict captured at load
            time (see ``ModelStore._raws``). This method remains for the
            edge case where a parent model was constructed programmatically
            without ever passing through ``load_model``.
        """
        raw: dict[str, object] = {"version": model.version}
        if model.description:
            raw["description"] = model.description
        if model.data_objects:
            objs: dict[str, object] = {}
            for name, obj in model.data_objects.items():
                obj_raw: dict[str, object] = {
                    "code": obj.code,
                    "database": obj.database,
                    "schema": obj.schema_name,
                }
                if obj.columns:
                    cols: dict[str, object] = {}
                    for cname, col in obj.columns.items():
                        cols[cname] = {
                            "code": col.code,
                            "abstractType": col.abstract_type.value,
                        }
                    obj_raw["columns"] = cols
                if obj.joins:
                    joins: list[dict[str, object]] = []
                    for j in obj.joins:
                        jd: dict[str, object] = {
                            "joinType": j.join_type.value,
                            "joinTo": j.join_to,
                            "columnsFrom": list(j.columns_from),
                            "columnsTo": list(j.columns_to),
                        }
                        if j.secondary:
                            jd["secondary"] = True
                            jd["pathName"] = j.path_name
                        joins.append(jd)
                    obj_raw["joins"] = joins
                if obj.refresh is not None:
                    refresh: dict[str, object] = {"mode": obj.refresh.mode}
                    if obj.refresh.interval:
                        refresh["interval"] = obj.refresh.interval
                    if obj.refresh.anchor:
                        refresh["anchor"] = obj.refresh.anchor
                    if obj.refresh.timezone:
                        refresh["timezone"] = obj.refresh.timezone
                    if obj.refresh.max_staleness:
                        refresh["maxStaleness"] = obj.refresh.max_staleness
                    obj_raw["refresh"] = refresh
                objs[name] = obj_raw
            raw["dataObjects"] = objs
        if model.dimensions:
            dims: dict[str, object] = {}
            for name, dim in model.dimensions.items():
                dd: dict[str, object] = {
                    "dataObject": dim.view,
                    "column": dim.column,
                    "resultType": dim.result_type.value,
                }
                if dim.time_grain:
                    dd["timeGrain"] = dim.time_grain.value
                dims[name] = dd
            raw["dimensions"] = dims
        if model.measures:
            meas: dict[str, object] = {}
            for name, m in model.measures.items():
                md: dict[str, object] = {
                    "aggregation": m.aggregation,
                    "resultType": m.result_type.value,
                }
                if m.expression:
                    md["expression"] = m.expression
                if m.columns:
                    md["columns"] = [
                        {"dataObject": c.view or "", "column": c.column or ""} for c in m.columns
                    ]
                if m.total:
                    md["total"] = True
                meas[name] = md
            raw["measures"] = meas
        if model.metrics:
            mets: dict[str, object] = {}
            for name, met in model.metrics.items():
                mtd: dict[str, object] = {"type": met.type.value}
                if met.expression:
                    mtd["expression"] = met.expression
                if met.measure:
                    mtd["measure"] = met.measure
                if met.time_dimension:
                    mtd["timeDimension"] = met.time_dimension
                mets[name] = mtd
            raw["metrics"] = mets
        if model.filters:
            raw["filters"] = [
                {
                    "dataObject": f.data_object,
                    "column": f.column,
                    "operator": f.operator,
                    **({"value": f.value} if f.value is not None else {}),
                    **({"values": f.values} if f.values else {}),
                }
                for f in model.filters
            ]
        return raw

    # -- public API ----------------------------------------------------------

    def load_model(
        self,
        yaml_str: str | None = None,
        *,
        raw_dict: dict[str, object] | None = None,
        extends_yaml: list[str] | None = None,
        inherits_model_id: str | None = None,
        dedup: bool = True,
    ) -> LoadResult:
        """Parse, validate, and store a model.  Returns id + summary.

        Provide either ``yaml_str`` or ``raw_dict``.
        Raises ``ModelValidationError`` if the model has validation errors.
        Raises ``ModelCapacityError`` if the session's model cap is reached.

        When ``dedup`` is True (default) and the same OBML bytes have already
        been loaded into this store, the existing ``model_id`` is returned
        and ``model_load`` is set to ``"reused"``. Dedup only applies to
        plain ``yaml_str`` loads — when ``raw_dict``, ``extends_yaml``, or
        ``inherits_model_id`` is supplied the load always runs fresh, since
        the effective content depends on inputs not captured by the YAML
        bytes alone.

        A reused result carries an empty ``warnings`` list; a fresh result
        carries the warnings produced while parsing and validating.
        """
        # Dedup is meaningful only for a stand-alone YAML body. The other
        # input shapes either skip the YAML stage (raw_dict) or fold in
        # additional state (extends/inherits) that the bytes don't capture.
        dedup_eligible = (
            dedup
            and yaml_str is not None
            and raw_dict is None
            and not extends_yaml
            and inherits_model_id is None
        )
        content_hash: str | None = None
        if dedup_eligible:
            content_hash = self._content_hash(yaml_str or "")
            hit = None
            with self._lock:
                existing_id = self._content_hash_index.get(content_hash)
                if existing_id is not None:
                    cached_model = self._models.get(existing_id)
                    cached_summary = self._summaries.get(existing_id)
                    if cached_model is not None and cached_summary is not None:
                        hit = (existing_id, cached_model, cached_summary)
                    else:
                        # Stale index entry — drop it and fall through to a
                        # fresh load.
                        self._content_hash_index.pop(content_hash, None)
            if hit is not None:
                reused_id, reused_model, reused_summary = hit
                # Health is computed after the lock is released, matching the
                # fresh-load path below: the lock is not reentrant and should
                # not be held across the health computation, which would
                # otherwise block every other store operation meanwhile.
                return LoadResult(
                    model_id=reused_id,
                    data_objects=reused_summary.data_objects,
                    dimensions=reused_summary.dimensions,
                    measures=reused_summary.measures,
                    metrics=reused_summary.metrics,
                    warnings=[],
                    model_load="reused",
                    health=self._health_for(reused_model),
                )

        # Cheap early rejection before doing any parsing work.
        with self._lock:
            if len(self._models) >= self._max_models:
                raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")

        model, merged_raw, errors, warnings = self._parse_and_validate(
            yaml_str,
            raw_dict=raw_dict,
            extends_yaml=extends_yaml,
            inherits_model_id=inherits_model_id,
        )
        if errors:
            raise ModelValidationError(errors, warnings)

        model_id = self._new_id()

        # Eagerly export OBSL-Core graph (Option C: at model load time).
        graph = export_obsl(model, model_id)
        turtle = graph.serialize(format="turtle")
        artifact = GraphArtifact(graph=graph, turtle=turtle, generated_at=time.monotonic())

        summary = ModelSummary(
            model_id=model_id,
            data_objects=len(model.data_objects),
            dimensions=len(model.dimensions),
            measures=len(model.measures),
            metrics=len(model.metrics),
        )

        with self._lock:
            # Re-check capacity under lock — the first check (above) ran
            # outside the lock while parsing/exporting, so a concurrent
            # request may have filled the slot in the meantime.
            if len(self._models) >= self._max_models:
                raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")
            self._models[model_id] = model
            self._raws[model_id] = merged_raw
            self._graphs[model_id] = artifact
            self._summaries[model_id] = summary
            if content_hash is not None:
                # If a concurrent request beat us to it, the last writer wins;
                # the race is benign (both models work, the older one is just
                # not reachable via the index). See PLAN_model_load_dedup.md §6.3.
                self._content_hash_index[content_hash] = model_id

        return LoadResult(
            model_id=model_id,
            data_objects=summary.data_objects,
            dimensions=summary.dimensions,
            measures=summary.measures,
            metrics=summary.metrics,
            warnings=warnings,
            model_load="fresh",
            health=self._health_for(model),
        )

    def get_model(self, model_id: str) -> SemanticModel:
        """Look up a loaded model.  Raises ``KeyError`` if not found."""
        with self._lock:
            try:
                return self._models[model_id]
            except KeyError:
                raise KeyError(f"No model loaded with id '{model_id}'") from None

    def describe(self, model_id: str) -> ModelDescription:
        """Build a structured description of a loaded model for LLM consumption.

        Lists every data object, dimension, measure and metric of the model.
        Raises ``KeyError`` (from ``get_model``) when ``model_id`` is unknown.
        """
        model = self.get_model(model_id)

        object_infos = []
        for data_object in model.data_objects.values():
            object_infos.append(
                DataObjectInfo(
                    label=data_object.label,
                    code=data_object.qualified_code,
                    columns=list(data_object.columns.keys()),
                    join_targets=[join.join_to for join in data_object.joins],
                    synonyms=data_object.synonyms,
                    owner=data_object.owner,
                )
            )

        dimension_infos = []
        for dimension in model.dimensions.values():
            # A dimension without a time grain reports ``None``.
            grain = dimension.time_grain.value if dimension.time_grain else None
            dimension_infos.append(
                DimensionInfo(
                    name=dimension.label,
                    result_type=dimension.result_type.value,
                    data_object=dimension.view,
                    column=dimension.column,
                    time_grain=grain,
                    synonyms=dimension.synonyms,
                    owner=dimension.owner,
                )
            )

        measure_infos = []
        for measure in model.measures.values():
            measure_infos.append(
                MeasureInfo(
                    name=measure.label,
                    result_type=measure.result_type.value,
                    aggregation=measure.aggregation,
                    expression=measure.expression,
                    synonyms=measure.synonyms,
                    owner=measure.owner,
                )
            )

        metric_infos = []
        for metric in model.metrics.values():
            metric_infos.append(
                MetricInfo(
                    name=metric.label,
                    expression=metric.expression,
                    synonyms=metric.synonyms,
                    type=metric.type.value,
                    measure=metric.measure,
                    time_dimension=metric.time_dimension,
                    owner=metric.owner,
                )
            )

        return ModelDescription(
            model_id=model_id,
            data_objects=object_infos,
            dimensions=dimension_infos,
            measures=measure_infos,
            metrics=metric_infos,
        )

    def list_models(self) -> list[ModelSummary]:
        """Return a short summary for every loaded model."""
        with self._lock:
            return list(self._summaries.values())

    def remove_model(self, model_id: str) -> None:
        """Unload a model and its cached OBSL graph.  Raises ``KeyError`` if not found.

        Also removes the model's entry from the dedup index so the next load
        of the same OBML content runs fresh. PLAN_model_load_dedup.md §6.2.
        """
        with self._lock:
            try:
                del self._models[model_id]
            except KeyError:
                raise KeyError(f"No model loaded with id '{model_id}'") from None
            self._raws.pop(model_id, None)
            self._graphs.pop(model_id, None)
            self._summaries.pop(model_id, None)
            stale_hashes = [h for h, mid in self._content_hash_index.items() if mid == model_id]
            for h in stale_hashes:
                del self._content_hash_index[h]

    def compile_query(
        self,
        model_id: str,
        query: QueryObject,
        dialect: str,
    ) -> CompilationResult:
        """Compile a query against a loaded model."""
        model = self.get_model(model_id)
        return self._pipeline.compile(query, model, dialect)

    def refresh_contracts(self, model_id: str) -> dict[str, RefreshContract]:
        """Return the per-physical-table freshness contracts of a loaded model.

        The result cache uses these to derive an effective TTL for a query
        from the dataObjects it touched. Warnings reported by the contract
        collection are discarded here.
        """
        from orionbelt.cache.contracts import collect_table_contracts

        per_table, _warnings = collect_table_contracts(self.get_model(model_id))
        return per_table

    def validate(
        self,
        yaml_str: str | None = None,
        *,
        raw_dict: dict[str, object] | None = None,
        extends_yaml: list[str] | None = None,
        inherits_model_id: str | None = None,
    ) -> ValidationSummary:
        """Check a model (YAML string or raw dict) without adding it to the store.

        The summary is ``valid`` exactly when no errors were produced;
        warnings do not affect validity.
        """
        outcome = self._parse_and_validate(
            yaml_str,
            raw_dict=raw_dict,
            extends_yaml=extends_yaml,
            inherits_model_id=inherits_model_id,
        )
        # Only the findings matter here; the model and raw dict are dropped.
        _model, _raw, found_errors, found_warnings = outcome
        return ValidationSummary(
            valid=not found_errors,
            errors=found_errors,
            warnings=found_warnings,
        )

    # -- OBSL graph ---------------------------------------------------------

    def get_graph(self, model_id: str) -> GraphArtifact:
        """Return the cached OBSL graph for a model.  Raises ``KeyError`` if not found."""
        with self._lock:
            try:
                return self._graphs[model_id]
            except KeyError:
                raise KeyError(f"No graph for model '{model_id}'") from None

    def query_graph(self, model_id: str, sparql: str) -> SPARQLResult:
        """Run a read-only SPARQL query over the OBSL graph cached for ``model_id``.

        Raises ``KeyError`` (from ``get_graph``) when no graph is cached.
        """
        cached_graph = self.get_graph(model_id).graph
        return execute_sparql(cached_graph, sparql)

load_model(yaml_str=None, *, raw_dict=None, extends_yaml=None, inherits_model_id=None, dedup=True)

Parse, validate, and store a model. Returns id + summary.

Provide either yaml_str or raw_dict. Raises ModelValidationError if the model has validation errors. Raises ModelCapacityError if the session's model cap is reached.

When dedup is True (default) and the same OBML bytes have already been loaded into this store, the existing model_id is returned and model_load is set to "reused". Dedup only applies to plain yaml_str loads — when raw_dict, extends_yaml, or inherits_model_id is supplied the load always runs fresh, since the effective content depends on inputs not captured by the YAML bytes alone.

Source code in src/orionbelt/service/model_store.py
def load_model(
    self,
    yaml_str: str | None = None,
    *,
    raw_dict: dict[str, object] | None = None,
    extends_yaml: list[str] | None = None,
    inherits_model_id: str | None = None,
    dedup: bool = True,
) -> LoadResult:
    """Parse, validate, and store a model.  Returns id + summary.

    Provide either ``yaml_str`` or ``raw_dict``.
    Raises ``ModelValidationError`` if the model has validation errors.
    Raises ``ModelCapacityError`` if the session's model cap is reached.

    When ``dedup`` is True (default) and the same OBML bytes have already
    been loaded into this store, the existing ``model_id`` is returned
    and ``model_load`` is set to ``"reused"``. Dedup only applies to
    plain ``yaml_str`` loads — when ``raw_dict``, ``extends_yaml``, or
    ``inherits_model_id`` is supplied the load always runs fresh, since
    the effective content depends on inputs not captured by the YAML
    bytes alone.

    A reused result carries an empty ``warnings`` list; a fresh result
    carries the warnings produced while parsing and validating.
    """
    # Dedup is meaningful only for a stand-alone YAML body. The other
    # input shapes either skip the YAML stage (raw_dict) or fold in
    # additional state (extends/inherits) that the bytes don't capture.
    dedup_eligible = (
        dedup
        and yaml_str is not None
        and raw_dict is None
        and not extends_yaml
        and inherits_model_id is None
    )
    content_hash: str | None = None
    if dedup_eligible:
        content_hash = self._content_hash(yaml_str or "")
        hit = None
        with self._lock:
            existing_id = self._content_hash_index.get(content_hash)
            if existing_id is not None:
                cached_model = self._models.get(existing_id)
                cached_summary = self._summaries.get(existing_id)
                if cached_model is not None and cached_summary is not None:
                    hit = (existing_id, cached_model, cached_summary)
                else:
                    # Stale index entry — drop it and fall through to a
                    # fresh load.
                    self._content_hash_index.pop(content_hash, None)
        if hit is not None:
            reused_id, reused_model, reused_summary = hit
            # Health is computed after the lock is released, matching the
            # fresh-load path below: the lock is not reentrant and should
            # not be held across the health computation, which would
            # otherwise block every other store operation meanwhile.
            return LoadResult(
                model_id=reused_id,
                data_objects=reused_summary.data_objects,
                dimensions=reused_summary.dimensions,
                measures=reused_summary.measures,
                metrics=reused_summary.metrics,
                warnings=[],
                model_load="reused",
                health=self._health_for(reused_model),
            )

    # Cheap early rejection before doing any parsing work.
    with self._lock:
        if len(self._models) >= self._max_models:
            raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")

    model, merged_raw, errors, warnings = self._parse_and_validate(
        yaml_str,
        raw_dict=raw_dict,
        extends_yaml=extends_yaml,
        inherits_model_id=inherits_model_id,
    )
    if errors:
        raise ModelValidationError(errors, warnings)

    model_id = self._new_id()

    # Eagerly export OBSL-Core graph (Option C: at model load time).
    graph = export_obsl(model, model_id)
    turtle = graph.serialize(format="turtle")
    artifact = GraphArtifact(graph=graph, turtle=turtle, generated_at=time.monotonic())

    summary = ModelSummary(
        model_id=model_id,
        data_objects=len(model.data_objects),
        dimensions=len(model.dimensions),
        measures=len(model.measures),
        metrics=len(model.metrics),
    )

    with self._lock:
        # Re-check capacity under lock — the first check (above) ran
        # outside the lock while parsing/exporting, so a concurrent
        # request may have filled the slot in the meantime.
        if len(self._models) >= self._max_models:
            raise ModelCapacityError(f"Maximum models per session reached ({self._max_models})")
        self._models[model_id] = model
        self._raws[model_id] = merged_raw
        self._graphs[model_id] = artifact
        self._summaries[model_id] = summary
        if content_hash is not None:
            # If a concurrent request beat us to it, the last writer wins;
            # the race is benign (both models work, the older one is just
            # not reachable via the index). See PLAN_model_load_dedup.md §6.3.
            self._content_hash_index[content_hash] = model_id

    return LoadResult(
        model_id=model_id,
        data_objects=summary.data_objects,
        dimensions=summary.dimensions,
        measures=summary.measures,
        metrics=summary.metrics,
        warnings=warnings,
        model_load="fresh",
        health=self._health_for(model),
    )

get_model(model_id)

Look up a loaded model. Raises KeyError if not found.

Source code in src/orionbelt/service/model_store.py
def get_model(self, model_id: str) -> SemanticModel:
    """Return the model registered under ``model_id``.

    The registry is read while holding the store lock.  An unknown id
    raises ``KeyError`` carrying a descriptive message.
    """
    with self._lock:
        if model_id in self._models:
            return self._models[model_id]
    raise KeyError(f"No model loaded with id '{model_id}'")

describe(model_id)

Return a structured summary suitable for LLM consumption.

Source code in src/orionbelt/service/model_store.py
def describe(self, model_id: str) -> ModelDescription:
    """Assemble a structured, LLM-friendly description of one loaded model.

    The model is fetched through ``get_model``; every data object,
    dimension, measure and metric is projected onto its ``*Info`` record
    and the four resulting lists are bundled into a ``ModelDescription``.
    """
    loaded = self.get_model(model_id)

    return ModelDescription(
        model_id=model_id,
        data_objects=[
            DataObjectInfo(
                label=data_obj.label,
                code=data_obj.qualified_code,
                columns=list(data_obj.columns.keys()),
                join_targets=[join.join_to for join in data_obj.joins],
                synonyms=data_obj.synonyms,
                owner=data_obj.owner,
            )
            for data_obj in loaded.data_objects.values()
        ],
        dimensions=[
            DimensionInfo(
                name=dimension.label,
                result_type=dimension.result_type.value,
                data_object=dimension.view,
                column=dimension.column,
                # A dimension without a time grain is reported as None.
                time_grain=dimension.time_grain.value if dimension.time_grain else None,
                synonyms=dimension.synonyms,
                owner=dimension.owner,
            )
            for dimension in loaded.dimensions.values()
        ],
        measures=[
            MeasureInfo(
                name=measure.label,
                result_type=measure.result_type.value,
                aggregation=measure.aggregation,
                expression=measure.expression,
                synonyms=measure.synonyms,
                owner=measure.owner,
            )
            for measure in loaded.measures.values()
        ],
        metrics=[
            MetricInfo(
                name=metric.label,
                expression=metric.expression,
                synonyms=metric.synonyms,
                type=metric.type.value,
                measure=metric.measure,
                time_dimension=metric.time_dimension,
                owner=metric.owner,
            )
            for metric in loaded.metrics.values()
        ],
    )

list_models()

Return a short summary for every loaded model.

Source code in src/orionbelt/service/model_store.py
def list_models(self) -> list[ModelSummary]:
    """Return a fresh list holding the summary of every loaded model.

    The list is copied out of the registry while the store lock is held.
    """
    with self._lock:
        snapshot = [*self._summaries.values()]
    return snapshot

remove_model(model_id)

Unload a model and its cached OBSL graph. Raises KeyError if not found.

Also removes the model's entry from the dedup index so the next load of the same OBML content runs fresh. PLAN_model_load_dedup.md §6.2.

Source code in src/orionbelt/service/model_store.py
def remove_model(self, model_id: str) -> None:
    """Drop a model together with everything cached for it.

    Removes the model, its raw dict, its OBSL graph artifact and its
    summary, then deletes every dedup-index entry that points at this
    model id so a later load of the same OBML content runs fresh
    (PLAN_model_load_dedup.md §6.2).  Raises ``KeyError`` when no model
    is registered under ``model_id``.
    """
    with self._lock:
        if model_id not in self._models:
            raise KeyError(f"No model loaded with id '{model_id}'")
        del self._models[model_id]
        for side_table in (self._raws, self._graphs, self._summaries):
            side_table.pop(model_id, None)
        # Collect the matching hashes first: the index must not be mutated
        # while it is being iterated.
        index = self._content_hash_index
        for digest in [h for h, owner in index.items() if owner == model_id]:
            del index[digest]

compile_query(model_id, query, dialect)

Compile a query against a loaded model.

Source code in src/orionbelt/service/model_store.py
def compile_query(
    self,
    model_id: str,
    query: QueryObject,
    dialect: str,
) -> CompilationResult:
    """Compile ``query`` against the model stored under ``model_id``.

    The model is resolved via ``get_model`` and handed, together with the
    query and the dialect name, to the store's compilation pipeline.
    """
    target_model = self.get_model(model_id)
    run_pipeline = self._pipeline.compile
    return run_pipeline(query, target_model, dialect)

validate(yaml_str=None, *, raw_dict=None, extends_yaml=None, inherits_model_id=None)

Validate a model without storing it. Accepts YAML string or raw dict.

Source code in src/orionbelt/service/model_store.py
def validate(
    self,
    yaml_str: str | None = None,
    *,
    raw_dict: dict[str, object] | None = None,
    extends_yaml: list[str] | None = None,
    inherits_model_id: str | None = None,
) -> ValidationSummary:
    """Run parsing and validation on a model without registering it.

    The input may be given as a YAML string or as a raw dict.  The
    returned summary is ``valid`` exactly when no errors were reported;
    errors and warnings are passed through unchanged.
    """
    # Only the diagnostics matter here; the parsed model and merged raw
    # dict are discarded.
    _, _, problems, notes = self._parse_and_validate(
        yaml_str,
        raw_dict=raw_dict,
        extends_yaml=extends_yaml,
        inherits_model_id=inherits_model_id,
    )
    is_valid = len(problems) == 0
    return ValidationSummary(valid=is_valid, errors=problems, warnings=notes)

SessionManager

orionbelt.service.session_manager.SessionManager

Manages TTL-scoped sessions, each holding its own ModelStore.

Thread-safe. Call start() to begin the background cleanup thread and stop() to shut it down.

Parameters

- ttl_seconds: Sliding idle timeout — sessions expire after this many seconds of inactivity.
- max_age_seconds: Absolute maximum session lifetime regardless of activity.
- max_sessions: Global cap on concurrent sessions. create_session raises SessionCapacityError when at capacity.
- max_models_per_session: Maximum models a single session may hold. Passed through to each ModelStore instance.
- cleanup_interval: Seconds between background purge sweeps.
- is_single_model_mode: Flag retained for backwards compatibility — historically set when a MODEL_FILE preloaded the __default__ session. With MODEL_FILES (admin-curated named sessions), the flag is True and the __default__ session — still created on demand by MCP stdio — is kept alive and excluded from purge. False otherwise, in which case __default__ is treated like any other session and subject to TTL/max-age expiry.

Source code in src/orionbelt/service/session_manager.py
class SessionManager:
    """Manages TTL-scoped sessions, each holding its own ``ModelStore``.

    Thread-safe.  Call :meth:`start` to begin the background cleanup thread
    and :meth:`stop` to shut it down.

    Parameters
    ----------
    ttl_seconds:
        Sliding idle timeout — sessions expire after this many seconds of
        inactivity.
    max_age_seconds:
        Absolute maximum session lifetime regardless of activity.
    max_sessions:
        Global cap on concurrent sessions.  ``create_session`` raises
        :class:`SessionCapacityError` when at capacity.
    max_models_per_session:
        Maximum models a single session may hold.  Passed through to each
        ``ModelStore`` instance.
    cleanup_interval:
        Seconds between background purge sweeps.
    is_single_model_mode:
        Flag retained for backwards compatibility — historically set when
        a ``MODEL_FILE`` preloaded the ``__default__`` session. With
        ``MODEL_FILES`` (admin-curated named sessions), the flag is True
        and the ``__default__`` session — still created on demand by MCP
        stdio — is kept alive and excluded from purge. False otherwise,
        in which case ``__default__`` is treated like any other session
        and subject to TTL/max-age expiry.
    """

    def __init__(
        self,
        ttl_seconds: int = 1800,
        max_age_seconds: int = 86400,
        max_sessions: int = 500,
        max_models_per_session: int = 10,
        cleanup_interval: int = 60,
        is_single_model_mode: bool = False,
    ) -> None:
        self._ttl = ttl_seconds
        self._max_age = max_age_seconds
        self._max_sessions = max_sessions
        self._max_models = max_models_per_session
        self._cleanup_interval = cleanup_interval
        self._is_single_model_mode = is_single_model_mode
        # Guards every read and write of ``self._sessions``.
        self._lock = threading.Lock()
        self._sessions: dict[str, _Session] = {}
        self._stop_event = threading.Event()
        self._cleanup_thread: threading.Thread | None = None

    @property
    def ttl(self) -> int:
        """Session TTL in seconds."""
        return self._ttl

    @property
    def max_age(self) -> int:
        """Absolute max session lifetime in seconds."""
        return self._max_age

    @property
    def max_sessions(self) -> int:
        """Global concurrent session cap."""
        return self._max_sessions

    @property
    def max_models_per_session(self) -> int:
        """Maximum models a single session may hold."""
        return self._max_models

    # -- lifecycle -----------------------------------------------------------

    def start(self) -> None:
        """Start the background cleanup daemon thread.

        Does nothing when a cleanup thread is already registered.
        """
        if self._cleanup_thread is not None:
            return
        self._stop_event.clear()
        self._cleanup_thread = threading.Thread(
            target=self._cleanup_loop, daemon=True, name="session-cleanup"
        )
        self._cleanup_thread.start()

    def stop(self) -> None:
        """Signal the cleanup thread to stop and wait (up to 5 s) for it."""
        self._stop_event.set()
        if self._cleanup_thread is not None:
            self._cleanup_thread.join(timeout=5)
            self._cleanup_thread = None

    # -- public API ----------------------------------------------------------

    def create_session(self, metadata: dict[str, str] | None = None) -> SessionInfo:
        """Create a new session and return its info.

        Raises :class:`SessionCapacityError` when the global session cap
        is reached.
        """
        now_mono = time.monotonic()
        now_wall = datetime.now(UTC)
        session_id = secrets.token_hex(16)  # 32-char hex (128-bit)
        session = _Session(
            session_id=session_id,
            store=ModelStore(max_models=self._max_models),
            created_at=now_wall,
            created_at_mono=now_mono,
            last_accessed=now_mono,
            metadata=metadata or {},
            created_at_wall=now_wall,
            last_accessed_wall=now_wall,
        )
        with self._lock:
            # Count only non-default, non-expired sessions toward the cap.
            active = sum(
                1
                for s in self._sessions.values()
                if s.session_id != _DEFAULT_SESSION_ID and not self._is_expired(s, now_mono)
            )
            if active >= self._max_sessions:
                logger.warning(
                    "Session cap reached (%d/%d), rejecting create",
                    active,
                    self._max_sessions,
                )
                raise SessionCapacityError(
                    f"Maximum number of concurrent sessions reached ({self._max_sessions})"
                )
            self._sessions[session_id] = session
        logger.info("Session created: %s", session_id)
        return self._session_info(session)

    def get_store(self, session_id: str) -> ModelStore:
        """Get the ModelStore for a session, updating its last-accessed time.

        Raises :class:`SessionExpiredError` if the session has expired.
        Raises :class:`SessionNotFoundError` if the session ID is unknown.
        """
        now_mono = time.monotonic()
        with self._lock:
            return self._touch_live_session(session_id, now_mono).store

    def get_session(self, session_id: str) -> SessionInfo:
        """Get session info (also refreshes last-accessed).

        Raises :class:`SessionExpiredError` if the session has expired.
        Raises :class:`SessionNotFoundError` if the session ID is unknown.
        """
        now_mono = time.monotonic()
        with self._lock:
            return self._session_info(self._touch_live_session(session_id, now_mono))

    def close_session(self, session_id: str) -> None:
        """Explicitly close a session.

        Raises :class:`SessionNotFoundError` if the session ID is unknown.
        """
        with self._lock:
            try:
                del self._sessions[session_id]
            except KeyError:
                raise SessionNotFoundError(f"Session '{session_id}' not found") from None
        logger.info("Session closed: %s", session_id)

    def list_sessions(self) -> list[SessionInfo]:
        """Return info for all non-expired user sessions.

        Excludes the default session and any admin-managed (protected)
        sessions created by the multi-model startup loader.
        """
        now_mono = time.monotonic()
        result: list[SessionInfo] = []
        with self._lock:
            for session in self._sessions.values():
                if session.session_id == _DEFAULT_SESSION_ID:
                    continue
                if session.protected:
                    continue
                if not self._is_expired(session, now_mono):
                    result.append(self._session_info(session))
        return result

    @property
    def active_count(self) -> int:
        """Number of active (non-expired) sessions."""
        now_mono = time.monotonic()
        with self._lock:
            return sum(1 for s in self._sessions.values() if not self._is_expired(s, now_mono))

    def get_or_create_default(self) -> ModelStore:
        """Get (or lazily create) the legacy ``__default__`` session.

        Unlike :meth:`get_or_create_named`, the default session is NOT
        marked protected — its lifecycle is controlled by the
        ``is_single_model_mode`` flag, not by the protected mechanism.
        This preserves backward compatibility with the v2.3.x model-
        upload semantics where each new user session inherits the
        preloaded YAML.
        """
        with self._lock:
            session = self._sessions.get(_DEFAULT_SESSION_ID)
            if session is not None:
                session.last_accessed = time.monotonic()
                session.last_accessed_wall = datetime.now(UTC)
                return session.store
            now_mono = time.monotonic()
            now_wall = datetime.now(UTC)
            session = _Session(
                session_id=_DEFAULT_SESSION_ID,
                store=ModelStore(max_models=self._max_models),
                created_at=now_wall,
                created_at_mono=now_mono,
                last_accessed=now_mono,
                created_at_wall=now_wall,
                last_accessed_wall=now_wall,
            )
            self._sessions[_DEFAULT_SESSION_ID] = session
            return session.store

    def get_or_create_named(self, session_id: str) -> ModelStore:
        """Get (or lazily create) a session with a caller-chosen id.

        Used by the multi-model startup loader to register each pre-loaded
        model as its own internal session whose id is the resolved model
        name. The created session is marked ``protected`` — exempt from
        idle TTL eviction and not listed by :meth:`list_sessions`. Admin-
        managed.
        """
        with self._lock:
            session = self._sessions.get(session_id)
            if session is not None:
                session.last_accessed = time.monotonic()
                session.last_accessed_wall = datetime.now(UTC)
                return session.store
            now_mono = time.monotonic()
            now_wall = datetime.now(UTC)
            session = _Session(
                session_id=session_id,
                store=ModelStore(max_models=self._max_models),
                created_at=now_wall,
                created_at_mono=now_mono,
                last_accessed=now_mono,
                created_at_wall=now_wall,
                last_accessed_wall=now_wall,
                protected=True,
            )
            self._sessions[session_id] = session
            return session.store

    def list_protected_session_ids(self) -> list[str]:
        """Return the ids of all admin-managed (protected) sessions.

        Used by multi-model discovery (``GET /v1/models``) and by Flight
        routing to enumerate which model names are available.
        """
        with self._lock:
            return [s.session_id for s in self._sessions.values() if s.protected]

    # -- internal ------------------------------------------------------------

    def _touch_live_session(self, session_id: str, now_mono: float) -> _Session:
        """Return the live session for ``session_id`` and refresh its access time.

        Shared by :meth:`get_store` and :meth:`get_session`.  The caller
        must already hold ``self._lock``.  An expired session is removed
        from the registry before the error is raised.

        Raises :class:`SessionNotFoundError` if the session ID is unknown.
        Raises :class:`SessionExpiredError` if the session has expired.
        """
        session = self._sessions.get(session_id)
        if session is None:
            raise SessionNotFoundError(f"Session '{session_id}' not found")
        if self._is_expired(session, now_mono):
            reason = self._expiry_reason(session, now_mono)
            del self._sessions[session_id]
            logger.info("Session expired on access: %s (%s)", session_id, reason)
            raise SessionExpiredError(f"Session '{session_id}' has expired ({reason})")
        session.last_accessed = now_mono
        session.last_accessed_wall = datetime.now(UTC)
        return session

    def _is_expired(self, session: _Session, now_mono: float) -> bool:
        """Check if a session has exceeded idle TTL or absolute max-age.

        Protected sessions (admin-loaded models via ``MODEL_FILES``) never
        expire — they're owned by the process lifecycle, not by client
        activity. Without this guard, ``get_store()`` would delete them on
        access past TTL, even though ``_purge_expired`` correctly skips
        them.
        """
        if session.protected:
            return False
        idle = now_mono - session.last_accessed > self._ttl
        aged = now_mono - session.created_at_mono > self._max_age
        return idle or aged

    def _expiry_reason(self, session: _Session, now_mono: float) -> str:
        """Return a human-readable reason why a session expired.

        Max-age takes precedence over idle timeout when both apply.
        """
        idle_elapsed = now_mono - session.last_accessed
        age_elapsed = now_mono - session.created_at_mono
        if age_elapsed > self._max_age:
            return f"max-age {self._max_age}s exceeded after {age_elapsed:.0f}s"
        return f"idle {self._ttl}s exceeded after {idle_elapsed:.0f}s"

    def _session_info(self, session: _Session) -> SessionInfo:
        """Build the public :class:`SessionInfo` snapshot for ``session``.

        Both expiry timestamps are projected from the current wall-clock
        time plus the remaining monotonic budget, clamped at zero so an
        already-overdue session reports "now" rather than a past instant.
        """
        now_wall = datetime.now(UTC)
        # One monotonic sample keeps both "remaining" values consistent
        # with each other.
        now_mono = time.monotonic()
        idle_remaining = self._ttl - (now_mono - session.last_accessed)
        age_remaining = self._max_age - (now_mono - session.created_at_mono)

        # expires_at = when the idle TTL would fire (from last access)
        expires_at = now_wall + timedelta(seconds=max(0.0, idle_remaining))
        # max_expires_at = absolute deadline (from creation)
        max_expires_at = now_wall + timedelta(seconds=max(0.0, age_remaining))

        return SessionInfo(
            session_id=session.session_id,
            created_at=session.created_at_wall,
            last_accessed_at=session.last_accessed_wall,
            model_count=len(session.store.list_models()),
            metadata=session.metadata,
            expires_at=expires_at,
            max_expires_at=max_expires_at,
        )

    def _purge_expired(self) -> None:
        """Remove all expired sessions (called by cleanup thread).

        Protected sessions (admin-managed pre-loads) are never purged.
        The legacy ``__default__`` session is kept alive when
        ``is_single_model_mode`` is set.
        """
        now_mono = time.monotonic()
        with self._lock:
            skip_default = self._is_single_model_mode
            expired = [
                sid
                for sid, s in self._sessions.items()
                if not s.protected
                and (not skip_default or sid != _DEFAULT_SESSION_ID)
                and self._is_expired(s, now_mono)
            ]
            for sid in expired:
                reason = self._expiry_reason(self._sessions[sid], now_mono)
                del self._sessions[sid]
                logger.info("Session purged: %s (%s)", sid, reason)
            # Snapshot the size while the lock is still held; reading
            # ``self._sessions`` after release would race with other threads.
            remaining = len(self._sessions)
        if expired:
            logger.info(
                "Purge sweep: removed %d session(s), %d remaining",
                len(expired),
                remaining,
            )

    def _cleanup_loop(self) -> None:
        """Background loop that periodically purges expired sessions.

        ``Event.wait`` returns True once :meth:`stop` sets the event, which
        ends the loop; a timeout (False) triggers the next purge sweep.
        """
        while not self._stop_event.wait(timeout=self._cleanup_interval):
            self._purge_expired()

active_count property

Number of active (non-expired) sessions.

start()

Start the background cleanup daemon thread.

Source code in src/orionbelt/service/session_manager.py
def start(self) -> None:
    """Launch the background cleanup daemon thread.

    Returns immediately when a cleanup thread is already registered;
    otherwise clears the stop event, then creates and starts the thread.
    """
    already_started = self._cleanup_thread is not None
    if already_started:
        return
    self._stop_event.clear()
    worker = threading.Thread(
        name="session-cleanup",
        target=self._cleanup_loop,
        daemon=True,
    )
    self._cleanup_thread = worker
    worker.start()

stop()

Signal the cleanup thread to stop and wait for it.

Source code in src/orionbelt/service/session_manager.py
def stop(self) -> None:
    """Ask the cleanup thread to exit and wait up to five seconds for it.

    The stop event is always set; the thread reference is cleared after
    the join attempt when a thread was registered.
    """
    self._stop_event.set()
    if self._cleanup_thread is None:
        return
    self._cleanup_thread.join(timeout=5)
    self._cleanup_thread = None

create_session(metadata=None)

Create a new session and return its info.

Raises SessionCapacityError when the global session cap is reached.

Source code in src/orionbelt/service/session_manager.py
def create_session(self, metadata: dict[str, str] | None = None) -> SessionInfo:
    """Register a brand-new session and return its public info.

    The session gets a random id and its own ``ModelStore``.  Raises
    :class:`SessionCapacityError` when the number of live, non-default
    sessions has already reached the global cap.
    """
    mono_now = time.monotonic()
    wall_now = datetime.now(UTC)
    new_id = secrets.token_hex(16)  # 16 random bytes -> 32 hex characters
    new_session = _Session(
        session_id=new_id,
        store=ModelStore(max_models=self._max_models),
        created_at=wall_now,
        created_at_mono=mono_now,
        last_accessed=mono_now,
        metadata=metadata or {},
        created_at_wall=wall_now,
        last_accessed_wall=wall_now,
    )
    with self._lock:
        # Only sessions that are neither the default one nor already
        # expired occupy a slot.
        live = 0
        for existing in self._sessions.values():
            if existing.session_id == _DEFAULT_SESSION_ID:
                continue
            if self._is_expired(existing, mono_now):
                continue
            live += 1
        if live >= self._max_sessions:
            logger.warning(
                "Session cap reached (%d/%d), rejecting create",
                live,
                self._max_sessions,
            )
            raise SessionCapacityError(
                f"Maximum number of concurrent sessions reached ({self._max_sessions})"
            )
        self._sessions[new_id] = new_session
    logger.info("Session created: %s", new_id)
    return self._session_info(new_session)

get_store(session_id)

Get the ModelStore for a session, updating its last-accessed time.

Raises SessionExpiredError if the session has expired. Raises SessionNotFoundError if the session ID is unknown.

Source code in src/orionbelt/service/session_manager.py
def get_store(self, session_id: str) -> ModelStore:
    """Get the ModelStore for a session, updating its last-accessed time.

    Raises :class:`SessionExpiredError` if the session has expired.
    Raises :class:`SessionNotFoundError` if the session ID is unknown.
    """
    now_mono = time.monotonic()
    with self._lock:
        session = self._sessions.get(session_id)
        if session is None:
            raise SessionNotFoundError(f"Session '{session_id}' not found")
        if self._is_expired(session, now_mono):
            reason = self._expiry_reason(session, now_mono)
            del self._sessions[session_id]
            logger.info("Session expired on access: %s (%s)", session_id, reason)
            raise SessionExpiredError(f"Session '{session_id}' has expired ({reason})")
        session.last_accessed = now_mono
        session.last_accessed_wall = datetime.now(UTC)
        return session.store

get_session(session_id)

Get session info (also refreshes last-accessed).

Source code in src/orionbelt/service/session_manager.py
def get_session(self, session_id: str) -> SessionInfo:
    """Return a session's public info, refreshing its last-accessed stamps.

    Raises :class:`SessionNotFoundError` if the session ID is unknown.
    Raises :class:`SessionExpiredError` if the session has expired; the
    expired session is removed from the registry first.
    """
    stamp = time.monotonic()
    with self._lock:
        entry = self._sessions.get(session_id)
        if entry is None:
            raise SessionNotFoundError(f"Session '{session_id}' not found")
        if not self._is_expired(entry, stamp):
            entry.last_accessed = stamp
            entry.last_accessed_wall = datetime.now(UTC)
            return self._session_info(entry)
        # Expired: evict on access, then report why.
        why = self._expiry_reason(entry, stamp)
        del self._sessions[session_id]
        logger.info("Session expired on access: %s (%s)", session_id, why)
        raise SessionExpiredError(f"Session '{session_id}' has expired ({why})")

close_session(session_id)

Explicitly close a session.

Source code in src/orionbelt/service/session_manager.py
def close_session(self, session_id: str) -> None:
    """Remove a session from the registry on explicit request.

    Raises :class:`SessionNotFoundError` if the session ID is unknown.
    """
    with self._lock:
        try:
            del self._sessions[session_id]
        except KeyError:
            raise SessionNotFoundError(f"Session '{session_id}' not found") from None
    logger.info("Session closed: %s", session_id)

list_sessions()

Return info for all non-expired user sessions.

Excludes the default session and any admin-managed (protected) sessions created by the multi-model startup loader.

Source code in src/orionbelt/service/session_manager.py
def list_sessions(self) -> list[SessionInfo]:
    """Return info for every user session that has not expired.

    The default session and protected (admin-managed) sessions are left
    out of the listing.
    """
    cutoff = time.monotonic()
    with self._lock:
        return [
            self._session_info(candidate)
            for candidate in self._sessions.values()
            if candidate.session_id != _DEFAULT_SESSION_ID
            and not candidate.protected
            and not self._is_expired(candidate, cutoff)
        ]

get_or_create_default()

Get (or lazily create) the legacy __default__ session.

Unlike get_or_create_named(), the default session is NOT marked protected — its lifecycle is controlled by the is_single_model_mode flag, not by the protected mechanism. This preserves backward compatibility with the v2.3.x model-upload semantics where each new user session inherits the preloaded YAML.

Source code in src/orionbelt/service/session_manager.py
def get_or_create_default(self) -> ModelStore:
    """Return the store of the legacy ``__default__`` session, creating it on demand.

    An existing default session only has its last-accessed stamps
    refreshed.  A newly created one is not flagged as protected: its
    lifetime is governed by ``is_single_model_mode`` instead, which keeps
    the v2.3.x model-upload semantics (each new user session inherits the
    preloaded YAML) working.
    """
    with self._lock:
        default = self._sessions.get(_DEFAULT_SESSION_ID)
        if default is None:
            mono_now = time.monotonic()
            wall_now = datetime.now(UTC)
            default = _Session(
                session_id=_DEFAULT_SESSION_ID,
                store=ModelStore(max_models=self._max_models),
                created_at=wall_now,
                created_at_mono=mono_now,
                last_accessed=mono_now,
                created_at_wall=wall_now,
                last_accessed_wall=wall_now,
            )
            self._sessions[_DEFAULT_SESSION_ID] = default
        else:
            default.last_accessed = time.monotonic()
            default.last_accessed_wall = datetime.now(UTC)
        return default.store

SessionInfo

orionbelt.service.session_manager.SessionInfo dataclass

Public session metadata (returned by list/get).

Source code in src/orionbelt/service/session_manager.py
@dataclass
class SessionInfo:
    """Public session metadata (returned by list/get).

    A point-in-time snapshot assembled by ``SessionManager._session_info``;
    the two expiry fields are computed at snapshot time.
    """

    # Identifier of the session this snapshot describes.
    session_id: str
    # Wall-clock time at which the session was created.
    created_at: datetime
    # Wall-clock time of the most recent recorded access.
    last_accessed_at: datetime
    # Number of models currently held by the session's ModelStore.
    model_count: int
    # Metadata stored on the session (supplied to create_session, else empty).
    metadata: dict[str, str]
    # When the idle TTL would fire, measured from the last access.
    expires_at: datetime
    # Absolute deadline, measured from session creation (max-age).
    max_expires_at: datetime

Compiler Pipeline

orionbelt.compiler.pipeline.CompilationPipeline

Orchestrates: Query → Resolution → Planning → AST → SQL.

Source code in src/orionbelt/compiler/pipeline.py
class CompilationPipeline:
    """Orchestrates: Query → Resolution → Planning → AST → SQL.

    One resolver and one planner per strategy (star schema, CFL, raw) are
    created up front; :meth:`compile` chooses between the planners based on
    the resolved query.
    """

    def __init__(self) -> None:
        self._resolver = QueryResolver()
        self._star_planner = StarSchemaPlanner()
        self._cfl_planner = CFLPlanner()
        self._raw_planner = RawPlanner()

    def compile(
        self,
        query: QueryObject,
        model: SemanticModel,
        dialect_name: str,
    ) -> CompilationResult:
        """Compile a query to SQL for the specified dialect.

        Args:
            query: The query to compile.
            model: The semantic model the query is resolved against.
            dialect_name: Name looked up in ``DialectRegistry``.

        Returns:
            A ``CompilationResult`` carrying the generated SQL, the touched
            physical tables, resolution info, warnings, the SQL validation
            flag and the explain plan. SQL validation is non-blocking:
            validation errors are reported as warnings and ``sql_valid`` is
            set to ``False``.
        """
        # Create dialect first so resolution and planning share one
        # ``qualify_table`` — the EXISTS filter operator needs it during
        # resolution to render the correlated subquery's FROM clause.
        dialect = DialectRegistry.get(dialect_name)
        qualify_table = lambda obj: dialect.format_table_ref(  # noqa: E731
            obj.database, obj.schema_name, obj.code
        )

        # Phase 1: Resolution
        resolved = self._resolver.resolve(query, model, qualify_table=qualify_table)

        # Phase 1.5: Fanout detection (skip for CFL — each fact queried independently)
        if not resolved.requires_cfl:
            detect_fanout(resolved, model)

        # Phase 2: Planning (raw / star schema / CFL)
        # Coerce to a real bool: ``dimensions_exclude`` is only used for its
        # truthiness and ``_build_explain`` is annotated to take a bool.
        use_cfl = bool(resolved.requires_cfl or resolved.dimensions_exclude)
        if resolved.is_raw:
            plan = self._raw_planner.plan(
                resolved,
                model,
                qualify_table=qualify_table,
                dialect=dialect,
                union_by_name=dialect.capabilities.supports_union_all_by_name,
            )
        elif use_cfl:
            plan = self._cfl_planner.plan(
                resolved,
                model,
                qualify_table=qualify_table,
                union_by_name=dialect.capabilities.supports_union_all_by_name,
                dialect=dialect,
            )
        else:
            plan = self._star_planner.plan(
                resolved, model, qualify_table=qualify_table, dialect=dialect
            )

        # Phase 2.3 – 2.6: Aggregate-mode wrappers (filter context, PoP,
        # totals, cumulative, window). Raw mode has no measures, so these
        # are no-ops and skipped entirely for clarity.
        if resolved.is_raw:
            wrapped_ast = plan.ast
        else:
            # ROLLUP/CUBE wraps the base CTE inside total/PoP/cumulative/window
            # wrappers, but the outer wrapper SELECTs by dim/measure name —
            # the GROUPING() flag columns won't survive. Warn so callers
            # know subtotal rows have NULL in rolled-up dims with no flag
            # to disambiguate them from real-NULL detail rows.
            if resolved.grouping is not None and (
                resolved.has_totals
                or resolved.has_pop
                or resolved.has_cumulative
                or resolved.has_window
            ):
                resolved.warnings.append(
                    warning(
                        code=WarningCode.INCOMPATIBLE_COMBINATION,
                        message=(
                            "ROLLUP/CUBE combined with total / period-over-period / "
                            "cumulative / window measures — GROUPING() flag columns "
                            "may not appear in the final projection. Subtotal rows "
                            "are still produced, but callers cannot distinguish them "
                            "from detail rows whose rolled-up dim is legitimately NULL."
                        ),
                        hint=(
                            "Avoid combining `grouping: rollup|cube` with "
                            "`total: true`, period-over-period metrics, cumulative "
                            "metrics, or window metrics in the same query."
                        ),
                        # Report every flag that can trigger this warning,
                        # including has_window, so the context explains why
                        # the warning fired.
                        context={
                            "grouping": resolved.grouping.value,
                            "has_totals": resolved.has_totals,
                            "has_pop": resolved.has_pop,
                            "has_cumulative": resolved.has_cumulative,
                            "has_window": resolved.has_window,
                        },
                    )
                )
            # Wrap with filter context CTEs if needed
            wrapped_ast = wrap_with_filter_context(
                plan.ast, resolved, model, dialect, qualify_table
            )

            # Wrap with PoP CTEs if needed
            wrapped_ast = wrap_with_pop(wrapped_ast, resolved, model, dialect, qualify_table)

            # Wrap with totals CTE if needed
            # Skip totals wrap when PoP or cumulative is active — the combination
            # produces invalid SQL because totals rewrites the AST structure that
            # PoP/cumulative wrappers depend on.
            if resolved.has_totals and (resolved.has_pop or resolved.has_cumulative):
                resolved.warnings.append(
                    warning(
                        code=WarningCode.INCOMPATIBLE_COMBINATION,
                        message=(
                            "total=True measures are ignored when combined with "
                            "period-over-period or cumulative metrics in the same query"
                        ),
                        hint=(
                            "Drop total=True from the affected measures, or remove the "
                            "PoP/cumulative metric from this query."
                        ),
                        context={
                            "has_totals": True,
                            "has_pop": resolved.has_pop,
                            "has_cumulative": resolved.has_cumulative,
                        },
                    )
                )
            else:
                wrapped_ast = wrap_with_totals(wrapped_ast, resolved)

            # Wrap with cumulative CTE if needed.
            # Pass model + dialect so the wrapper can apply the declared
            # dataType cast inside cumulative_base and on the outer
            # window — otherwise cumulative output is silently DOUBLE
            # regardless of the metric's declared type.
            wrapped_ast = wrap_with_cumulative(wrapped_ast, resolved, model=model, dialect=dialect)

            # Wrap with window CTE (rank / lag / lead / ntile / first/last value).
            # Runs after cumulative so window metrics can compose over cumulative
            # outputs (e.g. ranking a moving average).
            wrapped_ast = wrap_with_window(wrapped_ast, resolved, model=model, dialect=dialect)

            # Drop HAVING-only auto-included measures from the final SELECT
            # so the user only sees the columns they asked for.
            wrapped_ast = _drop_having_only_projection(wrapped_ast, resolved)

        # Phase 3: Dialect-specific SQL rendering
        codegen = CodeGenerator(dialect)
        sql = codegen.generate(wrapped_ast)

        # Phase 4: SQL validation (non-blocking)
        validation_errors = validate_sql(sql, dialect_name)
        sql_valid = len(validation_errors) == 0
        warnings = resolved.warnings
        if not sql_valid:
            # Build a new list so the validation warnings are not appended
            # to ``resolved.warnings`` itself.
            warnings = warnings + [
                warning(
                    code=WarningCode.SQL_VALIDATION,
                    message=f"SQL validation: {e}",
                )
                for e in validation_errors
            ]

        # Build explain plan
        explain = self._build_explain(resolved, model, use_cfl, plan)

        # Compute deduplicated physical tables touched by the query
        physical_tables = _compute_physical_tables(resolved, query, model)

        return CompilationResult(
            sql=sql,
            dialect=dialect_name,
            physical_tables=physical_tables,
            resolved=ResolvedInfo(
                fact_tables=resolved.fact_tables,
                dimensions=[d.name for d in resolved.dimensions],
                measures=[m.name for m in resolved.measures],
            ),
            warnings=warnings,
            sql_valid=sql_valid,
            explain=explain,
        )

    @staticmethod
    def _q(name: str) -> str:
        """Quote an identifier for explain output."""
        return f'"{name}"'

    def _build_explain(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        use_cfl: bool,
        plan: QueryPlan,
    ) -> ExplainPlan:
        """Build the explain plan from resolution results.

        Args:
            resolved: The resolved query whose flags, join steps and filters
                are summarised.
            model: Not read by this method; kept in the signature for callers.
            use_cfl: Whether the CFL planner was selected for this query.
            plan: The query plan; only its ``cfl_legs`` are read here.

        Returns:
            An ``ExplainPlan`` describing the planner choice, the base
            object, resolution-level joins (non-CFL queries only), per-leg
            CFL details and the feature flags of the resolved query.
        """
        q = self._q

        # Planner choice
        if resolved.is_raw:
            planner = "Raw"
            distinct_note = " with DISTINCT" if resolved.distinct else ""
            planner_reason = (
                f"Raw-mode projection of physical columns{distinct_note} — "
                f"no aggregation, no GROUP BY"
            )
        elif use_cfl:
            planner = "CFL"
            if resolved.dimensions_exclude:
                planner_reason = (
                    "dimensionsExclude anti-join — "
                    "CROSS JOIN of distinct values EXCEPT existing combinations"
                )
            else:
                sources = ", ".join(q(s) for s in sorted(resolved.measure_source_objects))
                planner_reason = (
                    f"Measures reference independent fact tables ({sources}) — "
                    f"Composite Fact Layer merges them via UNION ALL"
                )
        else:
            planner = "Star Schema"
            planner_reason = (
                "All requested objects are reachable from a single base via directed joins"
            )

        # Base object — explain should reflect actual selection logic
        base = resolved.base_object
        if resolved.measure_source_objects:
            if use_cfl and len(resolved.measure_source_objects) > 1:
                base_reason = (
                    "Not applicable — each CFL leg uses its own common root (see cfl_legs)"
                )
            elif len(resolved.measure_source_objects) > 1:
                sources = ", ".join(q(s) for s in sorted(resolved.measure_source_objects))
                base_reason = (
                    f"{q(base)} selected as base — most connected fact table "
                    f"among measure sources ({sources})"
                )
            else:
                base_reason = f"{q(base)} selected as base — sole measure source object"
        elif len(resolved.required_objects) > 1:
            base_reason = (
                f"{q(base)} selected as base — common root that can reach "
                f"all required objects via directed joins"
            )
        else:
            base_reason = f"{q(base)} selected as base for single-object query"

        # Joins — for CFL queries the per-leg joins are more informative,
        # so only include resolution-level joins for star schema queries.
        explain_joins: list[ExplainJoin] = []
        if not use_cfl:
            for step in resolved.join_steps:
                join_cols = [
                    f"{fc} = {tc}"
                    for fc, tc in zip(step.from_columns, step.to_columns, strict=True)
                ]
                if step.reversed:
                    reason = (
                        f"Reversed join from {q(step.from_object)} to {q(step.to_object)} — "
                        f"original join was defined in the opposite direction"
                    )
                else:
                    # Separate the two quoted names; without a separator the
                    # text reads as one run-together identifier ("A""B").
                    reason = (
                        f"Join {q(step.from_object)} → {q(step.to_object)} to include "
                        f"columns needed by the query"
                    )
                explain_joins.append(
                    ExplainJoin(
                        from_object=step.from_object,
                        to_object=step.to_object,
                        join_columns=join_cols,
                        reason=reason,
                        cardinality=step.cardinality.value,
                    )
                )

        # CFL leg details
        cfl_leg_explains: list[ExplainCflLeg] = [
            ExplainCflLeg(
                measure_source=leg.measure_source,
                common_root=leg.common_root,
                reason=leg.reason,
                measures=leg.measures,
                joins=leg.joins,
            )
            for leg in plan.cfl_legs
        ]

        return ExplainPlan(
            planner=planner,
            planner_reason=planner_reason,
            base_object=base,
            base_object_reason=base_reason,
            joins=explain_joins,
            where_filter_count=len(resolved.where_filters),
            having_filter_count=len(resolved.having_filters),
            has_totals=resolved.has_totals,
            has_grain_overrides=resolved.has_grain_overrides,
            has_filter_context=resolved.has_filter_context,
            has_cumulative=resolved.has_cumulative,
            has_pop=resolved.has_pop,
            has_window=resolved.has_window,
            cfl_legs=cfl_leg_explains,
        )

compile(query, model, dialect_name)

Compile a query to SQL for the specified dialect.

Source code in src/orionbelt/compiler/pipeline.py
def compile(
    self,
    query: QueryObject,
    model: SemanticModel,
    dialect_name: str,
) -> CompilationResult:
    """Compile a query to SQL for the specified dialect.

    Args:
        query: The query to compile.
        model: The semantic model the query is resolved against.
        dialect_name: Name looked up in ``DialectRegistry``.

    Returns:
        A ``CompilationResult`` carrying the generated SQL, the touched
        physical tables, resolution info, warnings, the SQL validation
        flag and the explain plan. SQL validation is non-blocking:
        validation errors are reported as warnings and ``sql_valid`` is
        set to ``False``.
    """
    # Create dialect first so resolution and planning share one
    # ``qualify_table`` — the EXISTS filter operator needs it during
    # resolution to render the correlated subquery's FROM clause.
    dialect = DialectRegistry.get(dialect_name)
    qualify_table = lambda obj: dialect.format_table_ref(  # noqa: E731
        obj.database, obj.schema_name, obj.code
    )

    # Phase 1: Resolution
    resolved = self._resolver.resolve(query, model, qualify_table=qualify_table)

    # Phase 1.5: Fanout detection (skip for CFL — each fact queried independently)
    if not resolved.requires_cfl:
        detect_fanout(resolved, model)

    # Phase 2: Planning (raw / star schema / CFL)
    # Coerce to a real bool: ``dimensions_exclude`` is only used for its
    # truthiness and ``_build_explain`` is annotated to take a bool.
    use_cfl = bool(resolved.requires_cfl or resolved.dimensions_exclude)
    if resolved.is_raw:
        plan = self._raw_planner.plan(
            resolved,
            model,
            qualify_table=qualify_table,
            dialect=dialect,
            union_by_name=dialect.capabilities.supports_union_all_by_name,
        )
    elif use_cfl:
        plan = self._cfl_planner.plan(
            resolved,
            model,
            qualify_table=qualify_table,
            union_by_name=dialect.capabilities.supports_union_all_by_name,
            dialect=dialect,
        )
    else:
        plan = self._star_planner.plan(
            resolved, model, qualify_table=qualify_table, dialect=dialect
        )

    # Phase 2.3 – 2.6: Aggregate-mode wrappers (filter context, PoP,
    # totals, cumulative, window). Raw mode has no measures, so these
    # are no-ops and skipped entirely for clarity.
    if resolved.is_raw:
        wrapped_ast = plan.ast
    else:
        # ROLLUP/CUBE wraps the base CTE inside total/PoP/cumulative/window
        # wrappers, but the outer wrapper SELECTs by dim/measure name —
        # the GROUPING() flag columns won't survive. Warn so callers
        # know subtotal rows have NULL in rolled-up dims with no flag
        # to disambiguate them from real-NULL detail rows.
        if resolved.grouping is not None and (
            resolved.has_totals
            or resolved.has_pop
            or resolved.has_cumulative
            or resolved.has_window
        ):
            resolved.warnings.append(
                warning(
                    code=WarningCode.INCOMPATIBLE_COMBINATION,
                    message=(
                        "ROLLUP/CUBE combined with total / period-over-period / "
                        "cumulative / window measures — GROUPING() flag columns "
                        "may not appear in the final projection. Subtotal rows "
                        "are still produced, but callers cannot distinguish them "
                        "from detail rows whose rolled-up dim is legitimately NULL."
                    ),
                    hint=(
                        "Avoid combining `grouping: rollup|cube` with "
                        "`total: true`, period-over-period metrics, cumulative "
                        "metrics, or window metrics in the same query."
                    ),
                    # Report every flag that can trigger this warning,
                    # including has_window, so the context explains why
                    # the warning fired.
                    context={
                        "grouping": resolved.grouping.value,
                        "has_totals": resolved.has_totals,
                        "has_pop": resolved.has_pop,
                        "has_cumulative": resolved.has_cumulative,
                        "has_window": resolved.has_window,
                    },
                )
            )
        # Wrap with filter context CTEs if needed
        wrapped_ast = wrap_with_filter_context(
            plan.ast, resolved, model, dialect, qualify_table
        )

        # Wrap with PoP CTEs if needed
        wrapped_ast = wrap_with_pop(wrapped_ast, resolved, model, dialect, qualify_table)

        # Wrap with totals CTE if needed
        # Skip totals wrap when PoP or cumulative is active — the combination
        # produces invalid SQL because totals rewrites the AST structure that
        # PoP/cumulative wrappers depend on.
        if resolved.has_totals and (resolved.has_pop or resolved.has_cumulative):
            resolved.warnings.append(
                warning(
                    code=WarningCode.INCOMPATIBLE_COMBINATION,
                    message=(
                        "total=True measures are ignored when combined with "
                        "period-over-period or cumulative metrics in the same query"
                    ),
                    hint=(
                        "Drop total=True from the affected measures, or remove the "
                        "PoP/cumulative metric from this query."
                    ),
                    context={
                        "has_totals": True,
                        "has_pop": resolved.has_pop,
                        "has_cumulative": resolved.has_cumulative,
                    },
                )
            )
        else:
            wrapped_ast = wrap_with_totals(wrapped_ast, resolved)

        # Wrap with cumulative CTE if needed.
        # Pass model + dialect so the wrapper can apply the declared
        # dataType cast inside cumulative_base and on the outer
        # window — otherwise cumulative output is silently DOUBLE
        # regardless of the metric's declared type.
        wrapped_ast = wrap_with_cumulative(wrapped_ast, resolved, model=model, dialect=dialect)

        # Wrap with window CTE (rank / lag / lead / ntile / first/last value).
        # Runs after cumulative so window metrics can compose over cumulative
        # outputs (e.g. ranking a moving average).
        wrapped_ast = wrap_with_window(wrapped_ast, resolved, model=model, dialect=dialect)

        # Drop HAVING-only auto-included measures from the final SELECT
        # so the user only sees the columns they asked for.
        wrapped_ast = _drop_having_only_projection(wrapped_ast, resolved)

    # Phase 3: Dialect-specific SQL rendering
    codegen = CodeGenerator(dialect)
    sql = codegen.generate(wrapped_ast)

    # Phase 4: SQL validation (non-blocking)
    validation_errors = validate_sql(sql, dialect_name)
    sql_valid = len(validation_errors) == 0
    warnings = resolved.warnings
    if not sql_valid:
        # Build a new list so the validation warnings are not appended
        # to ``resolved.warnings`` itself.
        warnings = warnings + [
            warning(
                code=WarningCode.SQL_VALIDATION,
                message=f"SQL validation: {e}",
            )
            for e in validation_errors
        ]

    # Build explain plan
    explain = self._build_explain(resolved, model, use_cfl, plan)

    # Compute deduplicated physical tables touched by the query
    physical_tables = _compute_physical_tables(resolved, query, model)

    return CompilationResult(
        sql=sql,
        dialect=dialect_name,
        physical_tables=physical_tables,
        resolved=ResolvedInfo(
            fact_tables=resolved.fact_tables,
            dimensions=[d.name for d in resolved.dimensions],
            measures=[m.name for m in resolved.measures],
        ),
        warnings=warnings,
        sql_valid=sql_valid,
        explain=explain,
    )

Query Resolution

orionbelt.compiler.resolution.QueryResolver

Resolves a QueryObject + SemanticModel into a ResolvedQuery.

Source code in src/orionbelt/compiler/resolution.py
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
class QueryResolver:
    """Resolves a QueryObject + SemanticModel into a ResolvedQuery."""

    def resolve(
        self,
        query: QueryObject,
        model: SemanticModel,
        qualify_table: Callable[[DataObject], str] | None = None,
    ) -> ResolvedQuery:
        """Resolve ``query`` against ``model`` and return the populated ``ResolvedQuery``.

        The method runs a fixed sequence of stages, each of which writes into a
        shared ``_ResolutionContext``: select resolution (raw fields, or
        dimensions + measures + HAVING-only measures), base-object selection,
        multi-fact (CFL) detection, ``dimensionsExclude`` validation,
        ``usePathNames`` validation, join-path resolution, reachability
        checks, static/query filter classification, ORDER BY resolution and
        auto-ordering.

        Errors appended to ``ctx.errors`` along the way are not raised
        immediately; they are raised together at the end.

        Args:
            query: The query to resolve (select, where, having, order_by,
                limit/offset, grouping, use_path_names, dimensions_exclude).
            model: The semantic model providing data objects, dimensions,
                measures, metrics and static filters.
            qualify_table: Optional callback stored on the resolution context;
                it is not called directly in this method.

        Returns:
            The ``ResolvedQuery`` accumulated on the resolution context.

        Raises:
            ResolutionError: If any errors were accumulated during resolution.
        """
        ctx = _ResolutionContext(
            model=model,
            result=ResolvedQuery(
                limit=query.limit,
                offset=query.offset,
                use_path_names=list(query.use_path_names),
                is_raw=query.select.is_raw,
                distinct=query.select.distinct,
                grouping=query.grouping,
            ),
            qualify_table=qualify_table,
        )

        # Build global column lookup: col_name → (object_name, source_column)
        # NOTE: a column name that appears on several data objects keeps only
        # the entry of the object iterated last.
        for obj_name, obj in model.data_objects.items():
            for col_name, col_obj in obj.columns.items():
                ctx.global_columns[col_name] = (obj_name, col_obj.code)

        if query.select.is_raw:
            # Raw mode: project physical columns, no aggregation.
            for ref in query.select.fields:
                self._resolve_raw_field(ctx, ref)
        else:
            # Aggregate mode (default).
            # 1. Resolve dimensions (string or coalesce group).
            # Coalesce groups expand into their constituent dimensions, each
            # tagged with the same coalesce_alias so the CFL outer wrapper can
            # emit COALESCE(d1, d2, ...) AS <alias>.
            for dim_entry in query.select.dimensions:
                if isinstance(dim_entry, CoalesceDimension):
                    self._resolve_coalesce_dimension(ctx, dim_entry, ctx.result.coalesce_aliases)
                else:
                    self._append_resolved_dimension(ctx, dim_entry)

            # 2. Resolve measures and track their source objects
            for measure_name in query.select.measures:
                resolved_meas = self._resolve_measure(ctx, measure_name)
                if resolved_meas:
                    ctx.result.measures.append(resolved_meas)
                    source_objs = self._get_measure_source_objects(ctx, measure_name)
                    ctx.result.measure_source_objects.update(source_objs)
                    ctx.result.required_objects.update(source_objs)

            # 2.5. Auto-include measures referenced by HAVING but not by SELECT.
            # Without this, codegen emits a HAVING clause that references an
            # alias for a column the SELECT doesn't project — every database
            # rejects the SQL with a "must appear in GROUP BY" binder error.
            # Routing this through the regular measure-resolution path also
            # updates ``measure_source_objects`` so the multi-fact CFL trigger
            # below sees the HAVING-only measure's source.
            existing_measure_names = {m.name for m in ctx.result.measures}
            for ref in self._collect_having_measure_refs(query, model):
                if ref in existing_measure_names:
                    continue
                resolved_meas = self._resolve_measure(ctx, ref)
                if resolved_meas is None:
                    continue
                ctx.result.measures.append(resolved_meas)
                ctx.result.having_only_measures.add(ref)
                existing_measure_names.add(ref)
                source_objs = self._get_measure_source_objects(ctx, ref)
                ctx.result.measure_source_objects.update(source_objs)
                ctx.result.required_objects.update(source_objs)

        # 3. Determine base object (the one with most joins / most measures)
        ctx.result.base_object = self._select_base_object(ctx)
        if ctx.result.base_object:
            ctx.result.required_objects.add(ctx.result.base_object)

        # Detect multi-fact: CFL is needed only when measure source objects
        # span multiple independent fact tables.
        if len(ctx.result.measure_source_objects) > 1:
            graph = JoinGraph(model, use_path_names=query.use_path_names or None)
            reachable = graph.descendants(ctx.result.base_object)
            unreachable = ctx.result.measure_source_objects - reachable - {ctx.result.base_object}
            if unreachable:
                ctx.result.requires_cfl = True

        # Dimension-only queries: when dimensions span independent branches,
        # join through intermediate bridge/fact tables (no CFL needed).
        # Add intermediate tables from the join steps to required_objects
        # so the star schema planner includes them.
        if not ctx.result.measure_source_objects and ctx.result.dimensions:
            dim_objects = {d.object_name for d in ctx.result.dimensions}
            if not dim_objects <= {ctx.result.base_object}:
                graph = JoinGraph(model, use_path_names=query.use_path_names or None)
                steps = graph.find_join_path(
                    {ctx.result.base_object},
                    dim_objects,
                    via_constraints=ctx.result.via_constraints or None,
                )
                for step in steps:
                    ctx.result.required_objects.add(step.from_object)
                    ctx.result.required_objects.add(step.to_object)

        # Raw mode: detect multi-fact (fields span objects unreachable from
        # the base via directed joins). The pipeline rejects this case for
        # now — raw CFL is a planned follow-up.
        if ctx.result.is_raw and ctx.result.base_object:
            field_objects = {f.object_name for f in ctx.result.fields}
            if len(field_objects) > 1:
                graph = JoinGraph(model, use_path_names=query.use_path_names or None)
                reachable = graph.descendants(ctx.result.base_object)
                unreachable = field_objects - reachable - {ctx.result.base_object}
                if unreachable:
                    ctx.result.requires_cfl = True

        # Validate dimensionsExclude constraints
        if query.dimensions_exclude:
            if query.select.measures:
                ctx.errors.append(
                    SemanticError(
                        code="DIMENSIONS_EXCLUDE_WITH_MEASURES",
                        message="dimensionsExclude cannot be combined with measures",
                        path="select",
                    )
                )
            elif len(ctx.result.dimensions) < 2:
                ctx.errors.append(
                    SemanticError(
                        code="DIMENSIONS_EXCLUDE_INSUFFICIENT",
                        message="dimensionsExclude requires at least 2 dimensions",
                        path="select.dimensions",
                    )
                )
            else:
                ctx.result.dimensions_exclude = True

        # 4. Validate usePathNames before building join graph
        self._validate_use_path_names(ctx, query.use_path_names)

        # 5. Resolve join paths
        # The graph stored on the context is built here, after validation; the
        # graphs used for detection above were local throwaways.
        ctx.graph = JoinGraph(model, use_path_names=query.use_path_names or None)
        if ctx.result.base_object and len(ctx.result.required_objects) > 1:
            ctx.result.join_steps = ctx.graph.find_join_path(
                {ctx.result.base_object},
                ctx.result.required_objects,
                via_constraints=ctx.result.via_constraints or None,
            )

        # Build set of all objects present in the query's join graph
        if ctx.result.base_object:
            ctx.joined_objects.add(ctx.result.base_object)
        for step in ctx.result.join_steps:
            ctx.joined_objects.add(step.to_object)

        # Detect required objects that the star-schema planner cannot reach.
        # Many-to-one joins are forward-only (reverse traversal would inflate
        # the base table), so a required object that's only reachable via a
        # reverse m-to-1 hop is unreachable.  Raise a clear error rather than
        # silently producing wrong SQL.  CFL legs are validated separately.
        if ctx.result.base_object and not ctx.result.requires_cfl:
            unreachable = ctx.result.required_objects - ctx.joined_objects
            # sorted() keeps the error order deterministic across runs.
            for unreachable_name in sorted(unreachable):
                ctx.errors.append(
                    SemanticError(
                        code="UNREACHABLE_REQUIRED_OBJECT",
                        message=(
                            f"Data object '{unreachable_name}' is required by the query but "
                            f"cannot be reached from base '{ctx.result.base_object}' via "
                            f"directed joins. Many-to-one joins are forward-only; reverse "
                            f"traversal would inflate row counts. Add an explicit join from "
                            f"'{ctx.result.base_object}' (or an intermediate object) to "
                            f"'{unreachable_name}', or split the query so each fact is "
                            f"queried independently."
                        ),
                        path="select",
                    )
                )

        # 5b. Inject static model filters — always applied as WHERE conditions
        static_exprs: list[Expr] = []
        for mf in model.filters:
            static_filter = self._resolve_static_filter(ctx, mf)
            if static_filter:
                ctx.result.where_filters.append(static_filter)
                static_exprs.append(static_filter.expression)

        # 6. Classify filters — skip query-time duplicates of static filters
        for qfi in query.where:
            resolved_filter = self._resolve_filter_item(ctx, qfi, is_having=False)
            if resolved_filter and resolved_filter.expression not in static_exprs:
                ctx.result.where_filters.append(resolved_filter)

        for qfi in query.having:
            resolved_filter = self._resolve_filter_item(ctx, qfi, is_having=True)
            if resolved_filter:
                ctx.result.having_filters.append(resolved_filter)

        # 7. Resolve order by — must reference a dimension or measure in SELECT
        select_count = len(ctx.result.dimensions) + len(ctx.result.measures)
        for ob in query.order_by:
            expr = self._resolve_order_by_field(ctx, ob.field, select_count)
            if expr:
                # Tuple layout: (expression, is_descending, nulls_position)
                ctx.result.order_by_exprs.append((expr, ob.direction == "desc", ob.nulls))

        # 8. ROLLUP / CUBE: backfill NULLS FIRST on any explicit ORDER BY entry
        # that didn't specify a NULLs position. Subtotal and grand-total rows
        # carry NULLs in the rolled-up group-by columns, and BI tools expect
        # those totals at the top of the result — not interleaved with details.
        if ctx.result.grouping is not None and ctx.result.order_by_exprs:
            ctx.result.order_by_exprs = [
                (expr, desc, NullsPosition.FIRST if nulls is None else nulls)
                for expr, desc, nulls in ctx.result.order_by_exprs
            ]

        # 9. Auto-order — when no explicit ORDER BY, append ORDER BY over all
        # SELECT dimensions (or raw fields) under two conditions:
        #   (a) LIMIT is set: cache hashes on compiled SQL; without ORDER BY
        #       ``LIMIT N`` returns any N rows, freezing one arbitrary slice.
        #   (b) ROLLUP / CUBE: subtotal layout is otherwise unpredictable.
        # ROLLUP / CUBE defaults to NULLS FIRST (totals at the top).
        # Aggregate-only queries (no dims, no fields) are already single-row
        # deterministic — skip.
        needs_auto_order = not ctx.result.order_by_exprs and (
            ctx.result.limit is not None or ctx.result.grouping is not None
        )
        if needs_auto_order:
            nulls_default = NullsPosition.FIRST if ctx.result.grouping is not None else None
            if ctx.result.is_raw and ctx.result.fields:
                for f in ctx.result.fields:
                    ctx.result.order_by_exprs.append(
                        (ColumnRef(name=f.alias), False, nulls_default)
                    )
            elif ctx.result.dimensions:
                for dim in ctx.result.dimensions:
                    ctx.result.order_by_exprs.append(
                        (ColumnRef(name=dim.name), False, nulls_default)
                    )

        # Report every accumulated problem in one exception.
        if ctx.errors:
            raise ResolutionError(ctx.errors)

        return ctx.result

    # -- raw mode fields -----------------------------------------------------

    def _resolve_raw_field(self, ctx: _ResolutionContext, ref: str) -> None:
        """Resolve one raw-mode ``DataObject.Column`` reference.

        On success a ``ResolvedField`` is appended to ``ctx.result.fields``
        (aliased with the reference exactly as written) and the owning data
        object is added to ``ctx.result.required_objects``. On failure an error
        is recorded on ``ctx.errors`` and nothing else is changed; the caller
        raises the accumulated errors later.
        """

        def reject(code: str, message: str) -> None:
            # All raw-field problems are reported against the same query path.
            ctx.errors.append(SemanticError(code=code, message=message, path="select.fields"))

        # Split on the first dot only: everything after it is the column part.
        object_part, dot, column_part = ref.partition(".")
        if not dot:
            reject(
                "RAW_FIELD_INVALID_REF",
                f"Raw-mode field '{ref}' must be a qualified 'DataObject.Column' reference",
            )
            return

        obj_name = object_part.strip()
        col_name = column_part.strip()

        data_object = ctx.model.data_objects.get(obj_name)
        if data_object is None:
            reject(
                "RAW_FIELD_UNKNOWN_OBJECT",
                f"Raw-mode field '{ref}' references unknown data object '{obj_name}'",
            )
            return

        column = data_object.columns.get(col_name)
        if column is None:
            reject(
                "RAW_FIELD_UNKNOWN_COLUMN",
                f"Raw-mode field '{ref}' references unknown column "
                f"'{col_name}' on data object '{obj_name}'",
            )
            return

        resolved = ResolvedField(
            object_name=obj_name,
            column_name=col_name,
            source_column=column.code,
            alias=ref,
        )
        ctx.result.fields.append(resolved)
        ctx.result.required_objects.add(obj_name)

    # -- dimensions ----------------------------------------------------------

    def _append_resolved_dimension(
        self,
        ctx: _ResolutionContext,
        dim_str: str,
        coalesce_alias: str | None = None,
    ) -> ResolvedDimension | None:
        """Parse and resolve ``dim_str``, then record it on the query result.

        Returns the resolved dimension, or ``None`` when resolution failed (the
        error has then already been recorded by ``_resolve_dimension``). When
        the model dimension declares ``via``, that object is marked as required
        and registered as a via-constraint for the dimension's data object.
        ``coalesce_alias``, when given, tags the dimension as a member of that
        coalesce group.
        """
        parsed = DimensionRef.parse(dim_str)
        dimension = self._resolve_dimension(ctx, parsed)
        if dimension is None:
            return None

        result = ctx.result
        definition = ctx.model.dimensions.get(parsed.name)
        if definition and definition.via:
            via_object = definition.via
            dimension.via = via_object
            result.required_objects.add(via_object)
            result.via_constraints[dimension.object_name] = via_object

        if coalesce_alias is not None:
            dimension.coalesce_alias = coalesce_alias

        result.dimensions.append(dimension)
        result.required_objects.add(dimension.object_name)
        return dimension

    def _resolve_coalesce_dimension(
        self,
        ctx: _ResolutionContext,
        coalesce: CoalesceDimension,
        seen_aliases: set[str],
    ) -> None:
        """Expand a coalesce group into its constituent resolved dimensions.

        Validation, in order (the first failure records an error and stops):
        the alias is non-empty, has not been used by an earlier coalesce group
        in this query, does not collide with a model dimension or measure name,
        and the group has at least 2 members. After that every member is
        resolved and appended with the alias tag, and a
        ``COALESCE_TYPE_MISMATCH`` error is recorded when the members'
        ``result_type`` values differ.

        Args:
            ctx: Resolution context; errors and resolved dimensions are
                written to it.
            coalesce: The coalesce group (``alias`` plus member strings in
                ``coalesce``).
            seen_aliases: Aliases already used in this query; the alias is
                added once validation passes.
        """

        def reject(code: str, message: str) -> None:
            # Every coalesce problem is reported against the same query path.
            ctx.errors.append(
                SemanticError(code=code, message=message, path="select.dimensions")
            )

        alias = coalesce.alias
        if not alias:
            reject("COALESCE_MISSING_ALIAS", "Coalesce dimension requires a non-empty 'as' alias")
            return
        if alias in seen_aliases:
            reject("DUPLICATE_COALESCE_ALIAS", f"Duplicate coalesce alias '{alias}' in this query")
            return
        if alias in ctx.model.dimensions or alias in ctx.model.measures:
            reject(
                "COALESCE_ALIAS_COLLISION",
                f"Coalesce alias '{alias}' collides with an existing "
                f"model dimension or measure name",
            )
            return
        if len(coalesce.coalesce) < 2:
            reject(
                "COALESCE_TOO_FEW_MEMBERS",
                f"Coalesce '{alias}' requires at least 2 dimensions "
                f"(got {len(coalesce.coalesce)})",
            )
            return
        seen_aliases.add(alias)

        # Resolve each member with the alias tag; verify type compatibility.
        member_types: set[str] = set()
        for member in coalesce.coalesce:
            resolved = self._append_resolved_dimension(ctx, member, coalesce_alias=alias)
            if resolved:
                # Look the definition up by the *parsed* dimension name. The
                # raw member string goes through ``DimensionRef.parse`` during
                # resolution, so keying on it here would miss any member whose
                # written form differs from its model name and silently exempt
                # it from the type check.
                dim_def = ctx.model.dimensions.get(resolved.name)
                if dim_def:
                    member_types.add(dim_def.result_type.value)
        if len(member_types) > 1:
            reject(
                "COALESCE_TYPE_MISMATCH",
                f"Coalesce '{alias}' members have incompatible result types: "
                f"{sorted(member_types)}",
            )

    def _resolve_dimension(
        self, ctx: _ResolutionContext, ref: DimensionRef
    ) -> ResolvedDimension | None:
        """Map a dimension reference onto its data object and physical column.

        Records ``UNKNOWN_DIMENSION`` / ``UNKNOWN_DATA_OBJECT`` on
        ``ctx.errors`` and returns ``None`` when the dimension or its data
        object is missing from the model. If the column is not declared on the
        data object, the column name itself is used as the source column. A
        grain on the reference takes precedence over the dimension's
        ``time_grain``.
        """
        model = ctx.model
        definition = model.dimensions.get(ref.name)
        if definition is None:
            unknown_dimension = SemanticError(
                code="UNKNOWN_DIMENSION",
                message=f"Unknown dimension '{ref.name}'",
                path="select.dimensions",
            )
            ctx.errors.append(unknown_dimension)
            return None

        obj_name, col_name = definition.view, definition.column
        data_object = model.data_objects.get(obj_name)
        if data_object is None:
            unknown_object = SemanticError(
                code="UNKNOWN_DATA_OBJECT",
                message=f"Dimension '{ref.name}' references unknown data object '{obj_name}'",
            )
            ctx.errors.append(unknown_object)
            return None

        column = data_object.columns.get(col_name)
        if column:
            source_col = column.code
        else:
            source_col = col_name

        return ResolvedDimension(
            name=ref.name,
            object_name=obj_name,
            column_name=col_name,
            source_column=source_col,
            grain=ref.grain or definition.time_grain,
        )

    # -- measures & metrics --------------------------------------------------

    def _resolve_measure(self, ctx: _ResolutionContext, name: str) -> ResolvedMeasure | None:
        """Turn a measure or metric name into a ``ResolvedMeasure``.

        ``model.measures`` is consulted first; a miss falls back to
        ``model.metrics`` (handled by ``_resolve_metric``). A name found in
        neither records ``UNKNOWN_MEASURE`` and yields ``None``.

        For a measure with a grain override, the effective grain is computed
        against the dimensions already resolved for the query. Grain entries
        that are not query dimensions record ``GRAIN_NOT_SUBSET``; the resolved
        measure is still returned in that case.
        """
        model = ctx.model
        definition = model.measures.get(name)
        if definition is None:
            metric = model.metrics.get(name)
            if not metric:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_MEASURE",
                        message=f"Unknown measure '{name}'",
                        path="select.measures",
                    )
                )
                return None
            return self._resolve_metric(ctx, name, metric)

        expression = self._build_measure_expr(ctx, definition)

        grain_override = definition.grain
        effective_grain: list[str] | None = None
        if grain_override is not None:
            query_dim_names = [d.name for d in ctx.result.dimensions]
            effective_grain = _resolve_effective_grain(grain_override, query_dim_names)
            if effective_grain is not None:
                # Grain entries the query does not select.
                outside_query = set(effective_grain) - set(query_dim_names)
                if outside_query:
                    bad = sorted(outside_query)
                    ctx.errors.append(
                        SemanticError(
                            code="GRAIN_NOT_SUBSET",
                            message=(
                                f"Measure '{name}' grain {bad} is not a subset of "
                                f"query dimensions {query_dim_names}. "
                                f"This would cause row multiplication."
                            ),
                            path="select.measures",
                        )
                    )

        return ResolvedMeasure(
            name=name,
            aggregation=definition.aggregation,
            expression=expression,
            is_expression=definition.expression is not None,
            total=definition.total,
            grain_override=grain_override,
            effective_grain=effective_grain,
            filter_context=definition.filter_context,
        )

    def _build_measure_expr(self, ctx: _ResolutionContext, measure: Measure) -> Expr:
        """Build the aggregate expression AST for ``measure``.

        Three shapes are handled: engine-delegated ``MEASURE`` aggregations,
        expression-based measures (delegated to ``_expand_expression``), and
        column-based measures, which become ``AGG(col, ...)`` — or ``AGG(1)``
        when no columns are declared — with any measure filters applied.
        """
        # Engine-delegated aggregation (Databricks Metric View): emit
        # ``MEASURE("<label>")`` literally. There is no source column to read;
        # the engine resolves the aggregation by name. Dialect support is
        # enforced downstream by ``_check_aggregation_supported``.
        if measure.aggregation == AggregationType.MEASURE:
            label_ref = ColumnRef(name=measure.label, table=None)
            return FunctionCall(name="MEASURE", args=[label_ref])

        if measure.expression:
            return self._expand_expression(ctx, measure)

        def column_expr(column_ref) -> Expr:
            # Route declared columns through ``make_column_expr`` so a computed
            # (``expression:``) column inlines its template body; otherwise
            # ``count_distinct`` over such a column would emit
            # ``COUNT(DISTINCT "obj"."")`` (zero-length identifier, DB error).
            # Undeclared columns fall back to a plain qualified reference.
            view = column_ref.view or ""
            column = column_ref.column or ""
            data_object = ctx.model.data_objects.get(view)
            if data_object and column in data_object.columns:
                return make_column_expr(ctx.model, view, column)
            return ColumnRef(name=column, table=view)

        args: list[Expr] = [column_expr(ref) for ref in measure.columns or ()]
        if not args:
            args = [Literal.number(1)]

        # COUNT_DISTINCT is expressed as COUNT with the distinct flag set.
        function_name = measure.aggregation.upper()
        distinct = measure.distinct
        if function_name == "COUNT_DISTINCT":
            function_name, distinct = "COUNT", True

        # LISTAGG: attach separator and optional ordering
        separator: str | None = None
        order_by: list[OrderByItem] = []
        if function_name == "LISTAGG":
            separator = "," if measure.delimiter is None else measure.delimiter
            within_group = measure.within_group
            if within_group:
                order_by = [
                    OrderByItem(
                        expr=column_expr(within_group.column),
                        desc=within_group.order.upper() == "DESC",
                    ),
                ]

        aggregate = FunctionCall(
            name=function_name,
            args=args,
            distinct=distinct,
            order_by=order_by,
            separator=separator,
        )
        return self._apply_measure_filters(ctx, measure, aggregate)

    def _expand_expression(self, ctx: _ResolutionContext, measure: Measure) -> Expr:
        """Expand a measure expression with ``{[DataObject].[Column]}`` refs into AST.

        The formula is tokenized and parsed into an inner expression, which is
        then wrapped in the measure's aggregate function (``COUNT_DISTINCT``
        becomes ``COUNT`` with the distinct flag) and passed through the
        measure's filters.
        """
        function_name = measure.aggregation.upper()
        inner = parse_expression(tokenize_measure_expression(measure.expression or "", ctx.model))

        if function_name == "COUNT_DISTINCT":
            function_name, distinct = "COUNT", True
        else:
            distinct = measure.distinct

        aggregate = FunctionCall(name=function_name, args=[inner], distinct=distinct)
        return self._apply_measure_filters(ctx, measure, aggregate)

    @staticmethod
    def _apply_measure_filters(
        ctx: _ResolutionContext, measure: Measure, func: FunctionCall
    ) -> FunctionCall:
        """Wrap aggregate args with CASE WHEN if the measure has filters."""
        if not measure.filters:
            return func
        condition = build_measure_filter_condition(measure.filters, ctx.model, ctx.errors)
        if condition is None:
            return func
        wrapped_args: list[Expr] = [CaseExpr(when_clauses=[(condition, arg)]) for arg in func.args]
        return FunctionCall(
            name=func.name,
            args=wrapped_args,
            distinct=func.distinct,
            order_by=func.order_by,
            separator=func.separator,
        )

    def _resolve_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Dispatch a metric to the resolver for its type.

        Cumulative, period-over-period and window metrics each have a
        dedicated resolver; every other type is resolved as a derived metric.
        """
        specialised = (
            (MetricType.CUMULATIVE, self._resolve_cumulative_metric),
            (MetricType.PERIOD_OVER_PERIOD, self._resolve_pop_metric),
            (MetricType.WINDOW, self._resolve_window_metric),
        )
        for metric_type, resolver in specialised:
            if metric.type == metric_type:
                return resolver(ctx, name, metric)
        return self._resolve_derived_metric(ctx, name, metric)

    def _validate_partition_dimensions(
        self,
        ctx: _ResolutionContext,
        metric_name: str,
        partition_by: list[str],
        path_template: str,
    ) -> bool:
        """Validate every partitionBy entry references a model dimension
        present in the query's SELECT. Returns False (and accumulates errors)
        on any failure. Reachability to the measure source is enforced
        transitively by ``required_objects`` reachability later in resolution.
        """
        if not partition_by:
            return True
        dim_names = {d.name for d in ctx.result.dimensions}
        for dim_name in partition_by:
            if dim_name not in ctx.model.dimensions:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_PARTITION_DIMENSION",
                        message=(
                            f"Metric '{metric_name}' references unknown partition "
                            f"dimension '{dim_name}'"
                        ),
                        path=path_template.format(metric_name),
                    )
                )
                return False
            if dim_name not in dim_names:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_PARTITION_DIMENSION",
                        message=(
                            f"Metric '{metric_name}' requires partitionBy dimension "
                            f"'{dim_name}' to be in the query's selected dimensions"
                        ),
                        path=path_template.format(metric_name),
                    )
                )
                return False
        return True

    def _resolve_window_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Resolve a window metric (rank/lag/lead/ntile/first_value/last_value).

        Checks run in a fixed order and the first failure records an error and
        returns ``None``: ``windowFunction`` is present; the referenced base
        measure (if any) exists in the model; ``timeDimension`` (if given) is a
        model dimension selected by the query, and is present at all for
        LAG/LEAD; NTILE has ``buckets`` >= 2; LAG/LEAD have ``offset`` >= 1;
        all ``partitionBy`` dimensions are valid. A base measure not yet in
        ``ctx.result.metric_components`` is resolved and stored there.
        """

        def fail(code: str, message: str, path: str) -> None:
            ctx.errors.append(SemanticError(code=code, message=message, path=path))

        wf = metric.window_function
        if wf is None:
            fail(
                "INVALID_METRIC",
                f"Window metric '{name}' missing required 'windowFunction'",
                f"metrics.{name}",
            )
            return None

        base_measure_name = metric.measure
        base_aggregation = ""

        # Validate referenced measure (if any) exists
        if base_measure_name is not None:
            base_measure = ctx.model.measures.get(base_measure_name)
            if base_measure is None:
                fail(
                    "UNKNOWN_MEASURE",
                    f"Window metric '{name}' references unknown "
                    f"measure '{base_measure_name}'",
                    f"metrics.{name}.measure",
                )
                return None
            base_aggregation = base_measure.aggregation
            components = ctx.result.metric_components
            if base_measure_name not in components:
                component = self._resolve_measure(ctx, base_measure_name)
                if component:
                    components[base_measure_name] = component

        lag_or_lead = wf in {WindowFunctionKind.LAG, WindowFunctionKind.LEAD}

        # timeDimension is required for LAG/LEAD, optional otherwise (RANK uses measure value)
        time_dimension = metric.time_dimension
        if time_dimension is None:
            if lag_or_lead:
                fail(
                    "INVALID_LAG_LEAD",
                    f"Window metric '{name}' with function '{wf.value}' "
                    f"requires 'timeDimension'",
                    f"metrics.{name}",
                )
                return None
        else:
            if ctx.model.dimensions.get(time_dimension) is None:
                fail(
                    "UNKNOWN_DIMENSION",
                    f"Window metric '{name}' references unknown "
                    f"timeDimension '{time_dimension}'",
                    f"metrics.{name}.timeDimension",
                )
                return None
            selected_names = {d.name for d in ctx.result.dimensions}
            if time_dimension not in selected_names:
                fail(
                    "WINDOW_TIME_DIMENSION_NOT_IN_SELECT",
                    f"Window metric '{name}' requires timeDimension "
                    f"'{time_dimension}' to be in the query's selected dimensions",
                    f"metrics.{name}.timeDimension",
                )
                return None

        if wf == WindowFunctionKind.NTILE and (metric.buckets is None or metric.buckets < 2):
            fail(
                "INVALID_NTILE_BUCKETS",
                f"Window metric '{name}' with function 'ntile' requires 'buckets' >= 2",
                f"metrics.{name}.buckets",
            )
            return None

        if lag_or_lead and (metric.offset is None or metric.offset < 1):
            fail(
                "INVALID_LAG_LEAD",
                f"Window metric '{name}' with function '{wf.value}' "
                f"requires positive 'offset'",
                f"metrics.{name}.offset",
            )
            return None

        partitions_ok = self._validate_partition_dimensions(
            ctx, name, metric.partition_by, "metrics.{}.partitionBy"
        )
        if not partitions_ok:
            return None

        # Without a base measure the expression refers to the metric's own name.
        return ResolvedMeasure(
            name=name,
            aggregation=base_aggregation,
            expression=ColumnRef(name=base_measure_name or name),
            is_expression=True,
            component_measures=[base_measure_name] if base_measure_name else [],
            is_window=True,
            window_function=wf,
            window_base_measure=base_measure_name,
            window_time_dimension=time_dimension,
            window_partition_by=list(metric.partition_by),
            window_offset=metric.offset,
            window_buckets=metric.buckets,
            window_order_direction=metric.order_direction.lower(),
            window_default_value=metric.default_value,
        )

    def _resolve_derived_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Turn a derived metric's formula into a parsed expression.

        Every ``{[Name]}`` placeholder found in the formula is treated as a
        component. Components not yet present in
        ``ctx.result.metric_components`` are resolved through
        ``_resolve_measure`` and stored there when resolution yields a
        truthy result.

        Returns the ``ResolvedMeasure`` wrapping the parsed formula, or
        ``None`` after recording an ``INVALID_METRIC_EXPRESSION`` error when
        tokenizing or parsing raises.
        """
        # A missing expression is handled as an empty formula throughout.
        formula_text = metric.expression or ""
        referenced = re.findall(r"\{\[([^\]]+)\]\}", formula_text)

        for ref in referenced:
            if ref in ctx.result.metric_components:
                continue
            component = self._resolve_measure(ctx, ref)
            if component:
                ctx.result.metric_components[ref] = component

        try:
            ast_root = parse_expression(tokenize_metric_formula(formula_text))
        except Exception as exc:
            # Any tokenizer/parser failure is reported as a semantic error
            # instead of propagating to the caller.
            ctx.errors.append(
                SemanticError(
                    code="INVALID_METRIC_EXPRESSION",
                    message=f"Metric '{name}' has invalid expression: {exc}",
                    path=f"metrics.{name}.expression",
                )
            )
            return None

        return ResolvedMeasure(
            name=name,
            aggregation="",
            expression=ast_root,
            component_measures=referenced,
            is_expression=True,
        )

    def _resolve_cumulative_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Build the resolved form of a cumulative metric.

        Validation runs in a fixed order; the first failing check appends one
        ``SemanticError`` to ``ctx.errors`` and the method returns ``None``:

        1. ``measure`` is set.
        2. ``timeDimension`` is set.
        3. The measure exists in the model.
        4. The time dimension exists in the model.
        5. The time dimension is among the query's resolved dimensions.
        6. ``_validate_partition_dimensions`` accepts ``partition_by``.

        On success the base measure is resolved as a component (unless it is
        already cached) and a ``ResolvedMeasure`` flagged ``is_cumulative``
        is returned.
        """

        def _reject(code: str, message: str, path: str) -> None:
            # Record a single validation failure on the resolution context.
            ctx.errors.append(SemanticError(code=code, message=message, path=path))

        measure_ref = metric.measure
        if measure_ref is None:
            _reject(
                "INVALID_METRIC",
                f"Cumulative metric '{name}' missing required 'measure' field",
                f"metrics.{name}",
            )
            return None

        time_dim = metric.time_dimension
        if time_dim is None:
            _reject(
                "INVALID_METRIC",
                f"Cumulative metric '{name}' missing required 'timeDimension' field",
                f"metrics.{name}",
            )
            return None

        # The referenced measure must be defined in the model.
        base_measure = ctx.model.measures.get(measure_ref)
        if base_measure is None:
            _reject(
                "UNKNOWN_MEASURE",
                f"Cumulative metric '{name}' references unknown measure '{metric.measure}'",
                f"metrics.{name}.measure",
            )
            return None

        # The time dimension must be defined in the model ...
        if ctx.model.dimensions.get(time_dim) is None:
            _reject(
                "UNKNOWN_DIMENSION",
                (
                    f"Cumulative metric '{name}' references unknown "
                    f"timeDimension '{metric.time_dimension}'"
                ),
                f"metrics.{name}.timeDimension",
            )
            return None

        # ... and must also be one of the dimensions the query selects.
        if not any(d.name == time_dim for d in ctx.result.dimensions):
            _reject(
                "CUMULATIVE_TIME_DIMENSION_NOT_IN_SELECT",
                (
                    f"Cumulative metric '{name}' requires timeDimension "
                    f"'{metric.time_dimension}' to be in the query's selected dimensions"
                ),
                f"metrics.{name}.timeDimension",
            )
            return None

        partitions_ok = self._validate_partition_dimensions(
            ctx, name, metric.partition_by, "metrics.{}.partitionBy"
        )
        if not partitions_ok:
            return None

        # Make sure the base measure is available as a resolved component,
        # reusing an earlier resolution when there is one.
        if measure_ref not in ctx.result.metric_components:
            component = self._resolve_measure(ctx, measure_ref)
            if component:
                ctx.result.metric_components[measure_ref] = component

        # The expression is only a placeholder reference to the base measure;
        # the actual window function is built during the cumulative_wrap phase.
        return ResolvedMeasure(
            name=name,
            aggregation=base_measure.aggregation,
            expression=ColumnRef(name=measure_ref),
            is_expression=True,
            component_measures=[measure_ref],
            is_cumulative=True,
            cumulative_measure=measure_ref,
            cumulative_time_dimension=time_dim,
            cumulative_type=metric.cumulative_type,
            cumulative_window=metric.window,
            cumulative_grain_to_date=metric.grain_to_date,
            cumulative_partition_by=list(metric.partition_by),
        )

    def _resolve_pop_metric(
        self, ctx: _ResolutionContext, name: str, metric: Metric
    ) -> ResolvedMeasure | None:
        """Build the resolved form of a period-over-period (PoP) metric.

        Each validation failure appends one ``SemanticError`` to
        ``ctx.errors`` and makes the method return ``None``. Checks, in order:
        ``periodOverPeriod`` present, ``expression`` present, time dimension
        known to the model, time dimension selected by the query, non-zero
        offset, at most one ``{[Measure]}`` reference in the expression, and
        finally that the expression tokenizes and parses.
        """

        def _reject(code: str, message: str, path: str) -> None:
            # Record a single validation failure on the resolution context.
            ctx.errors.append(SemanticError(code=code, message=message, path=path))

        pop = metric.period_over_period
        if pop is None:
            _reject(
                "INVALID_METRIC",
                f"PoP metric '{name}' missing required 'periodOverPeriod' field",
                f"metrics.{name}",
            )
            return None

        formula = metric.expression
        if formula is None:
            _reject(
                "INVALID_METRIC",
                f"PoP metric '{name}' missing required 'expression' field",
                f"metrics.{name}",
            )
            return None

        # The time dimension must be defined in the model ...
        if ctx.model.dimensions.get(pop.time_dimension) is None:
            _reject(
                "POP_UNKNOWN_TIME_DIMENSION",
                (
                    f"Period-over-period metric '{name}' references unknown "
                    f"time dimension '{pop.time_dimension}'"
                ),
                f"metrics.{name}.periodOverPeriod.timeDimension",
            )
            return None

        # ... and must also be one of the dimensions the query selects.
        if not any(d.name == pop.time_dimension for d in ctx.result.dimensions):
            _reject(
                "POP_TIME_DIMENSION_NOT_IN_SELECT",
                (
                    f"Period-over-period metric '{name}' requires time dimension "
                    f"'{pop.time_dimension}' to be in the query's selected dimensions"
                ),
                f"metrics.{name}.periodOverPeriod.timeDimension",
            )
            return None

        if pop.offset == 0:
            _reject(
                "POP_INVALID_OFFSET",
                (
                    f"Period-over-period metric '{name}' has offset=0 "
                    f"(must be non-zero, e.g. -1 for previous period)"
                ),
                f"metrics.{name}.periodOverPeriod.offset",
            )
            return None

        # Same placeholder syntax as derived metrics: {[Measure Name]}.
        component_names = re.findall(r"\{\[([^\]]+)\]\}", formula)

        # The comparison logic handles a single underlying measure only.
        if len(component_names) > 1:
            _reject(
                "POP_MULTI_MEASURE_NOT_SUPPORTED",
                (
                    f"Period-over-period metric '{name}' references multiple measures "
                    f"({', '.join(component_names)}). PoP comparison currently supports "
                    f"only single-measure expressions."
                ),
                f"metrics.{name}.expression",
            )
            return None

        for ref in component_names:
            if ref in ctx.result.metric_components:
                continue
            component = self._resolve_measure(ctx, ref)
            if component:
                ctx.result.metric_components[ref] = component

        try:
            ast_root = parse_expression(tokenize_metric_formula(formula))
        except Exception as exc:
            _reject(
                "INVALID_METRIC_EXPRESSION",
                f"Metric '{name}' has invalid expression: {exc}",
                f"metrics.{name}.expression",
            )
            return None

        # With at most one component, the first (if any) is the PoP base.
        base_ref = component_names[0] if component_names else None

        return ResolvedMeasure(
            name=name,
            aggregation="",
            expression=ast_root,
            component_measures=component_names,
            is_expression=True,
            is_pop=True,
            pop_base_measure=base_ref,
            pop_time_dimension=pop.time_dimension,
            pop_grain=pop.grain,
            pop_offset=pop.offset,
            pop_offset_grain=pop.offset_grain,
            pop_comparison=pop.comparison,
        )

    def _collect_having_measure_refs(self, query: QueryObject, model: SemanticModel) -> list[str]:
        """Collect measure/metric names referenced in any HAVING filter.

        Walks ``query.having`` recursively (each entry is a
        ``QueryFilter`` or a ``QueryFilterGroup``) and returns the
        ordered, de-duplicated list of ``field`` values that name a
        known measure or metric in the model. Order is preserved for
        deterministic resolution; duplicates are dropped on first sight.
        """

        seen: set[str] = set()
        out: list[str] = []

        def _visit(item: QueryFilterItem) -> None:
            if isinstance(item, QueryFilterGroup):
                for child in item.filters:
                    _visit(child)
                return
            field = item.field
            if field in seen:
                return
            if field in model.measures or field in model.metrics:
                seen.add(field)
                out.append(field)

        for entry in query.having:
            _visit(entry)
        return out

    def _get_measure_source_objects(self, ctx: _ResolutionContext, name: str) -> set[str]:
        """Extract all source data objects for a measure or metric."""
        result: set[str] = set()

        measure = ctx.model.measures.get(name)
        if measure:
            for cref in measure.columns:
                if cref.view:
                    result.add(cref.view)
            if measure.expression:
                col_refs = re.findall(r"\{\[([^\]]+)\]\.\[([^\]]+)\]\}", measure.expression)
                for obj_name, _col_name in col_refs:
                    result.add(obj_name)
            for fi in measure.filters:
                collect_measure_filter_objects(fi, result)
            return result

        metric = ctx.model.metrics.get(name)
        if metric:
            if metric.type == MetricType.CUMULATIVE and metric.measure:
                # Cumulative metric: source objects come from the referenced measure
                result.update(self._get_measure_source_objects(ctx, metric.measure))
            elif metric.type == MetricType.WINDOW and metric.measure:
                # Window metric: source objects come from the referenced measure
                result.update(self._get_measure_source_objects(ctx, metric.measure))
            elif metric.expression:
                # Derived or PoP metric: parse expression for measure references
                measure_refs = re.findall(r"\{\[([^\]]+)\]\}", metric.expression)
                for ref_name in measure_refs:
                    result.update(self._get_measure_source_objects(ctx, ref_name))

        return result

    # -- base object selection -----------------------------------------------

    def _select_base_object(self, ctx: _ResolutionContext) -> str:
        """Select the base (fact) object — prefer measure source objects with most joins."""
        if ctx.result.measure_source_objects:
            best = ""
            best_joins = -1
            for obj_name in sorted(ctx.result.measure_source_objects):
                obj = ctx.model.data_objects.get(obj_name)
                n = len(obj.joins) if obj else 0
                if n > best_joins:
                    best = obj_name
                    best_joins = n
            if best:
                return best

        # Dimension-only: use JoinGraph to find the deepest ancestor
        # (possibly an intermediate fact/bridge table) that can reach
        # all required dimension objects via directed join paths.
        if len(ctx.result.required_objects) > 1:
            graph = JoinGraph(ctx.model, use_path_names=ctx.result.use_path_names or None)
            root = graph.find_common_root(ctx.result.required_objects)
            if root:
                return root

        for obj_name in sorted(ctx.result.required_objects):
            obj = ctx.model.data_objects.get(obj_name)
            if obj and obj.joins:
                return obj_name

        if ctx.result.required_objects:
            return next(iter(sorted(ctx.result.required_objects)))
        if ctx.model.data_objects:
            return next(iter(ctx.model.data_objects))
        return ""

    # -- usePathNames validation ---------------------------------------------

    def _validate_use_path_names(
        self, ctx: _ResolutionContext, use_path_names: list[UsePathName]
    ) -> None:
        """Validate usePathNames references."""
        for upn in use_path_names:
            if upn.source not in ctx.model.data_objects:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=f"usePathNames references unknown data object '{upn.source}'",
                        path="usePathNames",
                    )
                )
                continue
            if upn.target not in ctx.model.data_objects:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=f"usePathNames references unknown data object '{upn.target}'",
                        path="usePathNames",
                    )
                )
                continue
            source_obj = ctx.model.data_objects[upn.source]
            found = any(
                j.join_to == upn.target and j.secondary and j.path_name == upn.path_name
                for j in source_obj.joins
            )
            if not found:
                ctx.errors.append(
                    SemanticError(
                        code="UNKNOWN_PATH_NAME",
                        message=(
                            f"No secondary join with pathName '{upn.path_name}' "
                            f"from '{upn.source}' to '{upn.target}'"
                        ),
                        path="usePathNames",
                    )
                )

    # -- static model filters ------------------------------------------------

    def _resolve_static_filter(
        self, ctx: _ResolutionContext, mf: ModelFilter
    ) -> ResolvedFilter | None:
        """Resolve a static model filter to a physical WHERE expression.

        Returns ``None`` without recording an error when the filter's data
        object or column is not defined in the model, or when
        ``_resolve_filter_object`` reports that the data object cannot be
        joined into the current query — such filters are simply irrelevant
        to it. ``None`` is also returned when ``build_filter_expr`` yields no
        expression.

        The operand passed to the filter builder is ``mf.value`` whenever it
        is set (including falsy scalars such as ``0``, ``False`` or ``""``);
        only when it is ``None`` does a non-empty ``mf.values`` take its
        place.
        """
        obj = ctx.model.data_objects.get(mf.data_object)
        if obj is None:
            return None

        col = obj.columns.get(mf.column)
        if col is None:
            return None

        if not self._resolve_filter_object(ctx, mf.data_object, "filters", mf.column):
            return None

        # Route through ``make_column_expr`` so a filter on a computed column
        # inlines the expression — without this a filter on a boolean
        # ``expression:`` column emitted ``(1 = FALSE)`` because the empty
        # ``code:`` was collapsed by the CAST.
        col_expr: Expr = make_column_expr(ctx.model, mf.data_object, mf.column)

        # Test for None explicitly: a plain ``value or values`` chain would
        # throw away legitimate falsy operands like 0 or False.
        operand = mf.value if mf.value is not None else (mf.values or None)
        qf = QueryFilter(field=mf.column, op=mf.operator, value=operand)
        filter_expr = build_filter_expr(col_expr, qf, ctx.errors)
        if filter_expr is None:
            return None
        return ResolvedFilter(
            expression=filter_expr,
            is_aggregate=False,
            referenced_fields=frozenset({mf.column}),
        )

    # -- filters -------------------------------------------------------------

    def _resolve_filter_object(
        self,
        ctx: _ResolutionContext,
        obj_name: str,
        filter_path: str,
        _field_label: str,
    ) -> bool:
        """Ensure *obj_name* is joined; auto-extend if reachable.

        Silently skips filters on unreachable data objects — they are
        irrelevant to the current query.
        """
        if obj_name in ctx.joined_objects:
            return True
        if ctx.graph is None:
            return False
        reachable = any(obj_name in ctx.graph.descendants(j) for j in list(ctx.joined_objects))
        if not reachable:
            return False
        new_steps = ctx.graph.find_join_path(ctx.joined_objects, {obj_name})
        for step in new_steps:
            if step.to_object not in ctx.joined_objects:
                ctx.result.join_steps.append(step)
                ctx.joined_objects.add(step.to_object)
                ctx.result.required_objects.add(step.to_object)
        return True

    def _resolve_filter_item(
        self, ctx: _ResolutionContext, item: QueryFilterItem, *, is_having: bool
    ) -> ResolvedFilter | None:
        """Dispatch a filter item to the leaf or group resolver.

        A ``QueryFilter`` goes to ``_resolve_filter``; anything else is
        handled as a group by ``_resolve_filter_group``.
        """
        if not isinstance(item, QueryFilter):
            return self._resolve_filter_group(ctx, item, is_having=is_having)
        return self._resolve_filter(ctx, item, is_having=is_having)

    def _resolve_filter_group(
        self, ctx: _ResolutionContext, group: QueryFilterGroup, *, is_having: bool
    ) -> ResolvedFilter | None:
        """Resolve a filter group recursively, combining with AND/OR."""
        child_exprs: list[Expr] = []
        all_fields: set[str] = set()
        for child in group.filters:
            resolved = self._resolve_filter_item(ctx, child, is_having=is_having)
            if resolved:
                child_exprs.append(resolved.expression)
                all_fields.update(resolved.referenced_fields)

        if not child_exprs:
            return None

        # Combine children with the group's logic
        op = "AND" if group.logic == "and" else "OR"
        combined: Expr = child_exprs[0]
        for expr in child_exprs[1:]:
            combined = BinaryOp(left=combined, op=op, right=expr)

        # Optionally negate
        if group.negated:
            combined = UnaryOp(op="NOT", operand=combined)

        return ResolvedFilter(
            expression=combined,
            is_aggregate=is_having,
            referenced_fields=frozenset(all_fields),
        )

    def _resolve_filter(
        self, ctx: _ResolutionContext, qf: QueryFilter, *, is_having: bool
    ) -> ResolvedFilter | None:
        """Resolve a single query filter to a physical expression.

        ``qf.field`` is interpreted, in this order, as:

        1. A dimension name (e.g. ``"Order Priority"``).
        2. For HAVING filters only, a measure or metric name (e.g. ``"Revenue"``).
        3. A qualified column ``"DataObject.Column"`` (e.g. ``"Orders.Order Priority"``).

        Anything else records ``UNKNOWN_FILTER_FIELD``. When the referenced
        data object is reachable but not yet joined, the join path is
        extended through ``_resolve_filter_object``; when it is not
        reachable the filter is dropped and ``None`` is returned without an
        error.
        """
        filter_path = "having" if is_having else "where"
        field = qf.field
        model = ctx.model

        def _fail(code: str, message: str) -> None:
            # All errors raised here share the same path (where/having).
            ctx.errors.append(SemanticError(code=code, message=message, path=filter_path))

        col_expr: Expr
        subject_object: str | None = None

        dim = model.dimensions.get(field)
        if dim:
            # Case 1: the field names a dimension.
            dim_object = dim.view
            if not self._resolve_filter_object(ctx, dim_object, filter_path, field):
                return None
            col_expr = make_column_expr(model, dim_object, dim.column)
            subject_object = dim_object
        elif is_having and (field in model.measures or field in model.metrics):
            # Case 2: HAVING on a measure/metric, referenced by bare name.
            col_expr = ColumnRef(name=field)
        elif "." in field:
            # Case 3: "DataObject.Column", split on the first dot only.
            raw_object, raw_column = field.split(".", 1)
            obj_name = raw_object.strip()
            col_name = raw_column.strip()

            obj = model.data_objects.get(obj_name)
            if obj is None:
                _fail(
                    "UNKNOWN_FILTER_FIELD",
                    (f"Unknown data object '{obj_name}' in filter field '{qf.field}'"),
                )
                return None
            if col_name not in obj.columns:
                _fail(
                    "UNKNOWN_FILTER_FIELD",
                    (
                        f"Unknown column '{col_name}' in data object "
                        f"'{obj_name}' for filter field '{qf.field}'"
                    ),
                )
                return None
            if not self._resolve_filter_object(ctx, obj_name, filter_path, field):
                return None
            col_expr = make_column_expr(model, obj_name, col_name)
            subject_object = obj_name
        else:
            _fail("UNKNOWN_FILTER_FIELD", f"Unknown filter field '{qf.field}'")
            return None

        if qf.op not in (FilterOperator.EXISTS, FilterOperator.NONEXISTS):
            filter_expr = build_filter_expr(col_expr, qf, ctx.errors)
        elif is_having:
            # exists/nonexists are rejected in HAVING entirely (v2.7): the
            # correlation predicate references row-level columns of the
            # subject data object, but HAVING is evaluated after GROUP BY —
            # the raw subject column is no longer in scope, producing invalid
            # SQL on every dialect. Measure-level EXISTS (the proper
            # HAVING-equivalent) is deferred to ``MeasureFilter.subquery`` in
            # a future release.
            _fail(
                "INVALID_FILTER_OPERATOR",
                (
                    f"'{qf.op}' is only valid in 'where' — HAVING is "
                    "evaluated after GROUP BY, where the row-level "
                    "correlation predicate is out of scope. Move the "
                    "filter to 'where', or use a precomputed boolean "
                    "column on the data object."
                ),
            )
            return None
        elif subject_object is None:
            _fail(
                "INVALID_FILTER_OPERATOR",
                (
                    f"'{qf.op}' requires a dimension or qualified column "
                    "as the subject — measure/metric references are not "
                    "valid subjects for a correlated subquery."
                ),
            )
            return None
        else:
            # The correlated subquery needs the model, the subject object and
            # a table qualifier; fall back to the object's own qualified code.
            qt = ctx.qualify_table or (lambda obj: obj.qualified_code)
            filter_expr = build_exists_filter_expr(qf, model, subject_object, qt, ctx.errors)

        if filter_expr is None:
            return None
        return ResolvedFilter(
            expression=filter_expr,
            is_aggregate=is_having,
            referenced_fields=frozenset({field}),
        )

    # -- order by ------------------------------------------------------------

    def _resolve_order_by_field(
        self, ctx: _ResolutionContext, field_name: str, select_count: int
    ) -> Expr | None:
        """Resolve an order-by field to its expression.

        *field_name* is matched, in order, against: coalesce aliases, the
        resolved dimensions, the resolved measures, raw-mode field aliases,
        and finally a 1-based SELECT position written as decimal digits.

        Returns the expression to order by, or ``None`` after appending
        ``INVALID_ORDER_BY_POSITION`` (position outside
        ``1..select_count``) or ``UNKNOWN_ORDER_BY_FIELD`` (no match at all)
        to ``ctx.errors``.
        """
        # Coalesce alias: outer SELECT exposes it as a bare alias column,
        # so a table-less ColumnRef is the right form for both star and CFL.
        if field_name in ctx.result.coalesce_aliases:
            return ColumnRef(name=field_name)

        for dim in ctx.result.dimensions:
            if dim.name == field_name:
                # Use make_column_expr so computed columns (which have empty
                # ``code``) inline their expression instead of producing an
                # empty column ref like ``"Orders"."" ``.
                return make_column_expr(ctx.model, dim.object_name, dim.column_name)

        for meas in ctx.result.measures:
            if meas.name == field_name:
                # Window / cumulative / period-over-period metrics are
                # exposed by the outer SELECT as a bare alias after their
                # wrapper CTE runs — ordering by ``meas.expression`` here
                # would point ORDER BY at the *base measure's* inner
                # aggregate (the lag-input, the cumulative-input), not at
                # the windowed output the user asked for. Same pattern as
                # coalesce_aliases above: emit a table-less ColumnRef so
                # both star and CFL outer SELECTs bind it correctly.
                if meas.is_window or meas.is_cumulative or meas.is_pop:
                    return ColumnRef(name=meas.name)
                return meas.expression

        # Raw mode: order by the field's "DataObject.Column" alias.
        for f in ctx.result.fields:
            if f.alias == field_name:
                return make_column_expr(ctx.model, f.object_name, f.column_name)

        # Positional ORDER BY. ``isdecimal`` is used rather than ``isdigit``
        # because ``isdigit`` also accepts characters such as superscript
        # digits ("²") that ``int()`` cannot parse, which would raise a
        # ValueError here instead of reporting an unknown field below.
        if field_name.isdecimal():
            pos = int(field_name)
            if 1 <= pos <= select_count:
                return Literal.number(pos)
            ctx.errors.append(
                SemanticError(
                    code="INVALID_ORDER_BY_POSITION",
                    message=(
                        f"ORDER BY position {pos} is out of range "
                        f"(SELECT has {select_count} columns)"
                    ),
                    path="order_by",
                )
            )
            return None

        ctx.errors.append(
            SemanticError(
                code="UNKNOWN_ORDER_BY_FIELD",
                message=(
                    f"ORDER BY field '{field_name}' is not a dimension "
                    f"or measure in the query's SELECT"
                ),
                path="order_by",
            )
        )
        return None

resolve(query, model, qualify_table=None)

Source code in src/orionbelt/compiler/resolution.py
def resolve(
    self,
    query: QueryObject,
    model: SemanticModel,
    qualify_table: Callable[[DataObject], str] | None = None,
) -> ResolvedQuery:
    """Resolve ``query`` against ``model`` into a ``ResolvedQuery``.

    The method runs in phases, all of which write into one
    ``_ResolutionContext``: SELECT resolution (raw fields, or dimensions
    plus measures, plus measures referenced only by HAVING), base-object
    selection, multi-fact (CFL) detection, ``dimensionsExclude``
    validation, join-path resolution, static and query-time filters,
    ORDER BY resolution, and auto-ordering.

    Problems detected in this method are appended to ``ctx.errors``
    instead of being raised on the spot, so a single failure reports
    every problem that was collected.

    Args:
        query: The query to resolve.
        model: The semantic model the query refers to.
        qualify_table: Optional callable stored on the resolution
            context; this method does not call it directly.

    Returns:
        The populated ``ResolvedQuery`` (``ctx.result``).

    Raises:
        ResolutionError: If ``ctx.errors`` is non-empty once all phases
            have run; it is constructed with the full error list.
    """
    ctx = _ResolutionContext(
        model=model,
        result=ResolvedQuery(
            limit=query.limit,
            offset=query.offset,
            use_path_names=list(query.use_path_names),
            is_raw=query.select.is_raw,
            distinct=query.select.distinct,
            grouping=query.grouping,
        ),
        qualify_table=qualify_table,
    )

    # Build global column lookup: col_name → (object_name, source_column)
    # The lookup is keyed by column name only, so when two data objects
    # declare the same column name the object iterated last wins.
    for obj_name, obj in model.data_objects.items():
        for col_name, col_obj in obj.columns.items():
            ctx.global_columns[col_name] = (obj_name, col_obj.code)

    if query.select.is_raw:
        # Raw mode: project physical columns, no aggregation.
        for ref in query.select.fields:
            self._resolve_raw_field(ctx, ref)
    else:
        # Aggregate mode (default).
        # 1. Resolve dimensions (string or coalesce group).
        # Coalesce groups expand into their constituent dimensions, each
        # tagged with the same coalesce_alias so the CFL outer wrapper can
        # emit COALESCE(d1, d2, ...) AS <alias>.
        for dim_entry in query.select.dimensions:
            if isinstance(dim_entry, CoalesceDimension):
                self._resolve_coalesce_dimension(ctx, dim_entry, ctx.result.coalesce_aliases)
            else:
                self._append_resolved_dimension(ctx, dim_entry)

        # 2. Resolve measures and track their source objects
        for measure_name in query.select.measures:
            resolved_meas = self._resolve_measure(ctx, measure_name)
            if resolved_meas:
                ctx.result.measures.append(resolved_meas)
                source_objs = self._get_measure_source_objects(ctx, measure_name)
                ctx.result.measure_source_objects.update(source_objs)
                ctx.result.required_objects.update(source_objs)

        # 2.5. Auto-include measures referenced by HAVING but not by SELECT.
        # Without this, codegen emits a HAVING clause that references an
        # alias for a column the SELECT doesn't project — every database
        # rejects the SQL with a "must appear in GROUP BY" binder error.
        # Routing this through the regular measure-resolution path also
        # updates ``measure_source_objects`` so the multi-fact CFL trigger
        # below sees the HAVING-only measure's source.
        existing_measure_names = {m.name for m in ctx.result.measures}
        for ref in self._collect_having_measure_refs(query, model):
            if ref in existing_measure_names:
                continue
            resolved_meas = self._resolve_measure(ctx, ref)
            if resolved_meas is None:
                continue
            ctx.result.measures.append(resolved_meas)
            # Remember which measures were added only for HAVING.
            ctx.result.having_only_measures.add(ref)
            existing_measure_names.add(ref)
            source_objs = self._get_measure_source_objects(ctx, ref)
            ctx.result.measure_source_objects.update(source_objs)
            ctx.result.required_objects.update(source_objs)

    # 3. Determine base object (the one with most joins / most measures)
    ctx.result.base_object = self._select_base_object(ctx)
    if ctx.result.base_object:
        ctx.result.required_objects.add(ctx.result.base_object)

    # Detect multi-fact: CFL is needed only when measure source objects
    # span multiple independent fact tables.
    if len(ctx.result.measure_source_objects) > 1:
        graph = JoinGraph(model, use_path_names=query.use_path_names or None)
        reachable = graph.descendants(ctx.result.base_object)
        unreachable = ctx.result.measure_source_objects - reachable - {ctx.result.base_object}
        if unreachable:
            ctx.result.requires_cfl = True

    # Dimension-only queries: when dimensions span independent branches,
    # join through intermediate bridge/fact tables (no CFL needed).
    # Add intermediate tables from the join steps to required_objects
    # so the star schema planner includes them.
    if not ctx.result.measure_source_objects and ctx.result.dimensions:
        dim_objects = {d.object_name for d in ctx.result.dimensions}
        # Only look for a path when some dimension lives outside the base.
        if not dim_objects <= {ctx.result.base_object}:
            graph = JoinGraph(model, use_path_names=query.use_path_names or None)
            steps = graph.find_join_path(
                {ctx.result.base_object},
                dim_objects,
                via_constraints=ctx.result.via_constraints or None,
            )
            for step in steps:
                ctx.result.required_objects.add(step.from_object)
                ctx.result.required_objects.add(step.to_object)

    # Raw mode: detect multi-fact (fields span objects unreachable from
    # the base via directed joins). The pipeline rejects this case for
    # now — raw CFL is a planned follow-up.
    if ctx.result.is_raw and ctx.result.base_object:
        field_objects = {f.object_name for f in ctx.result.fields}
        if len(field_objects) > 1:
            graph = JoinGraph(model, use_path_names=query.use_path_names or None)
            reachable = graph.descendants(ctx.result.base_object)
            unreachable = field_objects - reachable - {ctx.result.base_object}
            if unreachable:
                ctx.result.requires_cfl = True

    # Validate dimensionsExclude constraints
    # The measures check takes precedence: the dimension-count check only
    # runs when the query selects no measures.
    if query.dimensions_exclude:
        if query.select.measures:
            ctx.errors.append(
                SemanticError(
                    code="DIMENSIONS_EXCLUDE_WITH_MEASURES",
                    message="dimensionsExclude cannot be combined with measures",
                    path="select",
                )
            )
        elif len(ctx.result.dimensions) < 2:
            ctx.errors.append(
                SemanticError(
                    code="DIMENSIONS_EXCLUDE_INSUFFICIENT",
                    message="dimensionsExclude requires at least 2 dimensions",
                    path="select.dimensions",
                )
            )
        else:
            ctx.result.dimensions_exclude = True

    # 4. Validate usePathNames before building join graph
    # NOTE(review): JoinGraph instances are already constructed with
    # ``query.use_path_names`` in the multi-fact / dimension-only / raw
    # checks above, i.e. before this validation runs — confirm that
    # building a graph from unvalidated path names is safe.
    self._validate_use_path_names(ctx, query.use_path_names)

    # 5. Resolve join paths
    ctx.graph = JoinGraph(model, use_path_names=query.use_path_names or None)
    if ctx.result.base_object and len(ctx.result.required_objects) > 1:
        ctx.result.join_steps = ctx.graph.find_join_path(
            {ctx.result.base_object},
            ctx.result.required_objects,
            via_constraints=ctx.result.via_constraints or None,
        )

    # Build set of all objects present in the query's join graph
    if ctx.result.base_object:
        ctx.joined_objects.add(ctx.result.base_object)
    for step in ctx.result.join_steps:
        ctx.joined_objects.add(step.to_object)

    # Detect required objects that the star-schema planner cannot reach.
    # Many-to-one joins are forward-only (reverse traversal would inflate
    # the base table), so a required object that's only reachable via a
    # reverse m-to-1 hop is unreachable.  Raise a clear error rather than
    # silently producing wrong SQL.  CFL legs are validated separately.
    if ctx.result.base_object and not ctx.result.requires_cfl:
        unreachable = ctx.result.required_objects - ctx.joined_objects
        # Sorted so the reported error order is deterministic.
        for unreachable_name in sorted(unreachable):
            ctx.errors.append(
                SemanticError(
                    code="UNREACHABLE_REQUIRED_OBJECT",
                    message=(
                        f"Data object '{unreachable_name}' is required by the query but "
                        f"cannot be reached from base '{ctx.result.base_object}' via "
                        f"directed joins. Many-to-one joins are forward-only; reverse "
                        f"traversal would inflate row counts. Add an explicit join from "
                        f"'{ctx.result.base_object}' (or an intermediate object) to "
                        f"'{unreachable_name}', or split the query so each fact is "
                        f"queried independently."
                    ),
                    path="select",
                )
            )

    # 5b. Inject static model filters — always applied as WHERE conditions
    static_exprs: list[Expr] = []
    for mf in model.filters:
        static_filter = self._resolve_static_filter(ctx, mf)
        if static_filter:
            ctx.result.where_filters.append(static_filter)
            static_exprs.append(static_filter.expression)

    # 6. Classify filters — skip query-time duplicates of static filters
    # Duplicate detection relies on expression equality (``not in`` on a
    # list), so only WHERE filters equal to a static expression are dropped.
    for qfi in query.where:
        resolved_filter = self._resolve_filter_item(ctx, qfi, is_having=False)
        if resolved_filter and resolved_filter.expression not in static_exprs:
            ctx.result.where_filters.append(resolved_filter)

    for qfi in query.having:
        resolved_filter = self._resolve_filter_item(ctx, qfi, is_having=True)
        if resolved_filter:
            ctx.result.having_filters.append(resolved_filter)

    # 7. Resolve order by — must reference a dimension or measure in SELECT
    # NOTE(review): select_count counts dimensions and measures only. It
    # therefore includes HAVING-only measures appended in step 2.5, and it
    # does not count ``ctx.result.fields`` — presumably 0 in raw mode, which
    # would make every positional ORDER BY out of range there. Confirm both
    # are intended.
    select_count = len(ctx.result.dimensions) + len(ctx.result.measures)
    for ob in query.order_by:
        expr = self._resolve_order_by_field(ctx, ob.field, select_count)
        if expr:
            # Stored as (expression, is_descending, nulls_position).
            ctx.result.order_by_exprs.append((expr, ob.direction == "desc", ob.nulls))

    # 8. ROLLUP / CUBE: backfill NULLS FIRST on any explicit ORDER BY entry
    # that didn't specify a NULLs position. Subtotal and grand-total rows
    # carry NULLs in the rolled-up group-by columns, and BI tools expect
    # those totals at the top of the result — not interleaved with details.
    if ctx.result.grouping is not None and ctx.result.order_by_exprs:
        ctx.result.order_by_exprs = [
            (expr, desc, NullsPosition.FIRST if nulls is None else nulls)
            for expr, desc, nulls in ctx.result.order_by_exprs
        ]

    # 9. Auto-order — when no explicit ORDER BY, append ORDER BY over all
    # SELECT dimensions (or raw fields) under two conditions:
    #   (a) LIMIT is set: cache hashes on compiled SQL; without ORDER BY
    #       ``LIMIT N`` returns any N rows, freezing one arbitrary slice.
    #   (b) ROLLUP / CUBE: subtotal layout is otherwise unpredictable.
    # ROLLUP / CUBE defaults to NULLS FIRST (totals at the top).
    # Aggregate-only queries (no dims, no fields) are already single-row
    # deterministic — skip.
    needs_auto_order = not ctx.result.order_by_exprs and (
        ctx.result.limit is not None or ctx.result.grouping is not None
    )
    if needs_auto_order:
        nulls_default = NullsPosition.FIRST if ctx.result.grouping is not None else None
        # Auto-order entries are table-less alias references, ascending.
        if ctx.result.is_raw and ctx.result.fields:
            for f in ctx.result.fields:
                ctx.result.order_by_exprs.append(
                    (ColumnRef(name=f.alias), False, nulls_default)
                )
        elif ctx.result.dimensions:
            for dim in ctx.result.dimensions:
                ctx.result.order_by_exprs.append(
                    (ColumnRef(name=dim.name), False, nulls_default)
                )

    # Report every collected problem at once.
    if ctx.errors:
        raise ResolutionError(ctx.errors)

    return ctx.result

Star Schema Planner

orionbelt.compiler.star.StarSchemaPlanner

Plans star-schema queries: single fact base with dimension joins.

Source code in src/orionbelt/compiler/star.py
class StarSchemaPlanner:
    """Plans star-schema queries: single fact base with dimension joins."""

    def plan(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        qualify_table: Callable[[DataObject], str] | None = None,
        dialect: Dialect | None = None,
    ) -> QueryPlan:
        """Assemble the query AST for a resolved single-fact query.

        Clauses are fed to the builder in this order: dimension and
        measure SELECT items, FROM, JOINs, WHERE, GROUP BY, GROUPING()
        flag columns, HAVING, ORDER BY, LIMIT / OFFSET.

        Args:
            resolved: Output of the resolution phase.
            model: Semantic model providing data objects, measures,
                metrics and settings.
            qualify_table: Optional override producing the table name for
                a data object; ``obj.qualified_code`` is used otherwise.
            dialect: Optional dialect. Time-grain rendering and type casts
                are applied only when one is supplied.

        Returns:
            A ``QueryPlan`` wrapping the built AST. When the base object is
            not present in ``model.data_objects`` the plan wraps whatever
            the untouched builder produces.
        """
        query = QueryBuilder()
        join_graph = JoinGraph(model, use_path_names=resolved.use_path_names or None)

        fact = model.data_objects.get(resolved.base_object)
        if not fact:
            return QueryPlan(ast=query.build())

        def table_name(obj: DataObject) -> str:
            if qualify_table:
                return qualify_table(obj)
            return obj.qualified_code

        fact_alias = resolved.base_object

        # SELECT: one aliased column per dimension, truncated to its time
        # grain when both a grain and a dialect are available.
        for dim in resolved.dimensions:
            plain_col: Expr = make_column_expr(model, dim.object_name, dim.column_name)
            shown_col: Expr = (
                dialect.render_time_grain(plain_col, dim.grain)
                if dim.grain and dialect
                else plain_col
            )
            query.select(AliasedExpr(expr=shown_col, alias=dim.name))

        # Dimensions that receive a GROUPING() flag column (rollup/cube only).
        flagged_aliases: list[str] = (
            [dim.name for dim in resolved.dimensions] if resolved.grouping is not None else []
        )

        # SELECT: measures. Metrics have their component references
        # substituted first; either kind is cast when a type resolves.
        model_settings = model.settings
        projected_measures: dict[str, Expr] = {}
        for measure in resolved.measures:
            if measure.component_measures:
                projected: Expr = _substitute_measure_refs(
                    measure.expression, resolved.metric_components
                )
                metric_def = model.metrics.get(measure.name)
                if metric_def and dialect:
                    target_type = resolve_metric_data_type(metric_def, model_settings)
                    if target_type:
                        projected = dialect.cast_to_obml_type(projected, target_type)
            else:
                projected = measure.expression
                measure_def = model.measures.get(measure.name)
                if measure_def and dialect:
                    target_type = resolve_measure_data_type(measure_def, model_settings)
                    if target_type:
                        projected = dialect.cast_to_obml_type(projected, target_type)
            query.select(AliasedExpr(expr=projected, alias=measure.name))
            projected_measures[measure.name] = projected

        # FROM: the base fact table, aliased by its object name.
        query.from_(table_name(fact), alias=fact_alias)

        # JOINs: each step contributes whichever endpoint is still missing.
        already_joined = {fact_alias}
        for step in resolved.join_steps:
            endpoints = (step.to_object, step.from_object)
            target = next((name for name in endpoints if name not in already_joined), None)
            if target is None:
                continue  # both endpoints are already part of the query
            target_obj = model.data_objects.get(target)
            if not target_obj:
                continue
            condition = join_graph.build_join_condition(step)
            query.join(
                table=table_name(target_obj),
                on=condition,
                join_type=step.join_type,
                alias=target,
            )
            already_joined.add(target)

        # WHERE
        for where_filter in resolved.where_filters:
            query.where(where_filter.expression)

        # GROUP BY every dimension (time-grained where applicable). The key
        # expression is remembered per alias because GROUPING() must be
        # given the group-key expression itself — Postgres rejects
        # GROUPING(<alias>) with "column does not exist".
        key_by_alias: dict[str, Expr] = {}
        for dim in resolved.dimensions:
            group_key: Expr = make_column_expr(model, dim.object_name, dim.column_name)
            if dim.grain and dialect:
                group_key = dialect.render_time_grain(group_key, dim.grain)
            query.group_by(group_key)
            key_by_alias[dim.name] = group_key

        # Grouping modifier (rollup/cube) plus one GROUPING() flag per dimension.
        if resolved.grouping is not None and flagged_aliases:
            query.grouping(resolved.grouping.value)
            for dim_alias in flagged_aliases:
                grouping_arg = key_by_alias.get(dim_alias)
                if not grouping_arg:
                    grouping_arg = ColumnRef(name=dim_alias)
                flag = FunctionCall(name="GROUPING", args=[grouping_arg])
                query.select(AliasedExpr(expr=flag, alias=_grouping_flag_alias(dim_alias)))

        # HAVING: measure aliases are expanded to the projected (possibly
        # CAST'd) aggregate expressions.
        for having_filter in resolved.having_filters:
            query.having(_expand_measure_refs(having_filter.expression, projected_measures))

        # ORDER BY: a reference to a time-grained dimension's source column
        # is swapped for that dimension's SELECT alias.
        alias_by_grained_col: dict[tuple[str, str | None], str] = {}
        for dim in resolved.dimensions:
            if dim.grain:
                alias_by_grained_col[(dim.source_column, dim.object_name)] = dim.name
        for order_expr, descending, nulls in resolved.order_by_exprs:
            if isinstance(order_expr, ColumnRef):
                lookup_key = (order_expr.name, order_expr.table)
                if lookup_key in alias_by_grained_col:
                    order_expr = ColumnRef(name=alias_by_grained_col[lookup_key])
            query.order_by(order_expr, desc=descending, nulls_last=_nulls_last(nulls))

        # LIMIT / OFFSET
        if resolved.limit is not None:
            query.limit(resolved.limit)
        if resolved.offset is not None:
            query.offset(resolved.offset)

        return QueryPlan(ast=query.build())

plan(resolved, model, qualify_table=None, dialect=None)

Source code in src/orionbelt/compiler/star.py
def plan(
    self,
    resolved: ResolvedQuery,
    model: SemanticModel,
    qualify_table: Callable[[DataObject], str] | None = None,
    dialect: Dialect | None = None,
) -> QueryPlan:
    """Assemble the query AST for a resolved single-fact query.

    Clauses are fed to the builder in this order: dimension and measure
    SELECT items, FROM, JOINs, WHERE, GROUP BY, GROUPING() flag columns,
    HAVING, ORDER BY, LIMIT / OFFSET.

    Args:
        resolved: Output of the resolution phase.
        model: Semantic model providing data objects, measures, metrics
            and settings.
        qualify_table: Optional override producing the table name for a
            data object; ``obj.qualified_code`` is used otherwise.
        dialect: Optional dialect. Time-grain rendering and type casts are
            applied only when one is supplied.

    Returns:
        A ``QueryPlan`` wrapping the built AST. When the base object is not
        present in ``model.data_objects`` the plan wraps whatever the
        untouched builder produces.
    """
    query = QueryBuilder()
    join_graph = JoinGraph(model, use_path_names=resolved.use_path_names or None)

    fact = model.data_objects.get(resolved.base_object)
    if not fact:
        return QueryPlan(ast=query.build())

    def table_name(obj: DataObject) -> str:
        if qualify_table:
            return qualify_table(obj)
        return obj.qualified_code

    fact_alias = resolved.base_object

    # SELECT: one aliased column per dimension, truncated to its time
    # grain when both a grain and a dialect are available.
    for dim in resolved.dimensions:
        plain_col: Expr = make_column_expr(model, dim.object_name, dim.column_name)
        shown_col: Expr = (
            dialect.render_time_grain(plain_col, dim.grain)
            if dim.grain and dialect
            else plain_col
        )
        query.select(AliasedExpr(expr=shown_col, alias=dim.name))

    # Dimensions that receive a GROUPING() flag column (rollup/cube only).
    flagged_aliases: list[str] = (
        [dim.name for dim in resolved.dimensions] if resolved.grouping is not None else []
    )

    # SELECT: measures. Metrics have their component references
    # substituted first; either kind is cast when a type resolves.
    model_settings = model.settings
    projected_measures: dict[str, Expr] = {}
    for measure in resolved.measures:
        if measure.component_measures:
            projected: Expr = _substitute_measure_refs(
                measure.expression, resolved.metric_components
            )
            metric_def = model.metrics.get(measure.name)
            if metric_def and dialect:
                target_type = resolve_metric_data_type(metric_def, model_settings)
                if target_type:
                    projected = dialect.cast_to_obml_type(projected, target_type)
        else:
            projected = measure.expression
            measure_def = model.measures.get(measure.name)
            if measure_def and dialect:
                target_type = resolve_measure_data_type(measure_def, model_settings)
                if target_type:
                    projected = dialect.cast_to_obml_type(projected, target_type)
        query.select(AliasedExpr(expr=projected, alias=measure.name))
        projected_measures[measure.name] = projected

    # FROM: the base fact table, aliased by its object name.
    query.from_(table_name(fact), alias=fact_alias)

    # JOINs: each step contributes whichever endpoint is still missing.
    already_joined = {fact_alias}
    for step in resolved.join_steps:
        endpoints = (step.to_object, step.from_object)
        target = next((name for name in endpoints if name not in already_joined), None)
        if target is None:
            continue  # both endpoints are already part of the query
        target_obj = model.data_objects.get(target)
        if not target_obj:
            continue
        condition = join_graph.build_join_condition(step)
        query.join(
            table=table_name(target_obj),
            on=condition,
            join_type=step.join_type,
            alias=target,
        )
        already_joined.add(target)

    # WHERE
    for where_filter in resolved.where_filters:
        query.where(where_filter.expression)

    # GROUP BY every dimension (time-grained where applicable). The key
    # expression is remembered per alias because GROUPING() must be given
    # the group-key expression itself — Postgres rejects GROUPING(<alias>)
    # with "column does not exist".
    key_by_alias: dict[str, Expr] = {}
    for dim in resolved.dimensions:
        group_key: Expr = make_column_expr(model, dim.object_name, dim.column_name)
        if dim.grain and dialect:
            group_key = dialect.render_time_grain(group_key, dim.grain)
        query.group_by(group_key)
        key_by_alias[dim.name] = group_key

    # Grouping modifier (rollup/cube) plus one GROUPING() flag per dimension.
    if resolved.grouping is not None and flagged_aliases:
        query.grouping(resolved.grouping.value)
        for dim_alias in flagged_aliases:
            grouping_arg = key_by_alias.get(dim_alias)
            if not grouping_arg:
                grouping_arg = ColumnRef(name=dim_alias)
            flag = FunctionCall(name="GROUPING", args=[grouping_arg])
            query.select(AliasedExpr(expr=flag, alias=_grouping_flag_alias(dim_alias)))

    # HAVING: measure aliases are expanded to the projected (possibly
    # CAST'd) aggregate expressions.
    for having_filter in resolved.having_filters:
        query.having(_expand_measure_refs(having_filter.expression, projected_measures))

    # ORDER BY: a reference to a time-grained dimension's source column is
    # swapped for that dimension's SELECT alias.
    alias_by_grained_col: dict[tuple[str, str | None], str] = {}
    for dim in resolved.dimensions:
        if dim.grain:
            alias_by_grained_col[(dim.source_column, dim.object_name)] = dim.name
    for order_expr, descending, nulls in resolved.order_by_exprs:
        if isinstance(order_expr, ColumnRef):
            lookup_key = (order_expr.name, order_expr.table)
            if lookup_key in alias_by_grained_col:
                order_expr = ColumnRef(name=alias_by_grained_col[lookup_key])
        query.order_by(order_expr, desc=descending, nulls_last=_nulls_last(nulls))

    # LIMIT / OFFSET
    if resolved.limit is not None:
        query.limit(resolved.limit)
    if resolved.offset is not None:
        query.offset(resolved.offset)

    return QueryPlan(ast=query.build())

CFL Planner

orionbelt.compiler.cfl.CFLPlanner

Plans Composite Fact Layer queries: conformed dimensions + fact stitching.

Uses a UNION ALL strategy:

1. Each fact leg SELECTs conformed dimensions + its own measures (NULL for others)
2. UNION ALL combines the legs into a single CTE
3. Outer query aggregates over the union, grouping by conformed dimensions

Source code in src/orionbelt/compiler/cfl.py
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
class CFLPlanner:
    """Plans Composite Fact Layer queries: conformed dimensions + fact stitching.

    Uses a UNION ALL strategy:
    1. Each fact leg SELECTs conformed dimensions + its own measures (NULL for others)
    2. UNION ALL combines the legs into a single CTE
    3. Outer query aggregates over the union, grouping by conformed dimensions
    """

    def plan(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        qualify_table: Callable[[DataObject], str] | None = None,
        union_by_name: bool = False,
        dialect: Dialect | None = None,
    ) -> QueryPlan:
        """Plan a CFL query.

        Dispatch order:

        1. Reject dimensions that reference unknown data objects.
        2. ``dimensionsExclude`` queries are handed to
           ``_plan_dimensions_exclude``.
        3. Measures are grouped by source object; a dimension-only query
           that requires CFL gets its legs from
           ``_group_dimensions_into_legs``.
        4. At most one leg and no cross-fact measure: delegate to
           ``StarSchemaPlanner``.
        5. Otherwise build the multi-fact UNION ALL plan.

        Args:
            resolved: The resolved query to plan.
            model: The semantic model the query was resolved against.
            qualify_table: Optional callback producing the table reference
                for a data object; forwarded to the delegated planners.
            union_by_name: Forwarded to the UNION ALL planner.
            dialect: Optional target dialect; forwarded to the star-schema
                and UNION ALL planners.

        Returns:
            The query plan produced by whichever strategy was selected.

        Raises:
            FanoutError: A dimension references an unknown data object.
            UnsupportedAggregationForCFLError: The multi-fact path was
                selected and a requested measure, or a component measure
                of a requested metric, uses a two-column aggregation.
        """
        self._validate_fanout(resolved, model)

        # dimensionsExclude: EXCEPT-based anti-join pattern
        if resolved.dimensions_exclude:
            return self._plan_dimensions_exclude(resolved, model, qualify_table)

        # Group measures by their source object
        measures_by_object, cross_fact = self._group_measures_by_object(resolved, model)

        # Dimension-only CFL: no measures but dimensions on independent branches.
        # Create leg groupings from connecting fact tables.
        if not measures_by_object and not cross_fact and resolved.requires_cfl:
            measures_by_object = self._group_dimensions_into_legs(resolved, model)

        if len(measures_by_object) <= 1 and not cross_fact:
            # Single fact — delegate to star schema
            from orionbelt.compiler.star import StarSchemaPlanner

            return StarSchemaPlanner().plan(
                resolved, model, qualify_table=qualify_table, dialect=dialect
            )

        # Two-column statistical aggregates (CORR/COVAR_*/REGR_*) need
        # paired-row semantics that the UNION ALL + concat-count multi-fact
        # path cannot express. Without this guard the planner emits
        # ``CORR(CAST(f0 AS VARCHAR) || '|' || CAST(f1 AS VARCHAR))`` — one
        # argument, wrong type. Fail fast with a clear error so the caller
        # can restructure their model or restrict the query to a single
        # fact source instead of getting an opaque execution-time error.
        #
        # Metric components are checked too: ``_group_measures_by_object``
        # expands a metric into its component measures, and those components
        # are what the UNION ALL legs actually project, so a two-column
        # component would hit the same broken concat-count path.
        for measure in resolved.measures:
            candidates = [measure]
            for comp_name in measure.component_measures or ():
                comp = resolved.metric_components.get(comp_name)
                if comp is not None:
                    candidates.append(comp)
            for candidate in candidates:
                agg = candidate.aggregation.lower() if candidate.aggregation else ""
                if agg in TWO_COLUMN_AGGREGATIONS:
                    raise UnsupportedAggregationForCFLError(candidate.name, agg)

        # Multi-fact: UNION ALL strategy
        return self._plan_union_all(
            resolved,
            model,
            measures_by_object,
            cross_fact,
            qualify_table=qualify_table,
            union_by_name=union_by_name,
            dialect=dialect,
        )

    def _validate_fanout(self, resolved: ResolvedQuery, model: SemanticModel) -> None:
        """Validate that grain is compatible and no fanout will occur."""
        errors: list[str] = []

        for dim in resolved.dimensions:
            if dim.object_name not in model.data_objects:
                errors.append(
                    f"Dimension '{dim.name}' references unknown data object '{dim.object_name}'"
                )

        if errors:
            raise FanoutError("; ".join(errors))

    def _group_measures_by_object(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
    ) -> tuple[dict[str, list[ResolvedMeasure]], list[ResolvedMeasure]]:
        """Group measures by their primary source object.

        Returns ``(groups, cross_fact)`` where *cross_fact* contains
        multi-field measures whose fields span multiple objects.
        For metrics, expand their component measures into the grouping
        instead of the metric itself.  Cross-fact measures ensure every
        involved object has a leg, but are not assigned to any single
        group — their individual fields are distributed per-leg by
        ``_plan_union_all``.
        """
        groups: dict[str, list[ResolvedMeasure]] = {}
        cross_fact: list[ResolvedMeasure] = []
        seen: set[str] = set()

        for measure in resolved.measures:
            if measure.component_measures:
                # Metric: add each component measure to its source object
                for comp_name in measure.component_measures:
                    if comp_name in seen:
                        continue
                    seen.add(comp_name)
                    comp = resolved.metric_components.get(comp_name)
                    if comp is None:
                        continue
                    model_measure = model.measures.get(comp_name)
                    if model_measure and model_measure.columns:
                        obj_name = model_measure.columns[0].view or resolved.base_object
                    else:
                        obj_name = resolved.base_object
                    groups.setdefault(obj_name, []).append(comp)
            else:
                if measure.name in seen:
                    continue
                seen.add(measure.name)
                model_measure = model.measures.get(measure.name)
                if not model_measure:
                    groups.setdefault(resolved.base_object, []).append(measure)
                    continue

                # Collect source objects: from explicit columns or expression AST
                field_objects: set[str]
                if model_measure.columns:
                    field_objects = {f.view for f in model_measure.columns if f.view}
                else:
                    # Expression-based measure: extract table refs from the AST
                    field_objects = set()
                    self._collect_table_refs(measure.expression, field_objects)
                if len(field_objects) > 1:
                    # Cross-fact multi-field measure: ensure each
                    # involved object has a leg, but don't assign
                    # the measure to any single group.
                    cross_fact.append(measure)
                    for obj in field_objects:
                        groups.setdefault(obj, [])
                elif field_objects:
                    obj_name = next(iter(field_objects))
                    groups.setdefault(obj_name, []).append(measure)
                else:
                    groups.setdefault(resolved.base_object, []).append(measure)

        return groups, cross_fact

    @staticmethod
    def _group_dimensions_into_legs(
        resolved: ResolvedQuery,
        model: SemanticModel,
    ) -> dict[str, list[ResolvedMeasure]]:
        """Choose leg objects for a query that has dimensions but no measures.

        Every data object that declares joins is a candidate leg if the
        set made of itself plus its join-graph descendants contains at
        least one of the query's dimension objects. Candidates are ranked
        by how many dimension objects they reach (most first, name as
        tie-break) and accepted in that order whenever they contribute a
        dimension object no earlier leg already covers.

        The returned mapping has one entry per accepted leg, each with an
        empty measure list, since a dimension-only query has no aggregates.
        """
        join_graph = JoinGraph(model, use_path_names=resolved.use_path_names or None)
        wanted = {d.object_name for d in resolved.dimensions}

        # Candidate legs: objects with outgoing joins that reach a wanted dimension.
        candidates: list[tuple[str, set[str]]] = []
        for name, data_object in model.data_objects.items():
            if not data_object.joins:
                continue
            hits = wanted & (join_graph.descendants(name) | {name})
            if hits:
                candidates.append((name, hits))

        # Widest coverage first; the name keeps the ordering deterministic.
        ranked = sorted(candidates, key=lambda pair: (-len(pair[1]), pair[0]))

        legs: dict[str, list[ResolvedMeasure]] = {}
        covered: set[str] = set()
        for fact_name, hits in ranked:
            newly_covered = hits - covered
            if not newly_covered:
                continue
            legs[fact_name] = []
            covered |= newly_covered

        return legs

    @staticmethod
    def _is_multi_field(measure: ResolvedMeasure) -> bool:
        """Return True when the measure is a function call with 2+ arguments.

        Example: ``COUNT(a, b)`` is multi-field, ``SUM(a)`` is not.
        """
        expression = measure.expression
        if not isinstance(expression, FunctionCall):
            return False
        return len(expression.args) > 1

    @staticmethod
    def _resolve_null_type_for_field(
        measure: ResolvedMeasure,
        field_idx: int,
        model: SemanticModel,
        dialect: Dialect | None = None,
    ) -> str | None:
        """Resolve the SQL type for NULL padding in CFL UNION ALL legs.

        Two regimes apply:

        * **Numeric aggregates** (SUM / AVG / MIN / MAX / MEDIAN / etc.) —
          the inner column projection is the *aggregate's input column*, and
          OBSL casts the outer aggregate to the measure's declared
          ``dataType`` (e.g. ``decimal(18, 2)``). Padding with that same
          declared type keeps every CFL leg's column compatible with the
          outer ``SUM``/``AVG`` and avoids ClickHouse's ``Decimal`` +
          ``Float64`` Variant trap (where padding with the column's
          declared OBML ``abstractType: float`` mismatches storage as
          ``Decimal`` and produces ``ILLEGAL_TYPE_OF_ARGUMENT``).

        * **Count-style aggregates** (COUNT / COUNT_DISTINCT) — the inner
          column projection is the *raw column itself* (e.g. ``complid``,
          a text ID). The outer ``COUNT(DISTINCT ...)`` happily counts any
          type, but each CFL leg's column must agree on a type for
          ``UNION ALL``. Padding with the declared aggregate output type
          (BIGINT) trips strict-typed engines (Postgres / MySQL / strict
          ClickHouse) when the source column is text. Pad with the
          source column's abstract type instead.

        For multi-field measures (e.g. ``COUNT(a, b)``), per-column
        abstract types are used regardless of aggregation kind.
        """
        model_measure = model.measures.get(measure.name)
        if not model_measure:
            return None
        agg = (model_measure.aggregation or "").lower()
        is_count_style = agg in ("count", "count_distinct")
        # Multi-field measures: per-column abstract_type for each slot.
        if len(model_measure.columns) > 1:
            if field_idx < len(model_measure.columns):
                ref = model_measure.columns[field_idx]
                obj = model.data_objects.get(ref.view) if ref.view else None
                if obj and ref.column in obj.columns:
                    return obj.columns[ref.column].abstract_type.value
            return model_measure.result_type.value
        # Single-/zero-column COUNT-style: pad with the source column's
        # native type so UNION ALL legs agree (raw column, not aggregate).
        if is_count_style and len(model_measure.columns) == 1:
            ref = model_measure.columns[0]
            obj = model.data_objects.get(ref.view) if ref.view else None
            if obj and ref.column in obj.columns:
                return obj.columns[ref.column].abstract_type.value
        # Numeric aggregates: align padding with the outer CAST target.
        if dialect is not None and len(model_measure.columns) <= 1:
            resolved = resolve_measure_data_type(model_measure, model.settings)
            if resolved is not None:
                return dialect.render_obml_type(resolved)
        # Fallback to measure result_type.
        return model_measure.result_type.value

    @staticmethod
    def _multi_field_cte_alias(measure_name: str, idx: int) -> str:
        """CTE column name for the *idx*-th field of a multi-field measure."""
        return f"{measure_name}__f{idx}"

    @staticmethod
    def _unwrap_aggregation(measure: ResolvedMeasure) -> Expr:
        """Strip the aggregate wrapper from a measure expression.

        ``FunctionCall(SUM, [inner])`` yields ``inner``; only the first
        argument is returned. An expression that is not a ``FunctionCall``,
        or a call without arguments, is returned unchanged.
        """
        expression = measure.expression
        if not isinstance(expression, FunctionCall):
            return expression
        if not expression.args:
            return expression
        return expression.args[0]

    def _build_outer_metric_expr(
        self,
        metric: ResolvedMeasure,
        resolved: ResolvedQuery,
        cte_name: str,
    ) -> Expr:
        """Rewrite a metric formula so it can run in the outer CFL query.

        Delegates to ``_substitute_outer_refs``, which replaces each
        unqualified ``ColumnRef`` naming a component measure with
        ``AGG("cte_name"."measure_name")``, using that component's
        aggregation.

        The CTE qualifier is required: the outer SELECT also exposes an
        alias ``measure_name`` bound to ``AGG(...)``. ClickHouse resolves a
        bare ``"measure_name"`` to that sibling alias, sees an aggregate
        nested inside an aggregate, and fails with ``ILLEGAL_AGGREGATION``.
        Qualifying the reference pins it to the raw CTE column.
        """
        formula = metric.expression
        return self._substitute_outer_refs(formula, resolved, cte_name)

    def _substitute_outer_refs(self, expr: Expr, resolved: ResolvedQuery, cte_name: str) -> Expr:
        """Replace component-measure references with outer aggregate calls.

        An unqualified ``ColumnRef`` whose name is a key of
        ``resolved.metric_components`` becomes an aggregate
        ``FunctionCall`` over the CTE-qualified column of the same name.
        ``COUNT_DISTINCT`` is emitted as ``COUNT`` with ``distinct=True``,
        and ``distinct`` is also set when the component's own expression
        is a distinct ``FunctionCall``.

        ``BinaryOp`` operands and ``FunctionCall`` arguments are walked
        recursively, so a formula such as ``... / NULLIF(other, 0)`` gets
        the reference inside the function call substituted as well instead
        of leaving a bare label that would bind to a non-existent column.
        A node is rebuilt only when one of its children changed; otherwise
        the original object is returned.
        """
        if isinstance(expr, ColumnRef) and expr.table is None:
            component = resolved.metric_components.get(expr.name)
            if component:
                func_name = component.aggregation.upper()
                is_count_distinct = func_name == "COUNT_DISTINCT"
                if is_count_distinct:
                    func_name = "COUNT"
                inner = component.expression
                use_distinct = bool(
                    is_count_distinct or (isinstance(inner, FunctionCall) and inner.distinct)
                )
                qualified = ColumnRef(name=component.name, table=cte_name)
                return FunctionCall(name=func_name, args=[qualified], distinct=use_distinct)

        if isinstance(expr, BinaryOp):
            left = self._substitute_outer_refs(expr.left, resolved, cte_name)
            right = self._substitute_outer_refs(expr.right, resolved, cte_name)
            unchanged = left is expr.left and right is expr.right
            if not unchanged:
                return BinaryOp(left=left, op=expr.op, right=right)

        if isinstance(expr, FunctionCall):
            rewritten: list[Expr] = []
            changed = False
            for arg in expr.args:
                replacement = self._substitute_outer_refs(arg, resolved, cte_name)
                if replacement is not arg:
                    changed = True
                rewritten.append(replacement)
            if changed:
                return FunctionCall(
                    name=expr.name,
                    args=rewritten,
                    distinct=expr.distinct,
                    order_by=expr.order_by,
                    separator=expr.separator,
                )

        return expr

    @staticmethod
    def _collect_table_refs(expr: Expr, tables: set[str]) -> None:
        """Add every table name referenced under *expr* to *tables*.

        A ``ColumnRef`` with a table contributes that table. The walk then
        descends into both sides of a ``BinaryOp``, the operand of a
        ``UnaryOp``, the ``expr`` of ``InList`` / ``IsNull`` / ``Between``,
        the ``column`` of a ``RelativeDateRange``, and the ``args`` of a
        ``FunctionCall``. Any other node type is ignored. *tables* is
        mutated in place; nothing is returned.
        """
        if isinstance(expr, ColumnRef) and expr.table:
            tables.add(expr.table)
            return

        # Work out which child expressions to descend into for this node.
        if isinstance(expr, BinaryOp):
            children = (expr.left, expr.right)
        elif isinstance(expr, UnaryOp):
            children = (expr.operand,)
        elif isinstance(expr, (InList, IsNull, Between)):
            children = (expr.expr,)
        elif isinstance(expr, RelativeDateRange):
            children = (expr.column,)
        elif isinstance(expr, FunctionCall):
            children = expr.args
        else:
            children = ()

        for child in children:
            CFLPlanner._collect_table_refs(child, tables)

    @staticmethod
    def _remap_cfl_order_by(expr: Expr, resolved: ResolvedQuery, model: SemanticModel) -> Expr:
        """Remap ORDER BY expressions to use CTE aliases for the outer query.

        In CFL, the outer query selects from the composite CTE — original
        table-qualified refs are out of scope.  Remap dimension and measure
        expressions to their CTE alias names. Matches by structural equality
        with each dimension's column expression so computed columns (where
        the source AST is an inlined expression, not a bare ColumnRef) also
        remap correctly.
        """
        for dim in resolved.dimensions:
            if expr == make_column_expr(model, dim.object_name, dim.column_name):
                return ColumnRef(name=dim.name)
        # Measure: match by identity (same expression object)
        for meas in resolved.measures:
            if expr is meas.expression:
                return ColumnRef(name=meas.name)
        # Numeric position — pass through
        return expr

    def _build_outer_concat_count(
        self,
        measure_name: str,
        n_fields: int,
        agg: str,
        distinct: bool,
        cte_name: str,
    ) -> Expr:
        """Build the outer aggregate over a multi-field measure's CTE columns.

        Produces ``AGG([DISTINCT] CAST(f0 AS VARCHAR) || '|' || CAST(f1 AS
        VARCHAR) ...)``, where ``f<i>`` is the per-field CTE column named by
        ``_multi_field_cte_alias``.

        Every field reference carries *cte_name* as its table so it binds
        to the raw CTE column and not to a sibling SELECT alias of the same
        name (the alias-shadowing issue described on
        ``_substitute_outer_refs``).
        """

        def field_as_text(position: int) -> Expr:
            # One CTE-qualified field column, cast so it can be concatenated.
            column = ColumnRef(
                name=self._multi_field_cte_alias(measure_name, position),
                table=cte_name,
            )
            return Cast(expr=column, type_name="VARCHAR")

        pieces: list[Expr] = [field_as_text(position) for position in range(n_fields)]

        # Left-fold the pieces into  p0 || ('|' || p1) || ('|' || p2) ...
        joined: Expr = pieces[0]
        for piece in pieces[1:]:
            separated = BinaryOp(left=Literal.string("|"), op="||", right=piece)
            joined = BinaryOp(left=joined, op="||", right=separated)

        return FunctionCall(name=agg, args=[joined], distinct=distinct)

    def _plan_union_all(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        measures_by_object: dict[str, list[ResolvedMeasure]],
        cross_fact: list[ResolvedMeasure] | None = None,
        qualify_table: Callable[[DataObject], str] | None = None,
        union_by_name: bool = False,
        dialect: Dialect | None = None,
    ) -> QueryPlan:
        """UNION ALL strategy: stack fact legs with NULL padding, aggregate outside.

        When *union_by_name* is True (DuckDB, Snowflake) each leg only emits
        the columns it actually has — the database fills missing columns with
        NULL automatically via ``UNION ALL BY NAME``.
        """
        graph = JoinGraph(model, use_path_names=resolved.use_path_names or None)

        def qualify(obj: DataObject) -> str:
            return qualify_table(obj) if qualify_table else obj.qualified_code

        # Collect all measures across all objects + cross-fact measures
        all_measures: list[ResolvedMeasure] = []
        for measures in measures_by_object.values():
            all_measures.extend(measures)
        if cross_fact:
            all_measures.extend(cross_fact)

        # Collect data objects referenced by WHERE filters — each leg
        # must join these tables so the filter predicates are valid.
        filter_objects: set[str] = set()
        for wf in resolved.where_filters:
            self._collect_table_refs(wf.expression, filter_objects)

        # Build one SELECT per fact object group.
        # Each leg computes its own LCA (least common ancestor) as the lead
        # table — the graph-central node that can reach all dimension objects
        # and the measure's source object with minimal hops.
        union_legs: list[Select] = []
        leg_infos: list[CflLegInfo] = []
        for obj_name, measures in measures_by_object.items():
            leg_builder = QueryBuilder()
            this_measure_names = {m.name for m in measures}

            # Compute reachability from this leg's fact object upfront
            reachable = graph.descendants(obj_name) | {obj_name}

            # Collect table references from this leg's own-measure
            # expressions. A measure like ``Electronics Sales`` is
            # defined as ``SUM(CASE WHEN Products.productcat = …
            # THEN Sales.salesamount END)`` — the CASE condition
            # references Products, which must be joined into this
            # leg's FROM. Without this, the generated SQL emits
            # ``"Products"."productcat"`` against a FROM clause that
            # only has Sales + Clients, and the database raises
            # "missing FROM-clause entry for table Products".
            measure_expr_objects: set[str] = set()
            for m in measures:
                self._collect_table_refs(m.expression, measure_expr_objects)
            if cross_fact:
                for m in cross_fact:
                    if m.name in this_measure_names:
                        self._collect_table_refs(m.expression, measure_expr_objects)

            # SELECT conformed dimensions — only emit real column refs for
            # dimensions reachable from this leg's fact AND whose `via:`
            # waypoint (if any) is also reachable from this leg's fact.
            # Role-playing dimensions tied to a different fact via `via:`
            # are NULL-padded so each leg only projects the values that
            # belong to its own fact.
            for dim in resolved.dimensions:
                via_ok = dim.via is None or dim.via in reachable
                if dim.object_name in reachable and via_ok:
                    col: Expr = make_column_expr(model, dim.object_name, dim.column_name)
                    if dim.grain and dialect:
                        col = dialect.render_time_grain(col, dim.grain)
                    leg_builder.select(AliasedExpr(expr=col, alias=dim.name))
                elif not union_by_name:
                    model_dim = model.dimensions.get(dim.name)
                    dim_type = model_dim.result_type.value if model_dim else None
                    col = Cast(Literal.null(), type_name=dim_type) if dim_type else Literal.null()
                    leg_builder.select(AliasedExpr(expr=col, alias=dim.name))

            # SELECT this fact's measures (raw expressions, no aggregation).
            # When union_by_name is True, skip NULL padding for other facts'
            # measures — the database fills them automatically.
            for m in all_measures:
                if self._is_multi_field(m):
                    assert isinstance(m.expression, FunctionCall)
                    for i, arg in enumerate(m.expression.args):
                        alias = self._multi_field_cte_alias(m.name, i)
                        arg_table = arg.table if isinstance(arg, ColumnRef) else None
                        if arg_table == obj_name:
                            leg_builder.select(AliasedExpr(expr=arg, alias=alias))
                        elif not union_by_name:
                            null_type = self._resolve_null_type_for_field(m, i, model)
                            null_expr: Expr = (
                                Cast(Literal.null(), type_name=null_type)
                                if null_type
                                else Literal.null()
                            )
                            leg_builder.select(AliasedExpr(expr=null_expr, alias=alias))
                elif m.name in this_measure_names:
                    # Cast the own-measure column to the same type used for
                    # NULL padding in sibling legs, so every leg's column
                    # agrees on a single type. Without this, strict-typed
                    # engines (ClickHouse with UNION ALL) produce a Variant
                    # type that SUM can't aggregate ("ILLEGAL_TYPE_OF_ARGUMENT
                    # Variant(Decimal, Float64)").
                    own_expr: Expr = self._unwrap_aggregation(m)
                    own_type_name = self._resolve_null_type_for_field(m, 0, model, dialect)
                    if own_type_name:
                        own_expr = Cast(expr=own_expr, type_name=own_type_name)
                    leg_builder.select(AliasedExpr(expr=own_expr, alias=m.name))
                elif not union_by_name:
                    model_measure = model.measures.get(m.name)
                    null_type_name = self._resolve_null_type_for_field(m, 0, model, dialect)
                    if null_type_name is None and model_measure:
                        null_type_name = model_measure.result_type.value
                    null_expr = (
                        Cast(Literal.null(), type_name=null_type_name)
                        if null_type_name
                        else Literal.null()
                    )
                    leg_builder.select(AliasedExpr(expr=null_expr, alias=m.name))

            # Determine the common root for this leg:
            # the deepest directed ancestor that can reach all dimension
            # objects, measure's source object, filter-referenced objects,
            # and any objects referenced by this leg's measure expressions.
            # Only include dimensions reachable from this leg's fact object.
            leg_required = {
                dim.object_name for dim in resolved.dimensions if dim.object_name in reachable
            }
            leg_required.add(obj_name)
            leg_required.update(filter_objects)
            # Include objects referenced by measure expressions, but only
            # those reachable from this leg's fact — cross-fact filter
            # tables would otherwise pull unrelated facts into the leg.
            leg_required.update(measure_expr_objects & reachable)
            lead = graph.find_common_root(leg_required)
            lead_obj = model.data_objects.get(lead)

            # FROM: the lead (LCA) table
            if lead_obj:
                leg_builder.from_(qualify(lead_obj), alias=lead)

            # JOINs: all required objects reachable from the lead
            join_targets = leg_required - {lead}
            steps: list[JoinStep] = []
            if join_targets:
                steps = graph.find_join_path(
                    {lead},
                    leg_required,
                    via_constraints=resolved.via_constraints or None,
                )
                # Dedupe by alias so a dim reachable through multiple
                # paths within one leg emits only one JOIN — postgres
                # rejects "table specified more than once" when two
                # role-played dims resolve to the same target object.
                joined_aliases: set[str] = {lead}
                for step in steps:
                    if step.to_object in joined_aliases:
                        continue
                    target_object = model.data_objects.get(step.to_object)
                    if target_object:
                        on_expr = graph.build_join_condition(step)
                        leg_builder.join(
                            table=qualify(target_object),
                            on=on_expr,
                            join_type=step.join_type,
                            alias=step.to_object,
                        )
                        joined_aliases.add(step.to_object)

            # Capture leg info for explain
            leg_join_strs = (
                [f"{s.from_object}{s.to_object}" for s in steps] if join_targets else []
            )
            if lead == obj_name:
                leg_reason = (
                    f'"{lead}" is the measure source — '
                    f"all required dimension objects are reachable from it"
                )
            else:
                leg_reason = (
                    f'"{lead}" is the deepest common root that can reach '
                    f'measure source "{obj_name}" and all reachable dimension objects'
                )
            leg_infos.append(
                CflLegInfo(
                    measure_source=obj_name,
                    common_root=lead,
                    reason=leg_reason,
                    measures=[m.name for m in measures],
                    joins=leg_join_strs,
                )
            )

            # Apply WHERE filters to each leg
            for wf in resolved.where_filters:
                leg_builder.where(wf.expression)

            union_legs.append(leg_builder.build())

        # Create the UNION ALL CTE
        cte_name = "composite_01"
        union_cte = CTE(name=cte_name, query=UnionAll(queries=union_legs))
        # All ColumnRefs that resolve to raw CTE columns inside outer-query
        # aggregate functions are qualified with *cte_name*. ClickHouse otherwise
        # resolves bare identifiers to sibling SELECT aliases first — when those
        # aliases are themselves aggregates (the case for measures and metrics
        # in the outer SELECT), it rejects the resulting nested aggregate as
        # ``ILLEGAL_AGGREGATION``. The qualification is harmless on dialects
        # that resolve column-first.

        # Build outer query: aggregate over the composite CTE
        outer_builder = QueryBuilder()

        # SELECT dimensions.  Coalesce groups emit COALESCE(d1, d2, ...) once
        # under the alias; plain dims keep their original column reference.
        emitted_coalesce_aliases: set[str] = set()
        coalesce_groups: dict[str, list[str]] = {}
        for d in resolved.dimensions:
            if d.coalesce_alias:
                coalesce_groups.setdefault(d.coalesce_alias, []).append(d.name)
        for dim in resolved.dimensions:
            if dim.coalesce_alias:
                if dim.coalesce_alias in emitted_coalesce_aliases:
                    continue
                emitted_coalesce_aliases.add(dim.coalesce_alias)
                outer_builder.select(
                    AliasedExpr(
                        expr=FunctionCall(
                            name="COALESCE",
                            args=[
                                ColumnRef(name=member)
                                for member in coalesce_groups[dim.coalesce_alias]
                            ],
                        ),
                        alias=dim.coalesce_alias,
                    )
                )
            else:
                outer_builder.select(
                    AliasedExpr(
                        expr=ColumnRef(name=dim.name),
                        alias=dim.name,
                    )
                )

        # SELECT aggregated measures and metrics
        # First, add all component measures (from UNION ALL legs)
        settings = model.settings
        seen_measure_names: set[str] = set()
        outer_measure_exprs: dict[str, Expr] = {}
        for m in all_measures:
            seen_measure_names.add(m.name)
            agg = m.aggregation.upper()
            distinct = False
            if agg == "COUNT_DISTINCT":
                agg = "COUNT"
                distinct = True
            if isinstance(m.expression, FunctionCall) and m.expression.distinct:
                distinct = True

            if self._is_multi_field(m):
                # Multi-field: concat CTE columns in outer query
                assert isinstance(m.expression, FunctionCall)
                n_fields = len(m.expression.args)
                agg_expr: Expr = self._build_outer_concat_count(
                    m.name, n_fields, agg, distinct, cte_name
                )
            else:
                agg_expr = FunctionCall(
                    name=agg,
                    args=[ColumnRef(name=m.name, table=cte_name)],
                    distinct=distinct,
                )
            # Apply CAST for resolved data_type
            model_measure = model.measures.get(m.name)
            if model_measure and dialect:
                resolved_type = resolve_measure_data_type(model_measure, settings)
                if resolved_type:
                    agg_expr = dialect.cast_to_obml_type(agg_expr, resolved_type)
            outer_builder.select(AliasedExpr(expr=agg_expr, alias=m.name))
            outer_measure_exprs[m.name] = agg_expr

        # Then, add metric expressions that combine component measures
        for m in resolved.measures:
            if m.component_measures and m.name not in seen_measure_names:
                metric_expr: Expr = self._build_outer_metric_expr(m, resolved, cte_name)
                metric = model.metrics.get(m.name)
                if metric and dialect:
                    resolved_type = resolve_metric_data_type(metric, settings)
                    if resolved_type:
                        metric_expr = dialect.cast_to_obml_type(metric_expr, resolved_type)
                outer_builder.select(AliasedExpr(expr=metric_expr, alias=m.name))
                outer_measure_exprs[m.name] = metric_expr

        outer_builder.from_(cte_name, alias=cte_name)

        # GROUP BY dimensions.  Coalesce groups group by the COALESCE expression
        # itself (most dialects accept either the alias or the expression; the
        # expression is portable across all eight supported dialects).
        grouped_coalesce_aliases: set[str] = set()
        for dim in resolved.dimensions:
            if dim.coalesce_alias:
                if dim.coalesce_alias in grouped_coalesce_aliases:
                    continue
                grouped_coalesce_aliases.add(dim.coalesce_alias)
                outer_builder.group_by(
                    FunctionCall(
                        name="COALESCE",
                        args=[
                            ColumnRef(name=member) for member in coalesce_groups[dim.coalesce_alias]
                        ],
                    )
                )
            else:
                outer_builder.group_by(ColumnRef(name=dim.name))

        # GROUPING() flag columns + grouping modifier (rollup/cube) — outer query only
        # so subtotal rows compose correctly over the unioned facts (the
        # individual UNION ALL legs stay at detail grain).
        if resolved.grouping is not None and resolved.dimensions:
            outer_builder.grouping(resolved.grouping.value)
            flag_aliases: list[str] = []
            for dim in resolved.dimensions:
                alias_name = dim.coalesce_alias or dim.name
                if alias_name in flag_aliases:
                    continue
                flag_aliases.append(alias_name)
            for alias in flag_aliases:
                flag_col = FunctionCall(name="GROUPING", args=[ColumnRef(name=alias)])
                outer_builder.select(AliasedExpr(expr=flag_col, alias=_grouping_flag_alias(alias)))

        # HAVING — expand alias references to actual CAST'd aggregate expressions
        for hf in resolved.having_filters:
            outer_builder.having(_expand_cfl_measure_refs(hf.expression, outer_measure_exprs))

        # ORDER BY and LIMIT — remap to CTE aliases
        for expr, desc, nulls in resolved.order_by_exprs:
            outer_builder.order_by(
                self._remap_cfl_order_by(expr, resolved, model),
                desc=desc,
                nulls_last=_nulls_last(nulls),
            )
        if resolved.limit is not None:
            outer_builder.limit(resolved.limit)
        if resolved.offset is not None:
            outer_builder.offset(resolved.offset)

        outer_select = outer_builder.build()

        # Attach CTE
        final = Select(
            columns=outer_select.columns,
            from_=outer_select.from_,
            joins=outer_select.joins,
            where=outer_select.where,
            group_by=outer_select.group_by,
            having=outer_select.having,
            order_by=outer_select.order_by,
            limit=outer_select.limit,
            offset=outer_select.offset,
            ctes=[union_cte],
            grouping=outer_select.grouping,
        )

        return QueryPlan(ast=final, cfl_legs=leg_infos)

    # -- dimensionsExclude: EXCEPT-based anti-join ----------------------------

    def _plan_dimensions_exclude(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        qualify_table: Callable[[DataObject], str] | None = None,
    ) -> QueryPlan:
        """Plan a dimensionsExclude query as an EXCEPT-based anti-join.

        The resulting statement has this shape:
          WITH dim_group_00 AS (SELECT DISTINCT dims FROM ...),
               dim_group_01 AS (...),
               non_combinations AS (
                 SELECT ... FROM dim_group_00 CROSS JOIN dim_group_01
                 EXCEPT
                 SELECT ... FROM fact_joins
               )
          SELECT ... FROM non_combinations ORDER BY ... LIMIT ...

        *qualify_table*, when given, renders the table reference of a data
        object; otherwise the object's ``qualified_code`` is used.  The
        returned plan carries only the final AST.
        """
        graph = JoinGraph(model, use_path_names=resolved.use_path_names or None)

        def qualify(obj: DataObject) -> str:
            if qualify_table:
                return qualify_table(obj)
            return obj.qualified_code

        def project_dimensions(target: QueryBuilder) -> None:
            # Every stage downstream of the group CTEs re-selects the
            # dimensions by their output name.
            for dim in resolved.dimensions:
                target.select(AliasedExpr(expr=ColumnRef(name=dim.name), alias=dim.name))

        # One "distinct values" CTE per independent dimension group.
        partitions = self._partition_dimensions(resolved, graph)
        group_cte_names: list[str] = [f"dim_group_{idx:02d}" for idx in range(len(partitions))]
        ctes: list[CTE] = [
            CTE(
                name=group_cte_names[idx],
                query=self._build_group_distinct_select(
                    members,
                    model,
                    graph,
                    qualify,
                    via_constraints=resolved.via_constraints or None,
                ),
            )
            for idx, members in enumerate(partitions)
        ]

        # Left side of the EXCEPT: every combination, i.e. the CROSS JOIN
        # of all dimension-group CTEs.
        cross_builder = QueryBuilder()
        project_dimensions(cross_builder)
        cross_builder.from_(group_cte_names[0], alias=group_cte_names[0])
        for extra_group in group_cte_names[1:]:
            cross_builder._joins.append(
                Join(join_type=JoinType.CROSS, source=extra_group, alias=extra_group)
            )
        every_combination = cross_builder.build()

        # Right side of the EXCEPT: combinations observed through the facts.
        observed_combinations = self._build_existing_pairs_select(
            resolved, model, graph, qualify
        )

        ctes.append(
            CTE(
                name="non_combinations",
                query=Except(left=every_combination, right=observed_combinations),
            )
        )

        # Outer query: read the EXCEPT result, then apply ORDER BY / LIMIT / OFFSET.
        outer_builder = QueryBuilder()
        project_dimensions(outer_builder)
        outer_builder.from_("non_combinations", alias="non_combinations")

        for order_expr, descending, nulls in resolved.order_by_exprs:
            outer_builder.order_by(
                self._remap_cfl_order_by(order_expr, resolved, model),
                desc=descending,
                nulls_last=_nulls_last(nulls),
            )
        if resolved.limit is not None:
            outer_builder.limit(resolved.limit)
        if resolved.offset is not None:
            outer_builder.offset(resolved.offset)

        outer = outer_builder.build()
        return QueryPlan(
            ast=Select(
                columns=outer.columns,
                from_=outer.from_,
                joins=outer.joins,
                order_by=outer.order_by,
                limit=outer.limit,
                offset=outer.offset,
                ctes=ctes,
            )
        )

    @staticmethod
    def _partition_dimensions(
        resolved: ResolvedQuery,
        graph: JoinGraph,
    ) -> list[list[ResolvedDimension]]:
        """Split the query's dimensions into groups living on independent branches.

        Dimensions are first bucketed by their data object.  Objects are then
        visited in sorted order; each not-yet-grouped object seeds a group and
        pulls in every other ungrouped object that is a directed descendant
        of the seed, or of which the seed is a descendant.  Each returned
        list holds the dimensions of one group, ordered by object name.
        """
        dims_by_object: dict[str, list[ResolvedDimension]] = {}
        for dim in resolved.dimensions:
            dims_by_object.setdefault(dim.object_name, []).append(dim)

        object_names = sorted(dims_by_object)
        clusters: list[set[str]] = []
        taken: set[str] = set()

        for seed in object_names:
            if seed in taken:
                continue
            downstream = graph.descendants(seed)
            cluster = {seed}
            for candidate in object_names:
                if candidate == seed or candidate in taken:
                    continue
                # Related when either object can reach the other via joins.
                if candidate in downstream or seed in graph.descendants(candidate):
                    cluster.add(candidate)
            clusters.append(cluster)
            taken |= cluster

        # Flatten each cluster back into its dimensions, object by object.
        return [
            [dim for name in sorted(cluster) for dim in dims_by_object[name]]
            for cluster in clusters
        ]

    @staticmethod
    def _build_group_distinct_select(
        dims: list[ResolvedDimension],
        model: SemanticModel,
        graph: JoinGraph,
        qualify: Callable[[DataObject], str],
        via_constraints: dict[str, str] | None = None,
    ) -> Select:
        """Build the distinct-values SELECT (expressed as GROUP BY) for one dimension group.

        Every dimension in *dims* is selected under its own name and also
        grouped by.  The FROM table is the group's root object; the remaining
        objects are reached through the join path the graph reports from that
        root, honouring *via_constraints*.
        """
        required_objects = {d.object_name for d in dims}

        # With several objects, ask the graph for a root that reaches them
        # all; a single object is its own root.
        root = (
            graph.find_common_root(required_objects)
            if len(required_objects) > 1
            else next(iter(required_objects))
        )

        # If root is a pure dimension table with no joins, check if a fact
        # table can reach it (needed for bridge-table traversal).
        root_obj = model.data_objects.get(root)
        if root_obj and not root_obj.joins and root not in required_objects:
            root = min(required_objects)
            root_obj = model.data_objects.get(root)

        builder = QueryBuilder()
        for dim in dims:
            column: Expr = make_column_expr(model, dim.object_name, dim.column_name)
            builder.select(AliasedExpr(expr=column, alias=dim.name))
            builder.group_by(column)

        if root_obj:
            builder.from_(qualify(root_obj), alias=root)

        needed = required_objects | {root}
        if len(needed) <= 1:
            # Only the root itself is involved: nothing to join.
            return builder.build()

        # Join outwards from the root, adding each object at most once.
        already_joined: set[str] = {root}
        for step in graph.find_join_path({root}, needed, via_constraints=via_constraints):
            target_name = step.to_object
            if target_name in already_joined:
                continue
            target_obj = model.data_objects.get(target_name)
            if not target_obj:
                continue
            on_expr = graph.build_join_condition(step)
            builder.join(
                table=qualify(target_obj),
                on=on_expr,
                join_type=step.join_type,
                alias=target_name,
            )
            already_joined.add(target_name)

        return builder.build()

    def _build_existing_pairs_select(
        self,
        resolved: ResolvedQuery,
        model: SemanticModel,
        graph: JoinGraph,
        qualify: Callable[[DataObject], str],
    ) -> Select:
        """Build the SELECT of dimension combinations that actually occur.

        A fact/bridge table serves as the FROM table; joins then fan out
        from it to every dimension object and every other connecting fact
        table.  The query's WHERE filters are applied to this SELECT.
        """
        dimension_objects = {d.object_name for d in resolved.dimensions}

        # The legs are keyed by the fact tables that connect the dimension groups.
        fact_tables = set(self._group_dimensions_into_legs(resolved, model).keys())

        def declared_join_count(name: str) -> int:
            if name in model.data_objects:
                return len(model.data_objects[name].joins)
            return 0

        # Base table: the fact with the most declared joins.  Candidates are
        # sorted first so ties resolve to the alphabetically first name.
        base_name = max(sorted(fact_tables), key=declared_join_count)
        base_obj = model.data_objects.get(base_name)

        builder = QueryBuilder()
        for dim in resolved.dimensions:
            column: Expr = make_column_expr(model, dim.object_name, dim.column_name)
            builder.select(AliasedExpr(expr=column, alias=dim.name))
            builder.group_by(column)

        if base_obj:
            builder.from_(qualify(base_obj), alias=base_name)

        # Everything that must be present: dimension objects plus fact tables.
        needed = dimension_objects | fact_tables | {base_name}
        present: set[str] = {base_name}
        join_steps = graph.find_join_path(
            {base_name},
            needed,
            via_constraints=resolved.via_constraints or None,
        )
        for step in join_steps:
            # Work out which side of the step is the table still missing.
            # For reversed edges, to_object may already be joined and the
            # actual new table is from_object.
            if step.to_object not in present:
                incoming = step.to_object
            elif step.from_object not in present:
                incoming = step.from_object
            else:
                continue  # neither side is new

            incoming_obj = model.data_objects.get(incoming)
            if not incoming_obj:
                continue
            on_expr = graph.build_join_condition(step)
            builder.join(
                table=qualify(incoming_obj),
                on=on_expr,
                join_type=step.join_type,
                alias=incoming,
            )
            present.add(incoming)

        # The query's WHERE filters restrict which combinations count as existing.
        for where_filter in resolved.where_filters:
            builder.where(where_filter.expression)

        return builder.build()

plan(resolved, model, qualify_table=None, union_by_name=False, dialect=None)

Plan a CFL query.

Source code in src/orionbelt/compiler/cfl.py
def plan(
    self,
    resolved: ResolvedQuery,
    model: SemanticModel,
    qualify_table: Callable[[DataObject], str] | None = None,
    union_by_name: bool = False,
    dialect: Dialect | None = None,
) -> QueryPlan:
    """Plan a CFL query.

    After fan-out validation the query is routed to one of three
    strategies: the EXCEPT-based plan for ``dimensionsExclude`` queries,
    the star-schema planner when at most one fact is involved and there
    are no cross-fact measures, or the multi-fact UNION ALL plan.

    Raises ``UnsupportedAggregationForCFLError`` when a measure uses a
    two-column aggregation on the multi-fact path.
    """
    self._validate_fanout(resolved, model)

    # dimensionsExclude takes the EXCEPT-based anti-join route.
    if resolved.dimensions_exclude:
        return self._plan_dimensions_exclude(resolved, model, qualify_table)

    # Bucket the measures by the object they are sourced from.
    legs_by_object, cross_fact = self._group_measures_by_object(resolved, model)

    # Dimension-only CFL: there are no measures, yet the dimensions sit on
    # independent branches, so the legs come from the connecting fact tables.
    if not (legs_by_object or cross_fact) and resolved.requires_cfl:
        legs_by_object = self._group_dimensions_into_legs(resolved, model)

    is_multi_fact = len(legs_by_object) > 1 or bool(cross_fact)
    if not is_multi_fact:
        # A single fact is a plain star-schema query.
        from orionbelt.compiler.star import StarSchemaPlanner

        star_planner = StarSchemaPlanner()
        return star_planner.plan(
            resolved, model, qualify_table=qualify_table, dialect=dialect
        )

    # Two-column statistical aggregates (CORR/COVAR_*/REGR_*) need
    # paired-row semantics that the UNION ALL + concat-count multi-fact
    # path cannot express. Without this guard the planner emits
    # ``CORR(CAST(f0 AS VARCHAR) || '|' || CAST(f1 AS VARCHAR))`` — one
    # argument, wrong type. Failing here gives the caller a clear error
    # (restructure the model or query a single fact source) instead of an
    # opaque execution-time failure.
    for measure in resolved.measures:
        agg_name = (measure.aggregation or "").lower()
        if agg_name in TWO_COLUMN_AGGREGATIONS:
            raise UnsupportedAggregationForCFLError(measure.name, agg_name)

    # Multi-fact: UNION ALL strategy.
    return self._plan_union_all(
        resolved,
        model,
        legs_by_object,
        cross_fact,
        qualify_table=qualify_table,
        union_by_name=union_by_name,
        dialect=dialect,
    )

Join Graph

orionbelt.compiler.graph.JoinGraph

Graph of data objects (nodes) and relationships (edges) for join path resolution.

Source code in src/orionbelt/compiler/graph.py
class JoinGraph:
    """Graph of data objects (nodes) and relationships (edges) for join path resolution.

    Three graphs over the same node set are kept in sync:

    * ``_graph`` — undirected; every edge stores the join columns, the
      cardinality and the declaring ``source_object``.
    * ``_directed`` — one edge per join in its declared direction
      (source → joinTo); used for descendant, common-root and cycle queries.
    * ``_traversable`` — directed graph walked by :meth:`find_join_path`;
      many-to-one joins are forward-only, every other cardinality can be
      walked in both directions.
    """

    def __init__(
        self,
        model: SemanticModel,
        use_path_names: list[UsePathName] | None = None,
    ) -> None:
        """Build the graphs for *model*, activating the requested secondary paths."""
        self._graph: nx.Graph[str] = nx.Graph()
        self._directed: nx.DiGraph[str] = nx.DiGraph()
        # Path-finding graph: many-to-one is forward-only (would cause fanout
        # in reverse); one-to-one and many-to-many are bidirectional.
        self._traversable: nx.DiGraph[str] = nx.DiGraph()
        self._model = model
        self._build(model, use_path_names)

    def _build(
        self,
        model: SemanticModel,
        use_path_names: list[UsePathName] | None = None,
    ) -> None:
        """Build the graph from the semantic model.

        Secondary joins are only included when their pathName is requested
        via *use_path_names*.  When a secondary override is active for a
        ``(source, target)`` pair, the primary join for that pair is excluded.
        Joins whose target is not a data object of the model are ignored.
        """
        for name in model.data_objects:
            self._graph.add_node(name)
            self._directed.add_node(name)
            self._traversable.add_node(name)

        # Build a lookup: (source, target) → pathName for active overrides
        active_overrides: dict[tuple[str, str], str] = {}
        if use_path_names:
            for upn in use_path_names:
                active_overrides[(upn.source, upn.target)] = upn.path_name

        for obj_name, obj in model.data_objects.items():
            for join in obj.joins:
                if join.join_to not in model.data_objects:
                    continue
                pair = (obj_name, join.join_to)

                if join.secondary:
                    # Only include if this secondary join's pathName is active
                    if pair in active_overrides and active_overrides[pair] == join.path_name:
                        self._add_edge(obj_name, join)
                else:
                    # Primary join: skip if an active override exists for this pair
                    if pair not in active_overrides:
                        self._add_edge(obj_name, join)

    def _add_edge(self, obj_name: str, join: object) -> None:
        """Add an edge to the undirected, directed, and traversable graphs.

        The traversable graph is used by :meth:`find_join_path` to enforce
        the rule "many-to-one is never bidirectional": walking such a join
        backwards would multiply rows of the source table, so only forward
        traversal is allowed.  One-to-one and many-to-many joins remain
        bidirectional in the traversable graph.
        """
        from orionbelt.models.semantic import DataObjectJoin

        assert isinstance(join, DataObjectJoin)
        # The undirected edge additionally remembers which side declared the
        # join, so path walkers can tell forward from reverse traversal.
        self._graph.add_edge(
            obj_name,
            join.join_to,
            columns_from=join.columns_from,
            columns_to=join.columns_to,
            cardinality=join.join_type,
            source_object=obj_name,
        )
        self._directed.add_edge(
            obj_name,
            join.join_to,
            columns_from=join.columns_from,
            columns_to=join.columns_to,
            cardinality=join.join_type,
        )
        self._traversable.add_edge(obj_name, join.join_to)
        if join.join_type != Cardinality.MANY_TO_ONE:
            # Safe to walk backwards: row count is preserved.
            self._traversable.add_edge(join.join_to, obj_name)

    def descendants(self, node: str) -> set[str]:
        """Return all nodes reachable from *node* via directed join paths.

        An unknown *node* yields an empty set.
        """
        if node not in self._directed:
            return set()
        return nx.descendants(self._directed, node)

    def find_common_root(self, required_objects: set[str]) -> str:
        """Find the common root for a set of required objects.

        The join graph is a DAG (joins define direction: source → joinTo).
        The common root is the **deepest** node that can reach ALL
        *required_objects* via directed join paths.  "Deepest" = smallest
        descendant set (most specific ancestor, closest to the required nodes).

        In ``returns → sales → customer``, with required ``{customer, item}``,
        the common root is ``sales`` (it can reach both).  With required
        ``{customer, item, returns}``, the common root is ``returns`` (the
        only node that can reach all three).

        Objects that are not nodes of the graph are ignored.  Returns ``""``
        when no required object is known, and falls back to an undirected
        center when no single node reaches all of them.
        """
        required = required_objects & set(self._directed.nodes)
        if len(required) <= 1:
            return next(iter(sorted(required))) if required else ""

        # Find all nodes that can reach ALL required nodes via directed paths
        candidates: list[tuple[str, int]] = []
        for node in self._directed.nodes:
            reachable = nx.descendants(self._directed, node) | {node}
            if required <= reachable:
                candidates.append((node, len(reachable)))

        if not candidates:
            # Fallback: no single directed ancestor covers all —
            # use undirected shortest-path center
            return self._find_center_undirected(required)

        # Pick the deepest ancestor: smallest reachable set that still covers
        # all; ties resolve to the alphabetically first name.
        deepest = min(candidates, key=lambda cand: (cand[1], cand[0]))
        return deepest[0]

    def _find_center_undirected(self, required: set[str]) -> str:
        """Fallback: center of the Steiner tree in the undirected graph.

        Candidate centers are all nodes on the pairwise shortest paths
        between required nodes.  The winner reaches the most required nodes
        and, among those, has the smallest maximum distance to them; ties
        resolve to the alphabetically first candidate.  Required nodes in
        disconnected components are tolerated instead of raising
        ``NetworkXNoPath``.
        """
        nodes = sorted(required)
        if len(nodes) <= 1:
            return nodes[0] if nodes else ""

        steiner: set[str] = set()
        for i, left in enumerate(nodes):
            for right in nodes[i + 1 :]:
                try:
                    path: list[str] = nx.shortest_path(self._graph, left, right)
                except nx.NetworkXNoPath:
                    continue
                steiner.update(path)

        if not steiner:
            return nodes[0]

        def badness(candidate: str) -> tuple[int, int]:
            # (number of unreachable required nodes, farthest reachable one).
            unreachable = 0
            farthest = 0
            for target in nodes:
                try:
                    dist = nx.shortest_path_length(self._graph, candidate, target)
                except nx.NetworkXNoPath:
                    unreachable += 1
                else:
                    farthest = max(farthest, dist)
            return unreachable, farthest

        # ``min`` keeps the first minimal element, so sorting the candidates
        # makes the choice deterministic.
        return min(sorted(steiner), key=badness)

    def find_join_path(
        self,
        from_objects: set[str],
        to_objects: set[str],
        via_constraints: dict[str, str] | None = None,
    ) -> list[JoinStep]:
        """Find a minimal join path connecting all required data objects.

        Uses shortest path for each target object from the set of source objects.

        *via_constraints* maps ``target → via``: for constrained targets, only
        the ``via`` object is used as the source so the path is forced through it.

        Targets that cannot be reached in the traversable graph are skipped.
        Each underlying edge contributes at most one step.
        """
        steps: list[JoinStep] = []
        visited_edges: set[tuple[str, str]] = set()
        via = via_constraints or {}

        # Process via waypoints first so they are in source_list when their
        # constrained targets are processed.
        all_targets = to_objects - from_objects
        via_targets = {t for t in all_targets if t in via}
        non_via_targets = all_targets - via_targets
        via_waypoints = {via[t] for t in via_targets} - from_objects - via_targets
        ordered_targets = sorted(via_waypoints) + sorted(non_via_targets) + sorted(via_targets)

        # Sorted so that ties between equally short paths are broken the
        # same way on every run; set iteration order is not stable.
        source_list = sorted(from_objects)

        for target in ordered_targets:
            best_path: list[str] | None = None
            sources = [via[target]] if target in via and via[target] in source_list else source_list
            for source in sources:
                try:
                    path = nx.shortest_path(self._traversable, source, target)
                    if best_path is None or len(path) < len(best_path):
                        best_path = path
                except nx.NetworkXNoPath:
                    continue

            if best_path is None:
                continue

            for i in range(len(best_path) - 1):
                edge = (best_path[i], best_path[i + 1])
                rev_edge = (best_path[i + 1], best_path[i])
                if edge in visited_edges or rev_edge in visited_edges:
                    continue
                visited_edges.add(edge)

                edge_data = self._graph.edges[edge]
                source_object = edge_data.get("source_object", edge[0])

                if source_object == edge[0]:
                    step = JoinStep(
                        from_object=edge[0],
                        to_object=edge[1],
                        from_columns=edge_data["columns_from"],
                        to_columns=edge_data["columns_to"],
                        join_type=ASTJoinType.LEFT,
                        cardinality=edge_data["cardinality"],
                    )
                else:
                    # Path traverses edge in reverse direction.
                    # from_object/to_object are swapped, so columns must be
                    # swapped too to keep the ON clause correctly oriented.
                    # NOTE(review): from_object here is the declaring
                    # object while from_columns are the declared
                    # ``columns_to``; build_join_condition pairs
                    # from_columns with from_object, so this looks correct
                    # only when both sides use the same column names —
                    # confirm against a join with differing key names.
                    step = JoinStep(
                        from_object=edge[1],
                        to_object=edge[0],
                        from_columns=edge_data["columns_to"],
                        to_columns=edge_data["columns_from"],
                        join_type=ASTJoinType.LEFT,
                        cardinality=edge_data["cardinality"],
                        reversed=True,
                    )
                steps.append(step)

            # Add target to sources for subsequent lookups
            if target not in source_list:
                source_list.append(target)

        return steps

    def find_join_path_undirected(
        self,
        from_object: str,
        to_object: str,
    ) -> list[JoinStep]:
        """Find a join path ignoring cardinality direction.

        Unlike :meth:`find_join_path` (which forbids walking many-to-one
        joins backwards to prevent fanout in the outer query), this walker
        considers the join graph as undirected.  It's intended for
        correlated subqueries — EXISTS / NOT EXISTS — where row counts on
        the outer side are unaffected by how many rows the subquery scans.

        Each emitted :class:`JoinStep` is oriented so ``from_object`` is the
        step's predecessor on the path and ``to_object`` is its successor;
        ``from_columns`` / ``to_columns`` are swapped when the underlying
        join edge is traversed against its declared direction.

        Returns an empty list when both objects are the same, when either is
        unknown to the graph, or when no path connects them.
        """
        if from_object == to_object:
            return []
        if from_object not in self._graph or to_object not in self._graph:
            return []
        try:
            path: list[str] = nx.shortest_path(self._graph, from_object, to_object)
        except nx.NetworkXNoPath:
            return []

        steps: list[JoinStep] = []
        for i in range(len(path) - 1):
            pred, succ = path[i], path[i + 1]
            edge_data = self._graph.edges[(pred, succ)]
            source_object = edge_data.get("source_object", pred)
            if source_object == pred:
                from_cols = edge_data["columns_from"]
                to_cols = edge_data["columns_to"]
                reversed_ = False
            else:
                from_cols = edge_data["columns_to"]
                to_cols = edge_data["columns_from"]
                reversed_ = True
            steps.append(
                JoinStep(
                    from_object=pred,
                    to_object=succ,
                    from_columns=from_cols,
                    to_columns=to_cols,
                    join_type=ASTJoinType.LEFT,
                    cardinality=edge_data["cardinality"],
                    reversed=reversed_,
                )
            )
        return steps

    def build_join_condition(self, step: JoinStep) -> Expr:
        """Build the ON clause expression for a join step.

        Routes both sides through ``make_column_expr`` so a computed
        join key (``expression:`` instead of ``code:`` on the column)
        inlines its template body. Without this, a join on a computed
        key would render ``"obj"."" = "other"."key"`` and the database
        would error on the zero-length identifier.

        Column pairs are compared with ``=`` and chained with ``AND``.
        Raises ``ValueError`` when the step has no join columns (and, via
        ``zip(strict=True)``, when the two column lists differ in length).
        """
        from orionbelt.compiler.resolution import make_column_expr

        # The step's endpoints do not change between column pairs.
        from_obj = self._model.data_objects.get(step.from_object)
        to_obj = self._model.data_objects.get(step.to_object)

        conditions: list[Expr] = []
        for from_c, to_c in zip(step.from_columns, step.to_columns, strict=True):
            if from_obj and from_c in from_obj.columns:
                left_expr: Expr = make_column_expr(self._model, step.from_object, from_c)
            else:
                # Column unknown to the model: fall back to a raw reference.
                left_expr = ColumnRef(name=from_c, table=step.from_object)
            if to_obj and to_c in to_obj.columns:
                right_expr: Expr = make_column_expr(self._model, step.to_object, to_c)
            else:
                right_expr = ColumnRef(name=to_c, table=step.to_object)
            conditions.append(BinaryOp(left=left_expr, op="=", right=right_expr))

        if not conditions:
            msg = f"Join from '{step.from_object}' to '{step.to_object}' has no join columns"
            raise ValueError(msg)
        result: Expr = conditions[0]
        for cond in conditions[1:]:
            result = BinaryOp(left=result, op="AND", right=cond)
        return result

    def detect_cycles(self) -> list[list[str]]:
        """Detect cyclic join paths in the directed graph.

        Returns one node list per simple cycle, or an empty list when
        networkx reports an error.
        """
        try:
            cycles = list(nx.simple_cycles(self._directed))
            return cycles
        except nx.NetworkXError:
            return []

    def validate_deterministic(self) -> list[SemanticError]:
        """Ensure join paths are deterministic (no ambiguity).

        Returns one ``AMBIGUOUS_JOIN`` error per node pair connected by more
        than one edge in the undirected graph.
        """
        errors: list[SemanticError] = []
        # Check for multiple edges between the same pair of nodes
        # NOTE(review): ``_graph`` is a plain ``nx.Graph``, which keeps at
        # most one edge per node pair, so this count presumably never
        # exceeds 1 — confirm whether a multigraph was intended.
        for u, v in self._graph.edges():
            if self._graph.number_of_edges(u, v) > 1:
                errors.append(
                    SemanticError(
                        code="AMBIGUOUS_JOIN",
                        message=f"Multiple join paths between '{u}' and '{v}'",
                        path=f"dataObjects.{u}.joins",
                    )
                )
        return errors

find_join_path(from_objects, to_objects, via_constraints=None)

Find a minimal join path connecting all required data objects.

Uses shortest path for each target object from the set of source objects.

via_constraints maps target → via: for constrained targets, only the via object is used as the source so the path is forced through it.

Source code in src/orionbelt/compiler/graph.py
def find_join_path(
    self,
    from_objects: set[str],
    to_objects: set[str],
    via_constraints: dict[str, str] | None = None,
) -> list[JoinStep]:
    """Find a minimal join path connecting all required data objects.

    Uses shortest path for each target object from the set of source objects.

    *via_constraints* maps ``target → via``: for constrained targets, only
    the ``via`` object is used as the source so the path is forced through it.

    Args:
        from_objects: Objects that are already part of the query.
        to_objects: Objects that must be reachable; those already in
            ``from_objects`` need no join and are ignored.
        via_constraints: Optional ``target → via`` mapping, see above.

    Returns:
        The join steps in the order they were discovered. Each edge is
        emitted at most once, whichever direction it is traversed in.
        Targets for which no path exists contribute no steps.
    """
    steps: list[JoinStep] = []
    visited_edges: set[tuple[str, str]] = set()
    via = via_constraints or {}

    # Process via waypoints first so they are in source_list when their
    # constrained targets are processed.
    all_targets = to_objects - from_objects
    via_targets = {t for t in all_targets if t in via}
    non_via_targets = all_targets - via_targets
    via_waypoints = {via[t] for t in via_targets} - from_objects - via_targets
    ordered_targets = sorted(via_waypoints) + sorted(non_via_targets) + sorted(via_targets)

    # Sorted instead of ``list(from_objects)``: set iteration order depends
    # on string hashing and can change between interpreter runs. Because
    # the first source wins among equally short paths, an unsorted list
    # could make the generated joins differ from run to run.
    source_list = sorted(from_objects)

    for target in ordered_targets:
        best_path: list[str] | None = None
        sources = [via[target]] if target in via and via[target] in source_list else source_list
        for source in sources:
            try:
                path = nx.shortest_path(self._traversable, source, target)
                # Strict ``<`` keeps the earliest source on ties.
                if best_path is None or len(path) < len(best_path):
                    best_path = path
            except nx.NetworkXNoPath:
                continue

        if best_path is None:
            continue

        for i in range(len(best_path) - 1):
            edge = (best_path[i], best_path[i + 1])
            rev_edge = (best_path[i + 1], best_path[i])
            if edge in visited_edges or rev_edge in visited_edges:
                continue
            visited_edges.add(edge)

            edge_data = self._graph.edges[edge]
            source_object = edge_data.get("source_object", edge[0])

            if source_object == edge[0]:
                step = JoinStep(
                    from_object=edge[0],
                    to_object=edge[1],
                    from_columns=edge_data["columns_from"],
                    to_columns=edge_data["columns_to"],
                    join_type=ASTJoinType.LEFT,
                    cardinality=edge_data["cardinality"],
                )
            else:
                # Path traverses edge in reverse direction.
                # from_object/to_object are swapped, so columns must be
                # swapped too to keep the ON clause correctly oriented.
                step = JoinStep(
                    from_object=edge[1],
                    to_object=edge[0],
                    from_columns=edge_data["columns_to"],
                    to_columns=edge_data["columns_from"],
                    join_type=ASTJoinType.LEFT,
                    cardinality=edge_data["cardinality"],
                    reversed=True,
                )
            steps.append(step)

        # Add target to sources for subsequent lookups
        if target not in source_list:
            source_list.append(target)

    return steps

build_join_condition(step)

Build the ON clause expression for a join step.

Routes both sides through make_column_expr so a computed join key (expression: instead of code: on the column) inlines its template body. Without this, a join on a computed key would render "obj"."" = "other"."key" and the database would error on the zero-length identifier.

Source code in src/orionbelt/compiler/graph.py
def build_join_condition(self, step: JoinStep) -> Expr:
    """Assemble the ON expression for one join step.

    Every ``from``/``to`` column pair becomes an equality; several pairs
    are chained with ``AND`` from left to right. Columns that the model
    knows are resolved through ``make_column_expr`` so that a computed
    join key (``expression:`` instead of ``code:``) inlines its template
    body; otherwise ``"obj"."" = "other"."key"`` would be rendered and
    the database would reject the zero-length identifier. Unknown
    objects or columns fall back to a plain ``ColumnRef``.

    Raises:
        ValueError: If the step has no join columns, or (from ``zip``
            with ``strict=True``) if the two column lists differ in length.
    """
    from orionbelt.compiler.resolution import make_column_expr

    def side(object_name: str, column: str) -> Expr:
        # One operand of an equality: model-aware when possible.
        data_object = self._model.data_objects.get(object_name)
        if data_object and column in data_object.columns:
            return make_column_expr(self._model, object_name, column)
        return ColumnRef(name=column, table=object_name)

    equalities: list[Expr] = [
        BinaryOp(
            left=side(step.from_object, from_c),
            op="=",
            right=side(step.to_object, to_c),
        )
        for from_c, to_c in zip(step.from_columns, step.to_columns, strict=True)
    ]

    if not equalities:
        msg = f"Join from '{step.from_object}' to '{step.to_object}' has no join columns"
        raise ValueError(msg)

    combined, *remaining = equalities
    for equality in remaining:
        combined = BinaryOp(left=combined, op="AND", right=equality)
    return combined

detect_cycles()

Detect cyclic join paths.

Source code in src/orionbelt/compiler/graph.py
def detect_cycles(self) -> list[list[str]]:
    """Return every simple cycle found in the directed join graph.

    Each cycle is the list of nodes produced by ``nx.simple_cycles``.
    A ``NetworkXError`` raised while enumerating is treated as
    "no cycles" and an empty list is returned.
    """
    try:
        # Materialise inside the ``try``: the generator only raises
        # while it is being consumed.
        return list(nx.simple_cycles(self._directed))
    except nx.NetworkXError:
        return []

Code Generator

orionbelt.compiler.codegen.CodeGenerator

Generates SQL from AST using a dialect.

Source code in src/orionbelt/compiler/codegen.py
class CodeGenerator:
    """Turn a SQL AST into SQL text by delegating to a single dialect."""

    def __init__(self, dialect: Dialect) -> None:
        # The dialect is stored once and used for every ``generate`` call.
        self._dialect = dialect

    @property
    def dialect(self) -> Dialect:
        """The dialect this generator compiles with."""
        return self._dialect

    def generate(self, ast: Select) -> str:
        """Compile ``ast`` with the configured dialect and return the SQL."""
        sql = self._dialect.compile(ast)
        return sql

generate(ast)

Generate SQL string from AST using the configured dialect.

Source code in src/orionbelt/compiler/codegen.py
def generate(self, ast: Select) -> str:
    """Compile ``ast`` with the configured dialect and return the SQL."""
    sql = self._dialect.compile(ast)
    return sql

Dialect Base

orionbelt.dialect.base.Dialect

Bases: ABC

Abstract base for all SQL dialects.

Provides default SQL compilation; dialects override specific methods.

Source code in src/orionbelt/dialect/base.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
class Dialect(ABC):
    """Abstract base for all SQL dialects.

    Provides default SQL compilation; dialects override specific methods.
    """

    _ABSTRACT_TYPE_MAP: dict[str, str] = {
        "string": "VARCHAR",
        "json": "VARCHAR",
        "int": "INTEGER",
        "float": "FLOAT",
        "date": "DATE",
        "time": "TIME",
        "time_tz": "TIME",
        "timestamp": "TIMESTAMP",
        "timestamp_tz": "TIMESTAMP",
        "boolean": "BOOLEAN",
    }

    _MAX_DECIMAL_PRECISION: int = 38

    _OBML_SIMPLE_TYPE_MAP: dict[str, str] = {
        "bigint": "BIGINT",
        "integer": "INTEGER",
        "double": "DOUBLE",
        "date": "DATE",
        "timestamp": "TIMESTAMP",
        "time": "TIME",
        "string": "VARCHAR",
        "boolean": "BOOLEAN",
    }

    def render_obml_type(self, obml_type: OBMLType) -> str:
        """Return the SQL type string for an OBML type.

        Non-decimal types are looked up by name in
        ``_OBML_SIMPLE_TYPE_MAP``; names missing from the map are emitted
        upper-cased. Decimal types render as ``DECIMAL(p, s)`` with the
        precision capped at ``_MAX_DECIMAL_PRECISION`` and the scale
        capped at the (already capped) precision.
        """
        if not isinstance(obml_type, DecimalType):
            type_name = obml_type.name
            return self._OBML_SIMPLE_TYPE_MAP.get(type_name, type_name.upper())

        precision = min(obml_type.precision, self._MAX_DECIMAL_PRECISION)
        scale = min(obml_type.scale, precision)
        return f"DECIMAL({precision}, {scale})"

    def cast_to_obml_type(self, expr: Expr, obml_type: OBMLType) -> Expr:
        """Wrap ``expr`` so that it is coerced to the given OBML type.

        The default is a plain ``CAST(expr AS <type>)`` whose type name
        comes from ``render_obml_type``. Dialects whose ``CAST`` rejects
        a parameterized decimal (notably BigQuery — "Parameterized types
        are not allowed in CAST expressions") can override this to wrap
        the cast with a ROUND that honours the user-specified scale.
        """
        sql_type = self.render_obml_type(obml_type)
        return Cast(expr=expr, type_name=sql_type)

    def _resolve_type_name(self, type_name: str) -> str:
        """Map an abstract type name to a dialect-specific SQL type.

        Looks up ``_ABSTRACT_TYPE_MAP`` first; if *type_name* is not found
        (e.g. already a concrete SQL type like ``VARCHAR``), returns it as-is.
        """
        return self._ABSTRACT_TYPE_MAP.get(type_name, type_name)

    def format_table_ref(self, database: str, schema: str, code: str) -> str:
        """Format a fully-qualified table reference.

        Default: three-part ``database.schema.code`` (Snowflake/Databricks/Dremio).
        Postgres and ClickHouse override to two-part naming.
        All components are quoted to prevent SQL injection.
        """
        return (
            f"{self.quote_identifier(database)}"
            f".{self.quote_identifier(schema)}"
            f".{self.quote_identifier(code)}"
        )

    # Identifier of the dialect. Within this class it is passed to
    # ``UnsupportedAggregationError`` when an aggregation is rejected.
    @property
    @abstractmethod
    def name(self) -> str: ...

    # Feature description of the dialect. This class reads
    # ``unsupported_aggregations`` (aggregation gate) and
    # ``supports_group_by_all`` (GROUP BY rendering) from it.
    @property
    @abstractmethod
    def capabilities(self) -> DialectCapabilities: ...

    @abstractmethod
    def quote_identifier(self, name: str) -> str:
        """Quote an identifier per dialect rules.

        The base compiler routes table parts, column names, aliases and
        CTE names through this hook (see ``format_table_ref``,
        ``compile_select``, ``compile_from``, ``compile_join`` and
        ``compile_expr``).
        """

    @abstractmethod
    def render_time_grain(self, column: Expr, grain: TimeGrain) -> Expr:
        """Wrap a column expression for the given time grain.

        Returns an AST expression, not a SQL string.
        """

    @abstractmethod
    def render_cast(self, expr: Expr, target_type: str) -> Expr:
        """Render a CAST expression.

        Returns an AST expression, not a SQL string.
        NOTE(review): ``Cast`` nodes are turned into text by
        ``_compile_cast``; how this hook relates to it is not visible
        here — confirm against the concrete dialects.
        """

    @abstractmethod
    def current_date_sql(self) -> str:
        """Return SQL for the current date."""

    @abstractmethod
    def date_add_sql(self, date_sql: str, unit: str, count: int) -> str:
        """Return SQL that adds count units to date_sql.

        ``date_sql`` is SQL text, not an AST node.
        NOTE(review): ``unit`` is presumably a grain name such as
        ``day`` — confirm the accepted values against the callers.
        """

    @abstractmethod
    def render_date_trunc_sql(self, column_sql: str, grain: str) -> str:
        """Return SQL string that truncates a date/timestamp to the given grain.

        String-level helper (not AST) for use in raw SQL CTEs like date_range.
        ``column_sql`` is the SQL text of the value to truncate.
        """

    @abstractmethod
    def render_date_spine_cte_sql(
        self,
        min_date: str,
        max_date: str,
        grain: str,
        offset: int,
        offset_grain: str,
    ) -> str:
        """Return the SQL body for a date spine CTE.

        Must produce two columns: ``spine_date`` and ``spine_date_prev``.
        ``spine_date_prev`` is NULL when the offset date falls before min_date.

        Parameters
        ----------
        min_date : str
            SQL expression referencing the minimum date (e.g. ``date_range.min_date``).
        max_date : str
            SQL expression referencing the maximum date.
        grain : str
            Time grain string: ``day``, ``week``, ``month``, ``quarter``, ``year``.
        offset : int
            Signed period offset (e.g. ``-1`` for previous period).
        offset_grain : str
            Grain of the offset (e.g. ``year`` for YoY).

        Returns
        -------
        str
            SQL text of the CTE body (string-level, not an AST node).
        """

    def render_string_contains(self, column: Expr, pattern: Expr) -> Expr:
        """Build the default "contains" test: column LIKE '%' || pattern || '%'."""
        # Concatenation is nested left to right: ('%' || pattern) || '%'.
        leading = BinaryOp(left=Literal.string("%"), op="||", right=pattern)
        wildcarded = BinaryOp(left=leading, op="||", right=Literal.string("%"))
        return BinaryOp(left=column, op="LIKE", right=wildcarded)

    def _map_function_name(self, name: str) -> str:
        """Map a function name to the dialect-specific equivalent.

        Override in subclasses to remap names (e.g. ANY_VALUE → any in ClickHouse).
        """
        return name

    def _check_aggregation_supported(self, name: str) -> None:
        """Raise ``UnsupportedAggregationError`` when the dialect doesn't support
        the given aggregation. Matches case-insensitively against
        ``capabilities.unsupported_aggregations`` (lowercase OBML names).

        Existing per-function compile overrides (``_compile_mode``,
        ``_compile_median``) still raise directly — this generic gate is a
        catch-all for purely-standard aggregations like ``REGR_SLOPE`` where
        no special compile path exists.
        """
        if name.lower() in {a.lower() for a in self.capabilities.unsupported_aggregations}:
            raise UnsupportedAggregationError(self.name, name.lower())

    def _compile_median(self, args: list[Expr]) -> str:
        """Compile MEDIAN — default uses MEDIAN(col).

        Works for Snowflake, ClickHouse, Databricks, and Dremio. Postgres overrides.
        """
        col_sql = self.compile_expr(args[0]) if args else "NULL"
        return f"MEDIAN({col_sql})"

    def _compile_mode(self, args: list[Expr]) -> str:
        """Compile MODE — default uses MODE(col).

        Works for Snowflake and Databricks. Postgres, ClickHouse, and Dremio override.
        """
        col_sql = self.compile_expr(args[0]) if args else "NULL"
        return f"MODE({col_sql})"

    def _compile_listagg(
        self,
        args: list[Expr],
        distinct: bool,
        order_by: list[OrderByItem],
        separator: str | None,
    ) -> str:
        """Compile LISTAGG — default uses LISTAGG(col, sep) WITHIN GROUP (ORDER BY ...).

        Works for Snowflake and Dremio. Postgres, ClickHouse, and Databricks override.
        """
        sep = separator if separator is not None else ","
        col_sql = self.compile_expr(args[0]) if args else "''"
        distinct_sql = "DISTINCT " if distinct else ""
        escaped_sep = sep.replace("'", "''")
        result = f"LISTAGG({distinct_sql}{col_sql}, '{escaped_sep}')"
        if order_by:
            ob = ", ".join(self.compile_order_by(o) for o in order_by)
            result += f" WITHIN GROUP (ORDER BY {ob})"
        return result

    def _compile_cast(self, inner: Expr, type_name: str) -> str:
        """Render ``CAST(expr AS type)``. Dialects override to handle nullability."""
        resolved_type = self._resolve_type_name(type_name)
        return f"CAST({self.compile_expr(inner)} AS {resolved_type})"

    # SQL operator precedence (higher = binds tighter). Used by the
    # precedence-aware emitter in ``compile_expr`` to skip wrapping a
    # child whose precedence is higher than its parent's required level.
    # Pre-v2.7.4 the emitter wrapped *every* operator unconditionally,
    # producing deeply-nested unreadable SQL (issue #79).
    _CLAUSE_ROOT_PREC = 0  # no surrounding context → no wrap
    _PREC_OR = 1
    _PREC_AND = 2
    _PREC_NOT = 3
    _PREC_CMP = 4  # =, <>, <, <=, >, >=, IS NULL, IN, BETWEEN, LIKE
    _PREC_ADD = 5  # +, -, ||
    _PREC_MUL = 6  # *, /, %
    _PREC_UNARY = 7  # unary -, +
    _PREC_ATOM = 100  # literals, column refs, function calls, CAST(...), CASE...END

    @staticmethod
    def _wrap_if_lower(sql: str, self_prec: int, parent_prec: int) -> str:
        """Wrap ``sql`` in ``(...)`` only when it would bind weaker than
        its parent — i.e. its precedence is strictly less than the
        parent's required level. ``parent_prec = 0`` (clause root) is
        always satisfied so the outermost expression never gets a
        redundant outer wrap.
        """
        if self_prec < parent_prec:
            return f"({sql})"
        return sql

    @classmethod
    def _binary_op_precedence(cls, op: str) -> int:
        """Return the precedence of a ``BinaryOp.op`` value."""
        up = op.upper().strip()
        if up == "OR":
            return cls._PREC_OR
        if up == "AND":
            return cls._PREC_AND
        if up in ("=", "<>", "!=", "<", "<=", ">", ">=", "LIKE", "NOT LIKE"):
            return cls._PREC_CMP
        if up in ("+", "-", "||"):
            return cls._PREC_ADD
        if up in ("*", "/", "%"):
            return cls._PREC_MUL
        # Unknown operator — wrap defensively (treat as lowest precedence).
        return cls._CLAUSE_ROOT_PREC

    # Non-associative operators — children at the same precedence must
    # be wrapped on BOTH sides. SQL forbids chained comparisons
    # (``a >= b = c`` is a syntax error in every dialect we support),
    # subtraction and division are left-associative but ``a - (b - c)``
    # differs from ``a - b - c``, so the right operand is wrapped at
    # equal precedence — see the left-associative branch below.
    _NON_ASSOCIATIVE_OPS: frozenset[str] = frozenset(
        {"=", "<>", "!=", "<", "<=", ">", ">=", "LIKE", "NOT LIKE"}
    )

    def _compile_binary_op(self, left: Expr, op: str, right: Expr) -> str:
        """Render an infix binary expression *without* an outer wrap.

        The dispatcher in ``compile_expr`` decides whether to add an outer
        ``(...)`` wrap based on the parent's precedence. Dialects override
        to widen operand precision (e.g. ClickHouse decimal division) or
        special-case operators that don't translate one-to-one (e.g. MySQL
        string concat).
        """
        self_prec = self._binary_op_precedence(op)
        # Comparison + LIKE forbid chaining — wrap any equal-precedence
        # child on either side. Other ops are left-associative: left at
        # self_prec, right at self_prec + 1 so ``a - (b - c)`` keeps its
        # required parens.
        op_upper = op.upper().strip()
        if op_upper in self._NON_ASSOCIATIVE_OPS:
            left_sql = self.compile_expr(left, _parent_prec=self_prec + 1)
            right_sql = self.compile_expr(right, _parent_prec=self_prec + 1)
        else:
            left_sql = self.compile_expr(left, _parent_prec=self_prec)
            right_sql = self.compile_expr(right, _parent_prec=self_prec + 1)
        return f"{left_sql} {op} {right_sql}"

    def render_decimal_division_sql(self, left_sql: str, right_sql: str) -> str:
        """Render ``left / right`` for decimal-typed operands, given raw SQL.

        Used by code paths that build division as string SQL (e.g. PoP
        comparison CTEs) rather than as ``BinaryOp`` AST nodes. Default
        is plain SQL division; ClickHouse overrides to widen both sides
        to ``Decimal(38, 10)`` first so ratio precision survives.
        """
        return f"{left_sql} / {right_sql}"

    def _compile_multi_field_count(self, args: list[Expr], distinct: bool) -> str:
        """Compile COUNT with multiple fields by concatenating with ``||``.

        Default (non-Snowflake) strategy: cast each field to VARCHAR and
        join with ``'|'`` separator so the database sees a single expression.
        Snowflake overrides this to emit native ``COUNT(col1, col2)``.
        """
        parts = [f"CAST({self.compile_expr(a)} AS VARCHAR)" for a in args]
        concat = " || '|' || ".join(parts)
        if distinct:
            return f"COUNT(DISTINCT {concat})"
        return f"COUNT({concat})"

    def compile(self, ast: Select) -> str:
        """Render a complete SQL AST to a dialect-specific string."""
        return self.compile_select(ast)

    def compile_select(self, node: Select) -> str:
        """Compile a SELECT statement."""
        parts: list[str] = []

        # CTEs
        if node.ctes:
            cte_parts = []
            for cte in node.ctes:
                if isinstance(cte.query, RawSQL):
                    cte_sql = cte.query.sql
                elif isinstance(cte.query, UnionAll):
                    cte_sql = self.compile_union_all(cte.query)
                elif isinstance(cte.query, Except):
                    cte_sql = self.compile_except(cte.query)
                else:
                    cte_sql = self.compile_select(cte.query)
                cte_parts.append(f"{self.quote_identifier(cte.name)} AS (\n{cte_sql}\n)")
            parts.append("WITH " + ",\n".join(cte_parts))

        # SELECT
        keyword = "SELECT DISTINCT" if node.distinct else "SELECT"
        if node.columns:
            cols = ", ".join(self.compile_expr(c) for c in node.columns)
            parts.append(f"{keyword} {cols}")
        else:
            parts.append(f"{keyword} *")

        # FROM
        if node.from_:
            parts.append(f"FROM {self.compile_from(node.from_)}")

        # JOINs
        for join in node.joins:
            parts.append(self.compile_join(join))

        # WHERE
        if node.where:
            parts.append(f"WHERE {self.compile_expr(node.where)}")

        # GROUP BY
        if node.group_by:
            parts.append(self.compile_group_by(node.group_by, node.grouping))

        # HAVING
        if node.having:
            parts.append(f"HAVING {self.compile_expr(node.having)}")

        # ORDER BY
        if node.order_by:
            orders = ", ".join(self.compile_order_by(o) for o in node.order_by)
            parts.append(f"ORDER BY {orders}")

        # LIMIT
        if node.limit is not None:
            parts.append(f"LIMIT {node.limit}")

        # OFFSET
        if node.offset is not None:
            parts.append(f"OFFSET {node.offset}")

        return "\n".join(parts)

    def compile_group_by(self, group_by: list[Expr], grouping: str | None) -> str:
        """Render the GROUP BY clause.

        Default ANSI form (Postgres, Snowflake, DuckDB, BigQuery, Databricks,
        Dremio, MySQL): ``GROUP BY ROLLUP(a, b)`` / ``GROUP BY CUBE(a, b)``.
        ClickHouse overrides to the trailing-modifier form
        (``GROUP BY a, b WITH ROLLUP``).

        When ``capabilities.supports_group_by_all`` is set and no grouping
        modifier is requested, emits ``GROUP BY ALL`` — the engine
        auto-derives the grouping list from the SELECT. Equivalent SQL
        with a much shorter and more idiomatic form on modern OLAP
        engines, especially for queries with computed dimensions.
        """
        if grouping == "rollup":
            groups = ", ".join(self.compile_expr(g) for g in group_by)
            return f"GROUP BY ROLLUP({groups})"
        if grouping == "cube":
            groups = ", ".join(self.compile_expr(g) for g in group_by)
            return f"GROUP BY CUBE({groups})"
        if self.capabilities.supports_group_by_all:
            return "GROUP BY ALL"
        groups = ", ".join(self.compile_expr(g) for g in group_by)
        return f"GROUP BY {groups}"

    def compile_from(self, node: From) -> str:
        """Render the source of a FROM clause (without the ``FROM`` keyword).

        A ``Select`` source becomes a parenthesised subquery; any other
        source goes through ``_render_source_string``. A non-empty alias
        is appended as ``AS <quoted alias>``.
        """
        source = node.source
        if isinstance(source, Select):
            rendered = f"(\n{self.compile_select(source)}\n)"
        else:
            rendered = self._render_source_string(source)
        if not node.alias:
            return rendered
        return f"{rendered} AS {self.quote_identifier(node.alias)}"

    def compile_join(self, node: Join) -> str:
        """Render one JOIN clause: ``<type> JOIN <source> [AS alias] [ON cond]``.

        A ``Select`` source becomes a parenthesised subquery; any other
        source goes through ``_render_source_string``. The ``ON`` part is
        emitted only when the join has a condition.
        """
        target = node.source
        if isinstance(target, Select):
            target_sql = f"(\n{self.compile_select(target)}\n)"
        else:
            target_sql = self._render_source_string(target)
        if node.alias:
            target_sql = f"{target_sql} AS {self.quote_identifier(node.alias)}"

        clause = f"{node.join_type.value} JOIN {target_sql}"
        if not node.on:
            return clause
        return f"{clause} ON {self.compile_expr(node.on)}"

    def _render_source_string(self, source: str) -> str:
        """Render a ``From``/``Join`` string source.

        Wrap modules emit bare CTE names (e.g. ``base``); the star/CFL
        planners emit pre-quoted qualified table strings (e.g.
        ``"DB"."SCHEMA"."TABLE"``). Quote the former so case-sensitive
        dialects like Snowflake match the CTE declaration; pass the latter
        through unchanged.
        """
        if source.isidentifier():
            return self.quote_identifier(source)
        return source

    def compile_order_by(self, node: OrderByItem) -> str:
        result = self.compile_expr(node.expr)
        if node.desc:
            result += " DESC"
        else:
            result += " ASC"
        if node.nulls_last is True:
            result += " NULLS LAST"
        elif node.nulls_last is False:
            result += " NULLS FIRST"
        return result

    def compile_union_all(self, node: UnionAll) -> str:
        """Compile a UNION ALL of multiple SELECT statements."""
        return "\nUNION ALL\n".join(self.compile_select(q) for q in node.queries)

    def compile_except(self, node: Except) -> str:
        """Compile an EXCEPT of two SELECT statements."""
        return self.compile_select(node.left) + "\nEXCEPT\n" + self.compile_select(node.right)

    def compile_expr(self, expr: Expr, _parent_prec: int = 0) -> str:
        """Compile an expression node to SQL string.

        ``_parent_prec`` is the precedence of the surrounding operator
        (or ``_CLAUSE_ROOT_PREC = 0`` when called at the root of a SELECT
        projection, ON / WHERE / HAVING clause, GROUP BY / ORDER BY item,
        or function argument). Each operator branch wraps its own SQL in
        ``(...)`` only when its precedence is strictly less than the
        parent's required level; atoms (literals, column refs, function
        calls, CAST, CASE) are at ``_PREC_ATOM`` and never wrap.

        Pre-v2.7.4 every ``BinaryOp`` / ``IsNull`` / ``InList`` /
        ``Between`` / ``UnaryOp`` wrapped itself unconditionally,
        producing deeply-nested unreadable SQL — issue #79.
        """
        match expr:
            case Literal(value=None):
                return "NULL"
            case Literal(value=True):
                return "TRUE"
            case Literal(value=False):
                return "FALSE"
            case Literal(value=v) if isinstance(v, str):
                escaped = v.replace("'", "''")
                return f"'{escaped}'"
            case Literal(value=v):
                return str(v)
            case Star(table=None):
                return "*"
            case Star(table=t) if t is not None:
                return f"{self.quote_identifier(t)}.*"
            case ColumnRef(name=name, table=None):
                return self.quote_identifier(name)
            case ColumnRef(name=name, table=table) if table is not None:
                return f"{self.quote_identifier(table)}.{self.quote_identifier(name)}"
            case AliasedExpr(expr=inner, alias=alias):
                return f"{self.compile_expr(inner)} AS {self.quote_identifier(alias)}"
            case FunctionCall(
                name=fname,
                args=args,
                distinct=distinct,
                order_by=order_by,
                separator=separator,
            ):
                # Reject aggregations explicitly listed as unsupported by the dialect.
                # Per-function overrides (_compile_mode etc.) still apply for cases
                # that have a special compile path; this catches plain aggregates
                # like REGR_SLOPE that have no override.
                self._check_aggregation_supported(fname)
                # LISTAGG: dialect-specific rendering
                if fname.upper() == "LISTAGG":
                    return self._compile_listagg(args, distinct, order_by, separator)
                # MODE: dialect-specific rendering
                if fname.upper() == "MODE":
                    return self._compile_mode(args)
                # MEDIAN: dialect-specific rendering
                if fname.upper() == "MEDIAN":
                    return self._compile_median(args)
                # Multi-field COUNT: concatenate fields for portability
                # (Snowflake overrides to use native multi-arg syntax)
                if fname.upper() == "COUNT" and len(args) > 1:
                    return self._compile_multi_field_count(args, distinct)
                fname = self._map_function_name(fname)
                args_sql = ", ".join(self.compile_expr(a) for a in args)
                if distinct:
                    return f"{fname}(DISTINCT {args_sql})"
                return f"{fname}({args_sql})"
            case BinaryOp(left=left, op=op, right=right):
                self_prec = self._binary_op_precedence(op)
                sql = self._compile_binary_op(left, op, right)
                return self._wrap_if_lower(sql, self_prec, _parent_prec)
            case UnaryOp(op=op, operand=operand):
                self_prec = self._PREC_NOT if op.upper() == "NOT" else self._PREC_UNARY
                sql = f"{op} {self.compile_expr(operand, _parent_prec=self_prec)}"
                return self._wrap_if_lower(sql, self_prec, _parent_prec)
            case IsNull(expr=inner, negated=False):
                sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} IS NULL"
                return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
            case IsNull(expr=inner, negated=True):
                sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} IS NOT NULL"
                return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
            case InList(expr=inner, values=values, negated=negated):
                vals = ", ".join(self.compile_expr(v) for v in values)
                op = "NOT IN" if negated else "IN"
                sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} {op} ({vals})"
                return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
            case CaseExpr(when_clauses=whens, else_clause=else_):
                parts = ["CASE"]
                for when_cond, then_val in whens:
                    parts.append(
                        f"WHEN {self.compile_expr(when_cond)} THEN {self.compile_expr(then_val)}"
                    )
                if else_ is not None:
                    parts.append(f"ELSE {self.compile_expr(else_)}")
                parts.append("END")
                return " ".join(parts)
            case Cast(expr=inner, type_name=type_name):
                return self._compile_cast(inner, type_name)
            case SubqueryExpr(query=query):
                return f"(\n{self.compile_select(query)}\n)"
            case Exists(subquery=subq, negated=False):
                return f"EXISTS (\n{self.compile_select(subq)}\n)"
            case Exists(subquery=subq, negated=True):
                return f"NOT EXISTS (\n{self.compile_select(subq)}\n)"
            case RawSQL(sql=sql):
                return sql
            case Between(expr=inner, low=low, high=high, negated=negated):
                op = "NOT BETWEEN" if negated else "BETWEEN"
                inner_sql = self.compile_expr(inner, _parent_prec=self._PREC_CMP)
                low_sql = self.compile_expr(low, _parent_prec=self._PREC_CMP)
                high_sql = self.compile_expr(high, _parent_prec=self._PREC_CMP)
                sql = f"{inner_sql} {op} {low_sql} AND {high_sql}"
                return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
            case RegexMatch(column=column, pattern=pattern, negated=negated):
                return self.compile_regex_match(column, pattern, negated=negated)
            case RelativeDateRange(
                column=column,
                unit=unit,
                count=count,
                direction=direction,
                include_current=include_current,
            ):
                return self.compile_relative_date_range(
                    column=column,
                    unit=unit,
                    count=count,
                    direction=direction,
                    include_current=include_current,
                )
            case WindowFunction(
                func_name=fname,
                args=args,
                partition_by=partition_by,
                order_by=order_by,
                frame=frame,
                distinct=distinct,
            ):
                args_sql = ", ".join(self.compile_expr(a) for a in args)
                func_sql = f"{fname}(DISTINCT {args_sql})" if distinct else f"{fname}({args_sql})"
                over_parts: list[str] = []
                if partition_by:
                    pb = ", ".join(self.compile_expr(p) for p in partition_by)
                    over_parts.append(f"PARTITION BY {pb}")
                if order_by:
                    ob = ", ".join(self.compile_order_by(o) for o in order_by)
                    over_parts.append(f"ORDER BY {ob}")
                if frame is not None:
                    over_parts.append(f"{frame.mode} BETWEEN {frame.start} AND {frame.end}")
                over_clause = " ".join(over_parts)
                return f"{func_sql} OVER ({over_clause})"
            case _:
                raise ValueError(f"Unknown AST node type: {type(expr).__name__}")

    def compile_regex_match(self, column: Expr, pattern: str, *, negated: bool) -> str:
        """Render a regular-expression predicate as SQL text.

        The base implementation emits ``REGEXP_LIKE(<column>, <pattern>)``
        and prefixes it with ``NOT`` when ``negated`` is true. Dialects
        with a different regex syntax (Postgres ``~``, MySQL ``REGEXP``,
        ClickHouse ``match`` etc.) override this method.

        ``pattern`` arrives as a plain Python ``str`` (callers pass
        ``RegexMatch.pattern``); it is wrapped with ``Literal.string`` and
        compiled like any other string literal.
        """
        column_sql = self.compile_expr(column)
        pattern_sql = self.compile_expr(Literal.string(pattern))
        predicate = f"REGEXP_LIKE({column_sql}, {pattern_sql})"
        if negated:
            return f"NOT {predicate}"
        return predicate

    def compile_relative_date_range(
        self,
        column: Expr,
        unit: str,
        count: int,
        direction: str,
        include_current: bool,
    ) -> str:
        """Render a relative date window as a half-open range predicate.

        The result has the shape ``(col >= lower AND col < upper)``. Both
        bounds are derived from ``current_date_sql()`` with ``date_add_sql``:

        * ``direction == "future"``: the window starts today (or tomorrow
          when ``include_current`` is false) and spans ``count`` units
          forward.
        * any other ``direction``: the window ends tomorrow (or today when
          ``include_current`` is false) and spans ``count`` units backward.
        """
        column_sql = self.compile_expr(column)
        today = self.current_date_sql()

        if direction == "future":
            if include_current:
                lower = today
            else:
                lower = self.date_add_sql(today, "day", 1)
            upper = self.date_add_sql(lower, unit, count)
        else:
            if include_current:
                # The upper bound is exclusive, so "including today" means
                # the range must end at tomorrow.
                upper = self.date_add_sql(today, "day", 1)
            else:
                upper = today
            lower = self.date_add_sql(upper, unit, -count)

        return f"({column_sql} >= {lower} AND {column_sql} < {upper})"

render_obml_type(obml_type)

Render an OBMLType to a dialect-specific SQL type string.

Handles precision clamping for decimal types.

Source code in src/orionbelt/dialect/base.py
def render_obml_type(self, obml_type: OBMLType) -> str:
    """Translate an OBML type into this dialect's SQL type name.

    Non-decimal types are looked up by name in ``_OBML_SIMPLE_TYPE_MAP``;
    names missing from the map fall back to the upper-cased OBML name.

    Decimal types render as ``DECIMAL(p, s)`` where the precision is
    capped at ``_MAX_DECIMAL_PRECISION`` and the scale is capped at the
    (possibly reduced) precision.
    """
    if not isinstance(obml_type, DecimalType):
        type_key = obml_type.name
        return self._OBML_SIMPLE_TYPE_MAP.get(type_key, type_key.upper())

    precision = min(obml_type.precision, self._MAX_DECIMAL_PRECISION)
    scale = min(obml_type.scale, precision)
    return f"DECIMAL({precision}, {scale})"

cast_to_obml_type(expr, obml_type)

Build an Expr that coerces expr to the given OBML type.

Default form is a plain CAST(expr AS <type>). Dialects whose CAST doesn't accept a parameterized decimal (notably BigQuery — "Parameterized types are not allowed in CAST expressions") can override to wrap the cast with a ROUND to honour the user-specified scale.

Source code in src/orionbelt/dialect/base.py
def cast_to_obml_type(self, expr: Expr, obml_type: OBMLType) -> Expr:
    """Return an AST node that coerces ``expr`` to ``obml_type``.

    The base behaviour is a plain ``Cast`` whose target type string comes
    from ``render_obml_type``. Dialects whose ``CAST`` rejects a
    parameterized decimal (notably BigQuery: "Parameterized types are not
    allowed in CAST expressions") can override this hook, for example to
    wrap the cast in a ROUND that honours the requested scale.
    """
    sql_type = self.render_obml_type(obml_type)
    return Cast(expr=expr, type_name=sql_type)

format_table_ref(database, schema, code)

Format a fully-qualified table reference.

Default: three-part database.schema.code (Snowflake/Databricks/Dremio). Postgres and ClickHouse override to two-part naming. All components are quoted to prevent SQL injection.

Source code in src/orionbelt/dialect/base.py
def format_table_ref(self, database: str, schema: str, code: str) -> str:
    """Build a dot-separated, fully-qualified table reference.

    The base form is three-part ``database.schema.code``
    (Snowflake/Databricks/Dremio); Postgres and ClickHouse override it
    with two-part naming. Each component is passed through
    ``quote_identifier`` to prevent SQL injection.
    """
    database_sql = self.quote_identifier(database)
    schema_sql = self.quote_identifier(schema)
    table_sql = self.quote_identifier(code)
    return f"{database_sql}.{schema_sql}.{table_sql}"

quote_identifier(name) abstractmethod

Quote an identifier per dialect rules.

Source code in src/orionbelt/dialect/base.py
@abstractmethod
def quote_identifier(self, name: str) -> str:
    """Quote an identifier per dialect rules.

    Parameters
    ----------
    name : str
        Raw identifier. Callers in this module pass table, column, CTE
        and alias names.

    Returns
    -------
    str
        The identifier in the form it should take inside generated SQL.
        Callers splice the result directly into SQL text (see
        ``format_table_ref``, ``compile_select`` and ``compile_expr``).
    """

render_time_grain(column, grain) abstractmethod

Wrap a column expression for the given time grain.

Source code in src/orionbelt/dialect/base.py
@abstractmethod
def render_time_grain(self, column: Expr, grain: TimeGrain) -> Expr:
    """Wrap a column expression for the given time grain.

    Parameters
    ----------
    column : Expr
        Expression to wrap.
    grain : TimeGrain
        Target time grain.

    Returns
    -------
    Expr
        An AST node (not a SQL string). For the string-level counterpart
        see ``render_date_trunc_sql``.
    """

render_cast(expr, target_type) abstractmethod

Render a CAST expression.

Source code in src/orionbelt/dialect/base.py
@abstractmethod
def render_cast(self, expr: Expr, target_type: str) -> Expr:
    """Render a CAST expression.

    Parameters
    ----------
    expr : Expr
        Expression to cast.
    target_type : str
        Target type name.

    Returns
    -------
    Expr
        An AST node (not a SQL string).

    NOTE(review): ``compile_expr`` routes ``Cast`` nodes through
    ``_compile_cast``; how that helper relates to this hook is not
    visible from here -- confirm before relying on either.
    """

current_date_sql() abstractmethod

Return SQL for the current date.

Source code in src/orionbelt/dialect/base.py
@abstractmethod
def current_date_sql(self) -> str:
    """Return SQL for the current date.

    Returns
    -------
    str
        A SQL fragment. ``compile_relative_date_range`` uses it as the
        anchor for both range bounds and feeds it to ``date_add_sql``.
    """

date_add_sql(date_sql, unit, count) abstractmethod

Return SQL that adds count units to date_sql.

Source code in src/orionbelt/dialect/base.py
@abstractmethod
def date_add_sql(self, date_sql: str, unit: str, count: int) -> str:
    """Return SQL that adds count units to date_sql.

    Parameters
    ----------
    date_sql : str
        SQL fragment for the date to shift. It may itself be the output
        of a previous ``date_add_sql`` call
        (``compile_relative_date_range`` nests calls this way).
    unit : str
        Unit name. ``compile_relative_date_range`` passes ``"day"`` as
        well as the caller-supplied unit.
    count : int
        Number of units to add. It can be negative:
        ``compile_relative_date_range`` passes ``-count`` for ranges that
        look backward.

    Returns
    -------
    str
        SQL fragment for the shifted date.
    """

render_date_trunc_sql(column_sql, grain) abstractmethod

Return SQL string that truncates a date/timestamp to the given grain.

String-level helper (not AST) for use in raw SQL CTEs like date_range.

Source code in src/orionbelt/dialect/base.py
@abstractmethod
def render_date_trunc_sql(self, column_sql: str, grain: str) -> str:
    """Return SQL string that truncates a date/timestamp to the given grain.

    String-level helper (not AST) for use in raw SQL CTEs like date_range.

    Parameters
    ----------
    column_sql : str
        SQL text of the date/timestamp expression to truncate.
    grain : str
        Grain to truncate to, given as a string.

    Returns
    -------
    str
        SQL text of the truncated expression.
    """

render_date_spine_cte_sql(min_date, max_date, grain, offset, offset_grain) abstractmethod

Return the SQL body for a date spine CTE.

Must produce two columns: spine_date and spine_date_prev. spine_date_prev is NULL when the offset date falls before min_date.

Parameters

- min_date (str): SQL expression referencing the minimum date (e.g. date_range.min_date).
- max_date (str): SQL expression referencing the maximum date.
- grain (str): Time grain string: day, week, month, quarter, year.
- offset (int): Signed period offset (e.g. -1 for previous period).
- offset_grain (str): Grain of the offset (e.g. year for YoY).

Source code in src/orionbelt/dialect/base.py
@abstractmethod
def render_date_spine_cte_sql(
    self,
    min_date: str,
    max_date: str,
    grain: str,
    offset: int,
    offset_grain: str,
) -> str:
    """Return the SQL body for a date spine CTE.

    Must produce two columns: ``spine_date`` and ``spine_date_prev``.
    ``spine_date_prev`` is NULL when the offset date falls before min_date.

    Parameters
    ----------
    min_date : str
        SQL expression referencing the minimum date (e.g. ``date_range.min_date``).
    max_date : str
        SQL expression referencing the maximum date.
    grain : str
        Time grain string: ``day``, ``week``, ``month``, ``quarter``, ``year``.
    offset : int
        Signed period offset (e.g. ``-1`` for previous period).
    offset_grain : str
        Grain of the offset (e.g. ``year`` for YoY).

    Returns
    -------
    str
        SQL text for the body of the CTE.
    """

render_string_contains(column, pattern)

Default: column LIKE '%' || pattern || '%'.

Source code in src/orionbelt/dialect/base.py
def render_string_contains(self, column: Expr, pattern: Expr) -> Expr:
    """Build the AST for a substring test.

    The base form is ``column LIKE '%' || pattern || '%'``: the pattern
    is concatenated between two ``'%'`` wildcard literals and the result
    becomes the right-hand side of a ``LIKE``.
    """
    # '%' || pattern
    with_leading_wildcard = BinaryOp(left=Literal.string("%"), op="||", right=pattern)
    # ('%' || pattern) || '%'
    like_operand = BinaryOp(
        left=with_leading_wildcard,
        op="||",
        right=Literal.string("%"),
    )
    return BinaryOp(left=column, op="LIKE", right=like_operand)

render_decimal_division_sql(left_sql, right_sql)

Render left / right for decimal-typed operands, given raw SQL.

Used by code paths that build division as string SQL (e.g. PoP comparison CTEs) rather than as BinaryOp AST nodes. Default is plain SQL division; ClickHouse overrides to widen both sides to Decimal(38, 10) first so ratio precision survives.

Source code in src/orionbelt/dialect/base.py
def render_decimal_division_sql(self, left_sql: str, right_sql: str) -> str:
    """Join two raw SQL operands into a ``left / right`` division.

    This is the string-level counterpart of a ``BinaryOp`` division, for
    code paths that assemble SQL as text (e.g. PoP comparison CTEs). The
    base form is plain SQL division. ClickHouse overrides it to widen
    both operands to ``Decimal(38, 10)`` first so ratio precision
    survives.
    """
    quotient_sql = f"{left_sql} / {right_sql}"
    return quotient_sql

compile(ast)

Render a complete SQL AST to a dialect-specific string.

Source code in src/orionbelt/dialect/base.py
def compile(self, ast: Select) -> str:
    """Turn a complete SQL AST into this dialect's SQL text.

    Public entry point; the work is delegated to ``compile_select``.
    """
    rendered_sql = self.compile_select(ast)
    return rendered_sql

compile_select(node)

Compile a SELECT statement.

Source code in src/orionbelt/dialect/base.py
def compile_select(self, node: Select) -> str:
    """Compile a SELECT statement."""
    parts: list[str] = []

    # CTEs
    if node.ctes:
        cte_parts = []
        for cte in node.ctes:
            if isinstance(cte.query, RawSQL):
                cte_sql = cte.query.sql
            elif isinstance(cte.query, UnionAll):
                cte_sql = self.compile_union_all(cte.query)
            elif isinstance(cte.query, Except):
                cte_sql = self.compile_except(cte.query)
            else:
                cte_sql = self.compile_select(cte.query)
            cte_parts.append(f"{self.quote_identifier(cte.name)} AS (\n{cte_sql}\n)")
        parts.append("WITH " + ",\n".join(cte_parts))

    # SELECT
    keyword = "SELECT DISTINCT" if node.distinct else "SELECT"
    if node.columns:
        cols = ", ".join(self.compile_expr(c) for c in node.columns)
        parts.append(f"{keyword} {cols}")
    else:
        parts.append(f"{keyword} *")

    # FROM
    if node.from_:
        parts.append(f"FROM {self.compile_from(node.from_)}")

    # JOINs
    for join in node.joins:
        parts.append(self.compile_join(join))

    # WHERE
    if node.where:
        parts.append(f"WHERE {self.compile_expr(node.where)}")

    # GROUP BY
    if node.group_by:
        parts.append(self.compile_group_by(node.group_by, node.grouping))

    # HAVING
    if node.having:
        parts.append(f"HAVING {self.compile_expr(node.having)}")

    # ORDER BY
    if node.order_by:
        orders = ", ".join(self.compile_order_by(o) for o in node.order_by)
        parts.append(f"ORDER BY {orders}")

    # LIMIT
    if node.limit is not None:
        parts.append(f"LIMIT {node.limit}")

    # OFFSET
    if node.offset is not None:
        parts.append(f"OFFSET {node.offset}")

    return "\n".join(parts)

compile_group_by(group_by, grouping)

Render the GROUP BY clause.

Default ANSI form (Postgres, Snowflake, DuckDB, BigQuery, Databricks, Dremio, MySQL): GROUP BY ROLLUP(a, b) / GROUP BY CUBE(a, b). ClickHouse overrides to the trailing-modifier form (GROUP BY a, b WITH ROLLUP).

When capabilities.supports_group_by_all is set and no grouping modifier is requested, emits GROUP BY ALL — the engine auto-derives the grouping list from the SELECT. Equivalent SQL with a much shorter and more idiomatic form on modern OLAP engines, especially for queries with computed dimensions.

Source code in src/orionbelt/dialect/base.py
def compile_group_by(self, group_by: list[Expr], grouping: str | None) -> str:
    """Produce the GROUP BY clause for ``group_by``.

    * ``grouping == "rollup"`` / ``"cube"`` use the ANSI wrapper form
      ``GROUP BY ROLLUP(a, b)`` / ``GROUP BY CUBE(a, b)`` (Postgres,
      Snowflake, DuckDB, BigQuery, Databricks, Dremio, MySQL).
      ClickHouse overrides this method to emit the trailing-modifier
      form (``GROUP BY a, b WITH ROLLUP``).
    * Otherwise, when ``capabilities.supports_group_by_all`` is set,
      ``GROUP BY ALL`` is emitted and the engine derives the grouping
      list from the SELECT. The expressions are not compiled at all in
      this case.
    * Otherwise the expressions are listed explicitly.
    """
    if grouping == "rollup":
        wrapper = "ROLLUP"
    elif grouping == "cube":
        wrapper = "CUBE"
    else:
        wrapper = None

    # GROUP BY ALL is only considered when no rollup/cube was requested.
    if wrapper is None and self.capabilities.supports_group_by_all:
        return "GROUP BY ALL"

    rendered = ", ".join(self.compile_expr(g) for g in group_by)
    if wrapper is None:
        return f"GROUP BY {rendered}"
    return f"GROUP BY {wrapper}({rendered})"

compile_union_all(node)

Compile a UNION ALL of multiple SELECT statements.

Source code in src/orionbelt/dialect/base.py
def compile_union_all(self, node: UnionAll) -> str:
    """Render every query in ``node.queries`` and glue them with UNION ALL.

    Each branch is compiled with ``compile_select``; branches are
    separated by a ``UNION ALL`` line.
    """
    branches = [self.compile_select(query) for query in node.queries]
    return "\nUNION ALL\n".join(branches)

compile_except(node)

Compile an EXCEPT of two SELECT statements.

Source code in src/orionbelt/dialect/base.py
def compile_except(self, node: Except) -> str:
    """Render ``node.left EXCEPT node.right``.

    Both sides are compiled with ``compile_select`` (left first) and
    separated by an ``EXCEPT`` line.
    """
    left_sql = self.compile_select(node.left)
    right_sql = self.compile_select(node.right)
    return "\nEXCEPT\n".join((left_sql, right_sql))

compile_expr(expr, _parent_prec=0)

Compile an expression node to SQL string.

_parent_prec is the precedence of the surrounding operator (or _CLAUSE_ROOT_PREC = 0 when called at the root of a SELECT projection, ON / WHERE / HAVING clause, GROUP BY / ORDER BY item, or function argument). Each operator branch wraps its own SQL in (...) only when its precedence is strictly less than the parent's required level; atoms (literals, column refs, function calls, CAST, CASE) are at _PREC_ATOM and never wrap.

Pre-v2.7.4 every BinaryOp / IsNull / InList / Between / UnaryOp wrapped itself unconditionally, producing deeply-nested unreadable SQL — issue #79.

Source code in src/orionbelt/dialect/base.py
def compile_expr(self, expr: Expr, _parent_prec: int = 0) -> str:
    """Compile an expression node to SQL string.

    ``_parent_prec`` is the precedence of the surrounding operator
    (or ``_CLAUSE_ROOT_PREC = 0`` when called at the root of a SELECT
    projection, ON / WHERE / HAVING clause, GROUP BY / ORDER BY item,
    or function argument). Each operator branch wraps its own SQL in
    ``(...)`` only when its precedence is strictly less than the
    parent's required level; atoms (literals, column refs, function
    calls, CAST, CASE) are at ``_PREC_ATOM`` and never wrap.

    Pre-v2.7.4 every ``BinaryOp`` / ``IsNull`` / ``InList`` /
    ``Between`` / ``UnaryOp`` wrapped itself unconditionally,
    producing deeply-nested unreadable SQL — issue #79.

    Raises
    ------
    ValueError
        If ``expr`` is not one of the node types matched below.
    """
    # NOTE: case order matters. The specific Literal / Star / ColumnRef
    # patterns must come before their generic fallbacks.
    match expr:
        # --- Literals -----------------------------------------------------
        case Literal(value=None):
            return "NULL"
        case Literal(value=True):
            return "TRUE"
        case Literal(value=False):
            return "FALSE"
        case Literal(value=v) if isinstance(v, str):
            # Escape embedded single quotes by doubling them.
            escaped = v.replace("'", "''")
            return f"'{escaped}'"
        case Literal(value=v):
            # Any remaining literal value is rendered with str().
            return str(v)
        # --- Star and column references -----------------------------------
        case Star(table=None):
            return "*"
        case Star(table=t) if t is not None:
            return f"{self.quote_identifier(t)}.*"
        case ColumnRef(name=name, table=None):
            return self.quote_identifier(name)
        case ColumnRef(name=name, table=table) if table is not None:
            return f"{self.quote_identifier(table)}.{self.quote_identifier(name)}"
        case AliasedExpr(expr=inner, alias=alias):
            # The inner expression is compiled at the default (root) precedence.
            return f"{self.compile_expr(inner)} AS {self.quote_identifier(alias)}"
        # --- Function calls -----------------------------------------------
        case FunctionCall(
            name=fname,
            args=args,
            distinct=distinct,
            order_by=order_by,
            separator=separator,
        ):
            # Reject aggregations explicitly listed as unsupported by the dialect.
            # Per-function overrides (_compile_mode etc.) still apply for cases
            # that have a special compile path; this catches plain aggregates
            # like REGR_SLOPE that have no override.
            self._check_aggregation_supported(fname)
            # LISTAGG: dialect-specific rendering
            if fname.upper() == "LISTAGG":
                return self._compile_listagg(args, distinct, order_by, separator)
            # MODE: dialect-specific rendering
            if fname.upper() == "MODE":
                return self._compile_mode(args)
            # MEDIAN: dialect-specific rendering
            if fname.upper() == "MEDIAN":
                return self._compile_median(args)
            # Multi-field COUNT: concatenate fields for portability
            # (Snowflake overrides to use native multi-arg syntax)
            if fname.upper() == "COUNT" and len(args) > 1:
                return self._compile_multi_field_count(args, distinct)
            # Generic path: map the function name, then render the arguments
            # (each compiled at root precedence) as a comma-separated list.
            fname = self._map_function_name(fname)
            args_sql = ", ".join(self.compile_expr(a) for a in args)
            if distinct:
                return f"{fname}(DISTINCT {args_sql})"
            return f"{fname}({args_sql})"
        # --- Operators: parenthesised only via _wrap_if_lower --------------
        case BinaryOp(left=left, op=op, right=right):
            self_prec = self._binary_op_precedence(op)
            sql = self._compile_binary_op(left, op, right)
            return self._wrap_if_lower(sql, self_prec, _parent_prec)
        case UnaryOp(op=op, operand=operand):
            # NOT has its own precedence level; every other unary operator
            # uses _PREC_UNARY. The operand is compiled with this operator's
            # precedence as its parent.
            self_prec = self._PREC_NOT if op.upper() == "NOT" else self._PREC_UNARY
            sql = f"{op} {self.compile_expr(operand, _parent_prec=self_prec)}"
            return self._wrap_if_lower(sql, self_prec, _parent_prec)
        # --- Predicates at comparison precedence (_PREC_CMP) ---------------
        case IsNull(expr=inner, negated=False):
            sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} IS NULL"
            return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
        case IsNull(expr=inner, negated=True):
            sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} IS NOT NULL"
            return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
        case InList(expr=inner, values=values, negated=negated):
            # List members are compiled at root precedence; the list supplies
            # its own parentheses.
            vals = ", ".join(self.compile_expr(v) for v in values)
            op = "NOT IN" if negated else "IN"
            sql = f"{self.compile_expr(inner, _parent_prec=self._PREC_CMP)} {op} ({vals})"
            return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
        # --- Atoms that are never wrapped ----------------------------------
        case CaseExpr(when_clauses=whens, else_clause=else_):
            # Rendered on a single line: CASE WHEN ... THEN ... [ELSE ...] END
            parts = ["CASE"]
            for when_cond, then_val in whens:
                parts.append(
                    f"WHEN {self.compile_expr(when_cond)} THEN {self.compile_expr(then_val)}"
                )
            if else_ is not None:
                parts.append(f"ELSE {self.compile_expr(else_)}")
            parts.append("END")
            return " ".join(parts)
        case Cast(expr=inner, type_name=type_name):
            return self._compile_cast(inner, type_name)
        case SubqueryExpr(query=query):
            # Subqueries are always parenthesised, with the SELECT on its own lines.
            return f"(\n{self.compile_select(query)}\n)"
        case Exists(subquery=subq, negated=False):
            return f"EXISTS (\n{self.compile_select(subq)}\n)"
        case Exists(subquery=subq, negated=True):
            return f"NOT EXISTS (\n{self.compile_select(subq)}\n)"
        case RawSQL(sql=sql):
            # Emitted verbatim: no quoting, escaping or wrapping is applied here.
            return sql
        case Between(expr=inner, low=low, high=high, negated=negated):
            # All three operands are compiled with comparison precedence as parent.
            op = "NOT BETWEEN" if negated else "BETWEEN"
            inner_sql = self.compile_expr(inner, _parent_prec=self._PREC_CMP)
            low_sql = self.compile_expr(low, _parent_prec=self._PREC_CMP)
            high_sql = self.compile_expr(high, _parent_prec=self._PREC_CMP)
            sql = f"{inner_sql} {op} {low_sql} AND {high_sql}"
            return self._wrap_if_lower(sql, self._PREC_CMP, _parent_prec)
        # --- Predicates delegated to overridable helpers -------------------
        case RegexMatch(column=column, pattern=pattern, negated=negated):
            return self.compile_regex_match(column, pattern, negated=negated)
        case RelativeDateRange(
            column=column,
            unit=unit,
            count=count,
            direction=direction,
            include_current=include_current,
        ):
            return self.compile_relative_date_range(
                column=column,
                unit=unit,
                count=count,
                direction=direction,
                include_current=include_current,
            )
        # --- Window functions ---------------------------------------------
        case WindowFunction(
            func_name=fname,
            args=args,
            partition_by=partition_by,
            order_by=order_by,
            frame=frame,
            distinct=distinct,
        ):
            # Unlike the FunctionCall branch, fname is used as written here:
            # neither _check_aggregation_supported nor _map_function_name is
            # called in this branch.
            args_sql = ", ".join(self.compile_expr(a) for a in args)
            func_sql = f"{fname}(DISTINCT {args_sql})" if distinct else f"{fname}({args_sql})"
            # OVER clause parts are optional; with none present the result
            # ends in an empty "OVER ()".
            over_parts: list[str] = []
            if partition_by:
                pb = ", ".join(self.compile_expr(p) for p in partition_by)
                over_parts.append(f"PARTITION BY {pb}")
            if order_by:
                ob = ", ".join(self.compile_order_by(o) for o in order_by)
                over_parts.append(f"ORDER BY {ob}")
            if frame is not None:
                over_parts.append(f"{frame.mode} BETWEEN {frame.start} AND {frame.end}")
            over_clause = " ".join(over_parts)
            return f"{func_sql} OVER ({over_clause})"
        case _:
            # Anything not matched above is a node type this compiler does not know.
            raise ValueError(f"Unknown AST node type: {type(expr).__name__}")

compile_regex_match(column, pattern, *, negated)

Compile a regex predicate. Default uses REGEXP_LIKE — overridden per dialect that needs a different syntax (Postgres ~, MySQL REGEXP, ClickHouse match etc.).

The pattern is rendered as a SQL string literal; callers pass it as RegexMatch.pattern (already a Python str).

Source code in src/orionbelt/dialect/base.py
def compile_regex_match(self, column: Expr, pattern: str, *, negated: bool) -> str:
    """Render a regular-expression predicate as SQL text.

    The base implementation emits ``REGEXP_LIKE(<column>, <pattern>)``
    and prefixes it with ``NOT`` when ``negated`` is true. Dialects
    with a different regex syntax (Postgres ``~``, MySQL ``REGEXP``,
    ClickHouse ``match`` etc.) override this method.

    ``pattern`` arrives as a plain Python ``str`` (callers pass
    ``RegexMatch.pattern``); it is wrapped with ``Literal.string`` and
    compiled like any other string literal.
    """
    column_sql = self.compile_expr(column)
    pattern_sql = self.compile_expr(Literal.string(pattern))
    predicate = f"REGEXP_LIKE({column_sql}, {pattern_sql})"
    if negated:
        return f"NOT {predicate}"
    return predicate

compile_relative_date_range(column, unit, count, direction, include_current)

Compile a relative date range predicate to SQL.

Source code in src/orionbelt/dialect/base.py
def compile_relative_date_range(
    self,
    column: Expr,
    unit: str,
    count: int,
    direction: str,
    include_current: bool,
) -> str:
    """Render a relative date window as a half-open range predicate.

    The result has the shape ``(col >= lower AND col < upper)``. Both
    bounds are derived from ``current_date_sql()`` with ``date_add_sql``:

    * ``direction == "future"``: the window starts today (or tomorrow
      when ``include_current`` is false) and spans ``count`` units
      forward.
    * any other ``direction``: the window ends tomorrow (or today when
      ``include_current`` is false) and spans ``count`` units backward.
    """
    column_sql = self.compile_expr(column)
    today = self.current_date_sql()

    if direction == "future":
        if include_current:
            lower = today
        else:
            lower = self.date_add_sql(today, "day", 1)
        upper = self.date_add_sql(lower, unit, count)
    else:
        if include_current:
            # The upper bound is exclusive, so "including today" means
            # the range must end at tomorrow.
            upper = self.date_add_sql(today, "day", 1)
        else:
            upper = today
        lower = self.date_add_sql(upper, unit, -count)

    return f"({column_sql} >= {lower} AND {column_sql} < {upper})"

orionbelt.dialect.base.DialectCapabilities dataclass

Flags indicating what SQL features a dialect supports.

Source code in src/orionbelt/dialect/base.py
@dataclass
class DialectCapabilities:
    """Flags indicating what SQL features a dialect supports.

    Every boolean flag except ``supports_cte`` defaults to ``False``, so
    an instance built with no arguments advertises CTE support only.
    """

    # The only flag that is on by default.
    supports_cte: bool = True
    supports_qualify: bool = False
    supports_arrays: bool = False
    supports_window_filters: bool = False
    supports_ilike: bool = False
    supports_time_travel: bool = False
    supports_semi_structured: bool = False
    supports_union_all_by_name: bool = False
    # ``GROUP BY ALL`` (Snowflake 2022+, Databricks/Spark 3.4+, DuckDB 0.7+,
    # BigQuery, ClickHouse 22.6+) auto-derives the grouping list from the
    # SELECT clause. Functionally equivalent to the explicit list but much
    # shorter on queries with computed dimensions, where the explicit form
    # repeats the full expression. Postgres, MySQL, Dremio do not support it.
    # Read by ``compile_group_by`` when no rollup/cube modifier is requested.
    supports_group_by_all: bool = False
    # ``default_factory=list`` gives every instance its own empty list
    # instead of one list shared across instances.
    # NOTE(review): presumably the list consulted by
    # ``_check_aggregation_supported`` in ``compile_expr`` -- confirm.
    unsupported_aggregations: list[str] = field(default_factory=list)

Dialect Registry

orionbelt.dialect.registry.DialectRegistry

Registry for SQL dialect plugins.

Source code in src/orionbelt/dialect/registry.py
class DialectRegistry:
    """Name-keyed registry of SQL dialect plugin classes.

    The registry stores classes, not instances: every ``get`` call
    constructs a new dialect object. The mapping is a class attribute,
    so it is shared by all users of the registry.
    """

    _dialects: dict[str, type[Dialect]] = {}

    @classmethod
    def register(cls, dialect_class: type[Dialect]) -> type[Dialect]:
        """Add ``dialect_class`` to the registry and return it unchanged.

        Returning the class lets this method be used as a class decorator.
        A throwaway instance is created because the registry key is read
        from the instance's ``name``. Registering another class under an
        existing name replaces the earlier entry.
        """
        registry_key = dialect_class().name
        cls._dialects[registry_key] = dialect_class
        return dialect_class

    @classmethod
    def get(cls, name: str) -> Dialect:
        """Construct and return the dialect registered under ``name``.

        Raises ``UnsupportedDialectError`` (given the list of available
        names) when nothing is registered under ``name``.
        """
        if name in cls._dialects:
            return cls._dialects[name]()
        raise UnsupportedDialectError(name, available=cls.available())

    @classmethod
    def available(cls) -> list[str]:
        """Return the registered dialect names in sorted order."""
        return sorted(cls._dialects)

    @classmethod
    def reset(cls) -> None:
        """Empty the registry in place (intended for tests)."""
        cls._dialects.clear()

get(name) classmethod

Get an instance of the named dialect.

Source code in src/orionbelt/dialect/registry.py
@classmethod
def get(cls, name: str) -> Dialect:
    """Construct and return the dialect registered under ``name``.

    Raises ``UnsupportedDialectError`` (given the list of available
    names) when nothing is registered under ``name``.
    """
    if name in cls._dialects:
        return cls._dialects[name]()
    raise UnsupportedDialectError(name, available=cls.available())

available() classmethod

List registered dialect names.

Source code in src/orionbelt/dialect/registry.py
@classmethod
def available(cls) -> list[str]:
    """Return the registered dialect names in sorted order."""
    return sorted(cls._dialects)

register(dialect_class) classmethod

Register a dialect class. Can be used as a decorator.

Source code in src/orionbelt/dialect/registry.py
@classmethod
def register(cls, dialect_class: type[Dialect]) -> type[Dialect]:
    """Add ``dialect_class`` to the registry and return it unchanged.

    Returning the class lets this method be used as a class decorator.
    A throwaway instance is created because the registry key is read
    from the instance's ``name``. Registering another class under an
    existing name replaces the earlier entry.
    """
    registry_key = dialect_class().name
    cls._dialects[registry_key] = dialect_class
    return dialect_class

YAML Parser

orionbelt.parser.loader.TrackedLoader

YAML loader that tracks source positions for error reporting.

Uses ruamel.yaml which preserves line/column info on every parsed node.

Source code in src/orionbelt/parser/loader.py
class TrackedLoader:
    """YAML loader that tracks source positions for error reporting.

    Uses ruamel.yaml, which preserves line/column info on every parsed node.
    Both loading entry points return ``(plain_dict, source_map)``: the parsed
    document converted to plain ``dict``/``list`` containers, plus a
    ``SourceMap`` keyed by dotted/indexed paths such as ``a.b[0]``.
    """

    def __init__(self) -> None:
        self._yaml = YAML()
        self._yaml.preserve_quotes = True
        # Reject duplicate YAML keys (e.g. two columns with the same name).
        # Without this, ruamel.yaml silently keeps only the last value.
        self._yaml.allow_duplicate_keys = False
        # Reject deeply nested structures (mitigates stack-based DoS).
        # ruamel.yaml raises an error when nesting exceeds this limit.
        self._yaml.max_depth = _MAX_DEPTH

    # -- safety checks -------------------------------------------------------

    @staticmethod
    def _check_yaml_safety(content: str) -> None:
        """Pre-parse safety checks on raw YAML text.

        Raises ``YAMLSafetyError`` if the content contains anchors/aliases
        (not used in OBML) or exceeds the maximum document size. The size
        limit is compared against ``len(content)``, i.e. characters.
        """
        size = len(content)
        if size > _MAX_DOCUMENT_SIZE:
            raise YAMLSafetyError(
                f"YAML document exceeds maximum size "
                f"({size:,} chars > {_MAX_DOCUMENT_SIZE:,} limit)"
            )
        # Strip full-line comments before scanning so that &name inside
        # comments (e.g. "# see R&D notes") does not cause a false positive.
        stripped = _COMMENT_LINE_RE.sub("", content)
        if _ANCHOR_RE.search(stripped):
            raise YAMLSafetyError("YAML anchors/aliases are not supported in OBML")

    @staticmethod
    def _check_node_count(data: Any, limit: int = _MAX_NODE_COUNT) -> None:
        """Post-parse defense-in-depth: reject documents with too many nodes.

        Walks ``data`` iteratively (explicit stack, no recursion) and counts
        the root plus every value reachable through dicts and lists; mapping
        keys themselves are not counted. Raises ``YAMLSafetyError`` as soon
        as the count exceeds ``limit``.
        """
        count = 0
        stack: list[Any] = [data]
        while stack:
            node = stack.pop()
            count += 1
            if count > limit:
                raise YAMLSafetyError(f"YAML document exceeds maximum node count ({limit:,})")
            if isinstance(node, dict):
                stack.extend(node.values())
            elif isinstance(node, list):
                stack.extend(node)

    # -- public loading API --------------------------------------------------

    def load(self, path: Path) -> tuple[dict[str, Any], SourceMap]:
        """Load a YAML file and return parsed dict + source position map.

        The file is read as UTF-8 and handed to ``load_string`` with
        ``str(path)`` as the filename, so both entry points share a single
        safety-check and parsing pipeline.
        """
        content = path.read_text(encoding="utf-8")
        return self.load_string(content, filename=str(path))

    def load_string(
        self, content: str, filename: str = "<string>"
    ) -> tuple[dict[str, Any], SourceMap]:
        """Load YAML from a string.

        Runs the raw-text safety checks, parses the document, checks the
        node count, then records source positions under ``filename``. A
        document that parses to ``None`` yields an empty dict and an empty
        ``SourceMap``.
        """
        self._check_yaml_safety(content)
        data = self._yaml.load(content)
        if data is None:
            return {}, SourceMap()
        self._check_node_count(data)
        source_map = SourceMap()
        self._extract_positions(data, filename, "", source_map)
        return self._to_plain_dict(data), source_map

    def _extract_positions(
        self,
        data: Any,
        filename: str,
        prefix: str,
        source_map: SourceMap,
    ) -> None:
        """Recursively extract source positions from ruamel.yaml nodes.

        Mapping entries are recorded under ``prefix.key`` (or just ``key``
        when ``prefix`` is empty) and sequence items under ``prefix[i]``.
        Line and column values read from the node's ``lc`` object are
        incremented by one before being stored. Position lookup is
        best-effort: when it fails, the entry is simply not recorded and
        recursion continues.
        """
        if isinstance(data, CommentedMap):
            for key in data:
                key_path = f"{prefix}.{key}" if prefix else str(key)
                try:
                    # Preferred: the position of this specific key.
                    lc = data.lc
                    key_positions = lc.key(key)
                    if key_positions:
                        line, col = key_positions
                        source_map.add(
                            key_path,
                            SourceSpan(file=filename, line=line + 1, column=col + 1),
                        )
                except (AttributeError, KeyError, TypeError):
                    # Fallback: use the map's own position
                    try:
                        lc = data.lc
                        source_map.add(
                            key_path,
                            SourceSpan(file=filename, line=lc.line + 1, column=lc.col + 1),
                        )
                    except (AttributeError, TypeError):
                        # No usable position at all; leave this path unmapped.
                        pass
                self._extract_positions(data[key], filename, key_path, source_map)
        elif isinstance(data, CommentedSeq):
            for i, item in enumerate(data):
                item_path = f"{prefix}[{i}]"
                try:
                    lc = data.lc
                    item_pos = lc.item(i)
                    if item_pos:
                        line, col = item_pos
                        source_map.add(
                            item_path,
                            SourceSpan(file=filename, line=line + 1, column=col + 1),
                        )
                except (AttributeError, KeyError, TypeError):
                    # No usable position for this item; leave it unmapped.
                    pass
                self._extract_positions(item, filename, item_path, source_map)

    def _to_plain_dict(self, data: Any) -> dict[str, Any]:
        """Convert ruamel.yaml CommentedMap/Seq to plain Python dict/list.

        Keys are coerced to ``str``. A root that is not a mapping (for
        example a list or a scalar) yields an empty dict.
        """
        if isinstance(data, (CommentedMap, dict)):
            return {str(k): self._to_plain_value(v) for k, v in data.items()}
        return {}

    def _to_plain_value(self, data: Any) -> Any:
        """Recursively convert one node to plain containers.

        Mappings become ``dict`` with ``str`` keys, sequences become
        ``list``, and any other value is returned unchanged.
        """
        if isinstance(data, (CommentedMap, dict)):
            return {str(k): self._to_plain_value(v) for k, v in data.items()}
        if isinstance(data, (CommentedSeq, list)):
            return [self._to_plain_value(item) for item in data]
        return data

load(path)

Load a YAML file and return parsed dict + source position map.

Source code in src/orionbelt/parser/loader.py
def load(self, path: Path) -> tuple[dict[str, Any], SourceMap]:
    """Load a YAML file and return parsed dict + source position map.

    The file is read as UTF-8 and handed to ``load_string`` with
    ``str(path)`` as the filename, so both entry points share a single
    safety-check and parsing pipeline instead of duplicating it.
    """
    content = path.read_text(encoding="utf-8")
    return self.load_string(content, filename=str(path))

load_string(content, filename='<string>')

Load YAML from a string.

Source code in src/orionbelt/parser/loader.py
def load_string(
    self, content: str, filename: str = "<string>"
) -> tuple[dict[str, Any], SourceMap]:
    """Parse YAML text into a plain dict plus a source position map.

    The raw text is safety-checked before parsing and the parsed tree is
    node-counted afterwards. ``filename`` is passed through to the position
    extraction step. A document that parses to ``None`` yields an empty
    dict and an empty ``SourceMap``.
    """
    self._check_yaml_safety(content)
    parsed = self._yaml.load(content)
    if parsed is not None:
        self._check_node_count(parsed)
        positions = SourceMap()
        self._extract_positions(parsed, filename, "", positions)
        return self._to_plain_dict(parsed), positions
    # Nothing was parsed: return empty results.
    return {}, SourceMap()

Reference Resolver

orionbelt.parser.resolver.ReferenceResolver

Resolves all references in a raw YAML model to a fully typed SemanticModel.

Source code in src/orionbelt/parser/resolver.py
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
class ReferenceResolver:
    """Resolves all references in a raw YAML model to a fully-typed SemanticModel."""

    def resolve(
        self,
        raw: dict[str, Any],
        source_map: SourceMap | None = None,
    ) -> tuple[SemanticModel, ValidationResult]:
        """Resolve raw YAML dict into a validated SemanticModel.

        Returns (model, validation_result). If there are errors,
        the model may be partially populated.
        """
        errors: list[SemanticError] = []
        warnings: list[SemanticError] = []

        # Strict OBML: reject unknown top-level keys (catches typos like
        # ``dataObjekt:`` that would silently be dropped by ``raw.get(...)``).
        _check_unknown_keys(raw, _TOP_LEVEL_KEYS, "", errors, source_map)

        # Parse data objects
        data_objects: dict[str, DataObject] = {}
        raw_objects = raw.get("dataObjects", {})
        if not isinstance(raw_objects, dict):
            errors.append(
                SemanticError(
                    code="DATA_OBJECT_PARSE_ERROR",
                    message="'dataObjects' must be a YAML mapping, not a list or scalar",
                    path="dataObjects",
                )
            )
            raw_objects = {}
        for name, raw_obj in raw_objects.items():
            try:
                _check_unknown_keys(
                    raw_obj, _DATA_OBJECT_KEYS, f"dataObjects.{name}", errors, source_map
                )
                obj_columns: dict[str, DataObjectColumn] = {}
                for fname, fdata in raw_obj.get("columns", {}).items():
                    _check_unknown_keys(
                        fdata,
                        _DATA_OBJECT_COLUMN_KEYS,
                        f"dataObjects.{name}.columns.{fname}",
                        errors,
                        source_map,
                    )
                    obj_columns[fname] = DataObjectColumn(
                        label=fname,
                        code=fdata.get("code", fname if not fdata.get("expression") else ""),
                        abstract_type=fdata.get("abstractType", "string"),
                        sql_type=fdata.get("sqlType"),
                        sql_precision=fdata.get("sqlPrecision"),
                        sql_scale=fdata.get("sqlScale"),
                        num_class=fdata.get("numClass"),
                        primary_key=bool(fdata.get("primaryKey", False)),
                        description=fdata.get("description"),
                        comment=fdata.get("comment"),
                        owner=fdata.get("owner"),
                        expression=fdata.get("expression"),
                        synonyms=fdata.get("synonyms", []),
                        custom_extensions=_parse_extensions(fdata),
                    )

                obj_joins: list[DataObjectJoin] = []
                for ji, jdata in enumerate(raw_obj.get("joins", [])):
                    _check_unknown_keys(
                        jdata,
                        _DATA_OBJECT_JOIN_KEYS,
                        f"dataObjects.{name}.joins[{ji}]",
                        errors,
                        source_map,
                    )
                    obj_joins.append(
                        DataObjectJoin(
                            join_type=jdata["joinType"],
                            join_to=jdata["joinTo"],
                            columns_from=jdata["columnsFrom"],
                            columns_to=jdata["columnsTo"],
                            secondary=jdata.get("secondary", False),
                            path_name=jdata.get("pathName"),
                        )
                    )

                data_objects[name] = DataObject(
                    label=name,
                    code=raw_obj.get("code", ""),
                    database=raw_obj.get("database", ""),
                    schema_name=raw_obj.get("schema", ""),
                    columns=obj_columns,
                    joins=obj_joins,
                    description=raw_obj.get("description"),
                    comment=raw_obj.get("comment"),
                    owner=raw_obj.get("owner"),
                    synonyms=raw_obj.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_obj),
                    refresh=_parse_refresh(raw_obj.get("refresh"), name, errors),
                )
            except Exception as e:
                span = source_map.get(f"dataObjects.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="DATA_OBJECT_PARSE_ERROR",
                        message=f"Failed to parse data object '{name}': {e}",
                        path=f"dataObjects.{name}",
                        span=span,
                    )
                )

        # Parse dimensions
        dimensions: dict[str, Dimension] = {}
        raw_dims = raw.get("dimensions", {})
        if not isinstance(raw_dims, dict):
            errors.append(
                SemanticError(
                    code="DIMENSION_PARSE_ERROR",
                    message="'dimensions' must be a YAML mapping, not a list or scalar",
                    path="dimensions",
                )
            )
            raw_dims = {}
        for name, raw_dim in raw_dims.items():
            try:
                _check_unknown_keys(
                    raw_dim, _DIMENSION_KEYS, f"dimensions.{name}", errors, source_map
                )
                data_object = raw_dim.get("dataObject")
                column = raw_dim.get("column")

                # Validate the data object exists
                if data_object and data_object not in data_objects:
                    span = source_map.get(f"dimensions.{name}") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_DATA_OBJECT",
                            message=(
                                f"Dimension '{name}' references unknown data object '{data_object}'"
                            ),
                            path=f"dimensions.{name}",
                            span=span,
                            suggestions=_suggest_similar(data_object, list(data_objects.keys())),
                        )
                    )

                # Validate the column exists in the data object
                if (
                    data_object
                    and column
                    and data_object in data_objects
                    and column not in data_objects[data_object].columns
                ):
                    span = source_map.get(f"dimensions.{name}") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_COLUMN",
                            message=(
                                f"Dimension '{name}' references unknown column "
                                f"'{column}' in data object '{data_object}'"
                            ),
                            path=f"dimensions.{name}",
                            span=span,
                            suggestions=_suggest_similar(
                                column, list(data_objects[data_object].columns.keys())
                            ),
                        )
                    )

                via = raw_dim.get("via")
                if via and via not in data_objects:
                    span = source_map.get(f"dimensions.{name}") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_DATA_OBJECT",
                            message=(
                                f"Dimension '{name}' via references unknown data object '{via}'"
                            ),
                            path=f"dimensions.{name}",
                            span=span,
                            suggestions=_suggest_similar(via, list(data_objects.keys())),
                        )
                    )

                dimensions[name] = Dimension(
                    label=name,
                    view=data_object or "",
                    column=column or "",
                    result_type=raw_dim.get("resultType", "string"),
                    time_grain=raw_dim.get("timeGrain"),
                    via=via,
                    description=raw_dim.get("description"),
                    format=raw_dim.get("format"),
                    owner=raw_dim.get("owner"),
                    synonyms=raw_dim.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_dim),
                )
            except Exception as e:
                span = source_map.get(f"dimensions.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="DIMENSION_PARSE_ERROR",
                        message=f"Failed to parse dimension '{name}': {e}",
                        path=f"dimensions.{name}",
                        span=span,
                    )
                )

        # Parse measures
        measures: dict[str, Measure] = {}
        raw_measures = raw.get("measures", {})
        if not isinstance(raw_measures, dict):
            errors.append(
                SemanticError(
                    code="MEASURE_PARSE_ERROR",
                    message="'measures' must be a YAML mapping, not a list or scalar",
                    path="measures",
                )
            )
            raw_measures = {}
        for name, raw_meas in raw_measures.items():
            try:
                _check_unknown_keys(raw_meas, _MEASURE_KEYS, f"measures.{name}", errors, source_map)
                measure_columns: list[DataColumnRef] = []
                for ci, fdata in enumerate(raw_meas.get("columns", [])):
                    _check_unknown_keys(
                        fdata,
                        _DATA_COLUMN_REF_KEYS,
                        f"measures.{name}.columns[{ci}]",
                        errors,
                        source_map,
                    )
                    measure_columns.append(
                        DataColumnRef(
                            view=fdata.get("dataObject"),
                            column=fdata.get("column"),
                        )
                    )

                # Resolve expression field references
                expression = raw_meas.get("expression")
                if expression:
                    self._validate_expression_refs(
                        name, expression, data_objects, errors, source_map
                    )

                # Parse measure filters (new `filters:` list or legacy `filter:` single)
                measure_filters: list[MeasureFilterItem] = []
                raw_filters = raw_meas.get("filters")
                if raw_filters and isinstance(raw_filters, list):
                    for fi, rf in enumerate(raw_filters):
                        measure_filters.append(
                            _parse_measure_filter_item(
                                rf,
                                f"measures.{name}.filters[{fi}]",
                                errors,
                                source_map,
                            )
                        )
                else:
                    # Backward compat: single `filter:` key → [filter]
                    raw_filter = raw_meas.get("filter")
                    if raw_filter:
                        measure_filters.append(
                            _parse_measure_filter_item(
                                raw_filter, f"measures.{name}.filter", errors, source_map
                            )
                        )

                # Parse grain override
                grain_override: GrainOverride | None = None
                raw_grain = raw_meas.get("grain")
                if raw_grain and isinstance(raw_grain, dict):
                    _check_unknown_keys(
                        raw_grain,
                        _GRAIN_OVERRIDE_KEYS,
                        f"measures.{name}.grain",
                        errors,
                        source_map,
                    )
                    grain_override = GrainOverride(
                        mode=raw_grain.get("mode", "RELATIVE"),
                        exclude=raw_grain.get("exclude", []),
                        include=raw_grain.get("include", []),
                        keep_only=raw_grain.get("keepOnly", []),
                    )
                    # Validate dimension references in grain
                    for dim_name in (
                        grain_override.include + grain_override.exclude + grain_override.keep_only
                    ):
                        if dim_name not in dimensions:
                            span = source_map.get(f"measures.{name}.grain") if source_map else None
                            errors.append(
                                SemanticError(
                                    code="UNKNOWN_GRAIN_DIMENSION",
                                    message=(
                                        f"Measure '{name}' grain references "
                                        f"unknown dimension '{dim_name}'"
                                    ),
                                    path=f"measures.{name}.grain",
                                    span=span,
                                    suggestions=_suggest_similar(dim_name, list(dimensions.keys())),
                                )
                            )

                # Parse filter context
                filter_ctx: FilterContext | None = None
                raw_fc = raw_meas.get("filterContext")
                if raw_fc and isinstance(raw_fc, dict):
                    _check_unknown_keys(
                        raw_fc,
                        _FILTER_CONTEXT_KEYS,
                        f"measures.{name}.filterContext",
                        errors,
                        source_map,
                    )
                    include_filters: list[FilterContextFilter] = []
                    for inc_i, raw_incl in enumerate(raw_fc.get("include", [])):
                        if isinstance(raw_incl, dict):
                            _check_unknown_keys(
                                raw_incl,
                                _FILTER_CONTEXT_FILTER_KEYS,
                                f"measures.{name}.filterContext.include[{inc_i}]",
                                errors,
                                source_map,
                            )
                            include_filters.append(
                                FilterContextFilter(
                                    field=raw_incl.get("field", ""),
                                    op=raw_incl.get("op", "equals"),
                                    value=raw_incl.get("value"),
                                )
                            )
                    filter_ctx = FilterContext(
                        mode=raw_fc.get("mode", "RELATIVE"),
                        exclude=raw_fc.get("exclude", []),
                        include=include_filters,
                        keep_only=raw_fc.get("keepOnly", []),
                    )
                    # Validate field references in exclude/keepOnly
                    all_dim_names = set(dimensions.keys())
                    all_col_refs: set[str] = set()
                    for obj_name, obj_def in data_objects.items():
                        for col_name in obj_def.columns:
                            all_col_refs.add(f"{obj_name}.{col_name}")
                    for field_name in filter_ctx.exclude + filter_ctx.keep_only:
                        if field_name not in all_dim_names and field_name not in all_col_refs:
                            span = (
                                source_map.get(f"measures.{name}.filterContext")
                                if source_map
                                else None
                            )
                            errors.append(
                                SemanticError(
                                    code="UNKNOWN_FILTER_CONTEXT_FIELD",
                                    message=(
                                        f"Measure '{name}' filterContext references "
                                        f"unknown field '{field_name}'"
                                    ),
                                    path=f"measures.{name}.filterContext",
                                    span=span,
                                    suggestions=_suggest_similar(field_name, list(all_dim_names)),
                                )
                            )
                    for incl in filter_ctx.include:
                        if incl.field not in all_dim_names and incl.field not in all_col_refs:
                            span = (
                                source_map.get(f"measures.{name}.filterContext")
                                if source_map
                                else None
                            )
                            errors.append(
                                SemanticError(
                                    code="UNKNOWN_FILTER_CONTEXT_FIELD",
                                    message=(
                                        f"Measure '{name}' filterContext.include "
                                        f"references unknown field '{incl.field}'"
                                    ),
                                    path=f"measures.{name}.filterContext.include",
                                    span=span,
                                    suggestions=_suggest_similar(incl.field, list(all_dim_names)),
                                )
                            )

                measures[name] = Measure(
                    label=name,
                    columns=measure_columns,
                    result_type=raw_meas.get("resultType", "float"),
                    aggregation=raw_meas.get("aggregation", "sum"),
                    expression=expression,
                    distinct=raw_meas.get("distinct", False),
                    total=raw_meas.get("total", False),
                    grain=grain_override,
                    filter_context=filter_ctx,
                    filters=measure_filters,
                    data_type=raw_meas.get("dataType"),
                    description=raw_meas.get("description"),
                    format=raw_meas.get("format"),
                    allow_fan_out=raw_meas.get("allowFanOut", False),
                    delimiter=raw_meas.get("delimiter"),
                    within_group=raw_meas.get("withinGroup"),
                    owner=raw_meas.get("owner"),
                    synonyms=raw_meas.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_meas),
                )
            except Exception as e:
                span = source_map.get(f"measures.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="MEASURE_PARSE_ERROR",
                        message=f"Failed to parse measure '{name}': {e}",
                        path=f"measures.{name}",
                        span=span,
                    )
                )

        # Parse metrics
        metrics: dict[str, Metric] = {}
        raw_metrics = raw.get("metrics", {})
        if not isinstance(raw_metrics, dict):
            errors.append(
                SemanticError(
                    code="METRIC_PARSE_ERROR",
                    message="'metrics' must be a YAML mapping, not a list or scalar",
                    path="metrics",
                )
            )
            raw_metrics = {}
        for name, raw_metric in raw_metrics.items():
            try:
                _check_unknown_keys(raw_metric, _METRIC_KEYS, f"metrics.{name}", errors, source_map)
                raw_pop_block = raw_metric.get("periodOverPeriod")
                if isinstance(raw_pop_block, dict):
                    _check_unknown_keys(
                        raw_pop_block,
                        _PERIOD_OVER_PERIOD_KEYS,
                        f"metrics.{name}.periodOverPeriod",
                        errors,
                        source_map,
                    )
                metric_type = raw_metric.get("type", "derived")

                if metric_type == MetricType.CUMULATIVE:
                    # Cumulative metric: validate measure reference exists
                    ref_measure = raw_metric.get("measure", "")
                    if ref_measure and ref_measure not in measures:
                        span = source_map.get(f"metrics.{name}.measure") if source_map else None
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_MEASURE",
                                message=(
                                    f"Cumulative metric '{name}' references "
                                    f"unknown measure '{ref_measure}'"
                                ),
                                path=f"metrics.{name}.measure",
                                span=span,
                            )
                        )

                    # Validate timeDimension references a known dimension
                    cum_time_dim = raw_metric.get("timeDimension", "")
                    if cum_time_dim and cum_time_dim not in dimensions:
                        span = (
                            source_map.get(f"metrics.{name}.timeDimension") if source_map else None
                        )
                        errors.append(
                            SemanticError(
                                code="CUMULATIVE_UNKNOWN_TIME_DIMENSION",
                                message=(
                                    f"Cumulative metric '{name}' references "
                                    f"unknown time dimension '{cum_time_dim}'"
                                ),
                                path=f"metrics.{name}.timeDimension",
                                span=span,
                                suggestions=_suggest_similar(cum_time_dim, list(dimensions.keys())),
                            )
                        )

                    metrics[name] = Metric(
                        label=name,
                        type=MetricType.CUMULATIVE,
                        measure=raw_metric.get("measure"),
                        time_dimension=raw_metric.get("timeDimension"),
                        cumulative_type=raw_metric.get("cumulativeType", "sum"),
                        window=raw_metric.get("window"),
                        grain_to_date=raw_metric.get("grainToDate"),
                        partition_by=list(raw_metric.get("partitionBy", []) or []),
                        data_type=raw_metric.get("dataType"),
                        description=raw_metric.get("description"),
                        format=raw_metric.get("format"),
                        owner=raw_metric.get("owner"),
                        synonyms=raw_metric.get("synonyms", []),
                        custom_extensions=_parse_extensions(raw_metric),
                    )
                elif metric_type == MetricType.PERIOD_OVER_PERIOD:
                    # Period-over-period metric: validate expression + PoP config
                    expression = raw_metric.get("expression", "")
                    self._validate_metric_expression_refs(
                        name, expression, measures, errors, source_map, metrics
                    )

                    raw_pop = raw_metric.get("periodOverPeriod")
                    if not raw_pop:
                        span = source_map.get(f"metrics.{name}") if source_map else None
                        errors.append(
                            SemanticError(
                                code="METRIC_PARSE_ERROR",
                                message=(
                                    f"Period-over-period metric '{name}' "
                                    f"requires 'periodOverPeriod' configuration"
                                ),
                                path=f"metrics.{name}",
                                span=span,
                            )
                        )
                        raw_pop = {}

                    # Validate time dimension reference
                    pop_time_dim = raw_pop.get("timeDimension", "")
                    if pop_time_dim and pop_time_dim not in dimensions:
                        span = (
                            source_map.get(f"metrics.{name}.periodOverPeriod")
                            if source_map
                            else None
                        )
                        errors.append(
                            SemanticError(
                                code="POP_UNKNOWN_TIME_DIMENSION",
                                message=(
                                    f"Period-over-period metric '{name}' references "
                                    f"unknown time dimension '{pop_time_dim}'"
                                ),
                                path=f"metrics.{name}.periodOverPeriod.timeDimension",
                                span=span,
                                suggestions=_suggest_similar(pop_time_dim, list(dimensions.keys())),
                            )
                        )

                    pop_config = PeriodOverPeriod(
                        time_dimension=raw_pop.get("timeDimension", ""),
                        grain=raw_pop.get("grain", "month"),
                        offset=raw_pop.get("offset", -1),
                        offset_grain=raw_pop.get("offsetGrain", "year"),
                        comparison=raw_pop.get("comparison", "percentChange"),
                    )

                    metrics[name] = Metric(
                        label=name,
                        type=MetricType.PERIOD_OVER_PERIOD,
                        expression=expression,
                        period_over_period=pop_config,
                        data_type=raw_metric.get("dataType"),
                        description=raw_metric.get("description"),
                        format=raw_metric.get("format"),
                        owner=raw_metric.get("owner"),
                        synonyms=raw_metric.get("synonyms", []),
                        custom_extensions=_parse_extensions(raw_metric),
                    )
                elif metric_type == MetricType.WINDOW:
                    # Window metric (rank/lag/lead/ntile/first_value/last_value)
                    ref_measure = raw_metric.get("measure")
                    if ref_measure and ref_measure not in measures:
                        span = source_map.get(f"metrics.{name}.measure") if source_map else None
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_MEASURE",
                                message=(
                                    f"Window metric '{name}' references "
                                    f"unknown measure '{ref_measure}'"
                                ),
                                path=f"metrics.{name}.measure",
                                span=span,
                            )
                        )

                    win_time_dim = raw_metric.get("timeDimension", "")
                    if win_time_dim and win_time_dim not in dimensions:
                        span = (
                            source_map.get(f"metrics.{name}.timeDimension") if source_map else None
                        )
                        errors.append(
                            SemanticError(
                                code="WINDOW_UNKNOWN_TIME_DIMENSION",
                                message=(
                                    f"Window metric '{name}' references "
                                    f"unknown time dimension '{win_time_dim}'"
                                ),
                                path=f"metrics.{name}.timeDimension",
                                span=span,
                                suggestions=_suggest_similar(win_time_dim, list(dimensions.keys())),
                            )
                        )

                    metrics[name] = Metric(
                        label=name,
                        type=MetricType.WINDOW,
                        measure=ref_measure,
                        time_dimension=raw_metric.get("timeDimension"),
                        window_function=raw_metric.get("windowFunction"),
                        offset=raw_metric.get("offset"),
                        buckets=raw_metric.get("buckets"),
                        order_direction=raw_metric.get("orderDirection", "desc"),
                        default_value=raw_metric.get("defaultValue"),
                        partition_by=list(raw_metric.get("partitionBy", []) or []),
                        data_type=raw_metric.get("dataType"),
                        description=raw_metric.get("description"),
                        format=raw_metric.get("format"),
                        owner=raw_metric.get("owner"),
                        synonyms=raw_metric.get("synonyms", []),
                        custom_extensions=_parse_extensions(raw_metric),
                    )
                else:
                    # Derived metric (default)
                    expression = raw_metric.get("expression", "")
                    self._validate_metric_expression_refs(
                        name, expression, measures, errors, source_map, metrics
                    )

                    metrics[name] = Metric(
                        label=name,
                        expression=expression,
                        data_type=raw_metric.get("dataType"),
                        description=raw_metric.get("description"),
                        format=raw_metric.get("format"),
                        owner=raw_metric.get("owner"),
                        synonyms=raw_metric.get("synonyms", []),
                        custom_extensions=_parse_extensions(raw_metric),
                    )
            except Exception as e:
                span = source_map.get(f"metrics.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="METRIC_PARSE_ERROR",
                        message=f"Failed to parse metric '{name}': {e}",
                        path=f"metrics.{name}",
                        span=span,
                    )
                )

        # Parse static model filters
        model_filters: list[ModelFilter] = []
        raw_filters = raw.get("filters", [])
        if not isinstance(raw_filters, list):
            errors.append(
                SemanticError(
                    code="FILTER_PARSE_ERROR",
                    message="'filters' must be a YAML list, not a mapping or scalar",
                    path="filters",
                )
            )
            raw_filters = []
        for i, rf in enumerate(raw_filters):
            try:
                _check_unknown_keys(rf, _MODEL_FILTER_KEYS, f"filters[{i}]", errors, source_map)
                obj_name = rf.get("dataObject", "")
                col_name = rf.get("column", "")
                if obj_name and obj_name not in data_objects:
                    span = source_map.get(f"filters[{i}]") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_FILTER_DATA_OBJECT",
                            message=(
                                f"Static filter[{i}] references unknown data object '{obj_name}'"
                            ),
                            path=f"filters[{i}]",
                            span=span,
                        )
                    )
                elif obj_name and col_name and col_name not in data_objects[obj_name].columns:
                    span = source_map.get(f"filters[{i}]") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_FILTER_COLUMN",
                            message=(
                                f"Static filter[{i}] references unknown column "
                                f"'{col_name}' in data object '{obj_name}'"
                            ),
                            path=f"filters[{i}]",
                            span=span,
                        )
                    )
                raw_val = rf.get("value")
                raw_vals = rf.get("values", [])
                model_filters.append(
                    ModelFilter(
                        data_object=obj_name,
                        column=col_name,
                        operator=rf.get("operator", "equals"),
                        value=_coerce_filter_value(raw_val),
                        values=[_coerce_filter_value(v) for v in raw_vals],
                    )
                )
            except Exception as e:
                span = source_map.get(f"filters[{i}]") if source_map else None
                errors.append(
                    SemanticError(
                        code="FILTER_PARSE_ERROR",
                        message=f"Failed to parse static filter[{i}]: {e}",
                        path=f"filters[{i}]",
                        span=span,
                    )
                )

        settings = _parse_settings(raw.get("settings"), errors, source_map)

        # Parse examples block (PLAN_agent_api_improvements §5)
        examples = self._parse_examples(raw.get("examples"), errors)

        model = SemanticModel(
            version=raw.get("version", 1.0),
            name=raw.get("name"),
            description=raw.get("description"),
            data_objects=data_objects,
            dimensions=dimensions,
            measures=measures,
            metrics=metrics,
            filters=model_filters,
            examples=examples,
            extends_sources=raw.get("_extends_sources", []),
            inherits_source=raw.get("_inherits_source"),
            owner=raw.get("owner"),
            custom_extensions=_parse_extensions(raw, "", errors, source_map),
            settings=settings,
        )

        result = ValidationResult(
            valid=len(errors) == 0,
            errors=errors,
            warnings=warnings,
        )

        return model, result

    def _parse_examples(self, raw: object, errors: list[SemanticError]) -> list[ModelExample]:
        """Parse the model-level ``examples:`` block.

        Accepts a list of mapping entries. Each entry must have ``name``,
        ``description``, and ``query``. ``intent_tags`` (alias ``intentTags``)
        is optional. Names must be unique within the block.
        """
        if raw is None:
            return []
        if not isinstance(raw, list):
            errors.append(
                SemanticError(
                    code="EXAMPLES_PARSE_ERROR",
                    message="'examples' must be a YAML list of example entries",
                    path="examples",
                )
            )
            return []

        out: list[ModelExample] = []
        seen: set[str] = set()
        for i, entry in enumerate(raw):
            if not isinstance(entry, dict):
                errors.append(
                    SemanticError(
                        code="EXAMPLES_PARSE_ERROR",
                        message=f"examples[{i}] must be a mapping",
                        path=f"examples[{i}]",
                    )
                )
                continue
            _check_unknown_keys(entry, _MODEL_EXAMPLE_KEYS, f"examples[{i}]", errors)
            name = entry.get("name")
            description = entry.get("description")
            query = entry.get("query")
            intent_tags = entry.get("intent_tags") or entry.get("intentTags") or []
            if not isinstance(name, str) or not name:
                errors.append(
                    SemanticError(
                        code="EXAMPLES_PARSE_ERROR",
                        message=f"examples[{i}].name is required and must be a string",
                        path=f"examples[{i}].name",
                    )
                )
                continue
            if name in seen:
                errors.append(
                    SemanticError(
                        code="DUPLICATE_EXAMPLE_NAME",
                        message=f"Duplicate example name '{name}'",
                        path=f"examples[{i}].name",
                    )
                )
                continue
            if not isinstance(description, str):
                errors.append(
                    SemanticError(
                        code="EXAMPLES_PARSE_ERROR",
                        message=f"examples[{i}].description is required",
                        path=f"examples[{i}].description",
                    )
                )
                continue
            if not isinstance(query, dict):
                errors.append(
                    SemanticError(
                        code="EXAMPLES_PARSE_ERROR",
                        message=f"examples[{i}].query must be a mapping (QueryObject payload)",
                        path=f"examples[{i}].query",
                    )
                )
                continue
            if not isinstance(intent_tags, list):
                errors.append(
                    SemanticError(
                        code="EXAMPLES_PARSE_ERROR",
                        message=f"examples[{i}].intent_tags must be a list",
                        path=f"examples[{i}].intent_tags",
                    )
                )
                continue
            seen.add(name)
            out.append(
                ModelExample(
                    name=name,
                    description=description,
                    intent_tags=[str(t) for t in intent_tags],
                    query=dict(query),
                )
            )
        return out

    def _validate_expression_refs(
        self,
        measure_name: str,
        expression: str,
        data_objects: dict[str, DataObject],
        errors: list[SemanticError],
        source_map: SourceMap | None,
    ) -> None:
        """Validate {[DataObject].[Column]} references in a measure expression."""
        span = source_map.get(f"measures.{measure_name}.expression") if source_map else None
        named_refs = re.findall(r"\{\[([^\]{}\[]+)\]\.\[([^\]{}\[]+)\]\}", expression)
        for obj_name, col_name in named_refs:
            if obj_name not in data_objects:
                errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT_IN_EXPRESSION",
                        message=(
                            f"Measure '{measure_name}' expression references unknown "
                            f"data object '{obj_name}'"
                        ),
                        path=f"measures.{measure_name}.expression",
                        span=span,
                    )
                )
            elif col_name not in data_objects[obj_name].columns:
                errors.append(
                    SemanticError(
                        code="UNKNOWN_COLUMN_IN_EXPRESSION",
                        message=(
                            f"Measure '{measure_name}' expression references unknown column "
                            f"'{col_name}' in data object '{obj_name}'"
                        ),
                        path=f"measures.{measure_name}.expression",
                        span=span,
                    )
                )

        # Strip valid refs, scan remainder for malformed attempts.
        remainder = re.sub(r"\{\[[^\]{}\[]+\]\.\[[^\]{}\[]+\]\}", "", expression)
        path = f"measures.{measure_name}.expression"

        def _merr(msg: str) -> None:
            errors.append(
                SemanticError(code="MALFORMED_EXPRESSION_REF", message=msg, path=path, span=span)
            )

        # {[Obj][Col]} — missing dot separator
        for o, c in re.findall(r"\{\[([^\]{}\[]+)\]\[([^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}][{c}]}}' — missing '.' separator"
            )

        # {[Obj.Col]} — dot inside single bracket pair
        for bad in re.findall(r"\{\[([^\]{}\[]+\.[^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{bad}]}}' — use '{{[Obj].[Col]}}' syntax"
            )

        # {Obj.Col} — missing all inner brackets
        for bad in re.findall(r"\{([A-Za-z][^\[{}\]]*\.[A-Za-z][^\[{}\]]*)\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{{bad}}}' — missing '[' and ']', use '{{[Obj].[Col]}}' syntax"
            )

        # {[Obj].[Col] — missing closing }
        for o, c in re.findall(r"\{\[([^\]{}\[]+)\]\.\[([^\]{}\[]+)\](?!\})", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}].[{c}]' — missing closing '}}'"
            )

        # [Obj].[Col]} — missing opening {
        for o, c in re.findall(r"(?<!\{)\[([^\]{}\[]+)\]\.\[([^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '[{o}].[{c}]}}' — missing opening '{{'"
            )

        # {[Obj].[Col} — missing ] on column
        for o, c in re.findall(r"\{\[([^\]{}\[]+)\]\.\[([^\]{}\[]*)\}(?!\])", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}].[{c}}}' — missing closing ']' on column"
            )

        # {[Obj.[Col]} — missing ] on data object
        for o, c in re.findall(r"\{\[([^\]{}\[]*)\.?\[([^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}.[{c}]}}' — missing closing ']' on data object"
            )

        # {Obj].[Col]} — missing [ on data object
        for o, c in re.findall(r"\{([^\[{}\]]+)\]\.\[([^\]{}\[]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{{o}].[{c}]}}' — missing opening '[' on data object"
            )

        # {[Obj].Col]} — missing [ on column
        for o, c in re.findall(r"\{\[([^\]{}\[]+)\]\.([^\[{}\]]+)\]\}", remainder):
            _merr(
                f"Measure '{measure_name}' has malformed reference"
                f" '{{[{o}].{c}]}}' — missing opening '[' on column"
            )

    def _validate_metric_expression_refs(
        self,
        metric_name: str,
        expression: str,
        measures: dict[str, Measure],
        errors: list[SemanticError],
        source_map: SourceMap | None,
        metrics: dict[str, Metric] | None = None,
    ) -> None:
        """Validate {[Measure Name]} references in a metric expression.

        References can resolve to either measures or already-defined metrics
        (typically cumulative or window metrics that have been parsed earlier
        in the same model). ``metrics`` defaults to ``None`` so existing
        callers continue to work; the caller passes the in-progress metrics
        dict to enable cross-metric composition.
        """
        span = source_map.get(f"metrics.{metric_name}.expression") if source_map else None

        valid_refs = re.findall(r"\{\[([^\]{}\[]+)\]\}", expression)

        # Strip valid {[Name]} refs, then scan remainder for malformed attempts.
        remainder = re.sub(r"\{\[[^\]{}\[]+\]\}", "", expression)

        # {[Name} — missing closing ]
        for bad in re.findall(r"\{\[([^\]{}]*)\}", remainder):
            errors.append(
                SemanticError(
                    code="MALFORMED_EXPRESSION_REF",
                    message=(
                        f"Metric '{metric_name}' has malformed reference"
                        f" '{{[{bad}}}' — missing closing ']'"
                    ),
                    path=f"metrics.{metric_name}.expression",
                    span=span,
                )
            )

        # {[Name] — missing closing }
        for bad in re.findall(r"\{\[([^\]{}]+)\](?!\})", remainder):
            errors.append(
                SemanticError(
                    code="MALFORMED_EXPRESSION_REF",
                    message=(
                        f"Metric '{metric_name}' has malformed reference"
                        f" '{{[{bad}]' — missing closing '}}'"
                    ),
                    path=f"metrics.{metric_name}.expression",
                    span=span,
                )
            )

        # {Name]} — missing opening [
        for bad in re.findall(r"\{([^\[{}\]]+)\]\}", remainder):
            errors.append(
                SemanticError(
                    code="MALFORMED_EXPRESSION_REF",
                    message=(
                        f"Metric '{metric_name}' has malformed reference"
                        f" '{{{bad}]}}' — missing opening '['"
                    ),
                    path=f"metrics.{metric_name}.expression",
                    span=span,
                )
            )

        # {Name} — missing both [ and ]
        for bad in re.findall(r"\{([^\[{\]}\s]+)\}", remainder):
            errors.append(
                SemanticError(
                    code="MALFORMED_EXPRESSION_REF",
                    message=(
                        f"Metric '{metric_name}' has malformed reference"
                        f" '{{{bad}}}' — missing '[' and ']'"
                    ),
                    path=f"metrics.{metric_name}.expression",
                    span=span,
                )
            )

        # [Name]} — missing opening {
        for bad in re.findall(r"(?<!\{)\[([^\]{}\[]+)\]\}", remainder):
            errors.append(
                SemanticError(
                    code="MALFORMED_EXPRESSION_REF",
                    message=(
                        f"Metric '{metric_name}' has malformed reference"
                        f" '[{bad}]}}' — missing opening '{{'"
                    ),
                    path=f"metrics.{metric_name}.expression",
                    span=span,
                )
            )

        known_metrics = metrics or {}
        for ref_name in valid_refs:
            if ref_name not in measures and ref_name not in known_metrics:
                errors.append(
                    SemanticError(
                        code="UNKNOWN_MEASURE_REF",
                        message=(f"Metric '{metric_name}' references unknown measure '{ref_name}'"),
                        path=f"metrics.{metric_name}.expression",
                        span=span,
                        suggestions=_suggest_similar(
                            ref_name,
                            list(measures.keys()) + list(known_metrics.keys()),
                        ),
                    )
                )

resolve(raw, source_map=None)

Resolve raw YAML dict into a validated SemanticModel.

Returns (model, validation_result). If there are errors, the model may be partially populated.

Source code in src/orionbelt/parser/resolver.py
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
def resolve(
    self,
    raw: dict[str, Any],
    source_map: SourceMap | None = None,
) -> tuple[SemanticModel, ValidationResult]:
    """Resolve a raw YAML dict into a SemanticModel plus a ValidationResult.

    Sections are processed in a fixed order because later sections validate
    their references against what earlier sections produced:

    1. ``dataObjects`` (columns and joins)
    2. ``dimensions`` (checked against data objects)
    3. ``measures`` (checked against data objects and dimensions)
    4. ``metrics`` (checked against measures and dimensions)
    5. top-level ``filters`` (checked against data objects)
    6. ``settings`` and ``examples``

    Problems are collected rather than raised. Reference errors (unknown
    data object, column, dimension, measure, ...) are recorded and the
    entry is still built. Any exception raised while parsing a single
    entry is caught, reported as a ``*_PARSE_ERROR`` for that entry, and
    the entry is left out of the model.

    Args:
        raw: Parsed YAML document. Top-level keys are first checked
            against ``_TOP_LEVEL_KEYS``.
        source_map: Optional map queried with dotted paths (for example
            ``"dimensions.<name>"``) to attach a source span to errors.
            When ``None``, errors are created with ``span=None``.

    Returns:
        ``(model, validation_result)``. ``validation_result.valid`` is
        True only when no errors were collected. If there are errors,
        the model may be partially populated. ``warnings`` is never
        appended to in this method and is returned empty.
    """
    errors: list[SemanticError] = []
    warnings: list[SemanticError] = []

    # Strict OBML: reject unknown top-level keys (catches typos like
    # ``dataObjekt:`` that would silently be dropped by ``raw.get(...)``).
    _check_unknown_keys(raw, _TOP_LEVEL_KEYS, "", errors, source_map)

    # Parse data objects
    # These come first: the dimension, measure and static-filter sections
    # below all validate references against ``data_objects``.
    data_objects: dict[str, DataObject] = {}
    raw_objects = raw.get("dataObjects", {})
    if not isinstance(raw_objects, dict):
        # Report the wrong shape once, then fall back to an empty mapping so
        # the loop is skipped and the remaining sections are still processed.
        errors.append(
            SemanticError(
                code="DATA_OBJECT_PARSE_ERROR",
                message="'dataObjects' must be a YAML mapping, not a list or scalar",
                path="dataObjects",
            )
        )
        raw_objects = {}
    for name, raw_obj in raw_objects.items():
        try:
            _check_unknown_keys(
                raw_obj, _DATA_OBJECT_KEYS, f"dataObjects.{name}", errors, source_map
            )
            obj_columns: dict[str, DataObjectColumn] = {}
            for fname, fdata in raw_obj.get("columns", {}).items():
                _check_unknown_keys(
                    fdata,
                    _DATA_OBJECT_COLUMN_KEYS,
                    f"dataObjects.{name}.columns.{fname}",
                    errors,
                    source_map,
                )
                # ``code`` defaults to the column name, unless the column has
                # an ``expression``, in which case it defaults to "".
                obj_columns[fname] = DataObjectColumn(
                    label=fname,
                    code=fdata.get("code", fname if not fdata.get("expression") else ""),
                    abstract_type=fdata.get("abstractType", "string"),
                    sql_type=fdata.get("sqlType"),
                    sql_precision=fdata.get("sqlPrecision"),
                    sql_scale=fdata.get("sqlScale"),
                    num_class=fdata.get("numClass"),
                    primary_key=bool(fdata.get("primaryKey", False)),
                    description=fdata.get("description"),
                    comment=fdata.get("comment"),
                    owner=fdata.get("owner"),
                    expression=fdata.get("expression"),
                    synonyms=fdata.get("synonyms", []),
                    custom_extensions=_parse_extensions(fdata),
                )

            obj_joins: list[DataObjectJoin] = []
            for ji, jdata in enumerate(raw_obj.get("joins", [])):
                _check_unknown_keys(
                    jdata,
                    _DATA_OBJECT_JOIN_KEYS,
                    f"dataObjects.{name}.joins[{ji}]",
                    errors,
                    source_map,
                )
                # joinType / joinTo / columnsFrom / columnsTo are read with
                # ``[...]``: a missing key raises KeyError, which the except
                # below reports as a parse error for the whole data object.
                obj_joins.append(
                    DataObjectJoin(
                        join_type=jdata["joinType"],
                        join_to=jdata["joinTo"],
                        columns_from=jdata["columnsFrom"],
                        columns_to=jdata["columnsTo"],
                        secondary=jdata.get("secondary", False),
                        path_name=jdata.get("pathName"),
                    )
                )

            # The YAML key ``schema`` is stored on the model as ``schema_name``.
            data_objects[name] = DataObject(
                label=name,
                code=raw_obj.get("code", ""),
                database=raw_obj.get("database", ""),
                schema_name=raw_obj.get("schema", ""),
                columns=obj_columns,
                joins=obj_joins,
                description=raw_obj.get("description"),
                comment=raw_obj.get("comment"),
                owner=raw_obj.get("owner"),
                synonyms=raw_obj.get("synonyms", []),
                custom_extensions=_parse_extensions(raw_obj),
                refresh=_parse_refresh(raw_obj.get("refresh"), name, errors),
            )
        except Exception as e:
            # Any exception raised while parsing this entry is turned into a
            # single error; the data object is not added to ``data_objects``.
            span = source_map.get(f"dataObjects.{name}") if source_map else None
            errors.append(
                SemanticError(
                    code="DATA_OBJECT_PARSE_ERROR",
                    message=f"Failed to parse data object '{name}': {e}",
                    path=f"dataObjects.{name}",
                    span=span,
                )
            )

    # Parse dimensions
    dimensions: dict[str, Dimension] = {}
    raw_dims = raw.get("dimensions", {})
    if not isinstance(raw_dims, dict):
        # Same recovery as for dataObjects: one error, then an empty mapping.
        errors.append(
            SemanticError(
                code="DIMENSION_PARSE_ERROR",
                message="'dimensions' must be a YAML mapping, not a list or scalar",
                path="dimensions",
            )
        )
        raw_dims = {}
    for name, raw_dim in raw_dims.items():
        try:
            _check_unknown_keys(
                raw_dim, _DIMENSION_KEYS, f"dimensions.{name}", errors, source_map
            )
            data_object = raw_dim.get("dataObject")
            column = raw_dim.get("column")

            # Reference errors below are recorded but do not stop
            # construction: the Dimension is still created afterwards.

            # Validate the data object exists
            if data_object and data_object not in data_objects:
                span = source_map.get(f"dimensions.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=(
                            f"Dimension '{name}' references unknown data object '{data_object}'"
                        ),
                        path=f"dimensions.{name}",
                        span=span,
                        suggestions=_suggest_similar(data_object, list(data_objects.keys())),
                    )
                )

            # Validate the column exists in the data object
            # (only checked when the data object itself is known, so an
            # unknown data object does not also yield an unknown-column error)
            if (
                data_object
                and column
                and data_object in data_objects
                and column not in data_objects[data_object].columns
            ):
                span = source_map.get(f"dimensions.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_COLUMN",
                        message=(
                            f"Dimension '{name}' references unknown column "
                            f"'{column}' in data object '{data_object}'"
                        ),
                        path=f"dimensions.{name}",
                        span=span,
                        suggestions=_suggest_similar(
                            column, list(data_objects[data_object].columns.keys())
                        ),
                    )
                )

            # ``via`` is optional; when given it must name a known data object.
            via = raw_dim.get("via")
            if via and via not in data_objects:
                span = source_map.get(f"dimensions.{name}") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=(
                            f"Dimension '{name}' via references unknown data object '{via}'"
                        ),
                        path=f"dimensions.{name}",
                        span=span,
                        suggestions=_suggest_similar(via, list(data_objects.keys())),
                    )
                )

            # The YAML key ``dataObject`` is stored on the model as ``view``;
            # missing dataObject/column values become "".
            dimensions[name] = Dimension(
                label=name,
                view=data_object or "",
                column=column or "",
                result_type=raw_dim.get("resultType", "string"),
                time_grain=raw_dim.get("timeGrain"),
                via=via,
                description=raw_dim.get("description"),
                format=raw_dim.get("format"),
                owner=raw_dim.get("owner"),
                synonyms=raw_dim.get("synonyms", []),
                custom_extensions=_parse_extensions(raw_dim),
            )
        except Exception as e:
            # The dimension is dropped; only the parse error is recorded.
            span = source_map.get(f"dimensions.{name}") if source_map else None
            errors.append(
                SemanticError(
                    code="DIMENSION_PARSE_ERROR",
                    message=f"Failed to parse dimension '{name}': {e}",
                    path=f"dimensions.{name}",
                    span=span,
                )
            )

    # Parse measures
    measures: dict[str, Measure] = {}
    raw_measures = raw.get("measures", {})
    if not isinstance(raw_measures, dict):
        errors.append(
            SemanticError(
                code="MEASURE_PARSE_ERROR",
                message="'measures' must be a YAML mapping, not a list or scalar",
                path="measures",
            )
        )
        raw_measures = {}
    for name, raw_meas in raw_measures.items():
        try:
            _check_unknown_keys(raw_meas, _MEASURE_KEYS, f"measures.{name}", errors, source_map)
            # Column references are collected as given; their existence is
            # not checked in this loop.
            measure_columns: list[DataColumnRef] = []
            for ci, fdata in enumerate(raw_meas.get("columns", [])):
                _check_unknown_keys(
                    fdata,
                    _DATA_COLUMN_REF_KEYS,
                    f"measures.{name}.columns[{ci}]",
                    errors,
                    source_map,
                )
                measure_columns.append(
                    DataColumnRef(
                        view=fdata.get("dataObject"),
                        column=fdata.get("column"),
                    )
                )

            # Resolve expression field references
            expression = raw_meas.get("expression")
            if expression:
                self._validate_expression_refs(
                    name, expression, data_objects, errors, source_map
                )

            # Parse measure filters (new `filters:` list or legacy `filter:` single)
            # A non-empty list under ``filters`` takes precedence; when it is
            # missing, empty, or not a list, the legacy ``filter`` key is used.
            measure_filters: list[MeasureFilterItem] = []
            raw_filters = raw_meas.get("filters")
            if raw_filters and isinstance(raw_filters, list):
                for fi, rf in enumerate(raw_filters):
                    measure_filters.append(
                        _parse_measure_filter_item(
                            rf,
                            f"measures.{name}.filters[{fi}]",
                            errors,
                            source_map,
                        )
                    )
            else:
                # Backward compat: single `filter:` key → [filter]
                raw_filter = raw_meas.get("filter")
                if raw_filter:
                    measure_filters.append(
                        _parse_measure_filter_item(
                            raw_filter, f"measures.{name}.filter", errors, source_map
                        )
                    )

            # Parse grain override
            # Only a non-empty mapping is parsed; any other value leaves
            # ``grain_override`` as None without reporting an error.
            grain_override: GrainOverride | None = None
            raw_grain = raw_meas.get("grain")
            if raw_grain and isinstance(raw_grain, dict):
                _check_unknown_keys(
                    raw_grain,
                    _GRAIN_OVERRIDE_KEYS,
                    f"measures.{name}.grain",
                    errors,
                    source_map,
                )
                grain_override = GrainOverride(
                    mode=raw_grain.get("mode", "RELATIVE"),
                    exclude=raw_grain.get("exclude", []),
                    include=raw_grain.get("include", []),
                    keep_only=raw_grain.get("keepOnly", []),
                )
                # Validate dimension references in grain
                # (checked in the order include, exclude, keepOnly; one error
                # per unknown name)
                for dim_name in (
                    grain_override.include + grain_override.exclude + grain_override.keep_only
                ):
                    if dim_name not in dimensions:
                        span = source_map.get(f"measures.{name}.grain") if source_map else None
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_GRAIN_DIMENSION",
                                message=(
                                    f"Measure '{name}' grain references "
                                    f"unknown dimension '{dim_name}'"
                                ),
                                path=f"measures.{name}.grain",
                                span=span,
                                suggestions=_suggest_similar(dim_name, list(dimensions.keys())),
                            )
                        )

            # Parse filter context
            # As with ``grain``, only a non-empty mapping is parsed.
            filter_ctx: FilterContext | None = None
            raw_fc = raw_meas.get("filterContext")
            if raw_fc and isinstance(raw_fc, dict):
                _check_unknown_keys(
                    raw_fc,
                    _FILTER_CONTEXT_KEYS,
                    f"measures.{name}.filterContext",
                    errors,
                    source_map,
                )
                include_filters: list[FilterContextFilter] = []
                for inc_i, raw_incl in enumerate(raw_fc.get("include", [])):
                    # Non-dict entries in ``include`` are skipped without an error.
                    if isinstance(raw_incl, dict):
                        _check_unknown_keys(
                            raw_incl,
                            _FILTER_CONTEXT_FILTER_KEYS,
                            f"measures.{name}.filterContext.include[{inc_i}]",
                            errors,
                            source_map,
                        )
                        include_filters.append(
                            FilterContextFilter(
                                field=raw_incl.get("field", ""),
                                op=raw_incl.get("op", "equals"),
                                value=raw_incl.get("value"),
                            )
                        )
                filter_ctx = FilterContext(
                    mode=raw_fc.get("mode", "RELATIVE"),
                    exclude=raw_fc.get("exclude", []),
                    include=include_filters,
                    keep_only=raw_fc.get("keepOnly", []),
                )
                # Validate field references in exclude/keepOnly
                # A field is accepted when it is either a dimension name or a
                # "<dataObject>.<column>" reference built from the parsed data
                # objects. Suggestions are drawn from dimension names only.
                all_dim_names = set(dimensions.keys())
                all_col_refs: set[str] = set()
                for obj_name, obj_def in data_objects.items():
                    for col_name in obj_def.columns:
                        all_col_refs.add(f"{obj_name}.{col_name}")
                for field_name in filter_ctx.exclude + filter_ctx.keep_only:
                    if field_name not in all_dim_names and field_name not in all_col_refs:
                        span = (
                            source_map.get(f"measures.{name}.filterContext")
                            if source_map
                            else None
                        )
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_FILTER_CONTEXT_FIELD",
                                message=(
                                    f"Measure '{name}' filterContext references "
                                    f"unknown field '{field_name}'"
                                ),
                                path=f"measures.{name}.filterContext",
                                span=span,
                                suggestions=_suggest_similar(field_name, list(all_dim_names)),
                            )
                        )
                # Same membership rule for the ``field`` of each include filter.
                for incl in filter_ctx.include:
                    if incl.field not in all_dim_names and incl.field not in all_col_refs:
                        span = (
                            source_map.get(f"measures.{name}.filterContext")
                            if source_map
                            else None
                        )
                        errors.append(
                            SemanticError(
                                code="UNKNOWN_FILTER_CONTEXT_FIELD",
                                message=(
                                    f"Measure '{name}' filterContext.include "
                                    f"references unknown field '{incl.field}'"
                                ),
                                path=f"measures.{name}.filterContext.include",
                                span=span,
                                suggestions=_suggest_similar(incl.field, list(all_dim_names)),
                            )
                        )

            # The measure is built even if reference errors were recorded above.
            measures[name] = Measure(
                label=name,
                columns=measure_columns,
                result_type=raw_meas.get("resultType", "float"),
                aggregation=raw_meas.get("aggregation", "sum"),
                expression=expression,
                distinct=raw_meas.get("distinct", False),
                total=raw_meas.get("total", False),
                grain=grain_override,
                filter_context=filter_ctx,
                filters=measure_filters,
                data_type=raw_meas.get("dataType"),
                description=raw_meas.get("description"),
                format=raw_meas.get("format"),
                allow_fan_out=raw_meas.get("allowFanOut", False),
                delimiter=raw_meas.get("delimiter"),
                within_group=raw_meas.get("withinGroup"),
                owner=raw_meas.get("owner"),
                synonyms=raw_meas.get("synonyms", []),
                custom_extensions=_parse_extensions(raw_meas),
            )
        except Exception as e:
            # The measure is dropped; only the parse error is recorded.
            span = source_map.get(f"measures.{name}") if source_map else None
            errors.append(
                SemanticError(
                    code="MEASURE_PARSE_ERROR",
                    message=f"Failed to parse measure '{name}': {e}",
                    path=f"measures.{name}",
                    span=span,
                )
            )

    # Parse metrics
    metrics: dict[str, Metric] = {}
    raw_metrics = raw.get("metrics", {})
    if not isinstance(raw_metrics, dict):
        errors.append(
            SemanticError(
                code="METRIC_PARSE_ERROR",
                message="'metrics' must be a YAML mapping, not a list or scalar",
                path="metrics",
            )
        )
        raw_metrics = {}
    for name, raw_metric in raw_metrics.items():
        try:
            _check_unknown_keys(raw_metric, _METRIC_KEYS, f"metrics.{name}", errors, source_map)
            # The periodOverPeriod block's keys are checked whenever it is a
            # dict, before and independently of the metric type dispatch.
            raw_pop_block = raw_metric.get("periodOverPeriod")
            if isinstance(raw_pop_block, dict):
                _check_unknown_keys(
                    raw_pop_block,
                    _PERIOD_OVER_PERIOD_KEYS,
                    f"metrics.{name}.periodOverPeriod",
                    errors,
                    source_map,
                )
            # A missing ``type`` defaults to "derived"; any value not matched
            # by the branches below is also handled by the derived branch.
            # NOTE(review): the comparisons rely on MetricType members being
            # equal to the raw YAML string (presumably a str-based enum) -
            # confirm against the MetricType definition.
            metric_type = raw_metric.get("type", "derived")

            if metric_type == MetricType.CUMULATIVE:
                # Cumulative metric: validate measure reference exists
                # NOTE(review): unlike most reference errors in this method,
                # UNKNOWN_MEASURE is created without ``suggestions``.
                ref_measure = raw_metric.get("measure", "")
                if ref_measure and ref_measure not in measures:
                    span = source_map.get(f"metrics.{name}.measure") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_MEASURE",
                            message=(
                                f"Cumulative metric '{name}' references "
                                f"unknown measure '{ref_measure}'"
                            ),
                            path=f"metrics.{name}.measure",
                            span=span,
                        )
                    )

                # Validate timeDimension references a known dimension
                cum_time_dim = raw_metric.get("timeDimension", "")
                if cum_time_dim and cum_time_dim not in dimensions:
                    span = (
                        source_map.get(f"metrics.{name}.timeDimension") if source_map else None
                    )
                    errors.append(
                        SemanticError(
                            code="CUMULATIVE_UNKNOWN_TIME_DIMENSION",
                            message=(
                                f"Cumulative metric '{name}' references "
                                f"unknown time dimension '{cum_time_dim}'"
                            ),
                            path=f"metrics.{name}.timeDimension",
                            span=span,
                            suggestions=_suggest_similar(cum_time_dim, list(dimensions.keys())),
                        )
                    )

                # ``partitionBy`` is normalised so a missing or null value
                # becomes an empty list.
                metrics[name] = Metric(
                    label=name,
                    type=MetricType.CUMULATIVE,
                    measure=raw_metric.get("measure"),
                    time_dimension=raw_metric.get("timeDimension"),
                    cumulative_type=raw_metric.get("cumulativeType", "sum"),
                    window=raw_metric.get("window"),
                    grain_to_date=raw_metric.get("grainToDate"),
                    partition_by=list(raw_metric.get("partitionBy", []) or []),
                    data_type=raw_metric.get("dataType"),
                    description=raw_metric.get("description"),
                    format=raw_metric.get("format"),
                    owner=raw_metric.get("owner"),
                    synonyms=raw_metric.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_metric),
                )
            elif metric_type == MetricType.PERIOD_OVER_PERIOD:
                # Period-over-period metric: validate expression + PoP config
                # The metrics parsed so far are passed along with the measures;
                # presumably so expressions may reference earlier metrics -
                # confirm in _validate_metric_expression_refs.
                expression = raw_metric.get("expression", "")
                self._validate_metric_expression_refs(
                    name, expression, measures, errors, source_map, metrics
                )

                raw_pop = raw_metric.get("periodOverPeriod")
                if not raw_pop:
                    # Record the missing block, then continue with an empty
                    # mapping so the PeriodOverPeriod defaults below apply.
                    span = source_map.get(f"metrics.{name}") if source_map else None
                    errors.append(
                        SemanticError(
                            code="METRIC_PARSE_ERROR",
                            message=(
                                f"Period-over-period metric '{name}' "
                                f"requires 'periodOverPeriod' configuration"
                            ),
                            path=f"metrics.{name}",
                            span=span,
                        )
                    )
                    raw_pop = {}

                # Validate time dimension reference
                # NOTE(review): the span is looked up for the periodOverPeriod
                # block while ``path`` points at its timeDimension key.
                pop_time_dim = raw_pop.get("timeDimension", "")
                if pop_time_dim and pop_time_dim not in dimensions:
                    span = (
                        source_map.get(f"metrics.{name}.periodOverPeriod")
                        if source_map
                        else None
                    )
                    errors.append(
                        SemanticError(
                            code="POP_UNKNOWN_TIME_DIMENSION",
                            message=(
                                f"Period-over-period metric '{name}' references "
                                f"unknown time dimension '{pop_time_dim}'"
                            ),
                            path=f"metrics.{name}.periodOverPeriod.timeDimension",
                            span=span,
                            suggestions=_suggest_similar(pop_time_dim, list(dimensions.keys())),
                        )
                    )

                # Defaults: month grain, offset of -1 year, percentChange.
                pop_config = PeriodOverPeriod(
                    time_dimension=raw_pop.get("timeDimension", ""),
                    grain=raw_pop.get("grain", "month"),
                    offset=raw_pop.get("offset", -1),
                    offset_grain=raw_pop.get("offsetGrain", "year"),
                    comparison=raw_pop.get("comparison", "percentChange"),
                )

                metrics[name] = Metric(
                    label=name,
                    type=MetricType.PERIOD_OVER_PERIOD,
                    expression=expression,
                    period_over_period=pop_config,
                    data_type=raw_metric.get("dataType"),
                    description=raw_metric.get("description"),
                    format=raw_metric.get("format"),
                    owner=raw_metric.get("owner"),
                    synonyms=raw_metric.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_metric),
                )
            elif metric_type == MetricType.WINDOW:
                # Window metric (rank/lag/lead/ntile/first_value/last_value)
                ref_measure = raw_metric.get("measure")
                if ref_measure and ref_measure not in measures:
                    span = source_map.get(f"metrics.{name}.measure") if source_map else None
                    errors.append(
                        SemanticError(
                            code="UNKNOWN_MEASURE",
                            message=(
                                f"Window metric '{name}' references "
                                f"unknown measure '{ref_measure}'"
                            ),
                            path=f"metrics.{name}.measure",
                            span=span,
                        )
                    )

                # ``timeDimension`` is only validated when present.
                win_time_dim = raw_metric.get("timeDimension", "")
                if win_time_dim and win_time_dim not in dimensions:
                    span = (
                        source_map.get(f"metrics.{name}.timeDimension") if source_map else None
                    )
                    errors.append(
                        SemanticError(
                            code="WINDOW_UNKNOWN_TIME_DIMENSION",
                            message=(
                                f"Window metric '{name}' references "
                                f"unknown time dimension '{win_time_dim}'"
                            ),
                            path=f"metrics.{name}.timeDimension",
                            span=span,
                            suggestions=_suggest_similar(win_time_dim, list(dimensions.keys())),
                        )
                    )

                metrics[name] = Metric(
                    label=name,
                    type=MetricType.WINDOW,
                    measure=ref_measure,
                    time_dimension=raw_metric.get("timeDimension"),
                    window_function=raw_metric.get("windowFunction"),
                    offset=raw_metric.get("offset"),
                    buckets=raw_metric.get("buckets"),
                    order_direction=raw_metric.get("orderDirection", "desc"),
                    default_value=raw_metric.get("defaultValue"),
                    partition_by=list(raw_metric.get("partitionBy", []) or []),
                    data_type=raw_metric.get("dataType"),
                    description=raw_metric.get("description"),
                    format=raw_metric.get("format"),
                    owner=raw_metric.get("owner"),
                    synonyms=raw_metric.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_metric),
                )
            else:
                # Derived metric (default)
                # No ``type`` is passed to Metric here, so the model's own
                # default for that field applies.
                expression = raw_metric.get("expression", "")
                self._validate_metric_expression_refs(
                    name, expression, measures, errors, source_map, metrics
                )

                metrics[name] = Metric(
                    label=name,
                    expression=expression,
                    data_type=raw_metric.get("dataType"),
                    description=raw_metric.get("description"),
                    format=raw_metric.get("format"),
                    owner=raw_metric.get("owner"),
                    synonyms=raw_metric.get("synonyms", []),
                    custom_extensions=_parse_extensions(raw_metric),
                )
        except Exception as e:
            # The metric is dropped; only the parse error is recorded.
            span = source_map.get(f"metrics.{name}") if source_map else None
            errors.append(
                SemanticError(
                    code="METRIC_PARSE_ERROR",
                    message=f"Failed to parse metric '{name}': {e}",
                    path=f"metrics.{name}",
                    span=span,
                )
            )

    # Parse static model filters
    # Unlike the sections above, top-level ``filters`` must be a list.
    model_filters: list[ModelFilter] = []
    raw_filters = raw.get("filters", [])
    if not isinstance(raw_filters, list):
        errors.append(
            SemanticError(
                code="FILTER_PARSE_ERROR",
                message="'filters' must be a YAML list, not a mapping or scalar",
                path="filters",
            )
        )
        raw_filters = []
    for i, rf in enumerate(raw_filters):
        try:
            _check_unknown_keys(rf, _MODEL_FILTER_KEYS, f"filters[{i}]", errors, source_map)
            obj_name = rf.get("dataObject", "")
            col_name = rf.get("column", "")
            if obj_name and obj_name not in data_objects:
                span = source_map.get(f"filters[{i}]") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_FILTER_DATA_OBJECT",
                        message=(
                            f"Static filter[{i}] references unknown data object '{obj_name}'"
                        ),
                        path=f"filters[{i}]",
                        span=span,
                    )
                )
            # This branch is only reached when obj_name is empty or known, and
            # the leading ``obj_name and`` short-circuits the empty case, so
            # the data_objects[obj_name] lookup cannot raise KeyError.
            elif obj_name and col_name and col_name not in data_objects[obj_name].columns:
                span = source_map.get(f"filters[{i}]") if source_map else None
                errors.append(
                    SemanticError(
                        code="UNKNOWN_FILTER_COLUMN",
                        message=(
                            f"Static filter[{i}] references unknown column "
                            f"'{col_name}' in data object '{obj_name}'"
                        ),
                        path=f"filters[{i}]",
                        span=span,
                    )
                )
            # Both the single ``value`` and each entry of ``values`` go through
            # _coerce_filter_value; the filter is appended even when a
            # reference error was recorded above.
            raw_val = rf.get("value")
            raw_vals = rf.get("values", [])
            model_filters.append(
                ModelFilter(
                    data_object=obj_name,
                    column=col_name,
                    operator=rf.get("operator", "equals"),
                    value=_coerce_filter_value(raw_val),
                    values=[_coerce_filter_value(v) for v in raw_vals],
                )
            )
        except Exception as e:
            # The filter is dropped; only the parse error is recorded.
            span = source_map.get(f"filters[{i}]") if source_map else None
            errors.append(
                SemanticError(
                    code="FILTER_PARSE_ERROR",
                    message=f"Failed to parse static filter[{i}]: {e}",
                    path=f"filters[{i}]",
                    span=span,
                )
            )

    # Settings and examples are delegated to helpers that append to the same
    # ``errors`` list.
    settings = _parse_settings(raw.get("settings"), errors, source_map)

    # Parse examples block (PLAN_agent_api_improvements §5)
    examples = self._parse_examples(raw.get("examples"), errors)

    # Assemble the model from whatever was parsed successfully.
    # NOTE(review): ``_extends_sources`` / ``_inherits_source`` are read as
    # plain keys of ``raw``; presumably they are injected by an earlier
    # loading step - confirm against the caller.
    model = SemanticModel(
        version=raw.get("version", 1.0),
        name=raw.get("name"),
        description=raw.get("description"),
        data_objects=data_objects,
        dimensions=dimensions,
        measures=measures,
        metrics=metrics,
        filters=model_filters,
        examples=examples,
        extends_sources=raw.get("_extends_sources", []),
        inherits_source=raw.get("_inherits_source"),
        owner=raw.get("owner"),
        custom_extensions=_parse_extensions(raw, "", errors, source_map),
        settings=settings,
    )

    # The result is valid only when no error was collected in any section.
    result = ValidationResult(
        valid=len(errors) == 0,
        errors=errors,
        warnings=warnings,
    )

    return model, result

Semantic Validator

orionbelt.parser.validator.SemanticValidator

Validates semantic rules from spec §3.8.

Source code in src/orionbelt/parser/validator.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
class SemanticValidator:
    """Validates semantic rules from spec §3.8."""

    def validate(self, model: SemanticModel) -> list[SemanticError]:
        """Run every semantic check against ``model`` and collect the findings.

        The checks run in the fixed order listed below and their results are
        concatenated in that same order.
        """
        checks = (
            self._check_unique_identifiers,
            self._check_unique_column_names,
            self._check_secondary_joins,
            self._check_no_cyclic_joins,
            self._check_no_multipath_joins,
            self._check_measures_resolve,
            self._check_join_targets_exist,
            self._check_references_resolve,
            self._check_num_class_on_numeric_columns,
            self._check_time_grain_on_temporal_columns,
            self._check_measure_filter_refs,
            self._check_via_reachability,
            self._check_missing_via,
        )
        findings: list[SemanticError] = []
        for check in checks:
            findings.extend(check(model))
        return findings

    def _check_unique_identifiers(self, model: SemanticModel) -> list[SemanticError]:
        """Report names shared between dimensions, measures, and metrics.

        The three sections share one namespace. Data object names are not part
        of it, so a dimension "Region" may coexist with a data object "Region".
        Sections are scanned in the order dimensions, measures, metrics; a
        clash is reported against whichever kind claimed the name most recently.
        """
        findings: list[SemanticError] = []
        claimed_by: dict[str, str] = {}  # identifier -> kind that last claimed it

        sections = (
            ("dimension", "dimensions", model.dimensions),
            ("measure", "measures", model.measures),
            ("metric", "metrics", model.metrics),
        )
        for kind, section, names in sections:
            for name in names:
                previous = claimed_by.get(name)
                if previous is not None:
                    findings.append(
                        SemanticError(
                            code="DUPLICATE_IDENTIFIER",
                            message=(
                                f"{kind.title()} '{name}' conflicts with existing {previous} '{name}'"
                            ),
                            path=f"{section}.{name}",
                        )
                    )
                claimed_by[name] = kind

        return findings

    def _check_unique_column_names(self, model: SemanticModel) -> list[SemanticError]:
        """Placeholder for per-data-object column-name uniqueness; always empty.

        Duplicate YAML keys are rejected at parse time by TrackedLoader
        (``allow_duplicate_keys = False``), so there is nothing left to check
        here. The method is kept as a structural hook in case models are
        constructed programmatically.
        """
        no_findings: list[SemanticError] = []
        return no_findings

    def _check_secondary_joins(self, model: SemanticModel) -> list[SemanticError]:
        """Check the pathName rules that apply to joins.

        - A secondary join without a pathName is an error.
        - A pathName may be used only once per (source, target) pair; the
          second and later uses are reported.
        """
        findings: list[SemanticError] = []
        # pathNames already seen, grouped by (source data object, join target)
        seen_paths: dict[tuple[str, str], set[str]] = {}

        for obj_name, obj in model.data_objects.items():
            for idx, join in enumerate(obj.joins):
                location = f"dataObjects.{obj_name}.joins[{idx}]"

                if join.secondary and not join.path_name:
                    findings.append(
                        SemanticError(
                            code="SECONDARY_JOIN_MISSING_PATH_NAME",
                            message=(
                                f"Data object '{obj_name}' join[{idx}] is secondary "
                                f"but has no pathName"
                            ),
                            path=location,
                        )
                    )

                if not join.path_name:
                    continue

                used = seen_paths.setdefault((obj_name, join.join_to), set())
                if join.path_name not in used:
                    used.add(join.path_name)
                    continue

                findings.append(
                    SemanticError(
                        code="DUPLICATE_JOIN_PATH_NAME",
                        message=(
                            f"Data object '{obj_name}' join[{idx}] has duplicate "
                            f"pathName '{join.path_name}' for target '{join.join_to}'"
                        ),
                        path=location,
                    )
                )

        return findings

    def _check_no_cyclic_joins(self, model: SemanticModel) -> list[SemanticError]:
        """Detect cyclic join paths among primary (non-secondary) joins.

        Runs an iterative depth-first search over the directed join graph and
        appends one ``CYCLIC_JOIN`` error for every edge found that points back
        to a node on the current DFS path.

        Args:
            model: The semantic model whose data-object joins are inspected.

        Returns:
            The list of ``CYCLIC_JOIN`` errors; empty when no cycle is found.
        """
        errors: list[SemanticError] = []

        # Build adjacency list from joins (skip secondary joins)
        adj: dict[str, set[str]] = {}
        for obj_name, obj in model.data_objects.items():
            if obj_name not in adj:
                adj[obj_name] = set()
            for join in obj.joins:
                if not join.secondary:
                    adj[obj_name].add(join.join_to)

        # Iterative DFS cycle detection (avoids RecursionError on large models)
        # visited:   every node that has ever been pushed (never revisited).
        # rec_stack: nodes on the DFS path currently being explored.
        visited: set[str] = set()
        rec_stack: set[str] = set()

        for start in adj:
            if start in visited:
                continue
            # Each frame pairs a node with a live iterator over its neighbours,
            # so returning to a frame resumes where the iteration left off.
            stack: list[tuple[str, list[str]]] = [(start, iter(adj.get(start, set())))]  # type: ignore[list-item]
            # path mirrors the nodes on the stack, in order, for cycle reporting.
            path: list[str] = [start]
            visited.add(start)
            rec_stack.add(start)

            while stack:
                node, neighbors = stack[-1]
                advanced = False
                for neighbor in neighbors:
                    if neighbor not in visited:
                        # Descend: push the neighbour and resume from its frame.
                        visited.add(neighbor)
                        rec_stack.add(neighbor)
                        path.append(neighbor)
                        # Join targets that are not data objects have no entry
                        # in adj and are treated as having no outgoing edges.
                        stack.append((neighbor, iter(adj.get(neighbor, set()))))  # type: ignore[arg-type]
                        advanced = True
                        break
                    elif neighbor in rec_stack:
                        # Edge back to a node on the current path: a cycle.
                        if neighbor in path:
                            # Report the path from the first visit of neighbour
                            # and close the loop by repeating it at the end.
                            cycle = path[path.index(neighbor) :] + [neighbor]
                        else:
                            # Fallback when the neighbour is not in path.
                            cycle = [node, neighbor]
                        errors.append(
                            SemanticError(
                                code="CYCLIC_JOIN",
                                message=f"Cyclic join detected: {' -> '.join(cycle)}",
                                path=f"dataObjects.{node}.joins",
                            )
                        )
                if not advanced:
                    # All neighbours of node are exhausted: backtrack.
                    stack.pop()
                    rec_stack.discard(node)
                    if path:
                        path.pop()

        return errors

    def _check_no_multipath_joins(self, model: SemanticModel) -> list[SemanticError]:
        """Report pairs of data objects connected by more than one indirect path.

        Only true diamonds are flagged, i.e. both paths pass through
        intermediaries. When the origin joins the target directly, that direct
        join is the canonical path, so an extra indirect route
        (e.g. Purchases→Suppliers direct + Purchases→Products→Suppliers)
        is not ambiguous and is not reported. Secondary joins are ignored, and
        each (origin, target) pair is reported at most once.
        """
        findings: list[SemanticError] = []

        # Primary (non-secondary) join targets per data object, in declaration order.
        graph: dict[str, list[str]] = {
            obj_name: [join.join_to for join in obj.joins if not join.secondary]
            for obj_name, obj in model.data_objects.items()
        }

        already_reported: set[tuple[str, str]] = set()

        for origin, origin_targets in graph.items():
            if not origin_targets:
                continue

            # Breadth-first walk from origin, remembering which node first
            # reached every target.
            one_hop: set[str] = set()
            reached_via: dict[str, str] = {}
            pending: deque[str] = deque()
            for target in origin_targets:
                if target == origin:
                    continue
                one_hop.add(target)
                if target in reached_via:
                    continue
                reached_via[target] = origin
                pending.append(target)

            while pending:
                current = pending.popleft()
                for target in graph.get(current, []):
                    if target == origin:
                        continue
                    if target not in reached_via:
                        reached_via[target] = current
                        pending.append(target)
                        continue
                    if reached_via[target] == current:
                        continue
                    # A direct join from origin is the canonical path.
                    if target in one_hop:
                        continue
                    if (origin, target) in already_reported:
                        continue
                    already_reported.add((origin, target))
                    findings.append(
                        SemanticError(
                            code="MULTIPATH_JOIN",
                            message=(
                                f"Multiple join paths from '{origin}' to "
                                f"'{target}' (via '{reached_via[target]}' "
                                f"and '{current}'). "
                                f"Join paths must be unambiguous."
                            ),
                            path=f"dataObjects.{origin}.joins",
                        )
                    )

        return findings

    def _check_measures_resolve(self, model: SemanticModel) -> list[SemanticError]:
        """Check that every measure column reference points at a real column.

        References without a data object are skipped. An unknown data object
        is reported as ``UNKNOWN_DATA_OBJECT``; a known data object lacking the
        named column is reported as ``UNKNOWN_COLUMN``.
        """
        findings: list[SemanticError] = []
        for name, measure in model.measures.items():
            for idx, ref in enumerate(measure.columns):
                view, column = ref.view, ref.column
                if not view:
                    continue

                location = f"measures.{name}.columns[{idx}]"
                if view not in model.data_objects:
                    findings.append(
                        SemanticError(
                            code="UNKNOWN_DATA_OBJECT",
                            message=(
                                f"Measure '{name}' column[{idx}] references "
                                f"unknown data object '{view}'"
                            ),
                            path=location,
                        )
                    )
                    continue

                if not column:
                    continue
                if column in model.data_objects[view].columns:
                    continue
                findings.append(
                    SemanticError(
                        code="UNKNOWN_COLUMN",
                        message=(
                            f"Measure '{name}' column[{idx}] references "
                            f"unknown column '{column}' in data object '{view}'"
                        ),
                        path=location,
                    )
                )
        return findings

    def _check_join_targets_exist(self, model: SemanticModel) -> list[SemanticError]:
        """Check each join's column lists and the data object it targets.

        For every join this reports, in order:
        - empty ``columnsFrom``/``columnsTo``, or else a length mismatch;
        - an unknown join target; or, when the target exists, every join
          column missing from the source or target data object.
        """
        findings: list[SemanticError] = []
        for obj_name, obj in model.data_objects.items():
            for idx, join in enumerate(obj.joins):
                location = f"dataObjects.{obj_name}.joins[{idx}]"

                if not (join.columns_from and join.columns_to):
                    findings.append(
                        SemanticError(
                            code="EMPTY_JOIN_COLUMNS",
                            message=(
                                f"Data object '{obj_name}' join[{idx}] to "
                                f"'{join.join_to}' has empty join columns"
                            ),
                            path=location,
                        )
                    )
                elif len(join.columns_from) != len(join.columns_to):
                    findings.append(
                        SemanticError(
                            code="JOIN_COLUMN_COUNT_MISMATCH",
                            message=(
                                f"Data object '{obj_name}' join[{idx}] has "
                                f"{len(join.columns_from)} columnsFrom and "
                                f"{len(join.columns_to)} columnsTo"
                            ),
                            path=location,
                        )
                    )

                if join.join_to not in model.data_objects:
                    findings.append(
                        SemanticError(
                            code="UNKNOWN_JOIN_TARGET",
                            message=(
                                f"Data object '{obj_name}' join[{idx}] references "
                                f"unknown data object '{join.join_to}'"
                            ),
                            path=location,
                        )
                    )
                    continue

                # Target exists: every join column must exist on its own side.
                findings.extend(
                    SemanticError(
                        code="UNKNOWN_JOIN_COLUMN",
                        message=(
                            f"Data object '{obj_name}' join[{idx}] columnsFrom "
                            f"references unknown column '{col_name}'"
                        ),
                        path=f"{location}.columnsFrom",
                    )
                    for col_name in join.columns_from
                    if col_name not in obj.columns
                )
                target_obj = model.data_objects[join.join_to]
                findings.extend(
                    SemanticError(
                        code="UNKNOWN_JOIN_COLUMN",
                        message=(
                            f"Data object '{obj_name}' join[{idx}] columnsTo "
                            f"references unknown column '{col_name}' "
                            f"in data object '{join.join_to}'"
                        ),
                        path=f"{location}.columnsTo",
                    )
                    for col_name in join.columns_to
                    if col_name not in target_obj.columns
                )
        return findings

    def _check_references_resolve(self, model: SemanticModel) -> list[SemanticError]:
        """Check that each dimension points at an existing data object and column.

        Dimensions without a data object are skipped. An unknown data object
        yields ``UNKNOWN_DATA_OBJECT``; a missing column on a known data object
        yields ``UNKNOWN_COLUMN``.
        """
        findings: list[SemanticError] = []
        for name, dim in model.dimensions.items():
            view, column = dim.view, dim.column
            if not view:
                continue

            location = f"dimensions.{name}"
            if view not in model.data_objects:
                findings.append(
                    SemanticError(
                        code="UNKNOWN_DATA_OBJECT",
                        message=f"Dimension '{name}' references unknown data object '{view}'",
                        path=location,
                    )
                )
                continue

            if not column:
                continue
            if column in model.data_objects[view].columns:
                continue
            findings.append(
                SemanticError(
                    code="UNKNOWN_COLUMN",
                    message=(
                        f"Dimension '{name}' references unknown column "
                        f"'{column}' in data object '{view}'"
                    ),
                    path=location,
                )
            )
        return findings

    _NUMERIC_TYPES = {DataType.INT, DataType.FLOAT}
    _TIME_GRAIN_TYPES = {DataType.DATE, DataType.TIMESTAMP, DataType.TIMESTAMP_TZ}

    def _check_time_grain_on_temporal_columns(self, model: SemanticModel) -> list[SemanticError]:
        """Reject timeGrain on dimensions whose column is not temporal.

        ``timeGrain`` compiles to ``date_trunc(grain, column)``, which fails at
        runtime unless the column's abstractType is date/timestamp/timestamp_tz.
        Flagging it here surfaces the problem at model-load time instead of on
        the first query. Dimensions whose data object or column cannot be
        resolved are skipped; ``_check_references_resolve`` reports those.
        """
        findings: list[SemanticError] = []
        for name, dim in model.dimensions.items():
            grain = dim.time_grain
            view, column = dim.view, dim.column
            if grain is None or not (view and column):
                continue

            owner = model.data_objects.get(view)
            if owner is None or column not in owner.columns:
                # Caught by _check_references_resolve.
                continue

            col = owner.columns[column]
            if col.abstract_type in self._TIME_GRAIN_TYPES:
                continue

            findings.append(
                SemanticError(
                    code="TIME_GRAIN_ON_NON_TEMPORAL",
                    message=(
                        f"Dimension '{name}' has timeGrain "
                        f"'{grain.value}' but underlying column "
                        f"'{view}.{column}' has abstractType "
                        f"'{col.abstract_type.value}'. timeGrain requires "
                        f"the column to be date, timestamp, or timestamp_tz. "
                        f"Drop timeGrain, fix the column's abstractType, or "
                        f"define a computed column with to_date()."
                    ),
                    path=f"dimensions.{name}",
                )
            )
        return findings

    def _check_num_class_on_numeric_columns(self, model: SemanticModel) -> list[SemanticError]:
        """Ensure numClass is only set on numeric columns (int or float).

        Every column that carries a ``numClass`` but whose abstractType is not
        in ``_NUMERIC_TYPES`` yields a ``NUM_CLASS_ON_NON_NUMERIC`` error.

        Args:
            model: The semantic model whose data-object columns are inspected.

        Returns:
            The list of errors; empty when every numClass sits on a numeric column.
        """
        errors: list[SemanticError] = []
        for obj_name, obj in model.data_objects.items():
            for col_name, col in obj.columns.items():
                if col.num_class and col.abstract_type not in self._NUMERIC_TYPES:
                    errors.append(
                        SemanticError(
                            code="NUM_CLASS_ON_NON_NUMERIC",
                            message=(
                                f"Column '{col_name}' in data object '{obj_name}' "
                                f"has numClass '{col.num_class}' but abstractType "
                                # Use .value, as the timeGrain check does, so the
                                # message shows the abstractType value rather than
                                # a formatting-dependent enum rendering.
                                f"'{col.abstract_type.value}' is not numeric (int or float)"
                            ),
                            path=f"dataObjects.{obj_name}.columns.{col_name}",
                        )
                    )
        return errors

    def _check_measure_filter_refs(self, model: SemanticModel) -> list[SemanticError]:
        """Check the data objects and columns referenced by measure filters.

        Each top-level filter item of every measure is handed to
        ``_validate_filter_item``, which appends its findings to the shared list.
        """
        findings: list[SemanticError] = []
        for meas_name, measure in model.measures.items():
            for filter_item in measure.filters:
                self._validate_filter_item(filter_item, model, meas_name, findings)
        return findings

    def _validate_filter_item(
        self,
        item: MeasureFilterItem,
        model: SemanticModel,
        meas_name: str,
        errors: list[SemanticError],
    ) -> None:
        """Check one measure filter item, descending into filter groups.

        Findings are appended to ``errors`` in place; nothing is returned.
        A leaf filter with no column, or whose column reference has no
        ``view``, is skipped silently.
        """
        if not isinstance(item, MeasureFilter):
            # Groups carry nested items; any other kind of item is ignored.
            if isinstance(item, MeasureFilterGroup):
                for nested in item.filters:
                    self._validate_filter_item(nested, model, meas_name, errors)
            return

        ref = item.column
        if not ref or not ref.view:
            return

        location = f"measures.{meas_name}.filters"
        owner = model.data_objects.get(ref.view)
        if not owner:
            errors.append(
                SemanticError(
                    code="UNKNOWN_FILTER_DATA_OBJECT",
                    message=(
                        f"Measure '{meas_name}' filter references unknown "
                        f"data object '{ref.view}'"
                    ),
                    path=location,
                )
            )
            return

        if ref.column and ref.column not in owner.columns:
            errors.append(
                SemanticError(
                    code="UNKNOWN_FILTER_COLUMN",
                    message=(
                        f"Measure '{meas_name}' filter references unknown "
                        f"column '{ref.column}' in '{ref.view}'"
                    ),
                    path=location,
                )
            )

    def _build_directed_graph(self, model: SemanticModel) -> nx.DiGraph[str]:
        """Return a directed graph of the model's primary (non-secondary) joins.

        Every data object becomes a node. An edge source → target is added for
        each non-secondary join whose target is itself a data object.
        """
        graph: nx.DiGraph[str] = nx.DiGraph()
        graph.add_nodes_from(model.data_objects)
        graph.add_edges_from(
            (obj_name, join.join_to)
            for obj_name, obj in model.data_objects.items()
            for join in obj.joins
            if not join.secondary and join.join_to in model.data_objects
        )
        return graph

    def _check_via_reachability(self, model: SemanticModel) -> list[SemanticError]:
        """Check that a dimension's data object can be reached from its ``via``.

        Only dimensions that set ``via`` are examined. A ``via`` naming an
        unknown data object is reported; otherwise the dimension's data object
        must equal ``via`` or be a descendant of it in the primary-join graph.
        Both problems use the code ``INVALID_VIA_DATA_OBJECT``.
        """
        findings: list[SemanticError] = []
        via_dims = {name: dim for name, dim in model.dimensions.items() if dim.via}
        if not via_dims:
            return findings

        graph = self._build_directed_graph(model)
        for name, dim in via_dims.items():
            via, target = dim.via, dim.view
            location = f"dimensions.{name}"

            if via not in model.data_objects:
                findings.append(
                    SemanticError(
                        code="INVALID_VIA_DATA_OBJECT",
                        message=(
                            f"Dimension '{name}': via references unknown data object '{via}'"
                        ),
                        path=location,
                    )
                )
                continue

            if via == target:
                continue

            reachable = nx.descendants(graph, via) if via in graph else set()
            if target in reachable:
                continue

            findings.append(
                SemanticError(
                    code="INVALID_VIA_DATA_OBJECT",
                    message=(
                        f"Dimension '{name}': data object '{target}' is not "
                        f"reachable from via data object '{via}'"
                    ),
                    path=location,
                )
            )
        return findings

    def _check_missing_via(self, model: SemanticModel) -> list[SemanticError]:
        """Warn when several fact tables join directly to a dimension's target.

        A fact table is any data object referenced by a measure column. Only
        one-hop primary joins from a fact table to the dimension's data object
        count; transitive reachability through other fact tables does not
        create real ambiguity. Nothing is emitted when fewer than two fact
        tables exist. A dimension is skipped when it:

        - already declares ``via``;
        - has no data object, or one that is not in the join graph;
        - lives on a fact table itself (e.g. Sales Date on Sales);
        - is reached by at most one fact table; or
        - is path-invariant: every reaching fact joins the target on its
          primary key, so the attribute value is the same whichever fact
          drove the join. Role-playing semantics (Sales Year Month vs
          Purchase Year Month) remain the modeller's choice via explicit
          ``via:`` and are not flagged for every shared dimension table.

        Findings carry ``severity="warning"`` and the code ``MISSING_VIA``.
        """
        findings: list[SemanticError] = []

        fact_sources: set[str] = {
            ref.view
            for measure in model.measures.values()
            for ref in measure.columns
            if ref.view
        }
        if len(fact_sources) < 2:
            return findings

        graph = self._build_directed_graph(model)
        fact_tables = sorted(fact_sources & set(graph.nodes))
        children_of: dict[str, set[str]] = {
            fact: set(graph.successors(fact)) for fact in fact_tables
        }

        for dim_name, dim in model.dimensions.items():
            target = dim.view
            if dim.via or not target or target not in graph or target in fact_sources:
                continue

            reaching = [fact for fact in fact_tables if target in children_of[fact]]
            if len(reaching) <= 1:
                continue
            if self._is_path_invariant(model, target, reaching):
                continue

            findings.append(
                SemanticError(
                    code="MISSING_VIA",
                    message=(
                        f"Dimension '{dim_name}' on '{target}' has direct "
                        f"joins from multiple fact tables "
                        f"({', '.join(reaching)}). "
                        f"Consider adding role-playing dimensions with 'via' "
                        f"to disambiguate join paths."
                    ),
                    path=f"dimensions.{dim_name}",
                    severity="warning",
                )
            )
        return findings

    @staticmethod
    def _is_path_invariant(model: SemanticModel, target: str, reaching_facts: list[str]) -> bool:
        """Tell whether every reaching fact joins ``target`` on its primary key.

        When that holds, the same key value from any fact resolves to the same
        target row, so the dimension attribute does not depend on which fact
        drove the join and there is no ambiguity to warn about. Joins on
        non-PK columns can resolve to different rows and therefore fail the test.

        Returns ``False`` when the target is unknown or has no primary-key
        columns, or when any reaching fact is unknown, has no join to the
        target, or has a join whose target-side columns are empty or include
        a non-PK column.
        """
        target_obj = model.data_objects.get(target)
        if target_obj is None:
            return False

        pk_cols = {name for name, col in target_obj.columns.items() if col.primary_key}
        if not pk_cols:
            return False

        def joins_only_on_pk(fact_name: str) -> bool:
            fact = model.data_objects.get(fact_name)
            if fact is None:
                return False
            links = [join for join in fact.joins if join.join_to == target]
            if not links:
                return False
            # Every column on the target side of each join must be a PK column.
            return all(
                link.columns_to and all(col in pk_cols for col in link.columns_to)
                for link in links
            )

        return all(joins_only_on_pk(fact_name) for fact_name in reaching_facts)

validate(model)

Source code in src/orionbelt/parser/validator.py
def validate(self, model: SemanticModel) -> list[SemanticError]:
    """Run every semantic check against ``model`` and collect the findings.

    The checks run in the fixed order listed below and their results are
    concatenated in that same order.
    """
    checks = (
        self._check_unique_identifiers,
        self._check_unique_column_names,
        self._check_secondary_joins,
        self._check_no_cyclic_joins,
        self._check_no_multipath_joins,
        self._check_measures_resolve,
        self._check_join_targets_exist,
        self._check_references_resolve,
        self._check_num_class_on_numeric_columns,
        self._check_time_grain_on_temporal_columns,
        self._check_measure_filter_refs,
        self._check_via_reachability,
        self._check_missing_via,
    )
    findings: list[SemanticError] = []
    for check in checks:
        findings.extend(check(model))
    return findings

Semantic Model

orionbelt.models.semantic.SemanticModel

Bases: BaseModel

Complete semantic model parsed from OBML YAML.

Source code in src/orionbelt/models/semantic.py
class SemanticModel(BaseModel):
    """Root object of a semantic model parsed from an OBML YAML document.

    Groups the data objects, dimensions, measures, metrics and model-level
    metadata. Unknown keys are rejected (``extra="forbid"``), and aliased
    fields may be populated by either their Python name or their alias.
    """

    version: float = 1.0
    name: str | None = Field(
        default=None,
        description=(
            "Optional addressing identifier for multi-model mode (v2.4.0+). "
            "When unset, the multi-model loader uses the filename stem. "
            "After normalization (lowercase + spaces/dots/dashes → "
            "underscores + trim) must match ``^[a-z][a-z0-9_]{0,62}$``. "
            "BI tools select this model via the Flight `database` catalog "
            "or pgwire `database=` URL parameter."
        ),
    )
    description: str | None = None
    settings: ModelSettings | None = None
    data_objects: dict[str, DataObject] = Field(default={}, alias="dataObjects")
    dimensions: dict[str, Dimension] = {}
    measures: dict[str, Measure] = {}
    metrics: dict[str, Metric] = {}
    filters: list[ModelFilter] = Field(default_factory=list)
    examples: list[ModelExample] = Field(default_factory=list)
    extends_sources: list[str] = Field(default_factory=list)
    inherits_source: str | None = None
    owner: str | None = None
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @field_validator("name", mode="before")
    @classmethod
    def _validate_name(cls, v: str | None) -> str | None:
        """Normalize the optional model ``name`` or reject it at parse time.

        ``None`` and blank (empty or whitespace-only) strings both yield
        ``None``, so an empty ``name:`` in YAML behaves like an omitted one.
        Any other string goes through ``normalize_model_name`` and the
        normalized value is what gets stored on the model.

        Raises:
            ValueError: If ``v`` is not a string, or if normalization rejects
                it (the ``ModelNameError`` message is re-raised as a
                ``ValueError`` so pydantic reports it as a validation error).
        """
        if v is None:
            return None
        if not isinstance(v, str):
            raise ValueError("name must be a string")
        if v.strip() == "":
            return None

        # Imported at call time, and only once a real name has to be checked.
        from orionbelt.models.identifiers import ModelNameError, normalize_model_name

        try:
            normalized = normalize_model_name(v, source="OBML `name:` field")
        except ModelNameError as exc:
            # Drop the original traceback context; only the message matters here.
            raise ValueError(str(exc)) from None
        return normalized

orionbelt.models.semantic.DataObject

Bases: BaseModel

A database table or view with its columns and joins.

Source code in src/orionbelt/models/semantic.py
class DataObject(BaseModel):
    """Description of one database table or view: its location, columns and joins.

    Unknown keys are rejected (``extra="forbid"``), and aliased fields may be
    populated by either their Python name or their alias.
    """

    label: str
    code: str
    database: str
    schema_name: str = Field(alias="schema")
    columns: dict[str, DataObjectColumn] = {}
    joins: list[DataObjectJoin] = []
    description: str | None = None
    comment: str | None = None
    owner: str | None = None
    synonyms: list[str] = Field(default_factory=list)
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")
    refresh: RefreshPolicy | None = Field(
        default=None,
        description=(
            "Optional freshness contract for the physical table this dataObject maps to. "
            "Drives result-cache TTL composition. PLAN_freshness_driven_cache.md §5."
        ),
    )

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @property
    def qualified_code(self) -> str:
        """Dotted table reference built as ``database.schema.code``."""
        database, schema, code = self.database, self.schema_name, self.code
        return f"{database}.{schema}.{code}"

qualified_code property

Full qualified table reference: database.schema.code.

orionbelt.models.semantic.Dimension

Bases: BaseModel

A named dimension referencing a data object column.

Source code in src/orionbelt/models/semantic.py
class Dimension(BaseModel):
    """A named dimension referencing a data object column.

    Only ``label`` and the data object reference are required; every other
    field has a default. Unknown keys are rejected (``extra="forbid"``), and
    aliased fields accept either the Python name or the alias.
    """

    label: str
    # Read from the ``dataObject`` key (alias); required.
    view: str = Field(alias="dataObject")
    # Defaults to an empty string when no column is given.
    column: str = ""
    # Defaults to ``DataType.STRING``; alias ``resultType``.
    result_type: DataType = Field(DataType.STRING, alias="resultType")
    # Optional; alias ``timeGrain``.
    time_grain: TimeGrain | None = Field(None, alias="timeGrain")
    description: str | None = None
    format: str | None = None
    via: str | None = None
    owner: str | None = None
    # ``default_factory=list`` gives each instance its own empty list.
    synonyms: list[str] = Field(default_factory=list)
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")

    model_config = {"populate_by_name": True, "extra": "forbid"}

orionbelt.models.semantic.Measure

Bases: BaseModel

An aggregation measure with optional expression template.

Source code in src/orionbelt/models/semantic.py
class Measure(BaseModel):
    """Aggregated measure defined over columns or over an expression template.

    Beyond the field declarations, the validators below normalize the
    aggregation name and reject combinations that cannot be compiled:
    ``total`` together with ``grain``, engine-delegated measures that carry
    their own inputs, and statistical aggregates with the wrong column count.
    """

    label: str
    columns: list[DataColumnRef] = []
    result_type: DataType = Field(DataType.FLOAT, alias="resultType")
    aggregation: AggregationType
    expression: str | None = None
    distinct: bool = False
    total: bool = False
    grain: GrainOverride | None = None
    filter_context: FilterContext | None = Field(None, alias="filterContext")
    filters: list[MeasureFilterItem] = []
    data_type: str | None = Field(None, alias="dataType")
    description: str | None = None
    format: str | None = None
    allow_fan_out: bool = Field(False, alias="allowFanOut")
    delimiter: str | None = None
    within_group: WithinGroup | None = Field(None, alias="withinGroup")
    owner: str | None = None
    synonyms: list[str] = Field(default_factory=list)
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @field_validator("aggregation", mode="before")
    @classmethod
    def _normalize_aggregation(cls, v: object) -> object:
        """Canonicalize the aggregation name before enum validation.

        String input is lowercased, so ``SUM`` / ``Sum`` / ``sum`` all reach
        the enum in its lowercase spelling. ``agg`` and ``aggregate`` (any
        casing) are rewritten to ``measure``. Non-string input is returned
        untouched for the enum validation to deal with.
        """
        if not isinstance(v, str):
            return v
        name = v.lower()
        return "measure" if name in ("agg", "aggregate") else name

    @field_validator("data_type", mode="before")
    @classmethod
    def _validate_data_type(cls, v: str | None) -> str | None:
        """Check that ``dataType`` parses; the value itself is kept as given."""
        if v is None:
            return v
        # Called for validation only: the parsed result is discarded, and any
        # error raised by ``parse_data_type`` propagates to pydantic.
        parse_data_type(v)
        return v

    @model_validator(mode="after")
    def _validate_total_grain_exclusion(self) -> Measure:
        """Reject a measure that sets both ``total: true`` and ``grain``."""
        has_grain = self.grain is not None
        if has_grain and self.total:
            raise ValueError("'total: true' and 'grain' are mutually exclusive")
        return self

    @model_validator(mode="after")
    def _validate_measure_delegation(self) -> Measure:
        """Keep ``aggregation: measure`` declarations free of local inputs.

        With ``aggregation: measure`` the engine's metric-view resolver looks
        the measure up by name, so there is no source column for OBSL to read.
        ``columns:``, ``expression:``, ``filters:`` and ``total: true`` are
        therefore rejected at model-load time, in that order, instead of
        being silently ignored in the emitted SQL.
        """
        if self.aggregation != AggregationType.MEASURE:
            return self

        delegated = (
            "aggregation: measure delegates resolution to the engine "
            "(Databricks Metric View); "
        )
        by_label = "The engine resolves the measure by its OBML label."

        if self.columns:
            raise ValueError(f"{delegated}'columns:' must be omitted. {by_label}")
        if self.expression is not None:
            raise ValueError(f"{delegated}'expression:' must be omitted. {by_label}")
        if self.filters:
            raise ValueError(
                f"{delegated}'filters:' is not applicable. "
                "Define the filter inside the metric view itself."
            )
        if self.total:
            raise ValueError(
                "aggregation: measure cannot be combined with 'total: true' "
                "(OBSL cannot wrap the engine-resolved aggregation in "
                "a window function — define the total at the metric-view level)."
            )
        return self

    @model_validator(mode="after")
    def _validate_statistical_aggregation_arity(self) -> Measure:
        """Enforce the argument count of statistical aggregates.

        * Two-column aggregates (``corr``, ``covar_*``, ``regr_*``) need
          exactly two ``columns`` entries and may **not** use
          ``expression:``. A single expression collapses to one scalar
          argument and would yield invalid SQL such as ``CORR((a + b))``
          rather than ``CORR(a, b)``. For per-argument transformations,
          define computed columns on the data object and list them in
          ``columns:``.
        * Single-column statistical aggregates (``stddev``, ``stddev_pop``,
          ``variance``, ``var_pop``) need exactly one column, unless they use
          ``expression:``, which is valid (``STDDEV(<scalar expression>)``).

        When ``expression`` is set, no column-count check is performed.
        """
        agg = self.aggregation.lower()
        takes_two = agg in TWO_COLUMN_AGGREGATIONS

        if self.expression is not None:
            if takes_two:
                raise ValueError(
                    f"Aggregation '{agg}' requires exactly 2 columns and cannot be "
                    "combined with 'expression:'. Use the 'columns:' list with two "
                    "entries (define computed columns on the data object if you need "
                    "per-argument transformations) so the aggregate's argument order "
                    "is explicit."
                )
            return self

        column_count = len(self.columns)
        if takes_two and column_count != 2:
            raise ValueError(f"Aggregation '{agg}' requires exactly 2 columns, got {column_count}")
        if agg in SINGLE_COLUMN_STATISTICAL_AGGREGATIONS and column_count != 1:
            raise ValueError(f"Aggregation '{agg}' requires exactly 1 column, got {column_count}")
        return self

orionbelt.models.semantic.Metric

Bases: BaseModel

A metric: derived expression, cumulative window, or period-over-period comparison.

Derived (default): references measures by name using {[Measure Name]} syntax. Cumulative: applies a window function to an existing measure, ordered by a time dimension. Supports running totals, rolling windows, and grain-to-date resets. Period-over-Period: compares a measure's value against a prior time period using a synthetic date spine. Supports ratio, difference, previous value, and percent change.

Source code in src/orionbelt/models/semantic.py
class Metric(BaseModel):
    """A metric: derived expression, cumulative window, or period-over-period comparison.

    **Derived** (default): references measures by name using ``{[Measure Name]}`` syntax.
    **Cumulative**: applies a window function to an existing measure, ordered by a time
    dimension.  Supports running totals, rolling windows, and grain-to-date resets.
    **Period-over-Period**: compares a measure's value against a prior time period using
    a synthetical date spine.  Supports ratio, difference, previous value, and percent change.
    """

    label: str
    type: MetricType = MetricType.DERIVED
    # Derived metrics
    expression: str | None = None
    # Cumulative metrics
    measure: str | None = None
    time_dimension: str | None = Field(None, alias="timeDimension")
    cumulative_type: CumulativeAggType = Field(CumulativeAggType.SUM, alias="cumulativeType")
    window: int | None = None
    grain_to_date: GrainToDate | None = Field(None, alias="grainToDate")
    # Per-dimension partitioning for cumulative + window metrics. Each entry
    # must be a model dimension reachable from the measure's source object.
    partition_by: list[str] = Field(default_factory=list, alias="partitionBy")
    # Period-over-Period metrics
    period_over_period: PeriodOverPeriod | None = Field(None, alias="periodOverPeriod")
    # Window metrics (rank / lag / lead / ntile / first_value / last_value)
    window_function: WindowFunctionKind | None = Field(None, alias="windowFunction")
    offset: int | None = None
    buckets: int | None = None
    order_direction: str = Field("desc", alias="orderDirection")
    default_value: str | int | float | bool | None = Field(None, alias="defaultValue")
    # Common
    data_type: str | None = Field(None, alias="dataType")
    description: str | None = None
    format: str | None = None
    owner: str | None = None
    synonyms: list[str] = Field(default_factory=list)
    custom_extensions: list[CustomExtension] = Field(default_factory=list, alias="customExtensions")

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @field_validator("data_type", mode="before")
    @classmethod
    def _validate_data_type(cls, v: str | None) -> str | None:
        if v is not None:
            parse_data_type(v)
        return v

    @model_validator(mode="after")
    def _validate_metric_type(self) -> Metric:
        if self.type == MetricType.DERIVED:
            if not self.expression:
                raise ValueError("Derived metrics require 'expression'")
            if self.partition_by:
                raise ValueError("Derived metrics must not have 'partitionBy'")
        elif self.type == MetricType.CUMULATIVE:
            if not self.measure:
                raise ValueError("Cumulative metrics require 'measure'")
            if not self.time_dimension:
                raise ValueError("Cumulative metrics require 'timeDimension'")
            if self.expression:
                raise ValueError("Cumulative metrics must not have 'expression'")
            if self.window is not None and self.grain_to_date is not None:
                raise ValueError("'window' and 'grainToDate' are mutually exclusive")
            if self.window is not None and self.window < 1:
                raise ValueError("'window' must be >= 1")
        elif self.type == MetricType.PERIOD_OVER_PERIOD:
            if not self.expression:
                raise ValueError("Period-over-period metrics require 'expression'")
            if not self.period_over_period:
                raise ValueError("Period-over-period metrics require 'periodOverPeriod'")
            if self.measure:
                raise ValueError(
                    "Period-over-period metrics must not have 'measure' "
                    "(use 'expression' to reference measures)"
                )
            if self.window is not None or self.grain_to_date is not None:
                raise ValueError(
                    "Period-over-period metrics must not have 'window' or 'grainToDate'"
                )
            if self.partition_by:
                raise ValueError("Period-over-period metrics must not have 'partitionBy'")
        elif self.type == MetricType.WINDOW:
            if self.window_function is None:
                raise ValueError("Window metrics require 'windowFunction'")
            if not self.measure and self.window_function not in {
                WindowFunctionKind.ROW_NUMBER,
                WindowFunctionKind.NTILE,
            }:
                # row_number / ntile can rank without an explicit measure, falling back
                # to ordering on the time dimension. All other window functions take
                # the measure as their argument or ORDER BY input.
                raise ValueError(
                    f"Window metric with function '{self.window_function.value}' requires 'measure'"
                )
            if self.expression:
                raise ValueError("Window metrics must not have 'expression'")
            if self.window is not None or self.grain_to_date is not None:
                raise ValueError("Window metrics must not have 'window' or 'grainToDate'")
            if self.window_function in {WindowFunctionKind.LAG, WindowFunctionKind.LEAD}:
                if self.offset is None or self.offset < 1:
                    raise ValueError(
                        f"Window metric with function '{self.window_function.value}' "
                        f"requires positive 'offset'"
                    )
                if not self.time_dimension:
                    raise ValueError(
                        f"Window metric with function '{self.window_function.value}' "
                        f"requires 'timeDimension'"
                    )
            if self.window_function == WindowFunctionKind.NTILE and (
                self.buckets is None or self.buckets < 2
            ):
                raise ValueError("Window metric with function 'ntile' requires 'buckets' >= 2")
            if self.order_direction.lower() not in {"asc", "desc"}:
                raise ValueError("'orderDirection' must be 'asc' or 'desc'")
        return self

Query Models

orionbelt.models.query.QueryObject

Bases: BaseModel

A complete YAML analytical query.

Source code in src/orionbelt/models/query.py
class QueryObject(BaseModel):
    """Top-level analytical query as written in YAML.

    Combines the select clause with filters, ordering, paging, named join
    path choices and an optional hierarchical grouping modifier. Unknown
    keys are rejected (``extra="forbid"``).
    """

    select: QuerySelect
    where: list[QueryFilterItem] = []
    having: list[QueryFilterItem] = []
    order_by: list[QueryOrderBy] = Field([], alias="order_by")
    limit: int | None = None
    offset: int | None = None
    use_path_names: list[UsePathName] = Field([], alias="usePathNames")
    dimensions_exclude: bool = Field(False, alias="dimensionsExclude")
    grouping: Grouping | None = Field(
        default=None,
        description=(
            "Hierarchical grouping modifier. 'rollup' emits GROUP BY ROLLUP(...) "
            "for hierarchical subtotals + grand total. 'cube' emits GROUP BY CUBE(...) "
            "for the full cross-tab. Adds one GROUPING(dim) AS _g_<dim> column per "
            "selected dimension so callers can distinguish subtotal/grand-total rows."
        ),
    )

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @model_validator(mode="after")
    def _validate_grouping(self) -> QueryObject:
        """Require aggregate mode and at least one dimension when ``grouping`` is set.

        Raises:
            ValueError: If ``grouping`` is combined with raw mode, or with an
                empty ``select.dimensions``.
        """
        if self.grouping is not None:
            if self.select.is_raw:
                raise ValueError(
                    "select.fields (raw mode) cannot be combined with grouping (rollup/cube)"
                )
            if not self.select.dimensions:
                raise ValueError(
                    "grouping (rollup/cube) requires at least one dimension in select.dimensions"
                )
        return self

    @model_validator(mode="after")
    def _validate_raw_mode_exclusivity(self) -> QueryObject:
        """Keep raw mode (``select.fields``) and aggregate features apart.

        In raw mode, ``select.dimensions``, ``select.measures``, ``having``
        and ``dimensionsExclude`` are rejected, checked in that order.
        Outside raw mode, ``select.distinct`` is rejected. Failing early lets
        the resolver assume a clean shape.
        """
        selection = self.select
        if not selection.is_raw:
            if selection.distinct:
                raise ValueError("select.distinct is only valid in raw mode (with select.fields)")
            return self

        conflicts = (
            (selection.dimensions, "select.dimensions"),
            (selection.measures, "select.measures"),
            (self.having, "having"),
            (self.dimensions_exclude, "dimensionsExclude"),
        )
        for present, feature in conflicts:
            if present:
                raise ValueError(f"select.fields (raw mode) cannot be combined with {feature}")
        return self

orionbelt.models.query.QuerySelect

Bases: BaseModel

The SELECT part of a query.

Two mutually exclusive modes:

  • Aggregate mode (default): dimensions + measures produce a grouped, aggregated result (GROUP BY dimensions, aggregate measures).
  • Raw mode: fields returns un-aggregated rows from one or more data objects joined per the model. Set distinct: true for SELECT DISTINCT. Raw mode rejects dimensions, measures, metrics, and HAVING.
Source code in src/orionbelt/models/query.py
class QuerySelect(BaseModel):
    """The SELECT part of a query.

    Two mutually exclusive modes:

    * **Aggregate mode** (default): ``dimensions`` + ``measures`` produce a
      grouped, aggregated result (GROUP BY dimensions, aggregate measures).
    * **Raw mode**: ``fields`` returns un-aggregated rows from one or more
      data objects joined per the model. Set ``distinct: true`` for
      ``SELECT DISTINCT``. Raw mode rejects ``dimensions``, ``measures``,
      ``metrics``, and ``HAVING``.

    This class only declares the fields and exposes ``is_raw``; no
    cross-field check is defined here.
    """

    # Each entry is either a plain string or a ``CoalesceDimension``.
    dimensions: list[str | CoalesceDimension] = []
    measures: list[str] = []
    # A non-empty list switches the select into raw mode (see ``is_raw``).
    fields: list[str] = []
    distinct: bool = False

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @property
    def is_raw(self) -> bool:
        """True when this select is in raw mode (fields-based projection).

        Raw mode is detected purely by ``fields`` being non-empty.
        """
        return bool(self.fields)

is_raw property

True when this select is in raw mode (fields-based projection).

orionbelt.models.query.QueryFilter

Bases: BaseModel

A filter condition in a query.

Source code in src/orionbelt/models/query.py
class QueryFilter(BaseModel):
    """A single filter condition in a query.

    ``value`` carries the operand for ordinary operators; ``subquery``
    carries it for ``exists`` / ``nonexists``. The two are mutually
    exclusive, which ``_validate_subquery_exclusivity`` enforces.
    """

    field: str
    op: FilterOperator
    value: Any = None
    subquery: Subquery | None = None

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @field_validator("value", mode="before")
    @classmethod
    def _validate_filter_value(cls, v: Any) -> Any:
        """Accept scalars, lists of scalars and string-keyed dicts; reject the rest.

        Dicts are allowed for RELATIVE filters, which use
        ``{unit, count, direction}`` objects. ``date`` / ``datetime`` values,
        bare or inside a list, are coerced to ISO-8601 strings.

        Raises:
            ValueError: For any other shape, including a list that contains a
                non-scalar item.
        """
        if v is None:
            return v
        # ``datetime`` is a subclass of ``date``, so one check covers both and
        # ``isoformat()`` dispatches to the right implementation.
        if isinstance(v, date):
            return v.isoformat()
        if isinstance(v, (str, int, float, bool)):
            return v
        if isinstance(v, list):
            coerced = [i.isoformat() if isinstance(i, (date, datetime)) else i for i in v]
            if all(isinstance(i, (str, int, float, bool)) for i in coerced):
                return coerced
        if isinstance(v, dict) and all(isinstance(k, str) for k in v):
            return v
        msg = "Filter value must be a scalar, list of scalars, or object"
        raise ValueError(msg)

    @model_validator(mode="after")
    def _validate_subquery_exclusivity(self) -> QueryFilter:
        """``exists`` / ``nonexists`` require ``subquery`` (and reject ``value``).

        All other operators reject ``subquery`` — the payload would be silently
        ignored, which would mask typos.

        Raises:
            ValueError: When the operator and the supplied payload do not match.
        """
        # Interpolate the operator's value, not the enum member: since Python
        # 3.11 a ``(str, Enum)`` member formats as ``ClassName.MEMBER`` in an
        # f-string, which would leak into the message. ``.value`` gives the
        # same spelling for ``StrEnum`` and mix-in enums alike.
        op_name = self.op.value
        is_subquery_op = self.op in (FilterOperator.EXISTS, FilterOperator.NONEXISTS)
        if is_subquery_op:
            if self.subquery is None:
                raise ValueError(
                    f"Operator '{op_name}' requires a 'subquery' object with 'dataObject'"
                )
            if self.value is not None:
                raise ValueError(f"Operator '{op_name}' takes 'subquery', not 'value' / 'values'")
        elif self.subquery is not None:
            raise ValueError(
                f"Operator '{op_name}' does not accept 'subquery' — use 'exists' or 'nonexists'"
            )
        return self

orionbelt.models.query.UsePathName

Bases: BaseModel

Selects a named secondary join path for a specific (source, target) pair.

Source code in src/orionbelt/models/query.py
class UsePathName(BaseModel):
    """Selects a named secondary join path for a specific (source, target) pair.

    All three fields are required. Unknown keys are rejected
    (``extra="forbid"``).
    """

    source: str
    target: str
    # Read from the ``pathName`` key (alias); the Python name is accepted too
    # because ``populate_by_name`` is enabled.
    path_name: str = Field(alias="pathName")

    model_config = {"populate_by_name": True, "extra": "forbid"}

orionbelt.models.query.DimensionRef

Bases: BaseModel

Reference to a dimension, optionally with time grain.

Supports notation like "customer.country" or "order.order_date:month".

Source code in src/orionbelt/models/query.py
class DimensionRef(BaseModel):
    """A dimension reference with an optional time grain.

    The textual form is ``name`` or ``name:grain``, for example
    "customer.country" or "order.order_date:month".
    """

    name: str
    grain: TimeGrain | None = None

    model_config = {"populate_by_name": True, "extra": "forbid"}

    @classmethod
    def parse(cls, raw: str) -> DimensionRef:
        """Build a reference from 'name' or 'name:grain' notation.

        The split happens at the last ``:`` only, so any earlier colons stay
        part of the name. The text after it is passed to ``TimeGrain``.
        """
        if ":" not in raw:
            return cls(name=raw)
        name, _, grain_str = raw.rpartition(":")
        return cls(name=name, grain=TimeGrain(grain_str))

parse(raw) classmethod

Parse 'name:grain' notation.

Source code in src/orionbelt/models/query.py
@classmethod
def parse(cls, raw: str) -> DimensionRef:
    """Build a reference from 'name' or 'name:grain' notation.

    The split happens at the last ``:`` only, so any earlier colons stay
    part of the name. The text after it is passed to ``TimeGrain``.
    """
    if ":" not in raw:
        return cls(name=raw)
    name, _, grain_str = raw.rpartition(":")
    return cls(name=name, grain=TimeGrain(grain_str))

Error Models

orionbelt.models.errors.SemanticError

Bases: BaseModel

A structured error or warning with optional source position and remediation.

Used uniformly for errors (severity="error") and warnings (severity="warning"). See models/warnings.py for the stable warning code taxonomy.

Source code in src/orionbelt/models/errors.py
class SemanticError(BaseModel):
    """A structured error or warning with optional source position and remediation.

    Used uniformly for errors (``severity="error"``) and warnings (``severity="warning"``).
    See ``models/warnings.py`` for the stable warning code taxonomy.

    Only ``code`` and ``message`` are required; every other field has a default.
    """

    code: str
    message: str
    # NOTE(review): presumably the location of the offending element within
    # the model document — confirm against the code that sets it.
    path: str | None = None
    # Optional position in the YAML source.
    span: SourceSpan | None = None
    # ``default_factory=list`` gives each instance its own empty list.
    suggestions: list[str] = Field(default_factory=list)
    # Plain string, defaulting to "error"; the class docstring names
    # "warning" as the other value in use.
    severity: str = "error"
    hint: str | None = Field(
        default=None,
        description="Optional remediation suggestion (single sentence)",
    )
    context: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Optional structured detail (e.g. which measure / dataObject / column) so "
            "agents can branch on the data without parsing the message."
        ),
    )

orionbelt.models.errors.ValidationResult

Bases: BaseModel

Result of semantic model validation.

Source code in src/orionbelt/models/errors.py
class ValidationResult(BaseModel):
    """Result of semantic model validation.

    ``valid`` is required and set by the caller; this class does not derive
    it from ``errors``. Errors and warnings are kept in separate lists, both
    empty by default.
    """

    valid: bool
    # ``default_factory=list`` gives each instance its own empty list.
    errors: list[SemanticError] = Field(default_factory=list)
    warnings: list[SemanticError] = Field(default_factory=list)

orionbelt.models.errors.SourceSpan

Bases: BaseModel

Points to exact location in YAML source for error reporting.

Source code in src/orionbelt/models/errors.py
class SourceSpan(BaseModel):
    """Points to exact location in YAML source for error reporting.

    The start position (``file``, ``line``, ``column``) is required; the end
    position is optional.
    """

    file: str
    # NOTE(review): whether line/column are 0- or 1-based is not visible
    # here — confirm against the code that builds spans.
    line: int
    column: int
    # End of the span; None when not given.
    end_line: int | None = None
    end_column: int | None = None

SQL AST Nodes

orionbelt.ast.nodes.Select dataclass

A complete SELECT statement.

Source code in src/orionbelt/ast/nodes.py
@dataclass(frozen=True)
class Select:
    """A complete SELECT statement.

    Frozen dataclass: attributes cannot be reassigned after construction.
    Every field has a default, so ``Select()`` is a valid empty statement
    node.
    """

    # List-valued fields use ``default_factory`` so instances never share a list.
    columns: list[Expr] = field(default_factory=list)
    # FROM source; None until one is set.
    from_: From | None = None
    joins: list[Join] = field(default_factory=list)
    # WHERE / HAVING each hold a single expression, or None when absent.
    where: Expr | None = None
    group_by: list[Expr] = field(default_factory=list)
    having: Expr | None = None
    order_by: list[OrderByItem] = field(default_factory=list)
    limit: int | None = None
    offset: int | None = None
    ctes: list[CTE] = field(default_factory=list)
    distinct: bool = False
    grouping: str | None = None
    """Hierarchical grouping modifier: 'rollup' or 'cube'.

    When set, the dialect emits ``GROUP BY ROLLUP(...)`` / ``GROUP BY CUBE(...)``
    (or ClickHouse-style ``GROUP BY ... WITH ROLLUP``) instead of plain
    ``GROUP BY``. The planner is responsible for appending the
    ``GROUPING(dim) AS _g_<dim>`` columns to the SELECT projection."""

grouping = None class-attribute instance-attribute

Hierarchical grouping modifier: 'rollup' or 'cube'.

When set, the dialect emits GROUP BY ROLLUP(...) / GROUP BY CUBE(...) (or ClickHouse-style GROUP BY ... WITH ROLLUP) instead of plain GROUP BY. The planner is responsible for appending the GROUPING(dim) AS _g_<dim> columns to the SELECT projection.

orionbelt.ast.nodes.ColumnRef dataclass

Reference to a column, optionally qualified by table/alias.

Source code in src/orionbelt/ast/nodes.py
@dataclass(frozen=True)
class ColumnRef:
    """Reference to a column, optionally qualified by table/alias.

    Frozen dataclass: attributes cannot be reassigned after construction.
    """

    name: str
    # Table name or alias qualifying the column; None for an unqualified reference.
    table: str | None = None

orionbelt.ast.nodes.FunctionCall dataclass

SQL function call, e.g. SUM(col), DATE_TRUNC('month', col).

Source code in src/orionbelt/ast/nodes.py
@dataclass(frozen=True)
class FunctionCall:
    """SQL function call, e.g. SUM(col), DATE_TRUNC('month', col).

    Frozen dataclass; only ``name`` is required.
    """

    name: str
    # Argument expressions; a fresh empty list per instance.
    args: list[Expr] = field(default_factory=list)
    distinct: bool = False
    # NOTE(review): ``order_by`` and ``separator`` presumably serve ordered /
    # delimited aggregates (string aggregation) — confirm against the
    # dialect renderers.
    order_by: list[OrderByItem] = field(default_factory=list)
    separator: str | None = None

orionbelt.ast.nodes.BinaryOp dataclass

Binary operation: left op right.

Source code in src/orionbelt/ast/nodes.py
@dataclass(frozen=True)
class BinaryOp:
    """Binary operation: left op right.

    Frozen dataclass; all three fields are required. The operator is stored
    as plain text.
    """

    left: Expr
    op: str  # +, -, *, /, =, <>, AND, OR, LIKE, etc.
    right: Expr

orionbelt.ast.nodes.Literal dataclass

A literal value: number, string, boolean, or NULL.

Source code in src/orionbelt/ast/nodes.py
@dataclass(frozen=True)
class Literal:
    """Literal SQL value node holding a number, string, boolean or ``None`` (NULL).

    Frozen dataclass. The named constructors below all build the same node
    type and exist only to make call sites self-describing.
    """

    value: str | int | float | bool | None

    @classmethod
    def string(cls, v: str) -> Literal:
        """Return a literal wrapping the string ``v``."""
        return cls(v)

    @classmethod
    def number(cls, v: int | float) -> Literal:
        """Return a literal wrapping the number ``v``."""
        return cls(v)

    @classmethod
    def null(cls) -> Literal:
        """Return the NULL literal (``value`` is ``None``)."""
        return cls(None)

    @classmethod
    def boolean(cls, v: bool) -> Literal:
        """Return a literal wrapping the boolean ``v``."""
        return cls(v)

AST Builder

orionbelt.ast.builder.QueryBuilder

Fluent builder for ergonomic AST construction.

Source code in src/orionbelt/ast/builder.py
class QueryBuilder:
    """Fluent builder for ergonomic AST construction."""

    # Every mutator returns ``self`` so calls can be chained; ``build()``
    # snapshots the accumulated state into a ``Select`` node.

    def __init__(self) -> None:
        self._columns: list[Expr] = []
        self._from: From | None = None
        self._joins: list[Join] = []
        self._where: Expr | None = None
        self._group_by: list[Expr] = []
        self._having: Expr | None = None
        self._order_by: list[OrderByItem] = []
        self._limit: int | None = None
        self._offset: int | None = None
        self._ctes: list[CTE] = []
        self._distinct: bool = False
        self._grouping: str | None = None

    def select(self, *columns: Expr) -> Self:
        """Append one or more expressions to the SELECT projection."""
        self._columns.extend(columns)
        return self

    def select_aliased(self, expr: Expr, alias: str) -> Self:
        """Append ``expr`` to the projection wrapped in an ``AliasedExpr``."""
        self._columns.append(AliasedExpr(expr=expr, alias=alias))
        return self

    def from_(self, table: str, alias: str | None = None) -> Self:
        """Set the FROM source to a named table, replacing any previous source."""
        self._from = From(source=table, alias=alias)
        return self

    def from_subquery(self, subquery: Select, alias: str) -> Self:
        """Set the FROM source to a subquery, replacing any previous source."""
        self._from = From(source=subquery, alias=alias)
        return self

    def join(
        self,
        table: str,
        on: Expr,
        join_type: JoinType = JoinType.LEFT,
        alias: str | None = None,
    ) -> Self:
        """Append a join on ``table`` with condition ``on`` (LEFT join by default)."""
        self._joins.append(Join(join_type=join_type, source=table, alias=alias, on=on))
        return self

    def where(self, condition: Expr) -> Self:
        """Add a WHERE condition; repeated calls are combined with AND."""
        if self._where is None:
            self._where = condition
        else:
            self._where = BinaryOp(left=self._where, op="AND", right=condition)
        return self

    def group_by(self, *exprs: Expr) -> Self:
        """Append one or more expressions to the GROUP BY list."""
        self._group_by.extend(exprs)
        return self

    def having(self, condition: Expr) -> Self:
        """Add a HAVING condition; repeated calls are combined with AND."""
        if self._having is None:
            self._having = condition
        else:
            self._having = BinaryOp(left=self._having, op="AND", right=condition)
        return self

    def order_by(self, expr: Expr, desc: bool = False, nulls_last: bool | None = None) -> Self:
        """Append an ``OrderByItem`` for ``expr`` to the ORDER BY list."""
        self._order_by.append(OrderByItem(expr=expr, desc=desc, nulls_last=nulls_last))
        return self

    def limit(self, n: int) -> Self:
        """Set the row limit, overwriting any earlier value."""
        self._limit = n
        return self

    def offset(self, n: int) -> Self:
        """Set the row offset, overwriting any earlier value."""
        self._offset = n
        return self

    def with_cte(self, name: str, query: Select | UnionAll | Except | RawSQL) -> Self:
        """Append a named CTE wrapping ``query``."""
        self._ctes.append(CTE(name=name, query=query))
        return self

    def distinct(self, value: bool = True) -> Self:
        """Set the DISTINCT flag (pass ``False`` to clear it)."""
        self._distinct = value
        return self

    def grouping(self, mode: str | None) -> Self:
        """Set the hierarchical grouping modifier ('rollup' or 'cube')."""
        self._grouping = mode
        return self

    def build(self) -> Select:
        """Return a ``Select`` node holding the state accumulated so far.

        The list-valued fields are shallow-copied so that the returned node
        does not share list objects with the builder: continuing to call
        mutators on this builder (or building again) cannot alter a
        ``Select`` that was already handed out.
        """
        return Select(
            columns=list(self._columns),
            from_=self._from,
            joins=list(self._joins),
            where=self._where,
            group_by=list(self._group_by),
            having=self._having,
            order_by=list(self._order_by),
            limit=self._limit,
            offset=self._offset,
            ctes=list(self._ctes),
            distinct=self._distinct,
            grouping=self._grouping,
        )

grouping(mode)

Set the hierarchical grouping modifier ('rollup' or 'cube').

Source code in src/orionbelt/ast/builder.py
def grouping(self, mode: str | None) -> Self:
    """Set the hierarchical grouping modifier ('rollup' or 'cube')."""
    # The value is stored as given (no validation here); None is the
    # builder's initial state, so passing None resets the modifier.
    self._grouping = mode
    # Return the builder itself so calls can be chained.
    return self

API Schemas

orionbelt.api.schemas

API request/response Pydantic schemas.

SessionCreateRequest

Bases: BaseModel

Request body for POST /sessions.

Source code in src/orionbelt/api/schemas.py
class SessionCreateRequest(BaseModel):
    """Request body for POST /sessions."""

    # String-to-string metadata supplied by the client; an empty dict when omitted.
    metadata: dict[str, str] = Field(default_factory=dict)

SessionResponse

Bases: BaseModel

Single session info.

Source code in src/orionbelt/api/schemas.py
class SessionResponse(BaseModel):
    """Single session info."""

    # Identifier of the session.
    session_id: str
    # Timestamps for creation and most recent access.
    created_at: datetime
    last_accessed_at: datetime
    # NOTE(review): presumably the number of models currently loaded in the session — confirm.
    model_count: int
    # String-to-string session metadata; an empty dict when none is set.
    metadata: dict[str, str] = Field(default_factory=dict)
    # Two independent deadlines: the idle one moves on access, the absolute one does not.
    expires_at: datetime = Field(description="Idle TTL deadline (refreshed on each access)")
    max_expires_at: datetime = Field(description="Absolute lifetime deadline (fixed at creation)")

SessionListResponse

Bases: BaseModel

Response for GET /sessions.

Source code in src/orionbelt/api/schemas.py
class SessionListResponse(BaseModel):
    """Response for GET /sessions."""

    # One SessionResponse per listed session; required (no default), may be an empty list.
    sessions: list[SessionResponse]

ModelLoadRequest

Bases: BaseModel

Request body for POST /sessions/{session_id}/models.

Source code in src/orionbelt/api/schemas.py
class ModelLoadRequest(BaseModel):
    """Request body for POST /sessions/{session_id}/models."""

    # The model is supplied either as YAML text or as JSON (object or string).
    model_yaml: str | None = Field(
        default=None,
        description="OBML model as YAML string (provide model_yaml OR model_json)",
        max_length=5_000_000,
    )
    # A str value is decoded by the validator below, so after validation this
    # field is always a dict or None.
    model_json: dict[str, object] | str | None = Field(
        default=None,
        description="OBML model as JSON object or JSON string (auto-parsed)",
    )
    extends: list[str] | None = Field(
        default=None,
        description="Optional inline YAML strings of analytical fragments to merge",
    )
    inherits: str | None = Field(
        default=None,
        description="Optional model ID of an already-loaded parent model in the session",
    )
    dedup: bool = Field(
        default=True,
        description=(
            "When True (default), identical OBML content already loaded in this session "
            "reuses the existing model_id (response.model_load == 'reused'). "
            "When False, always loads fresh."
        ),
    )

    @model_validator(mode="after")
    def _parse_model_json_string(self) -> ModelLoadRequest:
        """Decode ``model_json`` when the client sent it as a JSON string.

        Returns:
            The validated instance, with ``model_json`` replaced by the decoded dict.

        Raises:
            ValueError: If the string is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError`` subclass) or decodes to something other than
                a JSON object. Pydantic reports this as a validation error.
        """
        if isinstance(self.model_json, str):
            parsed = json.loads(self.model_json)
            # A JSON array/number/string/null is syntactically valid but is not a
            # model document; reject it here rather than letting a non-dict value
            # escape the declared field type.
            if not isinstance(parsed, dict):
                raise ValueError(
                    "model_json string must decode to a JSON object, "
                    f"got {type(parsed).__name__}"
                )
            self.model_json = parsed
        return self

ModelLoadResponse

Bases: BaseModel

Response for POST /sessions/{session_id}/models.

Source code in src/orionbelt/api/schemas.py
class ModelLoadResponse(BaseModel):
    """Response for POST /sessions/{session_id}/models."""

    # Identifier under which the model is registered.
    model_id: str
    # NOTE(review): presumably counts of each element kind in the loaded model — confirm.
    data_objects: int
    dimensions: int
    measures: int
    metrics: int
    # Structured warnings produced by the load; an empty list when there are none.
    warnings: list[StructuredWarning] = Field(default_factory=list)
    # Plain str rather than an enum: the two documented values are not enforced by this schema.
    model_load: str = Field(
        default="fresh",
        description=(
            "Whether the load parsed a fresh model or reused an existing one. "
            "Values: 'fresh' | 'reused'."
        ),
    )
    # Optional at the schema level (defaults to None) even though the
    # description promises a value on fresh loads.
    health: ModelHealth | None = Field(
        default=None,
        description=(
            "Structural health of the model's join graph: orphan dataObjects, "
            "fan-trap risks, unreachable dimensions. Always present on a fresh load."
        ),
    )

ModelSummaryResponse

Bases: BaseModel

Short model summary for listing.

Source code in src/orionbelt/api/schemas.py
class ModelSummaryResponse(BaseModel):
    """Short model summary for listing."""

    # Identifier of the model.
    model_id: str
    # NOTE(review): presumably counts of each element kind in the model — confirm.
    data_objects: int
    dimensions: int
    measures: int
    metrics: int

SessionQueryRequest

Bases: BaseModel

Request body for POST /sessions/{session_id}/query/sql.

Source code in src/orionbelt/api/schemas.py
class SessionQueryRequest(BaseModel):
    """Request body for POST /sessions/{session_id}/query/sql."""

    # Identifier of the model the query runs against.
    model_id: str
    # The query to compile.
    query: QueryObject
    # None means "not specified"; the fallback chain in the description then applies.
    dialect: str | None = Field(
        default=None,
        description=(
            "SQL dialect. Resolution: explicit value → model.settings.defaultDialect → "
            "DB_VENDOR env → 'postgres'."
        ),
    )

QueryCompileResponse

Bases: BaseModel

Response body for POST /query/sql.

Source code in src/orionbelt/api/schemas.py
class QueryCompileResponse(BaseModel):
    """Response body for POST /query/sql."""

    # Compiled SQL text and the dialect reported for it.
    sql: str
    dialect: str
    # Resolution details for the query (see ResolvedInfoResponse).
    resolved: ResolvedInfoResponse
    # Structured warnings; an empty list when there are none.
    warnings: list[StructuredWarning] = Field(default_factory=list)
    # Defaults to True.
    # NOTE(review): presumably set to False when SQL validation fails — confirm.
    sql_valid: bool = True
    # Optional explain plan; None when not provided.
    explain: ExplainPlanResponse | None = None
    physical_tables: list[str] = Field(
        default_factory=list,
        description=(
            "Deduplicated DATABASE.SCHEMA.CODE strings the query touches. "
            "Drives freshness-cache TTL composition and heartbeat invalidation."
        ),
    )

ValidateRequest

Bases: BaseModel

Request body for POST /validate.

Source code in src/orionbelt/api/schemas.py
class ValidateRequest(BaseModel):
    """Request body for POST /validate."""

    # The model is supplied either as YAML text or as JSON (object or string).
    model_yaml: str | None = Field(
        default=None,
        description="OBML model as YAML string (provide model_yaml OR model_json)",
        max_length=5_000_000,
    )
    # A str value is decoded by the validator below, so after validation this
    # field is always a dict or None.
    model_json: dict[str, object] | str | None = Field(
        default=None,
        description="OBML model as JSON object or JSON string (auto-parsed)",
    )
    extends: list[str] | None = Field(
        default=None,
        description="Optional inline YAML strings of analytical fragments to merge",
    )
    inherits: str | None = Field(
        default=None,
        description="Optional model ID of an already-loaded parent model in the session",
    )

    @model_validator(mode="after")
    def _parse_model_json_string(self) -> ValidateRequest:
        """Decode ``model_json`` when the client sent it as a JSON string.

        Returns:
            The validated instance, with ``model_json`` replaced by the decoded dict.

        Raises:
            ValueError: If the string is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError`` subclass) or decodes to something other than
                a JSON object. Pydantic reports this as a validation error.
        """
        if isinstance(self.model_json, str):
            parsed = json.loads(self.model_json)
            # A JSON array/number/string/null is syntactically valid but is not a
            # model document; reject it here rather than letting a non-dict value
            # escape the declared field type.
            if not isinstance(parsed, dict):
                raise ValueError(
                    "model_json string must decode to a JSON object, "
                    f"got {type(parsed).__name__}"
                )
            self.model_json = parsed
        return self

ValidateResponse

Bases: BaseModel

Response body for POST /validate.

Source code in src/orionbelt/api/schemas.py
class ValidateResponse(BaseModel):
    """Response body for POST /validate."""

    # Overall validation verdict (required).
    valid: bool
    # Both lists use the ErrorDetail shape and default to empty.
    errors: list[ErrorDetail] = Field(default_factory=list)
    warnings: list[ErrorDetail] = Field(default_factory=list)

DialectListResponse

Bases: BaseModel

Response for GET /dialects.

Source code in src/orionbelt/api/schemas.py
class DialectListResponse(BaseModel):
    """Response for GET /dialects."""

    # One DialectInfo entry per dialect; defaults to an empty list.
    dialects: list[DialectInfo] = Field(default_factory=list)

HealthResponse

Bases: BaseModel

Health check response.

Source code in src/orionbelt/api/schemas.py
class HealthResponse(BaseModel):
    """Health check response."""

    # Defaults to "ok"; plain str, so other values are not rejected by the schema.
    status: str = "ok"
    # Defaults to an empty string when the caller does not fill it in.
    version: str = ""

Settings

orionbelt.settings.Settings

Bases: BaseSettings

Configuration for OrionBelt REST API server.

Values are read from environment variables and from a .env file in the working directory. See .env.template for all options.

Source code in src/orionbelt/settings.py
class Settings(BaseSettings):
    """Configuration for OrionBelt REST API server.

    Values are read from environment variables and from a ``.env`` file
    in the working directory.  See ``.env.template`` for all options.
    """

    # Read ``.env`` as UTF-8; extra="ignore" means keys that do not map to a
    # field below are skipped instead of failing settings construction.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Shared
    log_level: str = "INFO"
    # Log format:
    #   "console"  — pretty-printed for local dev (default)
    #   "json"     — structured JSON for log aggregators (ELK, Datadog, etc.)
    #   "cloudrun" — JSON + disables uvicorn access logs (Cloud Run provides its own)
    log_format: str = "console"

    # REST API
    api_server_host: str = "localhost"
    api_server_port: int = 8000
    port: int | None = None  # Cloud Run injects PORT; takes precedence over api_server_port

    # Public-doc surfaces. Default True preserves current public-demo behaviour.
    # Set EXPOSE_API_DOCS=false on non-demo deployments to disable Swagger UI,
    # ReDoc, and the OpenAPI schema endpoint. EXPOSE_OPENAPI_SCHEMA can be
    # toggled independently to keep /openapi.json live (e.g. for client codegen)
    # while hiding the human-facing /docs and /redoc pages.
    expose_api_docs: bool = True
    expose_openapi_schema: bool = True

    # NOTE: this property sits between field groups; it is a computed value,
    # not a setting read from the environment.
    @property
    def effective_port(self) -> int:
        """Return the port to listen on (Cloud Run PORT takes precedence)."""
        # ``port`` stays None unless it was provided, in which case it wins;
        # an explicit ``is not None`` check is used rather than truthiness.
        return self.port if self.port is not None else self.api_server_port

    # Sessions
    session_ttl_seconds: int = 1800  # 30 min inactivity
    session_max_age_seconds: int = 86400  # 24 h absolute max lifetime
    session_cleanup_interval: int = 60  # seconds between cleanup sweeps
    max_sessions: int = 500  # global concurrent session cap (429 when full)
    max_models_per_session: int = 10  # max models a single session may hold
    disable_session_list: bool = False  # hide GET /sessions endpoint
    session_rate_limit: int = 10  # max POST /sessions per IP per minute
    trusted_proxy_count: int = 0  # number of trusted reverse proxies in front of the app

    # Admin-curated model pre-loading. When MODEL_FILES is set, REST POST
    # /models returns 403 (the catalog is admin-managed) and the models are
    # loaded into named protected sessions at startup.
    #
    # MODEL_FILES (comma-separated paths):
    #     Each OBML YAML loads into its own internal session, addressable
    #     by the OBML `name:` field (fallback: filename stem, normalized to
    #     a valid identifier). BI tools select via the Flight `database`
    #     catalog or pgwire `database=` URL parameter. A single path is
    #     fine — it just means one named protected session.
    #     See design/PLAN_flight_natural_sql.md §3.x multi-model.
    model_dir: str | None = None  # base directory (set by Docker)
    model_files: str | None = None  # comma-separated paths

    # Query execution
    query_execute: bool = False  # enable POST /v1/query/execute
    query_default_limit: int = 1000  # max rows when query has no LIMIT
    db_pool_size: int = 5  # connection pool size per dialect

    # Default locale for /v1/query/execute?format_values=true (and TSV output).
    # Used when the request omits the ``locale`` query param. BCP-47 tag
    # (e.g. "de", "en-US"). Empty → en-style separators ("," / ".").
    default_locale: str = ""

    # Arrow Flight SQL server (requires ob-flight-extension)
    flight_enabled: bool = False  # start gRPC Flight server on FLIGHT_PORT (implies query_execute)
    flight_port: int = 8815
    flight_auth_mode: str = "none"  # "none" or "token"
    flight_api_token: str | None = None
    db_vendor: str = "duckdb"  # default vendor driver for Flight query execution

    # Flight Semantic QL governance. See design/PLAN_flight_natural_sql.md.
    # Semantic QL / OBSQL (SELECT dim, measure FROM <model>) is always enabled.
    # Raw SQL pass-through and write operations are **not** configurable —
    # OBSL is a semantic layer, not a JDBC proxy. There are no env flags
    # that allow arbitrary SQL through to the warehouse.

    # Postgres wire surface (see design/PLAN_postgres_wire.md).
    # Step 1 (Hello world): trust auth only, simple-query protocol.
    # Auth modes "password" / "scram-sha-256" land in Step 6 alongside the
    # unified auth subsystem (see design/PLAN_unified_auth.md).
    pgwire_enabled: bool = False
    pgwire_host: str = "0.0.0.0"  # noqa: S104 — server bind address
    pgwire_port: int = 5432
    pgwire_auth_mode: str = "trust"  # "trust" (Step 1) | "password" | "scram-sha-256" (Step 6)
    pgwire_max_connections: int = 64
    pgwire_query_timeout_seconds: int = 60

    # One-shot batch endpoint (POST /v1/oneshot/batch). See PLAN_oneshot_batch.md.
    oneshot_batch_max_queries: int = 50
    oneshot_batch_max_parallelism: int = 8
    oneshot_batch_default_timeout_ms: int = 30000  # per-query
    oneshot_batch_batch_timeout_ms: int = 120000  # whole batch

    # Freshness-driven result cache. See design/PLAN_freshness_driven_cache.md.
    cache_backend: str = "noop"  # "noop" or "file"
    cache_dir: str = "./cache"
    cache_max_ttl_seconds: int = 86400
    cache_min_ttl_seconds: int = 5
    cache_max_value_bytes: int = 10 * 1024 * 1024  # 10 MB
    cache_max_disk_bytes: int = 5 * 1024 * 1024 * 1024  # 5 GB
    cache_sweep_interval_seconds: int = 86400
    cache_unknown_freshness_policy: str = "no_cache"  # or "default_ttl"
    cache_unknown_freshness_default_ttl: int = 300
    heartbeat_auth_token: str | None = None  # endpoint disabled (404) when unset

effective_port property

Return the port to listen on (Cloud Run PORT takes precedence).