Source code for vibe.config_validation.pydantic_models.mode_config_model

from typing_extensions import Self
from typing import Any
from pydantic import BaseModel, model_validator, Field, ConfigDict

import shutil



[docs]
class ModeConfigModel(BaseModel):
    """
    This Pydantic model aims to validate all configuration TOMLs of VIBE modes.
    This model is an entry point in which all datasets will have their
    structure validated.

    Specifically, we must validate that each value of the `dataset_dict` (or `skim_dataset_dict`)
    is validated by the `DatasetDictConfigModel`. Note we forbid any unknown fields in this model,
    meaning a ValidationError will be raised if an unknown field is encountered.

    .. note::
        This is slightly misleading for skim modes. When running a skim production mode in VIBE
        the `dataset_dict` should not be set as VIBE will manage this in the TOML file during runtime.
        Instead, the `skim_dataset_dict`

    We utilise the model_validator decorator to perform a before AND after check on the input data.

    **Before Model Validator**

    .. code-block python

        @model_validator(mode="before")
        @classmethod
        def check_if_skim_dataset_was_parsed(cls, data: Any) -> Any:
            if isinstance(data, dict):
                if "skim_dataset_dict" in data.keys():
                    data['dataset_dict'] = {}
            return data

    This function is ran BEFORE the input data from the TOML is parsed into the Pydantic Model validation. In the before model validator, we check that if the `skim_dataset_dict` is parsed for skim_production modes.
    If parsed, we need to set the `dataset_dict` to a default value of {} as this Pydantic Model requires a
    `dataset_dict` field to be entered for its validation.

    **After Model Validator**

    .. code-block python

        @model_validator(mode="after")
        def check_dataset_dict_formatted_correctly(self) -> Self:
            for dataset_name, dataset_config in self.dataset_dict.items():
                DatasetDictConfigModel(dataset_name=dataset_name, **dataset_config)

            for skim_dataset_name, skim_dataset_config in self.skim_dataset_dict.items():
                DatasetDictConfigModel(dataset_name=skim_dataset_name, **skim_dataset_config)

            return self

    This function is ran AFTER the input data has been validated by the Pydantic model and is not instantiated
    as a Pydantic Model class. This after validator is checking that the values of the `dataset_dict` (or `skim_dataset_dict`)
    parse the `DatasetDictConfigModel` Pydantic model.
    """

    dataset_dict: dict[str, dict]
    skim_dataset_dict: dict[str, dict] = Field(default={})
    gbasf2_settings: dict[str, Any] = Field(default={})
    additional_mode_settings: dict[str, Any] = Field(default={})
    model_config = ConfigDict(extra="forbid")


[docs]
    @model_validator(mode="before")
    @classmethod
    def check_if_skim_dataset_was_parsed(cls, data: Any) -> Any:
        if isinstance(data, dict):
            if "skim_dataset_dict" in data.keys():
                data["dataset_dict"] = {}
        return data



[docs]
    @model_validator(mode="after")
    def check_dataset_dict_formatted_correctly(self) -> Self:
        for dataset_name, dataset_config in self.dataset_dict.items():
            self.dataset_dict.update(
                {dataset_name: DatasetDictConfigModel(dataset_name=dataset_name, **dataset_config).model_dump()}
            )

        for skim_dataset_name, skim_dataset_config in self.skim_dataset_dict.items():
            self.skim_dataset_dict.update(
                {
                    skim_dataset_name: DatasetDictConfigModel(
                        dataset_name=skim_dataset_name, **skim_dataset_config
                    ).model_dump()
                }
            )

        return self





[docs]
class DatasetDictConfigModel(BaseModel):
    """
    This Pydantic model validates the configuration of a given input dataset.

    A `model_validator` (after) is used to check that if `batch=True`, then `offline` must also be `True`.
    If not, an assertion error is raised.
    """

    dataset_name: str = Field(
        exclude=True, description="(Internal use only) Name of the dataset. This field is excluded from parsing."
    )

    # REQUIRED FIELD
    lpn: str = Field(
        pattern=r"^\/(?:[^\/]+\/)*[^\/]+$",
        description="The path to the dataset. This can be a local path, glob pattern, or gbasf2 logical path name (LPN).",
    )

    # OPTIONAL FIELDS
    globaltags: list[str] = Field(
        default=[], description="List of global tags to be set at the start of the basf2 path. Default is an empty list."
    )

    gbasf2_submission_campaign: str = Field(
        default="", description="Submission campaign suffix attached to the end of the gbasf2 project name."
    )

    kwargs: dict[str, Any] = Field(
        default={},
        description="Additional keyword arguments passed to the `create_basf2_path` method of ValidationModeBaseClass.",
    )

    offline: bool = Field(
        default=False,
        description="Flag indicating that the dataset must be run offline (i.e., not submitted to the Belle II grid).",
    )

    batch: bool = Field(
        default=False,
        description="Flag to tell VIBE to submit the dataset to the LSF batch system. Requires `offline=True`.",
    )


[docs]
    @model_validator(mode="after")
    def check_for_batch_true(self) -> Self:
        # If batch is set to true we must check that the LSF batch system
        # is present on this machine since this is the only batch system VIBE can submit to
        if self.batch:
            if not bool(shutil.which("bsub")):
                assert self.offline, (
                    f"The batch flag was set to True however offline=False for {self.dataset_name}. VIBE is only "
                    " compatible with the LSF batch system"
                )
            self.offline = True
        return self



[docs]
    @model_validator(mode="after")
    def declare_lpn_type(self) -> Self:
        """
        This function will dynamically set what type of LPN we are working with that being
            - Offline single rootfile
            - Offline globbed directory
            - Single datablock grid LPN
            - Grid collection
        """
        # TODO Add this functionality to continue the centralization of configuration and settings management
        # The idea is to create way to identify exactly what type of dataset is being used in this
        # validation to make setting up b2luigi workflows easier, for example in AnalysisNtupleMerger
        # we do all sorts of checks to find out what type of lpn is present to gather the number of
        # events accordingly. We could instead use and Enum here to make that process easier, or
        # better still, attach a function for the event calculator to an attribute and use that to
        # calculate the number of events
        return self