
delta_merge_writer

DeltaMergeConfig

Bases: BaseModel

Configuration for merge options.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dataframe_columns` | `list[str]` | The columns of the DataFrame. | *required* |
| `key_columns` | `list[str]` | List of column names that form the key for the merge operation. | *required* |
| `when_matched_update` | `bool` | Whether to perform an update operation when matching records are found in the target Delta table. | `True` |
| `when_matched_delete` | `bool` | Whether to perform a delete operation when matching records are found in the target Delta table. | `False` |
| `when_not_matched_insert` | `bool` | Whether to perform an insert operation when no matching records are found in the target Delta table. | `True` |
| `cols_to_exclude_from_update` | `list[str]` | Column names to exclude from the update in the target Delta table. | `[]` |
| `use_partition_pruning` | `bool` | Whether to use partition pruning to optimize the performance of the merge operation. | `True` |
| `partition_by` | `list[str]` | Column names to partition by. | `[]` |
Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
class DeltaMergeConfig(BaseModel):
    """Configuration for Merge options.

    Args:
        dataframe_columns: The columns of the DataFrame.
        key_columns: List of column names that form the key for the merge
            operation.
        when_matched_update: Flag to specify whether to perform an update
            operation when matching records are found in the target Delta table.
        when_matched_delete: Flag to specify whether to perform a delete
            operation when matching records are found in the target Delta table.
        when_not_matched_insert: Flag to specify whether to perform an insert
            operation when no matching records are found in the target Delta
            table.
        cols_to_exclude_from_update: List of column names to be excluded from
            the update in the target Delta table.
        use_partition_pruning: Flag to specify whether to use partition
            pruning to optimize the performance of the merge operation.
        partition_by: List of column names to partition by.
    """

    dataframe_columns: list[str]
    key_columns: list[str]
    cols_to_exclude_from_update: list[str] = Field(default_factory=list)
    when_matched_update: bool = True
    when_matched_delete: bool = False
    when_not_matched_insert: bool = True
    use_partition_pruning: bool = True
    partition_by: list[str] = Field(default_factory=list)
    cols_to_merge: list[str] = Field(default_factory=list, alias="_cols_to_merge")
    cols_to_update: set[str] = Field(default_factory=set, alias="_cols_to_update")
    cols_to_insert: set[str] = Field(default_factory=set, alias="_cols_to_insert")
    final_cols_to_update: dict[str, str] = Field(default_factory=dict)
    final_cols_to_insert: dict[str, str] = Field(default_factory=dict)

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @model_validator(mode="before")
    @classmethod
    def _validate_update_delete(cls, config: Any):
        """Update and delete operations must be mutually exclusive."""
        if config.get("when_matched_update") and config.get("when_matched_delete"):
            raise ValueError("Update and delete operations cannot be used together.")
        return config

    @model_validator(mode="before")
    @classmethod
    def _validate_key_columns(cls, config: Any):
        """Key columns must exist in the data frame."""
        key_columns = config.get("key_columns")
        dataframe_columns = config.get("dataframe_columns")
        if not set(key_columns).issubset(set(dataframe_columns)):
            raise ValueError("Key columns must exist in the DataFrame columns.")
        return config

    @model_validator(mode="before")
    @classmethod
    def _derive_merge_columns(cls, config: Any):
        """Derive update and insert columns from the DataFrame columns."""
        dataframe_columns = config.get("dataframe_columns", [])
        config["_cols_to_merge"] = list(set(dataframe_columns))
        if config.get("cols_to_exclude_from_update"):
            config["_cols_to_update"] = set(config["_cols_to_merge"]) - set(config["cols_to_exclude_from_update"])
        else:
            config["_cols_to_update"] = set(config["_cols_to_merge"])

        config["_cols_to_insert"] = config["_cols_to_merge"]
        config["final_cols_to_update"] = {col: f"source.{col}" for col in config["_cols_to_update"]}
        config["final_cols_to_insert"] = {col: f"source.{col}" for col in config["_cols_to_insert"]}
        return config

    @model_validator(mode="after")
    @classmethod
    def _validate_partition_pruning(cls, config: Self):
        """If partition_pruning is set, the partition by columns must be known."""
        if config.use_partition_pruning is True and not config.partition_by:
            raise ValueError("Partition columns must be specified when using partition pruning.")
        return config

    @model_validator(mode="after")
    @classmethod
    def _validate_cols_exist(cls, config: Any):
        """If partition_pruning is set, the partition by columns must be known."""
        if any(col not in config.cols_to_merge for col in config.cols_to_update) or any(
            col not in config.cols_to_merge for col in config.cols_to_insert
        ):
            raise ValueError(
                "You specified column names for UPDATE or INSERT that either don't exist in the dataframe "
                "or are explicitly excluded from the MERGE.",
            )
        return config
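
A minimal construction sketch follows; the column names are hypothetical and the import path is assumed from the source file location shown above:

from cloe_nessy.integration.writer.delta_writer.delta_merge_writer import DeltaMergeConfig

config = DeltaMergeConfig(
    dataframe_columns=["id", "region", "value"],  # columns of the source DataFrame
    key_columns=["id"],                           # must be a subset of dataframe_columns
    cols_to_exclude_from_update=["region"],       # inserted on new rows, never updated
    partition_by=["region"],                      # required while use_partition_pruning=True (the default)
)

print(config.final_cols_to_update)  # {'id': 'source.id', 'value': 'source.value'} (order may vary)
print(config.final_cols_to_insert)  # all three columns mapped to their 'source.' counterparts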

_derive_merge_columns(config) classmethod

Derive update and insert columns from the DataFrame columns.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
@model_validator(mode="before")
@classmethod
def _derive_merge_columns(cls, config: Any):
    """Derive update and insert columns from the DataFrame columns."""
    dataframe_columns = config.get("dataframe_columns", [])
    config["_cols_to_merge"] = list(set(dataframe_columns))
    if config.get("cols_to_exclude_from_update"):
        config["_cols_to_update"] = set(config["_cols_to_merge"]) - set(config["cols_to_exclude_from_update"])
    else:
        config["_cols_to_update"] = set(config["_cols_to_merge"])

    config["_cols_to_insert"] = config["_cols_to_merge"]
    config["final_cols_to_update"] = {col: f"source.{col}" for col in config["_cols_to_update"]}
    config["final_cols_to_insert"] = {col: f"source.{col}" for col in config["_cols_to_insert"]}
    return config
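
In effect, every merge column is mapped to its `source.`-prefixed counterpart, and the update mapping additionally drops the excluded columns. A small sketch with hypothetical names:

# Hypothetical inputs
dataframe_columns = ["id", "updated_at", "value"]
cols_to_exclude_from_update = ["id"]

cols_to_merge = list(set(dataframe_columns))
cols_to_update = set(cols_to_merge) - set(cols_to_exclude_from_update)

final_cols_to_update = {col: f"source.{col}" for col in cols_to_update}
# {'updated_at': 'source.updated_at', 'value': 'source.value'}
final_cols_to_insert = {col: f"source.{col}" for col in cols_to_merge}
# all three columns mapped to 'source.<col>'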

_validate_cols_exist(config) classmethod

Update and insert columns must exist in the merge columns.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
@model_validator(mode="after")
@classmethod
def _validate_cols_exist(cls, config: Any):
    """If partition_pruning is set, the partition by columns must be known."""
    if any(col not in config.cols_to_merge for col in config.cols_to_update) or any(
        col not in config.cols_to_merge for col in config.cols_to_insert
    ):
        raise ValueError(
            "You specified column names for UPDATE or INSERT that either don't exist in the dataframe "
            "or are explicitly excluded from the MERGE.",
        )
    return config

_validate_key_columns(config) classmethod

Key columns must exist in the DataFrame.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
@model_validator(mode="before")
@classmethod
def _validate_key_columns(cls, config: Any):
    """Key columns must exist in the data frame."""
    key_columns = config.get("key_columns")
    dataframe_columns = config.get("dataframe_columns")
    if not set(key_columns).issubset(set(dataframe_columns)):
        raise ValueError("Key columns must exist in the DataFrame columns.")
    return config
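
Because the validator raises a ValueError during model construction, pydantic surfaces it as a ValidationError. A sketch with hypothetical columns:

from pydantic import ValidationError

from cloe_nessy.integration.writer.delta_writer.delta_merge_writer import DeltaMergeConfig

try:
    DeltaMergeConfig(
        dataframe_columns=["id", "value"],
        key_columns=["customer_id"],  # hypothetical key, not present in the DataFrame
        use_partition_pruning=False,
    )
except ValidationError as err:
    print(err)  # includes "Key columns must exist in the DataFrame columns."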

_validate_partition_pruning(config) classmethod

If partition pruning is enabled, the partition columns must be known.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
@model_validator(mode="after")
@classmethod
def _validate_partition_pruning(cls, config: Self):
    """If partition_pruning is set, the partition by columns must be known."""
    if config.use_partition_pruning is True and not config.partition_by:
        raise ValueError("Partition columns must be specified when using partition pruning.")
    return config
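
Note that `use_partition_pruning` defaults to `True`, so a config for an unpartitioned table must opt out explicitly. A sketch:

from cloe_nessy.integration.writer.delta_writer.delta_merge_writer import DeltaMergeConfig

# Pruning is on by default, so this would raise a ValidationError:
#   DeltaMergeConfig(dataframe_columns=["id"], key_columns=["id"])

# Either name the partition columns ...
DeltaMergeConfig(dataframe_columns=["id", "region"], key_columns=["id"], partition_by=["region"])
# ... or disable pruning for unpartitioned tables:
DeltaMergeConfig(dataframe_columns=["id"], key_columns=["id"], use_partition_pruning=False)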

_validate_update_delete(config) classmethod

Update and delete operations must be mutually exclusive.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
@model_validator(mode="before")
@classmethod
def _validate_update_delete(cls, config: Any):
    """Update and delete operations must be mutually exclusive."""
    if config.get("when_matched_update") and config.get("when_matched_delete"):
        raise ValueError("Update and delete operations cannot be used together.")
    return config
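
Setting both matched-clause flags to `True` fails fast at construction time. A sketch:

from pydantic import ValidationError

from cloe_nessy.integration.writer.delta_writer.delta_merge_writer import DeltaMergeConfig

try:
    DeltaMergeConfig(
        dataframe_columns=["id"],
        key_columns=["id"],
        when_matched_update=True,
        when_matched_delete=True,
        use_partition_pruning=False,
    )
except ValidationError as err:
    print(err)  # includes "Update and delete operations cannot be used together."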

DeltaMergeWriter

Bases: BaseDeltaWriter

A class for merging DataFrames to Delta tables.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
class DeltaMergeWriter(BaseDeltaWriter):
    """A class for merging DataFrames to Delta tables."""

    def __init__(self):
        super().__init__()
        self._spark = SessionManager.get_spark_session()
        self._dbutils = SessionManager.get_utils()

    def _validate_table_inputs(
        self, table: Table | None, table_identifier: str | None, storage_path: str | None
    ) -> tuple[str, str]:
        """Validates and retrieves table identifier and storage path."""
        if table is None and (table_identifier is None or storage_path is None):
            raise ValueError("Either a Table object or table_identifier and storage_path must be provided.")
        if table is not None:
            table_identifier = table.identifier
            storage_path = str(table.storage_path)
        if not storage_path:
            raise ValueError("Storage path must be provided or extracted from the Table object.")
        assert table_identifier is not None, "Table identifier must be provided."
        return table_identifier, storage_path

    def _build_match_conditions(self, data_frame: DataFrame, config: DeltaMergeConfig) -> str:
        """Builds match conditions for the Delta table merge."""
        match_conditions = self._merge_match_conditions(config.key_columns)
        if config.use_partition_pruning:
            match_conditions_list = [match_conditions] + [
                self._partition_pruning_conditions(data_frame, config.partition_by),
            ]
            match_conditions = " AND ".join(match_conditions_list)
        return match_conditions

    def _build_merge_operations(
        self, delta_table, data_frame: DataFrame, config: DeltaMergeConfig, match_conditions: str
    ):
        """Builds the Delta table merge operations."""
        delta_table_merge = delta_table.alias("target").merge(
            source=data_frame.alias("source"),
            condition=match_conditions,
        )
        if config.when_matched_update:
            delta_table_merge = delta_table_merge.whenMatchedUpdate(set=config.final_cols_to_update)
        elif config.when_matched_delete:
            delta_table_merge = delta_table_merge.whenMatchedDelete()
        if config.when_not_matched_insert:
            delta_table_merge = delta_table_merge.whenNotMatchedInsert(values=config.final_cols_to_insert)
        return delta_table_merge

    @table_log_decorator(operation="merge")
    def write(
        self,
        data_frame: DataFrame,
        table: Table | None = None,
        table_identifier: str | None = None,
        storage_path: str | None = None,
        ignore_empty_df: bool = False,
        **kwargs: Any,
    ):
        """Merges the data in a spark DataFrame into a Delta table.

        This function performs a merge operation between a DataFrame and a Delta
        table. The function supports update, delete, and insert operations on
        the target Delta table based on conditions specified by the user. The
        function also supports partition pruning to optimize the performance of
        the merge operation.

        Args:
            table: The Table object representing the Delta table.
            table_identifier: The identifier of the Delta table in the format
                'catalog.schema.table'.
            storage_path: The location of the Delta table.
            data_frame: The DataFrame to be merged into the Delta table.
            ignore_empty_df: A flag indicating whether to ignore an empty source
                DataFrame.
            kwargs: Passed to the
                [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_merge_writer.DeltaMergeConfig].

        Raises:
            ValueError: If neither a Table object nor both table_identifier and
                storage_path are provided.
            EmptyDataframeException: If the source DataFrame is empty and
                ignore_empty_df is False.
            ValueError: If the specified columns for update or insert do not
                exist in the DataFrame or are explicitly excluded from the
                merge operation.
            ValueError: If partition columns are not specified when using
                partition pruning.
        """
        if self._empty_dataframe_check(data_frame, ignore_empty_df):
            return
        table_identifier, storage_path = self._validate_table_inputs(table, table_identifier, storage_path)

        config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)

        delta_table = self.table_manager.get_delta_table(location=storage_path, spark=data_frame.sparkSession)

        match_conditions = self._build_match_conditions(data_frame, config)

        delta_table_merge = self._build_merge_operations(delta_table, data_frame, config, match_conditions)
        delta_table_merge.execute()
        self._report_delta_table_operation_metrics(
            table_identifier,
            operation_type=DeltaTableOperationType.MERGE,
        )

    @table_log_decorator(operation="stream_merge")
    def write_stream(self):
        """Not implemented yet. See docs for more details."""
        raise NotImplementedError(
            "Streaming merge is not implemented yet. Please use the `write` method for batch merges."
        )

_build_match_conditions(data_frame, config)

Builds match conditions for the Delta table merge.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
def _build_match_conditions(self, data_frame: DataFrame, config: DeltaMergeConfig) -> str:
    """Builds match conditions for the Delta table merge."""
    match_conditions = self._merge_match_conditions(config.key_columns)
    if config.use_partition_pruning:
        match_conditions_list = [match_conditions] + [
            self._partition_pruning_conditions(data_frame, config.partition_by),
        ]
        match_conditions = " AND ".join(match_conditions_list)
    return match_conditions
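
The helpers `_merge_match_conditions` and `_partition_pruning_conditions` are inherited from `BaseDeltaWriter` and are not shown here; the sketch below only illustrates the typical shape of the resulting condition string and is an assumption, not the actual implementation:

# Equality predicates over the key columns (hypothetical helper behaviour):
key_columns = ["id"]
key_condition = " AND ".join(f"target.{col} = source.{col}" for col in key_columns)
# -> "target.id = source.id"

# Partition pruning typically restricts the target scan to the partition
# values present in the source DataFrame, e.g.:
# "target.region IN ('EU', 'US')"
# The parts are then joined with " AND ".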

_build_merge_operations(delta_table, data_frame, config, match_conditions)

Builds the Delta table merge operations.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
def _build_merge_operations(
    self, delta_table, data_frame: DataFrame, config: DeltaMergeConfig, match_conditions: str
):
    """Builds the Delta table merge operations."""
    delta_table_merge = delta_table.alias("target").merge(
        source=data_frame.alias("source"),
        condition=match_conditions,
    )
    if config.when_matched_update:
        delta_table_merge = delta_table_merge.whenMatchedUpdate(set=config.final_cols_to_update)
    elif config.when_matched_delete:
        delta_table_merge = delta_table_merge.whenMatchedDelete()
    if config.when_not_matched_insert:
        delta_table_merge = delta_table_merge.whenNotMatchedInsert(values=config.final_cols_to_insert)
    return delta_table_merge
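
With the default flags (`when_matched_update=True`, `when_not_matched_insert=True`), the returned builder corresponds to the standard delta-spark chain; a sketch, assuming `delta_table` is a `delta.tables.DeltaTable`:

delta_table_merge = (
    delta_table.alias("target")
    .merge(source=data_frame.alias("source"), condition=match_conditions)
    .whenMatchedUpdate(set=config.final_cols_to_update)
    .whenNotMatchedInsert(values=config.final_cols_to_insert)
)
delta_table_merge.execute()  # execute() is called later by write()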

_validate_table_inputs(table, table_identifier, storage_path)

Validates and retrieves table identifier and storage path.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
def _validate_table_inputs(
    self, table: Table | None, table_identifier: str | None, storage_path: str | None
) -> tuple[str, str]:
    """Validates and retrieves table identifier and storage path."""
    if table is None and (table_identifier is None or storage_path is None):
        raise ValueError("Either a Table object or table_identifier and storage_path must be provided.")
    if table is not None:
        table_identifier = table.identifier
        storage_path = str(table.storage_path)
    if not storage_path:
        raise ValueError("Storage path must be provided or extracted from the Table object.")
    assert table_identifier is not None, "Table identifier must be provided."
    return table_identifier, storage_path
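
Concretely, this allows two call shapes for `write`; a sketch where the identifier and path are hypothetical:

# Option 1: a Table object carries both identifier and storage path.
writer.write(data_frame=df, table=my_table)

# Option 2: both pieces are passed explicitly.
writer.write(
    data_frame=df,
    table_identifier="catalog.schema.orders",
    storage_path="abfss://container@account.dfs.core.windows.net/orders",
)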

write(data_frame, table=None, table_identifier=None, storage_path=None, ignore_empty_df=False, **kwargs)

Merges the data in a Spark DataFrame into a Delta table.

This method merges a DataFrame into a target Delta table. It supports update, delete, and insert operations based on conditions specified by the user, and can use partition pruning to optimize the performance of the merge operation.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `table` | `Table \| None` | The Table object representing the Delta table. | `None` |
| `table_identifier` | `str \| None` | The identifier of the Delta table in the format `catalog.schema.table`. | `None` |
| `storage_path` | `str \| None` | The location of the Delta table. | `None` |
| `data_frame` | `DataFrame` | The DataFrame to be merged into the Delta table. | *required* |
| `ignore_empty_df` | `bool` | Whether to ignore an empty source DataFrame. | `False` |
| `kwargs` | `Any` | Passed to [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_merge_writer.DeltaMergeConfig]. | `{}` |

Raises:

| Type | Description |
| --- | --- |
| `ValueError` | If neither a Table object nor both `table_identifier` and `storage_path` are provided. |
| `EmptyDataframeException` | If the source DataFrame is empty and `ignore_empty_df` is `False`. |
| `ValueError` | If the columns specified for update or insert do not exist in the DataFrame or are explicitly excluded from the merge operation. |
| `ValueError` | If partition columns are not specified when using partition pruning. |

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
@table_log_decorator(operation="merge")
def write(
    self,
    data_frame: DataFrame,
    table: Table | None = None,
    table_identifier: str | None = None,
    storage_path: str | None = None,
    ignore_empty_df: bool = False,
    **kwargs: Any,
):
    """Merges the data in a spark DataFrame into a Delta table.

    This function performs a merge operation between a DataFrame and a Delta
    table. The function supports update, delete, and insert operations on
    the target Delta table based on conditions specified by the user. The
    function also supports partition pruning to optimize the performance of
    the merge operation.

    Args:
        table: The Table object representing the Delta table.
        table_identifier: The identifier of the Delta table in the format
            'catalog.schema.table'.
        storage_path: The location of the Delta table.
        data_frame: The DataFrame to be merged into the Delta table.
        ignore_empty_df: A flag indicating whether to ignore an empty source
            DataFrame.
        kwargs: Passed to the
            [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_merge_writer.DeltaMergeConfig].

    Raises:
        ValueError: If neither a Table object nor both table_identifier and
            storage_path are provided.
        EmptyDataframeException: If the source DataFrame is empty and
            ignore_empty_df is False.
        ValueError: If the specified columns for update or insert do not
            exist in the DataFrame or are explicitly excluded from the
            merge operation.
        ValueError: If partition columns are not specified when using
            partition pruning.
    """
    if self._empty_dataframe_check(data_frame, ignore_empty_df):
        return
    table_identifier, storage_path = self._validate_table_inputs(table, table_identifier, storage_path)

    config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)

    delta_table = self.table_manager.get_delta_table(location=storage_path, spark=data_frame.sparkSession)

    match_conditions = self._build_match_conditions(data_frame, config)

    delta_table_merge = self._build_merge_operations(delta_table, data_frame, config, match_conditions)
    delta_table_merge.execute()
    self._report_delta_table_operation_metrics(
        table_identifier,
        operation_type=DeltaTableOperationType.MERGE,
    )
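
A typical batch merge might look like the following sketch; the table identifier, storage path, and column names are hypothetical, and all keyword arguments are forwarded to `DeltaMergeConfig`:

from cloe_nessy.integration.writer.delta_writer.delta_merge_writer import DeltaMergeWriter

writer = DeltaMergeWriter()
writer.write(
    data_frame=updates_df,                    # Spark DataFrame with new and changed rows
    table_identifier="catalog.sales.orders",
    storage_path="abfss://lake@account.dfs.core.windows.net/sales/orders",
    key_columns=["order_id"],
    cols_to_exclude_from_update=["created_at"],
    partition_by=["order_date"],
    # when_matched_update=True and when_not_matched_insert=True are the defaults
)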

write_stream()

Not implemented yet. See docs for more details.

Source code in src/cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
@table_log_decorator(operation="stream_merge")
def write_stream(self):
    """Not implemented yet. See docs for more details."""
    raise NotImplementedError(
        "Streaming merge is not implemented yet. Please use the `write` method for batch merges."
    )