
read_excel

ReadExcelAction

Bases: PipelineAction

Reads data from an Excel file or directory of Excel files and returns a DataFrame.

The function reads Excel files using the ExcelDataFrameReader either from a single file or a directory path. It can read specific sheets, handle file extensions, and offers various options to customize how the data is read, such as specifying headers, index columns, and handling missing values. The resulting data is returned as a DataFrame, and metadata about the read files can be included in the context.

Example
Read Excel Table:
    action: READ_EXCEL
    options:
        file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
        usecols:
            - key_column
            - interesting_column
        options: <options for the ExcelDataFrameReader read method>

More Options

The READ_EXCEL action supports additional options that can be passed to the run method. For more information, refer to the method documentation.
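
For example, an entire folder of workbooks can be read in one step by passing path instead of file. Every key shown below is a documented run parameter; the folder name is purely illustrative.

Read Excel Directory:
    action: READ_EXCEL
    options:
        path: excel_file_folder/excel_files_june
        extension: xlsx
        recursive: true
        sheet_name_as_column: true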

Source code in src/cloe_nessy/pipeline/actions/read_excel.py
class ReadExcelAction(PipelineAction):
    """Reads data from an Excel file or directory of Excel files and returns a DataFrame.

    The function reads Excel files using the
    [`ExcelDataFrameReader`][cloe_nessy.integration.reader.excel_reader] either
    from a single file or a directory path. It can read specific sheets, handle
    file extensions, and offers various options to customize how the data is
    read, such as specifying headers, index columns, and handling missing
    values. The resulting data is returned as a DataFrame, and metadata about
    the read files can be included in the context.

    Example:
        ```yaml
        Read Excel Table:
            action: READ_EXCEL
            options:
                file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
                usecols:
                    - key_column
                    - interesting_column
                options: <options for the ExcelDataFrameReader read method>
        ```

    !!! note "More Options"
        The `READ_EXCEL` action supports additional options that can be passed to the
        run method. For more information, refer to the method documentation.
    """

    name: str = "READ_EXCEL"

    def run(
        self,
        context: PipelineContext,
        *,
        file: str | None = None,
        path: str | None = None,
        extension: str = "xlsx",
        recursive: bool = False,
        sheet_name: str | int | list = 0,
        sheet_name_as_column: bool = False,
        header: int | list[int] = 0,
        index_col: int | list[int] | None = None,
        usecols: int | str | list | Callable | None = None,
        dtype: str | None = None,
        fillna: str | dict[str, list[str]] | dict[str, str] | None = None,
        true_values: list | None = None,
        false_values: list | None = None,
        nrows: int | None = None,
        na_values: list[str] | dict[str, list[str]] | None = None,
        keep_default_na: bool = True,
        parse_dates: bool | list | dict = False,
        date_parser: Callable | None = None,
        thousands: str | None = None,
        include_index: bool = False,
        options: dict | None = None,
        add_metadata_column: bool = True,
        load_as_strings: bool = False,
        **_,
    ) -> PipelineContext:
        """Reads data from an Excel file or directory of Excel files and returns a DataFrame.

        Args:
            context: The context in which the action is executed.
            file: The path to a single Excel file. Either `file` or `path` must be specified.
            path: The directory path containing multiple Excel files. Either `file` or `path` must be specified.
            extension: The file extension to look for when reading from a directory.
            recursive: Whether to include subdirectories when reading from a directory path.
            sheet_name: The sheet name(s) or index(es) to read from the Excel file.
            sheet_name_as_column: Whether to add a column with the sheet name to the DataFrame.
            header: Row number(s) to use as the column labels.
            index_col: Column(s) to use as the index of the DataFrame.
            usecols: Subset of columns to parse. Can be an integer, string, list,
                or function.
            dtype: Data type for the columns.
            fillna: Method or value to use to fill NaN values.
            true_values: Values to consider as True.
            false_values: Values to consider as False.
            nrows: Number of rows to parse.
            na_values: Additional strings to recognize as NaN/NA.
            keep_default_na: Whether to append default NaN values when custom `na_values` are specified.
            parse_dates: Options for parsing date columns.
            date_parser: Function to use for converting strings to datetime objects.
            thousands: Thousands separator to use when parsing numeric columns.
            include_index: Whether to include an index column in the output DataFrame.
            options: Additional options to pass to the DataFrame reader.
            add_metadata_column: Whether to add a metadata column with file information to the DataFrame.
            load_as_strings: Whether to load all columns as strings.

        Raises:
            ValueError: Raised if both `file` and `path` are specified, or if neither is provided.

        Returns:
            The updated context, with the read data as a DataFrame.
        """
        if not options:
            options = dict()

        if file is not None and path is not None:
            self._tabular_logger.error("message: Only one of file or path have to be specified.")
            raise ValueError("Only one of file or path have to be specified.")

        excel_reader = ExcelDataFrameReader()
        if file is not None:
            df = excel_reader.read(
                location=file,
                sheet_name=sheet_name,
                sheet_name_as_column=sheet_name_as_column,
                header=header,
                index_col=index_col,
                usecols=usecols,
                true_values=true_values,
                false_values=false_values,
                nrows=nrows,
                dtype=dtype,
                fillna=fillna,
                na_values=na_values,
                keep_default_na=keep_default_na,
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                include_index=include_index,
                options=options,
                add_metadata_column=add_metadata_column,
                load_as_strings=load_as_strings,
            )
        elif path is not None:
            file_list = get_file_paths(path, extension, recursive)
            df_dict: dict = {}
            for file_path in file_list:
                df_dict[file_path] = excel_reader.read(
                    location=file_path,
                    sheet_name=sheet_name,
                    sheet_name_as_column=sheet_name_as_column,
                    header=header,
                    index_col=index_col,
                    usecols=usecols,
                    dtype=dtype,
                    fillna=fillna,
                    true_values=true_values,
                    false_values=false_values,
                    nrows=nrows,
                    na_values=na_values,
                    keep_default_na=keep_default_na,
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    include_index=include_index,
                    options=options,
                    add_metadata_column=add_metadata_column,
                    load_as_strings=load_as_strings,
                )
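            # Union the per-file DataFrames into a single DataFrame.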
            df = reduce(DataFrame.unionAll, list(df_dict.values()))

        else:
            self._tabular_logger.error("action_name: READ_EXCEL | message: Either file or path have to be specified.")
            raise ValueError("Either file or path have to be specified.")

        runtime_info = context.runtime_info

        if add_metadata_column:
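            # Collect the distinct source file paths from the __metadata column and
            # record them under "read_files" in the runtime info.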
            read_files_list = list(set([x.file_path for x in df.select("__metadata.file_path").collect()]))
            if runtime_info is None:
                runtime_info = {"read_files": read_files_list}
            else:
                try:
                    runtime_info["read_files"] = list(set(runtime_info["read_files"] + read_files_list))
                except KeyError:
                    runtime_info["read_files"] = read_files_list

        return context.from_existing(data=df)

run(context, *, file=None, path=None, extension='xlsx', recursive=False, sheet_name=0, sheet_name_as_column=False, header=0, index_col=None, usecols=None, dtype=None, fillna=None, true_values=None, false_values=None, nrows=None, na_values=None, keep_default_na=True, parse_dates=False, date_parser=None, thousands=None, include_index=False, options=None, add_metadata_column=True, load_as_strings=False, **_)

Reads data from an Excel file or directory of Excel files and returns a DataFrame.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `context` | `PipelineContext` | The context in which the action is executed. | *required* |
| `file` | `str \| None` | The path to a single Excel file. Either `file` or `path` must be specified. | `None` |
| `path` | `str \| None` | The directory path containing multiple Excel files. Either `file` or `path` must be specified. | `None` |
| `extension` | `str` | The file extension to look for when reading from a directory. | `'xlsx'` |
| `recursive` | `bool` | Whether to include subdirectories when reading from a directory path. | `False` |
| `sheet_name` | `str \| int \| list` | The sheet name(s) or index(es) to read from the Excel file. | `0` |
| `sheet_name_as_column` | `bool` | Whether to add a column with the sheet name to the DataFrame. | `False` |
| `header` | `int \| list[int]` | Row number(s) to use as the column labels. | `0` |
| `index_col` | `int \| list[int] \| None` | Column(s) to use as the index of the DataFrame. | `None` |
| `usecols` | `int \| str \| list \| Callable \| None` | Subset of columns to parse. Can be an integer, string, list, or function. | `None` |
| `dtype` | `str \| None` | Data type for the columns. | `None` |
| `fillna` | `str \| dict[str, list[str]] \| dict[str, str] \| None` | Method or value to use to fill NaN values. | `None` |
| `true_values` | `list \| None` | Values to consider as True. | `None` |
| `false_values` | `list \| None` | Values to consider as False. | `None` |
| `nrows` | `int \| None` | Number of rows to parse. | `None` |
| `na_values` | `list[str] \| dict[str, list[str]] \| None` | Additional strings to recognize as NaN/NA. | `None` |
| `keep_default_na` | `bool` | Whether to append default NaN values when custom `na_values` are specified. | `True` |
| `parse_dates` | `bool \| list \| dict` | Options for parsing date columns. | `False` |
| `date_parser` | `Callable \| None` | Function to use for converting strings to datetime objects. | `None` |
| `thousands` | `str \| None` | Thousands separator to use when parsing numeric columns. | `None` |
| `include_index` | `bool` | Whether to include an index column in the output DataFrame. | `False` |
| `options` | `dict \| None` | Additional options to pass to the DataFrame reader. | `None` |
| `add_metadata_column` | `bool` | Whether to add a metadata column with file information to the DataFrame. | `True` |
| `load_as_strings` | `bool` | Whether to load all columns as strings. | `False` |

Raises:

| Type | Description |
| --- | --- |
| `ValueError` | Raised if both `file` and `path` are specified, or if neither is provided. |

Returns:

| Type | Description |
| --- | --- |
| `PipelineContext` | The updated context, with the read data as a DataFrame. |
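
For reference, the run method can also be called directly. The sketch below is illustrative only: the import path is inferred from the source location shown underneath, existing_context stands in for a PipelineContext supplied by the pipeline framework, and accessing the DataFrame via a data attribute is an assumption based on the data=df keyword passed to context.from_existing.

    from cloe_nessy.pipeline.actions.read_excel import ReadExcelAction  # import path inferred from the source location

    action = ReadExcelAction()
    # `existing_context` is a placeholder for a PipelineContext supplied by the pipeline.
    updated_context = action.run(
        existing_context,
        file="excel_file_folder/excel_files_june/interesting_excel_file.xlsx",
        usecols=["key_column", "interesting_column"],
        load_as_strings=True,
    )
    # Assumption: the updated context exposes the DataFrame as `data`,
    # mirroring the `data=df` keyword passed to `context.from_existing`.
    df = updated_context.data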

Source code in src/cloe_nessy/pipeline/actions/read_excel.py
def run(
    self,
    context: PipelineContext,
    *,
    file: str | None = None,
    path: str | None = None,
    extension: str = "xlsx",
    recursive: bool = False,
    sheet_name: str | int | list = 0,
    sheet_name_as_column: bool = False,
    header: int | list[int] = 0,
    index_col: int | list[int] | None = None,
    usecols: int | str | list | Callable | None = None,
    dtype: str | None = None,
    fillna: str | dict[str, list[str]] | dict[str, str] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    nrows: int | None = None,
    na_values: list[str] | dict[str, list[str]] | None = None,
    keep_default_na: bool = True,
    parse_dates: bool | list | dict = False,
    date_parser: Callable | None = None,
    thousands: str | None = None,
    include_index: bool = False,
    options: dict | None = None,
    add_metadata_column: bool = True,
    load_as_strings: bool = False,
    **_,
) -> PipelineContext:
    """Reads data from an Excel file or directory of Excel files and returns a DataFrame.

    Args:
        context: The context in which the action is executed.
        file: The path to a single Excel file. Either `file` or `path` must be specified.
        path: The directory path containing multiple Excel files. Either `file` or `path` must be specified.
        extension: The file extension to look for when reading from a directory.
        recursive: Whether to include subdirectories when reading from a directory path.
        sheet_name: The sheet name(s) or index(es) to read from the Excel file.
        sheet_name_as_column: Whether to add a column with the sheet name to the DataFrame.
        header: Row number(s) to use as the column labels.
        index_col: Column(s) to use as the index of the DataFrame.
        usecols: Subset of columns to parse. Can be an integer, string, list,
            or function.
        dtype: Data type for the columns.
        fillna: Method or value to use to fill NaN values.
        true_values: Values to consider as True.
        false_values: Values to consider as False.
        nrows: Number of rows to parse.
        na_values: Additional strings to recognize as NaN/NA.
        keep_default_na: Whether to append default NaN values when custom `na_values` are specified.
        parse_dates: Options for parsing date columns.
        date_parser: Function to use for converting strings to datetime objects.
        thousands: Thousands separator to use when parsing numeric columns.
        include_index: Whether to include an index column in the output DataFrame.
        options: Additional options to pass to the DataFrame reader.
        add_metadata_column: Whether to add a metadata column with file information to the DataFrame.
        load_as_strings: Whether to load all columns as strings.

    Raises:
        ValueError: Raised if both `file` and `path` are specified, or if neither is provided.

    Returns:
        The updated context, with the read data as a DataFrame.
    """
    if not options:
        options = dict()

    if file is not None and path is not None:
        self._tabular_logger.error("message: Only one of file or path have to be specified.")
        raise ValueError("Only one of file or path have to be specified.")

    excel_reader = ExcelDataFrameReader()
    if file is not None:
        df = excel_reader.read(
            location=file,
            sheet_name=sheet_name,
            sheet_name_as_column=sheet_name_as_column,
            header=header,
            index_col=index_col,
            usecols=usecols,
            true_values=true_values,
            false_values=false_values,
            nrows=nrows,
            dtype=dtype,
            fillna=fillna,
            na_values=na_values,
            keep_default_na=keep_default_na,
            parse_dates=parse_dates,
            date_parser=date_parser,
            thousands=thousands,
            include_index=include_index,
            options=options,
            add_metadata_column=add_metadata_column,
            load_as_strings=load_as_strings,
        )
    elif path is not None:
        file_list = get_file_paths(path, extension, recursive)
        df_dict: dict = {}
        for file_path in file_list:
            df_dict[file_path] = excel_reader.read(
                location=file_path,
                sheet_name=sheet_name,
                sheet_name_as_column=sheet_name_as_column,
                header=header,
                index_col=index_col,
                usecols=usecols,
                dtype=dtype,
                fillna=fillna,
                true_values=true_values,
                false_values=false_values,
                nrows=nrows,
                na_values=na_values,
                keep_default_na=keep_default_na,
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                include_index=include_index,
                options=options,
                add_metadata_column=add_metadata_column,
                load_as_strings=load_as_strings,
            )
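        # Union the per-file DataFrames into a single DataFrame.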
        df = reduce(DataFrame.unionAll, list(df_dict.values()))

    else:
        self._tabular_logger.error("action_name: READ_EXCEL | message: Either file or path have to be specified.")
        raise ValueError("Either file or path have to be specified.")

    runtime_info = context.runtime_info

    if add_metadata_column:
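        # Collect the distinct source file paths from the __metadata column and
        # record them under "read_files" in the runtime info.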
        read_files_list = list(set([x.file_path for x in df.select("__metadata.file_path").collect()]))
        if runtime_info is None:
            runtime_info = {"read_files": read_files_list}
        else:
            try:
                runtime_info["read_files"] = list(set(runtime_info["read_files"] + read_files_list))
            except KeyError:
                runtime_info["read_files"] = read_files_list

    return context.from_existing(data=df)