Source code for causeinfer.data.hillstrom

"""
Hillstrom Email Marketing
-------------------------

An email marketing dataset from Kevin Hillstrom's MineThatData blog.

See an example using this data at `causeinfer/examples/business_hillstrom
<https://github.com/andrewtavis/causeinfer/blob/main/examples/business_hillstrom.ipynb>`_.

Description found at:
    https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html

Based on
    Kuchumov, A. pyuplift: Lightweight uplift modeling framework for Python. (2019).
    URL: https://github.com/duketemon/pyuplift.
    License: https://github.com/duketemon/pyuplift/blob/master/LICENSE.

    K. Hillstrom. “The MineThatData E-Mail Analytics And Data Mining Challenge”. 2008. URL:
    https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html.

Contents
    download_hillstrom,
    _format_data,
    load_hillstrom
"""

import os

import numpy as np
import pandas as pd
from causeinfer.data.download_utils import download_file, get_download_paths


[docs]def download_hillstrom(
    data_path=None,
    url="http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv",
):
    """
    Downloads the dataset from Kevin Hillstrom's blog.

    Parameters
    ----------
        data_path : str : optional (default=None)
            A user specified path for where the data should go.

        url : str
            The url from which the data is to be downloaded.

    Returns
    -------
        The data 'hillstrom.csv' in a 'datasets' folder, unless otherwise specified.
    """
    directory_path, dataset_path = get_download_paths(
        data_path, file_directory="datasets", file_name="hillstrom.csv"
    )
    if not os.path.isdir(directory_path):
        os.makedirs(directory_path)
        print(
            "/{} has been created in your local directory".format(
                directory_path.split("/")[-1]
            )
        )

    if not os.path.exists(dataset_path):
        download_file(url=url, output_path=dataset_path, zip_file=False)
    else:
        print("The dataset already exists at {}".format(dataset_path))


[docs]def _format_data(df, format_covariates=True, normalize=True):
    """
    Formats the data upon loading for consistent data preparation.

    Parameters
    ----------
        df : pd.DataFrame
            The original unformatted version of the data.

        format_covariates : bool : optional (default=True), controlled in load_hillstrom
            - True: creates dummy columns and encodes the data.

            - False: only steps for data readability will be taken.

        normalize : bool : optional (default=True), controlled in load_hillstrom
            Normalize dataset columns to prepare them for ML methods.

    Returns
    -------
        df : pd.DataFrame
            A formated version of the data.
    """
    # Split away the history segment index within the values and other formatting.
    df["history_segment"] = df["history_segment"].apply(lambda s: s.split(") ")[1])
    df["history_segment"] = df["history_segment"].astype(str)
    df["history_segment"] = [
        i.replace("$", "").replace(",", "").replace("-", "_").replace(" ", "")
        for i in df["history_segment"]
    ]

    # Column types to numeric.
    df[
        [
            "recency",
            "history",
            "mens",
            "womens",
            "newbie",
            "conversion",
            "visit",
            "spend",
        ]
    ] = df[
        [
            "recency",
            "history",
            "mens",
            "womens",
            "newbie",
            "conversion",
            "visit",
            "spend",
        ]
    ].apply(
        pd.to_numeric
    )

    # Rename columns.
    df = df.rename(columns={"segment": "treatment"})

    if format_covariates:

        # Create dummy columns.
        dummy_cols = ["zip_code", "history_segment", "channel"]
        for col in dummy_cols:
            df = pd.get_dummies(df, columns=[col], prefix=col)

        # Encode the treatment column.
        treatment_encoder = {"No E-Mail": 0, "Mens E-Mail": 1, "Womens E-Mail": 2}
        df["treatment"] = df["treatment"].apply(lambda x: treatment_encoder[x])

    if normalize:

        normalization_fields = ["recency", "history"]
        df[normalization_fields] = (
            df[normalization_fields] - df[normalization_fields].mean()
        ) / df[normalization_fields].std()

    # Format column names.
    df.rename(columns=lambda x: x.lower(), inplace=True)

    # Put treatment and response at the front and end of the df respectively.
    cols = list(df.columns)
    cols.insert(-1, cols.pop(cols.index("spend")))
    cols.insert(-1, cols.pop(cols.index("conversion")))
    cols.insert(-1, cols.pop(cols.index("visit")))
    cols.insert(0, cols.pop(cols.index("treatment")))
    df = df.loc[:, cols]

    return df


[docs]def load_hillstrom(
    file_path=None, format_covariates=True, download_if_missing=True, normalize=True,
):
    """
    Loads the Hillstrom dataset with formatting if desired.

    Parameters
    ----------
        file_path : str : optional (default=None)
            Specify another path for the dataset.

            By default the dataset should be stored in the 'datasets' folder in the cwd.

        format_covariates : bool : optional (default=True)
            Indicates whether raw data should be loaded without covariate manipulation.

        download_if_missing : bool : optional (default=True)
            Download the dataset if it is not downloaded before using 'download_hillstrom'.

        normalize : bool : optional (default=True)
            Normalize dataset columns to prepare them for ML methods.

    Returns
    -------
        data : dict object with the following attributes:

            data.description : str
                A description of the Hillstrom email marketing dataset.

            data.dataset_full : numpy.ndarray : (64000, 12) or formatted (64000, 22)
                The full dataset with features, treatment, and target variables.

            data.dataset_full_names : list, size 12 or formatted 22
                List of dataset variables names.

            data.features : numpy.ndarray : (64000, 8) or formatted (64000, 18)
                Each row corresponding to the 8 feature values in order.

            data.feature_names : list, size 8 or formatted 18
                List of feature names.

            data.treatment : numpy.ndarray : (64000,)
                Each value corresponds to the treatment.

            data.response_spend : numpy.ndarray : (64000,)
                Each value corresponds to how much customers spent during the two-week outcome period.

            data.response_visit : numpy.ndarray : (64000,)
                Each value corresponds to whether people visited the site during the two-week outcome period.

            data.response_conversion : numpy.ndarray : (64000,)
                Each value corresponds to whether they purchased at the site (i.e. converted) during the two-week outcome period.
    """
    # Check that the dataset exists.
    directory_path, dataset_path = get_download_paths(
        file_path=file_path, file_directory="datasets", file_name="hillstrom.csv",
    )
    # Fill above path if not.
    if not os.path.exists(dataset_path):
        if download_if_missing:
            download_hillstrom(directory_path)
        else:
            raise FileNotFoundError(
                "The dataset does not exist."
                "Use the 'download_hillstrom' function to download the dataset."
            )

    # Read the data.
    df = pd.read_csv(dataset_path)

    # Load formated or raw data.
    if format_covariates:
        if normalize:
            df = _format_data(df, format_covariates=True, normalize=True)
        else:
            df = _format_data(df, format_covariates=True, normalize=False)

    else:
        if normalize:
            df = _format_data(df, format_covariates=False, normalize=True)
        else:
            df = _format_data(df, format_covariates=False, normalize=False)

    description = (
        "The Hillstrom dataset contains 64,000 customers who purchased within twelve months."
        "The customers were involved in an e-mail marketing test."
        "1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise."
        "1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise."
        "1/3 were randomly chosen to not receive an e-mail campaign."
        "During a period of two weeks following the e-mail campaign, results were tracked."
        "Targeting for causal inference can be derived using visit, conversion, or total spent."
    )

    # Fields dropped to split the data for the user.
    drop_fields = ["spend", "visit", "conversion", "treatment"]

    return {
        "description": description,
        "dataset_full": df.values,
        "dataset_full_names": np.array(df.columns),
        "features": df.drop(drop_fields, axis=1).values,
        "feature_names": np.array(
            list(filter(lambda x: x not in drop_fields, df.columns))
        ),
        "treatment": df["treatment"].values,
        "response_spend": df["spend"].values,
        "response_visit": df["visit"].values,
        "response_conversion": df["conversion"].values,
    }