"""
Hillstrom Email Marketing
-------------------------
An email marketing dataset from Kevin Hillstrom's MineThatData blog.
See an example using this data at `causeinfer/examples/business_hillstrom
<https://github.com/andrewtavis/causeinfer/blob/main/examples/business_hillstrom.ipynb>`_.
Description found at:
https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html
Based on
Kuchumov, A. pyuplift: Lightweight uplift modeling framework for Python. (2019).
URL: https://github.com/duketemon/pyuplift.
License: https://github.com/duketemon/pyuplift/blob/master/LICENSE.
K. Hillstrom. “The MineThatData E-Mail Analytics And Data Mining Challenge”. 2008. URL:
https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html.
Contents
download_hillstrom,
_format_data,
load_hillstrom
"""
import os
import numpy as np
import pandas as pd
from causeinfer.data.download_utils import download_file, get_download_paths
def download_hillstrom(
    data_path=None,
    url="http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv",
):
    """
    Downloads the dataset from Kevin Hillstrom's blog.

    Parameters
    ----------
        data_path : str : optional (default=None)
            A user specified path for where the data should go.

        url : str
            The url from which the data is to be downloaded.

    Returns
    -------
        None
            The data is saved as 'hillstrom.csv' in a 'datasets' folder, unless otherwise specified.
            Nothing is downloaded if a file already exists at the target path.
    """
    directory_path, dataset_path = get_download_paths(
        data_path, file_directory="datasets", file_name="hillstrom.csv"
    )

    if not os.path.isdir(directory_path):
        # exist_ok guards against another process creating the folder between
        # the check above and this call.
        os.makedirs(directory_path, exist_ok=True)
        # basename/normpath instead of splitting on "/" so the folder name is
        # also found for paths with a trailing separator or OS-specific separators.
        directory_name = os.path.basename(os.path.normpath(directory_path))
        print(
            "/{} has been created in your local directory".format(directory_name)
        )

    if not os.path.exists(dataset_path):
        download_file(url=url, output_path=dataset_path, zip_file=False)
    else:
        print("The dataset already exists at {}".format(dataset_path))
def load_hillstrom(
    file_path=None, format_covariates=True, download_if_missing=True, normalize=True,
):
    """
    Loads the Hillstrom dataset with formatting if desired.

    Parameters
    ----------
        file_path : str : optional (default=None)
            Specify another path for the dataset.
            By default the dataset should be stored in the 'datasets' folder in the cwd.

        format_covariates : bool : optional (default=True)
            Forwarded to '_format_data'; indicates whether the covariates should be
            formatted (True) or the raw columns kept (False).

        download_if_missing : bool : optional (default=True)
            Download the dataset with 'download_hillstrom' if it is not found at the expected path.

        normalize : bool : optional (default=True)
            Forwarded to '_format_data'; normalize dataset columns to prepare them for ML methods.

    Returns
    -------
        data : dict with the following keys:
            description : str
                A description of the Hillstrom email marketing dataset.

            dataset_full : numpy.ndarray : (64000, 12) or formatted (64000, 22)
                The full dataset with features, treatment, and target variables.

            dataset_full_names : numpy.ndarray, size 12 or formatted 22
                Array of dataset variables names.

            features : numpy.ndarray : (64000, 8) or formatted (64000, 18)
                Each row corresponding to the 8 feature values in order.

            feature_names : numpy.ndarray, size 8 or formatted 18
                Array of feature names.

            treatment : numpy.ndarray : (64000,)
                Each value corresponds to the treatment.

            response_spend : numpy.ndarray : (64000,)
                Each value corresponds to how much customers spent during the two-week outcome period.

            response_visit : numpy.ndarray : (64000,)
                Each value corresponds to whether people visited the site during the two-week outcome period.

            response_conversion : numpy.ndarray : (64000,)
                Each value corresponds to whether they purchased at the site (i.e. converted) during the two-week outcome period.

    Raises
    ------
        FileNotFoundError
            If the dataset is missing and 'download_if_missing' is False.
    """
    # Check that the dataset exists.
    directory_path, dataset_path = get_download_paths(
        file_path=file_path, file_directory="datasets", file_name="hillstrom.csv",
    )

    # Fill above path if not.
    if not os.path.exists(dataset_path):
        if not download_if_missing:
            raise FileNotFoundError(
                "The dataset does not exist. "
                "Use the 'download_hillstrom' function to download the dataset."
            )
        # NOTE(review): 'directory_path' is forwarded as 'data_path', which
        # 'download_hillstrom' hands to 'get_download_paths' as its first argument
        # (named 'file_path' in the keyword call above) - confirm that the helper
        # resolves a directory here to the same 'dataset_path' that is read below.
        download_hillstrom(directory_path)

    # Read the data.
    df = pd.read_csv(dataset_path)

    # Load formatted or raw data; both flags are forwarded unchanged.
    df = _format_data(df, format_covariates=format_covariates, normalize=normalize)

    # Each sentence carries its own trailing space because adjacent string
    # literals are concatenated without any separator.
    description = (
        "The Hillstrom dataset contains 64,000 customers who purchased within twelve months. "
        "The customers were involved in an e-mail marketing test. "
        "1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise. "
        "1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise. "
        "1/3 were randomly chosen to not receive an e-mail campaign. "
        "During a period of two weeks following the e-mail campaign, results were tracked. "
        "Targeting for causal inference can be derived using visit, conversion, or total spent."
    )

    # Fields dropped to split the data for the user.
    drop_fields = ["spend", "visit", "conversion", "treatment"]
    feature_names = [column for column in df.columns if column not in drop_fields]

    return {
        "description": description,
        "dataset_full": df.values,
        "dataset_full_names": np.array(df.columns),
        "features": df.drop(drop_fields, axis=1).values,
        "feature_names": np.array(feature_names),
        "treatment": df["treatment"].values,
        "response_spend": df["spend"].values,
        "response_visit": df["visit"].values,
        "response_conversion": df["conversion"].values,
    }