Author: Philip Wong (philipkfw@gmail.com)
Kaggle Source: https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python
This case study uses a mock dataset hosted on Kaggle, intended to teach the concepts of customer segmentation, specifically through unsupervised learning (K-Means clustering).
Our 'client' runs a supermarket and has gathered basic data about their customers by issuing membership cards. Our goal is to better understand these customers in a way that will be useful for our client's marketing team. Some of the key questions we'll be looking to answer are:
# data manipulation libraries
import pandas as pd
import numpy as np
# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px # interactive plots
# machine learning libraries
from sklearn.cluster import KMeans
# other libraries
from datetime import date
import os
import warnings
warnings.filterwarnings('ignore') # to remove any warning messages
# load the customer records from the local CSV file
df = pd.read_csv(filepath_or_buffer="input/Mall_Customers.csv")
# summarize the columns: names, non-null counts and dtypes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Gender 200 non-null object 2 Age 200 non-null int64 3 Annual Income (k$) 200 non-null int64 4 Spending Score (1-100) 200 non-null int64 dtypes: int64(4), object(1) memory usage: 7.9+ KB
# count the missing values in each column (isnull is an alias of isna)
df.isnull().sum()
CustomerID 0 Gender 0 Age 0 Annual Income (k$) 0 Spending Score (1-100) 0 dtype: int64
# categorize ages by logical groups
# (inclusive upper age bound, label) pairs, listed from youngest to oldest
age_groups = [
    (18, '1. Teenager'),
    (25, '2. Young Adult'),
    (55, '3. Adult'),
    (70, '4. Senior'),
]
# np.select uses the first condition that is true for each row, so the
# ascending order of the bounds is what keeps the groups non-overlapping
criteria = [df['Age'] <= upper_bound for upper_bound, _ in age_groups]
values = [label for _, label in age_groups]
# any age above the last bound falls through to the default label
df['Age_Category'] = np.select(criteria, values, default='5. Retired')
df.head()
CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Age_Category | |
---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 2. Young Adult |
1 | 2 | Male | 21 | 15 | 81 | 2. Young Adult |
2 | 3 | Female | 20 | 16 | 6 | 2. Young Adult |
3 | 4 | Female | 23 | 16 | 77 | 2. Young Adult |
4 | 5 | Female | 31 | 17 | 40 | 3. Adult |
# categorize ages by 10-year interval buckets
# inclusive upper bounds 10, 20, ..., 70 -- each bucket contains its upper
# bound, e.g. a 20-year-old is labelled '2. 10-20'
bucket_upper_bounds = range(10, 80, 10)
criteria = [df['Age'] <= upper_bound for upper_bound in bucket_upper_bounds]
values = [
    '1. 0-10',
    '2. 10-20',
    '3. 20-30',
    '4. 30-40',
    '5. 40-50',
    '6. 50-60',
    '7. 60-70',
]
# np.select keeps the first matching condition; ages above 70 get the default
df['Age_Bucket'] = np.select(criteria, values, default='8. +71')
df.head()
CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Age_Category | Age_Bucket | |
---|---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 2. Young Adult | 2. 10-20 |
1 | 2 | Male | 21 | 15 | 81 | 2. Young Adult | 3. 20-30 |
2 | 3 | Female | 20 | 16 | 6 | 2. Young Adult | 2. 10-20 |
3 | 4 | Female | 23 | 16 | 77 | 2. Young Adult | 3. 20-30 |
4 | 5 | Female | 31 | 17 | 40 | 3. Adult | 4. 30-40 |
# add 2 new fields -> one 1/0 indicator column per gender
# ('Male Gender' first, then 'Female Gender')
for gender in ('Male', 'Female'):
    df['{} Gender'.format(gender)] = np.where(df['Gender'] == gender, 1, 0)
df.head()
CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Age_Category | Age_Bucket | Male Gender | Female Gender | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 2. Young Adult | 2. 10-20 | 1 | 0 |
1 | 2 | Male | 21 | 15 | 81 | 2. Young Adult | 3. 20-30 | 1 | 0 |
2 | 3 | Female | 20 | 16 | 6 | 2. Young Adult | 2. 10-20 | 0 | 1 |
3 | 4 | Female | 23 | 16 | 77 | 2. Young Adult | 3. 20-30 | 0 | 1 |
4 | 5 | Female | 31 | 17 | 40 | 3. Adult | 4. 30-40 | 0 | 1 |
Insight #1: Referring to the distribution plots below, there's a high number of customers aged between 19 and 39 years old, with annual salaries ranging from \$25K to \$80K.
# plot distribution of numerical variables (age, annual income, & spending score)
numeric_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
# one row of side-by-side panels, one per feature
dist_fig, dist_axes = plt.subplots(1, len(numeric_features), figsize=(14, 5))
# spacing only needs to be set once for the whole figure, not once per panel
dist_fig.subplots_adjust(hspace=0.5, wspace=0.5)
for ax, feature in zip(dist_axes, numeric_features):
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the replacement seaborn documents for it
    sns.histplot(df[feature], bins=12, kde=True, stat='density',
                 color='#2494E8', ax=ax)
    ax.set_title('Distribution of {}'.format(feature))
# render the figure once, after every panel has been drawn
plt.show()
Insight #2: Referring to the two plots below:
# plot gender count
# one row per (Gender, Age_Category) pair holding the number of customers in it
gender_count_df = (
    df.groupby(['Gender', 'Age_Category'])
    .size()
    .reset_index(name='Count')
)
# bars are stacked per gender and coloured by age category
gender_bar_options = dict(
    x="Gender",
    y="Count",
    color="Age_Category",
    text='Count',
    title="Breakdown of Female & Male Customers",
    color_discrete_sequence=["orange", "#F257EF", "#2494E8", "grey"],
    width=950,
    height=400,
)
fig = px.bar(gender_count_df, **gender_bar_options)
fig.show()
# let's investigate the distribution of age by 10-year age buckets
# one row per (Gender, Age_Bucket) pair holding the number of customers in it
gender_count_df = (
    df.groupby(['Gender', 'Age_Bucket'])
    .size()
    .reset_index(name='Count')
)
# grouped (side-by-side) bars: one bar per gender within each age bucket
bucket_bar_options = dict(
    x="Age_Bucket",
    y="Count",
    color="Gender",
    text='Count',
    title="Breakdown of Female & Male Customers by Age Buckets",
    color_discrete_sequence=["#2494E8", "grey"],
    barmode="group",
    width=950,
    height=400,
)
fig = px.bar(gender_count_df, **bucket_bar_options)
fig.show()
Insight #3: Let's see if there's any correlation between a customer's annual income relative to their age
# Any correlation between annual income & age?
# each point is one customer, coloured by gender
fig = px.scatter(
    df,
    x="Age",
    y="Annual Income (k$)",
    title='Correlation between Annual Income ($k) & Age',
    color="Gender",
    width=950,
    height=400,
)
# the legend title must name the column used for colour (Gender); it was
# previously labelled 'Age Category', which did not match the plotted groups
fig.update_layout(legend_title_text='Gender')
fig.show()
Insight #4: Referring to the scatter plot below:
# Any correlation between annual income & spending score?
# each point is one customer, coloured by age category; the 10-year age
# bucket is added to the hover tooltip
income_spending_options = dict(
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    title='Correlation between Annual Income ($k) & Spending Score (1-100) by Age Category',
    color="Age_Category",
    hover_data=['Age_Bucket'],
    width=950,
    height=400,
)
fig = px.scatter(df, **income_spending_options)
fig.update_layout(legend_title_text='Age Category')
fig.show()
Insight #5: Let's verify whether our correlation trends vary by gender
# draw the same income vs. spending-score scatter once per gender
# (female first, then male) so the two trends can be compared
for gender in ('Female', 'Male'):
    gender_df = df[df['Gender'] == gender]
    fig = px.scatter(
        gender_df,
        x="Annual Income (k$)",
        y="Spending Score (1-100)",
        title='{} - Correlation Trend'.format(gender),
        color="Age_Category",
        hover_data=['Age_Bucket'],
        width=950,
        height=400,
    )
    fig.update_layout(legend_title_text='Age Category')
    fig.show()
# Correlation among numerical variables
# keep every int64 column except the CustomerID identifier
# NOTE(review): this presumably also includes the 1/0 'Male Gender' and
# 'Female Gender' columns when they are stored as int64 -- confirm
numerical_columns = df.select_dtypes(include='int64').drop(columns='CustomerID')
# pairwise correlation between the remaining columns
matrix = numerical_columns.corr()
plt.figure(figsize=(14, 5))
# annotate each cell with its correlation coefficient
sns.heatmap(matrix, annot=True, cmap="Blues")
plt.show()
The plot below shows the relationship between annual income and spending score, clustered into 5 distinct groups. We will prioritize these groups from 1 to 5 (1 = highest priority).
# cluster customers on annual income and spending score
cluster_features = ['Annual Income (k$)', 'Spending Score (1-100)']
# a fixed random_state makes the cluster numbering reproducible between runs
# (the clusters are referred to by number), and an explicit n_init avoids
# depending on a default that differs between scikit-learn versions
km = KMeans(n_clusters=5, n_init=10, random_state=42)
y_predicted = km.fit_predict(df[cluster_features])
df['Cluster'] = y_predicted
# separate dataframes for each cluster (labels 0-4 -> df1-df5)
df1, df2, df3, df4, df5 = (df[df.Cluster == label] for label in range(5))
# plot scatter plot: one colour per cluster, plus the fitted centroids
cluster_colors = ['green', '#2494E8', 'black', '#F257EF', 'orange']
plt.figure(figsize=(14, 5))
for number, (cluster_df, color) in enumerate(
        zip((df1, df2, df3, df4, df5), cluster_colors), start=1):
    plt.scatter(cluster_df['Annual Income (k$)'],
                cluster_df['Spending Score (1-100)'],
                color=color, alpha=0.7,
                label='Cluster #{}'.format(number))
# cluster_centers_ columns follow the order of cluster_features:
# column 0 is income, column 1 is spending score
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            color='red', marker='*', label='centroid')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()