# data manipulation libraries
import pandas as pd
import numpy as np

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px # interactive plots

# machine learning libraries
from sklearn.cluster import KMeans

# other libraries
from datetime import date
import os
import warnings
warnings.filterwarnings('ignore') # to remove any warning messages


# Load the mall-customer records from the local input folder.
csv_path = "input/Mall_Customers.csv"
df = pd.read_csv(csv_path)

# Print a structural summary of the frame (columns, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


# Count the missing entries per column (isnull is the pandas alias of isna).
missing_per_column = df.isnull().sum()
missing_per_column

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64


# Label every customer with a life-stage group derived from 'Age'.
# Each key is the inclusive upper age limit of the group it maps to.
# np.select keeps the first condition that matches, so the limits must stay
# in ascending order; ages above the last limit fall through to the default.
age_stage_limits = {
    18: '1. Teenager',
    25: '2. Young Adult',
    55: '3. Adult',
    70: '4. Senior',
}
criteria = [df['Age'] <= limit for limit in age_stage_limits]
values = list(age_stage_limits.values())
df['Age_Category'] = np.select(criteria, values, default='5. Retired')
df.head()


# Assign every customer to a 10-year age bucket.
# Upper bounds are inclusive (age 20 lands in '2. 10-20'); np.select keeps the
# first matching condition, so the bounds are generated in ascending order.
bucket_upper_bounds = range(10, 71, 10)  # 10, 20, ..., 70
criteria = [df['Age'] <= upper for upper in bucket_upper_bounds]
values = [
    f'{rank}. {upper - 10}-{upper}'
    for rank, upper in enumerate(bucket_upper_bounds, start=1)
]
# Anything older than the last bound gets the catch-all label.
df['Age_Bucket'] = np.select(criteria, values, default='8. +71')
df.head()


# One 0/1 indicator column per gender: '<gender> Gender' is 1 when the row's
# 'Gender' equals that value and 0 otherwise.
for gender in ('Male', 'Female'):
    df[f'{gender} Gender'] = np.where(df['Gender'] == gender, 1, 0)
df.head()


# Plot the distribution of each numerical variable (age, annual income and
# spending score) side by side: a 12-bin density histogram with a KDE overlay.
plt.figure(figsize=(14, 5))
# Spacing is the same for every panel, so it is set once instead of per loop.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for n, x in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)'], start=1):
    ax = plt.subplot(1, 3, n)
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # is the documented replacement for a histogram + KDE curve.
    sns.histplot(df[x], bins=12, color='#2494E8', kde=True, stat='density', ax=ax)
    ax.set_title('Distribution of {}'.format(x))
plt.show()


# Bar chart of customer counts: one bar per gender, coloured by age category.
group_sizes = df.groupby(['Gender', 'Age_Category']).size()
gender_count_df = group_sizes.reset_index(name='Count')

age_category_palette = ["orange", "#F257EF", "#2494E8", "grey"]
fig = px.bar(
    gender_count_df,
    x="Gender",
    y="Count",
    color="Age_Category",
    text='Count',  # print the count on each bar segment
    title="Breakdown of Female & Male Customers",
    color_discrete_sequence=age_category_palette,
    width=950,
    height=400,
)
fig.show()


# Customer counts per 10-year age bucket, with the genders drawn as
# side-by-side bars inside each bucket.
bucket_sizes = df.groupby(['Gender', 'Age_Bucket']).size()
gender_count_df = bucket_sizes.reset_index(name='Count')

gender_palette = ["#2494E8", "grey"]
fig = px.bar(
    gender_count_df,
    x="Age_Bucket",
    y="Count",
    color="Gender",
    text='Count',  # print the count on each bar
    title="Breakdown of Female & Male Customers by Age Buckets",
    color_discrete_sequence=gender_palette,
    barmode="group",
    width=950,
    height=400,
)
fig.show()


# Any correlation between annual income & age?
# Scatter of income against age with one colour per gender.
fig = px.scatter(
    df,
    x="Age",
    y="Annual Income (k$)",
    title='Correlation between Annual Income ($k) & Age',
    color="Gender",
    width=950,
    height=400,
)
# The points are coloured by 'Gender', so the legend must be titled to match
# (it previously read 'Age Category', which mislabelled the colours).
fig.update_layout(legend_title_text='Gender')
fig.show()


# Any correlation between annual income & spending score?
# One colour per age category; hovering a point also shows its age bucket.
income_vs_spending_opts = dict(
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    title='Correlation between Annual Income ($k) & Spending Score (1-100) by Age Category',
    color="Age_Category",
    hover_data=['Age_Bucket'],
    width=950,
    height=400,
)
fig = px.scatter(df, **income_vs_spending_opts)
fig.update_layout(legend_title_text='Age Category')
fig.show()


# Female customers only: income vs. spending score, coloured by age category.
female_customers = df.loc[df['Gender'] == 'Female']
fig = px.scatter(
    female_customers,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    title='Female - Correlation Trend',
    color="Age_Category",
    hover_data=['Age_Bucket'],  # extra field shown when hovering a point
    width=950,
    height=400,
)
fig.update_layout(legend_title_text='Age Category')
fig.show()

# Male customers only: income vs. spending score, coloured by age category.
male_customers = df.loc[df['Gender'] == 'Male']
fig = px.scatter(
    male_customers,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    title='Male - Correlation Trend',
    color="Age_Category",
    hover_data=['Age_Bucket'],  # extra field shown when hovering a point
    width=950,
    height=400,
)
fig.update_layout(legend_title_text='Age Category')
fig.show()


# Annotated heatmap of the pairwise correlations between the int64 columns,
# with the 'CustomerID' identifier left out.
# NOTE(review): every int64 column present at this point is included, which
# may cover the 0/1 gender flags added earlier - confirm that is intended.
int_columns = df.select_dtypes(include='int64')
numerical_columns = int_columns.drop(columns='CustomerID')
matrix = numerical_columns.corr()

plt.figure(figsize=(14, 5))
sns.heatmap(matrix, annot=True, cmap="Blues")
plt.show()


# Cluster the customers into 5 groups on annual income and spending score,
# and store each row's cluster id in a new 'Cluster' column.
# random_state pins the centroid initialisation so the cluster ids are the
# same on every run; n_init is explicit because its default differs between
# scikit-learn releases.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
y_predicted = km.fit_predict(df[['Annual Income (k$)', 'Spending Score (1-100)']])
df['Cluster'] = y_predicted

# Split the customers into one dataframe per cluster id (0 through 4).
df1, df2, df3, df4, df5 = (df[df['Cluster'] == label] for label in range(5))

# Scatter plot of the clusters: income on x, spending score on y, one colour
# per cluster, with the fitted centroids overlaid as red stars.
# (A stray colour-list expression that had no effect was removed here.)
cluster_styles = [
    (df1, 'green'),
    (df2, '#2494E8'),
    (df3, 'black'),
    (df4, '#F257EF'),
    (df5, 'orange'),
]

plt.figure(figsize=(14, 5))
for number, (cluster_df, colour) in enumerate(cluster_styles, start=1):
    plt.scatter(
        cluster_df['Annual Income (k$)'],
        cluster_df['Spending Score (1-100)'],
        color=colour,
        alpha=0.7,
        label=f'Cluster #{number}',
    )
# The model was fitted on [income, spending score], so centroid column 0 is
# the x coordinate and column 1 the y coordinate.
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], color='red', marker='*', label='centroid')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

Customer Segmentation (K-Means) | Analysis

Project Context:

Executive Summary:

[1] Import Libraries

[2] Data Load & Exploration

[3] Feature Engineering

[4] Data Visualization

[5] Modelling

[6] Conclusion

	CustomerID	Gender	Age	Annual Income (k$)	Spending Score (1-100)	Age_Category
0	1	Male	19	15	39	2. Young Adult
1	2	Male	21	15	81	2. Young Adult
2	3	Female	20	16	6	2. Young Adult
3	4	Female	23	16	77	2. Young Adult
4	5	Female	31	17	40	3. Adult

	CustomerID	Gender	Age	Annual Income (k$)	Spending Score (1-100)	Age_Category	Age_Bucket
0	1	Male	19	15	39	2. Young Adult	2. 10-20
1	2	Male	21	15	81	2. Young Adult	3. 20-30
2	3	Female	20	16	6	2. Young Adult	2. 10-20
3	4	Female	23	16	77	2. Young Adult	3. 20-30
4	5	Female	31	17	40	3. Adult	4. 30-40