import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's "whitegrid" style for the plots drawn later in this script.
sns.set_style("whitegrid")

# Location of the customer-segmentation CSV used by the rest of this script.
# The literal is wrapped for readability; the pieces concatenate to one URL.
file_url = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%204/data/"
    "Cust_Segmentation.csv"
)
# Download and parse the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_url)
# Preview of the first rows (a bare expression: shown only by a notebook).
df.head()

# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Customer Id      850 non-null    int64  
 1   Age              850 non-null    int64  
 2   Edu              850 non-null    int64  
 3   Years Employed   850 non-null    int64  
 4   Income           850 non-null    int64  
 5   Card Debt        850 non-null    float64
 6   Other Debt       850 non-null    float64
 7   Defaulted        700 non-null    float64
 8   Address          850 non-null    object 
 9   DebtIncomeRatio  850 non-null    float64
dtypes: float64(4), int64(5), object(1)
memory usage: 66.5+ KB

# Draw one panel per feature on a 3x3 grid. The identifier-like columns
# ("Customer Id", "Address") are excluded from plotting.
plotted = df.drop(["Customer Id", "Address"], axis=1)
fig, axs = plt.subplots(3, 3, figsize=(15, 15))

# Iterating a DataFrame yields its column names; zip stops at the shorter
# of the two sequences.
for ax, feature in zip(axs.flatten(), plotted):
    column = df[feature]
    if len(column.unique()) > 10:
        # More than 10 distinct values: histogram of the column.
        sns.histplot(data=df, x=feature, ax=ax)
    else:
        # At most 10 distinct values: one bar per value, height = its count.
        labels, sizes = np.unique(column, return_counts=True)
        sns.barplot(x=labels, y=sizes, hue=labels, ax=ax, palette="tab10", legend=False)
    # Set after plotting so the panel carries its title and no x-axis label.
    ax.set_xlabel("")
    ax.set_title(feature)

# Hide the bottom-right grid cell.
axs[2, 2].axis("off")
plt.tight_layout()
plt.show()

# Drop the two identifier-like columns and replace every missing value with 0.
df = df.drop(columns=["Customer Id", "Address"]).fillna(0)

# Standardize the remaining columns with StandardScaler. fit_transform
# returns a plain array, so it is wrapped back into a DataFrame that reuses
# df's column names.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_scaled.head()

# Partition the customers into 3 clusters. n_init=12 runs k-means 12 times
# from different centroid seeds and keeps the best of those runs.
kmeans = KMeans(n_clusters=3, n_init=12)
# Fit on the standardized features, not the raw frame: k-means is
# distance-based, so on unscaled data the largest-magnitude column (Income)
# dominates the clustering and the scaling step above would be wasted.
# labels_ still lines up row-for-row with df, since df_scaled was built from
# it without reordering.
kmeans.fit(df_scaled)

KMeans(n_clusters=3, n_init=12)

KMeans(n_clusters=3, n_init=12)

# Attach each row's cluster id to the unscaled table, then profile every
# cluster by the mean of each feature.
df["Label"] = kmeans.labels_
df.groupby("Label").mean()

# 3-D scatter of education, age and income, coloured by cluster id.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection="3d")
cluster_colors = df["Label"].astype(float)
ax.scatter(df["Edu"], df["Age"], df["Income"], c=cluster_colors, cmap="viridis")
ax.set(xlabel="Education", ylabel="Age", zlabel="Income")
plt.tight_layout()
plt.show()

	Customer Id	Age	Edu	Years Employed	Income	Card Debt	Other Debt	Defaulted	Address	DebtIncomeRatio
0	1	41	2	6	19	0.124	1.073	0.0	NBA001	6.3
1	2	47	1	26	100	4.582	8.218	0.0	NBA021	12.8
2	3	33	2	10	57	6.111	5.802	1.0	NBA013	20.9
3	4	29	2	4	19	0.681	0.516	0.0	NBA009	6.3
4	5	47	1	31	253	9.308	8.908	0.0	NBA008	7.2

	Age	Edu	Years Employed	Income	Card Debt	Other Debt	Defaulted	DebtIncomeRatio
0	0.742915	0.312122	-0.378790	-0.718459	-0.683811	-0.590489	-0.523797	-0.576525
1	1.489490	-0.766349	2.573721	1.384325	1.414474	1.512962	-0.523797	0.391387
2	-0.252518	0.312122	0.211712	0.268032	2.134141	0.801704	1.909138	1.597554
3	-0.750235	0.312122	-0.674041	-0.718459	-0.421643	-0.754467	-0.523797	-0.576525
4	1.489490	-0.766349	3.311849	5.356249	3.638900	1.716094	-0.523797	-0.442507

	Age	Edu	Years Employed	Income	Card Debt	Other Debt	Defaulted	DebtIncomeRatio
Label
0	41.333333	1.956284	15.256831	83.928962	3.103639	5.765279	0.136612	10.724590
1	45.388889	2.666667	19.555556	227.166667	5.678444	10.907167	0.222222	7.322222
2	32.964561	1.614792	6.374422	31.164869	1.032541	2.104133	0.237288	10.094761

Customer Segmentation

Import libraries

Load the dataset

Understand the dataset

Visualize the dataset

Preprocess the dataset

Train the k-Means clustering

Evaluate the model