import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
sns.set_style("whitegrid")

# Remote CSV holding the patient/drug records analysed in the rest of this file.
file_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/drug200.csv'
df = pd.read_csv(file_url)
# Preview the first rows of the loaded frame.
df.head()

# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB

# Draw one panel per column on a 2x3 grid: a bar chart of value counts for
# non-numeric columns and a histogram for numeric ones.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))

for ax, feature in zip(axs.flatten(), df.columns):
    # Ask "is this numeric?" instead of comparing against the object dtype:
    # text columns are not guaranteed to be stored as object (pandas also has
    # a dedicated string dtype), and a missed match would silently push them
    # into the histogram branch.
    if pd.api.types.is_numeric_dtype(df[feature]):
        sns.histplot(data=df, x=feature, ax=ax)
    else:
        # Sorted distinct values together with how often each one occurs.
        labels, sizes = np.unique(df[feature], return_counts=True)
        # hue=labels colours each bar separately; the legend would only
        # repeat the tick labels, so it is switched off.
        sns.barplot(x=labels, y=sizes, hue=labels, ax=ax, palette="tab10", legend=False)

    # The column name is shown as the panel title rather than as an x label.
    ax.set_xlabel("")
    ax.set_title(feature)

plt.tight_layout()
plt.show()

# Pie chart of how often each drug appears in the "Drug" column.
labels, sizes = np.unique(df["Drug"], return_counts=True)
total = sum(sizes)


def _wedge_label(pct):
    """Return the wedge caption: its percentage and the matching row count."""
    return "{:.2f}%\n({:d})".format(pct, round(pct/100 * total))


fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    sizes,
    textprops={'color': "w", 'fontsize': '12'},
    autopct=_wedge_label,
)
ax.legend(labels)
ax.set_title("Drug")
plt.show()

# Separate the feature matrix from the target column.
X = df.drop("Drug", axis=1)
y = df["Drug"]

# Integer-encode every non-numeric feature so the tree can consume it.
# The check is "not numeric" rather than "object dtype" because text columns
# are not guaranteed to be stored as object; a missed match would leave raw
# strings in X and make the later fit fail.
# LabelEncoder numbers the sorted distinct values 0..n-1, so the codes are
# plain identifiers and carry no meaningful ranking of the categories.
# The fitted encoders are kept per column so the same mapping can be applied
# to new rows or inverted later.
encoders = {}
for feature in X.columns:
    if not pd.api.types.is_numeric_dtype(X[feature]):
        encoder = LabelEncoder()
        X[feature] = encoder.fit_transform(X[feature])
        encoders[feature] = encoder

X.head()

# Hold out a quarter of the rows for evaluation; the seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default, written out for clarity
    random_state=0,
)

# Report how many rows and columns ended up in each feature matrix.
for caption, part in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(caption, part.shape)

X_train shape: (150, 5)
X_test shape: (50, 5)

# Entropy-based tree limited to four levels. random_state is pinned because
# scikit-learn shuffles the candidate features at every split, so without a
# seed two fits on the same data are not guaranteed to produce the same tree.
classifier = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=0)

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=0)

# Predict the drug for every held-out row.
y_pred = classifier.predict(X_test)

# Per-class precision / recall / F1 and the averages, to four decimals.
report = classification_report(y_test, y_pred, digits=4)
print(report)

# Confusion matrix of true versus predicted drugs. The grid is switched off
# afterwards, presumably so the "whitegrid" seaborn style set at the top of
# the file does not draw lines across the matrix cells.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)
plt.grid(False)
plt.show()

              precision    recall  f1-score   support

       drugA     1.0000    1.0000    1.0000         5
       drugB     1.0000    1.0000    1.0000         1
       drugC     1.0000    1.0000    1.0000         3
       drugX     1.0000    1.0000    1.0000        16
       drugY     1.0000    1.0000    1.0000        25

    accuracy                         1.0000        50
   macro avg     1.0000    1.0000    1.0000        50
weighted avg     1.0000    1.0000    1.0000        50

# Render the fitted tree. A dedicated, wider figure keeps the node text of a
# depth-4 tree legible.
fig, ax = plt.subplots(figsize=(16, 8))
plot_tree(
    classifier,
    filled=True,
    # Plain lists instead of a pandas Index / NumPy array: some scikit-learn
    # releases only accept a list for these arguments.
    feature_names=list(X.columns),
    # Show the predicted drug in every node, not just the sample counts.
    class_names=[str(name) for name in classifier.classes_],
    ax=ax,
)
plt.show()

	Age	Sex	BP	Cholesterol	Na_to_K	Drug
0	23	F	HIGH	HIGH	25.355	drugY
1	47	M	LOW	HIGH	13.093	drugC
2	47	M	LOW	HIGH	10.114	drugC
3	28	F	NORMAL	HIGH	7.798	drugX
4	61	F	LOW	HIGH	18.043	drugY

	Age	Sex	BP	Cholesterol	Na_to_K
0	23	0	0	0	25.355
1	47	1	1	0	13.093
2	47	1	1	0	10.114
3	28	0	2	0	7.798
4	61	0	1	0	18.043

Medical Treatment¶

Import libraries¶

Load the dataset¶

Understand the dataset¶

Visualize the dataset¶

Visualize the class distribution¶

Preprocess the dataset¶

Split the dataset into training and test sets¶

Train a Decision Tree classifier¶

Evaluate the model¶

Visualize the Decision Tree classifier¶