Weather Forecast¶
Objective: Train several classification models to predict whether it will rain tomorrow (`RainTomorrow`) from daily Australian weather observations.
Import libraries¶
In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
Load the dataset¶
In [2]:
file_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv"
df = pd.read_csv(file_url)
df.head()
Out[2]:
| | Date | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 2/1/2008 | 19.5 | 22.4 | 15.6 | 6.2 | 0.0 | W | 41 | S | SSW | ... | 92 | 84 | 1017.6 | 1017.4 | 8 | 8 | 20.7 | 20.9 | Yes | Yes |
| 1 | 2/2/2008 | 19.5 | 25.6 | 6.0 | 3.4 | 2.7 | W | 41 | W | E | ... | 83 | 73 | 1017.9 | 1016.4 | 7 | 7 | 22.4 | 24.8 | Yes | Yes |
| 2 | 2/3/2008 | 21.6 | 24.5 | 6.6 | 2.4 | 0.1 | W | 41 | ESE | ESE | ... | 88 | 86 | 1016.7 | 1015.6 | 7 | 8 | 23.5 | 23.0 | Yes | Yes |
| 3 | 2/4/2008 | 20.2 | 22.8 | 18.8 | 2.2 | 0.0 | W | 41 | NNE | E | ... | 83 | 90 | 1014.2 | 1011.8 | 8 | 8 | 21.4 | 20.9 | Yes | Yes |
| 4 | 2/5/2008 | 19.7 | 25.7 | 77.4 | 4.8 | 0.0 | W | 41 | NNE | W | ... | 88 | 74 | 1008.3 | 1004.8 | 8 | 8 | 22.5 | 25.5 | Yes | Yes |
5 rows × 22 columns
Understand the dataset¶
The dataset comes from the Australian Government's Bureau of Meteorology and contains daily weather observations from 2008 to 2017. It includes the following fields (types as loaded from this CSV; see `df.info()` below):
| Field | Description | Unit | Type |
| --- | --- | --- | --- |
| Date | Date of the observation (M/D/YYYY in this file) | Date | object |
| Location | Location of the observation (not present in this CSV) | Location | object |
| MinTemp | Minimum temperature | Celsius | float |
| MaxTemp | Maximum temperature | Celsius | float |
| Rainfall | Amount of rainfall | Millimeters | float |
| Evaporation | Amount of evaporation | Millimeters | float |
| Sunshine | Amount of bright sunshine | Hours | float |
| WindGustDir | Direction of the strongest gust | Compass points | object |
| WindGustSpeed | Speed of the strongest gust | Kilometers/hour | int |
| WindDir9am | Wind direction averaged over the 10 minutes prior to 9am | Compass points | object |
| WindDir3pm | Wind direction averaged over the 10 minutes prior to 3pm | Compass points | object |
| WindSpeed9am | Wind speed averaged over the 10 minutes prior to 9am | Kilometers/hour | int |
| WindSpeed3pm | Wind speed averaged over the 10 minutes prior to 3pm | Kilometers/hour | int |
| Humidity9am | Humidity at 9am | Percent | int |
| Humidity3pm | Humidity at 3pm | Percent | int |
| Pressure9am | Atmospheric pressure reduced to mean sea level at 9am | Hectopascals | float |
| Pressure3pm | Atmospheric pressure reduced to mean sea level at 3pm | Hectopascals | float |
| Cloud9am | Fraction of the sky obscured by cloud at 9am | Eighths | int |
| Cloud3pm | Fraction of the sky obscured by cloud at 3pm | Eighths | int |
| Temp9am | Temperature at 9am | Celsius | float |
| Temp3pm | Temperature at 3pm | Celsius | float |
| RainToday | Whether it rained today | Yes/No | object |
| RainTomorrow | Whether it rains tomorrow (the prediction target) | Yes/No | object |
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3271 entries, 0 to 3270
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           3271 non-null   object 
 1   MinTemp        3271 non-null   float64
 2   MaxTemp        3271 non-null   float64
 3   Rainfall       3271 non-null   float64
 4   Evaporation    3271 non-null   float64
 5   Sunshine       3271 non-null   float64
 6   WindGustDir    3271 non-null   object 
 7   WindGustSpeed  3271 non-null   int64  
 8   WindDir9am     3271 non-null   object 
 9   WindDir3pm     3271 non-null   object 
 10  WindSpeed9am   3271 non-null   int64  
 11  WindSpeed3pm   3271 non-null   int64  
 12  Humidity9am    3271 non-null   int64  
 13  Humidity3pm    3271 non-null   int64  
 14  Pressure9am    3271 non-null   float64
 15  Pressure3pm    3271 non-null   float64
 16  Cloud9am       3271 non-null   int64  
 17  Cloud3pm       3271 non-null   int64  
 18  Temp9am        3271 non-null   float64
 19  Temp3pm        3271 non-null   float64
 20  RainToday      3271 non-null   object 
 21  RainTomorrow   3271 non-null   object 
dtypes: float64(9), int64(7), object(6)
memory usage: 562.3+ KB
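There are no missing values in any column, and `Date` is stored as an `object` in M/D/YYYY form (see `df.head()` above). `Date` is simply dropped before modeling below, but if calendar features such as month or season were wanted, a minimal sketch, assuming the M/D/YYYY format holds for every row (hypothetical, not run here):

# Hypothetical: parse Date for calendar features; the cells below just drop it.
dates = pd.to_datetime(df["Date"], format="%m/%d/%Y")
print(dates.dt.month.value_counts().sort_index())  # observations per month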
Visualize the class distribution¶
In [4]:
# Count how many days fall in each RainTomorrow class
labels, sizes = np.unique(df["RainTomorrow"], return_counts=True)

fig, ax = plt.subplots()
# Label each slice with its percentage and absolute count
ax.pie(
    sizes,
    textprops={"color": "w", "fontsize": 12},
    autopct=lambda pct: "{:.2f}%\n({:d})".format(pct, round(pct / 100 * sum(sizes))),
)
ax.legend(labels)
ax.set_title("RainTomorrow")
plt.show()
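The chart shows a clear class imbalance: dry days outnumber rainy days by roughly three to one (the test split below has 599 "No" against 219 "Yes"). A quick way to get the exact proportions:

# Exact class proportions of the target
print(df["RainTomorrow"].value_counts(normalize=True))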
Preprocess the dataset¶
In [5]:
# One-hot encode the categorical columns (Yes/No flag and compass directions)
df = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

# Encode the Yes/No target as 1/0
df["RainTomorrow"] = df["RainTomorrow"].map({'No': 0, 'Yes': 1})
df.head()
Out[5]:
| | Date | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | ... | WindDir3pm_NNW | WindDir3pm_NW | WindDir3pm_S | WindDir3pm_SE | WindDir3pm_SSE | WindDir3pm_SSW | WindDir3pm_SW | WindDir3pm_W | WindDir3pm_WNW | WindDir3pm_WSW |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 2/1/2008 | 19.5 | 22.4 | 15.6 | 6.2 | 0.0 | 41 | 17 | 20 | 92 | ... | False | False | False | False | False | True | False | False | False | False |
| 1 | 2/2/2008 | 19.5 | 25.6 | 6.0 | 3.4 | 2.7 | 41 | 9 | 13 | 83 | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | 2/3/2008 | 21.6 | 24.5 | 6.6 | 2.4 | 0.1 | 41 | 17 | 2 | 88 | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | 2/4/2008 | 20.2 | 22.8 | 18.8 | 2.2 | 0.0 | 41 | 22 | 20 | 83 | ... | False | False | False | False | False | False | False | False | False | False |
| 4 | 2/5/2008 | 19.7 | 25.7 | 77.4 | 4.8 | 0.0 | 41 | 11 | 6 | 88 | ... | False | False | False | False | False | False | False | True | False | False |
5 rows × 68 columns
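`pd.get_dummies` is fine for a fixed dataset like this one, but in a pipeline that must score unseen data, a category missing from the training set would change the column layout. A sketch of the scikit-learn alternative, assuming it is applied to the original (pre-dummies) frame (hypothetical, not run here):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Encode categoricals inside scikit-learn; unknown categories at predict
# time are ignored instead of breaking the column layout.
categorical = ["RainToday", "WindGustDir", "WindDir9am", "WindDir3pm"]
encoder = ColumnTransformer(
    [("onehot", OneHotEncoder(handle_unknown="ignore"), categorical)],
    remainder="passthrough",
)
# encoder would then be chained with a classifier in a Pipeline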
Split the dataset into train and test subsets¶
In [6]:
# Features: everything except the raw date string and the target
X = df.drop(["Date", "RainTomorrow"], axis=1)
y = df["RainTomorrow"]

# Default split: 75% train, 25% test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
X_train shape: (2453, 66)
X_test shape: (818, 66)
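`train_test_split` shuffles randomly, so with an imbalanced target the class ratio can drift slightly between train and test. Passing `stratify=y` keeps the Yes/No proportions identical in both splits; a variant of the call above (the results below were produced without it):

# Variant: preserve the class ratio in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)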
Train a k-Nearest Neighbors classifier¶
In [7]:
# k-NN with the default n_neighbors=5
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)
Out[7]:
KNeighborsClassifier()
Evaluate the model¶
In [8]:
y_pred = classifier.predict(X_test)

# Per-class precision/recall/F1 and the confusion matrix
print(classification_report(y_test, y_pred, digits=4))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=["0 (no)", "1 (yes)"])
plt.show()
              precision    recall  f1-score   support

           0     0.8479    0.9215    0.8832       599
           1     0.7186    0.5479    0.6218       219

    accuracy                         0.8215       818
   macro avg     0.7832    0.7347    0.7525       818
weighted avg     0.8133    0.8215    0.8132       818
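k-NN classifies by Euclidean distance on the raw features, so columns with large numeric ranges (pressure around 1000 hPa, humidity up to 100) dominate the small-range columns and the boolean dummies. Standardizing usually helps distance-based models; a minimal sketch (scores will differ from the table above):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale features to zero mean / unit variance before computing distances
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))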
Train a Decision Tree classifier¶
In [9]:
# Decision tree grown with default settings (no depth limit)
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
Out[9]:
DecisionTreeClassifier()
Evaluate the model¶
In [10]:
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=["0 (no)", "1 (yes)"])
plt.show()
              precision    recall  f1-score   support

           0     0.8311    0.8464    0.8387       599
           1     0.5577    0.5297    0.5433       219

    accuracy                         0.7616       818
   macro avg     0.6944    0.6880    0.6910       818
weighted avg     0.7579    0.7616    0.7596       818
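An unconstrained DecisionTreeClassifier grows until its leaves are pure, which tends to overfit, and scikit-learn permutes the features randomly at each split, so repeated runs can give slightly different scores. A sketch that caps the depth and fixes the seed (max_depth=5 is an arbitrary starting point; it would normally be tuned with cross-validation):

# Shallower tree: less variance, reproducible results
tree = DecisionTreeClassifier(max_depth=5, random_state=0)
tree.fit(X_train, y_train)
print(tree.score(X_test, y_test))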
Train a Logistic Regression classifier¶
In [11]:
# liblinear: a solver well suited to small datasets
classifier = LogisticRegression(solver="liblinear")
classifier.fit(X_train, y_train)
Out[11]:
LogisticRegression(solver='liblinear')
Evaluate the model¶
In [12]:
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=["0 (no)", "1 (yes)"])
plt.show()
              precision    recall  f1-score   support

           0     0.8605    0.9265    0.8923       599
           1     0.7457    0.5890    0.6582       219

    accuracy                         0.8362       818
   macro avg     0.8031    0.7578    0.7752       818
weighted avg     0.8297    0.8362    0.8296       818
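Since logistic regression is a linear model, its coefficients indicate how each feature pushes the prediction toward rain or no rain. A quick sketch for inspecting the largest ones, with the caveat that coefficient magnitudes are only comparable across features on a common scale, which was not applied here:

# Ten largest coefficients by absolute value
coefs = pd.Series(classifier.coef_[0], index=X.columns)
print(coefs.reindex(coefs.abs().sort_values(ascending=False).index).head(10))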
Train a Support Vector Classifier¶
In [13]:
# RBF-kernel SVM; gamma="auto" means gamma = 1 / n_features
classifier = SVC(gamma="auto")
classifier.fit(X_train, y_train)
Out[13]:
SVC(gamma='auto')
Evaluate the model¶
In [14]:
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=["0 (no)", "1 (yes)"])
plt.show()
              precision    recall  f1-score   support

           0     0.7547    0.9967    0.8590       599
           1     0.9259    0.1142    0.2033       219

    accuracy                         0.7604       818
   macro avg     0.8403    0.5554    0.5311       818
weighted avg     0.8006    0.7604    0.6834       818
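The SVC's near-perfect precision but 0.1142 recall on the rainy class means it almost always predicts "No". This is a typical symptom of an RBF kernel on unscaled, imbalanced data: the kernel distances are dominated by the large-range columns, just as with k-NN above. A sketch combining feature scaling with class reweighting (scores will differ from the table above; both settings would normally be tuned):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale the inputs and penalize mistakes on the minority class more heavily
svc = make_pipeline(StandardScaler(), SVC(gamma="auto", class_weight="balanced"))
svc.fit(X_train, y_train)
print(classification_report(y_test, svc.predict(X_test), digits=4))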