Compressive Strength of Concrete¶
Objective: Train a basic Neural Network (NN) for regression to predict the compressive strength of concrete.
Import libraries¶
In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras import Sequential, Input, layers, callbacks
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
Load the dataset¶
In [2]:
file_url = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv'
df = pd.read_csv(file_url)
df.head()
Out[2]:
|   | Cement | Blast Furnace Slag | Fly Ash | Water | Superplasticizer | Coarse Aggregate | Fine Aggregate | Age | Strength |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 540.0 | 0.0 | 0.0 | 162.0 | 2.5 | 1040.0 | 676.0 | 28 | 79.99 |
| 1 | 540.0 | 0.0 | 0.0 | 162.0 | 2.5 | 1055.0 | 676.0 | 28 | 61.89 |
| 2 | 332.5 | 142.5 | 0.0 | 228.0 | 0.0 | 932.0 | 594.0 | 270 | 40.27 |
| 3 | 332.5 | 142.5 | 0.0 | 228.0 | 0.0 | 932.0 | 594.0 | 365 | 41.05 |
| 4 | 198.6 | 132.4 | 0.0 | 192.0 | 0.0 | 978.4 | 825.5 | 360 | 44.30 |
Understand the dataset¶
The dataset records the compressive strength of different concrete samples in terms of the amounts of the ingredients used to make them (each measured in kg per m³ of mixture) and the age of the sample in days. The ingredients are:
- Cement
- Blast Furnace Slag
- Fly Ash
- Water
- Superplasticizer
- Coarse Aggregate
- Fine Aggregate
So the first concrete sample contains 540 kg of cement, no blast furnace slag, no fly ash, 162 kg of water, 2.5 kg of superplasticizer, 1040 kg of coarse aggregate, and 676 kg of fine aggregate per m³ of mixture. At 28 days of age, this mix has a compressive strength of 79.99 MPa.
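A quick summary confirms these ranges (an added check, not part of the original run):

# Per-column count, mean, std, min, quartiles, and max
df.describe().round(2)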
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1030 non-null   float64
 1   Blast Furnace Slag  1030 non-null   float64
 2   Fly Ash             1030 non-null   float64
 3   Water               1030 non-null   float64
 4   Superplasticizer    1030 non-null   float64
 5   Coarse Aggregate    1030 non-null   float64
 6   Fine Aggregate      1030 non-null   float64
 7   Age                 1030 non-null   int64  
 8   Strength            1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB
Visualize the dataset¶
In [4]:
number_features = len(df.columns) - 1  # all columns except the target "Strength"
grid_rows = int(np.ceil(number_features / 3))
fig, axs = plt.subplots(grid_rows, 3, figsize=(15, 5 * grid_rows))
for ax, feature in zip(axs.flatten(), df.columns[:-1]):
    if df[feature].nunique() <= 10:
        # Few distinct values: treat the feature as categorical and show counts
        labels, sizes = np.unique(df[feature], return_counts=True)
        sns.barplot(x=labels, y=sizes, hue=labels, ax=ax, palette="tab10", legend=False)
    else:
        # Continuous feature: show its distribution
        sns.histplot(data=df, x=feature, ax=ax)
    ax.set_xlabel("")
    ax.set_title(feature)
for ax in axs.flatten()[number_features:]:
    ax.axis("off")  # hide any unused subplots in the grid
plt.tight_layout()
plt.show()
Visualize the target feature¶
In [5]:
plt.figure()
sns.histplot(data=df, x="Strength", kde=True)  # histogram with a KDE overlay
plt.title("Distribution of Strength")
plt.show()
Preprocess the dataset¶
In [6]:
X = df.drop("Strength", axis=1)  # features
y = df["Strength"]               # target
scaler = StandardScaler()
# Standardize each feature to zero mean and unit variance.
# (Fitting on the full dataset is convenient here; a stricter workflow would
# fit the scaler on the training split only to avoid leakage.)
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=df.columns[:-1])
X.head()
Out[6]:
|   | Cement | Blast Furnace Slag | Fly Ash | Water | Superplasticizer | Coarse Aggregate | Fine Aggregate | Age |
|---|---|---|---|---|---|---|---|---|
| 0 | 2.477915 | -0.856888 | -0.847144 | -0.916764 | -0.620448 | 0.863154 | -1.217670 | -0.279733 |
| 1 | 2.477915 | -0.856888 | -0.847144 | -0.916764 | -0.620448 | 1.056164 | -1.217670 | -0.279733 |
| 2 | 0.491425 | 0.795526 | -0.847144 | 2.175461 | -1.039143 | -0.526517 | -2.240917 | 3.553066 |
| 3 | 0.491425 | 0.795526 | -0.847144 | 2.175461 | -1.039143 | -0.526517 | -2.240917 | 5.057677 |
| 4 | -0.790459 | 0.678408 | -0.847144 | 0.488793 | -1.039143 | 0.070527 | 0.647884 | 4.978487 |
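As a sanity check (an added sketch, not in the original notebook), every standardized column should now have a mean of roughly 0 and a standard deviation of roughly 1:

# Means should be ~0 and standard deviations ~1 after StandardScaler
print(X.mean().round(3))
print(X.std().round(3))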
Split the dataset into train, validation and test subsets¶
Taking 80% of the samples for training and splitting the remaining 20% in half gives an 80/10/10 train/validation/test split.
In [7]:
# First carve off 80% for training, then split the remaining 20% evenly
# into validation and test sets.
X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=0.8, random_state=0)
X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, train_size=0.5, random_state=0)
print("X_train shape:", X_train.shape)
print("X_validation shape:", X_validation.shape)
print("X_test shape:", X_test.shape)
X_train shape: (824, 8)
X_validation shape: (103, 8)
X_test shape: (103, 8)
Define a custom callback¶
Since fit() will be called with verbose=0, Keras prints nothing during training; this callback restores a compact log by reporting the elapsed time and metrics for a chosen subset of epochs only.
In [8]:
class CustomVerbose(callbacks.Callback):
    """Print elapsed time and metrics only for the epochs in epochs_to_show."""

    def __init__(self, epochs_to_show):
        super().__init__()
        self.epochs_to_show = epochs_to_show

    def on_epoch_begin(self, epoch, logs=None):
        if epoch in self.epochs_to_show:
            self.epoch_start_time = datetime.now()

    def on_epoch_end(self, epoch, logs=None):
        if epoch in self.epochs_to_show:
            elapsed = (datetime.now() - self.epoch_start_time).total_seconds()
            print(f"Epoch {epoch + 1}/{self.epochs_to_show[-1] + 1}")
            print(f"\telapsed time: {elapsed:.3f}s"
                  f" - r2_score: {logs['r2_score']:.4f} - loss: {logs['loss']:.4f}"
                  f" - val_r2_score: {logs['val_r2_score']:.4f} - val_loss: {logs['val_loss']:.4f}")
Build a NN¶
In [9]:
model = Sequential()
model.add(Input(shape=(X.shape[1],)))           # 8 standardized input features
model.add(layers.Dense(50, activation="relu"))  # hidden layer 1
model.add(layers.Dense(50, activation="relu"))  # hidden layer 2
model.add(layers.Dense(1))                      # single linear output for regression
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ dense (Dense)                   │ (None, 50)             │           450 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 50)             │         2,550 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 1)              │            51 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 3,051 (11.92 KB)
Trainable params: 3,051 (11.92 KB)
Non-trainable params: 0 (0.00 B)
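These counts follow from (inputs + 1) × units for each Dense layer (a weight per input plus one bias per unit): (8 + 1) × 50 = 450, (50 + 1) × 50 = 2,550, and (50 + 1) × 1 = 51, giving 3,051 parameters in total.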
Compile and train the NN¶
In [10]:
# MSE loss for regression; track R² as a more readable metric
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["r2_score"])
epochs = 500
patience = int(epochs / 10)  # stop after 50 epochs without val_loss improvement
# Log epoch 1 and then every `patience` epochs (1, 50, 100, ...)
epochs_to_show = [0] + [i for i in range(patience - 1, epochs, patience)]
custom_verbose = CustomVerbose(epochs_to_show)
early_stopping = callbacks.EarlyStopping(monitor="val_loss", patience=patience, verbose=1)
history = model.fit(x=X_train, y=y_train, epochs=epochs, verbose=0,
                    validation_data=(X_validation, y_validation),
                    callbacks=[custom_verbose, early_stopping])
Epoch 1/500
	elapsed time: 3.216s - r2_score: -4.3962 - loss: 1523.1836 - val_r2_score: -4.3241 - val_loss: 1317.8010
Epoch 50/500
	elapsed time: 0.136s - r2_score: 0.6725 - loss: 92.4338 - val_r2_score: 0.6299 - val_loss: 91.6155
Epoch 100/500
	elapsed time: 0.109s - r2_score: 0.8769 - loss: 34.7378 - val_r2_score: 0.8670 - val_loss: 32.9278
Epoch 150/500
	elapsed time: 0.101s - r2_score: 0.9078 - loss: 26.0273 - val_r2_score: 0.8877 - val_loss: 27.7951
Epoch 200/500
	elapsed time: 0.109s - r2_score: 0.9250 - loss: 21.1755 - val_r2_score: 0.8969 - val_loss: 25.5151
Epoch 250/500
	elapsed time: 0.119s - r2_score: 0.9388 - loss: 17.2768 - val_r2_score: 0.9062 - val_loss: 23.2164
Epoch 300/500
	elapsed time: 0.105s - r2_score: 0.9466 - loss: 15.0770 - val_r2_score: 0.9072 - val_loss: 22.9700
Epoch 350/500
	elapsed time: 0.111s - r2_score: 0.9503 - loss: 14.0427 - val_r2_score: 0.9062 - val_loss: 23.2286
Epoch 354: early stopping
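Note that the two logged quantities track each other by construction: on the same data, R² = 1 − MSE / Var(y), so the R² metric is just the MSE loss rescaled by the variance of the target.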
In [11]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.plot(history.history['r2_score'])
ax1.plot(history.history['val_r2_score'])
ax1.set_xlabel("Epochs")
ax1.set_ylabel("Score")
ax1.legend(["Training", "Validation"])
ax1.set_ylim(0, 1)
ax2.plot(history.history['loss'])
ax2.plot(history.history['val_loss'])
ax2.set_xlabel("Epochs")
ax2.set_ylabel("Loss")
ax2.legend(["Training", "Validation"])
ax2.set_ylim(0, 200)
plt.show()
Evaluate the NN¶
In [12]:
y_pred = model.predict(X_test, verbose=0)
print(f"MSE = {mean_squared_error(y_test, y_pred):.2f}")
print(f"R² = {r2_score(y_test, y_pred):.2f}")
MSE = 31.25
R² = 0.89
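As a final visual check (an added sketch, not part of the original notebook), plotting predicted against actual strengths shows how tightly the test points hug the ideal y = x line:

# Hypothetical diagnostic plot: predictions vs. ground truth on the test set
plt.figure(figsize=(5, 5))
plt.scatter(y_test, y_pred.ravel(), alpha=0.6)
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, "k--", label="perfect prediction")  # y = x reference line
plt.xlabel("Actual Strength (MPa)")
plt.ylabel("Predicted Strength (MPa)")
plt.legend()
plt.show()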