Emojify¶
Objective: Build a Long Short-Term Memory (LSTM) model that takes GloVe word embeddings as input and predicts the most appropriate emoji for a given sentence.
Import libraries¶
# !pip install emoji
import numpy as np
from datetime import datetime
from keras import layers, Model, callbacks, optimizers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import pandas as pd
import emoji
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
Download GloVe pre-trained word vectors¶
Global Vectors for Word Representation (GloVe) is a type of word embedding that encodes the co-occurrence probability ratio between two words as vector differences. It’s an unsupervised learning algorithm for obtaining vector representations for words, mapping them into a meaningful space where the distance between words is related to semantic similarity.
%%bash
if [ -e "/tmp/glove.6B.50d.txt.gz" ]; then
echo "glove.6B.50d.txt.gz already exists!"
else
gdown 1UBn4ibWdLH4wNRbCTodim7IFHGnNrDvt -O /tmp/
fi
gunzip -kf /tmp/glove.6B.50d.txt.gz
glove.6B.50d.txt.gz already exists!
Load the dataset¶
url_train = 'https://raw.githubusercontent.com/LuisAngelMendozaVelasco/Deep_Learning_Specialization/main/Sequence_Models/Week2/Labs/data/train_emoji.csv'
url_test = 'https://raw.githubusercontent.com/LuisAngelMendozaVelasco/Deep_Learning_Specialization/main/Sequence_Models/Week2/Labs/data/tesss.csv'
data_train = pd.read_csv(url_train, header=None)
data_test = pd.read_csv(url_test, header=None)
X_train, y_train = data_train[0].to_numpy(), data_train[1].to_numpy()
X_test, X_validation, y_test, y_validation = train_test_split(data_test[0].to_numpy(), data_test[1].to_numpy(), test_size=0.5, random_state=0)
print("Number of training samples:", len(X_train))
print("Number of validation samples:", len(X_validation))
print("Number of test samples:", len(X_test))
Number of training samples: 132
Number of validation samples: 28
Number of test samples: 28
Visualize the dataset¶
def label_to_emoji(label):
"""
Converts a label (int or string) into the corresponding emoji code (string) ready to be printed
"""
emoji_dictionary = {"0": ":heart:",
"1": ":baseball:",
"2": ":smile:",
"3": ":disappointed:",
"4": ":fork_and_knife:"}
return emoji.emojize(emoji_dictionary[str(label)], language='alias')
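A quick check of the helper (a minimal sketch; the labels are chosen arbitrarily for illustration):
# Print the emoji for a couple of arbitrary labels.
print(label_to_emoji(0), label_to_emoji(4))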
max_len = max(len(x.split()) for x in X_train)
min_len = min(len(x.split()) for x in X_train)
mean_len = sum(len(x.split()) for x in X_train) / len(X_train)
print(f'Maximum number of words in a sentence: {max_len}')
print(f'Minimum number of words in a sentence: {min_len}')
print(f'Mean number of words in a sentence: {mean_len:.2f}\n')
for index in range(10):
print(X_train[index], label_to_emoji(y_train[index]))
Maximum number of words in a sentence: 10
Minimum number of words in a sentence: 1
Mean number of words in a sentence: 4.73

never talk to me again 😞
I am proud of your achievements 😄
It is the worst day in my life 😞
Miss you so much ❤️
food is life 🍴
I love you mum ❤️
Stop saying bullshit 😞
congratulations on your acceptance 😄
The assignment is too long 😞
I want to go play ⚾
Visualize the class distribution¶
labels, sizes = np.unique(y_train, return_counts=True)
fig, ax = plt.subplots()
ax.pie(sizes, textprops={'color': "w", 'fontsize': '12'}, autopct=lambda pct: "{:.2f}%\n({:d})".format(pct, round(pct/100 * sum(sizes))))
ax.legend(labels)
ax.set_title("Class")
plt.show()
Load the word vectors¶
The model will use the pre-trained 50-dimensional GloVe embeddings to load the vector representations of words.
def read_glove_vecs(glove_file):
with open(glove_file, 'r', encoding='utf-8') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
curr_word = line[0]
words.add(curr_word)
word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
i = 1
words_to_index = {}
index_to_words = {}
for w in sorted(words):
words_to_index[w] = i
index_to_words[i] = w
i = i + 1
return words_to_index, index_to_words, word_to_vec_map
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('/tmp/glove.6B.50d.txt')
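As a quick sanity check that the embeddings behave as described above (a minimal sketch; the word pairs are arbitrary and cosine_similarity is a small helper defined here only for illustration), related words should score higher than unrelated ones:
def cosine_similarity(u, v):
    # Cosine of the angle between two word vectors: closer to 1 for semantically similar words.
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# Related pair vs. unrelated pair (words chosen arbitrarily for illustration).
print("father ~ mother:", cosine_similarity(word_to_vec_map["father"], word_to_vec_map["mother"]))
print("father ~ ball:", cosine_similarity(word_to_vec_map["father"], word_to_vec_map["ball"]))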
Convert sentences into a list of indices¶
Most deep learning frameworks require that all sequences in the same batch have the same length. The common solution to handling sequences of different lengths is padding.
Specifically:
- Set a maximum sequence length.
- Pad all sequences to have the same length.
def sentences_to_indices(X, word_to_index, max_len):
"""
Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
The output shape should be such that it can be given to 'keras.layers.Embedding()'.
Arguments:
X -- Array of sentences (strings), of shape (m, 1)
word_to_index -- A dictionary mapping each word to its index
max_len -- Maximum number of words in a sentence. We can assume every sentence in X is no longer than this.
Returns:
X_indices -- Array of indices corresponding to words in the sentences from X, of shape (m, max_len)
"""
m = X.shape[0] # Number of training examples
# Initialize X_indices as a numpy matrix of zeros and the correct shape
X_indices = np.zeros((m, max_len))
for i in range(m): # Loop over training examples
# Convert the i-th training sentence in lower case and split it into words
sentence_words = [w.lower() for w in X[i].split()]
# Initialize j to 0
j = 0
# Loop over the words of sentence_words
for w in sentence_words:
# If w exists in the dictionary
if w in word_to_index:
# Set the (i, j)-th entry of X_indices to the index of the correct word
X_indices[i, j] = word_to_index[w]
# Increment j to j + 1
j = j + 1
return X_indices
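A quick check of the function (a minimal sketch; the example sentences are arbitrary) shows that words are replaced by their vocabulary indices and shorter sentences are right-padded with zeros up to max_len:
# Example sentences chosen arbitrarily for illustration.
X_example = np.array(["food is life", "I love you mum"])
print(sentences_to_indices(X_example, word_to_index, max_len))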
Load a pretrained embedding layer¶
In Keras, the embedding matrix is represented as a "layer".
- The embedding matrix maps word indices to embedding vectors.
- The word indices are positive integers.
- The embedding vectors are dense vectors of fixed size.
- A "dense" vector is the opposite of a sparse vector. It means that most of its values are non-zero.
- The embedding matrix can be derived in two ways:
- Training a model to derive the embeddings from scratch.
- Using a pre-trained embedding.
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
"""
Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
Arguments:
word_to_vec_map -- Dictionary mapping words to their GloVe vector representation
word_to_index -- Dictionary mapping from words to their indices in the vocabulary (400,000 words)
Returns:
embedding_layer -- Pretrained layer Keras instance
"""
vocab_size = len(word_to_index) + 1 # Add 1 because word indices start at 1 (Keras Embedding requires input_dim > largest index)
any_word = list(word_to_vec_map.keys())[0]
emb_dim = word_to_vec_map[any_word].shape[0] # Define dimensionality of your GloVe word vectors (= 50)
# Step 1
# Initialize the embedding matrix as a numpy array of zeros.
embedding_matrix = np.zeros((vocab_size, emb_dim))
# Step 2
# Set each row "idx" of the embedding matrix to be
# the word vector representation of the idx'th word of the vocabulary.
for word, idx in word_to_index.items():
embedding_matrix[idx, :] = word_to_vec_map[word]
# Step 3
# Define Keras embedding layer with the correct input and output sizes.
# Make it non-trainable.
embedding_layer = layers.Embedding(vocab_size, emb_dim, trainable=False)
# Step 4
# Build the embedding layer; this is required before setting its weights.
embedding_layer.build((None,))
# Set the weights of the embedding layer to the embedding matrix. The layer is now pretrained.
embedding_layer.set_weights([embedding_matrix])
return embedding_layer
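As a sanity check (a minimal sketch; embedding_layer_check is a throwaway instance built only for this test and the word is arbitrary), a row of the layer's weight matrix should match the corresponding GloVe vector, up to float32 precision:
# Build a throwaway layer just for this check.
embedding_layer_check = pretrained_embedding_layer(word_to_vec_map, word_to_index)
idx = word_to_index["cucumber"]
# The idx-th row of the weight matrix should equal the GloVe vector of "cucumber".
print(np.allclose(embedding_layer_check.get_weights()[0][idx], word_to_vec_map["cucumber"], atol=1e-4))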
Build a Long Short-Term Memory network¶
# Define sentence_indices as the input.
sentence_indices = layers.Input((max_len,), dtype='int32')
# Create the embedding layer pretrained with GloVe Vectors.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
# Propagate sentence_indices through the embedding layer.
embeddings = embedding_layer(sentence_indices)
# Propagate the embeddings through an LSTM layer with 128-dimensional hidden state.
# The returned output should be a batch of sequences.
x = layers.LSTM(128, return_sequences=True)(embeddings)
# Add dropout with a probability of 0.5
x = layers.Dropout(0.5)(x)
# Propagate x through another LSTM layer with 128-dimensional hidden state.
# The returned output should be a single hidden state, not a batch of sequences.
x = layers.LSTM(128)(x)
# Add dropout with a probability of 0.5
x = layers.Dropout(0.5)(x)
# Propagate x through a Dense layer with 5 units.
x = layers.Dense(5)(x)
# Add a softmax activation.
outputs = layers.Activation('softmax')(x)
# Create Model instance which converts sentence_indices into outputs.
model = Model(inputs=sentence_indices, outputs=outputs)
model.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 10)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Embedding)           │ (None, 10, 50)         │    20,000,050 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm (LSTM)                     │ (None, 10, 128)        │        91,648 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 10, 128)        │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm_1 (LSTM)                   │ (None, 128)            │       131,584 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_1 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 5)              │           645 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ activation (Activation)         │ (None, 5)              │             0 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 20,223,927 (77.15 MB)
Trainable params: 223,877 (874.52 KB)
Non-trainable params: 20,000,050 (76.29 MB)
Create a custom callback¶
class CustomVerbose(callbacks.Callback):
def __init__(self, epochs_to_show):
self.epochs_to_show = epochs_to_show
def on_epoch_begin(self, epoch, logs=None):
if epoch in self.epochs_to_show:
self.epoch_start_time = datetime.now()
def on_epoch_end(self, epoch, logs=None):
if epoch in self.epochs_to_show:
self.epoch_stop_time = datetime.now()
print(f"Epoch {epoch+1}/{self.epochs_to_show[-1] + 1}")
print(f"\telapsed time: {(self.epoch_stop_time - self.epoch_start_time).total_seconds():.3f}s - accuracy: {logs['categorical_accuracy']:.4f} - loss: {logs['loss']:.4f} - val_accuracy: {logs['val_categorical_accuracy']:.4f} - val_loss: {logs['val_loss']:.4f}")
Compile and train the model¶
model.compile(optimizer=optimizers.Adam(1e-4), loss='categorical_crossentropy', metrics=['categorical_accuracy'])
epochs = 200
epochs_to_show = [0] + [i for i in range(int(epochs/10)-1, epochs, int(epochs/10))]
custom_verbose = CustomVerbose(epochs_to_show)
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=int(epochs/10), verbose=1)
X_train_, y_train_ = sentences_to_indices(X_train, word_to_index, max_len), tf.one_hot(y_train, depth=5)
X_validation_, y_validation_ = sentences_to_indices(X_validation, word_to_index, max_len), tf.one_hot(y_validation, depth=5)
history = model.fit(X_train_, y_train_, epochs=epochs, batch_size=32, verbose=0, validation_data=(X_validation_, y_validation_), callbacks=[custom_verbose, early_stopping])
Epoch 1/200
	elapsed time: 4.610s - accuracy: 0.2500 - loss: 1.6047 - val_accuracy: 0.2857 - val_loss: 1.5916
Epoch 20/200
	elapsed time: 0.125s - accuracy: 0.3712 - loss: 1.4485 - val_accuracy: 0.3571 - val_loss: 1.4700
Epoch 40/200
	elapsed time: 0.123s - accuracy: 0.6515 - loss: 1.1439 - val_accuracy: 0.6071 - val_loss: 1.1675
Epoch 60/200
	elapsed time: 0.123s - accuracy: 0.7273 - loss: 0.7253 - val_accuracy: 0.7143 - val_loss: 0.8038
Epoch 80/200
	elapsed time: 0.133s - accuracy: 0.8333 - loss: 0.4601 - val_accuracy: 0.7500 - val_loss: 0.5313
Epoch 100/200
	elapsed time: 0.131s - accuracy: 0.9167 - loss: 0.2750 - val_accuracy: 0.8214 - val_loss: 0.4175
Epoch 120/200
	elapsed time: 0.125s - accuracy: 0.9242 - loss: 0.1916 - val_accuracy: 0.8214 - val_loss: 0.4444
Epoch 140/200
	elapsed time: 0.134s - accuracy: 0.9545 - loss: 0.1786 - val_accuracy: 0.8929 - val_loss: 0.4704
Epoch 154: early stopping
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.plot(history.history['categorical_accuracy'])
ax1.plot(history.history['val_categorical_accuracy'])
ax1.set_xlabel("Epochs")
ax1.set_ylabel("Accuracy")
ax1.legend(["Training", "Validation"])
ax2.plot(history.history['loss'])
ax2.plot(history.history['val_loss'])
ax2.set_xlabel("Epochs")
ax2.set_ylabel("Loss")
ax2.legend(["Training", "Validation"])
plt.show()
Evaluate the model¶
X_test_ = sentences_to_indices(X_test, word_to_index, max_len)
y_test_ = tf.one_hot(y_test, depth=5)
prediction_proba = model.predict(X_test_, verbose=0)
y_pred = np.argmax(prediction_proba, axis=1)
print(classification_report(y_test, y_pred, digits=4))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.grid(False)
plt.show()
              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667         4
           2     0.8889    0.8000    0.8421        10
           3     0.8571    0.6667    0.7500         9
           4     1.0000    0.8000    0.8889         5

    accuracy                         0.7857        28
   macro avg     0.8115    0.8167    0.7869        28
weighted avg     0.8430    0.7857    0.7958        28
Show the mislabelled examples:
print({i:label_to_emoji(i) for i in range(5)})
prediction = model.predict(X_test_, verbose=0)
for i in range(len(X_test)):
num = np.argmax(prediction[i])
if(num != y_test[i]):
print("Sentence: " + X_test[i] + " -> Expected emoji: " + label_to_emoji(y_test[i]) + ", Prediction: " + label_to_emoji(num))
{0: '❤️', 1: '⚾', 2: '😄', 3: '😞', 4: '🍴'}
Sentence: work is horrible -> Expected emoji: 😞, Prediction: 😄
Sentence: I did not have breakfast -> Expected emoji: 🍴, Prediction: 😞
Sentence: This girl is messing with me -> Expected emoji: 😞, Prediction: ❤️
Sentence: she is a bully -> Expected emoji: 😞, Prediction: ❤️
Sentence: you brighten my day -> Expected emoji: 😄, Prediction: ❤️
Sentence: she got me a nice present -> Expected emoji: 😄, Prediction: ❤️
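Finally, the trained model can be tried on a new sentence (a minimal sketch; the example sentence is arbitrary and the predicted emoji will depend on the trained weights):
# Arbitrary example sentence; any sentence of at most max_len known words works.
x_custom = np.array(["I want to eat pizza tonight"])
x_custom_indices = sentences_to_indices(x_custom, word_to_index, max_len)
pred_label = np.argmax(model.predict(x_custom_indices, verbose=0), axis=1)[0]
print(x_custom[0], label_to_emoji(pred_label))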