Emojify¶
Objective: Build a Long Short-Term Memory (LSTM) model that takes GloVe word embeddings as input and predicts the most appropriate emoji for a given sentence.
Import libraries¶
# !pip install emoji
import numpy as np
from datetime import datetime
from keras import layers, Model, callbacks, optimizers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import pandas as pd
import emoji
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
Download GloVe pre-trained word vectors¶
Global Vectors for Word Representation (GloVe) is a type of word embedding that encodes the co-occurrence probability ratio between two words as vector differences. It’s an unsupervised learning algorithm for obtaining vector representations for words, mapping them into a meaningful space where the distance between words is related to semantic similarity.
%%bash
if [ -e "/tmp/glove.6B.50d.txt.gz" ]; then
echo "glove.6B.50d.txt.gz already exists!"
else
gdown 1UBn4ibWdLH4wNRbCTodim7IFHGnNrDvt -O /tmp/
fi
gunzip -kf /tmp/glove.6B.50d.txt.gz
glove.6B.50d.txt.gz already exists!
Load the dataset¶
url_train = 'https://raw.githubusercontent.com/LuisAngelMendozaVelasco/Deep_Learning_Specialization/main/Sequence_Models/Week2/Labs/data/train_emoji.csv'
url_test = 'https://raw.githubusercontent.com/LuisAngelMendozaVelasco/Deep_Learning_Specialization/main/Sequence_Models/Week2/Labs/data/tesss.csv'
data_train = pd.read_csv(url_train, header=None)
data_test = pd.read_csv(url_test, header=None)
X_train, y_train = data_train[0].to_numpy(), data_train[1].to_numpy()
X_test, X_validation, y_test, y_validation = train_test_split(data_test[0].to_numpy(), data_test[1].to_numpy(), test_size=0.5, random_state=0)
print("Number of training samples:", len(X_train))
print("Number of validation samples:", len(X_validation))
print("Number of test samples:", len(X_test))
Number of training samples: 132
Number of validation samples: 28
Number of test samples: 28
Visualize the dataset¶
def label_to_emoji(label):
"""
Converts a label (int or string) into the corresponding emoji code (string) ready to be printed
"""
emoji_dictionary = {"0": ":heart:",
"1": ":baseball:",
"2": ":smile:",
"3": ":disappointed:",
"4": ":fork_and_knife:"}
return emoji.emojize(emoji_dictionary[str(label)], language='alias')
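A quick check of the helper (a minimal sketch; the labels are chosen arbitrarily for illustration):
# Print the emoji for a couple of arbitrary labels.
print(label_to_emoji(0), label_to_emoji(4))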
max_len = max(len(x.split()) for x in X_train)
min_len = min(len(x.split()) for x in X_train)
mean_len = sum(len(x.split()) for x in X_train) / len(X_train)
print(f'Maximum number of words in a sentence: {max_len}')
print(f'Minimum number of words in a sentence: {min_len}')
print(f'Mean number of words in a sentence: {mean_len:.2f}\n')
for index in range(10):
print(X_train[index], label_to_emoji(y_train[index]))
Maximum number of words in a sentence: 10
Minimum number of words in a sentence: 1
Mean number of words in a sentence: 4.73

never talk to me again 😞
I am proud of your achievements 😄
It is the worst day in my life 😞
Miss you so much ❤️
food is life 🍴
I love you mum ❤️
Stop saying bullshit 😞
congratulations on your acceptance 😄
The assignment is too long 😞
I want to go play ⚾
Visualize the class distribution¶
labels, sizes = np.unique(y_train, return_counts=True)
fig, ax = plt.subplots()
ax.pie(sizes, textprops={'color': "w", 'fontsize': '12'}, autopct=lambda pct: "{:.2f}%\n({:d})".format(pct, round(pct/100 * sum(sizes))))
ax.legend(labels)
ax.set_title("Class")
plt.show()
Load the word vectors¶
The model will use the pre-trained 50-dimensional GloVe embeddings to load the vector representations of words.
def read_glove_vecs(glove_file):
with open(glove_file, 'r', encoding='utf-8') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
curr_word = line[0]
words.add(curr_word)
word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
i = 1
words_to_index = {}
index_to_words = {}
for w in sorted(words):
words_to_index[w] = i
index_to_words[i] = w
i = i + 1
return words_to_index, index_to_words, word_to_vec_map
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('/tmp/glove.6B.50d.txt')
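As a quick sanity check that the embeddings behave as described above (a minimal sketch; the word pairs are arbitrary and cosine_similarity is a small helper defined here only for illustration), related words should score higher than unrelated ones:
def cosine_similarity(u, v):
    # Cosine of the angle between two word vectors: closer to 1 for semantically similar words.
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# Related pair vs. unrelated pair (words chosen arbitrarily for illustration).
print("father ~ mother:", cosine_similarity(word_to_vec_map["father"], word_to_vec_map["mother"]))
print("father ~ ball:", cosine_similarity(word_to_vec_map["father"], word_to_vec_map["ball"]))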
Convert sentences into a list of indices¶
Most deep learning frameworks require that all sequences in the same batch have the same length. The common solution to handling sequences of different lengths is padding.
Specifically:
- Set a maximum sequence length.
- Pad all sequences to have the same length.
def sentences_to_indices(X, word_to_index, max_len):
"""
Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
The output shape should be such that it can be given to 'keras.layers.Embedding()'.
Arguments:
X -- Array of sentences (strings), of shape (m, 1)
word_to_index -- A dictionary mapping each word to its index
max_len -- Maximum number of words in a sentence. We can assume every sentence in X is no longer than this.
Returns:
X_indices -- Array of indices corresponding to words in the sentences from X, of shape (m, max_len)
"""
m = X.shape[0] # Number of training examples
# Initialize X_indices as a numpy matrix of zeros and the correct shape
X_indices = np.zeros((m, max_len))
for i in range(m): # Loop over training examples
# Convert the i-th training sentence in lower case and split it into words
sentence_words = [w.lower() for w in X[i].split()]
# Initialize j to 0
j = 0
# Loop over the words of sentence_words
for w in sentence_words:
# If w exists in the dictionary
if w in word_to_index:
# Set the (i, j)-th entry of X_indices to the index of the correct word
X_indices[i, j] = word_to_index[w]
# Increment j to j + 1
j = j + 1
return X_indices
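A quick check of the function (a minimal sketch; the example sentences are arbitrary) shows that words are replaced by their vocabulary indices and shorter sentences are right-padded with zeros up to max_len:
# Example sentences chosen arbitrarily for illustration.
X_example = np.array(["food is life", "I love you mum"])
print(sentences_to_indices(X_example, word_to_index, max_len))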
Load a pretrained embedding layer¶
In Keras, the embedding matrix is represented as a "layer".
- The embedding matrix maps word indices to embedding vectors.
- The word indices are positive integers.
- The embedding vectors are dense vectors of fixed size.
- A "dense" vector is the opposite of a sparse vector. It means that most of its values are non-zero.
- The embedding matrix can be derived in two ways:
- Training a model to derive the embeddings from scratch.
- Using a pre-trained embedding.
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
"""
Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
Arguments:
word_to_vec_map -- Dictionary mapping words to their GloVe vector representation
word_to_index -- Dictionary mapping from words to their indices in the vocabulary (400,000 words)
Returns:
embedding_layer -- Pretrained layer Keras instance
"""
vocab_size = len(word_to_index) + 1 # Add 1 because word indices start at 1 (Keras Embedding requires input_dim > largest index)
any_word = list(word_to_vec_map.keys())[0]
emb_dim = word_to_vec_map[any_word].shape[0] # Define dimensionality of your GloVe word vectors (= 50)
# Step 1
# Initialize the embedding matrix as a numpy array of zeros.
embedding_matrix = np.zeros((vocab_size, emb_dim))
# Step 2
# Set each row "idx" of the embedding matrix to be
# the word vector representation of the idx'th word of the vocabulary.
for word, idx in word_to_index.items():
embedding_matrix[idx, :] = word_to_vec_map[word]
# Step 3
# Define Keras embedding layer with the correct input and output sizes.
# Make it non-trainable.
embedding_layer = layers.Embedding(vocab_size, emb_dim, trainable=False)
# Step 4
# Build the embedding layer; this is required before setting its weights.
embedding_layer.build((None,))
# Set the weights of the embedding layer to the embedding matrix. The layer is now pretrained.
embedding_layer.set_weights([embedding_matrix])
return embedding_layer
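As a sanity check (a minimal sketch; embedding_layer_check is a throwaway instance built only for this test and the word is arbitrary), a row of the layer's weight matrix should match the corresponding GloVe vector, up to float32 precision:
# Build a throwaway layer just for this check.
embedding_layer_check = pretrained_embedding_layer(word_to_vec_map, word_to_index)
idx = word_to_index["cucumber"]
# The idx-th row of the weight matrix should equal the GloVe vector of "cucumber".
print(np.allclose(embedding_layer_check.get_weights()[0][idx], word_to_vec_map["cucumber"], atol=1e-4))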
Build a Long Short-Term Memory network¶
# Define sentence_indices as the input.
sentence_indices = layers.Input((max_len,), dtype='int32')
# Create the embedding layer pretrained with GloVe Vectors.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
# Propagate sentence_indices through the embedding layer.
embeddings = embedding_layer(sentence_indices)
# Propagate the embeddings through an LSTM layer with 128-dimensional hidden state.
# The returned output should be a batch of sequences.
x = layers.LSTM(128, return_sequences=True)(embeddings)
# Add dropout with a probability of 0.5
x = layers.Dropout(0.5)(x)
# Propagate x through another LSTM layer with 128-dimensional hidden state.
# The returned output should be a single hidden state, not a batch of sequences.
x = layers.LSTM(128)(x)
# Add dropout with a probability of 0.5
x = layers.Dropout(0.5)(x)
# Propagate x through a Dense layer with 5 units.
x = layers.Dense(5)(x)
# Add a softmax activation.
outputs = layers.Activation('softmax')(x)
# Create Model instance which converts sentence_indices into outputs.
model = Model(inputs=sentence_indices, outputs=outputs)
model.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 10)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Embedding)           │ (None, 10, 50)         │    20,000,050 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm (LSTM)                     │ (None, 10, 128)        │        91,648 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 10, 128)        │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm_1 (LSTM)                   │ (None, 128)            │       131,584 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_1 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 5)              │           645 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ activation (Activation)         │ (None, 5)              │             0 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 20,223,927 (77.15 MB)
Trainable params: 223,877 (874.52 KB)
Non-trainable params: 20,000,050 (76.29 MB)
Create a custom callback¶
class CustomVerbose(callbacks.Callback):
def __init__(self, epochs_to_show):
self.epochs_to_show = epochs_to_show
def on_epoch_begin(self, epoch, logs=None):
if epoch in self.epochs_to_show:
self.epoch_start_time = datetime.now()
def on_epoch_end(self, epoch, logs=None):
if epoch in self.epochs_to_show:
self.epoch_stop_time = datetime.now()
print(f"Epoch {epoch+1}/{self.epochs_to_show[-1] + 1}")
print(f"\telapsed time: {(self.epoch_stop_time - self.epoch_start_time).total_seconds():.3f}s - accuracy: {logs['categorical_accuracy']:.4f} - loss: {logs['loss']:.4f} - val_accuracy: {logs['val_categorical_accuracy']:.4f} - val_loss: {logs['val_loss']:.4f}")
Compile and train the model¶
model.compile(optimizer=optimizers.Adam(1e-4), loss='categorical_crossentropy', metrics=['categorical_accuracy'])
epochs = 200
epochs_to_show = [0] + [i for i in range(int(epochs/10)-1, epochs, int(epochs/10))]
custom_verbose = CustomVerbose(epochs_to_show)
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=int(epochs/10), verbose=1)
X_train_, y_train_ = sentences_to_indices(X_train, word_to_index, max_len), tf.one_hot(y_train, depth=5)
X_validation_, y_validation_ = sentences_to_indices(X_validation, word_to_index, max_len), tf.one_hot(y_validation, depth=5)
history = model.fit(X_train_, y_train_, epochs=epochs, batch_size=32, verbose=0, validation_data=(X_validation_, y_validation_), callbacks=[custom_verbose, early_stopping])
Epoch 1/200
	elapsed time: 4.610s - accuracy: 0.2500 - loss: 1.6047 - val_accuracy: 0.2857 - val_loss: 1.5916
Epoch 20/200
	elapsed time: 0.125s - accuracy: 0.3712 - loss: 1.4485 - val_accuracy: 0.3571 - val_loss: 1.4700
Epoch 40/200
	elapsed time: 0.123s - accuracy: 0.6515 - loss: 1.1439 - val_accuracy: 0.6071 - val_loss: 1.1675
Epoch 60/200
	elapsed time: 0.123s - accuracy: 0.7273 - loss: 0.7253 - val_accuracy: 0.7143 - val_loss: 0.8038
Epoch 80/200
	elapsed time: 0.133s - accuracy: 0.8333 - loss: 0.4601 - val_accuracy: 0.7500 - val_loss: 0.5313
Epoch 100/200
	elapsed time: 0.131s - accuracy: 0.9167 - loss: 0.2750 - val_accuracy: 0.8214 - val_loss: 0.4175
Epoch 120/200
	elapsed time: 0.125s - accuracy: 0.9242 - loss: 0.1916 - val_accuracy: 0.8214 - val_loss: 0.4444
Epoch 140/200
	elapsed time: 0.134s - accuracy: 0.9545 - loss: 0.1786 - val_accuracy: 0.8929 - val_loss: 0.4704
Epoch 154: early stopping
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.plot(history.history['categorical_accuracy'])
ax1.plot(history.history['val_categorical_accuracy'])
ax1.set_xlabel("Epochs")
ax1.set_ylabel("Accuracy")
ax1.legend(["Training", "Validation"])
ax2.plot(history.history['loss'])
ax2.plot(history.history['val_loss'])
ax2.set_xlabel("Epochs")
ax2.set_ylabel("Loss")
ax2.legend(["Training", "Validation"])
plt.show()
Evaluate the model¶
X_test_ = sentences_to_indices(X_test, word_to_index, max_len)
y_test_ = tf.one_hot(y_test, depth=5)
prediction_proba = model.predict(X_test_, verbose=0)
y_pred = np.argmax(prediction_proba, axis=1)
print(classification_report(y_test, y_pred, digits=4))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.grid(False)
plt.show()
              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667         4
           2     0.8889    0.8000    0.8421        10
           3     0.8571    0.6667    0.7500         9
           4     1.0000    0.8000    0.8889         5

    accuracy                         0.7857        28
   macro avg     0.8115    0.8167    0.7869        28
weighted avg     0.8430    0.7857    0.7958        28
Show the mislabelled examples:
print({i:label_to_emoji(i) for i in range(5)})
prediction = model.predict(X_test_, verbose=0)
for i in range(len(X_test)):
num = np.argmax(prediction[i])
if(num != y_test[i]):
print("Sentence: " + X_test[i] + " -> Expected emoji: " + label_to_emoji(y_test[i]) + ", Prediction: " + label_to_emoji(num))
{0: '❤️', 1: '⚾', 2: '😄', 3: '😞', 4: '🍴'}
Sentence: work is horrible -> Expected emoji: 😞, Prediction: 😄
Sentence: I did not have breakfast -> Expected emoji: 🍴, Prediction: 😞
Sentence: This girl is messing with me -> Expected emoji: 😞, Prediction: ❤️
Sentence: she is a bully -> Expected emoji: 😞, Prediction: ❤️
Sentence: you brighten my day -> Expected emoji: 😄, Prediction: ❤️
Sentence: she got me a nice present -> Expected emoji: 😄, Prediction: ❤️
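Finally, the trained model can be tried on a new sentence (a minimal sketch; the example sentence is arbitrary and the predicted emoji will depend on the trained weights):
# Arbitrary example sentence; any sentence of at most max_len known words works.
x_custom = np.array(["I want to eat pizza tonight"])
x_custom_indices = sentences_to_indices(x_custom, word_to_index, max_len)
pred_label = np.argmax(model.predict(x_custom_indices, verbose=0), axis=1)[0]
print(x_custom[0], label_to_emoji(pred_label))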