#%% """ Credits: This code is adapted from the textbook "Deep Learning with Python", 2nd Edition, by François Chollet. """ #%% import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers from tensorflow.keras.layers import TextVectorization #%% On my computer, using Anaconda, the get_vocabulary method of # the TextVectorization layer gave me an error, because it could not # handle characters with ASCII codes > 127. # This code does some preprocessing (find characters with ASCII codes > 127 # and replace them with the SPACE character) to the text saved in # text_only_train_ds, to create a "cleaned-up" version called new_dataset. batch_size = 32 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) val_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size) test_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size) text_only_train_ds = train_ds.map(lambda x, y: x) text_list = list(text_only_train_ds.as_numpy_iterator()) bad_list = [] new_list = [[]] * len(text_list) for i in range(0, len(text_list)): sublist = [[]] * len(text_list[i]) for j in range(0, len(text_list[i])): entry = text_list[i][j] mutable = bytearray(entry) np_data = np.array(mutable) bad = (np_data >= 128) np_data[bad] = 32 # ascii for 0 if (bad.sum() > 0): bad_list = bad_list + [(i,j)] sublist[j] = bytes(np_data) new_list[i] = np.array(sublist) new_dataset = tf.data.Dataset.from_tensor_slices(new_list) max_tokens = 20000 #%% Now we can create our text vectorization layer and call adapt() tv = TextVectorization(max_tokens=max_tokens, output_mode="int") tv.adapt(new_dataset) #%% Creating training, validation, test datasets max_tokens = 20000 text_vectorization = TextVectorization(max_tokens=max_tokens, output_mode="int") text_only_train_ds = train_ds.map(lambda x, y: x) text_vectorization.adapt(text_only_train_ds) int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y)) int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y)) int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y)) #%% Code to load pre-trained GloVe embeddings, glove_file = "../../../../../home/cse4392_data/glove.6b.100d.txt" embeddings_index = {} with open(glove_file, encoding='utf-8') as f: for line in f: word, coefs = line.split(maxsplit=1) coefs = np.fromstring(coefs, "f", sep=" ") embeddings_index[word] = coefs print(f"Found {len(embeddings_index)} word vectors.") #%% This code creates an embedding layer that matches the vocabulary # of the text vectorization layer we just created, and uses the # vectors from the GloVe file. embedding_dim = embeddings_index["hello"].shape[0] vocabulary = tv.get_vocabulary() word_index = dict(zip(vocabulary, range(len(vocabulary)))) embedding_matrix = np.zeros((max_tokens, embedding_dim)) lengths = np.zeros((len(word_index))) for word, i in word_index.items(): # for every word in our vocabulary from the reviews dataset if i < max_tokens: embedding_vector = embeddings_index.get(word) # look up the GloVe vector for that word if embedding_vector is not None: if (embedding_vector.shape[0] == embedding_dim): embedding_matrix[i] = embedding_vector # store that vector # here we create the actual Embedding layer, using embedding_matrix (whose values # we set in previous loop) to initialize the weights of the layer. 
# Here we create the actual Embedding layer, using embedding_matrix
# (whose values we set in the previous loop) to initialize the weights
# of the layer. Then we set trainable=False, to make sure that the
# pretrained GloVe vectors do not change during training.
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

#%% Here we create the RNN model using the GloVe embeddings.
# Training takes about 30 minutes on my computer (3 minutes per epoch).
# You can skip this step, and instead download the trained model from
# "glove_embeddings_sequence_model.keras" on the course website.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

filename = "glove_embeddings_sequence_model.keras"
callbacks = [
    keras.callbacks.ModelCheckpoint(filename, save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds,
          epochs=10, callbacks=callbacks)

#%% Here we evaluate the test accuracy of the model. It is about 80%,
# not as good as the accuracy we get when we learn embeddings from our
# own dataset.
filename = "glove_embeddings_sequence_model.keras"
model = keras.models.load_model(filename)
(test_loss, test_acc) = model.evaluate(int_test_ds)
print("Test accuracy: %.2f%%" % (test_acc * 100))

#%% These functions compute the difference and distance between
# the embeddings of two words.
filename = "glove_embeddings_sequence_model.keras"
model = keras.models.load_model(filename)

def we_diff(model, tv_layer, s1, s2):
    # Sub-model consisting of the input and embedding layers, which
    # maps token IDs to their embedding vectors.
    em_model = keras.Sequential(model.layers[0:2])
    v1 = em_model(tv_layer([s1]))
    v1 = v1[0, 0, :]
    v2 = em_model(tv_layer([s2]))
    v2 = v2[0, 0, :]
    diff = v2 - v1
    return diff

def we_distance(model, tv_layer, s1, s2):
    diff = we_diff(model, tv_layer, s1, s2)
    dist = np.linalg.norm(diff)
    print("distance from \"%s\" to \"%s\" = %.2f" % (s1, s2, dist))
    return dist

#%% Printing the distance between various pairs of words, using
# the GloVe embeddings.
filename = "glove_embeddings_sequence_model.keras"
glove_model = keras.models.load_model(filename)

we_distance(glove_model, tv, "great", "excellent")
we_distance(glove_model, tv, "awful", "horrible")
we_distance(glove_model, tv, "great", "awful")
we_distance(glove_model, tv, "excellent", "horrible")
we_distance(glove_model, tv, "big", "large")
we_distance(glove_model, tv, "big", "small")

#%%
we_distance(glove_model, tv, "great", "excellent")
we_distance(glove_model, tv, "awful", "horrible")
we_distance(glove_model, tv, "great", "awful")
we_distance(glove_model, tv, "excellent", "horrible")

#%%
we_distance(glove_model, tv, "buy", "purchase")
we_distance(glove_model, tv, "buy", "shop")
we_distance(glove_model, tv, "buy", "swim")
we_distance(glove_model, tv, "buy", "study")

#%%
we_distance(glove_model, tv, "obtain", "acquire")
we_distance(glove_model, tv, "obtain", "swim")
we_distance(glove_model, tv, "acquire", "swim")

#%%
we_distance(glove_model, tv, "big", "large")
we_distance(glove_model, tv, "big", "small")

#%%
we_distance(glove_model, tv, "movie", "film")
we_distance(glove_model, tv, "movie", "shark")
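
#%% A further illustration (a sketch, not part of the original textbook
# code): the difference vectors computed by we_diff can also be used
# for simple word analogies. Starting from the embedding of a query
# word, we add the s1 -> s2 direction (e.g. "man" -> "woman") and look
# for the nearest vocabulary words in the frozen GloVe embedding
# matrix. The helper name we_analogy is hypothetical.
def we_analogy(model, tv_layer, s1, s2, s3, top_k=5):
    # Direction from s1 to s2, e.g. from "man" to "woman".
    direction = we_diff(model, tv_layer, s1, s2).numpy()
    # Embedding of the query word s3.
    em_model = keras.Sequential(model.layers[0:2])
    v3 = em_model(tv_layer([s3]))[0, 0, :].numpy()
    target = v3 + direction
    # Weights of the frozen Embedding layer: shape (max_tokens, dim).
    weights = model.layers[1].get_weights()[0]
    dists = np.linalg.norm(weights - target, axis=1)
    vocab = tv_layer.get_vocabulary()
    # Report the top_k vocabulary words closest to the target vector.
    for idx in np.argsort(dists)[:top_k]:
        if idx < len(vocab):
            print("%-15s %.2f" % (vocab[idx], dists[idx]))

we_analogy(glove_model, tv, "man", "woman", "king")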