#%% """ Credits: This code is adapted from the textbook "Deep Learning with Python", 2nd Edition, by François Chollet. """ #%% import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers from tensorflow.keras.layers import TextVectorization #%% Statistics on the length of each document. batch_size = 1 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) val_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size) test_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size) dataset = val_ds number = len(dataset) objects = [None] * number # initialize the list of training inputs labels = [None] * number # initialize the list of training labels tensors = [None] * number # initialize the list of training inputs lengths_val = np.zeros((number)) total = 0 smallest = 1000000 largest = 0 counter = 0 for inputs, targets in dataset: objects[counter] = inputs[0] labels[counter] = targets[0] tensors[counter] = text_vectorization(objects[counter]) num_words = len(tensors[counter]) smallest = min(smallest, num_words) largest = max(largest, num_words) total = total + num_words lengths_val[counter] = num_words # Print rate of progress every thousand iterations, as this loop can be slow. if (counter % 1000== 0): print("processed %d out of %d entries" % (counter, number)) counter = counter+1 print("processed %d out of %d entries" % (counter, number)) print("smallest = %d words" % (smallest)) print("largest = %d words" % (largest)) average = total / counter print("average = %.1f words" % (average)) #%% Load training, validation, and test set for the Large Movie Review Dataset. # Each of these sets will be a BatchDataset object. batch_size = 32 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) val_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size) test_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size) #%% Map datsets of text objects to sequences of integers. max_tokens = 20000 text_vectorization = TextVectorization(max_tokens=max_tokens, output_mode="int") text_only_train_ds = train_ds.map(lambda x, y: x) text_vectorization.adapt(text_only_train_ds) int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y)) int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y)) int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y)) #%% Training a bidirectional LSTM model on the sequences # This takes 15 hours on my computer (1.5 hours per epoch) inputs = keras.Input(shape=(None,), dtype="int64") oh_vec = tf.one_hot(inputs, depth=max_tokens) x1 = layers.Bidirectional(layers.LSTM(32))(oh_vec) x2 = layers.Dropout(0.5)(x1) outputs = layers.Dense(1, activation="sigmoid")(x2) model = keras.Model(inputs, outputs) model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"]) model.summary() filename = "one_hot_bidir_lstm2.keras" callbacks = [keras.callbacks.ModelCheckpoint(filename, save_best_only=True)] #%% model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks) model = keras.models.load_model(filename) (test_loss, test_acc) = model.evaluate(int_test_ds) print("Test accuracy: %.2f%%" % (test_acc*100)) #%% Define an RNN with word embeddings. Training this model takes about # 55 minutes on my computer (5.5 minutes per epoch) inputs = keras.Input(shape=(None,), dtype="int64") em = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs) x1 = layers.Bidirectional(layers.LSTM(32))(em) x2 = layers.Dropout(0.5)(x1) outputs = layers.Dense(1, activation="sigmoid")(x2) model = keras.Model(inputs, outputs) model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"]) model.summary() filename = "embeddings_bidir_lstm.keras" callbacks = [keras.callbacks.ModelCheckpoint(filename, save_best_only=True)] #%% Train the RNN with word embeddings model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks) #%% Evaluate the RNN with word embeddings model = keras.models.load_model(filename) (test_loss, test_acc) = model.evaluate(int_test_ds) print("Test accuracy: %.2f%%" % (test_acc*100)) #%% Code for computing the difference and distance between the # embeddings of two words. Strings s1 and s2 are the two words. def we_diff(model, tv_layer, s1, s2): em_model = keras.Sequential(model.layers[0:2]) v1 = em_model(tv_layer([s1])) v1 = v1[0,0,:] v2 = em_model(tv_layer([s2])) v2 = v2[0,0,:] diff = v2 - v1 return diff def we_distance(model, tv_layer, s1, s2): diff = we_diff(model, tv_layer, s1, s2) dist = np.linalg.norm(diff) print("distance from \"%s\" to \"%s\" = %.2f" % (s1, s2, dist)) return dist #%% Printing the distance between various pairs of words, using # the word embeddings that the model has learned. filename = "embeddings_bidir_lstm.keras" we_model = keras.models.load_model(filename) we_distance(we_model, text_vectorization, "great", "excellent") we_distance(we_model, text_vectorization, "awful", "horrible") we_distance(we_model, text_vectorization, "great", "awful") we_distance(we_model, text_vectorization, "excellent", "horrible") we_distance(we_model, text_vectorization, "big", "large") we_distance(we_model, text_vectorization, "big", "small") #%% we_distance(we_model, text_vectorization, "great", "excellent") we_distance(we_model, text_vectorization, "awful", "horrible") we_distance(we_model, text_vectorization, "great", "awful") we_distance(we_model, text_vectorization, "excellent", "horrible") #%% we_distance(we_model, text_vectorization, "buy", "purchase") we_distance(we_model, text_vectorization, "buy", "shop") we_distance(we_model, text_vectorization, "buy", "swim") we_distance(we_model, text_vectorization, "buy", "study") #%% we_distance(we_model, text_vectorization, "obtain", "acquire") we_distance(we_model, text_vectorization, "obtain", "swim") we_distance(we_model, text_vectorization, "acquire", "swim") #%% we_distance(we_model, text_vectorization, "big", "large") we_distance(we_model, text_vectorization, "big", "small")