#%% """ Credits: This code is adapted from the textbook "Deep Learning with Python", 2nd Edition, by François Chollet. """ #%% import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers from tensorflow.keras.layers import TextVectorization #%% Load training, validation, and test set for the Large Movie Review Dataset. # Each of these sets will be a BatchDataset object. batch_size = 32 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) val_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size) test_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size) #%% A for loop illustrating one way of iterating over the contents of a # BatchDataset object. batch_size = 1 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) number = len(train_ds) train_objects = [None] * number # initialize the list of training inputs train_labels = [None] * number # initialize the list of training labels counter = 0 for inputs, targets in train_ds: train_objects[counter] = inputs[0] train_labels[counter] = targets[0] # Print rate of progress every thousand iterations, as this loop can be slow. if (counter % 1000== 0): print("processed %d out of %d entries" % (counter, number)) counter = counter+1 print("processed %d out of %d entries" % (counter, number)) #%% An example of how to convert a BatchDataset object to a list, and how # to understand the structure of that list. batch_size = 32 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) train_list = list(train_ds.as_numpy_iterator()) print("train_list contains %d batches" % (len(train_list))) print("Each batch is a tuple of %d elements" % (len(train_list[0]))) print("The first element is an array of %d inputs" % (len(train_list[0][0]))) print("The second element is an array of %d targets" % (len(train_list[0][1]))) print("\nThe first input of the first batch is:\n", train_list[0][0][0]) print("\nThe target for the first input of the first batch is:", train_list[0][1][0]) #%% Yet another example of accessing data from a BatchDataset object. # Here we show some info about the first batch of train_ds. # Note that we do not have an elegant way to access the # first batch directly. Instead, we use this hack, where we start # a loop over the dataset, and we break after the first iteration. batch_size = 32 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) val_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size) test_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size) for inputs, targets in train_ds: print("inputs.shape:", inputs.shape) print("inputs.dtype:", inputs.dtype) print("targets.shape:", targets.shape) print("targets.dtype:", targets.dtype) print("inputs[0]:", inputs[0]) print("targets[0]:", targets[0]) break #%% An example of text vectorization applied to a toy dataset. 
#%% An example of text vectorization applied to a toy dataset.
text_vectorization = TextVectorization(output_mode="multi_hot", ngrams=1)
dataset = [
    "It is a beautiful day",
    "The sun is shining",
    "The weather is a bit warmer",
]
text_vectorization.adapt(dataset)
words = text_vectorization.get_vocabulary()
print("\nvocabulary:\n", text_vectorization.get_vocabulary(), "\n")

# Here we get the vectorized version of some text.
out = text_vectorization("What a beautiful day")
print("result of vectorization:\n", out, "\n")

# Here we "decode" the vector. In multi_hot mode, position i of the output
# corresponds to vocabulary token i, so we pair each 0/1 entry with its
# token. (Note: we must index words by position, not by the entry's value.)
for index, value in enumerate(out.numpy()):
    # print(value, words[index])
    print("%2d" % (value), " \"" + words[index] + "\",")

#%% Map datasets of text objects to datasets of bag-of-words vectors.
text_vectorization = TextVectorization(max_tokens=20000,
                                       output_mode="multi_hot")
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))

#%% Show some info about the first batch of text_only_train_ds.
# As before, we start a loop over the dataset and break after the first
# iteration.
for inputs in text_only_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("inputs[0]:", inputs[0])
    break

#%% Show some info about the first batch of binary_1gram_train_ds.
# We use the familiar hack that we used before to access the first batch.
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

#%% Training a dense model on the bag-of-words data.
# This version uses the 1-gram tokens (i.e., each token is a word).
max_tokens = 20000
model = keras.Sequential([keras.Input(shape=(max_tokens,)),
                          keras.layers.Dense(16, activation="relu"),
                          keras.layers.Dropout(0.5),
                          keras.layers.Dense(1, activation="sigmoid")])
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                             save_best_only=True)]
model.fit(binary_1gram_train_ds,
          validation_data=binary_1gram_val_ds,
          epochs=10,
          callbacks=callbacks)

#%% Training a dense model, using the cache option.
# As before, this version uses the 1-gram tokens (i.e., each token is a
# word). This cell is equivalent to the previous cell, but runs faster:
# cache() lets the vectorized batches be computed only once and then
# reused in later epochs.
max_tokens = 20000
model = keras.Sequential([keras.Input(shape=(max_tokens,)),
                          keras.layers.Dense(16, activation="relu"),
                          keras.layers.Dropout(0.5),
                          keras.layers.Dense(1, activation="sigmoid")])
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                             save_best_only=True)]
model.fit(binary_1gram_train_ds.cache(),
          validation_data=binary_1gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
model = keras.models.load_model("binary_1gram.keras")
(test_loss, test_acc) = model.evaluate(binary_1gram_test_ds)
print("Test accuracy: %.2f%%" % (test_acc * 100))
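#%% Before moving to bigrams, a quick illustration (our addition, not from
# the textbook): with ngrams=2, the vocabulary that TextVectorization learns
# contains two-word sequences alongside single words, so a phrase such as
# "not good" becomes a token of its own. A minimal sketch on a toy sentence:
toy_vectorizer = TextVectorization(output_mode="multi_hot", ngrams=2)
toy_vectorizer.adapt(["the movie was not good"])
print(toy_vectorizer.get_vocabulary())
# Expect unigrams ("good", "movie", ...) plus bigrams ("not good",
# "the movie", ...), each with its own position in the multi-hot vector.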
#%% Training another dense model, this time using bigrams.
# First, create the bigram version of the datasets.
text_vectorization = TextVectorization(max_tokens=20000,
                                       ngrams=2,
                                       output_mode="multi_hot")
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
binary_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
binary_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))

#%% Training a dense model for the bigram version, using the cache option.
max_tokens = 20000
model = keras.Sequential([keras.Input(shape=(max_tokens,)),
                          layers.Dense(16, activation="relu"),
                          layers.Dropout(0.5),
                          layers.Dense(1, activation="sigmoid")])
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("binary_2gram.keras",
                                             save_best_only=True)]
model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
model = keras.models.load_model("binary_2gram.keras")
(test_loss, test_acc) = model.evaluate(binary_2gram_test_ds)
print("Test accuracy: %.2f%%" % (test_acc * 100))

#%% Applying the model to a new review.
review_text = "One of the best movies of the year. I strongly recommend it."
vectorized_data = text_vectorization([review_text])
prediction = model(vectorized_data)
print("prediction: %.3f\n" % (prediction[0][0].numpy()))

#%% Chaining the vectorizer and the model into a single end-to-end model
# that accepts raw strings as input.
new_model = keras.Sequential([text_vectorization, model])
review_text = "One of the best movies of the year. I strongly recommend it."
tensorized_input = tf.convert_to_tensor([review_text])
prediction = new_model(tensorized_input)
print("prediction: %.3f\n" % (prediction[0][0].numpy()))
prediction2 = new_model.predict(tensorized_input)
print("prediction2: %.3f\n" % (prediction2[0][0]))

#%%
# The next part of the code applies a 1-gram (word-based) model and a
# bigram model to various inputs. I deliberately wrote each of those
# inputs to get one or both models to produce wrong results.
# Here we load the two models and we prepare the text vectorization
# layer for each model.
model1 = keras.models.load_model("binary_1gram.keras")
model2 = keras.models.load_model("binary_2gram.keras")
text_vectorization1 = TextVectorization(max_tokens=20000,
                                        ngrams=1,
                                        output_mode="multi_hot")
text_vectorization1.adapt(text_only_train_ds)
text_vectorization2 = TextVectorization(max_tokens=20000,
                                        ngrams=2,
                                        output_mode="multi_hot")
text_vectorization2.adapt(text_only_train_ds)

#%%
mock_review = """Before I watched it, I expected that it would be a bad
movie. After watching, my impression was the exact opposite."""
vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))
vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%
mock_review = """Before I watched it, I expected that it would be an
excellent movie. After watching, my impression was the exact opposite."""
vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))
vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))
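#%% An optional refactor (our addition): the mock-review cells above and
# below all repeat the same vectorize-and-predict boilerplate, so a small
# helper like this sketch could replace it. The name predict_both is ours,
# not the textbook's.
def predict_both(review):
    p1 = model1(text_vectorization1([review]))[0][0].numpy()
    p2 = model2(text_vectorization2([review]))[0][0].numpy()
    print("1-gram prediction: %.3f" % p1)
    print("2-gram prediction: %.3f\n" % p2)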
The answer is no, quite the opposite.""" vectorized_data1 = text_vectorization1([mock_review]) predictions1 = model1(vectorized_data1) print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy())) vectorized_data2 = text_vectorization2([mock_review]) predictions2 = model2(vectorized_data2) print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy())) #%% mock_review = """Was this a great movie? The answer is no, quite the opposite.""" vectorized_data1 = text_vectorization1([mock_review]) predictions1 = model1(vectorized_data1) print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy())) vectorized_data2 = text_vectorization2([mock_review]) predictions2 = model2(vectorized_data2) print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy())) #%% mock_review = """I wish I could say that this was an excellent movie, a good movie, or at least an OK movie. I most definitely cannot say that""" vectorized_data1 = text_vectorization1([mock_review]) predictions1 = model1(vectorized_data1) print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy())) vectorized_data2 = text_vectorization2([mock_review]) predictions2 = model2(vectorized_data2) print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy())) #%% mock_review = """I went to watch this movie with the highest expectations. My conclusions after watching it? This movie was not excellent, not good, not OK. A thoroughly bad movie.""" vectorized_data1 = text_vectorization1([mock_review]) predictions1 = model1(vectorized_data1) print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy())) vectorized_data2 = text_vectorization2([mock_review]) predictions2 = model2(vectorized_data2) print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy())) #%% mock_review = """What an awesome movie. Best film of the year, worthy of an Oscar. No, just kidding. Seriously, don't watch it, you will not enjoy it.""" #"""it is a total waste of time.""" vectorized_data1 = text_vectorization1([mock_review]) predictions1 = model1(vectorized_data1) print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy())) vectorized_data2 = text_vectorization2([mock_review]) predictions2 = model2(vectorized_data2) print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy())) #%% mock_review = """This is a great movie. It makes you feel what it is like to live through the horrors of a war, under utterly poor conditions.""" vectorized_data1 = text_vectorization1([mock_review]) predictions1 = model1(vectorized_data1) print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy())) vectorized_data2 = text_vectorization2([mock_review]) predictions2 = model2(vectorized_data2) print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy())) #%% mock_review = """This is a great movie. It makes you feel what it is like to live through the horrors of a war, under utterly poor and awful conditions.""" vectorized_data1 = text_vectorization1([mock_review]) predictions1 = model1(vectorized_data1) print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy())) vectorized_data2 = text_vectorization2([mock_review]) predictions2 = model2(vectorized_data2) print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy())) #%% mock_review = """This movie is great. 
#%%
mock_review = """This movie is great.
It makes you feel what it is like to live through the horrors of a war,
under utterly poor and awful conditions."""
vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))
vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%
mock_review = """This movie is great.
It makes you feel what it is like to live through a horrible war,
under utterly poor and awful conditions."""
vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))
vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))
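#%% A final convenience sketch (our addition): both vectorizers accept a
# batch of strings, so several reviews can be scored in one call. The two
# reviews below are ours, chosen only to show the batched usage.
reviews = ["A wonderful, touching film.", "A dull, terrible film."]
print("1-gram predictions:", model1(text_vectorization1(reviews)).numpy())
print("2-gram predictions:", model2(text_vectorization2(reviews)).numpy())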