#%%

"""
Credits: This code is adapted from the textbook "Deep Learning with Python", 
2nd Edition, by François Chollet. 
"""

#%%

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization


#%% Load training, validation, and test set for the Large Movie Review Dataset.
#   Each of these sets will be a BatchDataset object.

batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size)

val_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size)

test_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size)

#%% A for loop illustrating one way of iterating over the contents of a 
#   BatchDataset object.

batch_size = 1
train_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size)

number = len(train_ds)
train_objects = [None] * number    # initialize the list of training inputs
train_labels = [None] * number     # initialize the list of training labels

counter = 0
for inputs, targets in train_ds:
    train_objects[counter] = inputs[0]
    train_labels[counter] = targets[0]
    
    # Print rate of progress every thousand iterations, as this loop can be slow.
    if (counter % 1000== 0):
        print("processed %d out of %d entries" % (counter, number))
    counter = counter+1

print("processed %d out of %d entries" % (counter, number))


#%% An example of how to convert a BatchDataset object to a list, and how
#   to understand the structure of that list.

batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size)

train_list = list(train_ds.as_numpy_iterator())

print("train_list contains %d batches" % (len(train_list)))
print("Each batch is a tuple of %d elements" % (len(train_list[0])))
print("The first element is an array of %d inputs" % (len(train_list[0][0])))
print("The second element is an array of %d targets" % (len(train_list[0][1])))
print("\nThe first input of the first batch is:\n", train_list[0][0][0])
print("\nThe target for the first input of the first batch is:", train_list[0][1][0])

#%% Yet another example of accessing data from a BatchDataset object.
#   Here we show some info about the first batch of train_ds.
#   Note that we do not have an elegant way to access the 
#   first batch directly. Instead, we use this hack, where we start
#   a loop over the dataset, and we break after the first iteration.

batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size)

val_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size)

test_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size)

for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break


#%% An example of text vectorization applied to a toy dataset.

text_vectorization = TextVectorization(output_mode="multi_hot", ngrams=1)

dataset = [
"It is a beautiful day",
"The sun is shining",
"The weather is a bit warmer",
]

text_vectorization.adapt(dataset)
words = text_vectorization.get_vocabulary()

print("\nvocabulary:\n", text_vectorization.get_vocabulary(), "\n")

# Here we get the vectorized version of some text
out = text_vectorization("What a beautiful day")
print("result of vectorization:\n", out, "\n")

# Here we "decode" the vector, mapping each number back to a token
for value in out.numpy():
#    print(value, words[value])
    print("%2d" % (value), " \""+words[value]+"\",")

#%% Map datsets of text objects to datasets of bag-of-words vectors.

text_vectorization = TextVectorization(max_tokens=20000, output_mode="multi_hot")

text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))

#%% Show some info about the first batch of text_only_train_ds
#   Note that we do not have an elegant way to access the 
#   first batch directly. Instead, we use this hack, where we start
#   a loop over the dataset, and we break after the first iteration.

for inputs in text_only_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("inputs[0]:", inputs[0])
    break

#%% Show some info about the first batch of binary_1gram_train_ds
#   We use the familiar hack that we used before, to acecss the first batch.

for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

#%% Training a dense model on the bag-of-words data
#   This version uses the 1-gram tokens (i.e., each token is a word)

max_tokens = 20000
model = keras.Sequential([keras.Input(shape=(max_tokens,)),
                          keras.layers.Dense(16, activation="relu"),
                          keras.layers.Dropout(0.5),
                          keras.layers.Dense(1, activation="sigmoid")])

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                             save_best_only=True)]
model.fit(binary_1gram_train_ds,
          validation_data=binary_1gram_val_ds,
          epochs=10, callbacks=callbacks)


#%% Training a dense model, using the cache option.
#   As before, this version uses the 1-gram tokens (i.e., each token is a 
#   word). This cell is equivalent to the previous cell, but runs faster
#   because of the cache option.

max_tokens = 20000
model = keras.Sequential([keras.Input(shape=(max_tokens,)),
                          keras.layers.Dense(16, activation="tanh"),
                          keras.layers.Dropout(0.5),
                          keras.layers.Dense(1, activation="sigmoid")])

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                             save_best_only=True)]
model.fit(binary_1gram_train_ds.cache(),
          validation_data=binary_1gram_val_ds.cache(),
          epochs=10, callbacks=callbacks)

model = keras.models.load_model("binary_1gram.keras")
(test_loss, test_acc) = model.evaluate(binary_1gram_test_ds)
print("Test accuracy: %.2f%%" % (test_acc*100))

#%% Training another dense model, this time using bigrams.
#   First, create the bigrams version of the dataset.

text_vectorization = TextVectorization(max_tokens=20000, ngrams=2,
                                       output_mode="multi_hot")

text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
binary_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
binary_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


#%% Training a dense model for the bigrams version, using the cache option 

max_tokens = 20000
model = keras.Sequential([keras.Input(shape=(max_tokens,)),
                          layers.Dense(16, activation="relu"),
                          layers.Dropout(0.5),
                          layers.Dense(1, activation="sigmoid")])

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("binary_2gram.keras",
                                             save_best_only=True)]
model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10, callbacks=callbacks)

model = keras.models.load_model("binary_2gram.keras")
(test_loss, test_acc) = model.evaluate(binary_2gram_test_ds)
print("Test accuracy: %.2f%%" % (test_acc*100))

#%% applying the model to a new review

review_text = "One of the best movies of the year. I strongly recommend it."
      
vectorized_data = text_vectorization([review_text])
prediction = model(vectorized_data)
print("prediction: %.3f\n" % (prediction[0][0].numpy()))

#%%

new_model = keras.Sequential([text_vectorization,
                              model])

review_text = "One of the best movies of the year. I strongly recommend it."
tensorized_input = tf.convert_to_tensor([review_text])
prediction = new_model(tensorized_input)
print("prediction: %.3f\n" % (prediction[0][0].numpy()))

prediction2 = new_model.predict(tensorized_input)
print("prediction2: %.3f\n" % (prediction2[0]))



#%%
#   The next part of the code will apply a 1-gram (word-based) model and a 
#   bigram model to various inputs. I deliberately wrote each of those
#   inputs to get one or both models to produce wrong results.
#   Here we load the two models and we prepare the text vectorization 
#   module for each model.

model1 = keras.models.load_model("binary_1gram.keras")
model2 = keras.models.load_model("binary_2gram.keras")

text_vectorization1 = TextVectorization(max_tokens=20000, ngrams=1,
                                        output_mode="multi_hot")
text_vectorization1.adapt(text_only_train_ds)

text_vectorization2 = TextVectorization(max_tokens=20000, ngrams=2,
                                        output_mode="multi_hot")
text_vectorization2.adapt(text_only_train_ds)


#%%

mock_review = """Before I watched it, I expected that it would be a
bad movie. After watching, my impression was the exact opposite."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """Before I watched it, I expected that it would be an 
excellent movie. After watching, my impression was the exact opposite."""
      
vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """Was this a bad movie? The answer is no, quite the opposite."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """Was this a great movie? The answer is no, quite the opposite."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """I wish I could say that this was an excellent movie,
a good movie, or at least an OK movie. I most definitely cannot say that"""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """I went to watch this movie with the highest expectations. 
My conclusions after watching it? This movie was not excellent, not good, 
not OK. A thoroughly bad movie."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))


#%%

mock_review = """What an awesome movie. Best film of the year, worthy of 
an Oscar. No, just kidding. Seriously, don't watch it, you will not enjoy it."""

#"""it is a total waste of time."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """This is a great movie. It makes you feel what it is like to 
live through the horrors of a war, under utterly poor conditions."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """This is a great movie. It makes you feel what it is like to 
live through the horrors of a war, under utterly poor and awful conditions."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """This movie is great. It makes you feel what it is like to 
live through the horrors of a war, under utterly poor and awful conditions."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))

#%%

mock_review = """This movie is great. It makes you feel what it is like to 
live through a horrible war, under utterly poor and awful conditions."""

vectorized_data1 = text_vectorization1([mock_review])
predictions1 = model1(vectorized_data1)
print("1-gram prediction: %.3f\n" % (predictions1[0][0].numpy()))

vectorized_data2 = text_vectorization2([mock_review])
predictions2 = model2(vectorized_data2)
print("2-gram prediction: %.3f\n" % (predictions2[0][0].numpy()))