#%%

"""
Credits: This code is adapted from the textbook "Deep Learning with Python", 
2nd Edition, by François Chollet. 
"""

#%%

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization


#%% Statistics on the length of each document.

batch_size = 1
train_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size)

val_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size)

test_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size)

dataset = val_ds
number = len(dataset)
objects = [None] * number    # initialize the list of training inputs
labels = [None] * number     # initialize the list of training labels
tensors = [None] * number    # initialize the list of training inputs
lengths_val = np.zeros((number))

total = 0
smallest = 1000000
largest = 0

counter = 0
for inputs, targets in dataset:
    objects[counter] = inputs[0]
    labels[counter] = targets[0]
    tensors[counter] = text_vectorization(objects[counter])
    num_words = len(tensors[counter])
    smallest = min(smallest, num_words)
    largest = max(largest, num_words)
    total = total + num_words
    lengths_val[counter] = num_words
    
    # Print rate of progress every thousand iterations, as this loop can be slow.
    if (counter % 1000== 0):
        print("processed %d out of %d entries" % (counter, number))
    counter = counter+1

print("processed %d out of %d entries" % (counter, number))
print("smallest = %d words" % (smallest))
print("largest = %d words" % (largest))
average = total / counter
print("average = %.1f words" % (average))

#%% Load training, validation, and test set for the Large Movie Review Dataset.
#   Each of these sets will be a BatchDataset object.

batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size)

val_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size)

test_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size)


#%% Map datsets of text objects to sequences of integers.

max_tokens = 20000
text_vectorization = TextVectorization(max_tokens=max_tokens, output_mode="int")

text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


#%% Training a bidirectional LSTM model on the sequences
#   This takes 15 hours on my computer (1.5 hours per epoch)

inputs = keras.Input(shape=(None,), dtype="int64")
oh_vec = tf.one_hot(inputs, depth=max_tokens)
x1 = layers.Bidirectional(layers.LSTM(32))(oh_vec)
x2 = layers.Dropout(0.5)(x1)
outputs = layers.Dense(1, activation="sigmoid")(x2)
model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()

filename = "one_hot_bidir_lstm2.keras"
callbacks = [keras.callbacks.ModelCheckpoint(filename,
                                             save_best_only=True)]

#%%
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,
          callbacks=callbacks)
model = keras.models.load_model(filename)

(test_loss, test_acc) = model.evaluate(int_test_ds)
print("Test accuracy: %.2f%%" % (test_acc*100))

#%% Define an RNN with word embeddings. Training this model takes about 
#   55 minutes on my computer (5.5 minutes per epoch)

inputs = keras.Input(shape=(None,), dtype="int64")
em = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x1 = layers.Bidirectional(layers.LSTM(32))(em)
x2 = layers.Dropout(0.5)(x1)
outputs = layers.Dense(1, activation="sigmoid")(x2)
model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop", loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

filename = "embeddings_bidir_lstm.keras"
callbacks = [keras.callbacks.ModelCheckpoint(filename,
                                             save_best_only=True)]
#%% Train the RNN with word embeddings
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,
          callbacks=callbacks)

#%% Evaluate the RNN with word embeddings

model = keras.models.load_model(filename)
(test_loss, test_acc) = model.evaluate(int_test_ds)
print("Test accuracy: %.2f%%" % (test_acc*100))

#%% Code for computing the difference and distance between the 
#   embeddings of two words. Strings s1 and s2 are the two words.

def we_diff(model, tv_layer, s1, s2):
    em_model = keras.Sequential(model.layers[0:2])
    
    v1 = em_model(tv_layer([s1]))
    v1 = v1[0,0,:]
    
    v2 = em_model(tv_layer([s2]))
    v2 = v2[0,0,:]
    diff = v2 - v1
    
    return diff
    
def we_distance(model, tv_layer, s1, s2):
    diff = we_diff(model, tv_layer, s1, s2)
    dist = np.linalg.norm(diff)
    
    print("distance from \"%s\" to \"%s\" = %.2f" % (s1, s2, dist))
    return dist

#%% Printing the distance between various pairs of words, using 
#   the word embeddings that the model has learned.

filename = "embeddings_bidir_lstm.keras"
we_model = keras.models.load_model(filename)

we_distance(we_model, text_vectorization, "great", "excellent")
we_distance(we_model, text_vectorization, "awful", "horrible")
we_distance(we_model, text_vectorization, "great", "awful")
we_distance(we_model, text_vectorization, "excellent", "horrible")

we_distance(we_model, text_vectorization, "big", "large")
we_distance(we_model, text_vectorization, "big", "small")

#%%

we_distance(we_model, text_vectorization, "great", "excellent")
we_distance(we_model, text_vectorization, "awful", "horrible")

we_distance(we_model, text_vectorization, "great", "awful")
we_distance(we_model, text_vectorization, "excellent", "horrible")

#%%

we_distance(we_model, text_vectorization, "buy", "purchase")
we_distance(we_model, text_vectorization, "buy", "shop")
we_distance(we_model, text_vectorization, "buy", "swim")
we_distance(we_model, text_vectorization, "buy", "study")

#%%
we_distance(we_model, text_vectorization, "obtain", "acquire")
we_distance(we_model, text_vectorization, "obtain", "swim")
we_distance(we_model, text_vectorization, "acquire", "swim")
#%%

we_distance(we_model, text_vectorization, "big", "large")
we_distance(we_model, text_vectorization, "big", "small")