#%% """ Credits: This code is adapted from the textbook "Deep Learning with Python", 2nd Edition, by François Chollet. """ #%% import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers from tensorflow.keras.layers import TextVectorization #%% On my computer, using Anaconda, the get_vocabulary method of # the TextVectorization layer gave me an error, because it could not # handle characters with ASCII codes > 127. # This code does some preprocessing (find characters with ASCII codes > 127 # and replace them with the SPACE character) to the text saved in # text_only_train_ds, to create a "cleaned-up" version called new_dataset. batch_size = 32 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) val_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size) test_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size) text_only_train_ds = train_ds.map(lambda x, y: x) text_list = list(text_only_train_ds.as_numpy_iterator()) bad_list = [] new_list = [[]] * len(text_list) for i in range(0, len(text_list)): sublist = [[]] * len(text_list[i]) for j in range(0, len(text_list[i])): entry = text_list[i][j] mutable = bytearray(entry) np_data = np.array(mutable) bad = (np_data >= 128) np_data[bad] = 32 # ascii for 0 if (bad.sum() > 0): bad_list = bad_list + [(i,j)] sublist[j] = bytes(np_data) new_list[i] = np.array(sublist) new_dataset = tf.data.Dataset.from_tensor_slices(new_list) max_tokens = 20000 #%% Now we can create our text vectorization layer and call adapt() tv = TextVectorization(max_tokens=max_tokens, output_mode="int") tv.adapt(new_dataset) #%% Creating training, validation, test datasets max_tokens = 20000 text_vectorization = TextVectorization(max_tokens=max_tokens, output_mode="int") text_only_train_ds = train_ds.map(lambda x, y: x) text_vectorization.adapt(text_only_train_ds) int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y)) int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y)) int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y)) #%% Code to load pre-trained GloVe embeddings, glove_file = "../../../../../home/cse4392_data/glove.6b.100d.txt" embeddings_index = {} with open(glove_file, encoding='utf-8') as f: for line in f: word, coefs = line.split(maxsplit=1) coefs = np.fromstring(coefs, "f", sep=" ") embeddings_index[word] = coefs print(f"Found {len(embeddings_index)} word vectors.") #%% This code creates an embedding layer that matches the vocabulary # of the text vectorization layer we just created, and uses the # vectors from the GloVe file. embedding_dim = embeddings_index["hello"].shape[0] vocabulary = tv.get_vocabulary() word_index = dict(zip(vocabulary, range(len(vocabulary)))) embedding_matrix = np.zeros((max_tokens, embedding_dim)) lengths = np.zeros((len(word_index))) for word, i in word_index.items(): # for every word in our vocabulary from the reviews dataset if i < max_tokens: embedding_vector = embeddings_index.get(word) # look up the GloVe vector for that word if embedding_vector is not None: if (embedding_vector.shape[0] == embedding_dim): embedding_matrix[i] = embedding_vector # store that vector # here we create the actual Embedding layer, using embedding_matrix (whose values # we set in previous loop) to initialize the weights of the layer. 
# Here we create the actual Embedding layer, using embedding_matrix
# (whose values we set in the previous loop) to initialize the weights
# of the layer. Then we set trainable=False, to make sure that the
# pretrained GloVe vectors do not change during training.
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

#%% Here we create the RNN model using the GloVe embeddings.
# Training takes about 30 minutes on my computer (3 minutes per epoch).
# You can skip this step, and instead download the trained model from
# "glove_embeddings_sequence_model.keras" on the course website.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

filename = "glove_embeddings_sequence_model.keras"
callbacks = [
    keras.callbacks.ModelCheckpoint(filename, save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds,
          epochs=10, callbacks=callbacks)

#%% Here we evaluate the test accuracy of the model. It is about 80%,
# not as good as the accuracy we get when we learn embeddings from our
# own dataset.
filename = "glove_embeddings_sequence_model.keras"
model = keras.models.load_model(filename)
(test_loss, test_acc) = model.evaluate(int_test_ds)
print("Test accuracy: %.2f%%" % (test_acc * 100))

#%% These functions compute the difference and distance between
# the embeddings of two words.
filename = "glove_embeddings_sequence_model.keras"
model = keras.models.load_model(filename)

def we_diff(model, tv_layer, s1, s2):
    # Sub-model consisting of the input and embedding layers, which
    # maps token IDs to their embedding vectors.
    em_model = keras.Sequential(model.layers[0:2])
    v1 = em_model(tv_layer([s1]))
    v1 = v1[0, 0, :]
    v2 = em_model(tv_layer([s2]))
    v2 = v2[0, 0, :]
    diff = v2 - v1
    return diff

def we_distance(model, tv_layer, s1, s2):
    diff = we_diff(model, tv_layer, s1, s2)
    dist = np.linalg.norm(diff)
    print("distance from \"%s\" to \"%s\" = %.2f" % (s1, s2, dist))
    return dist

#%% Printing the distance between various pairs of words, using
# the GloVe embeddings.
filename = "glove_embeddings_sequence_model.keras"
glove_model = keras.models.load_model(filename)

we_distance(glove_model, tv, "great", "excellent")
we_distance(glove_model, tv, "awful", "horrible")
we_distance(glove_model, tv, "great", "awful")
we_distance(glove_model, tv, "excellent", "horrible")
we_distance(glove_model, tv, "big", "large")
we_distance(glove_model, tv, "big", "small")

#%%
we_distance(glove_model, tv, "great", "excellent")
we_distance(glove_model, tv, "awful", "horrible")
we_distance(glove_model, tv, "great", "awful")
we_distance(glove_model, tv, "excellent", "horrible")

#%%
we_distance(glove_model, tv, "buy", "purchase")
we_distance(glove_model, tv, "buy", "shop")
we_distance(glove_model, tv, "buy", "swim")
we_distance(glove_model, tv, "buy", "study")

#%%
we_distance(glove_model, tv, "obtain", "acquire")
we_distance(glove_model, tv, "obtain", "swim")
we_distance(glove_model, tv, "acquire", "swim")

#%%
we_distance(glove_model, tv, "big", "large")
we_distance(glove_model, tv, "big", "small")

#%%
we_distance(glove_model, tv, "movie", "film")
we_distance(glove_model, tv, "movie", "shark")
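
#%% A further illustration (a sketch, not part of the original textbook
# code): the difference vectors computed by we_diff can also be used
# for simple word analogies. Starting from the embedding of a query
# word, we add the s1 -> s2 direction (e.g. "man" -> "woman") and look
# for the nearest vocabulary words in the frozen GloVe embedding
# matrix. The helper name we_analogy is hypothetical.
def we_analogy(model, tv_layer, s1, s2, s3, top_k=5):
    # Direction from s1 to s2, e.g. from "man" to "woman".
    direction = we_diff(model, tv_layer, s1, s2).numpy()
    # Embedding of the query word s3.
    em_model = keras.Sequential(model.layers[0:2])
    v3 = em_model(tv_layer([s3]))[0, 0, :].numpy()
    target = v3 + direction
    # Weights of the frozen Embedding layer: shape (max_tokens, dim).
    weights = model.layers[1].get_weights()[0]
    dists = np.linalg.norm(weights - target, axis=1)
    vocab = tv_layer.get_vocabulary()
    # Report the top_k vocabulary words closest to the target vector.
    for idx in np.argsort(dists)[:top_k]:
        if idx < len(vocab):
            print("%-15s %.2f" % (vocab[idx], dists[idx]))

we_analogy(glove_model, tv, "man", "woman", "king")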