#%%

"""
Credits: This code is adapted from the textbook "Deep Learning with Python", 
2nd Edition, by François Chollet. 
"""

#%%

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization


#%% Load training, validation, and test set for the Large Movie Review Dataset.
#   Each of these sets will be a BatchDataset object.

batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size)

val_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size)

test_ds = keras.utils.text_dataset_from_directory(
    "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size)

#%% Here we use bigrams, but we count the number of 
#   times each token appears, as opposed to outputting a multi-hot vector.

text_vectorization = TextVectorization(max_tokens=20000, ngrams=2,
                                       output_mode="count")

text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
count_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
count_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
count_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


#%% Training a dense model for bigrams and count-of-tokens vectors

max_tokens = 20000
model = keras.Sequential([keras.Input(shape=(max_tokens,)),
                          layers.Dense(16, activation="tanh"),
                          layers.Dropout(0.5),
                          layers.Dense(1, activation="sigmoid")])

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("count_2gram_count.keras",
                                             save_best_only=True)]
model.fit(count_2gram_train_ds.cache(),
          validation_data=count_2gram_val_ds.cache(),
          epochs=10, callbacks=callbacks)

model = keras.models.load_model("count_2gram_count.keras")
(test_loss, test_acc) = model.evaluate(count_2gram_test_ds)
print("Test accuracy: %.2f%%" % (test_acc*100))

#%% Here we use bigrams, and we apply TF-IDF normalization

text_vectorization = TextVectorization(max_tokens=20000, ngrams=2,
                                       output_mode="tf_idf")

text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
tfidf_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
tfidf_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
tfidf_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


#%% Training a dense model for bigrams and TF-IDF vectors

max_tokens = 20000
model = keras.Sequential([keras.Input(shape=(max_tokens,)),
                          layers.Dense(16, activation="tanh"),
                          layers.Dropout(0.5),
                          layers.Dense(1, activation="sigmoid")])

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("tfidf_2gram_count.keras",
                                             save_best_only=True)]
model.fit(tfidf_2gram_train_ds.cache(),
          validation_data=tfidf_2gram_val_ds.cache(),
          epochs=10, callbacks=callbacks)

model = keras.models.load_model("tfidf_2gram_count.keras")
(test_loss, test_acc) = model.evaluate(tfidf_2gram_test_ds)
print("Test accuracy for TF-IDF: %.2f%%" % (test_acc*100))

#%%

inputs = keras.Input(shape=(1,), dtype="string")
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs, outputs)