#%% """ Credits: This code is adapted from the textbook "Deep Learning with Python", 2nd Edition, by François Chollet. """ #%% import numpy as np from tensorflow import keras from tensorflow.keras import layers from tensorflow.keras.layers import TextVectorization #%% Load training, validation, and test set for the Large Movie Review Dataset. # Each of these sets will be a BatchDataset object. batch_size = 32 train_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/train", batch_size=batch_size) val_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/val", batch_size=batch_size) test_ds = keras.utils.text_dataset_from_directory( "../../../../../home/cse4392_data/20_text/aclImdb/test", batch_size=batch_size) #%% Here we use bigrams, but we count the number of # times each token appears, as opposed to outputting a multi-hot vector. text_vectorization = TextVectorization(max_tokens=20000, ngrams=2, output_mode="count") text_only_train_ds = train_ds.map(lambda x, y: x) text_vectorization.adapt(text_only_train_ds) count_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y)) count_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y)) count_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y)) #%% Training a dense model for bigrams and count-of-tokens vectors max_tokens = 20000 model = keras.Sequential([keras.Input(shape=(max_tokens,)), layers.Dense(16, activation="tanh"), layers.Dropout(0.5), layers.Dense(1, activation="sigmoid")]) model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"]) model.summary() callbacks = [keras.callbacks.ModelCheckpoint("count_2gram_count.keras", save_best_only=True)] model.fit(count_2gram_train_ds.cache(), validation_data=count_2gram_val_ds.cache(), epochs=10, callbacks=callbacks) model = keras.models.load_model("count_2gram_count.keras") (test_loss, test_acc) = model.evaluate(count_2gram_test_ds) print("Test accuracy: %.2f%%" % (test_acc*100)) #%% Here we use bigrams, and we apply TF-IDF normalization text_vectorization = TextVectorization(max_tokens=20000, ngrams=2, output_mode="tf_idf") text_only_train_ds = train_ds.map(lambda x, y: x) text_vectorization.adapt(text_only_train_ds) tfidf_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y)) tfidf_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y)) tfidf_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y)) #%% Training a dense model for bigrams and TF-IDF vectors max_tokens = 20000 model = keras.Sequential([keras.Input(shape=(max_tokens,)), layers.Dense(16, activation="tanh"), layers.Dropout(0.5), layers.Dense(1, activation="sigmoid")]) model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"]) model.summary() callbacks = [keras.callbacks.ModelCheckpoint("tfidf_2gram_count.keras", save_best_only=True)] model.fit(tfidf_2gram_train_ds.cache(), validation_data=tfidf_2gram_val_ds.cache(), epochs=10, callbacks=callbacks) model = keras.models.load_model("tfidf_2gram_count.keras") (test_loss, test_acc) = model.evaluate(tfidf_2gram_test_ds) print("Test accuracy for TF-IDF: %.2f%%" % (test_acc*100)) #%% inputs = keras.Input(shape=(1,), dtype="string") processed_inputs = text_vectorization(inputs) outputs = model(processed_inputs) inference_model = keras.Model(inputs, outputs)