#%%
"""
Credits: This code is adapted from the textbook "Deep Learning with Python",
2nd Edition, by François Chollet.
"""

#%%
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization


#%%
def flatten_tf_text_dataset(tf_dataset):
    """Flatten a (possibly batched) tf.data.Dataset of strings into a plain
    Python list of str.

    Each element of the dataset is assumed to be a batch of string tensors
    (as produced by e.g. text_dataset_from_directory) — TODO confirm against
    the callers.
    """
    result = []
    for batch in tf_dataset:
        for item in batch:
            # Decode the raw bytes directly.  The original used
            # str(item.numpy())[2:-1] to strip the b'...' repr, which keeps
            # backslash escapes and corrupts any non-ASCII text.
            result.append(item.numpy().decode("utf-8"))
    return result


def _ensure_tv_extension(filename):
    """Return *filename* with a ".tv" extension appended if not already present.

    The original check was `filename[-3:-1] != ".tv"`, which compares a
    two-character slice against a three-character string — always true, so
    the extension was duplicated on names that already ended in ".tv".
    """
    return filename if filename.endswith(".tv") else filename + ".tv"


def save_text_vectorization(filename, tv_layer, training_text):
    """Save a TextVectorization layer to *filename* (".tv" appended if missing).

    The file holds four header lines (max_tokens, ngrams, output_mode,
    output_sequence_length from the layer's config) followed by one line per
    training text, which load_text_vectorization re-adapts on.
    """
    filename = _ensure_tv_extension(filename)
    config = tv_layer.get_config()
    print("saving text vectorization layer to %s" % (filename))
    # Context manager guarantees the file is closed even if a write fails.
    with open(filename, "w", encoding="utf-8") as f:
        print(config["max_tokens"], file=f)
        print(config["ngrams"], file=f)
        print(config["output_mode"], file=f)
        print(config["output_sequence_length"], file=f)
        for text in training_text:
            print(text, file=f)


def read_int_line(f):
    """Read one line from file object *f* and parse it as an int.

    Returns None when the line is the literal string "None" (the form
    save_text_vectorization writes for absent config values).
    """
    line = f.readline().strip()
    return None if line == "None" else int(line)


def load_text_vectorization(filename):
    """Rebuild a TextVectorization layer from a file written by
    save_text_vectorization.  Returns the adapted layer.
    """
    filename = _ensure_tv_extension(filename)
    print("loading text vectorization layer from %s" % (filename))
    with open(filename, encoding="utf-8") as f:
        max_tokens = read_int_line(f)
        ngrams = read_int_line(f)
        output_mode = f.readline().strip()
        output_sequence_length = read_int_line(f)
        # The rest of the file is the training text the layer was adapted on.
        text_lines = f.readlines()
    result = TextVectorization(max_tokens=max_tokens,
                               ngrams=ngrams,
                               output_mode=output_mode,
                               output_sequence_length=output_sequence_length)
    result.adapt(text_lines)
    return result


def set_tv_vocabulary(tv_layer, tf_dataset, filename):
    """Adapt *tv_layer* on the flattened text of *tf_dataset* and persist the
    result to *filename* via save_text_vectorization."""
    flattened_tf_text = flatten_tf_text_dataset(tf_dataset)
    tv_layer.adapt(flattened_tf_text)
    save_text_vectorization(filename, tv_layer, flattened_tf_text)


# verifies whether the text vectorization layer tv_layer1 and
# text vectorization layer tv_layer2 map text to the same vector.
def compare_tv_layers(tv_layer1, tv_layer2, text):
    """Verify whether tv_layer1 and tv_layer2 map *text* to the same vector.

    Prints the two embedding shapes and element-wise agree/differ counts,
    then returns True iff the embeddings are identical everywhere.
    """
    original_embedding = tv_layer1([text]).numpy()
    stored_embedding = tv_layer2([text]).numpy()
    print("original_embedding shape:", original_embedding.shape)
    print("stored_embedding shape:", stored_embedding.shape)
    total = (original_embedding == stored_embedding).sum()
    diffs = (original_embedding != stored_embedding).sum()
    print("The two embeddings agree in %d places" % (total))
    print("The two embeddings differ in %d places" % (diffs))
    return (diffs == 0)


# verifies whether the text vectorization layer tv_layer and the
# text vectorization layer stored in filename map text to the same vector.
def verify_tv_file(tv_layer, filename, text):
    # load_text_vectorization returns a single TextVectorization layer;
    # the original 2-tuple unpacking here raised TypeError on every call.
    stored_tv = load_text_vectorization(filename)
    return compare_tv_layers(tv_layer, stored_tv, text)


def save_pairs(filename, pairs):
    """Write string pairs to *filename*, one tab-separated pair per line."""
    with open(filename, "w", encoding="utf-8") as f:
        for pair in pairs:
            print(pair[0] + "\t" + pair[1], file=f)


def load_pairs(filename):
    """Read tab-separated pairs written by save_pairs.

    Returns a list of [left, right] lists.  Lines that do not split into
    exactly two fields are reported on stdout but still included in the
    result, preserving the original one-entry-per-line contract.
    """
    with open(filename, encoding="utf-8") as f:
        lines = f.readlines()
    result = []
    for raw in lines:
        line = raw.strip()
        pair = line.split('\t')
        if len(pair) != 2:
            # The original used '&' instead of '%' here, which raised
            # TypeError instead of printing the diagnostic.
            print("failed to parse this line:\n%s" % (line))
            print(pair)
        result.append(pair)
    return result