#%% """ Credits: This code is adapted from the textbook "Deep Learning with Python", 2nd Edition, by François Chollet. """ """ This version uses text vectorization with no fixed sequence length. Otherwise it is the same as erb_rnn_train1.py """ #%% import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers from tensorflow.keras.layers import TextVectorization import random import string import re from tv_to_file import * #%% text_file = "spa-eng/spa.txt" with open(text_file, encoding='utf-8') as f: lines = f.read().split("\n")[:-1] text_pairs = [] for line in lines: english, spanish = line.split("\t") spanish = "[start] " + spanish + " [end]" text_pairs.append((english, spanish)) #%% print(random.choice(text_pairs)) #%% random.shuffle(text_pairs) num_val_samples = int(0.15 * len(text_pairs)) num_train_samples = len(text_pairs) - 2 * num_val_samples train_pairs = text_pairs[:num_train_samples] val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples] test_pairs = text_pairs[num_train_samples + num_val_samples:] save_pairs("eng_spa_train.txt", train_pairs) save_pairs("eng_spa_val.txt", val_pairs) save_pairs("eng_spa_test.txt", test_pairs) #%% Create text vectorization layers for English text and for Spanish text. strip_chars = string.punctuation + "¿¡" strip_chars = strip_chars.replace("[", "") strip_chars = strip_chars.replace("]", "") def custom_standardization(input_string): lowercase = tf.strings.lower(input_string) return tf.strings.regex_replace( lowercase, f"[{re.escape(strip_chars)}]", "") vocab_size = 15000 source_vectorization = layers.TextVectorization(max_tokens=vocab_size, output_mode="int") target_vectorization = layers.TextVectorization(max_tokens=vocab_size, output_mode="int", standardize=custom_standardization) train_english_texts = [pair[0] for pair in train_pairs] train_spanish_texts = [pair[1] for pair in train_pairs] source_vectorization.adapt(train_english_texts) target_vectorization.adapt(train_spanish_texts) #%% Create Tensorflow datasets batch_size = 64 def format_dataset(eng, spa): eng = source_vectorization(eng) spa = target_vectorization(spa) return ({"english": eng, "spanish": spa[:, :-1]}, spa[:, 1:]) def make_dataset(pairs): eng_texts, spa_texts = zip(*pairs) eng_texts = list(eng_texts) spa_texts = list(spa_texts) dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts)) dataset = dataset.batch(batch_size) dataset = dataset.map(format_dataset) return dataset.shuffle(2048).prefetch(16).cache() train_ds = make_dataset(train_pairs) val_ds = make_dataset(val_pairs) #%% Define the model embed_dim = 256 latent_dim = 1024 source = keras.Input(shape=(None,), dtype="int64", name="english") x1 = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source) encoded_source = layers.Bidirectional(layers.GRU(latent_dim), merge_mode="sum")(x1) past_target = keras.Input(shape=(None,), dtype="int64", name="spanish") x2 = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target) decoder_gru = layers.GRU(latent_dim, return_sequences=True) x3 = decoder_gru(x2, initial_state=encoded_source) x4 = layers.Dropout(0.5)(x3) target_next_step = layers.Dense(vocab_size, activation="softmax")(x4) seq2seq_rnn = keras.Model([source, past_target], target_next_step) filename = "eng_spa_rnn2_temp.keras" callbacks = [keras.callbacks.ModelCheckpoint(filename, save_best_only=True)] seq2seq_rnn.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]) #%% Train the model # 
#%% Train the model.
# Takes about 35 minutes per epoch.
seq2seq_rnn.fit(train_ds,
                epochs=15,
                validation_data=val_ds,
                callbacks=callbacks)

#%%
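# Optional: reload the checkpoint saved by ModelCheckpoint and evaluate it on
# the held-out test pairs. A minimal sketch, assuming training above finished
# and wrote eng_spa_rnn2_temp.keras; the vectorization happens in the tf.data
# pipeline, so the saved model needs no custom objects to load.
test_ds = make_dataset(test_pairs)
best_model = keras.models.load_model(filename)
test_loss, test_acc = best_model.evaluate(test_ds)
print(f"Test loss: {test_loss:.4f}, test accuracy: {test_acc:.4f}")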