#%% """ Credits: This code is adapted from the textbook "Deep Learning with Python", 2nd Edition, by François Chollet. """ """ This version uses text vectorization with no fixed sequence length. Otherwise it is the same as erb_rnn_train1.py """ #%% import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers from tensorflow.keras.layers import TextVectorization import random import string import re from tv_to_file import * #%% train_pairs = load_pairs("eng_spa_train.txt") val_pairs = load_pairs("eng_spa_val.txt") test_pairs = load_pairs("eng_spa_test.txt") #%% Create text vectorization layers for English text and for Spanish text. strip_chars = string.punctuation + "¿¡" strip_chars = strip_chars.replace("[", "") strip_chars = strip_chars.replace("]", "") def custom_standardization(input_string): lowercase = tf.strings.lower(input_string) return tf.strings.regex_replace( lowercase, f"[{re.escape(strip_chars)}]", "") vocab_size = 15000 source_vectorization = layers.TextVectorization(max_tokens=vocab_size, output_mode="int") target_vectorization = layers.TextVectorization(max_tokens=vocab_size, output_mode="int", standardize=custom_standardization) train_english_texts = [pair[0] for pair in train_pairs] train_spanish_texts = [pair[1] for pair in train_pairs] source_vectorization.adapt(train_english_texts) target_vectorization.adapt(train_spanish_texts) #%% filename = "eng_spa_rnn.keras" seq2seq_rnn = keras.models.load_model(filename) spa_vocab = target_vectorization.get_vocabulary() spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab)) max_decoded_sentence_length = 20 def decode_sequence(input_sentence): tokenized_input_sentence = source_vectorization([input_sentence]) decoded_sentence = "[start]" for i in range(max_decoded_sentence_length): tokenized_target_sentence = target_vectorization([decoded_sentence]) next_token_predictions = seq2seq_rnn.predict( [tokenized_input_sentence, tokenized_target_sentence]) sampled_token_index = np.argmax(next_token_predictions[0, i, :]) sampled_token = spa_index_lookup[sampled_token_index] decoded_sentence += " " + sampled_token if sampled_token == "[end]": break return decoded_sentence test_eng_texts = [pair[0] for pair in test_pairs] #%% input_sentence = random.choice(test_eng_texts) print(input_sentence) print(decode_sequence(input_sentence)) #%% input_sentence = "I did not like this movie at all" print(input_sentence) print(decode_sequence(input_sentence))