library(tensorflow)
library(keras)
library(data.table)
library(tfdatasets)
library(tfaddons)

# Preprocessing -----------------------------------------------------------

# Downloads and unzips one of the bilingual datasets offered at
# http://www.manythings.org/anki/ into a directory "data" (if not already present).
# This example translates English to Dutch.
download_data = function() {
  if (!dir.exists('data')) {
    dir.create('data')
  }
  if (!file.exists('data/nld-eng.zip')) {
    download.file('http://www.manythings.org/anki/nld-eng.zip',
                  destfile = file.path("data", basename('nld-eng.zip')))
    unzip('data/nld-eng.zip', exdir = 'data')
  }
}

download_data()

filepath <- file.path("data", "nld.txt")

df = data.table::fread(filepath,
                       header = FALSE,
                       encoding = 'UTF-8',
                       select = c(1, 2),
                       nrows = -1)

# Clean a vector of sentences and wrap each one in <start> ... <end> markers.
text_cleaner <- function(text) {
  text %>%
    # replace non-ASCII characters
    textclean::replace_non_ascii() %>%
    # remove all non-relevant symbols (letters, spaces, and apostrophes are retained)
    textclean::strip(apostrophe.remove = TRUE) %>%
    paste('<start>', ., '<end>')
}

df = sapply(1:2, function(x) text_cleaner(df[[x]])) %>% as.data.table()

# Fit a tokenizer, convert the sentences to integer sequences, and pad them.
# Returns the vocabulary size, the padded sequence matrix, and the tokenizer.
text_tok <- function(text) {
  tokenizer = text_tokenizer(filters = '')
  tokenizer %>% fit_text_tokenizer(text)
  # word indices run from 1 to length(word_index); +1 accounts for the
  # padding index 0 so that embedding and output layers cover all indices
  vocab_size = length(tokenizer$word_index) + 1L
  data = tokenizer %>%
    texts_to_sequences(text) %>%
    pad_sequences(padding = 'post')
  list(vocab_size, data, tokenizer)
}

c(input_vocab_size, data_en, tokenizer_en) %<-% text_tok(df[['V1']])
c(output_vocab_size, data_de, tokenizer_de) %<-% text_tok(df[['V2']])

# Split the dataset into 80% train / 20% test
indices_to_take = sample.int(n = nrow(df), size = floor(0.8 * nrow(df)), replace = FALSE)

split_data <- function(data) {
  list(data[indices_to_take, ], data[-indices_to_take, ])
}

c(en_train, en_test, de_train, de_test) %<-% c(split_data(data_en), split_data(data_de))

rm(df, filepath, indices_to_take, download_data, split_data, text_cleaner, text_tok)

# Hyperparameters and input pipeline --------------------------------------

batch_size = 64L
buffer_size = nrow(en_train)
steps_per_epoch = buffer_size %/% batch_size
embedding_dims = 256L
rnn_units = 1024L
dense_units = 1024L
dtype = tf$float32 # used to initialize the decoder cell's zero state

dataset = tensor_slices_dataset(list(en_train, de_train)) %>%
  dataset_shuffle(buffer_size) %>%
  dataset_batch(batch_size, drop_remainder = TRUE)
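# Optional sanity check (a minimal sketch, not needed for training): inspect the
# shapes of the padded matrices and decode the first training pair back to text
# with the fitted tokenizers to verify the cleaning and tokenization steps.
dim(en_train)   # n_train x max English sequence length
dim(de_train)   # n_train x max Dutch sequence length

first_en = en_train[1, ]
first_de = de_train[1, ]
tokenizer_en$sequences_to_texts(list(first_en[first_en != 0]))
tokenizer_de$sequences_to_texts(list(first_de[first_de != 0]))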
# Model --------------------------------------------------------------------

EncoderNetwork = reticulate::PyClass(
  'EncoderNetwork',
  inherit = tf$keras$Model,
  defs = list(
    `__init__` = function(self, input_vocab_size, embedding_dims, rnn_units) {
      super()$`__init__`()
      self$encoder_embedding = layer_embedding(input_dim = input_vocab_size,
                                               output_dim = embedding_dims)
      self$encoder_rnnlayer = layer_lstm(units = rnn_units,
                                         return_sequences = TRUE,
                                         return_state = TRUE)
      NULL
    }
  )
)

DecoderNetwork = reticulate::PyClass(
  'DecoderNetwork',
  inherit = tf$keras$Model,
  defs = list(
    `__init__` = function(self, output_vocab_size, embedding_dims, rnn_units) {
      super()$`__init__`()
      self$decoder_embedding = layer_embedding(input_dim = output_vocab_size,
                                               output_dim = embedding_dims)
      self$dense_layer = layer_dense(units = output_vocab_size)
      self$decoder_rnncell = tf$keras$layers$LSTMCell(rnn_units)
      # Sampler used for teacher forcing during training
      self$sampler = sampler_training()
      # Create the attention mechanism with memory = NULL; the encoder output
      # is plugged in later via setup_memory()
      self$attention_mechanism = self$build_attention_mechanism(
        dense_units, NULL, rep(ncol(data_en), batch_size))
      self$rnn_cell = self$build_rnn_cell(batch_size)
      self$decoder = decoder_basic(cell = self$rnn_cell,
                                   sampler = self$sampler,
                                   output_layer = self$dense_layer)
      NULL
    },
    build_attention_mechanism = function(self, units, memory, memory_sequence_length) {
      attention_luong(units = units,
                      memory = memory,
                      memory_sequence_length = memory_sequence_length)
    },
    # Wrap the decoder LSTM cell with the attention mechanism
    build_rnn_cell = function(self, batch_size) {
      attention_wrapper(cell = self$decoder_rnncell,
                        attention_mechanism = self$attention_mechanism,
                        attention_layer_size = dense_units)
    },
    # Zero AttentionWrapperState whose cell state is replaced by the encoder's final state
    build_decoder_initial_state = function(self, batch_size, encoder_state, dtype) {
      decoder_initial_state = self$rnn_cell$get_initial_state(batch_size = batch_size,
                                                              dtype = dtype)
      decoder_initial_state$clone(cell_state = encoder_state)
    }
  )
)

encoderNetwork = EncoderNetwork(input_vocab_size, embedding_dims, rnn_units)
decoderNetwork = DecoderNetwork(output_vocab_size, embedding_dims, rnn_units)

optimizer = tf$keras$optimizers$Adam()

# Masked loss: padding positions (y == 0) do not contribute
loss_function <- function(y_pred, y) {
  # shape of y      [batch_size, Ty]
  # shape of y_pred [batch_size, Ty, output_vocab_size]
  # the dense layer outputs raw scores, hence from_logits = TRUE
  loss = keras::loss_sparse_categorical_crossentropy(y, y_pred, from_logits = TRUE)
  # mask is 0 where y == 0 (padding), 1 elsewhere
  mask = tf$logical_not(tf$math$equal(y, 0L))
  mask = tf$cast(mask, dtype = loss$dtype)
  loss = mask * loss
  tf$reduce_mean(loss)
}

train_step <- function(input_batch, output_batch, encoder_initial_cell_state) {
  loss = 0
  with(tf$GradientTape() %as% tape, {
    encoder_emb_inp = encoderNetwork$encoder_embedding(input_batch)
    # encoder returns [all step activations, last activation, last memory state];
    # the final states are passed on to the decoder
    c(a, a_tx, c_tx) %<-% encoderNetwork$encoder_rnnlayer(encoder_emb_inp,
                                                          initial_state = encoder_initial_cell_state)

    # Prepare the decoder input & target sequences (teacher forcing):
    # the input drops <end>, the target drops <start>
    decoder_input = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 1:(ncol(data_de) - 1)])
    decoder_output = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 2:ncol(data_de)])

    # Decoder embeddings
    decoder_emb_inp = decoderNetwork$decoder_embedding(decoder_input)

    # Point the attention mechanism at the encoder output and build the initial
    # AttentionWrapperState from the encoder's final state
    decoderNetwork$attention_mechanism$setup_memory(a)
    decoder_initial_state = decoderNetwork$build_decoder_initial_state(batch_size,
                                                                       encoder_state = list(a_tx, c_tx),
                                                                       dtype = tf$float32)

    # BasicDecoderOutput
    c(outputs, res1, res2) %<-% decoderNetwork$decoder(decoder_emb_inp,
                                                       initial_state = decoder_initial_state,
                                                       sequence_length = rep(ncol(data_de) - 1L, batch_size))
    logits = outputs$rnn_output

    # Calculate the masked loss
    loss = loss_function(logits, decoder_output)
  })

  # List of all trainable layer variables / weights
  variables = c(encoderNetwork$trainable_variables, decoderNetwork$trainable_variables)
  # Differentiate loss w.r.t. the variables
  gradients = tape$gradient(loss, variables)
  # grads_and_vars: list of (gradient, variable) pairs
  grads_and_vars = purrr::transpose(list(gradients, variables))
  optimizer$apply_gradients(grads_and_vars)
  loss
}

# Zero initial state (h, c) for the encoder LSTM
initialize_initial_state = function() {
  list(tf$zeros(c(batch_size, rnn_units)),
       tf$zeros(c(batch_size, rnn_units)))
}

# Training loop -------------------------------------------------------------

epochs = 1

for (epoch in seq_len(epochs)) {
  encoder_initial_cell_state = initialize_initial_state()
  total_loss = 0

  res = dataset %>%
    dataset_take(steps_per_epoch) %>%
    iterate()

  for (batch in seq_along(res)) {
    c(input_batch, output_batch) %<-% res[[batch]]
    batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
    total_loss = total_loss + batch_loss
    if (batch %% 5 == 0) {
      print(paste('batch loss:', batch_loss$numpy(), 'epoch', epoch, 'batch', batch))
    }
  }
}
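# Evaluation sketch (not part of the training loop above): evaluate_split() is a
# hypothetical helper that reuses the trained networks in teacher-forcing mode,
# mirroring train_step() without the gradient tape, and reports the average
# masked loss on the held-out pairs.
evaluate_split <- function(input_data, output_data) {
  # drop_remainder keeps the fixed batch_size that the attention setup assumes
  eval_dataset = tensor_slices_dataset(list(input_data, output_data)) %>%
    dataset_batch(batch_size, drop_remainder = TRUE)
  res = eval_dataset %>% iterate()
  total_loss = 0
  for (batch in seq_along(res)) {
    c(input_batch, output_batch) %<-% res[[batch]]
    encoder_emb_inp = encoderNetwork$encoder_embedding(input_batch)
    c(a, a_tx, c_tx) %<-% encoderNetwork$encoder_rnnlayer(encoder_emb_inp,
                                                          initial_state = initialize_initial_state())
    decoder_input = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 1:(ncol(data_de) - 1)])
    decoder_output = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 2:ncol(data_de)])
    decoder_emb_inp = decoderNetwork$decoder_embedding(decoder_input)
    decoderNetwork$attention_mechanism$setup_memory(a)
    decoder_initial_state = decoderNetwork$build_decoder_initial_state(batch_size,
                                                                       encoder_state = list(a_tx, c_tx),
                                                                       dtype = tf$float32)
    c(outputs, res1, res2) %<-% decoderNetwork$decoder(decoder_emb_inp,
                                                       initial_state = decoder_initial_state,
                                                       sequence_length = rep(ncol(data_de) - 1L, batch_size))
    total_loss = total_loss + loss_function(outputs$rnn_output, decoder_output)$numpy()
  }
  total_loss / length(res)
}

evaluate_split(en_test, de_test)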