Translate Java to C# using T5

This tutorial contains complete code to fine-tune T5 for sequence-to-sequence translation on the CodeXGLUE code-to-code dataset. In addition to training a model, you will learn how to preprocess the source code into an appropriate format.

In this notebook, you will:

  • Load the CodeXGLUE code-to-code dataset from HuggingFace

  • Load T5 Model using tf-transformers

  • Build train and validation datasets with on-the-fly feature preparation, using the tokenizer from tf-transformers

  • Train your own model, fine-tuning T5 as part of that

  • Evaluate BLEU on the generated text

  • Save your model and use it to translate Java code to C#

  • Use the end-to-end model (preprocessing + inference) in a production setup

If you’re new to working with the CodeXGLUE dataset, please see CodeXGLUE for more details.

!pip install tf-transformers

!pip install sentencepiece

!pip install tensorflow-text

!pip install transformers

!pip install wandb

!pip install datasets

!pip install sacrebleu
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Suppress TF warnings

import tensorflow as tf
import tensorflow_text as tf_text
import datasets
import tqdm
import wandb

print("Tensorflow version", tf.__version__)
print("Tensorflow text version", tf_text.__version__)
print("Devices", tf.config.list_physical_devices())

from tf_transformers.models import T5Model, T5TokenizerTFText
from tf_transformers.core import Trainer
from tf_transformers.optimization import create_optimizer
from tf_transformers.losses import cross_entropy_loss_label_smoothing
from tf_transformers.text import TextDecoder
Tensorflow version 2.7.0
Tensorflow text version 2.7.0
Devices [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

Load Model, Optimizer, and Trainer

Our Trainer expects the model, the optimizer, and the loss to be provided as callables; wrapping them in functions lets the Trainer instantiate them inside the correct distribution-strategy scope.

# Load Model
def get_model(model_name, is_training, use_dropout):
  """Get Model"""

  def model_fn():
    # is_training and use_dropout are accepted for API symmetry but
    # unused here; the model loads with its default configuration.
    model = T5Model.from_pretrained(model_name)
    return model
  return model_fn

# Load Optimizer
def get_optimizer(learning_rate, examples, batch_size, epochs, use_constant_lr=False):
    """Get optimizer"""
    steps_per_epoch = int(examples / batch_size)
    num_train_steps = steps_per_epoch * epochs
    warmup_steps = int(0.1 * num_train_steps)

    def optimizer_fn():
        # create_optimizer also returns the learning-rate schedule,
        # which the Trainer does not need here.
        optimizer, learning_rate_fn = create_optimizer(learning_rate, num_train_steps, warmup_steps, use_constant_lr=use_constant_lr)
        return optimizer

    return optimizer_fn

# Load trainer
def get_trainer(distribution_strategy, num_gpus=0, tpu_address=None):
    """Get Trainer"""
    trainer = Trainer(distribution_strategy, num_gpus=num_gpus, tpu_address=tpu_address)
    return trainer

# Load loss
def loss_fn(y_true_dict, y_pred_dict, smoothing=0.1):
    """Cross-entropy loss with label smoothing, masked by labels_mask."""
    loss = cross_entropy_loss_label_smoothing(labels=y_true_dict['labels'],
                                              logits=y_pred_dict['token_logits'],
                                              smoothing=smoothing,
                                              label_weights=y_true_dict['labels_mask'])
    return {'loss': loss}
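
For intuition, label smoothing mixes the one-hot target with a uniform distribution over the vocabulary, so the model is never pushed to assign probability 1.0 to a single token. A minimal sketch of the same idea using the built-in Keras loss (the toy logits and the vocabulary size of 4 are hypothetical):

# Toy example: one token position, vocabulary of 4 tokens.
logits = tf.constant([[2.0, 0.5, 0.1, -1.0]])   # hypothetical model outputs
one_hot = tf.constant([[1.0, 0.0, 0.0, 0.0]])   # the true token is id 0

# With label_smoothing=0.1, the target becomes one_hot * 0.9 + 0.1 / 4.
cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1)
print(cce(one_hot, logits).numpy())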

Prepare Data for Training

We will make use of a TensorFlow Text based tokenizer to do on-the-fly preprocessing, without the overhead of pre-preparing the data as pickle, numpy, or TFRecord files.

# Load dataset
def load_dataset(dataset, tokenizer_layer, max_seq_len, batch_size, drop_remainder):
    """
    Args:
      dataset; HuggingFace dataset
      tokenizer_layer: tf-transformers tokenizer
      max_seq_len: int (maximum sequence length of text)
      batch_size: int (batch_size)
      drop_remainder: bool (to drop remaining batch_size, when its uneven)
    """
    def parse(item):
        # Encoder inputs
        encoder_input_ids = tokenizer_layer({'text': [item['java']]})
        encoder_input_ids = encoder_input_ids.merge_dims(-2, 1)
        encoder_input_ids = encoder_input_ids[:max_seq_len-1]
        encoder_input_ids = tf.concat([encoder_input_ids, [tokenizer_layer.eos_token_id]], axis=0)

        # Decoder inputs
        decoder_input_ids = tokenizer_layer({'text': [item['cs']]})
        decoder_input_ids = decoder_input_ids.merge_dims(-2, 1)
        decoder_input_ids = decoder_input_ids[:max_seq_len-2]
        decoder_input_ids = tf.concat([[tokenizer_layer.pad_token_id] , decoder_input_ids, [tokenizer_layer.eos_token_id]], axis=0)


        encoder_input_mask = tf.ones_like(encoder_input_ids)
        labels = decoder_input_ids[1:]
        labels_mask = tf.ones_like(labels)
        decoder_input_ids = decoder_input_ids[:-1]

        result = {}
        result['encoder_input_ids'] = encoder_input_ids
        result['encoder_input_mask'] = encoder_input_mask
        result['decoder_input_ids'] = decoder_input_ids

        labels_dict = {}
        labels_dict['labels'] = labels
        labels_dict['labels_mask'] = labels_mask
        return result, labels_dict

    tfds_dict = dataset.to_dict()
    tfdataset = tf.data.Dataset.from_tensor_slices(tfds_dict).shuffle(100)

    tfdataset = tfdataset.map(parse, num_parallel_calls=tf.data.AUTOTUNE)
    tfdataset = tfdataset.padded_batch(batch_size, drop_remainder=drop_remainder)

    # Shard
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO
    tfdataset = tfdataset.with_options(options)
    
    return tfdataset
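
Note how parse() builds teacher-forcing inputs: the decoder input is the C# token sequence with the pad token prepended (T5 uses the pad token as its decoder start token), and the labels are the same sequence shifted left by one position, so at each step the model predicts the next token. labels_mask ensures that positions padded later by padded_batch do not contribute to the loss.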

Prepare Dataset

  1. Set necessary hyperparameters.

  2. Prepare train dataset, validation dataset.

  3. Load model, optimizer, loss and trainer.

# Data configs
dataset_name = 'code_x_glue_cc_code_to_code_trans'
model_name  = 't5-small'
max_seq_len = 256
batch_size  = 32

# Model configs
learning_rate = 1e-4
epochs = 10
model_checkpoint_dir = 'MODELS/t5_code_to_code'

# Load HF dataset
dataset = datasets.load_dataset(dataset_name)
# Load tokenizer from tf-transformers
tokenizer_layer = T5TokenizerTFText.from_pretrained(model_name)
# Train Dataset
train_dataset = load_dataset(dataset['train'], tokenizer_layer, max_seq_len, batch_size, drop_remainder=True)
# Validation Dataset
validation_dataset = load_dataset(dataset['test'], tokenizer_layer, max_seq_len, batch_size, drop_remainder=False)

# Total train examples
total_train_examples = dataset['train'].num_rows
steps_per_epoch = total_train_examples // batch_size

# model
model_fn = get_model(model_name, is_training=True, use_dropout=True)
# optimizer
optimizer_fn = get_optimizer(learning_rate, total_train_examples, batch_size, epochs, use_constant_lr=True)
# trainer
trainer = get_trainer(distribution_strategy='mirrored', num_gpus=1)
WARNING:datasets.builder:Reusing dataset code_x_glue_cc_code_to_code_trans (/home/jovyan/.cache/huggingface/datasets/code_x_glue_cc_code_to_code_trans/default/0.0.0/86dd57d2b1e88c6e589646133b76f2fef9d56c82e933d7f276e8a5b60ab18c34)
INFO:absl:Loading t5-small tokenizer to /tmp/tftransformers_tokenizer_cache/t5-small/spiece.model
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
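
Before kicking off training, it can help to sanity-check one batch to confirm the keys and shapes the model will receive (padded_batch pads each batch to its longest sequence, so shapes vary from batch to batch):

# Peek at a single batch of features and labels
for batch_inputs, batch_labels in train_dataset.take(1):
    for name, tensor in batch_inputs.items():
        print(name, tensor.shape)
    for name, tensor in batch_labels.items():
        print(name, tensor.shape)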

Wandb Configuration

project = "TUTORIALS"
display_name = "t5_code_to_code"
wandb.init(project=project, name=display_name)
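
If you have not authenticated on this machine before, run wandb.login() (or set the WANDB_API_KEY environment variable) before calling wandb.init.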

Train :-)

history = trainer.run(
    model_fn=model_fn,
    optimizer_fn=optimizer_fn,
    train_dataset=train_dataset,
    train_loss_fn=loss_fn,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    model_checkpoint_dir=model_checkpoint_dir,
    batch_size=batch_size,
    wandb=wandb
)
INFO:absl:Make sure `steps_per_epoch` should be less than or equal to number of batches in dataset.
INFO:absl:Policy: ----> float32
INFO:absl:Strategy: ---> <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7f9d941a1810>
INFO:absl:Num GPU Devices: ---> 1
INFO:absl:Successful ✅✅: Model checkpoints matched and loaded from /home/jovyan/.cache/huggingface/hub/tftransformers__t5-small.main.699b12fe9601feda4892ca82c07e800f3c1da440/ckpt-1
INFO:absl:Successful ✅: Loaded model from tftransformers/t5-small
INFO:absl:Using Constant learning rate
INFO:absl:Using Adamw optimizer
INFO:absl:No ❌❌ checkpoint found in MODELS/t5_code_to_code
Train: Epoch 1/11 --- Step 300/321 --- total examples 6400 , trainable variables 132: 100%|██████████| 3/3 [02:17<00:00, 45.81s/batch , _runtime=365, _timestamp=1.65e+9, learning_rate=0.0001, loss=2.17]
INFO:absl:Model saved at epoch 1 at MODELS/t5_code_to_code/ckpt-1

Train: Epoch 2/11 --- Step 300/321 --- total examples 16000 , trainable variables 132: 100%|██████████| 3/3 [02:02<00:00, 40.96s/batch , _runtime=490, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.926]
INFO:absl:Model saved at epoch 2 at MODELS/t5_code_to_code/ckpt-2

Train: Epoch 3/11 --- Step 300/321 --- total examples 25600 , trainable variables 132: 100%|██████████| 3/3 [02:03<00:00, 41.18s/batch , _runtime=616, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.691]
INFO:absl:Model saved at epoch 3 at MODELS/t5_code_to_code/ckpt-3

Train: Epoch 4/11 --- Step 300/321 --- total examples 35200 , trainable variables 132: 100%|██████████| 3/3 [02:02<00:00, 40.97s/batch , _runtime=742, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.575]
INFO:absl:Model saved at epoch 4 at MODELS/t5_code_to_code/ckpt-4

Train: Epoch 5/11 --- Step 300/321 --- total examples 44800 , trainable variables 132: 100%|██████████| 3/3 [02:02<00:00, 40.68s/batch , _runtime=866, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.51] 
INFO:absl:Model saved at epoch 5 at MODELS/t5_code_to_code/ckpt-5

Train: Epoch 6/11 --- Step 300/321 --- total examples 54400 , trainable variables 132: 100%|██████████| 3/3 [02:03<00:00, 41.23s/batch , _runtime=992, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.462]
INFO:absl:Model saved at epoch 6 at MODELS/t5_code_to_code/ckpt-6

Train: Epoch 7/11 --- Step 300/321 --- total examples 64000 , trainable variables 132: 100%|██████████| 3/3 [02:03<00:00, 41.30s/batch , _runtime=1118, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.418]
INFO:absl:Model saved at epoch 7 at MODELS/t5_code_to_code/ckpt-7

Train: Epoch 8/11 --- Step 300/321 --- total examples 73600 , trainable variables 132: 100%|██████████| 3/3 [02:03<00:00, 41.03s/batch , _runtime=1243, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.391]
INFO:absl:Model saved at epoch 8 at MODELS/t5_code_to_code/ckpt-8

Train: Epoch 9/11 --- Step 300/321 --- total examples 83200 , trainable variables 132: 100%|██████████| 3/3 [02:02<00:00, 40.91s/batch , _runtime=1368, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.368]
INFO:absl:Model saved at epoch 9 at MODELS/t5_code_to_code/ckpt-9

Train: Epoch 10/11 --- Step 300/321 --- total examples 92800 , trainable variables 132: 100%|██████████| 3/3 [02:03<00:00, 41.30s/batch , _runtime=1495, _timestamp=1.65e+9, learning_rate=0.0001, loss=0.348]
INFO:absl:Model saved at epoch 10 at MODELS/t5_code_to_code/ckpt-10

Load and Serialize Model for Text Generation

    1. Load T5Model with use_auto_regressive=True

# Load T5 for Auto Regressive
model = T5Model.from_pretrained(model_name, use_auto_regressive=True)
# Load from checkpoint dir
model.load_checkpoint(model_checkpoint_dir)
# Save and serialize
model.save_transformers_serialized('{}/saved_model'.format(model_checkpoint_dir), overwrite=True)
# Load model
loaded = tf.saved_model.load('{}/saved_model'.format(model_checkpoint_dir))
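
The serialized model is a standard TensorFlow SavedModel, so you can inspect what it exposes before wiring it into the text decoder:

# List the serving signatures exposed by the SavedModel
print(list(loaded.signatures.keys()))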

Evaluate BLEU Score on the Test Set

# Load decoder
decoder = TextDecoder(model=loaded)

# greedy decoding
predicted_text = []
original_text  = []
for (batch_inputs, batch_labels) in tqdm.tqdm(validation_dataset):
    
    decoder_input_ids = batch_inputs['decoder_input_ids']
    
    # While decoding we do not need decoder_input_ids; the decoder_start_token_id is taken automatically from the saved model
    del batch_inputs['decoder_input_ids']
    
    predictions = decoder.decode(batch_inputs, 
                             mode='greedy', 
                             max_iterations=max_seq_len, 
                             eos_id=tokenizer_layer.eos_token_id)
    # Decode predictions
    predicted_text_batch = tokenizer_layer._tokenizer.detokenize(tf.cast(tf.squeeze(predictions['predicted_ids'], axis=1), tf.int32))
    predicted_text_batch = [text.numpy().decode() for text in predicted_text_batch]
    predicted_text.extend(predicted_text_batch)
    
    # Decode original text
    original_text_batch = tokenizer_layer._tokenizer.detokenize(decoder_input_ids)
    original_text_batch = [text.numpy().decode() for text in original_text_batch]
    original_text.extend(original_text_batch)
    
100%|██████████| 32/32 [02:19<00:00,  4.36s/it]
import sacrebleu
# Calculate and print the BLEU score
bleu = sacrebleu.corpus_bleu(predicted_text, [original_text])
print("BLEU: {}".format(bleu.score))
BLEU: 40.06566362514778
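
Finally, the serialized model plus the tokenizer give you the end-to-end (preprocessing + inference) setup promised at the start. A minimal sketch for translating a single Java snippet, assuming tokenizer_layer, decoder, and max_seq_len from above are still in scope; the Java input is a hypothetical example:

java_code = "public int add(int a, int b) { return a + b; }"  # hypothetical input

# Tokenize the same way load_dataset's parse() prepares encoder inputs
input_ids = tokenizer_layer({'text': [java_code]})
input_ids = input_ids.merge_dims(-2, 1)
input_ids = input_ids[:max_seq_len - 1]
input_ids = tf.concat([input_ids, [tokenizer_layer.eos_token_id]], axis=0)
input_ids = tf.expand_dims(input_ids, axis=0)  # add a batch dimension

batch_inputs = {
    'encoder_input_ids': input_ids,
    'encoder_input_mask': tf.ones_like(input_ids),
}
predictions = decoder.decode(batch_inputs,
                             mode='greedy',
                             max_iterations=max_seq_len,
                             eos_id=tokenizer_layer.eos_token_id)
cs_code = tokenizer_layer._tokenizer.detokenize(
    tf.cast(tf.squeeze(predictions['predicted_ids'], axis=1), tf.int32))
print(cs_code[0].numpy().decode())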