{ "cells": [ { "cell_type": "markdown", "id": "a635cd6a", "metadata": {}, "source": [ "# Create Sentence Embedding Roberta Model + Zeroshot from Scratch\n", "\n", "This tutorial contains complete code to fine-tune Roberta to build meaningful sentence transformers using Quora Dataset from HuggingFace. \n", "In addition to training a model, you will learn how to preprocess text into an appropriate format.\n", "\n", "In this notebook, you will:\n", "\n", "- Load the Quora dataset from HuggingFace\n", "- Load Roberta Model using tf-transformers\n", "- Build train and validation dataset feature preparation using\n", "tokenizer from transformers.\n", "- Build your own model by combining Roberta with a CustomWrapper\n", "- Train your own model, fine-tuning Roberta as part of that\n", "- Save your model and use it to extract sentence embeddings\n", "- Use the end-to-end (inference) in production setup\n", "\n", "If you're new to working with the Quora dataset, please see [QUORA](https://huggingface.co/datasets/quora) for more details." ] }, { "cell_type": "code", "execution_count": null, "id": "b090bead", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "3843c4a3", "metadata": {}, "outputs": [], "source": [ "!pip install tf-transformers\n", "\n", "!pip install transformers\n", "\n", "!pip install wandb\n", "\n", "!pip install datasets" ] }, { "cell_type": "code", "execution_count": null, "id": "42175d0f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "id": "7fda2757", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tensorflow version 2.7.0\n", "Devices [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]\n" ] } ], "source": [ "import tensorflow as tf\n", "import random\n", "import collections\n", "import wandb\n", "import tempfile\n", "import tqdm\n", "import json\n", "\n", "import os\n", "import numpy as np\n", "\n", "print(\"Tensorflow version\", tf.__version__)\n", "print(\"Devices\", tf.config.list_physical_devices())\n", "\n", "from tf_transformers.models import RobertaModel, Classification_Model\n", "from tf_transformers.core import Trainer\n", "from tf_transformers.optimization import create_optimizer\n", "from tf_transformers.data import TFWriter, TFReader\n", "from tf_transformers.losses import cross_entropy_loss_for_classification\n", "\n", "from datasets import load_dataset\n", "\n", "\n", "from transformers import RobertaTokenizer" ] }, { "cell_type": "code", "execution_count": null, "id": "a1ca794e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 3, "id": "88e23b9e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration default\n", "Reusing dataset quora (/home/jovyan/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "629843610aa143b5844179e02574e5f6", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1 [00:00 device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:07:00.0, compute capability: 7.0\n", "2022-03-23 01:10:08.938622: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 
30945 MB memory: -> device: 1, name: Tesla V100-SXM2-32GB, pci bus id: 0000:86:00.0, compute capability: 7.0\n" ] } ], "source": [ "# Read TFRecord\n", "\n", "def add_mask_type_ids(item):\n", " \n", " item['input_mask_left'] = tf.ones_like(item['input_ids_left'])\n", " item['input_type_ids_left']= tf.zeros_like(item['input_ids_left'])\n", " item['input_mask_right'] = tf.ones_like(item['input_ids_right'])\n", " item['input_type_ids_right']= tf.zeros_like(item['input_ids_right'])\n", " \n", " labels = {}\n", " if 'score' in item:\n", " labels = {'score': item['score']}\n", " del item['score']\n", " \n", " return item, labels\n", "\n", "# Train dataset\n", "schema = json.load(open(\"{}/schema.json\".format(tfrecord_train_dir)))\n", "total_train_examples = json.load(open(\"{}/stats.json\".format(tfrecord_train_dir)))['total_records']\n", "\n", "\n", "all_files = tf.io.gfile.glob(\"{}/*.tfrecord\".format(tfrecord_train_dir))\n", "tf_reader = TFReader(schema=schema, \n", " tfrecord_files=all_files)\n", "\n", "x_keys = ['input_ids_left', 'input_ids_right']\n", "train_dataset = tf_reader.read_record(auto_batch=False, \n", " keys=x_keys,\n", " batch_size=batch_size, \n", " x_keys = x_keys, \n", " shuffle=True\n", " )\n", "train_dataset = train_dataset.map(add_mask_type_ids, num_parallel_calls=tf.data.AUTOTUNE).padded_batch(batch_size, drop_remainder=True)\n", "\n", "\n", "# Validation dataset\n", "val_schema = json.load(open(\"{}/schema.json\".format(tfrecord_validation_dir)))\n", "all_val_files = tf.io.gfile.glob(\"{}/*.tfrecord\".format(tfrecord_validation_dir))\n", "tf_reader_val = TFReader(schema=val_schema, \n", " tfrecord_files=all_val_files)\n", "\n", "x_keys_val = ['input_ids_left', 'input_ids_right', 'score']\n", "validation_dataset = tf_reader_val.read_record(auto_batch=False, \n", " keys=x_keys_val,\n", " batch_size=batch_size, \n", " x_keys = x_keys_val, \n", " shuffle=True\n", " )\n", "\n", "# Static shapes makes things faster inside tf.function\n", "# Especially for validation as we are passing batch examples to tf.function\n", "padded_shapes = ({'input_ids_left': [max_sequence_length,], \n", " 'input_mask_left':[max_sequence_length,],\n", " 'input_type_ids_left':[max_sequence_length,],\n", " 'input_ids_right': [max_sequence_length,],\n", " 'input_mask_right': [max_sequence_length,],\n", " 'input_type_ids_right': [max_sequence_length,]\n", " }, \n", " {'score': [None,]})\n", "validation_dataset = validation_dataset.map(add_mask_type_ids,\n", " num_parallel_calls=tf.data.AUTOTUNE).padded_batch(batch_size,\n", " drop_remainder=False,\n", " padded_shapes=padded_shapes\n", " )" ] }, { "cell_type": "code", "execution_count": null, "id": "4bd01cdf", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "e8335058", "metadata": {}, "source": [ "### Build Sentence Transformer Model" ] }, { "cell_type": "code", "execution_count": 4, "id": "fc1e8b16", "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "from tf_transformers.core import LegacyLayer, LegacyModel\n", "\n", "\n", "class Sentence_Embedding_Model(LegacyLayer):\n", " def __init__(\n", " self,\n", " model,\n", " is_training=False,\n", " use_dropout=False,\n", " **kwargs,\n", " ):\n", " r\"\"\"\n", " Simple Sentence Embedding using Keras Layer\n", "\n", " Args:\n", " model (:obj:`LegacyLayer/LegacyModel`):\n", " Model.\n", " Eg:`~tf_transformers.model.BertModel`.\n", " is_training (:obj:`bool`, `optional`, defaults to False): To train\n", " use_dropout (:obj:`bool`, `optional`, defaults to 
False): Use dropout\n", " use_bias (:obj:`bool`, `optional`, defaults to True): use bias\n", " \"\"\"\n", " super(Sentence_Embedding_Model, self).__init__(\n", " is_training=is_training, use_dropout=use_dropout, name=model.name, **kwargs\n", " )\n", "\n", " self.model = model\n", " if isinstance(model, LegacyModel):\n", " self.model_config = model.model_config\n", " elif isinstance(model, tf.keras.layers.Layer):\n", " self.model_config = model._config_dict\n", " self._is_training = is_training\n", " self._use_dropout = use_dropout\n", "\n", " # Initialize model\n", " self.model_inputs, self.model_outputs = self.get_model(initialize_only=True)\n", " \n", " def get_mean_embeddings(self, token_embeddings, input_mask):\n", " \"\"\"\n", " Mean embeddings\n", " \"\"\"\n", " cls_embeddings = token_embeddings[:, 0, :] # 0 is CLS ()\n", " # mask PAD tokens\n", " token_emb_masked = token_embeddings * tf.cast(tf.expand_dims(input_mask, 2), tf.float32)\n", " total_non_padded_tokens_per_batch = tf.cast(tf.reduce_sum(input_mask, axis=1), tf.float32)\n", " # Convert to 2D\n", " total_non_padded_tokens_per_batch = tf.expand_dims(total_non_padded_tokens_per_batch, 1)\n", " mean_embeddings = tf.reduce_sum(token_emb_masked, axis=1)/ total_non_padded_tokens_per_batch\n", " return mean_embeddings\n", "\n", " def call(self, inputs):\n", " \"\"\"Call\"\"\"\n", " \n", " # Extract left and right input pairs\n", " left_inputs = {k.replace('_left', ''):v for k,v in inputs.items() if 'left' in k}\n", " right_inputs = {k.replace('_right', ''):v for k,v in inputs.items() if 'right' in k}\n", " model_outputs_left = self.model(left_inputs)\n", " model_outputs_right = self.model(right_inputs)\n", " \n", " left_cls = model_outputs_left['cls_output']\n", " right_cls = model_outputs_right['cls_output'] \n", "\n", " left_mean_embeddings = self.get_mean_embeddings(model_outputs_left['token_embeddings'], left_inputs['input_mask'])\n", " right_mean_embeddings = self.get_mean_embeddings(model_outputs_right['token_embeddings'], right_inputs['input_mask'])\n", " \n", " cls_logits = tf.matmul(left_cls, right_cls, transpose_b=True)\n", " mean_logits = tf.matmul(left_mean_embeddings, right_mean_embeddings, transpose_b=True)\n", " \n", " \n", " results = {'left_cls_output': left_cls, \n", " 'right_cls_output': right_cls, \n", " 'left_mean_embeddings': left_mean_embeddings,\n", " 'right_mean_embeddings': right_mean_embeddings,\n", " 'cls_logits': cls_logits, \n", " 'mean_logits': mean_logits}\n", " \n", " return results\n", " \n", "\n", " def get_model(self, initialize_only=False):\n", " \"\"\"Get model\"\"\"\n", " inputs = self.model.input\n", " # Left and Right inputs\n", " main_inputs = {}\n", " for k, v in inputs.items():\n", " shape = v.shape\n", " main_inputs[k+'_left'] = tf.keras.layers.Input(\n", " shape[1:], batch_size=v.shape[0], name=k+'_left', dtype=v.dtype\n", " )\n", " \n", " for k, v in inputs.items():\n", " shape = v.shape\n", " main_inputs[k+'_right'] = tf.keras.layers.Input(\n", " shape[1:], batch_size=v.shape[0], name=k+'_right', dtype=v.dtype\n", " ) \n", " layer_outputs = self(main_inputs)\n", " if initialize_only:\n", " return main_inputs, layer_outputs\n", " model = LegacyModel(inputs=main_inputs, outputs=layer_outputs, name=\"sentence_embedding_model\")\n", " model.model_config = self.model_config\n", " return model" ] }, { "cell_type": "code", "execution_count": null, "id": "a9b75439", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "9988fbb5", "metadata": {}, "source": [ "### 
Load Model, Optimizer, Trainer\n", "\n", "Our Trainer expects ```model```, ```optimizer``` and ```loss``` to be passed as functions.\n", "\n", "* 1. We will use ```Roberta``` as the base model and pass it to ```Sentence_Embedding_Model```, the layer we built above.\n", "* 2. We will use ```in-batch``` negatives loss as the loss function, where every diagonal entry in the output matrix is positive\n", "and the rest are negatives." ] }, { "cell_type": "code", "execution_count": 5, "id": "7be22032", "metadata": {}, "outputs": [], "source": [ "# Load Model\n", "def get_model(model_name, is_training, use_dropout):\n", " \"\"\"Get Model\"\"\"\n", " def model_fn():\n", " model = RobertaModel.from_pretrained(model_name)\n", " sentence_transformers_model = Sentence_Embedding_Model(model)\n", " sentence_transformers_model = sentence_transformers_model.get_model()\n", " return sentence_transformers_model\n", " return model_fn\n", "\n", "# Load Optimizer\n", "def get_optimizer(learning_rate, examples, batch_size, epochs, use_constant_lr=False):\n", " \"\"\"Get optimizer\"\"\"\n", " steps_per_epoch = int(examples / batch_size)\n", " num_train_steps = steps_per_epoch * epochs\n", " warmup_steps = int(0.1 * num_train_steps)\n", "\n", " def optimizer_fn():\n", " optimizer, learning_rate_fn = create_optimizer(learning_rate, num_train_steps, warmup_steps, use_constant_lr=use_constant_lr)\n", " return optimizer\n", "\n", " return optimizer_fn\n", "\n", "# Load trainer\n", "def get_trainer(distribution_strategy, num_gpus=0, tpu_address=None):\n", " \"\"\"Get Trainer\"\"\"\n", " trainer = Trainer(distribution_strategy, num_gpus=num_gpus, tpu_address=tpu_address)\n", " return trainer\n", "\n", "# Create loss\n", "def in_batch_negative_loss():\n", " \n", " def loss_fn(y_true_dict, y_pred_dict):\n", " \n", " labels = tf.range(y_pred_dict['cls_logits'].shape[0])\n", " cls_loss = cross_entropy_loss_for_classification(labels=labels, logits=y_pred_dict['cls_logits'])\n", " mean_loss = cross_entropy_loss_for_classification(labels=labels, logits=y_pred_dict['mean_logits'])\n", " \n", " result = {}\n", " result['cls_loss'] = cls_loss\n", " result['mean_loss'] = mean_loss\n", " result['loss'] = (cls_loss + mean_loss)/2.0\n", " return result\n", " \n", " return loss_fn" ] }, { "cell_type": "code", "execution_count": null, "id": "155acb99", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "d24ee408", "metadata": {}, "source": [ "### Wandb Configuration" ] }, { "cell_type": "code", "execution_count": null, "id": "a8192d7e", "metadata": {}, "outputs": [], "source": [ "project = \"TUTORIALS\"\n", "display_name = \"roberta_quora_sentence_embedding\"\n", "wandb.init(project=project, name=display_name)" ] }, { "cell_type": "markdown", "id": "ed867e8c", "metadata": {}, "source": [ "### Zero-Shot on STS before Training\n", "\n", "* 1. Let's evaluate how well ```Roberta``` captures sentence embeddings before ```fine-tuning``` on Quora.\n", "* 2. This gives us an indication of whether the model learns anything during downstream fine-tuning.\n", "* 3. We use ```CLS_OUTPUT```, the pooler output of the ```Roberta``` model, as the sentence embedding and evaluate using\n", "```pearson``` and ```spearman``` correlation." 
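, "\n", "\n", "The next cell runs this zero-shot evaluation in full. As a minimal sketch of the idea (the example sentence is made up, and the standard ```roberta-base``` tokenizer name is an assumption here; the notebook sets up its own tokenizer earlier), a single ```CLS``` sentence embedding can be pulled out of the model like this:\n", "\n", "```python\n", "# Minimal sketch: one sentence -> CLS (pooler) sentence embedding.\n", "# `model_name` is the variable defined earlier in this notebook.\n", "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")  # assumed tokenizer name\n", "model = RobertaModel.from_pretrained(model_name)\n", "\n", "ids = tf.constant(tokenizer([\"How do I learn deep learning?\"])[\"input_ids\"])\n", "inputs = {\"input_ids\": ids,\n", "          \"input_mask\": tf.ones_like(ids),\n", "          \"input_type_ids\": tf.zeros_like(ids)}\n", "cls_embedding = model(inputs)[\"cls_output\"]  # shape: (1, hidden_size)\n", "```"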
] }, { "cell_type": "code", "execution_count": 13, "id": "ae633ba3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:absl:Successful ✅✅: Model checkpoints matched and loaded from /home/jovyan/.cache/huggingface/hub/tftransformers__roberta-base-no-mlm.main.9e4aa91ba5936c6ac98586f85c152831e421d0ec/ckpt-1\n", "INFO:absl:Successful ✅: Loaded model from tftransformers/roberta-base-no-mlm\n", "12it [00:12, 1.08s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Cosine-Similarity :\tPearson: 0.4278\tSpearman: 0.5293\n", "Manhattan-Distance:\tPearson: 0.4329\tSpearman: 0.5120\n", "Euclidean-Distance:\tPearson: 0.4365\tSpearman: 0.5125\n", "Dot-Product-Similarity:\tPearson: -0.0079\tSpearman: -0.0050\n" ] } ], "source": [ "from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances\n", "from scipy.stats import pearsonr, spearmanr\n", "\n", "model = RobertaModel.from_pretrained(model_name)\n", "\n", "sentence1_embeddings = []\n", "sentence2_embeddings = []\n", "sts_labels = []\n", "for batch_inputs, batch_labels in tqdm.tqdm(validation_dataset):\n", " left_inputs = {k.replace('_left', ''):v for k,v in batch_inputs.items() if 'left' in k}\n", " right_inputs = {k.replace('_right', ''):v for k,v in batch_inputs.items() if 'right' in k}\n", " left_outputs = model(left_inputs)\n", " right_outputs = model(right_inputs)\n", " \n", " # sentence 1 embeddings\n", " sentence1_embeddings.append(left_outputs['cls_output'])\n", " # sentence 2 embeddings\n", " sentence2_embeddings.append(right_outputs['cls_output'])\n", " sts_labels.append(batch_labels['score'])\n", " \n", "sts_labels = tf.squeeze(tf.concat(sts_labels, axis=0), axis=1)\n", "sentence1_embeddings = tf.concat(sentence1_embeddings, axis=0)\n", "sentence2_embeddings = tf.concat(sentence2_embeddings, axis=0)\n", "\n", "cosine_scores = 1 - (paired_cosine_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy()))\n", "manhattan_distances = -paired_manhattan_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())\n", "euclidean_distances = -paired_euclidean_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())\n", "dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())]\n", "\n", "\n", "eval_pearson_cosine, _ = pearsonr(sts_labels, cosine_scores)\n", "eval_spearman_cosine, _ = spearmanr(sts_labels, cosine_scores)\n", "\n", "eval_pearson_manhattan, _ = pearsonr(sts_labels, manhattan_distances)\n", "eval_spearman_manhattan, _ = spearmanr(sts_labels, manhattan_distances)\n", "\n", "eval_pearson_euclidean, _ = pearsonr(sts_labels, euclidean_distances)\n", "eval_spearman_euclidean, _ = spearmanr(sts_labels, euclidean_distances)\n", "\n", "eval_pearson_dot, _ = pearsonr(sts_labels, dot_products)\n", "eval_spearman_dot, _ = spearmanr(sts_labels, dot_products)\n", "\n", "\n", "print(\"Cosine-Similarity :\\tPearson: {:.4f}\\tSpearman: {:.4f}\".format(\n", " eval_pearson_cosine, eval_spearman_cosine))\n", "print(\"Manhattan-Distance:\\tPearson: {:.4f}\\tSpearman: {:.4f}\".format(\n", " eval_pearson_manhattan, eval_spearman_manhattan))\n", "print(\"Euclidean-Distance:\\tPearson: {:.4f}\\tSpearman: {:.4f}\".format(\n", " eval_pearson_euclidean, eval_spearman_euclidean))\n", "print(\"Dot-Product-Similarity:\\tPearson: {:.4f}\\tSpearman: {:.4f}\".format(\n", " eval_pearson_dot, eval_spearman_dot))" ] }, { "cell_type": "code", "execution_count": null, 
"id": "55c3039e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 14, "id": "063bd419", "metadata": {}, "outputs": [], "source": [ "import tqdm\n", "from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances\n", "from scipy.stats import pearsonr, spearmanr\n", "\n", "class STSEvaluationCallback:\n", " def __init__(self) -> None:\n", " pass\n", "\n", " def __call__(self, trainer_kwargs):\n", "\n", " validation_dataset_distributed = iter(\n", " trainer_kwargs[\"validation_dataset_distributed\"]\n", " )\n", " model = trainer_kwargs[\"model\"]\n", " wandb = trainer_kwargs[\"wandb\"]\n", " step = trainer_kwargs[\"global_step\"]\n", " strategy = trainer_kwargs[\"strategy\"]\n", " epoch = trainer_kwargs[\"epoch\"]\n", " epochs = trainer_kwargs[\"epochs\"]\n", " validation_steps = trainer_kwargs[\"validation_steps\"]\n", "\n", " if validation_dataset_distributed is None:\n", " raise ValueError(\n", " \"No validation dataset has been provided either in the trainer class, \\\n", " or when callback is initialized. Please provide a validation dataset\"\n", " )\n", "\n", " @tf.function\n", " def validate_run(dist_inputs):\n", " batch_inputs, batch_labels = dist_inputs\n", " model_outputs = model(batch_inputs)\n", " s1_cls = model_outputs['left_cls_output']\n", " s2_cls = model_outputs['right_cls_output']\n", " \n", " s1_mean = model_outputs['left_mean_embeddings']\n", " s2_mean = model_outputs['right_mean_embeddings']\n", " return s1_cls, s2_cls, s1_mean, s2_mean, batch_labels['score']\n", " \n", " S1_cls = []\n", " S2_cls = []\n", " S1_mean = []\n", " S2_mean = []\n", " sts_labels = []\n", " # This is a hack to make tqdm to print colour bar\n", " # TODO: fix it .\n", " pbar = tqdm.trange(validation_steps, colour=\"magenta\", unit=\"batch\")\n", " for step_counter in pbar:\n", " dist_inputs = next(validation_dataset_distributed)\n", " s1_cls, s2_cls, s1_mean, s2_mean, batch_scores = strategy.run(\n", " validate_run, args=(dist_inputs,)\n", " )\n", " s1_cls = tf.concat(\n", " trainer.distribution_strategy.experimental_local_results(s1_cls),\n", " axis=0,\n", " )\n", " s2_cls = tf.concat(\n", " trainer.distribution_strategy.experimental_local_results(s2_cls),\n", " axis=0,\n", " )\n", " s1_mean = tf.concat(\n", " trainer.distribution_strategy.experimental_local_results(s1_mean),\n", " axis=0,\n", " )\n", " s2_mean = tf.concat(\n", " trainer.distribution_strategy.experimental_local_results(s2_mean),\n", " axis=0,\n", " )\n", " \n", " scores = tf.concat(\n", " trainer.distribution_strategy.experimental_local_results(\n", " batch_scores\n", " ),\n", " axis=0,\n", " )\n", "\n", " S1_cls.append(s1_cls)\n", " S2_cls.append(s2_cls)\n", " S1_mean.append(s1_mean)\n", " S2_mean.append(s2_mean)\n", " sts_labels.append(scores)\n", " pbar.set_description(\n", " \"Callback: Epoch {}/{} --- Step {}/{} \".format(\n", " epoch, epochs, step_counter, validation_steps\n", " )\n", " )\n", " \n", " \n", " sts_labels = tf.squeeze(tf.concat(sts_labels, axis=0), axis=1)\n", " sentence1_embeddings = tf.concat(S1_cls, axis=0)\n", " sentence2_embeddings = tf.concat(S2_cls, axis=0)\n", "\n", " cosine_scores = 1 - (paired_cosine_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy()))\n", " manhattan_distances = -paired_manhattan_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())\n", " euclidean_distances = -paired_euclidean_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())\n", " 
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())]\n", "\n", "\n", " eval_pearson_cosine, _ = pearsonr(sts_labels, cosine_scores)\n", " eval_spearman_cosine, _ = spearmanr(sts_labels, cosine_scores)\n", "\n", " eval_pearson_manhattan, _ = pearsonr(sts_labels, manhattan_distances)\n", " eval_spearman_manhattan, _ = spearmanr(sts_labels, manhattan_distances)\n", "\n", " eval_pearson_euclidean, _ = pearsonr(sts_labels, euclidean_distances)\n", " eval_spearman_euclidean, _ = spearmanr(sts_labels, euclidean_distances)\n", "\n", " eval_pearson_dot, _ = pearsonr(sts_labels, dot_products)\n", " eval_spearman_dot, _ = spearmanr(sts_labels, dot_products)\n", "\n", " metrics_result = {'pearson_cosine_cls': eval_pearson_cosine,\n", " 'spearman_cosine_cls': eval_spearman_cosine,\n", " 'pearson_manhattan_cls': eval_pearson_manhattan, \n", " 'spearman_manhattan_cls': eval_spearman_manhattan, \n", " 'pearson_euclidean_cls': eval_pearson_euclidean, \n", " 'spearman_euclidean_cls': eval_spearman_euclidean, \n", " 'pearson_dot_cls': eval_pearson_dot, \n", " 'spearman_dot_cls': eval_spearman_dot}\n", " \n", " sentence1_embeddings = tf.concat(S1_mean, axis=0)\n", " sentence2_embeddings = tf.concat(S2_mean, axis=0)\n", "\n", " cosine_scores = 1 - (paired_cosine_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy()))\n", " manhattan_distances = -paired_manhattan_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())\n", " euclidean_distances = -paired_euclidean_distances(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())\n", " dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(sentence1_embeddings.numpy(), sentence2_embeddings.numpy())]\n", "\n", "\n", " eval_pearson_cosine, _ = pearsonr(sts_labels, cosine_scores)\n", " eval_spearman_cosine, _ = spearmanr(sts_labels, cosine_scores)\n", "\n", " eval_pearson_manhattan, _ = pearsonr(sts_labels, manhattan_distances)\n", " eval_spearman_manhattan, _ = spearmanr(sts_labels, manhattan_distances)\n", "\n", " eval_pearson_euclidean, _ = pearsonr(sts_labels, euclidean_distances)\n", " eval_spearman_euclidean, _ = spearmanr(sts_labels, euclidean_distances)\n", "\n", " eval_pearson_dot, _ = pearsonr(sts_labels, dot_products)\n", " eval_spearman_dot, _ = spearmanr(sts_labels, dot_products)\n", " \n", " metrics_result_mean = {'pearson_cosine_mean': eval_pearson_cosine,\n", " 'spearman_cosine_mean': eval_spearman_cosine,\n", " 'pearson_manhattan_mean': eval_pearson_manhattan, \n", " 'spearman_manhattan_mean': eval_spearman_manhattan, \n", " 'pearson_euclidean_mean': eval_pearson_euclidean, \n", " 'spearman_euclidean_mean': eval_spearman_euclidean, \n", " 'pearson_dot_mean': eval_pearson_dot, \n", " 'spearman_dot_mean': eval_spearman_dot}\n", " \n", " metrics_result.update(metrics_result_mean)\n", " pbar.set_postfix(**metrics_result)\n", " \n", " if wandb:\n", " wandb.log(metrics_result, step=step)\n", "\n", " return metrics_result" ] }, { "cell_type": "code", "execution_count": null, "id": "02777496", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "2f9b69a7", "metadata": {}, "source": [ "### Set Hyperparameters and Configs\n", "\n", "1. Set necessary hyperparameters.\n", "2. Prepare the ```train dataset``` and ```validation dataset```.\n", "3. Load ```model```, ```optimizer```, ```loss``` and ```trainer``` (a toy sketch of the in-batch loss follows below)." 
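, "\n", "\n", "As a toy illustration of the ```in-batch negatives``` loss wired up in the next cell (made-up embeddings, and Keras' sparse categorical cross-entropy standing in for ```cross_entropy_loss_for_classification```): for a batch of ```N``` pairs the model produces an ```N x N``` similarity matrix, the label of row ```i``` is ```i```, so the diagonal entry is the positive pair and every other column in that row acts as a negative.\n", "\n", "```python\n", "# Toy sketch of in-batch negatives with a batch of 4 question pairs.\n", "batch = 4\n", "left = tf.random.normal((batch, 768))   # left-sentence embeddings\n", "right = tf.random.normal((batch, 768))  # right-sentence embeddings\n", "\n", "logits = tf.matmul(left, right, transpose_b=True)  # (4, 4) similarity matrix\n", "labels = tf.range(batch)                            # [0, 1, 2, 3] -> diagonal is positive\n", "loss = tf.reduce_mean(\n", "    tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True))\n", "```"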
] }, { "cell_type": "code", "execution_count": 15, "id": "57756cd6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')\n" ] } ], "source": [ "# Model configs\n", "learning_rate = 2e-5\n", "epochs = 3\n", "model_checkpoint_dir = 'MODELS/roberta_quora_embeddings'\n", "\n", "\n", "# Total train examples\n", "steps_per_epoch = total_train_examples // batch_size\n", "\n", "# model\n", "model_fn = get_model(model_name, is_training=True, use_dropout=True)\n", "# optimizer\n", "optimizer_fn = get_optimizer(learning_rate, total_train_examples, batch_size, epochs)\n", "# trainer (multi gpu strategy)\n", "trainer = get_trainer(distribution_strategy='mirrored', num_gpus=2)\n", "# loss\n", "loss_fn = in_batch_negative_loss()" ] }, { "cell_type": "code", "execution_count": null, "id": "082505d7", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "60c4246a", "metadata": {}, "source": [ "### Train :-)\n", "\n", "* 1. Loss is coming down in epoch 1 itself.\n", "* 2. Zershot evaluation after ```epoch 1``` shows that, ```pearson``` and ```spearman``` correlation increases to\n", "```0.80```, which is significant improvemnet over ```Roberta``` base model, where we got ```0.43```.\n", "* 3. Without training on ```STS-B```, we got a good evaluation score on ```STS-B dev``` using Zeroshot." ] }, { "cell_type": "code", "execution_count": 18, "id": "5fcd18c7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:absl:Make sure `steps_per_epoch` should be less than or equal to number of batches in dataset.\n", "INFO:absl:Policy: ----> float32\n", "INFO:absl:Strategy: ---> \n", "INFO:absl:Num GPU Devices: ---> 2\n", "INFO:absl:Successful ✅✅: Model checkpoints matched and loaded from /home/jovyan/.cache/huggingface/hub/tftransformers__roberta-base-no-mlm.main.9e4aa91ba5936c6ac98586f85c152831e421d0ec/ckpt-1\n", "INFO:absl:Successful ✅: Loaded model from tftransformers/roberta-base-no-mlm\n", "INFO:absl:Using linear optimization warmup\n", "INFO:absl:Using Adamw optimizer\n", "INFO:absl:No ❌❌ checkpoint found in MODELS/roberta_quora_embeddings\n", "Train: Epoch 1/4 --- Step 10/3158 --- total examples 0 , trainable variables 199: 0%|\u001b[32m \u001b[0m| 0/315 [00:00', best_prob)" ] }, { "cell_type": "code", "execution_count": null, "id": "bb054ce9", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 38, "id": "1e8a93c0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "How should I propose a girl? --> 0.9225553\n", "How do I propose to a girl? --> 0.8723614\n", "Which is the most romantic way to propose a girl? --> 0.8557819\n", "How do I propose a girl for sex? --> 0.80544627\n", "How did you propose your girlfriend? --> 0.69494146\n", "What are some of the best and unique ways to propose marriage? --> 0.6611091\n", "How can I propose to my crush? --> 0.64724606\n", "If I want to propose to a girl should I give her hints in advance? --> 0.6309003\n", "What doesit take for a man to propose? --> 0.6253518\n", "What is the right time to propose someone ? 
--> 0.5932445\n" ] } ], "source": [ "input_question = 'What is the best way to propose a girl?'\n", "most_similar(input_question)" ] }, { "cell_type": "code", "execution_count": null, "id": "f4700a4f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 40, "id": "9fd5c900", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "What's the most effective way to get started with Deep Learning? --> 0.83670104\n", "How do I learn deep learning in 1 month? --> 0.7423597\n", "Why is deep learning so important in machine learning? --> 0.7327932\n", "Does Quora use Deep Learning? --> 0.7260519\n", "Should a machine learning beginner go straight for deep learning? --> 0.719143\n", "Where should I start for machine learning? --> 0.71324116\n", "How do i get started on machine learning? --> 0.7123989\n", "I am New to Deep Learning. How do I start with Python? --> 0.710938\n", "How do I start learning machine learning? --> 0.7106862\n", "What is deep learning? How is related to AI and machine learning? --> 0.70124084\n" ] } ], "source": [ "input_question = 'How can I start learning Deep Learning?'\n", "most_similar(input_question)" ] }, { "cell_type": "code", "execution_count": 22, "id": "437da0b6", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 41, "id": "e3552f5e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "What are the must-visit and affordable tourist destinations in India? --> 0.8237094\n", "What is the most overrated tourist destination in India? --> 0.7374817\n", "What is the best sex tourism destination in India? --> 0.73366314\n", "What are the most popular tourist destinations? --> 0.7160436\n", "What are the best destination for a solo traveler in India? --> 0.7078299\n", "What is the best holiday destination? --> 0.675949\n", "Which places I should not visit in India as a Indian? --> 0.6656152\n", "What are the best places to go as a tourist? --> 0.66551954\n", "Which are some best places to visit in India? --> 0.66457677\n", "Which is your best holiday destination? --> 0.6640895\n" ] } ], "source": [ "input_question = 'Best tourist destinations in India'\n", "most_similar(input_question)" ] }, { "cell_type": "code", "execution_count": null, "id": "68e67afa", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 42, "id": "2e26ca30", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "What is your favourite piece of classical music and why? --> 0.75282526\n", "What are the benefits of listening to classical music? --> 0.7361862\n", "Why do some people only listen to classical music? --> 0.7289536\n", "Which music is the best for relaxation? --> 0.6762159\n", "Why is classical music better than most pop music? --> 0.6651089\n", "What are some classical and operant conditioning in education? --> 0.64240026\n", "Classical music in movies? --> 0.6344438\n", "Which classic music is this? --> 0.59156764\n", "Which ones are some of the most soothing tunes composed on a piano? --> 0.57644486\n", "What are the differences between Hindustani classical music and Carnatic music? 
--> 0.57415533\n" ] } ], "source": [ "input_question = 'Why classical music is so relaxing?'\n", "most_similar(input_question)" ] }, { "cell_type": "code", "execution_count": null, "id": "21956dae", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c2b77af1", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "jupytext": { "formats": "ipynb,md:myst" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" } }, "nbformat": 4, "nbformat_minor": 5 }