samsaara
/

medical_condition_classification

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "0729b762-3b84-474f-b82a-df7622b91ccb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch, html\n",
+    "from transformers import AutoTokenizer\n",
+    "from datasets import load_dataset, load_from_disk\n",
+    "from huggingface_hub import notebook_login\n",
+    "from dotenv import load_dotenv\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "92ee5f76-2cd3-4af0-8687-dca782aa38a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "97d33c57-b03b-4bee-b051-04d707a8d773",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "access_token = os.environ['HF_TOKEN']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4358520c-3d8c-42ef-967a-eddeef732ef1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'cuda'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+    "device"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "1c2ec24f-4c6d-4469-8e85-601a4b0d3e4e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],\n",
+       "        num_rows: 161297\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],\n",
+       "        num_rows: 53766\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset = load_dataset('csv', data_files={\n",
+    "    'train': 'data/drugsComTrain_raw.tsv',\n",
+    "    'test': 'data/drugsComTest_raw.tsv'\n",
+    "}, delimiter='\\t', num_proc=8)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "dbb81021-9acc-46b4-87c0-23f0f787fef5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'train': (161297, 7), 'test': (53766, 7)}"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "a983147c-eb04-455f-bf02-0c57c2a549e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'Unnamed: 0': 206461,\n",
+       " 'drugName': 'Valsartan',\n",
+       " 'condition': 'Left Ventricular Dysfunction',\n",
+       " 'review': '\"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil\"',\n",
+       " 'rating': 9.0,\n",
+       " 'date': 'May 20, 2012',\n",
+       " 'usefulCount': 27}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset['train'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ee2b8ddf-79d7-44d6-80ba-243bc2f04de8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n",
+       "        num_rows: 138514\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n",
+       "        num_rows: 46108\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset = (\n",
+    "    dataset\n",
+    "    .filter(lambda x: x['condition'] is not None)\n",
+    "    .rename_column('Unnamed: 0', 'row_id')\n",
+    "    .map(lambda x: {'condition': [row.lower() for row in x['condition']]}, batched=True, num_proc=8, batch_size=3000)\n",
+    "    .map(lambda x: {'review': [html.unescape(row) for row in x['review']]}, batched=True, num_proc=8, batch_size=3000)\n",
+    "    .map(lambda x: {'review_length': [len(row.split()) for row in x['review']]}, batched=True, num_proc=8, batch_size=3000)\n",
+    "    # .filter(lambda x: {'review_length': [row > 30 for row in x['review_length']]}, batched=True, num_proc=8)\n",
+    "    .filter(lambda x: x['review_length'] > 30, num_proc=8, batch_size=3000)\n",
+    ")\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e7c4daf2-36c1-4074-91ca-8871a581052d",
+   "metadata": {},
+   "source": [
+    "# Exercises"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea14b998-69f1-40a7-a200-7cc53b0e22fd",
+   "metadata": {},
+   "source": [
+    "## Predict patient condition based on drug review"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "dc6b299b-2d0b-4475-bfff-d0180dd672c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AutoModel, DataCollatorWithPadding\n",
+    "from torch.utils.data import DataLoader\n",
+    "import evaluate, numpy as np\n",
+    "from huggingface_hub import HfApi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "77caa284-8307-40a0-8369-621195e5c7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_condition_column(rows):\n",
+    "    target_text = 'users found this comment helpful'\n",
+    "    return {'condition': ['unknown' if target_text in condition else condition for condition in rows['condition']]}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "058d4c64-428b-43bb-86c4-ba8f5c1b8a84",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n",
+       "        num_rows: 138514\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n",
+       "        num_rows: 46108\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset = dataset.map(clean_condition_column, batched=True, batch_size=3000, num_proc=8)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "80dc20fe-cb66-4b0d-99dc-88e84413975b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n",
+       "        num_rows: 110811\n",
+       "    })\n",
+       "    validation: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n",
+       "        num_rows: 27703\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n",
+       "        num_rows: 46108\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clean_data = dataset['train'].train_test_split(test_size=.2, seed=5, writer_batch_size=3000)\n",
+    "clean_data['validation'] = clean_data.pop('test')\n",
+    "clean_data['test'] = dataset['test']\n",
+    "\n",
+    "clean_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "8be33fbb-143f-45b5-9e18-c5662a7e0dad",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "751"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_conditions = sorted(set(clean_data['train']['condition']).union(set(clean_data['validation']['condition'])))\n",
+    "len(all_conditions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "912ef7d5-149a-48ed-ac6b-1ff2f3c2556a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "id2label = dict(enumerate(all_conditions))\n",
+    "label2id = {v:k for k, v in id2label.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "aca4a239-3f07-44bf-905e-2743b8f0889d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(label2id) == len(id2label)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "024d5faa-88f1-41b7-9f52-8178ad731089",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels'],\n",
+       "        num_rows: 110811\n",
+       "    })\n",
+       "    validation: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels'],\n",
+       "        num_rows: 27703\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels'],\n",
+       "        num_rows: 46108\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clean_data = clean_data.map(lambda x: {'labels': [label2id.get(condition, label2id['unknown']) for condition in x['condition']]}, batched=True, batch_size=3000, num_proc=8)\n",
+    "clean_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "2f71cacc-9fb4-4436-b32b-8f172bcc19b1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# checkpoint = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'\n",
+    "checkpoint = 'distilbert-base-uncased'\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(id2label)).to(device)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "e9b2c2bd-52d4-47e0-aaaf-eb76b3bab9fa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.config.id2label = id2label\n",
+    "model.config.label2id = label2id\n",
+    "model.num_labels = len(label2id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "2d3bb44b-e635-4e7c-b984-6379510b60b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collator = DataCollatorWithPadding(tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c22a17ab-4a43-45f6-ba99-62cdb94103c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_and_split(examples):\n",
+    "    tokens = tokenizer(\n",
+    "        examples[\"review\"],\n",
+    "        truncation=True,\n",
+    "        max_length=512,\n",
+    "        return_overflowing_tokens=True,\n",
+    "    )\n",
+    "    mappings = tokens.pop('overflow_to_sample_mapping')\n",
+    "    for key, values in examples.items():\n",
+    "        tokens[key] = [values[idx] for idx in mappings]\n",
+    "    return tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "5a1b9eb6-87a1-4d7f-855b-f1c9e5ae63c2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels', 'input_ids', 'attention_mask'],\n",
+       "        num_rows: 110857\n",
+       "    })\n",
+       "    validation: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels', 'input_ids', 'attention_mask'],\n",
+       "        num_rows: 27717\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels', 'input_ids', 'attention_mask'],\n",
+       "        num_rows: 46118\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenized_dataset = clean_data.map(tokenize_and_split, batched=True, batch_size=3000, num_proc=8)\n",
+    "tokenized_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2729d5c2-499d-41f0-8ddb-27df3cf82475",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3488692d-44ef-4b99-af4c-8fa32d6ed3b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenized_dataset.save_to_disk('data/drugs', num_proc=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2bc7b3ea-5f48-4298-b625-d313c4dc1ea3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenized_dataset = load_from_disk('data/drugs/')\n",
+    "tokenized_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "001bb28a-9ff1-463f-90af-22dc7f6bce53",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "344a5505-f143-4389-8be3-282219f29d74",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'labels'],\n",
+       "        num_rows: 110857\n",
+       "    })\n",
+       "    validation: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'labels'],\n",
+       "        num_rows: 27717\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'labels'],\n",
+       "        num_rows: 46118\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filtered = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])\n",
+    "filtered"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "b31de787-0312-4d67-8b41-ce85732308ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "accuracy = evaluate.load('accuracy')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "f6d0543e-06d5-4930-93f6-8028e4e4ead5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_metrics(eval_preds):\n",
+    "    logits, labels = eval_preds\n",
+    "    preds = np.argmax(logits, axis=-1)\n",
+    "    return accuracy.compute(predictions=preds, references=labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "ec5be835-e194-47a4-8c2a-3eb7500645ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lr = 3e-5"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "d04e4bae-8bb0-4e5e-be0f-2ce41db1bbe6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_args = TrainingArguments(\n",
+    "    'medical_condition_classification', \n",
+    "    overwrite_output_dir=True, \n",
+    "    eval_strategy='steps', eval_steps=2000, \n",
+    "    per_device_train_batch_size=24, \n",
+    "    per_device_eval_batch_size=24, \n",
+    "    fp16=True, num_train_epochs=5,\n",
+    "    learning_rate=lr,\n",
+    "    push_to_hub=True,\n",
+    "    hub_token=access_token\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "e26faf3f-03ab-411d-97a2-c1a3b6e2b425",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer = Trainer(model, train_args, collator, filtered['train'], filtered['validation'], tokenizer, compute_metrics=compute_metrics)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "52c55095-4761-4353-8222-887cdf309431",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='23100' max='23100' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [23100/23100 1:14:13, Epoch 5/5]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Accuracy</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>2000</td>\n",
+       "      <td>1.862500</td>\n",
+       "      <td>1.719871</td>\n",
+       "      <td>0.639680</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4000</td>\n",
+       "      <td>1.459000</td>\n",
+       "      <td>1.369566</td>\n",
+       "      <td>0.688963</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6000</td>\n",
+       "      <td>1.173700</td>\n",
+       "      <td>1.213141</td>\n",
+       "      <td>0.717249</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>8000</td>\n",
+       "      <td>1.042000</td>\n",
+       "      <td>1.101419</td>\n",
+       "      <td>0.732908</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>10000</td>\n",
+       "      <td>0.843100</td>\n",
+       "      <td>1.032237</td>\n",
+       "      <td>0.750983</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>12000</td>\n",
+       "      <td>0.801200</td>\n",
+       "      <td>0.988939</td>\n",
+       "      <td>0.758668</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>14000</td>\n",
+       "      <td>0.731200</td>\n",
+       "      <td>0.949687</td>\n",
+       "      <td>0.772703</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>16000</td>\n",
+       "      <td>0.656100</td>\n",
+       "      <td>0.933845</td>\n",
+       "      <td>0.780496</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>18000</td>\n",
+       "      <td>0.613200</td>\n",
+       "      <td>0.907262</td>\n",
+       "      <td>0.787531</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>20000</td>\n",
+       "      <td>0.519500</td>\n",
+       "      <td>0.901089</td>\n",
+       "      <td>0.792943</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>22000</td>\n",
+       "      <td>0.501500</td>\n",
+       "      <td>0.892959</td>\n",
+       "      <td>0.795072</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "TrainOutput(global_step=23100, training_loss=1.0162131207949154, metrics={'train_runtime': 4454.3937, 'train_samples_per_second': 124.436, 'train_steps_per_second': 5.186, 'total_flos': 2.958796560013029e+16, 'train_loss': 1.0162131207949154, 'epoch': 5.0})"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "7c8d06d3-ef08-42ca-9dad-651c3a7c45fc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "with torch.no_grad():\n",
+    "    preds = trainer.predict(filtered['test'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "cab2f41e-d00f-41cb-a5a6-daf9e713077d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'test_loss': 0.8813542127609253,\n",
+       " 'test_accuracy': 0.8004249967474739,\n",
+       " 'test_runtime': 87.98,\n",
+       " 'test_samples_per_second': 524.188,\n",
+       " 'test_steps_per_second': 21.846}"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds.metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "1e323be4-ac78-498e-99fc-3133b11dc241",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d45b5b475cce4bb09298a7278ec51c64",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors:   0%|          | 0.00/270M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/samsaara/medical_condition_classification/commit/73d201a08dd78ce2afea66736b188d271e652052', commit_message='End of training', commit_description='', oid='73d201a08dd78ce2afea66736b188d271e652052', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trainer.push_to_hub()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "6e754047-4b6c-4b3c-80a1-8493009ac7ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9f853954de9b4fe5bcf797878413702f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "README.md:   0%|          | 0.00/2.11k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/samsaara/medical_condition_classification/commit/9a28b6773707f2b88e7eeb37bc811c642bb524c7', commit_message='tokenizer', commit_description='', oid='9a28b6773707f2b88e7eeb37bc811c642bb524c7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenizer.push_to_hub('medical_condition_classification', commit_message='tokenizer')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "2940e50a-328d-4f30-8cb7-30dd047d2f92",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "51c9d91e63664cba896e957841756995",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e3bb0c86842e427eba71257d113c0845",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Creating parquet from Arrow format:   0%|          | 0/111 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "73d3373d5da04164a237ff56dc57fac5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "df6679bdb3c342ffa2aab53922a3d1af",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "60772b393cdc4dca9191a538b9116169",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9b859655cbe34da184c8e6b6a07d0911",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/datasets/samsaara/medical_condition_classification/commit/7aea5155fcba521a02ec3e9e8fb4e86d09dc44ba', commit_message='Upload dataset', commit_description='', oid='7aea5155fcba521a02ec3e9e8fb4e86d09dc44ba', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenized_dataset.push_to_hub('medical_condition_classification')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "4bbf39bb-fea9-4b5b-8d87-7a9835975358",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "api = HfApi(token=access_token)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "16e0625b-5518-463a-96e2-2d008341b1f1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/samsaara/medical_condition_classification/commit/1067a267d69af46563d3a6b5a36d65030ccaa318', commit_message='update README', commit_description='', oid='1067a267d69af46563d3a6b5a36d65030ccaa318', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "api.upload_file(\n",
+    "    path_or_fileobj='./medical_condition_classification/README.md', \n",
+    "    path_in_repo='README.md',\n",
+    "    repo_id='samsaara/medical_condition_classification', \n",
+    "    commit_message='update README'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "031993dd-33b2-4c31-b199-3fe1c2fb88dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "api.delete_file('datasets.ipynb', 'samsaara/medical_condition_classification', commit_message='')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "4144a617-f0d6-41d3-9a44-90833b8bb1f5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/samsaara/medical_condition_classification/commit/f7008aa4e9f2c5d5fd4f87632cef56c86106a574', commit_message='update notebook', commit_description='', oid='f7008aa4e9f2c5d5fd4f87632cef56c86106a574', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "api.upload_file(\n",
+    "    path_or_fileobj='datasets.ipynb', \n",
+    "    path_in_repo='datasets.ipynb',\n",
+    "    repo_id='samsaara/medical_condition_classification', \n",
+    "    commit_message='update notebook'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2de298eb-82f2-482b-acee-36c6a1e630b8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}