In [1]:
import torch, html
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk
from huggingface_hub import notebook_login
from dotenv import load_dotenv
import os

In [3]:
# Read a local .env file into the process environment so that HF_TOKEN
# can be looked up in os.environ by the following cell.
load_dotenv()

True

In [4]:
# Hugging Face access token taken from the environment (never hard-coded);
# raises KeyError if HF_TOKEN is not set.
access_token = os.environ['HF_TOKEN']

In [4]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [5]:
# Load the raw train/test review files (tab-separated) through the generic
# CSV loader, using 8 worker processes.
dataset = load_dataset(
    'csv',
    data_files=dict(
        train='data/drugsComTrain_raw.tsv',
        test='data/drugsComTest_raw.tsv',
    ),
    delimiter='\t',
    num_proc=8,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [6]:
# (rows, columns) of each split.
dataset.shape

{'train': (161297, 7), 'test': (53766, 7)}

In [7]:
# Peek at the first raw training record to see the column layout.
dataset['train'][0]

{'Unnamed: 0': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

In [8]:
# Clean-up pipeline applied to every split. The steps are order-dependent:
# rows with a missing condition must be dropped before `.lower()` is called
# on the condition, and `review_length` must exist before it is filtered on.
dataset = (
    dataset
    # Drop rows whose condition is missing.
    .filter(lambda x: x['condition'] is not None)
    # Give the unnamed index column a usable name.
    .rename_column('Unnamed: 0', 'row_id')
    # Normalise condition names to lower case.
    .map(lambda x: {'condition': [row.lower() for row in x['condition']]}, batched=True, num_proc=8, batch_size=3000)
    # Decode HTML entities in the review text.
    .map(lambda x: {'review': [html.unescape(row) for row in x['review']]}, batched=True, num_proc=8, batch_size=3000)
    # Whitespace-separated word count of each review.
    .map(lambda x: {'review_length': [len(row.split()) for row in x['review']]}, batched=True, num_proc=8, batch_size=3000)
    # Keep only reviews longer than 30 words; the predicate is evaluated per
    # row (the filter is not run in batched mode).
    .filter(lambda x: x['review_length'] > 30, num_proc=8, batch_size=3000)
)
dataset

DatasetDict({
    train: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

# Exercises

## Predict patient condition based on drug review

In [6]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AutoModel, DataCollatorWithPadding
from torch.utils.data import DataLoader
import evaluate, numpy as np
from huggingface_hub import HfApi

In [10]:
def clean_condition_column(rows):
    """Replace bogus condition values in a batch with ``'unknown'``.

    A condition is treated as bogus when it contains the text
    ``'users found this comment helpful'`` (presumably page text that leaked
    into the column - verify against the raw data). All other values are
    passed through unchanged.

    Args:
        rows: A batch mapping with a ``'condition'`` list of strings.

    Returns:
        A dict with a single ``'condition'`` key holding the cleaned list,
        in the same order as the input.
    """
    marker = 'users found this comment helpful'
    cleaned = []
    for label in rows['condition']:
        if marker in label:
            cleaned.append('unknown')
        else:
            cleaned.append(label)
    return {'condition': cleaned}

In [11]:
# Apply the condition clean-up to every split in batches of 3000 rows,
# using 8 worker processes. Row counts are unchanged; only values are rewritten.
dataset = dataset.map(clean_condition_column, batched=True, batch_size=3000, num_proc=8)
dataset

DatasetDict({
    train: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [12]:
# Hold out 20% of the original training split (fixed seed for a repeatable
# split). train_test_split names the held-out part 'test', so it is renamed
# to 'validation' before the real test split is attached.
clean_data = dataset['train'].train_test_split(test_size=.2, seed=5, writer_batch_size=3000)
clean_data['validation'] = clean_data.pop('test')
clean_data['test'] = dataset['test']

clean_data

DatasetDict({
    train: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [13]:
# Label vocabulary: every distinct condition seen in train or validation,
# sorted so that label ids are stable between runs. The test split is
# deliberately not consulted here.
all_conditions = sorted({
    *clean_data['train']['condition'],
    *clean_data['validation']['condition'],
})
len(all_conditions)

751

In [14]:
# Two-way mapping between integer class ids and condition names.
id2label = {index: name for index, name in enumerate(all_conditions)}
label2id = {name: index for index, name in id2label.items()}

In [15]:
# Sanity check: equal sizes mean no two ids collapsed onto the same label name.
len(label2id) == len(id2label)

True

In [16]:
def _encode_conditions(batch):
    """Map each condition name in the batch to its integer class id.

    Names that are not in ``label2id`` fall back to the id of ``'unknown'``.
    """
    return {
        'labels': [
            label2id.get(condition, label2id['unknown'])
            for condition in batch['condition']
        ]
    }


# Add an integer 'labels' column to every split.
clean_data = clean_data.map(_encode_conditions, batched=True, batch_size=3000, num_proc=8)
clean_data

DatasetDict({
    train: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels'],
        num_rows: 46108
    })
})

In [17]:
# Start from the base DistilBERT checkpoint; the classification head is newly
# initialised and sized to the number of conditions.
checkpoint = 'distilbert-base-uncased'
# Pass the label mappings at load time so the config (and therefore any saved
# or pushed checkpoint) carries the real condition names from the start,
# instead of relying on a later cell to patch them in.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Store the id <-> condition-name mappings on the model config so that they
# travel with the saved checkpoint.
model.config.id2label = id2label
model.config.label2id = label2id
# NOTE(review): this sets `num_labels` on the model object itself, not on
# model.config - confirm that is intended.
model.num_labels = len(label2id)

In [19]:
# Batch collator built from the tokenizer; presumably pads each batch to its
# longest sequence at collate time - see the DataCollatorWithPadding docs.
collator = DataCollatorWithPadding(tokenizer)

In [20]:
def tokenize_and_split(examples):
    """Tokenize a batch of reviews, splitting over-long ones into chunks.

    Reviews are truncated to 512 tokens and the overflow is returned as
    additional rows, so the output can hold more rows than the input. Every
    input column is repeated for each chunk, so each chunk keeps the values
    (including the label) of the review it came from.

    Args:
        examples: A batch mapping of column name -> list of values; must
            contain a ``"review"`` column.

    Returns:
        The tokenizer output with all original columns added, aligned
        row-for-row with the produced chunks.
    """
    encoded = tokenizer(
        examples["review"],
        max_length=512,
        truncation=True,
        return_overflowing_tokens=True,
    )
    # For each produced chunk, the index of the input row it came from.
    source_rows = encoded.pop('overflow_to_sample_mapping')
    for column, column_values in examples.items():
        encoded[column] = [column_values[row] for row in source_rows]
    return encoded

In [21]:
# Tokenize every split. Row counts can grow because tokenize_and_split emits
# extra rows for reviews that overflow the 512-token limit.
tokenized_dataset = clean_data.map(tokenize_and_split, batched=True, batch_size=3000, num_proc=8)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 110857
    })
    validation: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 27717
    })
    test: Dataset({
        features: ['row_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 46118
    })
})

In [None]:
# Cache the tokenized splits under data/drugs so that tokenization can be
# skipped in a later session.
tokenized_dataset.save_to_disk('data/drugs', num_proc=4)

In [None]:
# Reload the tokenized splits from the on-disk cache at data/drugs/.
tokenized_dataset = load_from_disk('data/drugs/')
tokenized_dataset

In [22]:
# Keep only the three columns used for training; the remaining text and
# metadata columns stay available on `tokenized_dataset`.
filtered = tokenized_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
filtered

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 110857
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 27717
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 46118
    })
})

In [23]:
# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load('accuracy')

In [24]:
def compute_metrics(eval_preds):
    """Compute accuracy from an evaluation result.

    Args:
        eval_preds: A pair ``(logits, labels)``; the predicted class for each
            row is the index of its largest logit along the last axis.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes
        measured against the reference labels.
    """
    scores, gold_labels = eval_preds
    predicted_labels = np.argmax(scores, axis=-1)
    return accuracy.compute(references=gold_labels, predictions=predicted_labels)

In [25]:
# Learning rate for fine-tuning.
lr = 3e-5

In [26]:
# Fine-tuning configuration. The output directory name is also the name of
# the Hub repository that the run is pushed to.
train_args = TrainingArguments(
    output_dir='medical_condition_classification',
    overwrite_output_dir=True,
    # Evaluate on the validation split every 2000 optimisation steps.
    eval_strategy='steps',
    eval_steps=2000,
    # Batch sizes per device.
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    # Optimisation: mixed precision, five epochs.
    fp16=True,
    num_train_epochs=5,
    learning_rate=lr,
    # Hub upload, authenticated with the token read from the environment.
    push_to_hub=True,
    hub_token=access_token,
)

In [27]:
# Positional arguments are, in order: model, training arguments, data
# collator, train dataset, eval dataset, tokenizer.
# NOTE(review): passing these positionally is fragile - the call silently
# depends on the Trainer constructor's parameter order (and the tokenizer
# parameter has been renamed across transformers releases); confirm against
# the installed version before switching to keywords.
trainer = Trainer(model, train_args, collator, filtered['train'], filtered['validation'], tokenizer, compute_metrics=compute_metrics)

In [28]:
# Run fine-tuning; validation loss and accuracy are reported every 2000 steps.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
2000,1.8625,1.719871,0.63968
4000,1.459,1.369566,0.688963
6000,1.1737,1.213141,0.717249
8000,1.042,1.101419,0.732908
10000,0.8431,1.032237,0.750983
12000,0.8012,0.988939,0.758668
14000,0.7312,0.949687,0.772703
16000,0.6561,0.933845,0.780496
18000,0.6132,0.907262,0.787531
20000,0.5195,0.901089,0.792943


TrainOutput(global_step=23100, training_loss=1.0162131207949154, metrics={'train_runtime': 4454.3937, 'train_samples_per_second': 124.436, 'train_steps_per_second': 5.186, 'total_flos': 2.958796560013029e+16, 'train_loss': 1.0162131207949154, 'epoch': 5.0})

In [29]:
# Score the held-out test split with the fine-tuned model.
# NOTE(review): the explicit no_grad() is probably redundant - Trainer.predict
# is expected to disable gradients itself; confirm and drop the wrapper.
with torch.no_grad():
    preds = trainer.predict(filtered['test'])

In [33]:
# Test-set loss, accuracy and throughput. The test split includes the extra
# overflow chunks, so accuracy is per tokenized chunk rather than per review.
preds.metrics

{'test_loss': 0.8813542127609253,
 'test_accuracy': 0.8004249967474739,
 'test_runtime': 87.98,
 'test_samples_per_second': 524.188,
 'test_steps_per_second': 21.846}

In [34]:
# Upload the trained model to the Hub repository configured in train_args.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/270M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/samsaara/medical_condition_classification/commit/73d201a08dd78ce2afea66736b188d271e652052', commit_message='End of training', commit_description='', oid='73d201a08dd78ce2afea66736b188d271e652052', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)

In [36]:
# Upload the tokenizer files to the same model repository, as a separate commit.
tokenizer.push_to_hub('medical_condition_classification', commit_message='tokenizer')

README.md:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/samsaara/medical_condition_classification/commit/9a28b6773707f2b88e7eeb37bc811c642bb524c7', commit_message='tokenizer', commit_description='', oid='9a28b6773707f2b88e7eeb37bc811c642bb524c7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)

In [40]:
# Publish the tokenized splits (all columns, not the `filtered` view) as a
# dataset repository with the same name as the model repository.
tokenized_dataset.push_to_hub('medical_condition_classification')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/111 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/samsaara/medical_condition_classification/commit/7aea5155fcba521a02ec3e9e8fb4e86d09dc44ba', commit_message='Upload dataset', commit_description='', oid='7aea5155fcba521a02ec3e9e8fb4e86d09dc44ba', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)

In [7]:
# Hub API client authenticated with the token read from the environment;
# used below for file-level uploads and deletions.
api = HfApi(token=access_token)

In [48]:
# Replace the model card in the model repository with the locally edited
# README from the training output directory.
api.upload_file(
    repo_id='samsaara/medical_condition_classification',
    path_or_fileobj='./medical_condition_classification/README.md',
    path_in_repo='README.md',
    commit_message='update README',
)

CommitInfo(commit_url='https://huggingface.co/samsaara/medical_condition_classification/commit/1067a267d69af46563d3a6b5a36d65030ccaa318', commit_message='update README', commit_description='', oid='1067a267d69af46563d3a6b5a36d65030ccaa318', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)

In [None]:
# Remove the previously uploaded notebook from the model repository.
# The commit message must be non-empty: the Hub commit API rejects an empty
# message with a ValueError, so a descriptive one is supplied here.
api.delete_file(
    path_in_repo='datasets.ipynb',
    repo_id='samsaara/medical_condition_classification',
    commit_message='remove notebook',
)

In [49]:
# Upload this notebook next to the model so the training procedure is
# published together with the checkpoint.
api.upload_file(
    repo_id='samsaara/medical_condition_classification',
    path_or_fileobj='datasets.ipynb',
    path_in_repo='datasets.ipynb',
    commit_message='update notebook',
)

CommitInfo(commit_url='https://huggingface.co/samsaara/medical_condition_classification/commit/f7008aa4e9f2c5d5fd4f87632cef56c86106a574', commit_message='update notebook', commit_description='', oid='f7008aa4e9f2c5d5fd4f87632cef56c86106a574', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samsaara/medical_condition_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='samsaara/medical_condition_classification'), pr_revision=None, pr_num=None)