Sponsor NER Model (Brand & URL Extractor)

This model is a fine-tuned BERT-based Named Entity Recognition (NER) model designed to extract sponsor brand names and URLs from YouTube video descriptions.

It was trained for a client project using annotated YouTube text data and is capable of identifying structured sponsor-related information in informal or semi-formal natural language.

Model Architecture

Base model: bert-base-cased
Task: Token Classification (NER)
Fine-tuned on: YouTube descriptions with custom sponsor tags
Entities extracted:
- B-Brand / I-Brand – Sponsor/brand name (label spelling as used in the `id2label` mapping below)
- B-URL / I-URL – Sponsor link

How to Use in Python

import torch
from transformers import BertTokenizerFast, BertForTokenClassification

# Fetch the fine-tuned weights and the matching tokenizer from the Hugging Face Hub
model_path = "Bhavya54/sponsor-ner-model"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
# Switch to evaluation mode right away; nn.Module.eval() returns the module itself.
model = BertForTokenClassification.from_pretrained(model_path).eval()

# Class index -> BIO tag, listed in index order (0..4).
# NOTE(review): presumably this order must match the label ids used when the
# model was fine-tuned — confirm against the model's config.
id2label = dict(enumerate(("B-Brand", "B-URL", "I-Brand", "I-URL", "O")))

# NER prediction function
def predict_ner(text, model, tokenizer, id2label):
    """Extract sponsor brand names and URLs from a single text string.

    The text is tokenized, the highest-scoring label is taken for every
    token, and consecutive ``B-<type>`` / ``I-<type>`` tokens are merged into
    entities. Entity text is sliced out of ``text`` using the tokenizer's
    character offsets instead of being re-assembled from WordPiece tokens,
    so the result keeps the original spelling: no spaces injected around
    punctuation, no leftover ``##`` subword markers, no placeholder tokens
    such as ``[UNK]``, and literal ``##`` inside a URL is preserved.

    Args:
        text: The description to analyse (one string, not a batch).
        model: Token-classification model. It is called as
            ``model(**encoded)`` and must return an object with a ``logits``
            attribute indexed as (batch, token, label).
        tokenizer: Tokenizer that supports ``return_offsets_mapping=True``
            and ``convert_ids_to_tokens``.
        id2label: Mapping from predicted class index to a label string such
            as ``"B-Brand"``, ``"I-URL"`` or ``"O"``.

    Returns:
        A ``(brands, urls)`` tuple of two lists of strings, in order of
        appearance. Brands have runs of whitespace collapsed to one space;
        URLs have all whitespace removed. Entities of any other type are
        ignored.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        return_offsets_mapping=True,
    )
    # Offsets are not a model input, so take them out before the forward pass.
    offsets = encoded.pop("offset_mapping")[0].tolist()

    with torch.no_grad():
        outputs = model(**encoded)

    predicted_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    labels = [id2label[label_id] for label_id in predicted_ids]

    special_tokens = {"[CLS]", "[SEP]", "[PAD]"}
    spans = []  # (entity type, char start, char end) into ``text``
    current_label = None
    span_start = span_end = 0

    for token, label, (start, end) in zip(tokens, labels, offsets):
        if token in special_tokens:
            continue

        if label.startswith("B-"):
            # A B- tag always opens a new entity, closing any open one.
            if current_label is not None:
                spans.append((current_label, span_start, span_end))
            current_label = label[2:]
            span_start, span_end = start, end
        elif label.startswith("I-") and current_label == label[2:]:
            # Continuation of the open entity: just extend its right edge.
            span_end = end
        else:
            # "O", or an I- tag that does not continue the open entity.
            if current_label is not None:
                spans.append((current_label, span_start, span_end))
            current_label = None

    if current_label is not None:
        spans.append((current_label, span_start, span_end))

    brands = []
    urls = []
    for label, start, end in spans:
        words = text[start:end].split()
        if not words:
            continue
        if label == "Brand":
            brands.append(" ".join(words))
        elif label == "URL":
            # URLs never contain whitespace; drop any the span swallowed.
            urls.append("".join(words))

    return brands, urls

# Demo: run the extractor on a sample description and show both result lists
text = (
    "Graduation season hits different when you’re glowing and protected — "
    "ZEELOOL’s got you covered: https://zeelool.com//ytb?type=ZJTA855"
)
brands, urls = predict_ner(text, model, tokenizer, id2label)

print(f"brands = {brands}")
print(f"urls = {urls}")