Uploaded model

  • Developed by: AlberBshara
  • License: apache-2.0
  • Finetuned from model: llama-3-8b-Instruct-bnb-4bit

You can use this model for question-answering (QA) tasks. Its context window is 8K tokens.
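
Because the context window is 8K tokens, the combined context and question should fit within that limit before you call the model. Below is a minimal length-check sketch; the 8192-token figure follows from the 8K window above, and the environment variable and placeholder strings are assumptions for illustration only:

import os
from transformers import AutoTokenizer

# Load only the tokenizer to estimate the prompt length before generation.
tokenizer = AutoTokenizer.from_pretrained(
    "AlberBshara/scholara_QA",
    token=os.environ.get("HUGGING_FACE_API_TOKEN"),
)

context = "..."   # your retrieved passage(s)
question = "..."  # your question
prompt = f"Answer the following question, use the given context.\nContext: [{context}]\nQuestion: [{question}]"

n_tokens = len(tokenizer(prompt)["input_ids"])
assert n_tokens <= 8192, f"Prompt is {n_tokens} tokens, which exceeds the 8K context window."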

How to Use it:

Install Unsloth, Xformers (Flash Attention), and the other required packages:

%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install triton

import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from unsloth.chat_templates import get_chat_template
from typing import Tuple, Dict, Any
import torch

class LLM:
    def __init__(self, load_in_4bit: bool = True,
                 load_cpu_mem_usage: bool = True,
                 hf_model_path: str = "AlberBshara/scholara_QA"):
        """
        Args:
            load_in_4bit (bool): Use 4-bit quantization. Defaults to True.
            load_cpu_mem_usage (bool): Reduce CPU memory usage. Defaults to True.
            hf_model_path (str): The path of your model on HuggingFace-Hub like "your-user-name/model-name".
        """
        assert torch.cuda.is_available(), "CUDA is not available. An NVIDIA GPU is required."
        # Read the Hugging Face access token from the environment (set HUGGING_FACE_API_TOKEN beforehand).
        hf_auth_token = os.environ.get("HUGGING_FACE_API_TOKEN")
        # Specify the quantization config
        self._bnb_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit)

        # Load model directly with quantization config
        self._model = AutoModelForCausalLM.from_pretrained(
            hf_model_path,
            low_cpu_mem_usage=load_cpu_mem_usage,
            quantization_config=self._bnb_config,
            token=hf_auth_token  # `token` replaces the deprecated `use_auth_token` argument
        )

        # Load the tokenizer
        self._tokenizer = AutoTokenizer.from_pretrained(
            hf_model_path,
            token=hf_auth_token
        )
        self._tokenizer = get_chat_template(
            self._tokenizer,
            chat_template="llama-3",
            mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"}, # ShareGPT style
        )

        self._hf_model_path = hf_model_path
        self._EOS_TOKEN_ID = self._tokenizer.eos_token_id

        self._prompt = lambda context, question: f"""
        Answer the following question, use the given context.
        Context: [{context}]
        Question: [{question}]
        """

    def invoke(self, context: str, question: str) -> Tuple:
        if question is None or not question.strip():
            raise ValueError("question cannot be empty or None")

        if context is None or not context.strip():
            raise ValueError("context cannot be empty or None")

        inputs = self._prompt(context, question)

        messages = [{"from": "human", "value": inputs}]
        inputs = self._tokenizer.apply_chat_template(
              messages,
              tokenize=True,
              add_generation_prompt=True, # Must add for generation
              return_tensors="pt",
        ).to("cuda")

        # Increase the max_new_tokens to allow more detailed responses
        output_ids = self._model.generate(inputs, max_new_tokens=2048, pad_token_id=self._EOS_TOKEN_ID)
        output_ids = output_ids.tolist()[0] if output_ids.size(0) == 1 else output_ids.tolist()

        output_text = self._tokenizer.decode(output_ids, skip_special_tokens=True)

        # free GPU Mem.
        del inputs
        torch.cuda.empty_cache()

        return output_text, output_ids, None

    def extract_answer(self, response: str) -> str:
        # The decoded output contains the prompt followed by this marker;
        # the generated answer begins right after it.
        start_with: str = ".assistant"
        start_index = response.find(start_with)

        # If the word is found, extract the substring from that point onward
        if start_index != -1:
            # Move start_index to the end of the word
            start_index += len(start_with)
            return response[start_index:]
        else:
            return response

    def get_metadata(self) -> Dict[str, Any]:
        return {
            "class_name": self.__class__.__name__,
            "init_params": {
                "load_in_4bit": True,
                "load_cpu_mem_usage": True,
                "hf_model_path": "AlberBshara/scholara_QA",
                "hf_auth_token": "--%$%--"
            },
            "methods": ["invoke", "extract_answer"]
        }


test_llm = LLM()
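
A minimal usage sketch with the instance created above (the context and question strings are placeholder examples, not taken from the model's training data):

# Example call with placeholder inputs; replace them with your own context and question.
context = "The Eiffel Tower was completed in 1889 and is located in Paris, France."
question = "When was the Eiffel Tower completed?"

response, output_ids, _ = test_llm.invoke(context, question)
answer = test_llm.extract_answer(response)
print(answer)

Because the decoded text returned by invoke still contains the prompt, extract_answer strips everything up to the assistant marker so that only the generated answer remains.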