diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fb962c121c1c0d151cbfa00da825eab804a87cdd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,328 @@
+---
+license: other
+license_name: nvidia-open-model-license
+license_link: >-
+  https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf
+---
+
+# NOTICE: PLEASE READ. NO INFERENCE. (YET)
+
+**This has no support for inference, yet.** All I've done is move the weights out of NVIDIA's NeMo architecture so people smarter than me can get a head start on making it work with other backends.
+
+## Nemotron-4-340B-Instruct
+
+[![Model architecture](https://img.shields.io/badge/Model%20Arch-Transformer%20Decoder-green)](#model-architecture)[![Model size](https://img.shields.io/badge/Params-340B-green)](#model-architecture)[![Language](https://img.shields.io/badge/Language-Multilingual-green)](#datasets)
+
+### Model Overview
+
+Nemotron-4-340B-Instruct is a large language model (LLM) that can be used as part of a synthetic data generation pipeline to create training data that helps researchers and developers build their own LLMs. It is a fine-tuned version of the Nemotron-4-340B-Base model, optimized for English-based single- and multi-turn chat use cases. It supports a context length of 4,096 tokens.
+
+The base model was pre-trained on a corpus of 9 trillion tokens consisting of a diverse assortment of English-based texts, 50+ natural languages, and 40+ coding languages. Subsequently, the Nemotron-4-340B-Instruct model went through additional alignment steps, including:
+
+- Supervised Fine-tuning (SFT)
+- Direct Preference Optimization (DPO)
+- Reward-aware Preference Optimization (RPO) ([an additional in-house alignment technique](https://research.nvidia.com/publication/2024-06_nemotron-4-340b))
+
+Throughout the alignment process, we relied on only approximately 20K human-annotated samples, while our data generation pipeline synthesized over 98% of the data used for supervised fine-tuning and preference fine-tuning (DPO & RPO). We provide comprehensive details about our synthetic data generation pipeline in the [technical report](https://research.nvidia.com/publication/2024-06_nemotron-4-340b).
+
+The result is a model that is aligned with human chat preferences, shows improvements in mathematical reasoning, coding, and instruction-following, and is capable of generating high-quality synthetic data for a variety of use cases.
+
+Under the NVIDIA Open Model License, NVIDIA confirms:
+- Models are commercially usable.
+- You are free to create and distribute Derivative Models.
+- NVIDIA does not claim ownership of any outputs generated using the Models or Derivative Models.
+
+### License:
+
+[NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf)
+
+### Intended use
+
+Nemotron-4-340B-Instruct is a chat model intended for English-language use.
+
+Nemotron-4-340B-Instruct is designed for Synthetic Data Generation, enabling developers and enterprises to build and customize their own large language models and LLM applications.
+
+The instruct model itself can be further customized using the [NeMo Framework](https://docs.nvidia.com/nemo-framework/index.html) suite of customization tools, including Parameter-Efficient Fine-Tuning (P-tuning, Adapters, LoRA, and more) and Model Alignment (SFT, SteerLM, RLHF, and more) using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner).
+
+**Model Developer:** NVIDIA
+
+**Model Dates:** Nemotron-4-340B-Instruct was trained between December 2023 and May 2024.
+
+**Data Freshness:** The pretraining data has a cutoff of June 2023.
+
+### Required Hardware
+
+BF16 Inference:
+- 8x H200 (1x H200 node)
+- 16x H100 (2x H100 nodes)
+- 16x A100 80GB (2x A100 80GB nodes)
+
+
+### Model Architecture:
+
+Nemotron-4-340B-Instruct is a standard decoder-only Transformer, trained with a sequence length of 4096 tokens, that uses Grouped-Query Attention (GQA) and Rotary Position Embeddings (RoPE).
+
+**Architecture Type:** Transformer Decoder (auto-regressive language model)
+
+**Network Architecture:**
+Nemotron-4
+
+### Prompt Format
+
+Note: For Nemotron-4-340B-Instruct we recommend keeping the system prompt empty.
+
+#### Single Turn
+
+```text
+<extra_id_0>System
+
+<extra_id_1>User
+{prompt}
+<extra_id_1>Assistant
+```
+
+#### Multi-Turn or Few-shot
+
+```text
+<extra_id_0>System
+
+<extra_id_1>User
+{prompt 1}
+<extra_id_1>Assistant
+{response 1}
+<extra_id_1>User
+{prompt 2}
+<extra_id_1>Assistant
+{response 2}
+...
+<extra_id_1>User
+{prompt N}
+<extra_id_1>Assistant
+```
+
+An example of a formattable prompt template is available in the following section.
+
+### Usage
+
+Deployment and inference with Nemotron-4-340B-Instruct can be done in three steps using NeMo Framework:
+
+- Create a Python script to interact with the deployed model.
+- Create a Bash script to start the inference server.
+- Schedule a Slurm job to distribute the model across 2 nodes and associate them with the inference server.
+
+1. Define the Python script ``call_server.py``
+
+```python
+import json
+import requests
+
+headers = {"Content-Type": "application/json"}
+
+def text_generation(data, ip='localhost', port=None):
+    # Send the generation request to the NeMo inference server's /generate endpoint.
+    resp = requests.put(f'http://{ip}:{port}/generate', data=json.dumps(data), headers=headers)
+    return resp.json()
+
+
+def get_generation(prompt, greedy, add_BOS, token_to_gen, min_tokens, temp, top_p, top_k, repetition, batch=False):
+    # Build the request payload for a single prompt (or a batch of prompts).
+    data = {
+        "sentences": [prompt] if not batch else prompt,
+        "tokens_to_generate": int(token_to_gen),
+        "temperature": temp,
+        "add_BOS": add_BOS,
+        "top_k": top_k,
+        "top_p": top_p,
+        "greedy": greedy,
+        "all_probs": False,
+        "repetition_penalty": repetition,
+        "min_tokens_to_generate": int(min_tokens),
+        "end_strings": ["<|endoftext|>", "<extra_id_1>", "\x11", "User"],
+    }
+    sentences = text_generation(data, port=1424)['sentences']
+    return sentences[0] if not batch else sentences
+
+PROMPT_TEMPLATE = """<extra_id_0>System
+
+<extra_id_1>User
+{prompt}
+<extra_id_1>Assistant
+"""
+
+question = "Write a poem on NVIDIA in the style of Shakespeare"
+prompt = PROMPT_TEMPLATE.format(prompt=question)
+print(prompt)
+
+response = get_generation(prompt, greedy=True, add_BOS=False, token_to_gen=1024, min_tokens=1, temp=1.0, top_p=1.0, top_k=0, repetition=1.0, batch=False)
+response = response[len(prompt):]
+if response.endswith("<extra_id_1>"):
+    response = response[:-len("<extra_id_1>")]
+print(response)
+```
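+The single-turn ``PROMPT_TEMPLATE`` above extends naturally to the multi-turn format documented earlier. The helper below is a minimal sketch and is not part of the original ``call_server.py``; the ``build_multi_turn_prompt`` name, its arguments, and the example history are illustrative assumptions.
+
+```python
+def build_multi_turn_prompt(turns, system_prompt=""):
+    # Hypothetical helper (illustrative only): assemble a prompt string following the
+    # documented multi-turn format. `turns` is a list of (user_message, assistant_response)
+    # pairs; pass None as the last response to leave the final Assistant turn open.
+    prompt = "<extra_id_0>System\n"
+    if system_prompt:
+        prompt += f"{system_prompt}\n"
+    prompt += "\n"
+    for user_msg, assistant_msg in turns:
+        prompt += f"<extra_id_1>User\n{user_msg}\n<extra_id_1>Assistant\n"
+        if assistant_msg is not None:
+            prompt += f"{assistant_msg}\n"
+    return prompt
+
+# Example: one completed exchange plus a follow-up question; the resulting string can be
+# passed to get_generation() exactly like the single-turn prompt above.
+history = [
+    ("Write a poem on NVIDIA in the style of Shakespeare", "When chips do hum..."),
+    ("Now summarize that poem in one line.", None),
+]
+multi_turn_prompt = build_multi_turn_prompt(history)
+print(multi_turn_prompt)
+```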
+2. Given this Python script, create a Bash script which spins up the inference server within the [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) (```docker pull nvcr.io/nvidia/nemo:24.01.framework```) and calls the Python script ``call_server.py``. The Bash script ``nemo_inference.sh`` is as follows:
+
+```bash
+NEMO_FILE=$1
+WEB_PORT=1424
+
+depends_on () {
+    HOST=$1
+    PORT=$2
+    STATUS=$(curl -X PUT http://$HOST:$PORT >/dev/null 2>/dev/null; echo $?)
+    while [ $STATUS -ne 0 ]
+    do
+        echo "waiting for server ($HOST:$PORT) to be up"
+        sleep 10
+        STATUS=$(curl -X PUT http://$HOST:$PORT >/dev/null 2>/dev/null; echo $?)
+    done
+    echo "server ($HOST:$PORT) is up and running"
+}
+
+
+/usr/bin/python3 /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_eval.py \
+        gpt_model_file=$NEMO_FILE \
+        pipeline_model_parallel_split_rank=0 \
+        server=True tensor_model_parallel_size=8 \
+        trainer.precision=bf16 pipeline_model_parallel_size=2 \
+        trainer.devices=8 \
+        trainer.num_nodes=2 \
+        web_server=False \
+        port=${WEB_PORT} &
+SERVER_PID=$!
+
+readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
+# Only the first rank on the first node drives the client script, then shuts the server down.
+if [ $SLURM_NODEID -eq 0 ] && [ $local_rank -eq 0 ]; then
+    depends_on "0.0.0.0" ${WEB_PORT}
+
+    echo "start get json"
+    sleep 5
+
+    echo "SLURM_NODEID: $SLURM_NODEID"
+    echo "local_rank: $local_rank"
+    /usr/bin/python3 /scripts/call_server.py
+    echo "clean up daemons: $$"
+    kill -9 $SERVER_PID
+    pkill python
+fi
+wait
+```
+
+
+3. Launch ``nemo_inference.sh`` with a Slurm script defined as below, which starts a 2-node job for model inference.
+
+```
+#!/bin/bash
+#SBATCH -A SLURM-ACCOUNT
+#SBATCH -p SLURM-PARTITION
+#SBATCH -N 2
+#SBATCH -J generation
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+set -x
+
+RESULTS=
+OUTFILE="${RESULTS}/slurm-%j-%n.out"
+ERRFILE="${RESULTS}/error-%j-%n.out"
+MODEL=/Nemotron-4-340B-Instruct
+CONTAINER="nvcr.io/nvidia/nemo:24.01.framework"
+MOUNTS="--container-mounts=:/scripts,MODEL:/model"
+
+read -r -d '' cmd <<EOF
+bash /scripts/nemo_inference.sh /model
+EOF
+
+srun -o $OUTFILE -e $ERRFILE --container-image="$CONTAINER" $MOUNTS bash -c "${cmd}"
+```
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..a093afc1a84460c37d9af55bb7e8a056851d6c29
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf6ac4271cd566ddd5cb6e1675a2457bacce5e683b30b35f9668377b36ca2628
+size 21919422
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f59f15ef480afcbcb6f981ebdda6e95a1eaf7e85
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,6 @@
+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "model_max_length": 4096,
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}
\ No newline at end of file
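Since the patch adds ``tokenizer.json``, ``tokenizer_config.json``, and ``special_tokens_map.json``, the converted tokenizer should be usable on its own even before inference support lands. The snippet below is a minimal sketch under that assumption; loading from a local checkout and the example prompt are illustrative, not part of the repository's scripts.

```python
from transformers import AutoTokenizer

# Load the PreTrainedTokenizerFast declared in tokenizer_config.json from a local
# checkout of this repo (replace "." with the Hub repo id if loading remotely).
tok = AutoTokenizer.from_pretrained(".")

# Tokenize a single-turn prompt in the documented format and round-trip it.
prompt = "<extra_id_0>System\n\n<extra_id_1>User\nWrite a poem on NVIDIA in the style of Shakespeare\n<extra_id_1>Assistant\n"
ids = tok(prompt, add_special_tokens=False)["input_ids"]
print(len(ids))
print(tok.decode(ids))
```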