schuler
/

experimental-JP47D55C

Text Generation

Model card · Files and versions

schuler committed on Feb 12

Commit

8f57e4f

·

verified ·

1 Parent(s): cca4f76

Update README.md

Files changed (1) hide show

README.md +9 -3

README.md CHANGED Viewed

@@ -3,6 +3,8 @@ library_name: transformers
 license: mit
 datasets:
 - MBZUAI/LaMini-instruction
 ---
 # Saving 77% of the Parameters in Large Language Models Technical Report
 This repository contains experiment results for the [Saving 77% of the Parameters in Large Language Models Technical Report (PDF)](https://www.researchgate.net/publication/388835829_SAVING_77_OF_THE_PARAMETERS_IN_LARGE_LANGUAGE_MODELS_TECHNICAL_REPORT).
@@ -36,6 +38,10 @@ The following table shows LaMini training results with the baseline and the opti
 ## Usage:
 ```
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline
 from transformers import LlamaTokenizer
 import torch
@@ -45,8 +51,8 @@ REPO_NAME = 'schuler/experimental-JP47D55C'
 def load_model(local_repo_name):
     tokenizer = LlamaTokenizer.from_pretrained(local_repo_name, trust_remote_code=True)
     generator_conf = GenerationConfig.from_pretrained(local_repo_name)
-    model = AutoModelForCausalLM.from_pretrained(local_repo_name, trust_remote_code=True, torch_dtype=torch.bfloat16, attn_implementation="eager")
-    # model.to('cuda')
     return tokenizer, generator_conf, model
 tokenizer, generator_conf, model = load_model(REPO_NAME)
@@ -57,7 +63,7 @@ except Exception as e:
   global_error =  f"Failed to load model: {str(e)}"
 def PrintTest(str):
-  print(generator(str, max_new_tokens=256, do_sample=True, top_p=0.25, repetition_penalty=1.2))
 PrintTest(f"<|user|>\nHello\n<|end|>\n<|assistant|>\n")
 PrintTest(f"<|user|>Hello\n<|end|><|assistant|>")

 license: mit
 datasets:
 - MBZUAI/LaMini-instruction
+language:
+- en
 ---
 # Saving 77% of the Parameters in Large Language Models Technical Report
 This repository contains experiment results for the [Saving 77% of the Parameters in Large Language Models Technical Report (PDF)](https://www.researchgate.net/publication/388835829_SAVING_77_OF_THE_PARAMETERS_IN_LARGE_LANGUAGE_MODELS_TECHNICAL_REPORT).
 ## Usage:
 ```
+!pip install -q -U transformers
+!pip install -q -U accelerate
+!pip install -q -U flash-attn --no-build-isolation
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline
 from transformers import LlamaTokenizer
 import torch
 def load_model(local_repo_name):
     tokenizer = LlamaTokenizer.from_pretrained(local_repo_name, trust_remote_code=True)
     generator_conf = GenerationConfig.from_pretrained(local_repo_name)
+    model = AutoModelForCausalLM.from_pretrained(local_repo_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16)
+    model.to('cuda')
     return tokenizer, generator_conf, model
 tokenizer, generator_conf, model = load_model(REPO_NAME)
   global_error =  f"Failed to load model: {str(e)}"
 def PrintTest(str):
+  print(generator(str, max_new_tokens=256, do_sample=True, top_p=0.5, repetition_penalty=1.2))
 PrintTest(f"<|user|>\nHello\n<|end|>\n<|assistant|>\n")
 PrintTest(f"<|user|>Hello\n<|end|><|assistant|>")