Upload 4 files
api-examples/api-example-chat-stream.py
ADDED
@@ -0,0 +1,101 @@
import asyncio
import json
import sys

try:
    import websockets
except ImportError:
    print("Websockets package not found. Make sure it's installed.")

# For local streaming, the websockets are hosted without ssl - ws://
HOST = 'localhost:5005'
URI = f'ws://{HOST}/api/v1/chat-stream'

# For reverse-proxied streaming, the remote will likely host with ssl - wss://
# URI = 'wss://your-uri-here.trycloudflare.com/api/v1/chat-stream'


async def run(user_input, history):
    # Note: the selected defaults change from time to time.
    request = {
        'user_input': user_input,
        'max_new_tokens': 250,
        'history': history,
        'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
        'character': 'Example',
        'instruction_template': 'Vicuna-v1.1',  # Will get autodetected if unset
        # 'context_instruct': '',  # Optional
        'your_name': 'You',

        'regenerate': False,
        '_continue': False,
        'stop_at_newline': False,
        'chat_generation_attempts': 1,
        'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',

        # Generation params. If 'preset' is set to different than 'None', the values
        # in presets/preset-name.yaml are used instead of the individual numbers.
        'preset': 'None',
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.1,
        'typical_p': 1,
        'epsilon_cutoff': 0,  # In units of 1e-4
        'eta_cutoff': 0,  # In units of 1e-4
        'tfs': 1,
        'top_a': 0,
        'repetition_penalty': 1.18,
        'repetition_penalty_range': 0,
        'top_k': 40,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'mirostat_mode': 0,
        'mirostat_tau': 5,
        'mirostat_eta': 0.1,

        'seed': -1,
        'add_bos_token': True,
        'truncation_length': 2048,
        'ban_eos_token': False,
        'skip_special_tokens': True,
        'stopping_strings': []
    }

    async with websockets.connect(URI, ping_interval=None) as websocket:
        await websocket.send(json.dumps(request))

        while True:
            incoming_data = await websocket.recv()
            incoming_data = json.loads(incoming_data)

            match incoming_data['event']:
                case 'text_stream':
                    yield incoming_data['history']
                case 'stream_end':
                    return


async def print_response_stream(user_input, history):
    cur_len = 0
    async for new_history in run(user_input, history):
        cur_message = new_history['visible'][-1][1][cur_len:]
        cur_len += len(cur_message)
        print(cur_message, end='')
        sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.


if __name__ == '__main__':
    user_input = "Please give me a step-by-step guide on how to plant a tree in my backyard."

    # Basic example
    history = {'internal': [], 'visible': []}

    # "Continue" example. Make sure to set '_continue' to True above
    # arr = [user_input, 'Surely, here is']
    # history = {'internal': [arr], 'visible': [arr]}

    asyncio.run(print_response_stream(user_input, history))
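Note: the chat endpoint sends the full updated history object with every 'text_stream' event, so a multi-turn conversation just feeds the last received history back into the next request. A minimal sketch built on the run generator above (chat_turn is a hypothetical helper, not part of this upload; the match statement in these examples requires Python 3.10+):

# Hypothetical helper: run one chat turn and keep only the final history,
# so it can be passed back in as the starting history of the next turn.
async def chat_turn(user_input, history):
    new_history = history
    async for new_history in run(user_input, history):
        pass  # every event carries the complete history; the last one wins
    return new_history

# Usage sketch:
# history = {'internal': [], 'visible': []}
# history = asyncio.run(chat_turn('Hi there!', history))
# history = asyncio.run(chat_turn('Tell me more.', history))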
api-examples/api-example-model.py
ADDED
@@ -0,0 +1,176 @@
#!/usr/bin/env python3

import requests

HOST = '0.0.0.0:5000'


def generate(prompt, tokens=200):
    request = {'prompt': prompt, 'max_new_tokens': tokens}
    response = requests.post(f'http://{HOST}/api/v1/generate', json=request)

    if response.status_code == 200:
        return response.json()['results'][0]['text']


def model_api(request):
    response = requests.post(f'http://{HOST}/api/v1/model', json=request)
    return response.json()


# print some common settings
def print_basic_model_info(response):
    basic_settings = ['truncation_length', 'instruction_template']
    print("Model: ", response['result']['model_name'])
    print("Lora(s): ", response['result']['lora_names'])
    for setting in basic_settings:
        print(setting, "=", response['result']['shared.settings'][setting])


# model info
def model_info():
    response = model_api({'action': 'info'})
    print_basic_model_info(response)


# simple loader
def model_load(model_name):
    return model_api({'action': 'load', 'model_name': model_name})


# complex loader
def complex_model_load(model):

    def guess_groupsize(model_name):
        if '1024g' in model_name:
            return 1024
        elif '128g' in model_name:
            return 128
        elif '32g' in model_name:
            return 32
        else:
            return -1

    req = {
        'action': 'load',
        'model_name': model,
        'args': {
            'loader': 'AutoGPTQ',

            'bf16': False,
            'load_in_8bit': False,
            'groupsize': 0,
            'wbits': 0,

            # llama.cpp
            'threads': 0,
            'n_batch': 512,
            'no_mmap': False,
            'mlock': False,
            'cache_capacity': None,
            'n_gpu_layers': 0,
            'n_ctx': 2048,

            # RWKV
            'rwkv_strategy': None,
            'rwkv_cuda_on': False,

            # b&b 4-bit
            # 'load_in_4bit': False,
            # 'compute_dtype': 'float16',
            # 'quant_type': 'nf4',
            # 'use_double_quant': False,

            # "cpu": false,
            # "auto_devices": false,
            # "gpu_memory": null,
            # "cpu_memory": null,
            # "disk": false,
            # "disk_cache_dir": "cache",
        },
    }

    model = model.lower()

    if '4bit' in model or 'gptq' in model or 'int4' in model:
        req['args']['wbits'] = 4
        req['args']['groupsize'] = guess_groupsize(model)
    elif '3bit' in model:
        req['args']['wbits'] = 3
        req['args']['groupsize'] = guess_groupsize(model)
    else:
        req['args']['gptq_for_llama'] = False

    if '8bit' in model:
        req['args']['load_in_8bit'] = True
    elif '-hf' in model or 'fp16' in model:
        if '7b' in model:
            req['args']['bf16'] = True  # for 24GB
        elif '13b' in model:
            req['args']['load_in_8bit'] = True  # for 24GB
    elif 'ggml' in model:
        # req['args']['threads'] = 16
        if '7b' in model:
            req['args']['n_gpu_layers'] = 100
        elif '13b' in model:
            req['args']['n_gpu_layers'] = 100
        elif '30b' in model or '33b' in model:
            req['args']['n_gpu_layers'] = 59  # 24GB
        elif '65b' in model:
            req['args']['n_gpu_layers'] = 42  # 24GB
    elif 'rwkv' in model:
        req['args']['rwkv_cuda_on'] = True
        if '14b' in model:
            req['args']['rwkv_strategy'] = 'cuda f16i8'  # 24GB
        else:
            req['args']['rwkv_strategy'] = 'cuda f16'  # 24GB

    return model_api(req)


if __name__ == '__main__':
    for model in model_api({'action': 'list'})['result']:
        try:
            resp = complex_model_load(model)

            if 'error' in resp:
                print(f"❌ {model} FAIL Error: {resp['error']['message']}")
                continue
            else:
                print_basic_model_info(resp)

            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)

            if '21' in ans:
                print(f"✅ {model} PASS ({ans})")
            else:
                print(f"❌ {model} FAIL ({ans})")

        except Exception as e:
            print(f"❌ {model} FAIL Exception: {repr(e)}")


# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.
# Some results below.
""" $ ./model-api-example.py
Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
Lora(s): []
truncation_length = 2048
instruction_template = Alpaca
✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
Model: 4bit_WizardLM-13B-Uncensored-4bit-128g
Lora(s): []
truncation_length = 2048
instruction_template = WizardLM
✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
Model: Aeala_VicUnlocked-alpaca-30b-4bit
Lora(s): []
truncation_length = 2048
instruction_template = Alpaca
✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
Model: alpaca-30b-4bit
Lora(s): []
truncation_length = 2048
instruction_template = Alpaca
✅ alpaca-30b-4bit PASS (21)
"""
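To load one known model rather than sweeping the whole list, the simple loader above is enough. A minimal sketch using only the helpers defined in this file; the model name is a hypothetical placeholder for a directory in the server's models folder:

# Hypothetical one-off load; 'llama-13b-4bit-128g' is a placeholder name.
resp = model_load('llama-13b-4bit-128g')
if 'error' in resp:
    print('Load failed:', resp['error']['message'])
else:
    print_basic_model_info(resp)
    print(generate('Hello! My favorite hobby is', tokens=20))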
api-examples/api-example-stream.py
ADDED
@@ -0,0 +1,80 @@
import asyncio
import json
import sys

try:
    import websockets
except ImportError:
    print("Websockets package not found. Make sure it's installed.")

# For local streaming, the websockets are hosted without ssl - ws://
HOST = 'localhost:5005'
URI = f'ws://{HOST}/api/v1/stream'

# For reverse-proxied streaming, the remote will likely host with ssl - wss://
# URI = 'wss://your-uri-here.trycloudflare.com/api/v1/stream'


async def run(context):
    # Note: the selected defaults change from time to time.
    request = {
        'prompt': context,
        'max_new_tokens': 250,

        # Generation params. If 'preset' is set to different than 'None', the values
        # in presets/preset-name.yaml are used instead of the individual numbers.
        'preset': 'None',
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.1,
        'typical_p': 1,
        'epsilon_cutoff': 0,  # In units of 1e-4
        'eta_cutoff': 0,  # In units of 1e-4
        'tfs': 1,
        'top_a': 0,
        'repetition_penalty': 1.18,
        'repetition_penalty_range': 0,
        'top_k': 40,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'mirostat_mode': 0,
        'mirostat_tau': 5,
        'mirostat_eta': 0.1,

        'seed': -1,
        'add_bos_token': True,
        'truncation_length': 2048,
        'ban_eos_token': False,
        'skip_special_tokens': True,
        'stopping_strings': []
    }

    async with websockets.connect(URI, ping_interval=None) as websocket:
        await websocket.send(json.dumps(request))

        yield context  # Remove this if you just want to see the reply

        while True:
            incoming_data = await websocket.recv()
            incoming_data = json.loads(incoming_data)

            match incoming_data['event']:
                case 'text_stream':
                    yield incoming_data['text']
                case 'stream_end':
                    return


async def print_response_stream(prompt):
    async for response in run(prompt):
        print(response, end='')
        sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.


if __name__ == '__main__':
    prompt = "In order to make homemade bread, follow these steps:\n1)"
    asyncio.run(print_response_stream(prompt))
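The run generator can also be consumed programmatically instead of printed. A minimal sketch that collects the streamed fragments into one string (assuming the 'yield context' line above is removed, otherwise the prompt is included in the result):

# Sketch: accumulate the streamed text instead of printing it.
async def collect_response(prompt):
    chunks = []
    async for chunk in run(prompt):
        chunks.append(chunk)
    return ''.join(chunks)

# reply = asyncio.run(collect_response(prompt))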
api-examples/api-example.py
ADDED
@@ -0,0 +1,57 @@
import requests

# For local usage, the API is hosted without ssl - http://
HOST = 'localhost:5000'
URI = f'http://{HOST}/api/v1/generate'

# For reverse-proxied setups, the remote will likely host with ssl - https://
# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'


def run(prompt):
    request = {
        'prompt': prompt,
        'max_new_tokens': 250,

        # Generation params. If 'preset' is set to different than 'None', the values
        # in presets/preset-name.yaml are used instead of the individual numbers.
        'preset': 'None',
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.1,
        'typical_p': 1,
        'epsilon_cutoff': 0,  # In units of 1e-4
        'eta_cutoff': 0,  # In units of 1e-4
        'tfs': 1,
        'top_a': 0,
        'repetition_penalty': 1.18,
        'repetition_penalty_range': 0,
        'top_k': 40,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'mirostat_mode': 0,
        'mirostat_tau': 5,
        'mirostat_eta': 0.1,

        'seed': -1,
        'add_bos_token': True,
        'truncation_length': 2048,
        'ban_eos_token': False,
        'skip_special_tokens': True,
        'stopping_strings': []
    }

    response = requests.post(URI, json=request)

    if response.status_code == 200:
        result = response.json()['results'][0]['text']
        print(prompt + result)


if __name__ == '__main__':
    prompt = "In order to make homemade bread, follow these steps:\n1)"
    run(prompt)
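Because this is a plain blocking HTTP call, failures only surface as non-200 status codes, which run silently ignores. A minimal hardened variant as a sketch (the 120-second timeout is an arbitrary assumption):

# Sketch: same request, shortened to the two required fields, with a
# timeout and explicit error handling.
def run_checked(prompt):
    request = {'prompt': prompt, 'max_new_tokens': 250}
    response = requests.post(URI, json=request, timeout=120)
    response.raise_for_status()  # raise on 4xx/5xx instead of failing silently
    return response.json()['results'][0]['text']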