Spaces:

BridgeEight
/

internlm-20B-chat-w4-turbomind

Runtime error

File size: 5,005 Bytes

5dedc73
 
38fe132
167b9ae
 
38fe132
5dedc73
 
 
 
 
38fe132
16a3b53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b031b71
16a3b53
 
 
b031b71
16a3b53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38fe132
 
5dedc73
38fe132
 
 
 
 
 
5dedc73
38fe132
 
 
 
5dedc73
 
 
 
 
167b9ae
5dedc73
 
167b9ae
5dedc73
 
167b9ae
5dedc73
 
 
 
 
 
 
38fe132
 
 
 
 
5dedc73
 
 
16a3b53
5dedc73
 
 
 
16a3b53
38fe132
 
5dedc73

from lmdeploy.serve.gradio.turbomind_coupled import *
from lmdeploy.messages import TurbomindEngineConfig

backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)#, model_format='awq')
model_path = 'internlm/internlm2-math-7b'

InterFace.async_engine = AsyncEngine(
    model_path=model_path,
    backend='turbomind',
    backend_config=backend_config,
    tp=1)

async def reset_local_func(instruction_txtbox: gr.Textbox,
                           state_chatbot: Sequence, session_id: int):
    """reset the session.

    Args:
        instruction_txtbox (str): user's prompt
        state_chatbot (Sequence): the chatting history
        session_id (int): the session id
    """
    state_chatbot = []
    # end the session
    with InterFace.lock:
        InterFace.global_session_id += 1
        session_id = InterFace.global_session_id
    return (state_chatbot, state_chatbot, gr.Textbox.update(value=''), session_id)

async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button,
                            reset_btn: gr.Button, session_id: int):
    """stop the session.

    Args:
        instruction_txtbox (str): user's prompt
        state_chatbot (Sequence): the chatting history
        cancel_btn (gr.Button): the cancel button
        reset_btn (gr.Button): the reset button
        session_id (int): the session id
    """
    yield (state_chatbot, disable_btn, disable_btn, session_id)
    InterFace.async_engine.stop_session(session_id)
    # pytorch backend does not support resume chat history now
    if InterFace.async_engine.backend == 'pytorch':
        yield (state_chatbot, disable_btn, enable_btn, session_id)
    else:
        with InterFace.lock:
            InterFace.global_session_id += 1
            session_id = InterFace.global_session_id
        messages = []
        for qa in state_chatbot:
            messages.append(dict(role='user', content=qa[0]))
            if qa[1] is not None:
                messages.append(dict(role='assistant', content=qa[1]))
        gen_config = GenerationConfig(max_new_tokens=0)
        async for out in InterFace.async_engine.generate(messages,
                                                         session_id,
                                                         gen_config=gen_config,
                                                         stream_response=True,
                                                         sequence_start=True,
                                                         sequence_end=False):
            pass
        yield (state_chatbot, disable_btn, enable_btn, session_id)

with gr.Blocks(css=CSS, theme=THEME) as demo:
    state_chatbot = gr.State([])
    state_session_id = gr.State(0)

    with gr.Column(elem_id='container'):
        gr.Markdown('## LMDeploy Playground')

        chatbot = gr.Chatbot(
            elem_id='chatbot',
            label=InterFace.async_engine.engine.model_name)
        instruction_txtbox = gr.Textbox(
            placeholder='Please input the instruction',
            label='Instruction')
        with gr.Row():
            cancel_btn = gr.Button(value='Cancel', interactive=False)
            reset_btn = gr.Button(value='Reset')
        with gr.Row():
            request_output_len = gr.Slider(1,
                                            2048,
                                            value=1024,
                                            step=1,
                                            label='Maximum new tokens')
            top_p = gr.Slider(0.01, 1, value=1.0, step=0.01, label='Top_p')
            temperature = gr.Slider(0.01,
                                    1.5,
                                    value=0.01,
                                    step=0.01,
                                    label='Temperature')

    send_event = instruction_txtbox.submit(chat_stream_local, [
        instruction_txtbox, state_chatbot, cancel_btn, reset_btn,
        state_session_id, top_p, temperature, request_output_len
    ], [state_chatbot, chatbot, cancel_btn, reset_btn])
    instruction_txtbox.submit(
        lambda: gr.Textbox.update(value=''),
        [],
        [instruction_txtbox],
    )
    cancel_btn.click(
        cancel_local_func,
        [state_chatbot, cancel_btn, reset_btn, state_session_id],
        [state_chatbot, cancel_btn, reset_btn, state_session_id],
        cancels=[send_event])

    reset_btn.click(reset_local_func,
                    [instruction_txtbox, state_chatbot, state_session_id],
                    [state_chatbot, chatbot, instruction_txtbox, state_session_id],
                    cancels=[send_event])

    def init():
        with InterFace.lock:
            InterFace.global_session_id += 1
        new_session_id = InterFace.global_session_id
        return new_session_id

    demo.load(init, inputs=None, outputs=[state_session_id])

demo.queue(concurrency_count=InterFace.async_engine.instance_num,
            max_size=100).launch()