Spaces:
Running
on
L4
Running
on
L4
tricktreat
committed on
Commit
·
c5ea4b9
1
Parent(s):
56994ad
update gpt4
Browse files- app.py +1 -1
- awesome_chat.py +10 -5
- config.gradio.yaml +28 -3
- demos/demo_parse_task.json +308 -16
- get_token_ids.py +1 -1
app.py
CHANGED
@@ -60,7 +60,7 @@ class Client:
|
|
60 |
|
61 |
def add_text(self, messages, message):
|
62 |
if not self.OPENAI_KEY or not self.OPENAI_KEY.startswith("sk-") or not self.HUGGINGFACE_TOKEN or not self.HUGGINGFACE_TOKEN.startswith("hf_"):
|
63 |
-
return messages, "Please set your OpenAI API key and Hugging Face token first
|
64 |
self.add_message(message, "user")
|
65 |
messages = messages + [(message, None)]
|
66 |
urls, image_urls, audio_urls, video_urls = self.extract_medias(message)
|
|
|
60 |
|
61 |
def add_text(self, messages, message):
|
62 |
if not self.OPENAI_KEY or not self.OPENAI_KEY.startswith("sk-") or not self.HUGGINGFACE_TOKEN or not self.HUGGINGFACE_TOKEN.startswith("hf_"):
|
63 |
+
return messages, "Please set your OpenAI API key and Hugging Face token first!"
|
64 |
self.add_message(message, "user")
|
65 |
messages = messages + [(message, None)]
|
66 |
urls, image_urls, audio_urls, video_urls = self.extract_medias(message)
|
awesome_chat.py
CHANGED
@@ -49,13 +49,15 @@ if LOG_HF_TOKEN:
|
|
49 |
)
|
50 |
|
51 |
logger = logging.getLogger(__name__)
|
52 |
-
logger.setLevel(logging.
|
|
|
|
|
53 |
|
54 |
handler = logging.StreamHandler()
|
55 |
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
56 |
handler.setFormatter(formatter)
|
57 |
-
if
|
58 |
-
handler.setLevel(logging.
|
59 |
logger.addHandler(handler)
|
60 |
|
61 |
log_file = config["log_file"]
|
@@ -175,7 +177,7 @@ def replace_slot(text, entries):
|
|
175 |
for key, value in entries.items():
|
176 |
if not isinstance(value, str):
|
177 |
value = str(value)
|
178 |
-
text = text.replace("{{" + key +"}}", value.replace('"', "'").replace('\n', ""))
|
179 |
return text
|
180 |
|
181 |
def find_json(s):
|
@@ -275,6 +277,9 @@ def chitchat(messages, openaikey=None):
|
|
275 |
def parse_task(context, input, openaikey=None):
|
276 |
demos_or_presteps = parse_task_demos_or_presteps
|
277 |
messages = json.loads(demos_or_presteps)
|
|
|
|
|
|
|
278 |
messages.insert(0, {"role": "system", "content": parse_task_tprompt})
|
279 |
|
280 |
# cut chat logs
|
@@ -337,7 +342,7 @@ def response_results(input, results, openaikey=None):
|
|
337 |
"input": input,
|
338 |
"processes": results
|
339 |
})
|
340 |
-
messages = json.loads(demos_or_presteps)
|
341 |
messages.insert(0, {"role": "system", "content": response_results_tprompt})
|
342 |
messages.append({"role": "user", "content": prompt})
|
343 |
logger.debug(messages)
|
|
|
49 |
)
|
50 |
|
51 |
logger = logging.getLogger(__name__)
|
52 |
+
logger.setLevel(logging.INFO)
|
53 |
+
logger.handlers = []
|
54 |
+
logger.propagate = False
|
55 |
|
56 |
handler = logging.StreamHandler()
|
57 |
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
58 |
handler.setFormatter(formatter)
|
59 |
+
if config["debug"]:
|
60 |
+
handler.setLevel(logging.DEBUG)
|
61 |
logger.addHandler(handler)
|
62 |
|
63 |
log_file = config["log_file"]
|
|
|
177 |
for key, value in entries.items():
|
178 |
if not isinstance(value, str):
|
179 |
value = str(value)
|
180 |
+
text = text.replace("{{" + key +"}}", value.replace('"', "'").replace('\n', "").replace('\\', '\\\\'))
|
181 |
return text
|
182 |
|
183 |
def find_json(s):
|
|
|
277 |
def parse_task(context, input, openaikey=None):
|
278 |
demos_or_presteps = parse_task_demos_or_presteps
|
279 |
messages = json.loads(demos_or_presteps)
|
280 |
+
for message in messages:
|
281 |
+
if not isinstance(message["content"], str):
|
282 |
+
message["content"] = json.dumps(message["content"], ensure_ascii=False)
|
283 |
messages.insert(0, {"role": "system", "content": parse_task_tprompt})
|
284 |
|
285 |
# cut chat logs
|
|
|
342 |
"input": input,
|
343 |
"processes": results
|
344 |
})
|
345 |
+
messages = json.loads(demos_or_presteps, strict=False)
|
346 |
messages.insert(0, {"role": "system", "content": response_results_tprompt})
|
347 |
messages.append({"role": "user", "content": prompt})
|
348 |
logger.debug(messages)
|
config.gradio.yaml
CHANGED
@@ -16,8 +16,32 @@ logit_bias:
|
|
16 |
parse_task: 0.5
|
17 |
choose_model: 5
|
18 |
tprompt:
|
19 |
-
parse_task:
|
20 |
-
#1 Task Planning Stage:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
choose_model: >-
|
22 |
#2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability.
|
23 |
response_results: >-
|
@@ -27,7 +51,8 @@ demos_or_presteps:
|
|
27 |
choose_model: demos/demo_choose_model.json
|
28 |
response_results: demos/demo_response_results.json
|
29 |
prompt:
|
30 |
-
parse_task:
|
|
|
31 |
choose_model: >-
|
32 |
Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}.
|
33 |
response_results: >-
|
|
|
16 |
parse_task: 0.5
|
17 |
choose_model: 5
|
18 |
tprompt:
|
19 |
+
parse_task: |-
|
20 |
+
#1 Task Planning Stage:
|
21 |
+
# Objective:
|
22 |
+
Parse the user input into a set of sub tasks.
|
23 |
+
# Task Structure:
|
24 |
+
{
|
25 |
+
"task": task_name,
|
26 |
+
"id": task_id,
|
27 |
+
"dep": [dependency_task_ids],
|
28 |
+
"args": {
|
29 |
+
"text": text OR <GENERATED>-dep_id,
|
30 |
+
"image": image_url OR <GENERATED>-dep_id,
|
31 |
+
"audio": audio_url OR <GENERATED>-dep_id
|
32 |
+
}
|
33 |
+
}
|
34 |
+
# Key Points:
|
35 |
+
Key Points:
|
36 |
+
1. GENERATED-dep_id Tag: This refers to a resource (text, image, audio) generated by a dependent task. Ensure the dependency task can produce that type of resource.
|
37 |
+
2. dep Field: Lists the IDs of prerequisite tasks. These tasks generate resources required by the current task.
|
38 |
+
3. args Field: Contains parameters for the task. Only "text", "image", and "audio" are accepted.
|
39 |
+
4. Task Options: The task must be one of the following:
|
40 |
+
"token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". Note: You can have multiple tasks of the same type.
|
41 |
+
5. Efficiency: Aim to parse the fewest tasks needed to fulfill the user's request.
|
42 |
+
6. Ordering: Ensure that tasks are in the correct sequence based on their dependencies.
|
43 |
+
# Output:
|
44 |
+
If the user input can be parsed into tasks, return the tasks as JSON. If not, return an empty JSON array [].
|
45 |
choose_model: >-
|
46 |
#2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability.
|
47 |
response_results: >-
|
|
|
51 |
choose_model: demos/demo_choose_model.json
|
52 |
response_results: demos/demo_response_results.json
|
53 |
prompt:
|
54 |
+
parse_task: >+
|
55 |
+
The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Please parse the input into tasks, output in a strict JSON object:
|
56 |
choose_model: >-
|
57 |
Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}.
|
58 |
response_results: >-
|
demos/demo_parse_task.json
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
[
|
2 |
{
|
3 |
"role": "user",
|
@@ -5,70 +6,361 @@
|
|
5 |
},
|
6 |
{
|
7 |
"role": "assistant",
|
8 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
},
|
10 |
|
|
|
11 |
{
|
12 |
-
"role":"user",
|
13 |
-
"content":"Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one."
|
14 |
},
|
15 |
{
|
16 |
-
"role":"assistant",
|
17 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
},
|
19 |
|
20 |
{
|
21 |
-
"role":"user",
|
22 |
-
"content":"given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice"
|
23 |
},
|
24 |
{
|
25 |
-
"role":"assistant",
|
26 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
},
|
28 |
|
|
|
29 |
{
|
30 |
"role": "user",
|
31 |
-
"content": "Given an image /example.jpg, first generate a
|
32 |
},
|
33 |
{
|
34 |
"role": "assistant",
|
35 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
},
|
37 |
|
|
|
38 |
{
|
39 |
"role": "user",
|
40 |
"content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it"
|
41 |
},
|
42 |
{
|
43 |
"role": "assistant",
|
44 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
},
|
46 |
|
47 |
-
|
48 |
{
|
49 |
"role": "user",
|
50 |
"content": "please show me a joke and an image of cat"
|
51 |
},
|
52 |
{
|
53 |
"role": "assistant",
|
54 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
},
|
56 |
|
|
|
57 |
{
|
58 |
"role": "user",
|
59 |
"content": "give me a picture about a cut dog, then describe the image to me and tell a story about it"
|
60 |
},
|
61 |
{
|
62 |
"role": "assistant",
|
63 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
},
|
65 |
|
|
|
66 |
{
|
67 |
"role": "user",
|
68 |
"content": "give you a picture /example.jpg, what's in it and tell me a joke about it"
|
69 |
},
|
70 |
{
|
71 |
"role": "assistant",
|
72 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
}
|
74 |
]
|
|
|
1 |
+
|
2 |
[
|
3 |
{
|
4 |
"role": "user",
|
|
|
6 |
},
|
7 |
{
|
8 |
"role": "assistant",
|
9 |
+
"content": [
|
10 |
+
{
|
11 |
+
"task": "image-to-text",
|
12 |
+
"id": 0,
|
13 |
+
"dep": [-1],
|
14 |
+
"args": {
|
15 |
+
"image": "e1.jpg"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"task": "object-detection",
|
20 |
+
"id": 1,
|
21 |
+
"dep": [-1],
|
22 |
+
"args": {
|
23 |
+
"image": "e1.jpg"
|
24 |
+
}
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"task": "visual-question-answering",
|
28 |
+
"id": 2,
|
29 |
+
"dep": [1],
|
30 |
+
"args": {
|
31 |
+
"image": "<GENERATED>-1",
|
32 |
+
"text": "How many sheep in the picture"
|
33 |
+
}
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"task": "image-to-text",
|
37 |
+
"id": 3,
|
38 |
+
"dep": [-1],
|
39 |
+
"args": {
|
40 |
+
"image": "e2.png"
|
41 |
+
}
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"task": "object-detection",
|
45 |
+
"id": 4,
|
46 |
+
"dep": [-1],
|
47 |
+
"args": {
|
48 |
+
"image": "e2.png"
|
49 |
+
}
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"task": "visual-question-answering",
|
53 |
+
"id": 5,
|
54 |
+
"dep": [4],
|
55 |
+
"args": {
|
56 |
+
"image": "<GENERATED>-4",
|
57 |
+
"text": "How many sheep in the picture"
|
58 |
+
}
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"task": "image-to-text",
|
62 |
+
"id": 6,
|
63 |
+
"dep": [-1],
|
64 |
+
"args": {
|
65 |
+
"image": "e3.jpg"
|
66 |
+
}
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"task": "object-detection",
|
70 |
+
"id": 7,
|
71 |
+
"dep": [-1],
|
72 |
+
"args": {
|
73 |
+
"image": "e3.jpg"
|
74 |
+
}
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"task": "visual-question-answering",
|
78 |
+
"id": 8,
|
79 |
+
"dep": [7],
|
80 |
+
"args": {
|
81 |
+
"image": "<GENERATED>-7",
|
82 |
+
"text": "How many sheep in the picture"
|
83 |
+
}
|
84 |
+
}
|
85 |
+
]
|
86 |
},
|
87 |
|
88 |
+
|
89 |
{
|
90 |
+
"role": "user",
|
91 |
+
"content": "Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one."
|
92 |
},
|
93 |
{
|
94 |
+
"role": "assistant",
|
95 |
+
"content": [
|
96 |
+
{
|
97 |
+
"task": "image-to-text",
|
98 |
+
"id": 0,
|
99 |
+
"dep": [-1],
|
100 |
+
"args": {
|
101 |
+
"image": "/e.jpg"
|
102 |
+
}
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"task": "object-detection",
|
106 |
+
"id": 1,
|
107 |
+
"dep": [-1],
|
108 |
+
"args": {
|
109 |
+
"image": "/e.jpg"
|
110 |
+
}
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"task": "visual-question-answering",
|
114 |
+
"id": 2,
|
115 |
+
"dep": [1],
|
116 |
+
"args": {
|
117 |
+
"image": "<GENERATED>-1",
|
118 |
+
"text": "how many objects in the picture?"
|
119 |
+
}
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"task": "text-to-image",
|
123 |
+
"id": 3,
|
124 |
+
"dep": [0],
|
125 |
+
"args": {
|
126 |
+
"text": "<GENERATED-0>"
|
127 |
+
}
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"task": "image-to-image",
|
131 |
+
"id": 4,
|
132 |
+
"dep": [-1],
|
133 |
+
"args": {
|
134 |
+
"image": "/e.jpg"
|
135 |
+
}
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"task": "text-to-video",
|
139 |
+
"id": 5,
|
140 |
+
"dep": [0],
|
141 |
+
"args": {
|
142 |
+
"text": "<GENERATED-0>"
|
143 |
+
}
|
144 |
+
}
|
145 |
+
]
|
146 |
},
|
147 |
|
148 |
{
|
149 |
+
"role": "user",
|
150 |
+
"content": "given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice"
|
151 |
},
|
152 |
{
|
153 |
+
"role": "assistant",
|
154 |
+
"content": [
|
155 |
+
{
|
156 |
+
"task": "document-question-answering",
|
157 |
+
"id": 0,
|
158 |
+
"dep": [-1],
|
159 |
+
"args": {
|
160 |
+
"image": "/images/e.jpeg",
|
161 |
+
"text": "what is the student amount?"
|
162 |
+
}
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"task": "visual-question-answering",
|
166 |
+
"id": 1,
|
167 |
+
"dep": [-1],
|
168 |
+
"args": {
|
169 |
+
"image": "/images/e.jpeg",
|
170 |
+
"text": "what is the student amount?"
|
171 |
+
}
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"task": "image-to-text",
|
175 |
+
"id": 2,
|
176 |
+
"dep": [-1],
|
177 |
+
"args": {
|
178 |
+
"image": "/images/e.jpeg"
|
179 |
+
}
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"task": "text-to-speech",
|
183 |
+
"id": 3,
|
184 |
+
"dep": [2],
|
185 |
+
"args": {
|
186 |
+
"text": "<GENERATED>-2"
|
187 |
+
}
|
188 |
+
}
|
189 |
+
]
|
190 |
},
|
191 |
|
192 |
+
|
193 |
{
|
194 |
"role": "user",
|
195 |
+
"content": "Given an image /example.jpg, first generate a pose image, then based on the pose generate a new image where a girl is reading a book"
|
196 |
},
|
197 |
{
|
198 |
"role": "assistant",
|
199 |
+
"content": [
|
200 |
+
{
|
201 |
+
"task": "openpose-control",
|
202 |
+
"id": 0,
|
203 |
+
"dep": [-1],
|
204 |
+
"args": {
|
205 |
+
"image": "/example.jpg"
|
206 |
+
}
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"task": "openpose-text-to-image",
|
210 |
+
"id": 1,
|
211 |
+
"dep": [0],
|
212 |
+
"args": {
|
213 |
+
"text": "a girl is reading a book",
|
214 |
+
"image": "<GENERATED>-0"
|
215 |
+
}
|
216 |
+
}
|
217 |
+
]
|
218 |
},
|
219 |
|
220 |
+
|
221 |
{
|
222 |
"role": "user",
|
223 |
"content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it"
|
224 |
},
|
225 |
{
|
226 |
"role": "assistant",
|
227 |
+
"content": [
|
228 |
+
{
|
229 |
+
"task": "text-to-video",
|
230 |
+
"id": 0,
|
231 |
+
"dep": [-1],
|
232 |
+
"args": {
|
233 |
+
"text": "a boy is running"
|
234 |
+
}
|
235 |
+
},
|
236 |
+
{
|
237 |
+
"task": "text-to-speech",
|
238 |
+
"id": 1,
|
239 |
+
"dep": [-1],
|
240 |
+
"args": {
|
241 |
+
"text": "a boy is running"
|
242 |
+
}
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"task": "text-to-image",
|
246 |
+
"id": 2,
|
247 |
+
"dep": [-1],
|
248 |
+
"args": {
|
249 |
+
"text": "a boy is running"
|
250 |
+
}
|
251 |
+
}
|
252 |
+
]
|
253 |
},
|
254 |
|
|
|
255 |
{
|
256 |
"role": "user",
|
257 |
"content": "please show me a joke and an image of cat"
|
258 |
},
|
259 |
{
|
260 |
"role": "assistant",
|
261 |
+
"content": [
|
262 |
+
{
|
263 |
+
"task": "conversational",
|
264 |
+
"id": 0,
|
265 |
+
"dep": [-1],
|
266 |
+
"args": {
|
267 |
+
"text": "please show me a joke of cat"
|
268 |
+
}
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"task": "text-to-image",
|
272 |
+
"id": 1,
|
273 |
+
"dep": [-1],
|
274 |
+
"args": {
|
275 |
+
"text": "a photo of cat"
|
276 |
+
}
|
277 |
+
}
|
278 |
+
]
|
279 |
},
|
280 |
|
281 |
+
|
282 |
{
|
283 |
"role": "user",
|
284 |
"content": "give me a picture about a cut dog, then describe the image to me and tell a story about it"
|
285 |
},
|
286 |
{
|
287 |
"role": "assistant",
|
288 |
+
"content": [
|
289 |
+
{
|
290 |
+
"task": "text-to-image",
|
291 |
+
"id": 0,
|
292 |
+
"dep": [-1],
|
293 |
+
"args": {
|
294 |
+
"text": "a picture of a cute dog"
|
295 |
+
}
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"task": "image-to-text",
|
299 |
+
"id": 1,
|
300 |
+
"dep": [0],
|
301 |
+
"args": {
|
302 |
+
"image": "<GENERATED>-0"
|
303 |
+
}
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"task": "text-generation",
|
307 |
+
"id": 2,
|
308 |
+
"dep": [1],
|
309 |
+
"args": {
|
310 |
+
"text": "<GENERATED>-1"
|
311 |
+
}
|
312 |
+
},
|
313 |
+
{
|
314 |
+
"task": "text-to-speech",
|
315 |
+
"id": 3,
|
316 |
+
"dep": [2],
|
317 |
+
"args": {
|
318 |
+
"text": "<GENERATED>-2"
|
319 |
+
}
|
320 |
+
}
|
321 |
+
]
|
322 |
},
|
323 |
|
324 |
+
|
325 |
{
|
326 |
"role": "user",
|
327 |
"content": "give you a picture /example.jpg, what's in it and tell me a joke about it"
|
328 |
},
|
329 |
{
|
330 |
"role": "assistant",
|
331 |
+
"content": [
|
332 |
+
{
|
333 |
+
"task": "image-to-text",
|
334 |
+
"id": 0,
|
335 |
+
"dep": [-1],
|
336 |
+
"args": {
|
337 |
+
"image": "/example.jpg"
|
338 |
+
}
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"task": "object-detection",
|
342 |
+
"id": 1,
|
343 |
+
"dep": [-1],
|
344 |
+
"args": {
|
345 |
+
"image": "/example.jpg"
|
346 |
+
}
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"task": "conversational",
|
350 |
+
"id": 2,
|
351 |
+
"dep": [0],
|
352 |
+
"args": {
|
353 |
+
"text": "<GENERATED>-0"
|
354 |
+
}
|
355 |
+
},
|
356 |
+
{
|
357 |
+
"task": "text-to-speech",
|
358 |
+
"id": 3,
|
359 |
+
"dep": [2],
|
360 |
+
"args": {
|
361 |
+
"text": "<GENERATED>-1"
|
362 |
+
}
|
363 |
+
}
|
364 |
+
]
|
365 |
}
|
366 |
]
|
get_token_ids.py
CHANGED
@@ -17,7 +17,7 @@ encodings = {
|
|
17 |
}
|
18 |
|
19 |
max_length = {
|
20 |
-
"gpt-4":
|
21 |
"gpt-3.5-turbo": 4096,
|
22 |
"gpt-3.5-turbo-0301": 4096,
|
23 |
"text-davinci-003": 4096,
|
|
|
17 |
}
|
18 |
|
19 |
max_length = {
|
20 |
+
"gpt-4": 8192,
|
21 |
"gpt-3.5-turbo": 4096,
|
22 |
"gpt-3.5-turbo-0301": 4096,
|
23 |
"text-davinci-003": 4096,
|