Upload folder using huggingface_hub
README.md CHANGED
@@ -12,7 +12,7 @@ tags:
- Food NEL
- Food NER
---

# FoodyLLM: A FAIR-aligned specialized large language model for food and nutrition analysis

FoodyLLM is based on Meta-Llama-3-8B-Instruct, fine-tuned with LoRA for food and nutrition analysis.
@@ -24,55 +24,130 @@ More specifically, it can conduct the following tasks:

## How to use it:

### Load the model:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 4-bit NF4 quantization so the 8B base model fits in modest GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},
    attn_implementation="eager",
)

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = '<|pad|>'
tokenizer.pad_token_id = 128255

# Load the FoodyLLM LoRA adapter on top of the base model
model.load_adapter("Matej/FoodyLLM")
model.config.use_cache = True
model.eval()
```
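
Every task below repeats the same prompt-building, generation, and decoding steps. If you want to avoid that boilerplate, a small wrapper can be factored out first; this is a convenience sketch of ours (the `ask` helper is not part of the released code) built only from the calls shown in the examples below:

```python
def ask(user_prompt, system_prompt=""):
    # Build a single-turn chat prompt, generate, and decode only the new tokens.
    messages = [{"role": "user", "content": f"{system_prompt} {user_prompt}".strip()}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([prompt], return_tensors="pt", padding=True,
                       truncation=True, max_length=1024).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
    answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
    return answers[0].split('<|eot_id|>')[0].strip()
```

The task examples below keep the expanded form so each step stays visible; `ask(user_prompt)` would return the same string.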

### Return the nutrient values for an example recipe:

```python
system_prompt = ""
user_prompt = "Compute the nutrient values per 100 grams in a recipe with the following ingredients: 250 g cream, whipped, cream topping, pressurized, 250 g yogurt, greek, plain, nonfat, 50 g sugars, powdered"

messages = [
    {
        "role": "user",
        "content": f"{system_prompt} {user_prompt}".strip()
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Here we have a batch of one
tokenizer_input = [prompt]

inputs = tokenizer(tokenizer_input, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
answers = [x.split('<|eot_id|>')[0].strip() for x in answers]
print(answers[0])
```

Expected answer: `Nutrient values per 100 g highlighted: energy - 134.24, fat - 5.78, protein - 7.51, salt - 0.06, saturates - 3.58, sugars - 13.00`
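
The answer is plain text, so downstream code needs to parse it. A minimal sketch, assuming the reply keeps exactly the `name - value` format shown above (the regex and the `parse_nutrients` name are our assumptions, not part of the model card):

```python
import re

def parse_nutrients(answer):
    # Extract "name - value" pairs from the model's free-text answer.
    return {name: float(value) for name, value in re.findall(r'(\w+) - ([\d.]+)', answer)}

print(parse_nutrients(answers[0]))
# {'energy': 134.24, 'fat': 5.78, 'protein': 7.51, 'salt': 0.06, 'saturates': 3.58, 'sugars': 13.0}
```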

### Classify recipes by traffic light nutrition labels:

```python
user_prompt = "Review the fsa traffic lights per 100 grams in a recipe using the following ingredients: 1/2 cup soup, swanson chicken broth 99% fat free, 1 pinch salt, table"

messages = [
    {
        "role": "user",
        "content": f"{system_prompt} {user_prompt}".strip()
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Here we have a batch of one
tokenizer_input = [prompt]

inputs = tokenizer(tokenizer_input, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
answers = [x.split('<|eot_id|>')[0].strip() for x in answers]
print(answers[0])
```

Expected answer: `Food Standards Agency lights in each 100 g: fat - green, salt - red, saturates - green, sugars - green`
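
`tokenizer_input` is a list, so several prompts can go through one `generate` call. A batched sketch under the same setup; the switch to left padding is our addition (it is the usual choice for decoder-only generation and keeps the prompt-stripping slice valid when prompts differ in length):

```python
tokenizer.padding_side = "left"  # pad on the left so every prompt ends at the same position

user_prompts = [
    "Compute the nutrient values per 100 grams in a recipe with the following ingredients: 250 g cream, whipped, cream topping, pressurized, 250 g yogurt, greek, plain, nonfat, 50 g sugars, powdered",
    "Review the fsa traffic lights per 100 grams in a recipe using the following ingredients: 1/2 cup soup, swanson chicken broth 99% fat free, 1 pinch salt, table",
]
prompts = [
    tokenizer.apply_chat_template([{"role": "user", "content": p}], tokenize=False, add_generation_prompt=True)
    for p in user_prompts
]

inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
for answer in answers:
    print(answer.split('<|eot_id|>')[0].strip())
```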

### Extract food named entities:

```python
user_prompt = "Retrieve all food entities referenced in the text: Line a large colander with a cheesecloth. Stir salt into the yogurt, and pour the yogurt into the cheesecloth. Set the colander in the sink or bowl to catch the liquid that drains off. Leave to drain for 24 hours. After draining for the 24 hours, transfer the resulting cheese to a bowl. Stir in the olive oil. Store in a covered container in the refrigerator."

messages = [
    {
        "role": "user",
        "content": f"{system_prompt} {user_prompt}".strip()
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Here we have a batch of one
tokenizer_input = [prompt]

inputs = tokenizer(tokenizer_input, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
answers = [x.split('<|eot_id|>')[0].strip() for x in answers]
print(answers[0])
```

Expected answer: `Indeed, the entities concerning food are outlined below: salt, yogurt, liquid, cheese, olive oil.`
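
To turn the reply into a Python list, assuming it keeps the `lead-in: item, item, ...` shape shown above (our assumption):

```python
# Split off the lead-in sentence, then split the comma-separated entities.
entities = [e.strip(' .') for e in answers[0].split(':', 1)[1].split(',')]
print(entities)  # ['salt', 'yogurt', 'liquid', 'cheese', 'olive oil']
```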

### Link named entities to the SNOMEDCT ontology:

```python
user_prompt = "Link the following food entities to a SNOMEDCT ontology: cream cheese, meat"

messages = [
    {
        "role": "user",
        "content": f"{system_prompt} {user_prompt}".strip()
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Here we have a batch of one
tokenizer_input = [prompt]

inputs = tokenizer(tokenizer_input, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
answers = [x.split('<|eot_id|>')[0].strip() for x in answers]
print(answers[0])
```

Expected answer: `Indeed, the entities are connected in this fashion: cream cheese - http://purl.bioontology.org/ontology/SNOMEDCT/226849005;http://purl.bioontology.org/ontology/SNOMEDCT/255621006;http://purl.bioontology.org/ontology/SNOMEDCT/102264005, meat - http://purl.bioontology.org/ontology/SNOMEDCT/28647000.`
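
One entity can link to several SNOMEDCT concepts, separated by semicolons. A parsing sketch under the same format assumption as the earlier examples:

```python
# Turn "entity - uri;uri, entity - uri" into a dict of entity -> list of URIs.
links = {}
for pair in answers[0].split(':', 1)[1].split(','):
    entity, _, uris = pair.partition(' - ')
    links[entity.strip()] = [u.strip(' .') for u in uris.split(';')]
print(links['meat'])  # ['http://purl.bioontology.org/ontology/SNOMEDCT/28647000']
```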

apply.py CHANGED
@@ -28,8 +28,9 @@ if __name__ == '__main__':

    model.config.use_cache = True
    model.eval()

    # Return the nutrient values for an example recipe
    system_prompt = ""
    user_prompt = "Compute the nutrient values per 100 grams in a recipe with the following ingredients: 250 g cream, whipped, cream topping, pressurized, 250 g yogurt, greek, plain, nonfat, 50 g sugars, powdered"

    messages = [
        {

@@ -47,4 +48,71 @@ if __name__ == '__main__':

    generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
    answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
    answers = [x.split('<|eot_id|>')[0].strip() for x in answers]
    print(answers[0])
    # Expected answer: Nutrient values per 100 g highlighted: energy - 134.24, fat - 5.78, protein - 7.51, salt - 0.06, saturates - 3.58, sugars - 13.00

    # Classify recipes by traffic light nutrition labels
    user_prompt = "Review the fsa traffic lights per 100 grams in a recipe using the following ingredients: 1/2 cup soup, swanson chicken broth 99% fat free, 1 pinch salt, table"

    messages = [
        {
            "role": "user",
            "content": f"{system_prompt} {user_prompt}".strip()
        }
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Here we have a batch of one
    tokenizer_input = [prompt]

    inputs = tokenizer(tokenizer_input, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
    answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
    answers = [x.split('<|eot_id|>')[0].strip() for x in answers]
    print(answers[0])
    # Expected answer: Food Standards Agency lights in each 100 g: fat - green, salt - red, saturates - green, sugars - green

    # Extract food named entities
    user_prompt = "Retrieve all food entities referenced in the text: Line a large colander with a cheesecloth. Stir salt into the yogurt, and pour the yogurt into the cheesecloth. Set the colander in the sink or bowl to catch the liquid that drains off. Leave to drain for 24 hours. After draining for the 24 hours, transfer the resulting cheese to a bowl. Stir in the olive oil. Store in a covered container in the refrigerator."
    messages = [
        {
            "role": "user",
            "content": f"{system_prompt} {user_prompt}".strip()
        }
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Here we have a batch of one
    tokenizer_input = [prompt]

    inputs = tokenizer(tokenizer_input, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
    answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
    answers = [x.split('<|eot_id|>')[0].strip() for x in answers]
    print(answers[0])
    # Expected answer: Indeed, the entities concerning food are outlined below: salt, yogurt, liquid, cheese, olive oil.

    # Link named entities to the SNOMEDCT ontology
    user_prompt = "Link the following food entities to a SNOMEDCT ontology: cream cheese, meat"
    messages = [
        {
            "role": "user",
            "content": f"{system_prompt} {user_prompt}".strip()
        }
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Here we have a batch of one
    tokenizer_input = [prompt]

    inputs = tokenizer(tokenizer_input, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
    answers = tokenizer.batch_decode(generated_ids[:, inputs['input_ids'].shape[1]:])
    answers = [x.split('<|eot_id|>')[0].strip() for x in answers]
    print(answers[0])
    # Expected answer: Indeed, the entities are connected in this fashion: cream cheese - http://purl.bioontology.org/ontology/SNOMEDCT/226849005;http://purl.bioontology.org/ontology/SNOMEDCT/255621006;http://purl.bioontology.org/ontology/SNOMEDCT/102264005, meat - http://purl.bioontology.org/ontology/SNOMEDCT/28647000.
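
One caveat for both the README examples and this script: `do_sample=True` draws tokens stochastically, so the printed answers can differ from the expected ones between runs. For reproducible output, greedy decoding is one option (our suggestion, not part of the original code):

```python
# Greedy decoding: deterministic for a fixed model, adapter, and prompt.
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
```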