Upload LLM2Vec4CXR fine-tuned model
- README.md +5 -0
- usage_example.py +4 -0

README.md
@@ -82,6 +82,9 @@ tokenizer.padding_side = 'left'
 # Example usage for chest X-ray report analysis
 def encode_text(text):
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # IMPORTANT: Add embed_mask for proper model functioning
+    # For simple text encoding, embed_mask is the same as attention_mask
+    inputs["embed_mask"] = inputs["attention_mask"].clone()
     with torch.no_grad():
         embeddings = model(inputs)
     return embeddings
@@ -91,6 +94,8 @@ report = "There is a small increase in the left-sided effusion. There continues
 embedding = encode_text(report)
 ```
 
+**Note**: The model requires an `embed_mask` input. For simple text encoding, set `embed_mask` equal to `attention_mask`. For instruction-following tasks, use the separator-based tokenization shown below.
+
 ### Advanced Usage with Separator-based Processing
 
 The model supports special separator-based processing for instruction-following tasks:
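The hunk above only references the separator-based path. As a rough illustration of how such an `embed_mask` is typically constructed, here is a minimal sketch assuming the model follows the upstream LLM2Vec convention: instruction and report are concatenated into one sequence, and `embed_mask` is set over the report tokens only, relying on left padding. The `tokenize_with_embed_mask` helper and the example instruction are illustrative, not part of this repo:

```python
import torch

# Hypothetical helper following the upstream LLM2Vec convention; the exact
# separator handling in LLM2Vec4CXR may differ, so check the model card.
def tokenize_with_embed_mask(tokenizer, instruction, text, max_length=512):
    # The model sees instruction + text as a single sequence.
    full = tokenizer(instruction + text, return_tensors="pt",
                     padding=True, truncation=True, max_length=max_length)
    # Tokenize the report alone (no special tokens) to count its tokens.
    text_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    # embed_mask is zero over the instruction and one over the report,
    # so pooling ignores the instruction tokens.
    embed_mask = torch.zeros_like(full["attention_mask"])
    if len(text_ids) > 0:
        # With tokenizer.padding_side = 'left', the report occupies the
        # final positions of the padded sequence.
        embed_mask[0, -len(text_ids):] = 1
    full["embed_mask"] = embed_mask
    return full

# Illustrative instruction; the report text is the example from the README.
inputs = tokenize_with_embed_mask(
    tokenizer,
    "Determine the change or status of the pleural effusion.",
    "There is a small increase in the left-sided effusion.",
)
```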

usage_example.py
@@ -162,6 +162,8 @@ def main():
     report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
 
     inputs = tokenizer(report, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # Add embed_mask (same as attention_mask for simple text encoding)
+    inputs["embed_mask"] = inputs["attention_mask"].clone()
     inputs = inputs.to(device)
 
     with torch.no_grad():
@@ -223,6 +225,8 @@ def main():
 
     print("Computing embeddings for multiple reports...")
     inputs = tokenizer(reports, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # Add embed_mask (same as attention_mask for simple text encoding)
+    inputs["embed_mask"] = inputs["attention_mask"].clone()
     inputs = inputs.to(device)
 
     with torch.no_grad():
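For context on how the batch embeddings above are typically consumed, here is a minimal sketch of pairwise report similarity. It assumes `model(inputs)` returns one pooled embedding per report with shape `(num_reports, hidden_dim)`, as the forward pass in the hunk suggests; variable names continue from that hunk:

```python
import torch
import torch.nn.functional as F

with torch.no_grad():
    embeddings = model(inputs)  # assumed shape: (num_reports, hidden_dim)

# Cosine similarity between every pair of reports, e.g. to compare a
# follow-up study against a baseline report.
normed = F.normalize(embeddings, dim=-1)
similarity = normed @ normed.T
print(similarity)
```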