lukeingawesome committed on
Commit
af8b7ea
·
verified ·
1 Parent(s): 1cc6474

Upload LLM2Vec4CXR fine-tuned model

Browse files
Files changed (2) hide show
  1. README.md +5 -0
  2. usage_example.py +4 -0
README.md CHANGED
@@ -82,6 +82,9 @@ tokenizer.padding_side = 'left'
82
  # Example usage for chest X-ray report analysis
83
  def encode_text(text):
84
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
 
 
 
85
  with torch.no_grad():
86
  embeddings = model(inputs)
87
  return embeddings
@@ -91,6 +94,8 @@ report = "There is a small increase in the left-sided effusion. There continues
91
  embedding = encode_text(report)
92
  ```
93
 
 
 
94
  ### Advanced Usage with Separator-based Processing
95
 
96
  The model supports special separator-based processing for instruction-following tasks:
 
82
  # Example usage for chest X-ray report analysis
83
  def encode_text(text):
84
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
85
+ # IMPORTANT: Add embed_mask for proper model functioning
86
+ # For simple text encoding, embed_mask is the same as attention_mask
87
+ inputs["embed_mask"] = inputs["attention_mask"].clone()
88
  with torch.no_grad():
89
  embeddings = model(inputs)
90
  return embeddings
 
94
  embedding = encode_text(report)
95
  ```
96
 
97
+ **Note**: The model requires an `embed_mask` input. For simple text encoding, set `embed_mask` equal to `attention_mask`. For instruction-following tasks, use the separator-based tokenization shown below.
98
+
99
  ### Advanced Usage with Separator-based Processing
100
 
101
  The model supports special separator-based processing for instruction-following tasks:
usage_example.py CHANGED
@@ -162,6 +162,8 @@ def main():
162
  report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
163
 
164
  inputs = tokenizer(report, return_tensors="pt", padding=True, truncation=True, max_length=512)
 
 
165
  inputs = inputs.to(device)
166
 
167
  with torch.no_grad():
@@ -223,6 +225,8 @@ def main():
223
 
224
  print("Computing embeddings for multiple reports...")
225
  inputs = tokenizer(reports, return_tensors="pt", padding=True, truncation=True, max_length=512)
 
 
226
  inputs = inputs.to(device)
227
 
228
  with torch.no_grad():
 
162
  report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
163
 
164
  inputs = tokenizer(report, return_tensors="pt", padding=True, truncation=True, max_length=512)
165
+ # Add embed_mask (same as attention_mask for simple text encoding)
166
+ inputs["embed_mask"] = inputs["attention_mask"].clone()
167
  inputs = inputs.to(device)
168
 
169
  with torch.no_grad():
 
225
 
226
  print("Computing embeddings for multiple reports...")
227
  inputs = tokenizer(reports, return_tensors="pt", padding=True, truncation=True, max_length=512)
228
+ # Add embed_mask (same as attention_mask for simple text encoding)
229
+ inputs["embed_mask"] = inputs["attention_mask"].clone()
230
  inputs = inputs.to(device)
231
 
232
  with torch.no_grad():