Simon Clematide committed on
Commit fed6436 · 1 Parent(s): 18af486

update to new model with weighted loss 0.1 for class 0

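The commit message refers to a class-weighted training loss with weight 0.1 on class 0; the training script itself is not part of this commit. Below is a minimal sketch, assuming a standard Hugging Face Trainer setup, of how such a weighting is typically applied. The class name WeightedLossTrainer and the 18-label layout (0 = "no SDG", 1-17 = the SDG classes, as suggested by the CLI code below) are assumptions, not the author's actual code. Dropping "problem_type": "single_label_classification" from config.json (first diff below) is consistent with this change: a custom compute_loss bypasses the default loss that problem_type would otherwise select.

# Hypothetical sketch (not from this commit): weighted cross-entropy
# that down-weights class 0 to 0.1, as the commit message describes.
import torch
from transformers import Trainer


class WeightedLossTrainer(Trainer):  # hypothetical name
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Weight 0.1 for class 0 ("no SDG"), 1.0 for every other class.
        weights = torch.ones(logits.size(-1), device=logits.device)
        weights[0] = 0.1
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss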
config.json CHANGED
@@ -57,7 +57,6 @@
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
-  "problem_type": "single_label_classification",
   "torch_dtype": "float32",
   "transformers_version": "4.41.2",
   "type_vocab_size": 2,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:39cb42cf548de528ab2d1505a7ba84031e250ba2d092ddd861ad109ad0acbcb3
+oid sha256:7556885b937a337a064f088a19141cd29ef6e6f2276cf53b70b1a1730b2c99d4
 size 439832632
sdg_predict/cli_predict.py CHANGED
@@ -6,31 +6,53 @@ from tqdm import tqdm
 import sys
 import torch
 from sdg_predict.inference import load_model, predict
+import logging
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True
+)
+
 
 def main():
-    parser = argparse.ArgumentParser(description="Batch inference using Hugging Face model.")
+    parser = argparse.ArgumentParser(
+        description="Batch inference using Hugging Face model."
+    )
     parser.add_argument("input", type=Path, help="Input JSONL file")
-    parser.add_argument("--key", type=str, required=True, help="JSON key with text input")
-    parser.add_argument("--batch_size", type=int, default=8, help="Batch size")
-    parser.add_argument("--model", type=str, default="simon-clmtd/sdg-scibert-zo_up", help="Model name on the Hub")
-    parser.add_argument("--top1", action="store_true", help="Return only top prediction")
-    parser.add_argument("--output", type=Path, help="Output file (optional, otherwise stdout)")
+    parser.add_argument(
+        "--key", type=str, default="text", help="JSON key with text input"
+    )
+    parser.add_argument("--batch_size", "-b", type=int, default=8, help="Batch size")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="simon-clmtd/sdg-scibert-zo_up",
+        help="Model name on the Hub",
+    )
+    parser.add_argument(
+        "--top1", action="store_true", help="Return only top prediction"
+    )
+    parser.add_argument(
+        "--output", type=Path, help="Output file (optional, otherwise stdout)"
+    )
     args = parser.parse_args()
 
     # -------------------------------
-    # 1. Device Setup (MPS support for Apple Silicon)
-    # -------------------------------
+    # 1. Device Setup (MPS support for Apple Silicon)
+    # -------------------------------
     if torch.backends.mps.is_available():
         device = torch.device("mps")
-        print("Using MPS device")
+        logging.info("Using MPS device")
     elif torch.cuda.is_available():
         device = torch.device("cuda")
-        print("Using CUDA device")
+        logging.info("Using CUDA device")
     else:
         device = torch.device("cpu")
-        print("Using CPU device")
-
+        logging.info("Using CPU device")
+    # device = torch.device("cpu")
+    logging.info("Loading model: %s", args.model)
     tokenizer, model = load_model(args.model, device)
+    logging.info("Model loaded successfully")
 
     with args.input.open() as f:
         texts = []
@@ -40,20 +62,44 @@ def main():
         if args.key not in row:
             continue
         texts.append(row[args.key])
+        logging.debug("Text: %s", row[args.key])
         rows.append(row)
 
+    logging.info("Starting predictions on %d texts", len(texts))
     predictions = predict(
        texts,
        tokenizer,
        model,
        device,
        batch_size=args.batch_size,
-        return_all_scores=not args.top1
+        return_all_scores=not args.top1,
    )
+    logging.info("Predictions completed")
 
     output_stream = args.output.open("w") if args.output else sys.stdout
     for row, pred in zip(rows, predictions):
-        row["prediction"] = pred
-        print(json.dumps(row, ensure_ascii=False), file=output_stream)
+        # Compute binary probabilities for labels 1-17
+        binary_predictions = {}
+        for label_data in pred:
+            label_data["score"] = round(
+                label_data["score"], 3
+            )  # Round prediction scores to 3 decimal places
+            label = int(label_data["label"])
+            if 1 <= label <= 17:
+                binary_prob = label_data["score"]  # Already rounded
+                binary_predictions[str(label)] = binary_prob
+
+        output_row = {
+            "id": row.get("id"),
+            "text": row.get("text"),
+            "prediction": pred,
+            "binary_predictions": binary_predictions,
+        }
+        print(json.dumps(output_row, ensure_ascii=False), file=output_stream)
     if args.output:
         output_stream.close()
+    logging.info("Output written to %s", args.output)
+
+
+if __name__ == "__main__":
+    main()
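The added __main__ guard makes the module directly runnable. A usage sketch (the file names and --key value here are invented for illustration):

python -m sdg_predict.cli_predict papers.jsonl --key abstract -b 16 --output predictions.jsonl

Each output line is a JSON object with id, text, the full prediction list, and binary_predictions, which maps the SDG labels "1" through "17" to their rounded scores; label 0, presumably the "no SDG" class given the weighting in the commit message, is excluded from that map.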
sdg_predict/inference.py CHANGED
@@ -1,42 +1,45 @@
 # sdg_predict/inference.py
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
+import logging
+
 
 def load_model(model_name, device):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
     model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
     model.eval()
     return tokenizer, model
 
+
 def batched(iterable, batch_size):
     for i in range(0, len(iterable), batch_size):
-        yield iterable[i:i + batch_size]
+        yield iterable[i : i + batch_size]
+
 
 def predict(texts, tokenizer, model, device, batch_size=8, return_all_scores=True):
-    results = []
-    for batch_texts in batched(texts, batch_size):
-        inputs = tokenizer(
-            batch_texts,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512
-        ).to(device)
-
-        with torch.no_grad():
-            logits = model(**inputs).logits
-            probs = torch.nn.functional.softmax(logits, dim=-1)
-
-        for prob in probs:
-            if return_all_scores:
-                results.append([
-                    {"label": model.config.id2label[i], "score": prob[i].item()}
-                    for i in range(len(prob))
-                ])
-            else:
-                top = torch.argmax(prob).item()
-                results.append({
-                    "label": model.config.id2label[top],
-                    "score": prob[top].item()
-                })
+    classifier = pipeline(
+        "text-classification",
+        model=model,
+        tokenizer=tokenizer,
+        device=device,
+        batch_size=batch_size,
+        truncation=True,
+        padding=True,
+        max_length=512,
+        top_k=None if return_all_scores else 1,
+    )
+
+    results = classifier(texts)
+    if return_all_scores:
+        for result in results:
+            for score in result:
+                score["score"] = round(
+                    score["score"], 3
+                )  # Round scores to 3 decimal places
+    else:
+        for result in results:
+            result["score"] = round(
+                result["score"], 3
+            )  # Round top score to 3 decimal places
+
     return results
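Since predict() now delegates to the transformers text-classification pipeline, it can be exercised directly. A minimal sketch (the example text and printed output are invented; the model id is simply the CLI default):

import torch
from sdg_predict.inference import load_model, predict

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer, model = load_model("simon-clmtd/sdg-scibert-zo_up", device)

# With return_all_scores=False the pipeline is built with top_k=1,
# so each element of preds is a single {"label", "score"} dict.
preds = predict(
    ["Solar microgrids for rural electrification."],
    tokenizer,
    model,
    device,
    return_all_scores=False,
)
print(preds)  # e.g. [{'label': '7', 'score': 0.912}]

Note that predict() constructs a fresh pipeline on every call, so callers processing many small chunks pay that setup cost repeatedly; passing all texts in a single call avoids it.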
setup.py CHANGED
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name="sdg-predict",
-    version="0.1",
+    version="0.2",
     packages=find_packages(),
     install_requires=[
         "transformers>=4.36",
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6728d45aae9302ac2656ddf218d44224848150f29ea3fafc5535dde9cd98003
+oid sha256:a4b473744ec4c80646022813576aa0fa32733d67c31a15bc75b51c2d5cb456e6
 size 5713