YoussefMorad1 committed on
Commit
115f5eb
·
1 Parent(s): f6503f8

Used chunks to handle LARGE job descriptions (above 512 tokens).

Browse files
skills_extraction/skills_extraction.py CHANGED
@@ -1,15 +1,28 @@
1
- import json
2
-
3
  import numpy as np
4
  from fastapi import FastAPI
5
  from pydantic import BaseModel
6
- from transformers import pipeline
 
 
 
7
 
8
  # Load models and tokenizers
9
- knowledge_nlp = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")
10
- skill_nlp = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
 
 
 
 
 
11
 
12
- app = FastAPI()
 
 
 
 
 
 
13
 
14
 
15
  class TextInput(BaseModel):
@@ -19,7 +32,7 @@ class TextInput(BaseModel):
19
  def convert_from_numpy(predictions):
20
  for pred in predictions:
21
  for key, value in pred.items():
22
- if isinstance(value, (np.float32, np.int32, np.int64)): # Handle NumPy numeric types
23
  pred[key] = float(value)
24
  return predictions
25
 
@@ -27,33 +40,63 @@ def convert_from_numpy(predictions):
27
  def merge_BI_and_get_results(predictions):
28
  results, curSkill, curScore, curNoWords = [], "", 0, 0
29
  for pred in predictions:
30
- if pred['entity_group'] == 'B':
31
  if curSkill:
32
- results.append({"name": curSkill, "confidence": curScore / curNoWords}) # Average confidence
33
- curSkill, curScore, curNoWords = pred['word'], pred['score'], 1
 
 
34
  else:
35
- curSkill += " " + pred['word']
36
- curScore += pred['score']
37
  curNoWords += 1
38
  if curSkill:
39
- results.append({"name": curSkill, "confidence": curScore / curNoWords})
40
  return results
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  @app.post("/predict_knowledge")
44
  def predict_knowledge(input_data: TextInput):
45
- predictions = knowledge_nlp(input_data.jobDescription)
46
- predictions = convert_from_numpy(predictions)
47
- # print(json.dumps(predictions, indent=2))
48
- return {"knowledge_predictions": merge_BI_and_get_results(predictions)}
 
 
 
 
49
 
50
 
51
  @app.post("/predict_skills")
52
  def predict_skills(input_data: TextInput):
53
- predictions = skill_nlp(input_data.jobDescription)
54
- predictions = convert_from_numpy(predictions)
55
- # print(json.dumps(predictions, indent=2))
56
- return {"skills_predictions": merge_BI_and_get_results(predictions)}
 
 
 
57
 
58
  # Run with:
59
  # uvicorn main:app --host 0.0.0.0 --port 8000
 
1
+ import string
 
2
  import numpy as np
3
  from fastapi import FastAPI
4
  from pydantic import BaseModel
5
+ from transformers import pipeline, AutoTokenizer
6
+
7
# Initialize FastAPI
app = FastAPI()


def _load_extractor(model_name):
    """Load the tokenizer for ``model_name`` and build its pipeline.

    Returns a ``(tokenizer, pipeline)`` pair. The tokenizer is returned
    separately because the request handlers also pass it to ``chunk_text``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    extractor = pipeline(
        model=model_name,
        tokenizer=tokenizer,
        aggregation_strategy="first",
    )
    return tokenizer, extractor


# Load models and tokenizers
knowledge_model_name = "jjzha/jobbert_knowledge_extraction"
knowledge_tokenizer, knowledge_nlp = _load_extractor(knowledge_model_name)

skill_model_name = "jjzha/jobbert_skill_extraction"
skill_tokenizer, skill_nlp = _load_extractor(skill_model_name)
26
 
27
 
28
  class TextInput(BaseModel):
 
32
def convert_from_numpy(predictions):
    """Replace NumPy scalar values in each prediction with native Python values.

    Every value that is a NumPy scalar (any subclass of ``np.generic``:
    ``float16/32/64``, ``int8..int64``, ``bool_``, ...) is converted with
    ``.item()``, so floats stay floats and integers stay integers. Values of
    any other type are left untouched.

    Args:
        predictions: Iterable of dicts, modified in place.

    Returns:
        The same ``predictions`` object, for call chaining.
    """
    for pred in predictions:
        for key, value in pred.items():
            # Re-assigning an existing key does not resize the dict, so it is
            # safe to do while iterating over ``items()``.
            if isinstance(value, np.generic):
                pred[key] = value.item()
    return predictions
38
 
 
40
def merge_BI_and_get_results(predictions):
    """Collapse a B/I-tagged prediction sequence into named entities.

    A prediction whose ``entity_group`` is ``"B"`` opens a new entity; any
    other prediction is appended (space separated) to the entity currently
    being built. Each finished entity is reported as
    ``{"name": <stripped text>, "confidence": <mean of its word scores>}``.

    Args:
        predictions: Iterable of dicts with ``entity_group``, ``word`` and
            ``score`` keys.

    Returns:
        List of ``{"name", "confidence"}`` dicts in order of appearance.
    """
    merged = []
    name, total, count = "", 0, 0

    def emit():
        # Only a non-empty accumulated name counts as a finished entity.
        if name:
            merged.append({"name": name.strip(), "confidence": total / count})

    for entity in predictions:
        if entity["entity_group"] != "B":
            # Continuation: extend the entity that is currently open.
            name = name + " " + entity["word"]
            total = total + entity["score"]
            count = count + 1
            continue
        # A new entity begins: flush the previous one first.
        emit()
        name, total, count = entity["word"], entity["score"], 1

    emit()
    return merged
56
 
57
 
58
def chunk_text(text, tokenizer, max_length=500, overlap=100):
    """Split ``text`` into overlapping pieces using the tokenizer's overflow support.

    The tokenizer is called with truncation enabled, ``max_length`` as the
    per-piece limit, ``overlap`` as the stride, and
    ``return_overflowing_tokens=True``. Every ``input_ids`` sequence it
    returns is decoded back to a string with special tokens skipped.

    Args:
        text: The text to split.
        tokenizer: Callable tokenizer that also provides ``decode``.
        max_length: Value passed to the tokenizer as ``max_length``.
        overlap: Value passed to the tokenizer as ``stride``.

    Returns:
        List of decoded chunk strings, in tokenizer order.
    """
    window_options = {
        "truncation": True,
        "max_length": max_length,
        "stride": overlap,
        "return_overflowing_tokens": True,
        "return_special_tokens_mask": False,
    }
    encoding = tokenizer(text, **window_options)
    # Decode each window of token ids back to plain text.
    return [
        tokenizer.decode(token_ids, skip_special_tokens=True)
        for token_ids in encoding["input_ids"]
    ]
77
+
78
+
79
@app.post("/predict_knowledge")
def predict_knowledge(input_data: TextInput):
    """Extract knowledge entities from a job description of any length.

    The description is filtered to ``string.printable`` characters, split
    into overlapping chunks with ``chunk_text``, and each chunk is run
    through ``knowledge_nlp``. B/I merging is done per chunk, and the
    per-chunk results are concatenated in chunk order.

    Returns:
        ``{"knowledge_predictions": [{"name": ..., "confidence": ...}, ...]}``
    """
    # Clean non-printable chars. string.printable is ASCII-only, so this also
    # removes every non-ASCII character from the description.
    allowed = set(string.printable)
    text = "".join(ch for ch in input_data.jobDescription if ch in allowed)

    results = []
    for chunk in chunk_text(text, knowledge_tokenizer):
        preds = convert_from_numpy(knowledge_nlp(chunk))
        # Merge inside each chunk: if predictions from all chunks were merged
        # as one sequence, an "I" token opening a chunk would be appended to
        # the last entity of the previous chunk.
        results.extend(merge_BI_and_get_results(preds))

    # NOTE(review): chunks overlap, so an entity located in the overlap
    # region may be reported once per chunk that contains it.
    return {"knowledge_predictions": results}
89
 
90
 
91
@app.post("/predict_skills")
def predict_skills(input_data: TextInput):
    """Extract skill entities from a job description of any length.

    The description is filtered to ``string.printable`` characters, split
    into overlapping chunks with ``chunk_text``, and each chunk is run
    through ``skill_nlp``. B/I merging is done per chunk, and the per-chunk
    results are concatenated in chunk order.

    Returns:
        ``{"skills_predictions": [{"name": ..., "confidence": ...}, ...]}``
    """
    # Keep only string.printable characters. That set is ASCII-only, so this
    # also removes every non-ASCII character from the description.
    allowed = set(string.printable)
    text = "".join(ch for ch in input_data.jobDescription if ch in allowed)

    results = []
    for chunk in chunk_text(text, skill_tokenizer):
        preds = convert_from_numpy(skill_nlp(chunk))
        # Merge inside each chunk: if predictions from all chunks were merged
        # as one sequence, an "I" token opening a chunk would be appended to
        # the last entity of the previous chunk.
        results.extend(merge_BI_and_get_results(preds))

    # NOTE(review): chunks overlap, so an entity located in the overlap
    # region may be reported once per chunk that contains it.
    return {"skills_predictions": results}
100
 
101
  # Run with:
102
  # uvicorn main:app --host 0.0.0.0 --port 8000