Akhilgautam30 committed on
Commit 4370811 · verified · 1 Parent(s): 9f8c141

Update model_utils.py

Files changed (1)
  1. model_utils.py +126 -123
model_utils.py CHANGED
@@ -1,123 +1,126 @@
- # model_utils.py
- import os
- import nltk
- import ssl
- import tempfile
-
- # Create a temporary directory for NLTK data
- nltk_data_dir = tempfile.mkdtemp()
-
- # Set the NLTK data path
- nltk.data.path.append(nltk_data_dir)
-
- # Download stopwords to the temporary directory
- try:
-     _create_unverified_https_context = ssl._create_unverified_context
- except AttributeError:
-     pass
- else:
-     ssl._create_default_https_context = _create_unverified_https_context
-
- nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
-
- from nltk.corpus import stopwords
- import tensorflow as tf
- from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
- import numpy as np
- from keras.preprocessing.text import Tokenizer
- # Define the personality trait labels
- traits = ['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']
-
- def preprocess(docs):
-     stopwrd = set(stopwords.words('english'))
-     t = Tokenizer(num_words=20000, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
-     t.fit_on_texts(docs)
-     encoded_docs = t.texts_to_sequences(docs)
-     idx2word = {v: k for k, v in t.word_index.items()}
-
-     def abbreviation_handler(text):
-         ln = text.lower()
-         ln = ln.replace(r"'t", " not")
-         ln = ln.replace(r"'s", " is")
-         ln = ln.replace(r"'ll", " will")
-         ln = ln.replace(r"'ve", " have")
-         ln = ln.replace(r"'re", " are")
-         ln = ln.replace(r"'m", " am")
-         ln = ln.replace(r"'", " ")
-         return ln
-
-     def stopwords_handler(text):
-         words = text.split()
-         new_words = [w for w in words if w not in stopwrd]
-         return ' '.join(new_words)
-
-     def sequence_to_text(listOfSequences):
-         tokenized_list = []
-         for text in listOfSequences:
-             newText = ''
-             for num in text:
-                 newText += idx2word[num] + ' '
-             newText = abbreviation_handler(newText)
-             newText = stopwords_handler(newText)
-             tokenized_list.append(newText)
-         return tokenized_list
-
-     newLists = sequence_to_text(encoded_docs)
-     return newLists
-
- def tokenize_text(text, hugging_model='roberta-base'):
-     print("tokenize_text")
-     clean_text = preprocess(text)
-     tokenizer = AutoTokenizer.from_pretrained(hugging_model)
-     inputs = tokenizer(clean_text, padding=True, truncation=True, return_tensors='tf')
-     x = dict(inputs)
-     return x
-
- def single_predict(model, text, traits=['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']):
-     print("predict function-----")
-     traits_scores = dict()
-     predicted_labels = dict()
-     x = tokenize_text([text])
-     logits = model.predict(x, verbose=0).logits
-     print("logits function-----")
-     probs = tf.math.sigmoid(logits).numpy()
-     print("sigmoid function-----")
-     predictions = np.where(probs > 0.5, 1, 0)
-     print("predictions function------")
-     for t, s in zip(traits, probs[0]):
-         traits_scores[t] = s
-     for t, l in zip(traits, predictions[0]):
-         predicted_labels[t] = l
-     final_dic = {'probability': traits_scores, 'predicted_label': predicted_labels}
-     return final_dic
-
- def load_model_and_weights(hugging_model='roberta-base', output_folder='.'):
-
-     print(f"Current working directory: {os.getcwd()}")
-     print(f"Output folder: {output_folder}")
-     print("Files in the output folder:")
-     for file in os.listdir(output_folder):
-         print(f"- {file}")
-
-     model = TFAutoModelForSequenceClassification.from_pretrained(
-         hugging_model, num_labels=len(traits), problem_type="multi_label_classification"
-     )
-     if len(hugging_model.split('/')) > 1:
-         _hugging_model = hugging_model.split('/')[1]
-     else:
-         _hugging_model = hugging_model.split('/')[0]
-
-     weights_path = os.path.join(output_folder, f'weights-{_hugging_model}.h5')
-     print(f"Looking for weights file at: {weights_path}")
-     if os.path.exists(weights_path):
-         try:
-             model.load_weights(weights_path)
-             print("Custom weights loaded successfully.")
-         except Exception as e:
-             print(f"Error loading weights: {str(e)}")
-             print("Using default weights.")
-             return e
-     else:
-         print(f"Warning: Custom weights file not found at {weights_path}")
-         print("Using default weights.")
-     return model

+ # model_utils.py
+ import os
+ import nltk
+ import ssl
+ import tempfile
+
+ # Create a temporary directory for NLTK data
+ nltk_data_dir = tempfile.mkdtemp()
+
+ # Set the NLTK data path
+ nltk.data.path.append(nltk_data_dir)
+
+ # Download stopwords to the temporary directory
+ try:
+     _create_unverified_https_context = ssl._create_unverified_context
+ except AttributeError:
+     pass
+ else:
+     ssl._create_default_https_context = _create_unverified_https_context
+
+ nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
+
+ from nltk.corpus import stopwords
+ import tensorflow as tf
+ from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+ import numpy as np
+ from keras.preprocessing.text import Tokenizer
+ # Define the personality trait labels
+ traits = ['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']
+
+ def preprocess(docs):
+     stopwrd = set(stopwords.words('english'))
+     t = Tokenizer(num_words=20000, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
+     t.fit_on_texts(docs)
+     encoded_docs = t.texts_to_sequences(docs)
+     idx2word = {v: k for k, v in t.word_index.items()}
+
+     def abbreviation_handler(text):
+         ln = text.lower()
+         ln = ln.replace(r"'t", " not")
+         ln = ln.replace(r"'s", " is")
+         ln = ln.replace(r"'ll", " will")
+         ln = ln.replace(r"'ve", " have")
+         ln = ln.replace(r"'re", " are")
+         ln = ln.replace(r"'m", " am")
+         ln = ln.replace(r"'", " ")
+         return ln
+
+     def stopwords_handler(text):
+         words = text.split()
+         new_words = [w for w in words if w not in stopwrd]
+         return ' '.join(new_words)
+
+     def sequence_to_text(listOfSequences):
+         tokenized_list = []
+         for text in listOfSequences:
+             newText = ''
+             for num in text:
+                 newText += idx2word[num] + ' '
+             newText = abbreviation_handler(newText)
+             newText = stopwords_handler(newText)
+             tokenized_list.append(newText)
+         return tokenized_list
+
+     newLists = sequence_to_text(encoded_docs)
+     return newLists
+
+ def tokenize_text(text, hugging_model='roberta-base'):
+     print("tokenize_text")
+     clean_text = preprocess(text)
+     tokenizer = AutoTokenizer.from_pretrained(hugging_model)
+     inputs = tokenizer(clean_text, padding=True, truncation=True, return_tensors='tf')
+     x = dict(inputs)
+     return x
+
+ def single_predict(model, text, traits=['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']):
+     print("predict function-----")
+     traits_scores = dict()
+     predicted_labels = dict()
+     x = tokenize_text([text])
+     logits = model.predict(x, verbose=0).logits
+     print("logits function-----")
+     probs = tf.math.sigmoid(logits).numpy()
+     print("sigmoid function-----")
+     predictions = np.where(probs > 0.5, 1, 0)
+     print("predictions function------")
+     for t, s in zip(traits, probs[0]):
+         traits_scores[t] = s
+     print("t, s in")
+     for t, l in zip(traits, predictions[0]):
+         predicted_labels[t] = l
+     print("t, l in")
+     final_dic = {'probability': traits_scores, 'predicted_label': predicted_labels}
+     print("end predict function------")
+     return final_dic
+
+ def load_model_and_weights(hugging_model='roberta-base', output_folder='.'):
+
+     print(f"Current working directory: {os.getcwd()}")
+     print(f"Output folder: {output_folder}")
+     print("Files in the output folder:")
+     for file in os.listdir(output_folder):
+         print(f"- {file}")
+
+     model = TFAutoModelForSequenceClassification.from_pretrained(
+         hugging_model, num_labels=len(traits), problem_type="multi_label_classification"
+     )
+     if len(hugging_model.split('/')) > 1:
+         _hugging_model = hugging_model.split('/')[1]
+     else:
+         _hugging_model = hugging_model.split('/')[0]
+
+     weights_path = os.path.join(output_folder, f'weights-{_hugging_model}.h5')
+     print(f"Looking for weights file at: {weights_path}")
+     if os.path.exists(weights_path):
+         try:
+             model.load_weights(weights_path)
+             print("Custom weights loaded successfully.")
+         except Exception as e:
+             print(f"Error loading weights: {str(e)}")
+             print("Using default weights.")
+             return e
+     else:
+         print(f"Warning: Custom weights file not found at {weights_path}")
+         print("Using default weights.")
+     return model
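
For reference, a minimal usage sketch of the two helpers this file exposes, load_model_and_weights and single_predict. This sketch is not part of the commit: the file name usage_sketch.py, the sample sentence, and the assumption that weights-roberta-base.h5 sits next to model_utils.py in the working directory are all hypothetical.

# usage_sketch.py -- illustrative only; assumes model_utils.py and (optionally)
# a weights-roberta-base.h5 file are in the current working directory.
from model_utils import load_model_and_weights, single_predict

# Builds a 5-label TFAutoModelForSequenceClassification head and loads the
# custom weights file if it exists; otherwise it keeps the default weights.
model = load_model_and_weights(hugging_model='roberta-base', output_folder='.')

# Returns {'probability': per-trait sigmoid scores,
#          'predicted_label': 0/1 labels thresholded at 0.5}
result = single_predict(model, "I enjoy meeting new people and exploring new ideas.")
print(result['probability'])
print(result['predicted_label'])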