Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,22 +4,21 @@ import json
|
|
4 |
import gradio as gr
|
5 |
|
6 |
# --- Step 1: Load the vocabularies ---
|
7 |
-
# These files are in your Hugging Face Space repository, so we can load them directly.
|
8 |
with open('char_to_int.json', 'r') as f:
|
9 |
char_to_int = json.load(f)
|
10 |
with open('int_to_lang.json', 'r') as f:
|
11 |
-
|
|
|
12 |
|
13 |
# --- Step 2: Re-define the Model Architecture ---
|
14 |
# This MUST be the exact same architecture as the one you trained.
|
15 |
-
# All the hyperparameters (embedding_dim, hidden_dim, etc.) must match.
|
16 |
class CodeClassifierRNN(nn.Module):
|
17 |
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
|
18 |
super().__init__()
|
19 |
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
|
20 |
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first=True)
|
21 |
self.dropout = nn.Dropout(dropout)
|
22 |
-
self.fc = nn.Linear(hidden_dim * 2, output_dim)
|
23 |
def forward(self, text):
|
24 |
embedded = self.embedding(text)
|
25 |
_, (hidden, _) = self.lstm(embedded)
|
@@ -29,45 +28,35 @@ class CodeClassifierRNN(nn.Module):
|
|
29 |
return output
|
30 |
|
31 |
# --- Step 3: Instantiate the model and load the trained weights ---
|
32 |
-
# Set hyperparameters to match your training script
|
33 |
PAD_IDX = char_to_int['<PAD>']
|
34 |
VOCAB_SIZE = len(char_to_int)
|
35 |
EMBEDDING_DIM = 128
|
36 |
-
HIDDEN_DIM = 192
|
37 |
OUTPUT_DIM = len(int_to_lang)
|
38 |
N_LAYERS = 2
|
39 |
BIDIRECTIONAL = True
|
40 |
DROPOUT = 0.5
|
41 |
|
42 |
-
# Create an instance of the model
|
43 |
model = CodeClassifierRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
|
44 |
-
|
45 |
-
# Load the saved state dictionary.
|
46 |
-
# We use map_location='cpu' because the Space runs on a CPU.
|
47 |
model.load_state_dict(torch.load('polyglot_classifier.pt', map_location='cpu'))
|
48 |
-
model.eval()
|
49 |
|
50 |
# --- Step 4: Create the prediction function ---
|
51 |
def classify_code(code_snippet):
|
52 |
-
if not code_snippet:
|
53 |
return {}
|
54 |
|
55 |
-
# 1. Convert snippet to tensor of indices
|
56 |
indexed = [char_to_int.get(c, char_to_int['<UNK>']) for c in code_snippet]
|
57 |
-
tensor = torch.LongTensor(indexed).unsqueeze(0)
|
58 |
|
59 |
-
# 2. Make prediction
|
60 |
with torch.no_grad():
|
61 |
prediction = model(tensor)
|
62 |
|
63 |
-
# 3. Get probabilities using softmax
|
64 |
probabilities = torch.softmax(prediction, dim=1)
|
65 |
-
|
66 |
-
# 4. Get top 5 predictions
|
67 |
top5_probs, top5_indices = torch.topk(probabilities, 5)
|
68 |
|
69 |
-
#
|
70 |
-
confidences = {int_to_lang[
|
71 |
|
72 |
return confidences
|
73 |
|
|
|
4 |
import gradio as gr
|
5 |
|
6 |
# --- Step 1: Load the vocabularies ---
|
|
|
7 |
# Character -> index vocabulary used to encode input snippets.
# The encoding is given explicitly so that non-ASCII characters in the
# vocabulary load the same way regardless of the host's locale default.
with open('char_to_int.json', 'r', encoding='utf-8') as f:
    char_to_int = json.load(f)

# Class index -> language name, used to label the model's outputs.
with open('int_to_lang.json', 'r', encoding='utf-8') as f:
    # ROBUSTNESS FIX: Convert JSON string keys ("0", "1") to integer keys (0, 1)
    int_to_lang = {int(k): v for k, v in json.load(f).items()}
|
12 |
|
13 |
# --- Step 2: Re-define the Model Architecture ---
|
14 |
# This MUST be the exact same architecture as the one you trained; otherwise
# the saved weights will not line up with the layers when they are loaded below.
|
|
|
15 |
class CodeClassifierRNN(nn.Module):
|
16 |
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
|
17 |
super().__init__()
|
18 |
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
|
19 |
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first=True)
|
20 |
self.dropout = nn.Dropout(dropout)
|
21 |
+
self.fc = nn.Linear(hidden_dim * 2, output_dim)
|
22 |
def forward(self, text):
|
23 |
embedded = self.embedding(text)
|
24 |
_, (hidden, _) = self.lstm(embedded)
|
|
|
28 |
return output
|
29 |
|
30 |
# --- Step 3: Instantiate the model and load the trained weights ---
|
|
|
31 |
# Hyperparameters. Sizes that depend on the data are derived from the loaded
# vocabularies; the rest must match the values used by the training script.
PAD_IDX = char_to_int['<PAD>']
VOCAB_SIZE = len(char_to_int)
EMBEDDING_DIM = 128
HIDDEN_DIM = 192
OUTPUT_DIM = len(int_to_lang)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Build the network with the hyperparameters above.
model = CodeClassifierRNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

# Restore the trained weights, mapping every tensor onto the CPU.
# NOTE(review): torch.load unpickles the checkpoint file; only load
# checkpoints from a trusted source.
model.load_state_dict(
    torch.load('polyglot_classifier.pt', map_location='cpu')
)

# Switch to inference mode so the dropout layers are disabled.
model.eval()
|
43 |
|
44 |
# --- Step 4: Create the prediction function ---
|
45 |
def classify_code(code_snippet):
    """Predict the most likely programming languages for a code snippet.

    Args:
        code_snippet: Source text to classify. ``None``, empty, and
            whitespace-only input short-circuit to an empty result.

    Returns:
        A dict mapping language name to its softmax probability for the
        highest-scoring languages (at most five, fewer when the label map
        has fewer classes), or ``{}`` when there is nothing to classify.
    """
    if not code_snippet or not code_snippet.strip():
        return {}

    # Characters missing from the vocabulary all map to <UNK>; look it up once
    # instead of once per character.
    unk_idx = char_to_int['<UNK>']
    indexed = [char_to_int.get(c, unk_idx) for c in code_snippet]
    # unsqueeze(0) adds a leading dimension so the snippet forms a batch of one.
    tensor = torch.tensor(indexed, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(tensor)

    probabilities = torch.softmax(prediction, dim=1)

    # torch.topk raises when k exceeds the number of classes, so cap k at the
    # size of the class dimension.
    k = min(5, probabilities.size(1))
    top_probs, top_indices = torch.topk(probabilities, k)

    # ROBUSTNESS FIX: Simplified lookup using integer keys
    return {
        int_to_lang[idx]: prob
        for idx, prob in zip(top_indices[0].tolist(), top_probs[0].tolist())
    }
|
62 |
|