szili2011 committed on
Commit
298e502
·
verified ·
1 Parent(s): 720fd6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -20
app.py CHANGED
@@ -4,22 +4,21 @@ import json
4
  import gradio as gr
5
 
6
  # --- Step 1: Load the vocabularies ---
7
- # These files are in your Hugging Face Space repository, so we can load them directly.
8
  with open('char_to_int.json', 'r') as f:
9
  char_to_int = json.load(f)
10
  with open('int_to_lang.json', 'r') as f:
11
- int_to_lang = json.load(f)
 
12
 
13
  # --- Step 2: Re-define the Model Architecture ---
14
  # This MUST be the exact same architecture as the one you trained.
15
- # All the hyperparameters (embedding_dim, hidden_dim, etc.) must match.
16
  class CodeClassifierRNN(nn.Module):
17
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
18
  super().__init__()
19
  self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
20
  self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first=True)
21
  self.dropout = nn.Dropout(dropout)
22
- self.fc = nn.Linear(hidden_dim * 2, output_dim) # * 2 for bidirectional
23
  def forward(self, text):
24
  embedded = self.embedding(text)
25
  _, (hidden, _) = self.lstm(embedded)
@@ -29,45 +28,35 @@ class CodeClassifierRNN(nn.Module):
29
  return output
30
 
31
  # --- Step 3: Instantiate the model and load the trained weights ---
32
- # Set hyperparameters to match your training script
33
  PAD_IDX = char_to_int['<PAD>']
34
  VOCAB_SIZE = len(char_to_int)
35
  EMBEDDING_DIM = 128
36
- HIDDEN_DIM = 192 # Must match the final trained model
37
  OUTPUT_DIM = len(int_to_lang)
38
  N_LAYERS = 2
39
  BIDIRECTIONAL = True
40
  DROPOUT = 0.5
41
 
42
- # Create an instance of the model
43
  model = CodeClassifierRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
44
-
45
- # Load the saved state dictionary.
46
- # We use map_location='cpu' because the Space runs on a CPU.
47
  model.load_state_dict(torch.load('polyglot_classifier.pt', map_location='cpu'))
48
- model.eval() # Set the model to evaluation mode
49
 
50
  # --- Step 4: Create the prediction function ---
51
  def classify_code(code_snippet):
52
- if not code_snippet:
53
  return {}
54
 
55
- # 1. Convert snippet to tensor of indices
56
  indexed = [char_to_int.get(c, char_to_int['<UNK>']) for c in code_snippet]
57
- tensor = torch.LongTensor(indexed).unsqueeze(0) # Add batch dimension
58
 
59
- # 2. Make prediction
60
  with torch.no_grad():
61
  prediction = model(tensor)
62
 
63
- # 3. Get probabilities using softmax
64
  probabilities = torch.softmax(prediction, dim=1)
65
-
66
- # 4. Get top 5 predictions
67
  top5_probs, top5_indices = torch.topk(probabilities, 5)
68
 
69
- # 5. Format for Gradio output
70
- confidences = {int_to_lang[str(idx.item())]: prob.item() for idx, prob in zip(top5_indices[0], top5_probs[0])}
71
 
72
  return confidences
73
 
 
4
  import gradio as gr
5
 
6
  # --- Step 1: Load the vocabularies ---
 
7
  with open('char_to_int.json', 'r') as f:
8
  char_to_int = json.load(f)
9
  with open('int_to_lang.json', 'r') as f:
10
+ # ROBUSTNESS FIX: Convert JSON string keys ("0", "1") to integer keys (0, 1)
11
+ int_to_lang = {int(k): v for k, v in json.load(f).items()}
12
 
13
  # --- Step 2: Re-define the Model Architecture ---
14
  # This MUST be the exact same architecture as the one you trained.
 
15
  class CodeClassifierRNN(nn.Module):
16
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
17
  super().__init__()
18
  self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
19
  self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first=True)
20
  self.dropout = nn.Dropout(dropout)
21
+ self.fc = nn.Linear(hidden_dim * 2, output_dim)
22
  def forward(self, text):
23
  embedded = self.embedding(text)
24
  _, (hidden, _) = self.lstm(embedded)
 
28
  return output
29
 
30
  # --- Step 3: Instantiate the model and load the trained weights ---
 
31
  PAD_IDX = char_to_int['<PAD>']
32
  VOCAB_SIZE = len(char_to_int)
33
  EMBEDDING_DIM = 128
34
+ HIDDEN_DIM = 192
35
  OUTPUT_DIM = len(int_to_lang)
36
  N_LAYERS = 2
37
  BIDIRECTIONAL = True
38
  DROPOUT = 0.5
39
 
 
40
  model = CodeClassifierRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
 
 
 
41
  model.load_state_dict(torch.load('polyglot_classifier.pt', map_location='cpu'))
42
+ model.eval()
43
 
44
  # --- Step 4: Create the prediction function ---
45
  def classify_code(code_snippet):
46
+ if not code_snippet or not code_snippet.strip():
47
  return {}
48
 
 
49
  indexed = [char_to_int.get(c, char_to_int['<UNK>']) for c in code_snippet]
50
+ tensor = torch.LongTensor(indexed).unsqueeze(0)
51
 
 
52
  with torch.no_grad():
53
  prediction = model(tensor)
54
 
 
55
  probabilities = torch.softmax(prediction, dim=1)
 
 
56
  top5_probs, top5_indices = torch.topk(probabilities, 5)
57
 
58
+ # ROBUSTNESS FIX: Simplified lookup using integer keys
59
+ confidences = {int_to_lang[idx.item()]: prob.item() for idx, prob in zip(top5_indices[0], top5_probs[0])}
60
 
61
  return confidences
62