Spaces:

szili2011
/

polyglot-code-classifier

Sleeping

szili2011 committed on Jun 9

Commit

a61bd5c

verified ·

1 Parent(s): 298e502

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,11 +3,14 @@ import torch.nn as nn
 import json
 import gradio as gr
-# --- Step 1: Load the vocabularies ---
-with open('char_to_int.json', 'r') as f:
     char_to_int = json.load(f)
-with open('int_to_lang.json', 'r') as f:
-    # ROBUSTNESS FIX: Convert JSON string keys ("0", "1") to integer keys (0, 1)
     int_to_lang = {int(k): v for k, v in json.load(f).items()}
 # --- Step 2: Re-define the Model Architecture ---
@@ -18,7 +21,7 @@ class CodeClassifierRNN(nn.Module):
         self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first=True)
         self.dropout = nn.Dropout(dropout)
-        self.fc = nn.Linear(hidden_dim * 2, output_dim)
     def forward(self, text):
         embedded = self.embedding(text)
         _, (hidden, _) = self.lstm(embedded)
@@ -55,7 +58,7 @@ def classify_code(code_snippet):
     probabilities = torch.softmax(prediction, dim=1)
     top5_probs, top5_indices = torch.topk(probabilities, 5)
-    # ROBUSTNESS FIX: Simplified lookup using integer keys
     confidences = {int_to_lang[idx.item()]: prob.item() for idx, prob in zip(top5_indices[0], top5_probs[0])}
     return confidences

 import json
 import gradio as gr
+# --- Step 1: Load the vocabularies CORRECTLY ---
+with open('char_to_int.json', 'r', encoding='utf-8') as f:
     char_to_int = json.load(f)
+# This loads your ACTUAL int_to_lang.json file: {"0": "C#", "1": "C++", ...}
+# The FIX is to correctly convert the string keys "0", "1", etc., to integer keys 0, 1, etc.
+with open('int_to_lang.json', 'r', encoding='utf-8') as f:
     int_to_lang = {int(k): v for k, v in json.load(f).items()}
 # --- Step 2: Re-define the Model Architecture ---
         self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first=True)
         self.dropout = nn.Dropout(dropout)
+        self.fc = nn.Linear(hidden_dim * 2, output_dim) # * 2 for bidirectional
     def forward(self, text):
         embedded = self.embedding(text)
         _, (hidden, _) = self.lstm(embedded)
     probabilities = torch.softmax(prediction, dim=1)
     top5_probs, top5_indices = torch.topk(probabilities, 5)
+    # This lookup is now guaranteed to work with the correctly loaded dictionary.
     confidences = {int_to_lang[idx.item()]: prob.item() for idx, prob in zip(top5_indices[0], top5_probs[0])}
     return confidences