Spaces:

szili2011
/

polyglot-code-classifier

Sleeping

App Files Files Community

szili2011 committed on Jun 9

Commit

298e502

verified ·

1 Parent(s): 720fd6b

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -20

app.py CHANGED Viewed

@@ -4,22 +4,21 @@ import json
 import gradio as gr
 # --- Step 1: Load the vocabularies ---
-# These files are in your Hugging Face Space repository, so we can load them directly.
 with open('char_to_int.json', 'r') as f:
     char_to_int = json.load(f)
 with open('int_to_lang.json', 'r') as f:
-    int_to_lang = json.load(f)
 # --- Step 2: Re-define the Model Architecture ---
 # This MUST be the exact same architecture as the one you trained.
-# All the hyperparameters (embedding_dim, hidden_dim, etc.) must match.
 class CodeClassifierRNN(nn.Module):
     def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
         super().__init__()
         self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first=True)
         self.dropout = nn.Dropout(dropout)
-        self.fc = nn.Linear(hidden_dim * 2, output_dim) # * 2 for bidirectional
     def forward(self, text):
         embedded = self.embedding(text)
         _, (hidden, _) = self.lstm(embedded)
@@ -29,45 +28,35 @@ class CodeClassifierRNN(nn.Module):
         return output
 # --- Step 3: Instantiate the model and load the trained weights ---
-# Set hyperparameters to match your training script
 PAD_IDX = char_to_int['<PAD>']
 VOCAB_SIZE = len(char_to_int)
 EMBEDDING_DIM = 128
-HIDDEN_DIM = 192  # Must match the final trained model
 OUTPUT_DIM = len(int_to_lang)
 N_LAYERS = 2
 BIDIRECTIONAL = True
 DROPOUT = 0.5
-# Create an instance of the model
 model = CodeClassifierRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
-# Load the saved state dictionary.
-# We use map_location='cpu' because the Space runs on a CPU.
 model.load_state_dict(torch.load('polyglot_classifier.pt', map_location='cpu'))
-model.eval() # Set the model to evaluation mode
 # --- Step 4: Create the prediction function ---
 def classify_code(code_snippet):
-    if not code_snippet:
         return {}
-    # 1. Convert snippet to tensor of indices
     indexed = [char_to_int.get(c, char_to_int['<UNK>']) for c in code_snippet]
-    tensor = torch.LongTensor(indexed).unsqueeze(0) # Add batch dimension
-    # 2. Make prediction
     with torch.no_grad():
         prediction = model(tensor)
-    # 3. Get probabilities using softmax
     probabilities = torch.softmax(prediction, dim=1)
-    # 4. Get top 5 predictions
     top5_probs, top5_indices = torch.topk(probabilities, 5)
-    # 5. Format for Gradio output
-    confidences = {int_to_lang[str(idx.item())]: prob.item() for idx, prob in zip(top5_indices[0], top5_probs[0])}
     return confidences

 import gradio as gr
 # --- Step 1: Load the vocabularies ---
 with open('char_to_int.json', 'r') as f:
     char_to_int = json.load(f)
 with open('int_to_lang.json', 'r') as f:
+    # ROBUSTNESS FIX: Convert JSON string keys ("0", "1") to integer keys (0, 1)
+    int_to_lang = {int(k): v for k, v in json.load(f).items()}
 # --- Step 2: Re-define the Model Architecture ---
 # This MUST be the exact same architecture as the one you trained.
 class CodeClassifierRNN(nn.Module):
     def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
         super().__init__()
         self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first=True)
         self.dropout = nn.Dropout(dropout)
+        self.fc = nn.Linear(hidden_dim * 2, output_dim)
     def forward(self, text):
         embedded = self.embedding(text)
         _, (hidden, _) = self.lstm(embedded)
         return output
 # --- Step 3: Instantiate the model and load the trained weights ---
 PAD_IDX = char_to_int['<PAD>']
 VOCAB_SIZE = len(char_to_int)
 EMBEDDING_DIM = 128
+HIDDEN_DIM = 192
 OUTPUT_DIM = len(int_to_lang)
 N_LAYERS = 2
 BIDIRECTIONAL = True
 DROPOUT = 0.5
 model = CodeClassifierRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
 model.load_state_dict(torch.load('polyglot_classifier.pt', map_location='cpu'))
+model.eval()
 # --- Step 4: Create the prediction function ---
 def classify_code(code_snippet):
+    if not code_snippet or not code_snippet.strip():
         return {}
     indexed = [char_to_int.get(c, char_to_int['<UNK>']) for c in code_snippet]
+    tensor = torch.LongTensor(indexed).unsqueeze(0)
     with torch.no_grad():
         prediction = model(tensor)
     probabilities = torch.softmax(prediction, dim=1)
     top5_probs, top5_indices = torch.topk(probabilities, 5)
+    # ROBUSTNESS FIX: Simplified lookup using integer keys
+    confidences = {int_to_lang[idx.item()]: prob.item() for idx, prob in zip(top5_indices[0], top5_probs[0])}
     return confidences