Spaces:

szili2011
/

polyglot-code-classifier

Sleeping

App Files Files Community

szili2011 committed on Jun 9

Commit

0cf02bf

verified ·

1 Parent(s): 6925566

Create app.py

Browse files

Files changed (1) hide show

app.py +88 -0

app.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import torch
+import torch.nn as nn
+import json
+import gradio as gr
# --- Step 1: Load the vocabularies ---
# These files are in the Hugging Face Space repository, so they can be opened
# by relative path. The encoding is pinned to UTF-8 so that any non-ASCII
# characters in the vocabulary decode the same way regardless of the
# platform's default locale.
with open('char_to_int.json', 'r', encoding='utf-8') as f:
    char_to_int = json.load(f)  # character -> integer index
with open('int_to_lang.json', 'r', encoding='utf-8') as f:
    int_to_lang = json.load(f)  # stringified class index -> language name
+# --- Step 2: Re-define the Model Architecture ---
+# This MUST be the exact same architecture as the one you trained.
+# All the hyperparameters (embedding_dim, hidden_dim, etc.) must match.
class CodeClassifierRNN(nn.Module):
    """Character-level LSTM classifier producing one score per language.

    Input indices are embedded, run through a multi-layer (optionally
    bidirectional) LSTM, and the final hidden state of the top layer is
    projected to ``output_dim`` logits.

    Args:
        vocab_size: Number of entries in the character vocabulary.
        embedding_dim: Size of each character embedding vector.
        hidden_dim: LSTM hidden size (per direction).
        output_dim: Number of target classes.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability, used between LSTM layers (only when
            ``n_layers > 1``) and on the final hidden state.
        pad_idx: Vocabulary index of the padding token; passed to the
            embedding layer as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # nn.LSTM only applies dropout between layers, so it is disabled
            # for a single-layer network.
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        # The classifier sees one hidden vector per direction, so its input
        # width depends on whether the LSTM is bidirectional.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, text):
        """Return raw class logits for a batch of index sequences.

        Args:
            text: Integer tensor laid out as (batch, seq_len); the LSTM is
                built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, output_dim) with unnormalised scores.
        """
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
        if self.lstm.bidirectional:
            # The last two entries of the hidden stack are the top layer's
            # forward and backward final states; join them feature-wise.
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: only the top layer's final state is relevant.
            final_state = hidden[-1, :, :]
        final_state = self.dropout(final_state)
        return self.fc(final_state)
# --- Step 3: Build the model and restore the trained weights ---
# Every hyperparameter below has to mirror the training script so that the
# saved parameters line up with the freshly constructed module.
PAD_IDX = char_to_int['<PAD>']
VOCAB_SIZE = len(char_to_int)
OUTPUT_DIM = len(int_to_lang)
EMBEDDING_DIM = 128
HIDDEN_DIM = 192  # Must match the final trained model
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = CodeClassifierRNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

# The Space runs on a CPU, so tensors are remapped to the CPU while loading.
# NOTE(review): torch.load is pickle-based; only load checkpoints you trust.
_trained_weights = torch.load('polyglot_classifier.pt', map_location='cpu')
model.load_state_dict(_trained_weights)

# Inference only: switch to evaluation mode (disables dropout).
model.eval()
+# --- Step 4: Create the prediction function ---
def classify_code(code_snippet):
    """Predict the most likely programming languages for a code snippet.

    Args:
        code_snippet: Source text to classify. A falsy value (``None`` or an
            empty string) short-circuits to an empty result.

    Returns:
        Dict mapping language name to softmax probability for the highest
        scoring classes (at most five), or ``{}`` when no snippet was given.
    """
    if not code_snippet:
        return {}

    # 1. Encode every character as its vocabulary index; characters missing
    #    from the vocabulary fall back to the <UNK> index.
    unk_idx = char_to_int['<UNK>']
    indexed = [char_to_int.get(ch, unk_idx) for ch in code_snippet]
    batch = torch.tensor(indexed, dtype=torch.long).unsqueeze(0)  # add batch dimension

    # 2. Run the model and normalise the logits into probabilities.
    with torch.no_grad():
        logits = model(batch)
        probabilities = torch.softmax(logits, dim=1)

    # 3. Keep the top predictions. torch.topk raises when k exceeds the number
    #    of classes, so clamp k for models with fewer than five labels.
    k = min(5, probabilities.size(1))
    top_probs, top_indices = torch.topk(probabilities, k)

    # 4. Format for the Gradio Label output; int_to_lang keys are strings.
    return {
        int_to_lang[str(idx.item())]: prob.item()
        for idx, prob in zip(top_indices[0], top_probs[0])
    }
# --- Step 5: Assemble the Gradio interface and start serving ---
# Sample snippets offered in the UI; each inner list is one example input.
_EXAMPLE_SNIPPETS = [
    ["def hello_world():\n    print('Hello from Python!')"],
    ["function greet() {\n    console.log('Hello from JavaScript!');\n}"],
    ["public class Main {\n    public static void main(String[] args) {\n        System.out.println(\"Hello, Java!\");\n    }\n}"],
]

# Input editor: no language is preselected, since detecting it is the model's job.
_snippet_input = gr.Code(language=None, label="Code Snippet")
# Output widget: shows up to five labels with their confidences.
_language_output = gr.Label(num_top_classes=5, label="Predicted Language")

iface = gr.Interface(
    fn=classify_code,
    inputs=_snippet_input,
    outputs=_language_output,
    title="Polyglot Code Classifier",
    description="Enter a code snippet to see which programming language the AI thinks it is. This model was trained from scratch on a custom dataset.",
    examples=_EXAMPLE_SNIPPETS,
)

# Start the web server as soon as the script is executed.
iface.launch()