Vedant Pungliya committed on
Commit
0d00ce9
·
unverified ·
1 Parent(s): fb3fcab
codenet_4000_CasingClassVariable/java/input.in ADDED
The diff for this file is too large to render. See raw diff
 
codenet_4000_CasingClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt ADDED
The diff for this file is too large to render. See raw diff
 
codenet_4000_exactNameClassVariable/java/input.in ADDED
The diff for this file is too large to render. See raw diff
 
codenet_4000_exactNameClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt ADDED
The diff for this file is too large to render. See raw diff
 
codenet_4000_lexical_similar/java/input.in CHANGED
The diff for this file is too large to render. See raw diff
 
codenet_4000_lexical_similar/java/layer12/kmeans/clusters-kmeans-350.txt CHANGED
The diff for this file is too large to render. See raw diff
 
pert.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import numpy as np
3
+ from collections import defaultdict
4
+ from scipy.optimize import linear_sum_assignment
5
+ import os
6
+
7
def load_clusters(path):
    """Read a cluster assignment file and group tokens by cluster id.

    Each line is stripped and split on the ``|||`` delimiter. The first
    field is taken as the token and the last field as the cluster id; any
    fields in between are ignored. Lines that yield fewer than two fields
    (blank lines, lines without the delimiter) are skipped.

    Note that the split is applied to every occurrence of ``|||``, so a
    token that itself contains the delimiter is not handled specially.

    Args:
        path: Path of the UTF-8 encoded cluster file.

    Returns:
        A ``defaultdict(set)`` mapping cluster id (str) to the set of
        token strings assigned to it.
    """
    cluster_to_tokens = defaultdict(set)
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("|||")
            if len(fields) >= 2:
                # First field is the token, last field is the cluster id.
                cluster_to_tokens[fields[-1]].add(fields[0])
    return cluster_to_tokens
18
+
19
def compute_jaccard_matrix(clusters_a, clusters_b):
    """Build the pairwise Jaccard similarity matrix between two clusterings.

    Args:
        clusters_a: Mapping of cluster id to a set of tokens (matrix rows).
        clusters_b: Mapping of cluster id to a set of tokens (matrix columns).

    Returns:
        A tuple ``(matrix, a_keys, b_keys)`` where ``matrix[i, j]`` is
        ``|A_i & B_j| / |A_i | B_j|`` for the clusters named ``a_keys[i]``
        and ``b_keys[j]``. Pairs whose union is empty score ``0.0``.
    """
    a_keys = list(clusters_a)
    b_keys = list(clusters_b)
    # Zero-initialised, so pairs with an empty union keep a score of 0.0.
    matrix = np.zeros((len(a_keys), len(b_keys)))

    for row, key_a in enumerate(a_keys):
        members_a = clusters_a[key_a]
        for col, key_b in enumerate(b_keys):
            members_b = clusters_b[key_b]
            union_size = len(members_a | members_b)
            if union_size > 0:
                matrix[row, col] = len(members_a & members_b) / union_size

    return matrix, a_keys, b_keys
33
+
34
# Maps each perturbation name to the human-readable description that is
# written to the "Description" column of the CSI summary. Names that are
# not present here fall back to a generic placeholder at lookup time.
perturbation_descriptions = {
    "Scope Modification": "Identifies variables in complex scopes and moves them to unrelated blocks.",
    "Lexical Similarity Modification": "Generates lexical variations of class and variable names with different casing.",
    "Log Modification": "Adds logging statements to blocks of code for tracking execution flow.",
    "Operator Modification": "Modifies boolean expressions by negating them in various contexts.",
    "Pointer Modification": "Add C style pointer to the code.",
    "POS finetuned": "Clusters based on finetuned POS codebert model",
    # Alias following the "<Name> Modification" pattern shared by the other
    # keys, so a lookup under that name resolves to the POS description
    # instead of missing. The original "POS finetuned" key is kept as well.
    "POS Modification": "Clusters based on finetuned POS codebert model",
    "Random Modification": "Permutes statements within basic blocks, allowing different execution orders.",
    # NOTE(review): this text talks about switch/if conversion rather than
    # try/catch - looks copied from another perturbation; confirm wording.
    "Try Catch Modification": "Converts switch statements into equivalent if statements.",
    "Unused Statement Modification": "Inserts unused statements into blocks of code for testing/debugging.",
    "Exact Name Class Variable Modification": "Renames classes and variables to a specific randomly generated name.",
    # NOTE(review): identical to the "Lexical Similarity Modification"
    # description - confirm whether the two should be distinguished.
    "Casing Class Variable Modification": "Generates lexical variations of class and variable names with different casing.",
}
48
+
49
def compute_and_log_csi(file_orig, file_pert, perturbation_name, output_csv="results/csi_summary.csv"):
    """Compute the Cluster Sensitivity Index (CSI) and append it to a CSV.

    Both cluster files are loaded, every original cluster is matched
    one-to-one with a perturbed cluster so that the total Jaccard
    similarity is maximal, and the CSI is reported as one minus the mean
    similarity of the matched pairs. The result is printed and appended as
    one row to ``output_csv``; a header row is written first when the file
    is missing or empty.

    Args:
        file_orig: Path of the cluster file for the original data.
        file_pert: Path of the cluster file for the perturbed data.
        perturbation_name: Label stored in the "Perturbation" column and
            used to look up the description text.
        output_csv: Path of the summary CSV to append to. Its parent
            directory is created when the path has one.

    Returns:
        A tuple ``(avg_jaccard, csi)``.

    Raises:
        ValueError: If the two files contain a different number of clusters.
    """
    clusters_orig = load_clusters(file_orig)
    clusters_pert = load_clusters(file_pert)

    if len(clusters_orig) != len(clusters_pert):
        raise ValueError(f"Cluster count mismatch: {len(clusters_orig)} (original) vs {len(clusters_pert)} (perturbed)")

    jaccard_matrix, _, _ = compute_jaccard_matrix(clusters_orig, clusters_pert)

    # linear_sum_assignment minimises total cost, so negate the similarities
    # to obtain the matching with maximal total Jaccard similarity.
    row_ind, col_ind = linear_sum_assignment(-jaccard_matrix)

    matched_similarities = [jaccard_matrix[i, j] for i, j in zip(row_ind, col_ind)]
    avg_jaccard = np.mean(matched_similarities)
    csi = 1.0 - avg_jaccard

    print(f"Perturbation: {perturbation_name}")
    print(f"  Average Jaccard Similarity: {avg_jaccard:.4f}")
    print(f"  Cluster Sensitivity Index (CSI): {csi:.4f}")

    # Append to CSV. os.path.dirname() returns "" for a bare file name and
    # os.makedirs("") raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # An existing but empty file still needs the header row.
    needs_header = not os.path.isfile(output_csv) or os.path.getsize(output_csv) == 0

    with open(output_csv, mode="a", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(["Perturbation", "Average Jaccard", "CSI", "Description"])
        writer.writerow([perturbation_name, avg_jaccard, csi, perturbation_descriptions.get(perturbation_name, "No description available")])

    return avg_jaccard, csi
79
+
80
# Example usage: every perturbed clustering is scored against the same
# baseline clustering, and each result is appended to the shared summary CSV.
_baseline_clusters = "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt"

# (perturbed cluster file, perturbation name), processed in this order.
_perturbation_runs = [
    ("codenet_4000_scope_error/java/layer12/kmeans/clusters-kmeans-350.txt", "Scope Modification"),
    ("codenet_4000_lexical_similar/java/layer12/kmeans/clusters-kmeans-350.txt", "Lexical Similarity Modification"),
    ("codenet_4000_log/java/layer12/kmeans/clusters-kmeans-350.txt", "Log Modification"),
    ("codenet_4000_operator/java/layer12/kmeans/clusters-kmeans-350.txt", "Operator Modification"),
    ("codenet_4000_pointer/java/layer12/kmeans/clusters-kmeans-350.txt", "Pointer Modification"),
    ("codenet_4000_POS/java/layer12/kmeans/clusters-kmeans-350.txt", "POS Modification"),
    ("codenet_4000_random/java/layer12/kmeans/clusters-kmeans-350.txt", "Random Modification"),
    ("codenet_4000_trycatch/java/layer12/kmeans/clusters-kmeans-350.txt", "Try Catch Modification"),
    ("codenet_4000_unusedStatement/java/layer12/kmeans/clusters-kmeans-350.txt", "Unused Statement Modification"),
    ("codenet_4000_exactNameClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt", "Exact Name Class Variable Modification"),
    ("codenet_4000_CasingClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt", "Casing Class Variable Modification"),
]

for _perturbed_clusters, _perturbation_label in _perturbation_runs:
    compute_and_log_csi(
        _baseline_clusters,
        _perturbed_clusters,
        perturbation_name=_perturbation_label,
        output_csv="results/csi_summary.csv",
    )
157
+
158
+ # You can now call compute_and_log_csi again and again for other perturbations!
results/csi_summary.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Perturbation,Average Jaccard,CSI,Description
2
+ Scope Modification,0.6788942354152336,0.32110576458476636,Identifies variables in complex scopes and moves them to unrelated blocks.
3
+ Lexical Similarity Modification,0.5546761391746684,0.4453238608253316,Generates lexical variations of class and variable names with different casing.
4
+ Log Modification,0.5597545985057552,0.44024540149424485,Adds logging statements to blocks of code for tracking execution flow.
5
+ Operator Modification,0.7675911973340813,0.23240880266591868,Modifies boolean expressions by negating them in various contexts.
6
+ Pointer Modification,0.7341816285924795,0.2658183714075205,Add C style pointer to the code.
7
+ POS Modification,0.39399085068850775,0.6060091493114923,No description available
8
+ Random Modification,0.5314837325594708,0.4685162674405292,"Permutes statements within basic blocks, allowing different execution orders."
9
+ Try Catch Modification,0.6985673658171294,0.3014326341828706,Converts switch statements into equivalent if statements.
10
+ Unused Statement Modification,0.5844954343120634,0.4155045656879366,Inserts unused statements into blocks of code for testing/debugging.
11
+ Exact Name Class Variable Modification,0.675121649837896,0.324878350162104,Renames classes and variables to a specific randomly generated name.
12
+ Casing Class Variable Modification,0.6722713965133429,0.3277286034866571,Generates lexical variations of class and variable names with different casing.