# Spaces: Sleeping  (hosting-page status text captured with the source; not program code)
import json
import logging
import os
import pickle
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_absolute_error,
    precision_score,
    recall_score,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Configure the root logger once at import time: INFO and above, with a
# timestamp and the level name in front of every message.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s]: %(message)s",
    level=logging.INFO,
)
class StudentRecommendationSystem:
    """Recommend university programs from a student's subject grades.

    Reference data is read from three JSON files inside ``json_dir``:

    * ``subjects.json``     -> key ``core_subjects`` (program -> core subject names)
    * ``universities.json`` -> key ``top_universities`` (program -> universities)
    * ``departments.json``  -> key ``program_departments`` (program -> departments)

    A program's raw score is the student's average grade over the program's
    core subjects (missing subjects count as 0). The raw score is passed
    through a small regression pipeline to obtain the "AI refined" score.

    ``load_student_grades`` (or ``process_student_data``) must be called
    before any method that reads ``self.student_data``.
    """

    # Column order of the frame returned by ``calculate_program_match``.
    _MATCH_COLUMNS = ["Program", "Raw Score", "AI Refined Score", "Success Probability"]

    def __init__(self, json_dir: str = "./data"):
        """Load the reference JSON files and train the placeholder model.

        Args:
            json_dir: Directory containing ``subjects.json``,
                ``universities.json`` and ``departments.json``.

        Raises:
            Exception: Any error raised while opening/parsing a file or
                looking up its top-level key is logged and re-raised.
        """
        self.json_dir = json_dir
        self.core_subjects = self._load_json_section(
            "subjects.json", "core_subjects", "subjects"
        )
        self.top_universities = self._load_json_section(
            "universities.json", "top_universities", "universities"
        )
        self.program_departments = self._load_json_section(
            "departments.json", "program_departments", "departments"
        )
        self.university_programs = {
            program: {"core_subjects": subjects}
            for program, subjects in self.core_subjects.items()
        }
        logging.info("University programs mapping created.")
        self.ai_model = self._train_dummy_model()

    def _load_json_section(self, filename: str, key: str, label: str):
        """Return ``json.load(<json_dir>/<filename>)[key]``, logging the outcome."""
        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(os.path.join(self.json_dir, filename), "r", encoding="utf-8") as f:
                section = json.load(f)[key]
        except Exception as e:
            logging.error("Error loading %s data: %s", label, e)
            raise
        logging.info("Loaded %s data successfully.", label)
        return section

    def _train_dummy_model(self) -> "Pipeline":
        """Fit a scaler + linear regression on three points of the identity map.

        The training set maps 0 -> 0, 50 -> 50 and 100 -> 100, so the fitted
        pipeline reproduces its input up to floating-point error.
        """
        X = np.array([[0], [50], [100]])
        y = np.array([0, 50, 100])
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', LinearRegression()),
        ])
        pipeline.fit(X, y)
        logging.info("Dummy AI model pipeline trained successfully.")
        return pipeline

    def _refine_match_score(self, score: float) -> float:
        """Return the model's prediction for a single raw score."""
        refined = self.ai_model.predict(np.array([[score]]))[0]
        logging.debug("Refined score for raw score %s is %s.", score, refined)
        return refined

    def predict_success_probability(self, refined_score: float) -> float:
        """Convert a refined score (percent scale) into a probability.

        The result is ``refined_score / 100`` clamped to ``[0, 1]``: the
        regression may extrapolate or overshoot by rounding error, and a
        probability outside that interval is meaningless.
        """
        probability = min(max(refined_score / 100.0, 0.0), 1.0)
        logging.debug(
            "Predicted success probability from refined score %s is %s.",
            refined_score,
            probability,
        )
        return probability

    def load_student_grades(
        self,
        grades_data: Optional[dict] = None,
        grades_file: Optional[str] = None,
    ) -> pd.DataFrame:
        """Store the student's grades as a ``Subject``/``Grade`` DataFrame.

        Args:
            grades_data: Mapping of subject name to grade. Ignored when
                ``grades_file`` is given.
            grades_file: Path to a JSON file holding either that mapping
                directly or the mapping under a ``"sample_grades"`` key.

        Returns:
            The DataFrame, which is also stored on ``self.student_data``.

        Raises:
            ValueError: If no (non-empty) grades were supplied.
            Exception: File/parse errors are logged and re-raised.
        """
        if grades_file:
            try:
                with open(grades_file, "r", encoding="utf-8") as f:
                    grades_data = json.load(f)
                if "sample_grades" in grades_data:
                    grades_data = grades_data["sample_grades"]
                logging.info("Student grades loaded from file: %s", grades_file)
            except Exception as e:
                logging.error("Error loading student grades file: %s", e)
                raise
        if not grades_data:
            raise ValueError("Either grades_data or grades_file must be provided")
        self.student_data = pd.DataFrame(
            list(grades_data.items()), columns=['Subject', 'Grade']
        )
        return self.student_data

    def identify_strengths(self, threshold: float = 85) -> pd.DataFrame:
        """Return the rows with ``Grade >= threshold``, highest grade first."""
        strengths = self.student_data[self.student_data['Grade'] >= threshold]
        return strengths.sort_values(by='Grade', ascending=False)

    def calculate_program_match(self, strengths: pd.DataFrame) -> pd.DataFrame:
        """Score every known program against the loaded grades.

        Args:
            strengths: Accepted for interface compatibility; the scores are
                computed from ``self.student_data`` and this argument is not
                read.

        Returns:
            DataFrame with columns ``Program``, ``Raw Score``,
            ``AI Refined Score`` and ``Success Probability``, sorted by
            refined score in descending order. The columns are present even
            when there are no programs.
        """
        # Build the subject -> grade lookup once instead of filtering the
        # frame for every core subject. setdefault keeps the first row for a
        # repeated subject, matching a "first match wins" lookup.
        grade_by_subject = {}
        for subject, grade in zip(self.student_data['Subject'], self.student_data['Grade']):
            grade_by_subject.setdefault(subject, grade)

        rows = []
        for program, details in self.university_programs.items():
            core_subjects = details["core_subjects"]
            total_possible_score = len(core_subjects) * 100
            # Subjects the student has no grade for contribute 0.
            score = sum(grade_by_subject.get(subject, 0) for subject in core_subjects)
            raw_score = (score / total_possible_score) * 100 if total_possible_score > 0 else 0
            refined_score = self._refine_match_score(raw_score)
            success_probability = self.predict_success_probability(refined_score)
            logging.debug(
                "Program %s: raw_score %s, refined_score %s, success_probability %s",
                program,
                raw_score,
                refined_score,
                success_probability,
            )
            rows.append({
                "Program": program,
                "Raw Score": raw_score,
                "AI Refined Score": refined_score,
                "Success Probability": success_probability,
            })

        # Passing the columns explicitly keeps sort_values from raising a
        # KeyError when there are no programs at all.
        program_df = pd.DataFrame(rows, columns=self._MATCH_COLUMNS)
        return program_df.sort_values(by='AI Refined Score', ascending=False)

    def get_top_recommendations(
        self,
        program_matches: pd.DataFrame,
        top_n: int = 3,
        min_score: float = 50,
    ) -> list:
        """Build recommendation records from the first ``top_n`` match rows.

        Args:
            program_matches: Frame as returned by ``calculate_program_match``
                (already sorted best-first).
            top_n: Number of leading rows to consider.
            min_score: Rows whose refined score is below this are skipped.

        Returns:
            A list of dicts with the program, its scores, and at most three
            universities and three departments each. Programs without an
            entry in the reference data get a one-item placeholder list.
        """
        recommendations = []
        for i in range(min(top_n, len(program_matches))):
            row = program_matches.iloc[i]
            refined_score = row['AI Refined Score']
            if refined_score < min_score:
                continue
            program = row['Program']
            universities = self.top_universities.get(program, ["No specific recommendations"])
            departments = self.program_departments.get(program, ["No specific departments"])
            recommendations.append({
                "program": program,
                "raw_score": row['Raw Score'],
                "refined_score": refined_score,
                "success_probability": row['Success Probability'],
                "recommended_universities": universities[:3],
                "recommended_departments": departments[:3],
            })
        return recommendations

    def evaluate_recommendations(
        self,
        program_matches_df: pd.DataFrame,
        ground_truth: dict,
        threshold: float = 60,
    ) -> dict:
        """Compare thresholded refined scores against binary ground truth.

        Args:
            program_matches_df: Frame with ``Program`` and
                ``AI Refined Score`` columns.
            ground_truth: Mapping of program name to 0/1; programs missing
                from the mapping are treated as 0.
            threshold: A program is predicted positive when its refined
                score is at least this value.

        Returns:
            Dict with ``f1_score``, ``accuracy``, ``precision``, ``recall``
            and ``mae``.
        """
        predictions = (
            (program_matches_df["AI Refined Score"] >= threshold).astype(int).tolist()
        )
        actuals = [ground_truth.get(program, 0) for program in program_matches_df["Program"]]
        metrics = {
            # zero_division=0 on all three so an undefined metric yields 0
            # consistently instead of f1 alone emitting a warning.
            "f1_score": f1_score(actuals, predictions, zero_division=0),
            "accuracy": accuracy_score(actuals, predictions),
            "precision": precision_score(actuals, predictions, zero_division=0),
            "recall": recall_score(actuals, predictions, zero_division=0),
            "mae": mean_absolute_error(actuals, predictions),
        }
        logging.info("Evaluation metrics computed.")
        return metrics

    def save_ai_model(self, file_path: str) -> None:
        """Pickle the current model to ``file_path``; errors are logged and re-raised."""
        try:
            with open(file_path, "wb") as f:
                pickle.dump(self.ai_model, f)
            logging.info("AI model saved to %s", file_path)
        except Exception as e:
            logging.error("Error saving AI model: %s", e)
            raise

    def load_ai_model(self, file_path: str) -> None:
        """Replace the current model with one unpickled from ``file_path``.

        Errors are logged and re-raised.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load model files that come from a trusted source.
        try:
            with open(file_path, "rb") as f:
                self.ai_model = pickle.load(f)
            logging.info("AI model loaded from %s", file_path)
        except Exception as e:
            logging.error("Error loading AI model: %s", e)
            raise

    def process_student_data(
        self,
        grades_data: Optional[dict] = None,
        grades_file: Optional[str] = None,
        strength_threshold: float = 85,
    ) -> dict:
        """Run the full pipeline: load grades, find strengths, score, recommend.

        Returns:
            Dict with ``strengths`` and ``program_matches`` (lists of row
            dicts) and ``top_recommendations`` (see
            ``get_top_recommendations``).
        """
        self.load_student_grades(grades_data, grades_file)
        strengths = self.identify_strengths(strength_threshold)
        program_matches = self.calculate_program_match(strengths)
        recommendations = self.get_top_recommendations(program_matches)
        return {
            "strengths": strengths.to_dict('records'),
            "program_matches": program_matches.to_dict('records'),
            "top_recommendations": recommendations,
        }
def setup_json_directory(json_dir: str = "./data") -> str:
    """Ensure ``json_dir`` exists (creating parent directories) and return it.

    ``exist_ok=True`` makes the call safe to repeat and avoids the race
    between checking for the directory and creating it.

    Raises:
        FileExistsError: If ``json_dir`` exists but is not a directory.
    """
    os.makedirs(json_dir, exist_ok=True)
    return json_dir
def main():
    """Run the demo: score the sample grades, print a report, save the model.

    Reads ``sample-data.json`` from the data directory, prints the student's
    grades, strengths, program matches and recommendations, evaluates the
    matches against a ground truth derived from the raw scores, and pickles
    the model to ``ai_model.pkl`` in the same directory.
    """
    json_dir = setup_json_directory()
    sample_data_file = os.path.join(json_dir, "sample-data.json")
    recommendation_system = StudentRecommendationSystem(json_dir)
    results = recommendation_system.process_student_data(grades_file=sample_data_file)

    # Re-read the raw file for display. The grades may sit under
    # "sample_grades" or be the top-level mapping itself, mirroring what
    # load_student_grades accepts.
    with open(sample_data_file, "r", encoding="utf-8") as f:
        sample_data = json.load(f)
    sample_grades = sample_data.get("sample_grades", sample_data)

    print("\n===== STUDENT ACADEMIC PROFILE =====")
    print("\nSubjects and Grades:")
    for subject, grade in sample_grades.items():
        print(f"- {subject}: {grade}")

    print("\n===== ACADEMIC STRENGTHS =====")
    for strength in results["strengths"]:
        print(f"- {strength['Subject']}: {strength['Grade']}")

    print("\n===== PROGRAM MATCHES (Including AI Details) =====")
    for match in results["program_matches"]:
        print(f"- {match['Program']}: Raw Score = {match['Raw Score']:.1f}%, AI Refined Score = {match['AI Refined Score']:.1f}%, Success Probability = {match['Success Probability']:.2f}")

    print("\n===== PROGRAM RECOMMENDATIONS =====")
    for i, rec in enumerate(results["top_recommendations"], 1):
        print(f"\n{i}. {rec['program']} (Raw Score: {rec['raw_score']:.1f}%, AI Refined Score: {rec['refined_score']:.1f}%, Success Probability: {rec['success_probability']:.2f})")
        print(" Recommended Universities:")
        for uni in rec['recommended_universities']:
            print(f" - {uni}")
        print(" Recommended Departments:")
        for dept in rec['recommended_departments']:
            print(f" - {dept}")

    # With no matches the frame would have no columns to evaluate, so the
    # metrics section is skipped in that case.
    if results["program_matches"]:
        program_matches_df = pd.DataFrame(results["program_matches"])
        # Demo-only ground truth: a program counts as a true match when its
        # raw score is at least 65.
        dummy_ground_truth = {
            program: (1 if raw_score >= 65 else 0)
            for program, raw_score in zip(
                program_matches_df["Program"], program_matches_df["Raw Score"]
            )
        }
        evaluation_metrics = recommendation_system.evaluate_recommendations(
            program_matches_df, dummy_ground_truth
        )
        print("\n===== EVALUATION METRICS =====")
        for metric, value in evaluation_metrics.items():
            print(f"{metric.capitalize()}: {value:.2f}")

    recommendation_system.save_ai_model(os.path.join(json_dir, "ai_model.pkl"))


if __name__ == "__main__":
    main()