# UniRecommend / student_recommendation_system.py
# Uploaded by kerols77 ("Upload 8 files", commit 1f8582e, verified).
import pandas as pd
import numpy as np
import json
import os
import logging
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, mean_absolute_error, accuracy_score, precision_score, recall_score
# Configure the root logger at import time: INFO level, with each record
# rendered as "<timestamp> [<LEVEL>]: <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]: %(message)s")
class StudentRecommendationSystem:
    """Recommend university programs from a student's subject grades.

    Reference data is read from three JSON files in ``json_dir``:

    * ``subjects.json``     -> key ``"core_subjects"`` (program -> list of subjects)
    * ``universities.json`` -> key ``"top_universities"`` (looked up by program)
    * ``departments.json``  -> key ``"program_departments"`` (looked up by program)

    A small scikit-learn pipeline (see ``_train_dummy_model``) is used to
    "refine" each program's raw match score.
    """

    # Column order of the frame returned by ``calculate_program_match``.
    _MATCH_COLUMNS = ["Program", "Raw Score", "AI Refined Score", "Success Probability"]

    def __init__(self, json_dir: str = "./data"):
        """Load the reference JSON files and train the score-refinement model.

        Args:
            json_dir: Directory containing ``subjects.json``,
                ``universities.json`` and ``departments.json``.

        Raises:
            Exception: Whatever reading/parsing a reference file raises
                (missing file, invalid JSON, missing top-level key). The
                error is logged and then re-raised unchanged.
        """
        self.json_dir = json_dir
        self.core_subjects = self._load_json_section(
            json_dir, "subjects.json", "core_subjects", "subjects"
        )
        self.top_universities = self._load_json_section(
            json_dir, "universities.json", "top_universities", "universities"
        )
        self.program_departments = self._load_json_section(
            json_dir, "departments.json", "program_departments", "departments"
        )
        self.university_programs = {
            program: {"core_subjects": subjects}
            for program, subjects in self.core_subjects.items()
        }
        logging.info("University programs mapping created.")
        self.ai_model = self._train_dummy_model()

    @staticmethod
    def _load_json_section(json_dir: str, filename: str, key: str, label: str):
        """Return ``json.load(<json_dir>/<filename>)[key]``, logging the outcome.

        Files are read as UTF-8 (the JSON interchange encoding) so that
        non-ASCII names are decoded the same way on every platform.
        ``label`` is only used in the log messages.
        """
        try:
            with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
                section = json.load(f)[key]
        except Exception as e:
            logging.error("Error loading %s data: %s", label, e)
            raise
        logging.info("Loaded %s data successfully.", label)
        return section

    def _train_dummy_model(self) -> "Pipeline":
        """Fit and return a StandardScaler + LinearRegression pipeline.

        The training set is the three points 0->0, 50->50 and 100->100, so
        the fitted model maps a score (approximately) onto itself.
        """
        X = np.array([[0], [50], [100]])
        y = np.array([0, 50, 100])
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', LinearRegression()),
        ])
        pipeline.fit(X, y)
        logging.info("Dummy AI model pipeline trained successfully.")
        return pipeline

    def _refine_match_score(self, score: float) -> float:
        """Pass a raw match score through ``self.ai_model`` and return the prediction."""
        refined = self.ai_model.predict(np.array([[score]]))[0]
        logging.debug(f"Refined score for raw score {score} is {refined}.")
        return refined

    def predict_success_probability(self, refined_score: float) -> float:
        """Convert a refined score on a 0-100 scale into a probability.

        The result is ``refined_score / 100`` clamped to ``[0.0, 1.0]``: a
        model loaded via ``load_ai_model`` (or grades above 100) can produce
        scores outside 0-100, and a probability must stay within bounds.
        NaN is passed through unchanged.
        """
        probability = float(np.clip(refined_score / 100.0, 0.0, 1.0))
        logging.debug(f"Predicted success probability from refined score {refined_score} is {probability}.")
        return probability

    def load_student_grades(self, grades_data: dict = None, grades_file: str = None) -> pd.DataFrame:
        """Store the student's grades as ``self.student_data`` and return them.

        Args:
            grades_data: Mapping of subject -> grade. Ignored when
                ``grades_file`` is given.
            grades_file: Path to a JSON file holding either the mapping
                itself or an object with the mapping under ``"sample_grades"``.

        Returns:
            DataFrame with the columns ``Subject`` and ``Grade``.

        Raises:
            ValueError: If no (non-empty) grades were provided.
            Exception: Any error from reading/parsing ``grades_file`` is
                logged and re-raised.
        """
        if grades_file:
            try:
                # UTF-8 explicitly: subject names may contain non-ASCII text.
                with open(grades_file, "r", encoding="utf-8") as f:
                    grades_data = json.load(f)
                if "sample_grades" in grades_data:
                    grades_data = grades_data["sample_grades"]
                logging.info(f"Student grades loaded from file: {grades_file}")
            except Exception as e:
                logging.error("Error loading student grades file: %s", e)
                raise
        if not grades_data:
            raise ValueError("Either grades_data or grades_file must be provided")
        self.student_data = pd.DataFrame(list(grades_data.items()), columns=['Subject', 'Grade'])
        return self.student_data

    def identify_strengths(self, threshold: float = 85) -> pd.DataFrame:
        """Return the rows of ``self.student_data`` with ``Grade >= threshold``,
        sorted by grade in descending order.

        Requires ``load_student_grades`` to have been called first.
        """
        strengths = self.student_data[self.student_data['Grade'] >= threshold]
        return strengths.sort_values(by='Grade', ascending=False)

    def calculate_program_match(self, strengths: pd.DataFrame) -> pd.DataFrame:
        """Score every known program against the loaded student grades.

        For each program the raw score is the sum of the student's grades in
        that program's core subjects, divided by ``100 * number of core
        subjects`` and expressed as a percentage; subjects the student has no
        grade for contribute 0. The raw score is then refined by the model
        and converted to a success probability.

        Args:
            strengths: Accepted for interface compatibility; the scores are
                computed from ``self.student_data``, not from this frame.

        Returns:
            DataFrame with the columns ``Program``, ``Raw Score``,
            ``AI Refined Score`` and ``Success Probability``, sorted by
            ``AI Refined Score`` descending. Empty (but with those columns)
            when there are no programs.
        """
        # Build the subject -> grade lookup once instead of filtering the
        # DataFrame for every core subject; the first occurrence of a subject
        # wins, matching the previous ``iloc[0]`` behaviour.
        grade_by_subject = {}
        for subject, grade in zip(self.student_data['Subject'], self.student_data['Grade']):
            grade_by_subject.setdefault(subject, grade)

        rows = []
        for program, details in self.university_programs.items():
            core_subjects = details["core_subjects"]
            total_possible_score = len(core_subjects) * 100
            score = sum(grade_by_subject.get(subject, 0) for subject in core_subjects)
            raw_score = (score / total_possible_score) * 100 if total_possible_score > 0 else 0
            refined_score = self._refine_match_score(raw_score)
            success_probability = self.predict_success_probability(refined_score)
            logging.debug(f"Program {program}: raw_score {raw_score}, refined_score {refined_score}, success_probability {success_probability}")
            rows.append({
                "Program": program,
                "Raw Score": raw_score,
                "AI Refined Score": refined_score,
                "Success Probability": success_probability,
            })

        # Passing the columns explicitly keeps sort_values from raising
        # KeyError when ``rows`` is empty.
        program_df = pd.DataFrame(rows, columns=self._MATCH_COLUMNS)
        return program_df.sort_values(by='AI Refined Score', ascending=False)

    def get_top_recommendations(self, program_matches: pd.DataFrame, top_n: int = 3) -> list:
        """Build recommendation records from the first ``top_n`` rows.

        ``program_matches`` is expected in the layout (and order) produced by
        ``calculate_program_match``. Rows whose refined score is below 50 are
        skipped. Each record carries at most three universities and three
        departments; a placeholder entry is used when the program has none
        on file.
        """
        recommendations = []
        for i in range(min(top_n, len(program_matches))):
            row = program_matches.iloc[i]  # fetch the row once, not per field
            refined_score = row['AI Refined Score']
            if refined_score < 50:
                continue
            program = row['Program']
            universities = self.top_universities.get(program, ["No specific recommendations"])
            departments = self.program_departments.get(program, ["No specific departments"])
            recommendations.append({
                "program": program,
                "raw_score": row['Raw Score'],
                "refined_score": refined_score,
                "success_probability": row['Success Probability'],
                "recommended_universities": universities[:3],
                "recommended_departments": departments[:3],
            })
        return recommendations

    def evaluate_recommendations(self, program_matches_df: pd.DataFrame, ground_truth: dict, threshold: float = 60) -> dict:
        """Compare thresholded refined scores against binary ground truth.

        A program is predicted positive (1) when its ``AI Refined Score`` is
        at least ``threshold``. Programs missing from ``ground_truth`` count
        as 0.

        Returns:
            Dict with the keys ``f1_score``, ``accuracy``, ``precision``,
            ``recall`` and ``mae``.
        """
        predictions = (program_matches_df["AI Refined Score"] >= threshold).astype(int).tolist()
        actuals = [ground_truth.get(program, 0) for program in program_matches_df["Program"].tolist()]
        metrics = {
            # zero_division=0 on all three keeps them consistent and avoids
            # an UndefinedMetricWarning when no positives are present.
            "f1_score": f1_score(actuals, predictions, zero_division=0),
            "accuracy": accuracy_score(actuals, predictions),
            "precision": precision_score(actuals, predictions, zero_division=0),
            "recall": recall_score(actuals, predictions, zero_division=0),
            "mae": mean_absolute_error(actuals, predictions),
        }
        logging.info("Evaluation metrics computed.")
        return metrics

    def save_ai_model(self, file_path: str) -> None:
        """Pickle ``self.ai_model`` to ``file_path``; errors are logged and re-raised."""
        try:
            with open(file_path, "wb") as f:
                pickle.dump(self.ai_model, f)
            logging.info(f"AI model saved to {file_path}")
        except Exception as e:
            logging.error("Error saving AI model: %s", e)
            raise

    def load_ai_model(self, file_path: str) -> None:
        """Replace ``self.ai_model`` with the object pickled at ``file_path``.

        Errors are logged and re-raised.
        """
        try:
            with open(file_path, "rb") as f:
                # SECURITY: unpickling can execute arbitrary code. Only load
                # model files from a trusted source.
                self.ai_model = pickle.load(f)
            logging.info(f"AI model loaded from {file_path}")
        except Exception as e:
            logging.error("Error loading AI model: %s", e)
            raise

    def process_student_data(self, grades_data: dict = None, grades_file: str = None, strength_threshold: float = 85) -> dict:
        """Run the full pipeline: load grades, find strengths, score programs,
        and pick the top recommendations.

        Returns:
            Dict with ``strengths`` and ``program_matches`` (both as lists of
            row dicts) and ``top_recommendations``.
        """
        self.load_student_grades(grades_data, grades_file)
        strengths = self.identify_strengths(strength_threshold)
        program_matches = self.calculate_program_match(strengths)
        recommendations = self.get_top_recommendations(program_matches)
        return {
            "strengths": strengths.to_dict('records'),
            "program_matches": program_matches.to_dict('records'),
            "top_recommendations": recommendations,
        }
def setup_json_directory(json_dir: str = "./data") -> str:
    """Ensure ``json_dir`` exists (creating parent directories as needed) and return it.

    ``exist_ok=True`` makes the call safe to repeat and removes the
    check-then-create race of testing for existence first.

    Raises:
        OSError: If the directory cannot be created, e.g. because the path
            already exists as a regular file.
    """
    os.makedirs(json_dir, exist_ok=True)
    return json_dir
def main():
    """Run the demo: score the sample student, print the report, evaluate
    against a dummy ground truth, and save the model next to the data files.

    Expects ``./data`` to contain the three reference JSON files plus
    ``sample-data.json`` with the student's grades.
    """
    json_dir = setup_json_directory()
    sample_data_file = os.path.join(json_dir, "sample-data.json")
    recommendation_system = StudentRecommendationSystem(json_dir)
    results = recommendation_system.process_student_data(grades_file=sample_data_file)

    # Re-read the raw grades for display. Accept the same two layouts as
    # load_student_grades: the mapping itself, or nested under "sample_grades".
    with open(sample_data_file, "r", encoding="utf-8") as f:
        sample_data = json.load(f)
    sample_grades = sample_data.get("sample_grades", sample_data)

    print("\n===== STUDENT ACADEMIC PROFILE =====")
    print("\nSubjects and Grades:")
    for subject, grade in sample_grades.items():
        print(f"- {subject}: {grade}")

    print("\n===== ACADEMIC STRENGTHS =====")
    for strength in results["strengths"]:
        print(f"- {strength['Subject']}: {strength['Grade']}")

    print("\n===== PROGRAM MATCHES (Including AI Details) =====")
    for match in results["program_matches"]:
        print(f"- {match['Program']}: Raw Score = {match['Raw Score']:.1f}%, AI Refined Score = {match['AI Refined Score']:.1f}%, Success Probability = {match['Success Probability']:.2f}")

    print("\n===== PROGRAM RECOMMENDATIONS =====")
    for i, rec in enumerate(results["top_recommendations"], 1):
        print(f"\n{i}. {rec['program']} (Raw Score: {rec['raw_score']:.1f}%, AI Refined Score: {rec['refined_score']:.1f}%, Success Probability: {rec['success_probability']:.2f})")
        print(" Recommended Universities:")
        for uni in rec['recommended_universities']:
            print(f" - {uni}")
        print(" Recommended Departments:")
        for dept in rec['recommended_departments']:
            print(f" - {dept}")

    program_matches_df = pd.DataFrame(results["program_matches"])
    # Dummy labels for the demo: a program counts as a true match when its
    # raw score is at least 65.
    dummy_ground_truth = {
        row["Program"]: (1 if row["Raw Score"] >= 65 else 0)
        for _, row in program_matches_df.iterrows()
    }
    evaluation_metrics = recommendation_system.evaluate_recommendations(program_matches_df, dummy_ground_truth)

    print("\n===== EVALUATION METRICS =====")
    for metric, value in evaluation_metrics.items():
        print(f"{metric.capitalize()}: {value:.2f}")

    recommendation_system.save_ai_model(os.path.join(json_dir, "ai_model.pkl"))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()