import yaml
from loguru import logger

# Model identifiers used in several places of the generated config. Naming
# them once keeps the model list, role mapping and answer strategies in sync.
_LLAMA_MODEL = "meta-llama/Llama-3.3-70B-Instruct"
_QWEN_MODEL = "Qwen/Qwen2.5-72B-Instruct"


def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str) -> dict:
    """Create the base config dictionary.

    Args:
        hf_org: Value stored under ``hf_configuration.hf_organization``.
        hf_dataset_name: Value stored under ``hf_configuration.hf_dataset_name``.
        session_uid: Session identifier interpolated into the ``/app/<uid>/...``
            directories used by the ingestion stages.

    Returns:
        A freshly built nested dictionary with the top-level keys
        ``hf_configuration``, ``model_list``, ``model_roles`` and ``pipeline``.
        Key insertion order is deliberate: it is the order written to the
        YAML file by ``save_yaml_file``.
    """
    return {
        "hf_configuration": {
            # Literal placeholder string, not the token itself; presumably
            # expanded by whatever consumes the file (confirm with consumer).
            "token": "$HF_TOKEN",
            "private": True,
            "hf_organization": hf_org,
            "hf_dataset_name": hf_dataset_name,
        },
        "model_list": [
            {
                "model_name": _LLAMA_MODEL,
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
            {
                "model_name": _QWEN_MODEL,
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
        ],
        # Each role gets its own list object: sharing one list between keys
        # would make the YAML dumper emit anchors/aliases instead of plain lists.
        "model_roles": {
            "ingestion": [_LLAMA_MODEL],
            "summarization": [_QWEN_MODEL],
            "single_shot_question_generation": [_LLAMA_MODEL],
            "multi_hop_question_generation": [_LLAMA_MODEL],
            "answer_generation": [_QWEN_MODEL],
            "judge_answers": [_LLAMA_MODEL],
        },
        "pipeline": {
            "ingestion": {
                "source_documents_dir": f"/app/{session_uid}/uploaded_files/",
                "output_dir": f"/app/{session_uid}/ingested",
                "run": True,
            },
            "upload_ingest_to_hub": {
                "source_documents_dir": f"/app/{session_uid}/ingested",
                "run": True,
            },
            "summarization": {"run": True},
            "chunking": {
                "chunking_configuration": {
                    "l_min_tokens": 64,
                    "l_max_tokens": 128,
                    "tau_threshold": 0.3,
                    "h_min": 2,
                    "h_max": 4,
                },
                "run": True,
            },
            "single_shot_question_generation": {
                "diversification_seed": "24 year old adult",
                "run": True,
            },
            "multi_hop_question_generation": {"run": False},
            "answer_generation": {
                "question_type": "single_shot",
                "run": True,
                "strategies": [
                    {
                        "name": "zeroshot",
                        "prompt": "ZEROSHOT_QA_USER_PROMPT",
                        "model_name": _LLAMA_MODEL,
                    },
                    {
                        "name": "gold",
                        "prompt": "GOLD_QA_USER_PROMPT",
                        "model_name": _LLAMA_MODEL,
                    },
                ],
            },
            "judge_answers": {
                "run": False,  # to change when fixed
                "comparing_strategies": [["zeroshot", "gold"]],
                "chunk_column_index": 0,
                "random_seed": 42,
            },
        },
    }


def save_yaml_file(config: dict, path: str) -> str:
    """Save the given config dictionary to a YAML file.

    The file is written in block style (``default_flow_style=False``) and keys
    keep their insertion order (``sort_keys=False``).

    Args:
        config: Configuration dictionary to serialize.
        path: Destination file path; an existing file is overwritten.

    Returns:
        The ``path`` argument, unchanged.
    """
    # Pin the encoding so the output does not depend on the platform's
    # locale default.
    with open(path, "w", encoding="utf-8") as file:
        yaml.dump(config, file, default_flow_style=False, sort_keys=False)
    return path


def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str) -> str:
    """Generate the base configuration and save it as a YAML file.

    Args:
        hf_org: Organization name passed to ``generate_base_config``.
        hf_name: Dataset name passed to ``generate_base_config``.
        session_uid: Session identifier passed to ``generate_base_config``.
        config_path: Path of the YAML file to write.

    Returns:
        The path of the written file, as returned by ``save_yaml_file``.
    """
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")
    config = generate_base_config(hf_org, hf_name, session_uid)
    file_path = save_yaml_file(config, config_path)
    logger.success(f"Config saved at: {file_path}")
    return file_path