from ruamel.yaml import YAML
from loguru import logger

from yourbench_space import PATH
from yourbench_space.utils import to_commentable_yaml


def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str) -> dict:
    """Create the base config dictionary.

    Args:
        hf_org: Value stored under ``hf_configuration.hf_organization``.
        hf_dataset_name: Value stored under ``hf_configuration.hf_dataset_name``.
        session_uid: Session identifier used to build the per-session
            ingestion paths under ``PATH``.

    Returns:
        A nested dict with the ``hf_configuration``, ``model_list``,
        ``model_roles`` and ``pipeline`` sections.
    """
    # Per-session directories; the ingestion output directory is also the
    # source directory of the hub-upload stage.
    session_dir = f"{PATH}/{session_uid}"
    ingested_dir = f"{session_dir}/ingested"

    return {
        "hf_configuration": {
            # Literal placeholder string, not the token itself.
            "token": "$HF_TOKEN",
            "hf_organization": hf_org,
            "private": True,
            "hf_dataset_name": hf_dataset_name,
            "concat_if_exist": False,
        },
        "model_list": [
            {
                "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
            {
                "model_name": "Qwen/Qwen2.5-72B-Instruct",
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
        ],
        "model_roles": {
            "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
            "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
            # NOTE(review): this model has no entry in "model_list" above;
            # presumably the chunking stage resolves it on its own - confirm.
            "chunking": ["intfloat/multilingual-e5-large-instruct"],
            "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
            "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
        },
        "pipeline": {
            "ingestion": {
                "source_documents_dir": f"{session_dir}/uploaded_files/",
                "output_dir": ingested_dir,
                "run": True,
            },
            "upload_ingest_to_hub": {
                "source_documents_dir": ingested_dir,
                "run": True,
            },
            "summarization": {
                "run": True,
            },
            "chunking": {
                "run": True,
                "chunking_configuration": {
                    "l_min_tokens": 64,
                    "l_max_tokens": 128,
                    "tau_threshold": 0.8,
                    "h_min": 2,
                    "h_max": 5,
                    "num_multihops_factor": 2,
                },
            },
            "single_shot_question_generation": {
                "run": True,
                "additional_instructions": "Generate questions to test a curious adult",
                "chunk_sampling": {
                    "mode": "count",
                    "value": 5,
                    "random_seed": 123,
                },
            },
            "multi_hop_question_generation": {
                "run": True,
                "additional_instructions": "Generate questions to test a curious adult",
                "chunk_sampling": {
                    "mode": "percentage",
                    "value": 0.3,
                    "random_seed": 42,
                },
            },
            "lighteval": {
                "run": True,
            },
        },
    }


def save_yaml_file(config: dict, path: str) -> str:
    """Save the given config dictionary to a YAML file with helpful comments.

    Args:
        config: Config dictionary; must contain ``pipeline.ingestion`` and
            ``pipeline.upload_ingest_to_hub`` sections, which get annotated.
        path: Destination file path; the file is overwritten.

    Returns:
        The ``path`` argument, unchanged.
    """
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)

    config_cm = to_commentable_yaml(config)

    # Attach explanatory comments that are emitted before the annotated keys.
    ingestion = config_cm["pipeline"]["ingestion"]
    ingestion.yaml_set_comment_before_after_key(
        "source_documents_dir",
        before="⚠️ Change this path to match your local directory",
    )
    ingestion.yaml_set_comment_before_after_key(
        "output_dir",
        before="⚠️ This is where ingested data will be saved",
    )

    upload = config_cm["pipeline"]["upload_ingest_to_hub"]
    upload.yaml_set_comment_before_after_key(
        "source_documents_dir",
        before="⚠️ Same as output_dir from ingestion — adjust as needed",
    )

    # The comments above contain non-ASCII characters, so the encoding must be
    # explicit: the platform default (e.g. cp1252) may be unable to encode them.
    with open(path, "w", encoding="utf-8") as file:
        yaml.dump(config_cm, file)

    return path


def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str) -> str:
    """Generate and save the YAML configuration file.

    Args:
        hf_org: Organization name forwarded to ``generate_base_config``.
        hf_name: Dataset name forwarded to ``generate_base_config``.
        session_uid: Session identifier forwarded to ``generate_base_config``.
        config_path: Destination path of the YAML file.

    Returns:
        The path returned by ``save_yaml_file``.
    """
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")
    config = generate_base_config(hf_org, hf_name, session_uid)
    file_path = save_yaml_file(config, config_path)
    logger.success(f"Config saved at: {file_path}")
    return file_path