alozowski's picture
alozowski HF Staff
Add comments to generated config
f05dc8f
from ruamel.yaml import YAML
from loguru import logger
from yourbench_space import PATH
from yourbench_space.utils import to_commentable_yaml
def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
    """Build the default configuration dictionary for one session.

    Args:
        hf_org: Value stored under ``hf_configuration.hf_organization``.
        hf_dataset_name: Value stored under ``hf_configuration.hf_dataset_name``.
        session_uid: Session identifier; it becomes a path component (under
            ``PATH``) of every directory referenced by the pipeline section.

    Returns:
        A plain nested ``dict`` with four top-level sections, in this order:
        ``hf_configuration``, ``model_list``, ``model_roles`` and ``pipeline``.
        Every pipeline stage has ``run`` set to ``True``.
    """
    vision_model = "Qwen/Qwen2.5-VL-72B-Instruct"
    text_model = "Qwen/Qwen2.5-72B-Instruct"
    embedding_model = "intfloat/multilingual-e5-large-instruct"
    question_instructions = "Generate questions to test a curious adult"

    # All session data lives below <PATH>/<session_uid>.
    session_root = f"{PATH}/{session_uid}"
    ingested_dir = f"{session_root}/ingested"

    hf_configuration = {
        # Literal placeholder string; it is stored as-is, not resolved here.
        "token": "$HF_TOKEN",
        "hf_organization": hf_org,
        "private": True,
        "hf_dataset_name": hf_dataset_name,
        "concat_if_exist": False,
    }

    # One entry per served model; both share provider and concurrency limit.
    model_list = [
        {
            "model_name": model_name,
            "provider": "novita",
            "max_concurrent_requests": 32,
        }
        for model_name in (vision_model, text_model)
    ]

    # Each role gets its own list object so no two entries share a container.
    model_roles = {
        "ingestion": [vision_model],
        "summarization": [text_model],
        "chunking": [embedding_model],
        "single_shot_question_generation": [text_model],
        "multi_hop_question_generation": [text_model],
    }

    ingestion_stage = {
        "source_documents_dir": f"{session_root}/uploaded_files/",
        "output_dir": ingested_dir,
        "run": True,
    }
    # The upload stage reads from the directory the ingestion stage writes to.
    upload_stage = {
        "source_documents_dir": ingested_dir,
        "run": True,
    }
    chunking_stage = {
        "run": True,
        "chunking_configuration": {
            "l_min_tokens": 64,
            "l_max_tokens": 128,
            "tau_threshold": 0.8,
            "h_min": 2,
            "h_max": 5,
            "num_multihops_factor": 2,
        },
    }
    single_shot_stage = {
        "run": True,
        "additional_instructions": question_instructions,
        "chunk_sampling": {
            "mode": "count",
            "value": 5,
            "random_seed": 123,
        },
    }
    multi_hop_stage = {
        "run": True,
        "additional_instructions": question_instructions,
        "chunk_sampling": {
            "mode": "percentage",
            "value": 0.3,
            "random_seed": 42,
        },
    }

    pipeline = {
        "ingestion": ingestion_stage,
        "upload_ingest_to_hub": upload_stage,
        "summarization": {"run": True},
        "chunking": chunking_stage,
        "single_shot_question_generation": single_shot_stage,
        "multi_hop_question_generation": multi_hop_stage,
        "lighteval": {"run": True},
    }

    return {
        "hf_configuration": hf_configuration,
        "model_list": model_list,
        "model_roles": model_roles,
        "pipeline": pipeline,
    }
def save_yaml_file(config: dict, path: str):
    """Write ``config`` to ``path`` as YAML, with warning comments on path keys.

    The dictionary is first converted with ``to_commentable_yaml`` so that
    comments can be attached, then a comment is placed before each of the
    directory keys in ``pipeline.ingestion`` and
    ``pipeline.upload_ingest_to_hub``.

    Args:
        config: Configuration dictionary; it must contain
            ``pipeline.ingestion`` (with ``source_documents_dir`` and
            ``output_dir``) and ``pipeline.upload_ingest_to_hub`` (with
            ``source_documents_dir``).
        path: Destination file path; the file is created or overwritten.

    Returns:
        The ``path`` argument, unchanged.

    Raises:
        KeyError: If ``pipeline``, ``ingestion`` or ``upload_ingest_to_hub``
            is missing from ``config``.
        OSError: If the destination file cannot be opened for writing.
    """
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)

    config_cm = to_commentable_yaml(config)

    # Attach a comment before each directory key the user may need to edit.
    ingestion = config_cm["pipeline"]["ingestion"]
    ingestion.yaml_set_comment_before_after_key(
        "source_documents_dir",
        before="⚠️ Change this path to match your local directory",
    )
    ingestion.yaml_set_comment_before_after_key(
        "output_dir",
        before="⚠️ This is where ingested data will be saved",
    )

    upload = config_cm["pipeline"]["upload_ingest_to_hub"]
    upload.yaml_set_comment_before_after_key(
        "source_documents_dir",
        before="⚠️ Same as output_dir from ingestion — adjust as needed",
    )

    # The comments above contain non-ASCII characters (emoji, em dash), so the
    # encoding is pinned to UTF-8 instead of the platform default, which may
    # not be able to encode them and would raise UnicodeEncodeError.
    with open(path, "w", encoding="utf-8") as file:
        yaml.dump(config_cm, file)

    return path
def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str):
    """Build the base configuration and write it to ``config_path``.

    Args:
        hf_org: Organization name forwarded to ``generate_base_config``.
        hf_name: Dataset name forwarded to ``generate_base_config``.
        session_uid: Session identifier forwarded to ``generate_base_config``.
        config_path: Destination of the YAML file, forwarded to
            ``save_yaml_file``.

    Returns:
        The path returned by ``save_yaml_file``.
    """
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")

    saved_path = save_yaml_file(
        generate_base_config(hf_org, hf_name, session_uid),
        config_path,
    )

    logger.success(f"Config saved at: {saved_path}")
    return saved_path