from ruamel.yaml import YAML
from loguru import logger

from yourbench_space import PATH
from yourbench_space.utils import to_commentable_yaml


def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
    """Creates the base config dictionary"""
    return {
        "hf_configuration": {
            "token": "$HF_TOKEN",
            "hf_organization": hf_org,
            "private": True,
            "hf_dataset_name": hf_dataset_name,
            "concat_if_exist": False,
        },
        "model_list": [
            {
                "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
            {
                "model_name": "Qwen/Qwen2.5-72B-Instruct",
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
        ],
        "model_roles": {
            "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
            "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
            "chunking": ["intfloat/multilingual-e5-large-instruct"],
            "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
            "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
        },
        "pipeline": {
            "ingestion": {
                "source_documents_dir": f"{PATH}/{session_uid}/uploaded_files/",
                "output_dir": f"{PATH}/{session_uid}/ingested",
                "run": True,
            },
            "upload_ingest_to_hub": {
                "source_documents_dir": f"{PATH}/{session_uid}/ingested",
                "run": True,
            },
            "summarization": {
                "run": True,
            },
            "chunking": {
                "run": True,
                "chunking_configuration": {
                    "l_min_tokens": 64,
                    "l_max_tokens": 128,
                    "tau_threshold": 0.8,
                    "h_min": 2,
                    "h_max": 5,
                    "num_multihops_factor": 2,
                },
            },
            "single_shot_question_generation": {
                "run": True,
                "additional_instructions": "Generate questions to test a curious adult",
                "chunk_sampling": {
                    "mode": "count",
                    "value": 5,
                    "random_seed": 123,
                },
            },
            "multi_hop_question_generation": {
                "run": True,
                "additional_instructions": "Generate questions to test a curious adult",
                "chunk_sampling": {
                    "mode": "percentage",
                    "value": 0.3,
                    "random_seed": 42,
                },
            },
            "lighteval": {
                "run": True,
            },
        },
    }
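

# Rough shape of the YAML the dictionary above serializes to (illustrative
# excerpt only; keys and values come straight from generate_base_config,
# with <hf_org>, <PATH>, and <session_uid> standing in for the arguments):
#
#   hf_configuration:
#     token: $HF_TOKEN
#     hf_organization: <hf_org>
#     private: true
#     ...
#   model_list:
#     - model_name: Qwen/Qwen2.5-VL-72B-Instruct
#       provider: novita
#       max_concurrent_requests: 32
#     ...
#   pipeline:
#     ingestion:
#       source_documents_dir: <PATH>/<session_uid>/uploaded_files/
#       run: true
#     ...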


def save_yaml_file(config: dict, path: str):
    """Saves the given config dictionary to a YAML file with helpful comments."""
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)

    config_cm = to_commentable_yaml(config)

    # Attach inline comments to the keys users are most likely to need to adjust
    ingestion = config_cm["pipeline"]["ingestion"]
    ingestion.yaml_set_comment_before_after_key(
        "source_documents_dir", before="⚠️ Change this path to match your local directory"
    )
    ingestion.yaml_set_comment_before_after_key(
        "output_dir", before="⚠️ This is where ingested data will be saved"
    )

    upload = config_cm["pipeline"]["upload_ingest_to_hub"]
    upload.yaml_set_comment_before_after_key(
        "source_documents_dir", before="⚠️ Same as output_dir from ingestion — adjust as needed"
    )

    with open(path, "w") as file:
        yaml.dump(config_cm, file)

    return path


def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str):
    """Generates and saves the YAML configuration file"""
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")
    config = generate_base_config(hf_org, hf_name, session_uid)
    file_path = save_yaml_file(config, config_path)
    logger.success(f"Config saved at: {file_path}")
    return file_path
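

# --- Usage sketch (illustrative only; all names below are hypothetical) ---
# A minimal example of wiring the helpers together from a script or REPL.
if __name__ == "__main__":
    generate_and_save_config(
        hf_org="my-org",                 # hypothetical HF organization
        hf_name="my-yourbench-dataset",  # hypothetical dataset name
        session_uid="demo-session",      # hypothetical session identifier
        config_path="config.yaml",       # where the YAML file will be written
    )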