alozowski HF Staff committed on
Commit
f05dc8f
·
1 Parent(s): 4366589

Add comments to generated config

Browse files
yourbench_space/config.py CHANGED
@@ -1,7 +1,8 @@
1
- import yaml
2
  from loguru import logger
3
 
4
  from yourbench_space import PATH
 
5
 
6
 
7
  def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
@@ -82,10 +83,24 @@ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
82
  }
83
 
84
 
85
- def save_yaml_file(config: str, path: str):
86
- """Saves the given config dictionary to a YAML file"""
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  with open(path, "w") as file:
88
- yaml.dump(config, file, default_flow_style=False, sort_keys=False)
 
89
  return path
90
 
91
 
 
1
+ from ruamel.yaml import YAML
2
  from loguru import logger
3
 
4
  from yourbench_space import PATH
5
+ from yourbench_space.utils import to_commentable_yaml
6
 
7
 
8
  def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
 
83
  }
84
 
85
 
86
def save_yaml_file(config: dict, path: str):
    """Save the given config dictionary to a YAML file with helpful comments.

    The config is converted to ruamel.yaml's commentable containers so that
    warning comments can be attached above the path-related keys before the
    document is written.

    Args:
        config: Configuration mapping. It must contain the
            ``pipeline.ingestion`` and ``pipeline.upload_ingest_to_hub``
            sections, since comments are attached to keys inside them.
        path: Destination file path; an existing file is overwritten.

    Returns:
        The ``path`` that was written.

    Raises:
        KeyError: If one of the expected ``pipeline`` sections is missing.
    """
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)

    # Plain dicts/lists cannot carry comments; convert to commentable types
    config_cm = to_commentable_yaml(config)

    ingestion = config_cm["pipeline"]["ingestion"]
    ingestion.yaml_set_comment_before_after_key(
        "source_documents_dir",
        before="⚠️ Change this path to match your local directory",
    )
    ingestion.yaml_set_comment_before_after_key(
        "output_dir",
        before="⚠️ This is where ingested data will be saved",
    )

    upload = config_cm["pipeline"]["upload_ingest_to_hub"]
    upload.yaml_set_comment_before_after_key(
        "source_documents_dir",
        before="⚠️ Same as output_dir from ingestion — adjust as needed",
    )

    # The comments above contain non-ASCII characters. Write UTF-8 explicitly
    # so the dump does not depend on the platform's default text encoding
    # (which would raise UnicodeEncodeError on e.g. cp1252 locales).
    with open(path, "w", encoding="utf-8") as file:
        yaml.dump(config_cm, file)

    return path
105
 
106
 
yourbench_space/utils.py CHANGED
@@ -5,6 +5,7 @@ import shutil
5
  import pathlib
6
  import subprocess
7
  from typing import List, Union, Optional
 
8
 
9
  import pandas as pd
10
  from loguru import logger
@@ -34,6 +35,22 @@ STAGE_DISPLAY_MAP = {
34
  "lighteval": "Generate Lighteval Subset",
35
  }
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  def map_stage_names(stages: list[str]) -> list[str]:
39
  return [STAGE_DISPLAY_MAP.get(stage, stage) for stage in stages]
 
5
  import pathlib
6
  import subprocess
7
  from typing import List, Union, Optional
8
+ from ruamel.yaml.comments import CommentedMap, CommentedSeq
9
 
10
  import pandas as pd
11
  from loguru import logger
 
35
  "lighteval": "Generate Lighteval Subset",
36
  }
37
 
38
def to_commentable_yaml(obj):
    """
    Return a copy of ``obj`` rebuilt from ruamel.yaml commentable containers.

    Every dict becomes a CommentedMap and every list becomes a CommentedSeq,
    at any nesting depth, so that comments can be attached before the
    structure is dumped to YAML. Any other value is returned unchanged.
    """
    # Sequences: convert each element first, then wrap the result
    if isinstance(obj, list):
        converted_items = [to_commentable_yaml(item) for item in obj]
        return CommentedSeq(converted_items)

    # Mappings: keys are kept as-is, values are converted recursively
    if isinstance(obj, dict):
        converted_pairs = {
            key: to_commentable_yaml(value) for key, value in obj.items()
        }
        return CommentedMap(converted_pairs)

    # Scalars and any other types need no conversion
    return obj
54
 
55
  def map_stage_names(stages: list[str]) -> list[str]:
56
  return [STAGE_DISPLAY_MAP.get(stage, stage) for stage in stages]