rlhf:
  model:
    name: EleutherAI/pythia-70M
    learning_rate: 1.0e-05
    ppo_epochs: 4
    init_kl_coef: 0.2
    target: 6
    cliprange: 0.2
    cliprange_value: 0.2
    vf_coef: 0.1
    adap_kl_ctrl: true
    use_score_norm: true
    ratio_threshold: 10.0
    batch_size: 128
    mini_batch_size: 8
    forward_batch_size: 8
    gradient_accumulation_steps: 8
    reward_model: facebook/roberta-hate-speech-dynabench-r4-target
    use_raw_logits: true
  generation:
    min_length: 5
    output_min_length: 15
    output_max_length: 20
    do_sample: true
    top_k: 0.0
    top_p: 1.0
  training:
    num_train_epochs: 100
    save_freq: 20
    eval_freq: 20
    seed: 42
  dataset:
    name: allenai/real-toxicity-prompts
    toxicity_threshold: 0.3
    input_min_text_length: 15
    input_max_text_length: 20
    test_size: 0.1
  output:
    push_to_hub: true
    organization: null
    repository_name: pythia-70m-detox
  wandb:
    project: irl_llms
    entity: null
    name: null
irl:
  mode: train
  dataset:
    original_model_name: EleutherAI/pythia-70M
    detoxified_model_name: ajagota71/pythia-70m-detox-epoch-100
    original_dataset_path: null
    detoxified_dataset_path: null
    cache_dir: ${hydra:runtime.cwd}/datasets
    num_samples: 1000
    max_new_tokens: 30
    batch_size: 16
    temperature: 0.7
    top_p: 1.0
    seed: ${seed}
    use_cached: false
    toxicity_threshold: 0.3
    push_to_hub: false
    hub_org: null
    hub_token: ${oc.env:HF_TOKEN,null}
    private: false
    use_half_precision: null
  model:
    reward_model_base: null
    use_half_precision: null
    num_unfrozen_layers: 1
  training:
    irl_method: max_margin
    learning_rate: 1.0e-05
    epochs: 20
    batch_size: 4
    eval_interval: 5
    max_length: 512
    train_test_split: 0.8
    grad_clip: 1.0
    weight_decay: 0.01
    margin: 0.1
    temperature: 0.1
    adam_epsilon: 1.0e-08
    seed: ${seed}
    include_prompt: true
  output:
    repo_name_prefix: irl-reward-model
    base_dir: ${hydra:runtime.cwd}/outputs/irl
    save_checkpoints: true
    push_to_hub: false
    hub_org: ajagota71
    private: false
  evaluation:
    true_reward_model: facebook/roberta-hate-speech-dynabench-r4-target
  logging:
    project_name: irl-detoxification
    use_wandb: true
    wandb_mode: online
now: 2025-05-16_09-29-11
seed: 42
output_dir: ${hydra:runtime.cwd}/outputs/${now:%Y-%m-%d_%H-%M-%S}
mode: train
model:
  name: EleutherAI/pythia-70M
  learning_rate: 1.0e-05
  batch_size: 128
  mini_batch_size: 8
  forward_batch_size: 8
  gradient_accumulation_steps: 8
  reward_model: facebook/roberta-hate-speech-dynabench-r4-target
  use_raw_logits: true
  ppo_epochs: 4
  init_kl_coef: 0.2
  target: 6
  cliprange: 0.2
  cliprange_value: 0.2
  vf_coef: 0.1
  adap_kl_ctrl: true
  use_score_norm: true
  ratio_threshold: 10.0
generation:
  min_length: 5
  output_min_length: 15
  output_max_length: 20
  do_sample: true
  top_k: 0.0
  top_p: 1.0
training:
  num_train_epochs: 200
  save_freq: 50
  eval_freq: 20
  seed: 42
dataset:
  name: allenai/real-toxicity-prompts
  toxicity_threshold: 0.3
  input_min_text_length: 15
  input_max_text_length: 20
  test_size: 0.1
output:
  push_to_hub: true
  push_checkpoints_to_hub: true
  checkpoint_push_freq: 20
  organization: ajagota71
  repository_name: pythia-70m-fb-detox
  private: false
wandb:
  project: irl_llms
  entity: null
  name: pythia-70M-2025-05-16_09-29-11
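The PPO field names in the `rlhf.model` block above follow TRL's conventions, so a minimal sketch of how such a config might be consumed is given below. It is an assumption, not part of the config: it presumes a pre-0.12 TRL release where `PPOConfig` still exposes these exact fields, and a hypothetical file name `config.yaml`. Hydra-only interpolations such as `${hydra:runtime.cwd}` resolve only inside a Hydra run and are not touched here.

```python
# Sketch (assumed usage, not from the source): load the YAML with OmegaConf
# and map the `rlhf.model` block onto TRL's PPOConfig.
# Assumes trl < 0.12, where these PPOConfig fields exist, and a hypothetical
# path "config.yaml". Accessing only these keys avoids triggering the
# Hydra-specific ${hydra:...} interpolations elsewhere in the config.
from omegaconf import OmegaConf
from trl import PPOConfig

cfg = OmegaConf.load("config.yaml")
m = cfg.rlhf.model

ppo_config = PPOConfig(
    model_name=m.name,
    learning_rate=m.learning_rate,
    batch_size=m.batch_size,
    mini_batch_size=m.mini_batch_size,
    gradient_accumulation_steps=m.gradient_accumulation_steps,
    ppo_epochs=m.ppo_epochs,
    init_kl_coef=m.init_kl_coef,
    target=m.target,
    cliprange=m.cliprange,
    cliprange_value=m.cliprange_value,
    vf_coef=m.vf_coef,
    adap_kl_ctrl=m.adap_kl_ctrl,
    use_score_norm=m.use_score_norm,
    ratio_threshold=m.ratio_threshold,
    seed=cfg.seed,
)
```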