rlhf:
  model:
    name: EleutherAI/pythia-70M
    learning_rate: 1.0e-05
    ppo_epochs: 4
    init_kl_coef: 0.2
    target: 6
    cliprange: 0.2
    cliprange_value: 0.2
    vf_coef: 0.1
    adap_kl_ctrl: true
    use_score_norm: true
    ratio_threshold: 10.0
    batch_size: 128
    mini_batch_size: 8
    forward_batch_size: 8
    gradient_accumulation_steps: 8
    reward_model: facebook/roberta-hate-speech-dynabench-r4-target
    use_raw_logits: true
  generation:
    min_length: 5
    output_min_length: 15
    output_max_length: 20
    do_sample: true
    top_k: 0.0
    top_p: 1.0
  training:
    num_train_epochs: 100
    save_freq: 20
    eval_freq: 20
    seed: 42
  dataset:
    name: allenai/real-toxicity-prompts
    toxicity_threshold: 0.3
    input_min_text_length: 15
    input_max_text_length: 20
    test_size: 0.1
  output:
    push_to_hub: true
    organization: null
    repository_name: pythia-70m-detox
  wandb:
    project: irl_llms
    entity: null
    name: null
irl:
  mode: train
  dataset:
    original_model_name: EleutherAI/pythia-70M
    detoxified_model_name: ajagota71/pythia-70m-detox-epoch-100
    original_dataset_path: null
    detoxified_dataset_path: null
    cache_dir: ${hydra:runtime.cwd}/datasets
    num_samples: 1000
    max_new_tokens: 30
    batch_size: 16
    temperature: 0.7
    top_p: 1.0
    seed: ${seed}
    use_cached: false
    toxicity_threshold: 0.3
    push_to_hub: false
    hub_org: null
    hub_token: ${oc.env:HF_TOKEN,null}
    private: false
    use_half_precision: null
  model:
    reward_model_base: null
    use_half_precision: null
    num_unfrozen_layers: 1
  training:
    irl_method: max_margin
    learning_rate: 1.0e-05
    epochs: 20
    batch_size: 4
    eval_interval: 5
    max_length: 512
    train_test_split: 0.8
    grad_clip: 1.0
    weight_decay: 0.01
    margin: 0.1
    temperature: 0.1
    adam_epsilon: 1.0e-08
    seed: ${seed}
    include_prompt: true
  output:
    repo_name_prefix: irl-reward-model
    base_dir: ${hydra:runtime.cwd}/outputs/irl
    save_checkpoints: true
    push_to_hub: false
    hub_org: ajagota71
    private: false
  evaluation:
    true_reward_model: facebook/roberta-hate-speech-dynabench-r4-target
  logging:
    project_name: irl-detoxification
    use_wandb: true
    wandb_mode: online
now: 2025-05-16_09-29-11
seed: 42
output_dir: ${hydra:runtime.cwd}/outputs/${now:%Y-%m-%d_%H-%M-%S}
mode: train
model:
  name: EleutherAI/pythia-70M
  learning_rate: 1.0e-05
  batch_size: 128
  mini_batch_size: 8
  forward_batch_size: 8
  gradient_accumulation_steps: 8
  reward_model: facebook/roberta-hate-speech-dynabench-r4-target
  use_raw_logits: true
  ppo_epochs: 4
  init_kl_coef: 0.2
  target: 6
  cliprange: 0.2
  cliprange_value: 0.2
  vf_coef: 0.1
  adap_kl_ctrl: true
  use_score_norm: true
  ratio_threshold: 10.0
generation:
  min_length: 5
  output_min_length: 15
  output_max_length: 20
  do_sample: true
  top_k: 0.0
  top_p: 1.0
training:
  num_train_epochs: 200
  save_freq: 50
  eval_freq: 20
  seed: 42
dataset:
  name: allenai/real-toxicity-prompts
  toxicity_threshold: 0.3
  input_min_text_length: 15
  input_max_text_length: 20
  test_size: 0.1
output:
  push_to_hub: true
  push_checkpoints_to_hub: true
  checkpoint_push_freq: 20
  organization: ajagota71
  repository_name: pythia-70m-fb-detox
  private: false
wandb:
  project: irl_llms
  entity: null
  name: pythia-70M-2025-05-16_09-29-11
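The PPO field names in the `rlhf.model` block above follow TRL's conventions, so a minimal sketch of how such a config might be consumed is given below. It is an assumption, not part of the config: it presumes a pre-0.12 TRL release where `PPOConfig` still exposes these exact fields, and a hypothetical file name `config.yaml`. Hydra-only interpolations such as `${hydra:runtime.cwd}` resolve only inside a Hydra run and are not touched here.

```python
# Sketch (assumed usage, not from the source): load the YAML with OmegaConf
# and map the `rlhf.model` block onto TRL's PPOConfig.
# Assumes trl < 0.12, where these PPOConfig fields exist, and a hypothetical
# path "config.yaml". Accessing only these keys avoids triggering the
# Hydra-specific ${hydra:...} interpolations elsewhere in the config.
from omegaconf import OmegaConf
from trl import PPOConfig

cfg = OmegaConf.load("config.yaml")
m = cfg.rlhf.model

ppo_config = PPOConfig(
    model_name=m.name,
    learning_rate=m.learning_rate,
    batch_size=m.batch_size,
    mini_batch_size=m.mini_batch_size,
    gradient_accumulation_steps=m.gradient_accumulation_steps,
    ppo_epochs=m.ppo_epochs,
    init_kl_coef=m.init_kl_coef,
    target=m.target,
    cliprange=m.cliprange,
    cliprange_value=m.cliprange_value,
    vf_coef=m.vf_coef,
    adap_kl_ctrl=m.adap_kl_ctrl,
    use_score_norm=m.use_score_norm,
    ratio_threshold=m.ratio_threshold,
    seed=cfg.seed,
)
```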