OpenMLRL · LovelyBuggies · Feb 17, 2026 · Feb 15, 2026 · Feb 15, 2026 · Feb 15, 2026
diff --git a/house_build/configs/house_build_iac_config.yaml b/house_build/configs/house_build_iac_config.yaml
@@ -3,16 +3,15 @@ agent_model:
   type: qwen
   temperature: 0.6
   top_p: 0.6
+  top_k: null
   max_length: 2048
   dtype: bf16
 
 agents: null
 
 critic_model:
-  name: "Qwen/Qwen3-4B-Instruct-2507"
+  name: Qwen/Qwen3-4B-Instruct-2507
   type: qwen
-  temperature: 0.6
-  top_p: 0.6
   max_length: 2048
   dtype: bf16
 
@@ -21,25 +20,59 @@ critics: null
 dataset:
   name: house_build
   type: house_build
+  train_split: '[:8]'
+  eval_split: '[8:]'
   json_path: ../dataset/data.json
-  train_split: "[:8]"
-  eval_split: "[8:]"
+
+prompt:
+  use_chat_template: true
+
+task:
+  player:
+    hp: 5
+  spider:
+    atk_high: 3
+    atk_low: 1
+    num: 3
+  max_commands: 600
+  limited_resource: true
+  block_agent1:
+  - white_concrete
+  - obsidian
+  - stone_stairs
+  - stone_bricks
+  - planks
+  - air
+  block_agent2:
+  - white_concrete
+  - obsidian
+  - stone_stairs
+  - stone_bricks
+  - planks
+  - air
 
 output:
-  base_dir: output
-  save_final_model: false
-  save_path: output/final_model
+  base_dir: output_iac_house_build
   verbose: false
+  save_final_model: false
+  save_path: output_iac_house_build
 
 external:
   mode: score_feedback
   original_prompt: true
   previous_response: true
   lim: 20
+  external_prompt_passthrough: false
 
 iac:
+  parallel_training: none
+  agent_devices:
+  - cuda:0
+  critic_devices:
+  - cuda:0
   num_agents: 2
   num_turns: 4
+  use_separate_critic: true
   num_train_epochs: 150
   agent_learning_rate: 5e-6
   critic_learning_rate: 5e-6
@@ -48,10 +81,6 @@ iac:
   rollout_buffer_size: 1
   train_batch_size: 1
   max_new_tokens: 512
-  temperature: 0.6
-  top_p: 0.6
-  top_k: null
-  use_separate_critic: true
   discount: 0.9
   early_termination_threshold: 0.0
   eval_interval: 10
@@ -68,20 +97,7 @@ wandb:
   project: house_build
   entity: OpenMLRL
   run_name: house_build_iac
-  dir: output
-  tags: ["iac", "house_build"]
-
-prompt:
-  use_chat_template: true
-
-task:
-  block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
-  block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
-  max_commands: 600
-  limited_resource: true
-  player:
-    hp: 5
-  spider:
-    num: 3
-    atk_low: 1
-    atk_high: 3
+  dir: output_iac_house_build
+  tags:
+  - iac
+  - house_build
diff --git a/house_build/configs/house_build_maac_config.yaml b/house_build/configs/house_build_maac_config.yaml
@@ -3,16 +3,15 @@ agent_model:
   type: qwen
   temperature: 0.6
   top_p: 0.6
+  top_k: null
   max_length: 2048
   dtype: bf16
 
 agents: null
 
 critic_model:
-  name: "Qwen/Qwen3-4B-Instruct-2507"
+  name: Qwen/Qwen3-4B-Instruct-2507
   type: qwen
-  temperature: 0.6
-  top_p: 0.6
   max_length: 2048
   dtype: bf16
 
@@ -21,23 +20,56 @@ critics: null
 dataset:
   name: house_build
   type: house_build
+  train_split: '[:8]'
+  eval_split: '[8:]'
   json_path: ../dataset/data.json
-  train_split: "[:8]"
-  eval_split: "[8:]"
+
+prompt:
+  use_chat_template: true
+
+task:
+  player:
+    hp: 5
+  spider:
+    atk_high: 3
+    atk_low: 1
+    num: 3
+  max_commands: 600
+  limited_resource: true
+  block_agent1:
+  - white_concrete
+  - obsidian
+  - stone_stairs
+  - stone_bricks
+  - planks
+  - air
+  block_agent2:
+  - white_concrete
+  - obsidian
+  - stone_stairs
+  - stone_bricks
+  - planks
+  - air
 
 output:
-  base_dir: output
-  save_final_model: false
-  save_path: output/final_model
+  base_dir: output_maac_house_build
   verbose: false
+  save_final_model: false
+  save_path: output_maac_house_build
 
 external:
   mode: score_feedback
   original_prompt: true
   previous_response: true
   lim: 20
+  external_prompt_passthrough: false
 
 maac:
+  parallel_training: none
+  agent_devices:
+  - cuda:0
+  critic_devices:
+  - cuda:0
   num_agents: 2
   num_turns: 4
   critic_type: v
@@ -48,9 +80,6 @@ maac:
   rollout_buffer_size: 1
   train_batch_size: 1
   max_new_tokens: 512
-  temperature: 0.6
-  top_p: 0.6
-  top_k: null
   discount: 0.9
   early_termination_threshold: 0.0
   eval_interval: 10
@@ -67,20 +96,7 @@ wandb:
   project: house_build
   entity: OpenMLRL
   run_name: house_build_maac
-  dir: output
-  tags: ["maac", "house_build"]
-
-prompt:
-  use_chat_template: true
-
-task:
-  block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
-  block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
-  max_commands: 600
-  limited_resource: true
-  player:
-    hp: 5
-  spider:
-    num: 3
-    atk_low: 1
-    atk_high: 3
+  dir: output_maac_house_build
+  tags:
+  - maac
+  - house_build
diff --git a/house_build/configs/house_build_magrpo_config.yaml b/house_build/configs/house_build_magrpo_config.yaml
@@ -3,6 +3,7 @@ agent_model:
   type: qwen
   temperature: 0.6
   top_p: 0.6
+  top_k: null
   max_length: 2048
   dtype: bf16
 
@@ -15,33 +16,61 @@ critics: null
 dataset:
   name: house_build
   type: house_build
+  train_split: '[:8]'
+  eval_split: '[8:]'
   json_path: ../dataset/data.json
-  train_split: "[:8]"
-  eval_split: "[8:]"
+
+prompt:
+  use_chat_template: true
+
+task:
+  player:
+    hp: 5
+  spider:
+    atk_high: 3
+    atk_low: 1
+    num: 3
+  max_commands: 600
+  limited_resource: true
+  block_agent1:
+  - white_concrete
+  - obsidian
+  - stone_stairs
+  - stone_bricks
+  - planks
+  - air
+  block_agent2:
+  - white_concrete
+  - obsidian
+  - stone_stairs
+  - stone_bricks
+  - planks
+  - air
 
 output:
-  base_dir: output
-  save_final_model: false
-  save_path: output/final_model
+  base_dir: output_magrpo_house_build
   verbose: false
+  save_final_model: false
+  save_path: output_magrpo_house_build
 
 external:
   mode: score_feedback
   original_prompt: true
   previous_response: true
   lim: 20
+  external_prompt_passthrough: false
 
 magrpo:
+  parallel_training: none
+  agent_devices:
+  - cuda:0
   num_agents: 2
   num_turns: 4
   num_train_epochs: 20
   agent_learning_rate: 1e-5
   logging_steps: 5
   num_generations: 2
   max_new_tokens: 512
-  temperature: 0.6
-  top_p: 0.6
-  top_k: null
   discount: 0.9
   joint_mode: aligned
   early_termination_threshold: -0.1
@@ -61,20 +90,7 @@ wandb:
   project: house_build
   entity: OpenMLRL
   run_name: house_build_magrpo
-  dir: output
-  tags: ["magrpo", "house_build"]
-
-prompt:
-  use_chat_template: true
-
-task:
-  block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
-  block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
-  max_commands: 600
-  limited_resource: true
-  player:
-    hp: 5
-  spider:
-    num: 3
-    atk_low: 1
-    atk_high: 3
+  dir: output_magrpo_house_build
+  tags:
+  - magrpo
+  - house_build
diff --git a/house_build/train/train_iac.py b/house_build/train/train_iac.py
@@ -17,6 +17,9 @@
 
 REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.insert(0, os.path.dirname(REPO_ROOT))
+COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL")
+if COMLRL_ROOT not in sys.path:
+    sys.path.insert(0, COMLRL_ROOT)
 
 from datasets import Dataset  # type: ignore
 from transformers import AutoTokenizer  # type: ignore
@@ -41,7 +44,10 @@
 )
 from LLM_Collab_Minecraft.house_build.utils.config import apply_overrides, load_yaml, resolve_path
 from LLM_Collab_Minecraft.house_build.utils.prompting import apply_prompt_defaults
-from LLM_Collab_Minecraft.house_build.utils.trainer_args import get_iac_args
+from LLM_Collab_Minecraft.house_build.utils.trainer_args import (
+    get_iac_args,
+    get_agent_sampling_config,
+)
 
 
 def _slice_items(items: List[Dict[str, Any]], split_expr: Any) -> List[Dict[str, Any]]:
@@ -450,7 +456,8 @@ def main() -> int:
             tok.pad_token = tok.eos_token
     tokenizer = tokenizers[0]
 
-    iac_args = get_iac_args(cfg, model_name=model_name)
+    sampling_cfg = get_agent_sampling_config(cfg)
+    iac_args = get_iac_args(cfg, sampling_cfg=sampling_cfg)
     formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer)
     prompt_to_item: Dict[str, Dict[str, Any]] = {}
     dataset_prompt_map: Dict[str, Dict[str, Any]] = {}