Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 45 additions & 29 deletions house_build/configs/house_build_iac_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,15 @@ agent_model:
type: qwen
temperature: 0.6
top_p: 0.6
top_k: null
max_length: 2048
dtype: bf16

agents: null

critic_model:
name: "Qwen/Qwen3-4B-Instruct-2507"
name: Qwen/Qwen3-4B-Instruct-2507
type: qwen
temperature: 0.6
top_p: 0.6
max_length: 2048
dtype: bf16

Expand All @@ -21,25 +20,59 @@ critics: null
dataset:
name: house_build
type: house_build
train_split: '[:8]'
eval_split: '[8:]'
json_path: ../dataset/data.json
train_split: "[:8]"
eval_split: "[8:]"

prompt:
use_chat_template: true

task:
player:
hp: 5
spider:
atk_high: 3
atk_low: 1
num: 3
max_commands: 600
limited_resource: true
block_agent1:
- white_concrete
- obsidian
- stone_stairs
- stone_bricks
- planks
- air
block_agent2:
- white_concrete
- obsidian
- stone_stairs
- stone_bricks
- planks
- air

output:
base_dir: output
save_final_model: false
save_path: output/final_model
base_dir: output_iac_house_build
verbose: false
save_final_model: false
save_path: output_iac_house_build

external:
mode: score_feedback
original_prompt: true
previous_response: true
lim: 20
external_prompt_passthrough: false

iac:
parallel_training: none
agent_devices:
- cuda:0
critic_devices:
- cuda:0
num_agents: 2
num_turns: 4
use_separate_critic: true
num_train_epochs: 150
agent_learning_rate: 5e-6
critic_learning_rate: 5e-6
Expand All @@ -48,10 +81,6 @@ iac:
rollout_buffer_size: 1
train_batch_size: 1
max_new_tokens: 512
temperature: 0.6
top_p: 0.6
top_k: null
use_separate_critic: true
discount: 0.9
early_termination_threshold: 0.0
eval_interval: 10
Expand All @@ -68,20 +97,7 @@ wandb:
project: house_build
entity: OpenMLRL
run_name: house_build_iac
dir: output
tags: ["iac", "house_build"]

prompt:
use_chat_template: true

task:
block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
max_commands: 600
limited_resource: true
player:
hp: 5
spider:
num: 3
atk_low: 1
atk_high: 3
dir: output_iac_house_build
tags:
- iac
- house_build
72 changes: 44 additions & 28 deletions house_build/configs/house_build_maac_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,15 @@ agent_model:
type: qwen
temperature: 0.6
top_p: 0.6
top_k: null
max_length: 2048
dtype: bf16

agents: null

critic_model:
name: "Qwen/Qwen3-4B-Instruct-2507"
name: Qwen/Qwen3-4B-Instruct-2507
type: qwen
temperature: 0.6
top_p: 0.6
max_length: 2048
dtype: bf16

Expand All @@ -21,23 +20,56 @@ critics: null
dataset:
name: house_build
type: house_build
train_split: '[:8]'
eval_split: '[8:]'
json_path: ../dataset/data.json
train_split: "[:8]"
eval_split: "[8:]"

prompt:
use_chat_template: true

task:
player:
hp: 5
spider:
atk_high: 3
atk_low: 1
num: 3
max_commands: 600
limited_resource: true
block_agent1:
- white_concrete
- obsidian
- stone_stairs
- stone_bricks
- planks
- air
block_agent2:
- white_concrete
- obsidian
- stone_stairs
- stone_bricks
- planks
- air

output:
base_dir: output
save_final_model: false
save_path: output/final_model
base_dir: output_maac_house_build
verbose: false
save_final_model: false
save_path: output_maac_house_build

external:
mode: score_feedback
original_prompt: true
previous_response: true
lim: 20
external_prompt_passthrough: false

maac:
parallel_training: none
agent_devices:
- cuda:0
critic_devices:
- cuda:0
num_agents: 2
num_turns: 4
critic_type: v
Expand All @@ -48,9 +80,6 @@ maac:
rollout_buffer_size: 1
train_batch_size: 1
max_new_tokens: 512
temperature: 0.6
top_p: 0.6
top_k: null
discount: 0.9
early_termination_threshold: 0.0
eval_interval: 10
Expand All @@ -67,20 +96,7 @@ wandb:
project: house_build
entity: OpenMLRL
run_name: house_build_maac
dir: output
tags: ["maac", "house_build"]

prompt:
use_chat_template: true

task:
block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
max_commands: 600
limited_resource: true
player:
hp: 5
spider:
num: 3
atk_low: 1
atk_high: 3
dir: output_maac_house_build
tags:
- maac
- house_build
66 changes: 41 additions & 25 deletions house_build/configs/house_build_magrpo_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ agent_model:
type: qwen
temperature: 0.6
top_p: 0.6
top_k: null
max_length: 2048
dtype: bf16

Expand All @@ -15,33 +16,61 @@ critics: null
dataset:
name: house_build
type: house_build
train_split: '[:8]'
eval_split: '[8:]'
json_path: ../dataset/data.json
train_split: "[:8]"
eval_split: "[8:]"

prompt:
use_chat_template: true

task:
player:
hp: 5
spider:
atk_high: 3
atk_low: 1
num: 3
max_commands: 600
limited_resource: true
block_agent1:
- white_concrete
- obsidian
- stone_stairs
- stone_bricks
- planks
- air
block_agent2:
- white_concrete
- obsidian
- stone_stairs
- stone_bricks
- planks
- air

output:
base_dir: output
save_final_model: false
save_path: output/final_model
base_dir: output_magrpo_house_build
verbose: false
save_final_model: false
save_path: output_magrpo_house_build

external:
mode: score_feedback
original_prompt: true
previous_response: true
lim: 20
external_prompt_passthrough: false

magrpo:
parallel_training: none
agent_devices:
- cuda:0
num_agents: 2
num_turns: 4
num_train_epochs: 20
agent_learning_rate: 1e-5
logging_steps: 5
num_generations: 2
max_new_tokens: 512
temperature: 0.6
top_p: 0.6
top_k: null
discount: 0.9
joint_mode: aligned
early_termination_threshold: -0.1
Expand All @@ -61,20 +90,7 @@ wandb:
project: house_build
entity: OpenMLRL
run_name: house_build_magrpo
dir: output
tags: ["magrpo", "house_build"]

prompt:
use_chat_template: true

task:
block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air]
max_commands: 600
limited_resource: true
player:
hp: 5
spider:
num: 3
atk_low: 1
atk_high: 3
dir: output_magrpo_house_build
tags:
- magrpo
- house_build
11 changes: 9 additions & 2 deletions house_build/train/train_iac.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@

REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, os.path.dirname(REPO_ROOT))
COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL")
if COMLRL_ROOT not in sys.path:
sys.path.insert(0, COMLRL_ROOT)

from datasets import Dataset # type: ignore
from transformers import AutoTokenizer # type: ignore
Expand All @@ -41,7 +44,10 @@
)
from LLM_Collab_Minecraft.house_build.utils.config import apply_overrides, load_yaml, resolve_path
from LLM_Collab_Minecraft.house_build.utils.prompting import apply_prompt_defaults
from LLM_Collab_Minecraft.house_build.utils.trainer_args import get_iac_args
from LLM_Collab_Minecraft.house_build.utils.trainer_args import (
get_iac_args,
get_agent_sampling_config,
)


def _slice_items(items: List[Dict[str, Any]], split_expr: Any) -> List[Dict[str, Any]]:
Expand Down Expand Up @@ -450,7 +456,8 @@ def main() -> int:
tok.pad_token = tok.eos_token
tokenizer = tokenizers[0]

iac_args = get_iac_args(cfg, model_name=model_name)
sampling_cfg = get_agent_sampling_config(cfg)
iac_args = get_iac_args(cfg, sampling_cfg=sampling_cfg)
formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer)
prompt_to_item: Dict[str, Dict[str, Any]] = {}
dataset_prompt_map: Dict[str, Dict[str, Any]] = {}
Expand Down
Loading