From bb419802407caec602b3ee7180783ce968b94720 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 11:27:07 -0500 Subject: [PATCH 01/11] Prefer workspace CoMLRL in training entrypoints --- house_build/train/train_iac.py | 3 +++ house_build/train/train_maac.py | 3 +++ house_build/train/train_magrpo.py | 3 +++ str_build/train/train_iac.py | 3 +++ str_build/train/train_maac.py | 3 +++ str_build/train/train_magrpo.py | 3 +++ 6 files changed, 18 insertions(+) diff --git a/house_build/train/train_iac.py b/house_build/train/train_iac.py index d999472..958504f 100644 --- a/house_build/train/train_iac.py +++ b/house_build/train/train_iac.py @@ -17,6 +17,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import Dataset # type: ignore from transformers import AutoTokenizer # type: ignore diff --git a/house_build/train/train_maac.py b/house_build/train/train_maac.py index ce5dd1a..f6885bd 100644 --- a/house_build/train/train_maac.py +++ b/house_build/train/train_maac.py @@ -17,6 +17,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import Dataset # type: ignore from transformers import AutoTokenizer # type: ignore diff --git a/house_build/train/train_magrpo.py b/house_build/train/train_magrpo.py index d74b599..2e7b49f 100644 --- a/house_build/train/train_magrpo.py +++ b/house_build/train/train_magrpo.py @@ -17,6 +17,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) +COMLRL_ROOT = 
os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import Dataset # type: ignore from transformers import AutoModelForCausalLM, AutoTokenizer # type: ignore diff --git a/str_build/train/train_iac.py b/str_build/train/train_iac.py index 3abdb06..8752354 100644 --- a/str_build/train/train_iac.py +++ b/str_build/train/train_iac.py @@ -16,6 +16,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import Dataset # type: ignore from transformers import AutoTokenizer # type: ignore diff --git a/str_build/train/train_maac.py b/str_build/train/train_maac.py index 8631687..cf17c14 100644 --- a/str_build/train/train_maac.py +++ b/str_build/train/train_maac.py @@ -16,6 +16,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import Dataset # type: ignore from transformers import AutoTokenizer # type: ignore diff --git a/str_build/train/train_magrpo.py b/str_build/train/train_magrpo.py index 57e29d9..384c22c 100644 --- a/str_build/train/train_magrpo.py +++ b/str_build/train/train_magrpo.py @@ -16,6 +16,9 @@ REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(REPO_ROOT)) +COMLRL_ROOT = os.path.join(os.path.dirname(REPO_ROOT), "CoMLRL") +if COMLRL_ROOT not in sys.path: + sys.path.insert(0, COMLRL_ROOT) from datasets import Dataset # type: ignore from transformers import AutoModelForCausalLM, AutoTokenizer # type: ignore From 97c37934d8e657bf46ff61709454b8654dffda80 Mon 
Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 13:39:23 -0500 Subject: [PATCH 02/11] add parallel mode and device spec passthrough in trainer args --- house_build/utils/trainer_args.py | 21 +++++++++++++++++++++ str_build/utils/trainer_args.py | 21 +++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/house_build/utils/trainer_args.py b/house_build/utils/trainer_args.py index 2e32e51..8a1402c 100644 --- a/house_build/utils/trainer_args.py +++ b/house_build/utils/trainer_args.py @@ -86,6 +86,19 @@ def _as_bool(x: Any, default: bool) -> bool: return bool(x) +def _as_device_spec(x: Any) -> Any: + if x is None: + return None + if isinstance(x, str): + s = x.strip() + if s.lower() in ("none", "null", ""): + return None + return s + if isinstance(x, (list, tuple)): + return [str(v) for v in x] + return str(x) + + def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: tr = cfg.get("magrpo") or {} if not isinstance(tr, dict): @@ -114,6 +127,8 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: candidate["top_k"] = _as_opt_int(tr.get("top_k", None), None) candidate.update( { + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", tr.get("gamma", 0.9)), 0.9), "joint_mode": joint_mode_str, } @@ -172,6 +187,9 @@ def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> M "top_k": _as_opt_int(tr.get("top_k", None), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "critic_type": str(tr.get("critic_type", "v")), "early_termination_threshold": _as_opt_float( @@ -221,6 +239,9 
@@ def get_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> IA "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( tr.get("critic_value_head_hidden_dim", None), None ), diff --git a/str_build/utils/trainer_args.py b/str_build/utils/trainer_args.py index 3fc7ded..548253e 100644 --- a/str_build/utils/trainer_args.py +++ b/str_build/utils/trainer_args.py @@ -86,6 +86,19 @@ def _as_bool(x: Any, default: bool) -> bool: return bool(x) +def _as_device_spec(x: Any) -> Any: + if x is None: + return None + if isinstance(x, str): + s = x.strip() + if s.lower() in ("none", "null", ""): + return None + return s + if isinstance(x, (list, tuple)): + return [str(v) for v in x] + return str(x) + + def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: tr = cfg.get("magrpo") or {} if not isinstance(tr, dict): @@ -114,6 +127,8 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: candidate["top_k"] = _as_opt_int(tr.get("top_k", None), None) candidate.update( { + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": joint_mode_str, } @@ -172,6 +187,9 @@ def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> M "top_k": _as_opt_int(tr.get("top_k", None), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "critic_devices": 
_as_device_spec(tr.get("critic_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "critic_type": str(tr.get("critic_type", "v")), "early_termination_threshold": _as_opt_float( @@ -221,6 +239,9 @@ def get_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> IA "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, + "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( tr.get("critic_value_head_hidden_dim", None), None ), From 16968bd94e3c68eea2f6d8a561fb9fe03069bfb4 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 14:02:34 -0500 Subject: [PATCH 03/11] rename parallel_training field and set yaml defaults --- house_build/configs/house_build_iac_config.yaml | 1 + house_build/configs/house_build_maac_config.yaml | 1 + house_build/configs/house_build_magrpo_config.yaml | 1 + house_build/utils/trainer_args.py | 6 +++--- str_build/configs/str_build_iac_config.yaml | 1 + str_build/configs/str_build_maac_config.yaml | 1 + str_build/configs/str_build_magrpo_config.yaml | 1 + str_build/utils/trainer_args.py | 6 +++--- 8 files changed, 12 insertions(+), 6 deletions(-) diff --git a/house_build/configs/house_build_iac_config.yaml b/house_build/configs/house_build_iac_config.yaml index f4b034e..26bfb67 100644 --- a/house_build/configs/house_build_iac_config.yaml +++ b/house_build/configs/house_build_iac_config.yaml @@ -38,6 +38,7 @@ external: lim: 20 iac: + parallel_training: auto num_agents: 2 num_turns: 4 num_train_epochs: 150 diff --git a/house_build/configs/house_build_maac_config.yaml b/house_build/configs/house_build_maac_config.yaml index ebbb4e1..b4bae39 100644 --- a/house_build/configs/house_build_maac_config.yaml +++ 
b/house_build/configs/house_build_maac_config.yaml @@ -38,6 +38,7 @@ external: lim: 20 maac: + parallel_training: auto num_agents: 2 num_turns: 4 critic_type: v diff --git a/house_build/configs/house_build_magrpo_config.yaml b/house_build/configs/house_build_magrpo_config.yaml index 6d7f178..3f3a92e 100644 --- a/house_build/configs/house_build_magrpo_config.yaml +++ b/house_build/configs/house_build_magrpo_config.yaml @@ -32,6 +32,7 @@ external: lim: 20 magrpo: + parallel_training: auto num_agents: 2 num_turns: 4 num_train_epochs: 20 diff --git a/house_build/utils/trainer_args.py b/house_build/utils/trainer_args.py index 8a1402c..151fd9e 100644 --- a/house_build/utils/trainer_args.py +++ b/house_build/utils/trainer_args.py @@ -127,7 +127,7 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: candidate["top_k"] = _as_opt_int(tr.get("top_k", None), None) candidate.update( { - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", tr.get("gamma", 0.9)), 0.9), "joint_mode": joint_mode_str, @@ -187,7 +187,7 @@ def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> M "top_k": _as_opt_int(tr.get("top_k", None), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), @@ -239,7 +239,7 @@ def get_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> IA "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": 
_as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( diff --git a/str_build/configs/str_build_iac_config.yaml b/str_build/configs/str_build_iac_config.yaml index f9ded34..938507b 100644 --- a/str_build/configs/str_build_iac_config.yaml +++ b/str_build/configs/str_build_iac_config.yaml @@ -39,6 +39,7 @@ external: previous_response: true iac: + parallel_training: auto num_agents: 2 num_turns: 4 num_train_epochs: 150 diff --git a/str_build/configs/str_build_maac_config.yaml b/str_build/configs/str_build_maac_config.yaml index b37ff94..0798dea 100644 --- a/str_build/configs/str_build_maac_config.yaml +++ b/str_build/configs/str_build_maac_config.yaml @@ -39,6 +39,7 @@ external: previous_response: true maac: + parallel_training: auto num_agents: 2 num_turns: 4 critic_type: v diff --git a/str_build/configs/str_build_magrpo_config.yaml b/str_build/configs/str_build_magrpo_config.yaml index ea683d8..21c93c6 100644 --- a/str_build/configs/str_build_magrpo_config.yaml +++ b/str_build/configs/str_build_magrpo_config.yaml @@ -33,6 +33,7 @@ external: previous_response: true magrpo: + parallel_training: auto num_agents: 2 num_turns: 4 num_train_epochs: 20 diff --git a/str_build/utils/trainer_args.py b/str_build/utils/trainer_args.py index 548253e..16df36e 100644 --- a/str_build/utils/trainer_args.py +++ b/str_build/utils/trainer_args.py @@ -127,7 +127,7 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: candidate["top_k"] = _as_opt_int(tr.get("top_k", None), None) candidate.update( { - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", 
"auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": joint_mode_str, @@ -187,7 +187,7 @@ def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> M "top_k": _as_opt_int(tr.get("top_k", None), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), @@ -239,7 +239,7 @@ def get_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> IA "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, - "parallel_mode": str(tr.get("parallel_mode", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( From 249672adc07de12b9b179d139f555b2b993ba8a1 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 15:57:26 -0500 Subject: [PATCH 04/11] read sampling params from agent_model and pass sampling_cfg to trainer args --- .../configs/house_build_iac_config.yaml | 6 +- .../configs/house_build_maac_config.yaml | 6 +- .../configs/house_build_magrpo_config.yaml | 4 +- house_build/train/train_iac.py | 8 ++- house_build/train/train_maac.py | 8 ++- house_build/train/train_magrpo.py | 8 ++- house_build/utils/trainer_args.py | 64 +++++++++++++++---- str_build/configs/str_build_iac_config.yaml | 6 +- str_build/configs/str_build_maac_config.yaml | 6 +- .../configs/str_build_magrpo_config.yaml | 4 +- 
str_build/train/train_iac.py | 8 ++- str_build/train/train_maac.py | 8 ++- str_build/train/train_magrpo.py | 8 ++- str_build/utils/trainer_args.py | 64 +++++++++++++++---- 14 files changed, 144 insertions(+), 64 deletions(-) diff --git a/house_build/configs/house_build_iac_config.yaml b/house_build/configs/house_build_iac_config.yaml index 26bfb67..3198e24 100644 --- a/house_build/configs/house_build_iac_config.yaml +++ b/house_build/configs/house_build_iac_config.yaml @@ -3,6 +3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: null max_length: 2048 dtype: bf16 @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-4B-Instruct-2507" type: qwen - temperature: 0.6 - top_p: 0.6 max_length: 2048 dtype: bf16 @@ -49,9 +48,6 @@ iac: rollout_buffer_size: 1 train_batch_size: 1 max_new_tokens: 512 - temperature: 0.6 - top_p: 0.6 - top_k: null use_separate_critic: true discount: 0.9 early_termination_threshold: 0.0 diff --git a/house_build/configs/house_build_maac_config.yaml b/house_build/configs/house_build_maac_config.yaml index b4bae39..fbc1a27 100644 --- a/house_build/configs/house_build_maac_config.yaml +++ b/house_build/configs/house_build_maac_config.yaml @@ -3,6 +3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: null max_length: 2048 dtype: bf16 @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-4B-Instruct-2507" type: qwen - temperature: 0.6 - top_p: 0.6 max_length: 2048 dtype: bf16 @@ -49,9 +48,6 @@ maac: rollout_buffer_size: 1 train_batch_size: 1 max_new_tokens: 512 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: 0.0 eval_interval: 10 diff --git a/house_build/configs/house_build_magrpo_config.yaml b/house_build/configs/house_build_magrpo_config.yaml index 3f3a92e..df9ec93 100644 --- a/house_build/configs/house_build_magrpo_config.yaml +++ b/house_build/configs/house_build_magrpo_config.yaml @@ -3,6 +3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: 
null max_length: 2048 dtype: bf16 @@ -40,9 +41,6 @@ magrpo: logging_steps: 5 num_generations: 2 max_new_tokens: 512 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 joint_mode: aligned early_termination_threshold: -0.1 diff --git a/house_build/train/train_iac.py b/house_build/train/train_iac.py index 958504f..9a48520 100644 --- a/house_build/train/train_iac.py +++ b/house_build/train/train_iac.py @@ -44,7 +44,10 @@ ) from LLM_Collab_Minecraft.house_build.utils.config import apply_overrides, load_yaml, resolve_path from LLM_Collab_Minecraft.house_build.utils.prompting import apply_prompt_defaults -from LLM_Collab_Minecraft.house_build.utils.trainer_args import get_iac_args +from LLM_Collab_Minecraft.house_build.utils.trainer_args import ( + get_iac_args, + get_agent_sampling_config, +) def _slice_items(items: List[Dict[str, Any]], split_expr: Any) -> List[Dict[str, Any]]: @@ -453,7 +456,8 @@ def main() -> int: tok.pad_token = tok.eos_token tokenizer = tokenizers[0] - iac_args = get_iac_args(cfg, model_name=model_name) + sampling_cfg = get_agent_sampling_config(cfg) + iac_args = get_iac_args(cfg, sampling_cfg=sampling_cfg) formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer) prompt_to_item: Dict[str, Dict[str, Any]] = {} dataset_prompt_map: Dict[str, Dict[str, Any]] = {} diff --git a/house_build/train/train_maac.py b/house_build/train/train_maac.py index f6885bd..8011e50 100644 --- a/house_build/train/train_maac.py +++ b/house_build/train/train_maac.py @@ -44,7 +44,10 @@ ) from LLM_Collab_Minecraft.house_build.utils.config import apply_overrides, load_yaml, resolve_path from LLM_Collab_Minecraft.house_build.utils.prompting import apply_prompt_defaults -from LLM_Collab_Minecraft.house_build.utils.trainer_args import get_maac_args +from LLM_Collab_Minecraft.house_build.utils.trainer_args import ( + get_maac_args, + get_agent_sampling_config, +) def _slice_items(items: List[Dict[str, Any]], split_expr: Any) -> List[Dict[str, Any]]: 
@@ -453,7 +456,8 @@ def main() -> int: tok.pad_token = tok.eos_token tokenizer = tokenizers[0] - maac_args = get_maac_args(cfg, model_name=model_name) + sampling_cfg = get_agent_sampling_config(cfg) + maac_args = get_maac_args(cfg, sampling_cfg=sampling_cfg) formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer) prompt_to_item: Dict[str, Dict[str, Any]] = {} dataset_prompt_map: Dict[str, Dict[str, Any]] = {} diff --git a/house_build/train/train_magrpo.py b/house_build/train/train_magrpo.py index 2e7b49f..7204b8a 100644 --- a/house_build/train/train_magrpo.py +++ b/house_build/train/train_magrpo.py @@ -44,7 +44,10 @@ ) from LLM_Collab_Minecraft.house_build.utils.config import apply_overrides, load_yaml, resolve_path from LLM_Collab_Minecraft.house_build.utils.prompting import apply_prompt_defaults -from LLM_Collab_Minecraft.house_build.utils.trainer_args import get_trainer_args +from LLM_Collab_Minecraft.house_build.utils.trainer_args import ( + get_trainer_args, + get_agent_sampling_config, +) def _slice_items(items: List[Dict[str, Any]], split_expr: Any) -> List[Dict[str, Any]]: @@ -451,7 +454,8 @@ def main() -> int: agent = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs) agents.append(agent) - magrpo_args = get_trainer_args(cfg) + sampling_cfg = get_agent_sampling_config(cfg) + magrpo_args = get_trainer_args(cfg, sampling_cfg=sampling_cfg) formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer) reward_func = get_reward_function(cfg=cfg, num_agents=num_agents) diff --git a/house_build/utils/trainer_args.py b/house_build/utils/trainer_args.py index 151fd9e..969bbd9 100644 --- a/house_build/utils/trainer_args.py +++ b/house_build/utils/trainer_args.py @@ -99,7 +99,46 @@ def _as_device_spec(x: Any) -> Any: return str(x) -def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: +def get_agent_sampling_config(cfg: Dict[str, Any]) -> Dict[str, Any]: + model_cfg = cfg.get("agent_model") + if not 
isinstance(model_cfg, dict): + raise ValueError("agent_model must be a mapping.") + missing = [key for key in ("temperature", "top_p", "top_k") if key not in model_cfg] + if missing: + raise ValueError( + f"agent_model is missing required sampling fields: {', '.join(missing)}" + ) + + def _require_float(key: str) -> float: + value = model_cfg.get(key) + if value is None or isinstance(value, bool): + raise ValueError(f"agent_model.{key} must be provided as a float.") + try: + return float(value) + except Exception as exc: + raise ValueError(f"agent_model.{key} must be a float, got {value!r}.") from exc + + top_k_raw = model_cfg.get("top_k") + if isinstance(top_k_raw, str) and top_k_raw.strip().lower() in ("none", "null", ""): + top_k_val: Optional[int] = None + elif top_k_raw is None: + top_k_val = None + else: + try: + top_k_val = int(float(top_k_raw)) + except Exception as exc: + raise ValueError( + f"agent_model.top_k must be an integer or null, got {top_k_raw!r}." + ) from exc + + return { + "temperature": _require_float("temperature"), + "top_p": _require_float("top_p"), + "top_k": top_k_val, + } + + +def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAGRPOConfig: tr = cfg.get("magrpo") or {} if not isinstance(tr, dict): tr = {} @@ -120,11 +159,10 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: "logging_steps": _as_int(tr.get("logging_steps", 50), 50), "num_generations": _as_int(tr.get("num_generations", 4), 4), "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), - "temperature": _as_float(tr.get("temperature", 0.2), 0.2), - "top_p": _as_float(tr.get("top_p", 0.95), 0.95), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.2), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.95), + "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), } - if "top_k" in tr: - candidate["top_k"] = _as_opt_int(tr.get("top_k", None), None) candidate.update( { "parallel_training": str(tr.get("parallel_training", 
"auto")).strip().lower(), @@ -164,7 +202,7 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: return cfg_obj -def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> MAACConfig: +def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACConfig: tr = cfg.get("maac") or {} if not isinstance(tr, dict): tr = {} @@ -182,9 +220,9 @@ def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> M "value_loss_coef": _as_float(tr.get("value_loss_coef", 0.6), 0.6), "advantage_normalization": _as_bool(adv_norm, True), "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), - "temperature": _as_float(tr.get("temperature", 0.6), 0.6), - "top_p": _as_float(tr.get("top_p", 0.6), 0.6), - "top_k": _as_opt_int(tr.get("top_k", None), None), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), + "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), @@ -213,7 +251,7 @@ def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> M return MAACConfig(**filtered) -def get_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> IACConfig: +def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACConfig: tr = cfg.get("iac") or {} if not isinstance(tr, dict): tr = {} @@ -233,9 +271,9 @@ def get_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> IA "value_clip_range": _as_opt_float(tr.get("value_clip_range", 0.05), 0.05), "advantage_normalization": _as_bool(adv_norm, True), "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), - "temperature": _as_float(tr.get("temperature", 0.6), 0.6), - "top_p": _as_float(tr.get("top_p", 0.6), 0.6), - "top_k": _as_opt_int(tr.get("top_k", None), 
None), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), + "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, diff --git a/str_build/configs/str_build_iac_config.yaml b/str_build/configs/str_build_iac_config.yaml index 938507b..172e541 100644 --- a/str_build/configs/str_build_iac_config.yaml +++ b/str_build/configs/str_build_iac_config.yaml @@ -3,6 +3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: null max_length: 2048 dtype: bf16 @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-4B-Instruct-2507" type: qwen - temperature: 0.6 - top_p: 0.6 max_length: 2048 dtype: bf16 @@ -50,9 +49,6 @@ iac: rollout_buffer_size: 1 train_batch_size: 1 max_new_tokens: 512 - temperature: 0.6 - top_p: 0.6 - top_k: null use_separate_critic: true discount: 0.9 early_termination_threshold: -0.1 diff --git a/str_build/configs/str_build_maac_config.yaml b/str_build/configs/str_build_maac_config.yaml index 0798dea..e79a67b 100644 --- a/str_build/configs/str_build_maac_config.yaml +++ b/str_build/configs/str_build_maac_config.yaml @@ -3,6 +3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: null max_length: 2048 dtype: bf16 @@ -11,8 +12,6 @@ agents: null critic_model: name: "Qwen/Qwen3-4B-Instruct-2507" type: qwen - temperature: 0.6 - top_p: 0.6 max_length: 2048 dtype: bf16 @@ -50,9 +49,6 @@ maac: rollout_buffer_size: 1 train_batch_size: 1 max_new_tokens: 512 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 early_termination_threshold: -0.1 eval_interval: 10 diff --git a/str_build/configs/str_build_magrpo_config.yaml b/str_build/configs/str_build_magrpo_config.yaml index 21c93c6..d945a4c 100644 --- a/str_build/configs/str_build_magrpo_config.yaml +++ b/str_build/configs/str_build_magrpo_config.yaml @@ -3,6 
+3,7 @@ agent_model: type: qwen temperature: 0.6 top_p: 0.6 + top_k: null max_length: 2048 dtype: bf16 @@ -41,9 +42,6 @@ magrpo: logging_steps: 1 num_generations: 2 max_new_tokens: 512 - temperature: 0.6 - top_p: 0.6 - top_k: null discount: 0.9 joint_mode: aligned early_termination_threshold: -0.1 diff --git a/str_build/train/train_iac.py b/str_build/train/train_iac.py index 8752354..3c0c26d 100644 --- a/str_build/train/train_iac.py +++ b/str_build/train/train_iac.py @@ -35,7 +35,10 @@ from LLM_Collab_Minecraft.str_build.utils.config import apply_overrides, load_yaml, resolve_path from LLM_Collab_Minecraft.str_build.utils.prompting import apply_graph_setting, apply_prompt_defaults from LLM_Collab_Minecraft.str_build.utils.str_builder import load_tasks_from_csv -from LLM_Collab_Minecraft.str_build.utils.trainer_args import get_iac_args +from LLM_Collab_Minecraft.str_build.utils.trainer_args import ( + get_iac_args, + get_agent_sampling_config, +) def _slice_items(items: List[Dict[str, Any]], split_expr: Any) -> List[Dict[str, Any]]: @@ -320,7 +323,8 @@ def main() -> int: tok.pad_token = tok.eos_token tokenizer = tokenizers[0] - iac_args = get_iac_args(cfg, model_name=model_name) + sampling_cfg = get_agent_sampling_config(cfg) + iac_args = get_iac_args(cfg, sampling_cfg=sampling_cfg) formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer) prompt_to_item: Dict[str, Dict[str, Any]] = {} dataset_prompt_map: Dict[str, Dict[str, Any]] = {} diff --git a/str_build/train/train_maac.py b/str_build/train/train_maac.py index cf17c14..06a7a29 100644 --- a/str_build/train/train_maac.py +++ b/str_build/train/train_maac.py @@ -35,7 +35,10 @@ from LLM_Collab_Minecraft.str_build.utils.config import apply_overrides, load_yaml, resolve_path from LLM_Collab_Minecraft.str_build.utils.prompting import apply_graph_setting, apply_prompt_defaults from LLM_Collab_Minecraft.str_build.utils.str_builder import load_tasks_from_csv -from 
LLM_Collab_Minecraft.str_build.utils.trainer_args import get_maac_args +from LLM_Collab_Minecraft.str_build.utils.trainer_args import ( + get_maac_args, + get_agent_sampling_config, +) def _slice_items(items: List[Dict[str, Any]], split_expr: Any) -> List[Dict[str, Any]]: @@ -320,7 +323,8 @@ def main() -> int: tok.pad_token = tok.eos_token tokenizer = tokenizers[0] - maac_args = get_maac_args(cfg, model_name=model_name) + sampling_cfg = get_agent_sampling_config(cfg) + maac_args = get_maac_args(cfg, sampling_cfg=sampling_cfg) formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer) prompt_to_item: Dict[str, Dict[str, Any]] = {} dataset_prompt_map: Dict[str, Dict[str, Any]] = {} diff --git a/str_build/train/train_magrpo.py b/str_build/train/train_magrpo.py index 384c22c..7df1ab7 100644 --- a/str_build/train/train_magrpo.py +++ b/str_build/train/train_magrpo.py @@ -35,7 +35,10 @@ from LLM_Collab_Minecraft.str_build.utils.config import apply_overrides, load_yaml, resolve_path from LLM_Collab_Minecraft.str_build.utils.prompting import apply_graph_setting, apply_prompt_defaults from LLM_Collab_Minecraft.str_build.utils.str_builder import load_tasks_from_csv -from LLM_Collab_Minecraft.str_build.utils.trainer_args import get_trainer_args +from LLM_Collab_Minecraft.str_build.utils.trainer_args import ( + get_trainer_args, + get_agent_sampling_config, +) def _slice_items(items: List[Dict[str, Any]], split_expr: Any) -> List[Dict[str, Any]]: @@ -318,7 +321,8 @@ def main() -> int: agent = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs) agents.append(agent) - magrpo_args = get_trainer_args(cfg) + sampling_cfg = get_agent_sampling_config(cfg) + magrpo_args = get_trainer_args(cfg, sampling_cfg=sampling_cfg) formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer) reward_func = get_reward_function(cfg=cfg, num_agents=num_agents) diff --git a/str_build/utils/trainer_args.py b/str_build/utils/trainer_args.py index 
16df36e..4b9e9b5 100644 --- a/str_build/utils/trainer_args.py +++ b/str_build/utils/trainer_args.py @@ -99,7 +99,46 @@ def _as_device_spec(x: Any) -> Any: return str(x) -def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: +def get_agent_sampling_config(cfg: Dict[str, Any]) -> Dict[str, Any]: + model_cfg = cfg.get("agent_model") + if not isinstance(model_cfg, dict): + raise ValueError("agent_model must be a mapping.") + missing = [key for key in ("temperature", "top_p", "top_k") if key not in model_cfg] + if missing: + raise ValueError( + f"agent_model is missing required sampling fields: {', '.join(missing)}" + ) + + def _require_float(key: str) -> float: + value = model_cfg.get(key) + if value is None or isinstance(value, bool): + raise ValueError(f"agent_model.{key} must be provided as a float.") + try: + return float(value) + except Exception as exc: + raise ValueError(f"agent_model.{key} must be a float, got {value!r}.") from exc + + top_k_raw = model_cfg.get("top_k") + if isinstance(top_k_raw, str) and top_k_raw.strip().lower() in ("none", "null", ""): + top_k_val: Optional[int] = None + elif top_k_raw is None: + top_k_val = None + else: + try: + top_k_val = int(float(top_k_raw)) + except Exception as exc: + raise ValueError( + f"agent_model.top_k must be an integer or null, got {top_k_raw!r}." 
+ ) from exc + + return { + "temperature": _require_float("temperature"), + "top_p": _require_float("top_p"), + "top_k": top_k_val, + } + + +def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAGRPOConfig: tr = cfg.get("magrpo") or {} if not isinstance(tr, dict): tr = {} @@ -120,11 +159,10 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: "logging_steps": _as_int(tr.get("logging_steps", 50), 50), "num_generations": _as_int(tr.get("num_generations", 4), 4), "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), - "temperature": _as_float(tr.get("temperature", 0.2), 0.2), - "top_p": _as_float(tr.get("top_p", 0.95), 0.95), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.2), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.95), + "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), } - if "top_k" in tr: - candidate["top_k"] = _as_opt_int(tr.get("top_k", None), None) candidate.update( { "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), @@ -164,7 +202,7 @@ def get_trainer_args(cfg: Dict[str, Any]) -> MAGRPOConfig: return cfg_obj -def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> MAACConfig: +def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACConfig: tr = cfg.get("maac") or {} if not isinstance(tr, dict): tr = {} @@ -182,9 +220,9 @@ def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> M "value_loss_coef": _as_float(tr.get("value_loss_coef", 0.6), 0.6), "advantage_normalization": _as_bool(adv_norm, True), "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), - "temperature": _as_float(tr.get("temperature", 0.6), 0.6), - "top_p": _as_float(tr.get("top_p", 0.6), 0.6), - "top_k": _as_opt_int(tr.get("top_k", None), None), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), + "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), 
"num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), @@ -213,7 +251,7 @@ def get_maac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> M return MAACConfig(**filtered) -def get_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> IACConfig: +def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACConfig: tr = cfg.get("iac") or {} if not isinstance(tr, dict): tr = {} @@ -233,9 +271,9 @@ def get_iac_args(cfg: Dict[str, Any], *, model_name: Optional[str] = None) -> IA "value_clip_range": _as_opt_float(tr.get("value_clip_range", 0.05), 0.05), "advantage_normalization": _as_bool(adv_norm, True), "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), - "temperature": _as_float(tr.get("temperature", 0.6), 0.6), - "top_p": _as_float(tr.get("top_p", 0.6), 0.6), - "top_k": _as_opt_int(tr.get("top_k", None), None), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), + "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, From a4a8e9789eef1466110eaf7762a4e5bd65753584 Mon Sep 17 00:00:00 2001 From: N!no Date: Sun, 15 Feb 2026 16:33:52 -0500 Subject: [PATCH 05/11] ud --- house_build/configs/house_build_iac_config.yaml | 6 +++--- house_build/configs/house_build_maac_config.yaml | 6 +++--- house_build/configs/house_build_magrpo_config.yaml | 6 +++--- str_build/configs/str_build_iac_config.yaml | 6 +++--- str_build/configs/str_build_maac_config.yaml | 6 +++--- str_build/configs/str_build_magrpo_config.yaml | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/house_build/configs/house_build_iac_config.yaml 
b/house_build/configs/house_build_iac_config.yaml index 3198e24..487ba5a 100644 --- a/house_build/configs/house_build_iac_config.yaml +++ b/house_build/configs/house_build_iac_config.yaml @@ -25,9 +25,9 @@ dataset: eval_split: "[8:]" output: - base_dir: output + base_dir: output_iac_house_build save_final_model: false - save_path: output/final_model + save_path: output_iac_house_build verbose: false external: @@ -65,7 +65,7 @@ wandb: project: house_build entity: OpenMLRL run_name: house_build_iac - dir: output + dir: output_iac_house_build tags: ["iac", "house_build"] prompt: diff --git a/house_build/configs/house_build_maac_config.yaml b/house_build/configs/house_build_maac_config.yaml index fbc1a27..6460ab6 100644 --- a/house_build/configs/house_build_maac_config.yaml +++ b/house_build/configs/house_build_maac_config.yaml @@ -25,9 +25,9 @@ dataset: eval_split: "[8:]" output: - base_dir: output + base_dir: output_maac_house_build save_final_model: false - save_path: output/final_model + save_path: output_maac_house_build verbose: false external: @@ -64,7 +64,7 @@ wandb: project: house_build entity: OpenMLRL run_name: house_build_maac - dir: output + dir: output_maac_house_build tags: ["maac", "house_build"] prompt: diff --git a/house_build/configs/house_build_magrpo_config.yaml b/house_build/configs/house_build_magrpo_config.yaml index df9ec93..aec8d08 100644 --- a/house_build/configs/house_build_magrpo_config.yaml +++ b/house_build/configs/house_build_magrpo_config.yaml @@ -21,9 +21,9 @@ dataset: eval_split: "[8:]" output: - base_dir: output + base_dir: output_magrpo_house_build save_final_model: false - save_path: output/final_model + save_path: output_magrpo_house_build verbose: false external: @@ -60,7 +60,7 @@ wandb: project: house_build entity: OpenMLRL run_name: house_build_magrpo - dir: output + dir: output_magrpo_house_build tags: ["magrpo", "house_build"] prompt: diff --git a/str_build/configs/str_build_iac_config.yaml 
b/str_build/configs/str_build_iac_config.yaml index 172e541..c7eeb71 100644 --- a/str_build/configs/str_build_iac_config.yaml +++ b/str_build/configs/str_build_iac_config.yaml @@ -27,9 +27,9 @@ dataset: local_z: 0 output: - base_dir: output + base_dir: output_iac_str_build save_final_model: false - save_path: output/final_model + save_path: output_iac_str_build verbose: false external: @@ -66,7 +66,7 @@ wandb: project: str_build entity: OpenMLRL run_name: str_build_iac - dir: output + dir: output_iac_str_build tags: ["iac", "str_build"] prompt: diff --git a/str_build/configs/str_build_maac_config.yaml b/str_build/configs/str_build_maac_config.yaml index e79a67b..6b0cfde 100644 --- a/str_build/configs/str_build_maac_config.yaml +++ b/str_build/configs/str_build_maac_config.yaml @@ -27,9 +27,9 @@ dataset: local_z: 0 output: - base_dir: output + base_dir: output_maac_str_build save_final_model: false - save_path: output/final_model + save_path: output_maac_str_build verbose: false external: @@ -65,7 +65,7 @@ wandb: project: str_build entity: OpenMLRL run_name: str_build_maac - dir: output + dir: output_maac_str_build tags: ["maac", "str_build"] prompt: diff --git a/str_build/configs/str_build_magrpo_config.yaml b/str_build/configs/str_build_magrpo_config.yaml index d945a4c..05ea9db 100644 --- a/str_build/configs/str_build_magrpo_config.yaml +++ b/str_build/configs/str_build_magrpo_config.yaml @@ -23,9 +23,9 @@ dataset: local_z: 0 output: - base_dir: output + base_dir: output_magrpo_str_build save_final_model: false - save_path: output/final_model + save_path: output_magrpo_str_build verbose: false external: @@ -61,7 +61,7 @@ wandb: project: str_build entity: OpenMLRL run_name: str_build_magrpo - dir: output + dir: output_magrpo_str_build tags: ["magrpo", "str_build"] prompt: From 79802621d04a2ac6176b4e2a059ee3766b65dcf1 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 00:08:15 -0500 Subject: [PATCH 06/11] ud --- 
house_build/configs/house_build_iac_config.yaml | 2 +- house_build/configs/house_build_maac_config.yaml | 2 +- house_build/configs/house_build_magrpo_config.yaml | 2 +- house_build/utils/trainer_args.py | 6 +++--- str_build/configs/str_build_iac_config.yaml | 2 +- str_build/configs/str_build_maac_config.yaml | 2 +- str_build/configs/str_build_magrpo_config.yaml | 2 +- str_build/utils/trainer_args.py | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/house_build/configs/house_build_iac_config.yaml b/house_build/configs/house_build_iac_config.yaml index 487ba5a..f884600 100644 --- a/house_build/configs/house_build_iac_config.yaml +++ b/house_build/configs/house_build_iac_config.yaml @@ -37,7 +37,7 @@ external: lim: 20 iac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 4 num_train_epochs: 150 diff --git a/house_build/configs/house_build_maac_config.yaml b/house_build/configs/house_build_maac_config.yaml index 6460ab6..ffe98fd 100644 --- a/house_build/configs/house_build_maac_config.yaml +++ b/house_build/configs/house_build_maac_config.yaml @@ -37,7 +37,7 @@ external: lim: 20 maac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 4 critic_type: v diff --git a/house_build/configs/house_build_magrpo_config.yaml b/house_build/configs/house_build_magrpo_config.yaml index aec8d08..9f926a3 100644 --- a/house_build/configs/house_build_magrpo_config.yaml +++ b/house_build/configs/house_build_magrpo_config.yaml @@ -33,7 +33,7 @@ external: lim: 20 magrpo: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 4 num_train_epochs: 20 diff --git a/house_build/utils/trainer_args.py b/house_build/utils/trainer_args.py index 969bbd9..b35d489 100644 --- a/house_build/utils/trainer_args.py +++ b/house_build/utils/trainer_args.py @@ -165,7 +165,7 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA } candidate.update( { - "parallel_training": 
str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", tr.get("gamma", 0.9)), 0.9), "joint_mode": joint_mode_str, @@ -225,7 +225,7 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), - "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), @@ -277,7 +277,7 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, - "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( diff --git a/str_build/configs/str_build_iac_config.yaml b/str_build/configs/str_build_iac_config.yaml index c7eeb71..d9883ff 100644 --- a/str_build/configs/str_build_iac_config.yaml +++ b/str_build/configs/str_build_iac_config.yaml @@ -38,7 +38,7 @@ external: previous_response: true iac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 4 num_train_epochs: 150 diff --git a/str_build/configs/str_build_maac_config.yaml 
b/str_build/configs/str_build_maac_config.yaml index 6b0cfde..d96d6fe 100644 --- a/str_build/configs/str_build_maac_config.yaml +++ b/str_build/configs/str_build_maac_config.yaml @@ -38,7 +38,7 @@ external: previous_response: true maac: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 4 critic_type: v diff --git a/str_build/configs/str_build_magrpo_config.yaml b/str_build/configs/str_build_magrpo_config.yaml index 05ea9db..a9433dd 100644 --- a/str_build/configs/str_build_magrpo_config.yaml +++ b/str_build/configs/str_build_magrpo_config.yaml @@ -34,7 +34,7 @@ external: previous_response: true magrpo: - parallel_training: auto + parallel_training: mp num_agents: 2 num_turns: 4 num_train_epochs: 20 diff --git a/str_build/utils/trainer_args.py b/str_build/utils/trainer_args.py index 4b9e9b5..58e8fd4 100644 --- a/str_build/utils/trainer_args.py +++ b/str_build/utils/trainer_args.py @@ -165,7 +165,7 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA } candidate.update( { - "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": joint_mode_str, @@ -225,7 +225,7 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), - "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), @@ -277,7 +277,7 @@ def 
get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, - "parallel_training": str(tr.get("parallel_training", "auto")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( From 6ae2eb6405c3df6f75e016ef9d8ebe98699138ce Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 10:10:24 -0500 Subject: [PATCH 07/11] ud --- house_build/configs/house_build_iac_config.yaml | 4 +++- house_build/configs/house_build_maac_config.yaml | 4 +++- house_build/configs/house_build_magrpo_config.yaml | 3 ++- house_build/utils/trainer_args.py | 6 +++--- str_build/configs/str_build_iac_config.yaml | 4 +++- str_build/configs/str_build_maac_config.yaml | 4 +++- str_build/configs/str_build_magrpo_config.yaml | 3 ++- str_build/utils/trainer_args.py | 6 +++--- 8 files changed, 22 insertions(+), 12 deletions(-) diff --git a/house_build/configs/house_build_iac_config.yaml b/house_build/configs/house_build_iac_config.yaml index f884600..f61f97a 100644 --- a/house_build/configs/house_build_iac_config.yaml +++ b/house_build/configs/house_build_iac_config.yaml @@ -37,7 +37,9 @@ external: lim: 20 iac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 4 num_train_epochs: 150 diff --git a/house_build/configs/house_build_maac_config.yaml b/house_build/configs/house_build_maac_config.yaml index ffe98fd..8821609 100644 --- a/house_build/configs/house_build_maac_config.yaml +++ b/house_build/configs/house_build_maac_config.yaml @@ -37,7 +37,9 @@ external: lim: 20 maac: - parallel_training: mp + parallel_training: none + 
agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 4 critic_type: v diff --git a/house_build/configs/house_build_magrpo_config.yaml b/house_build/configs/house_build_magrpo_config.yaml index 9f926a3..79dc721 100644 --- a/house_build/configs/house_build_magrpo_config.yaml +++ b/house_build/configs/house_build_magrpo_config.yaml @@ -33,7 +33,8 @@ external: lim: 20 magrpo: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] num_agents: 2 num_turns: 4 num_train_epochs: 20 diff --git a/house_build/utils/trainer_args.py b/house_build/utils/trainer_args.py index b35d489..a50e6a2 100644 --- a/house_build/utils/trainer_args.py +++ b/house_build/utils/trainer_args.py @@ -165,7 +165,7 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA } candidate.update( { - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", tr.get("gamma", 0.9)), 0.9), "joint_mode": joint_mode_str, @@ -225,7 +225,7 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), @@ -277,7 +277,7 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), 
"use_separate_critic": use_separate_critic, - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( diff --git a/str_build/configs/str_build_iac_config.yaml b/str_build/configs/str_build_iac_config.yaml index d9883ff..5c91f57 100644 --- a/str_build/configs/str_build_iac_config.yaml +++ b/str_build/configs/str_build_iac_config.yaml @@ -38,7 +38,9 @@ external: previous_response: true iac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 4 num_train_epochs: 150 diff --git a/str_build/configs/str_build_maac_config.yaml b/str_build/configs/str_build_maac_config.yaml index d96d6fe..88a9f6c 100644 --- a/str_build/configs/str_build_maac_config.yaml +++ b/str_build/configs/str_build_maac_config.yaml @@ -38,7 +38,9 @@ external: previous_response: true maac: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] + critic_devices: ["cuda:0"] num_agents: 2 num_turns: 4 critic_type: v diff --git a/str_build/configs/str_build_magrpo_config.yaml b/str_build/configs/str_build_magrpo_config.yaml index a9433dd..a17dafb 100644 --- a/str_build/configs/str_build_magrpo_config.yaml +++ b/str_build/configs/str_build_magrpo_config.yaml @@ -34,7 +34,8 @@ external: previous_response: true magrpo: - parallel_training: mp + parallel_training: none + agent_devices: ["cuda:0"] num_agents: 2 num_turns: 4 num_train_epochs: 20 diff --git a/str_build/utils/trainer_args.py b/str_build/utils/trainer_args.py index 58e8fd4..27a1e8e 100644 --- a/str_build/utils/trainer_args.py +++ b/str_build/utils/trainer_args.py @@ -165,7 +165,7 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA } 
candidate.update( { - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": joint_mode_str, @@ -225,7 +225,7 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "discount": _as_float(tr.get("discount", 0.9), 0.9), @@ -277,7 +277,7 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, - "parallel_training": str(tr.get("parallel_training", "mp")).strip().lower(), + "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), "agent_devices": _as_device_spec(tr.get("agent_devices", None)), "critic_devices": _as_device_spec(tr.get("critic_devices", None)), "critic_value_head_hidden_dim": _as_opt_int( From 17d8f795c5bfbe5357052794775d583a930bbc85 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 11:01:03 -0500 Subject: [PATCH 08/11] ud --- house_build/utils/trainer_args.py | 73 ++++++++++++++-------------- str_build/utils/trainer_args.py | 81 +++++++++++++++---------------- 2 files changed, 76 insertions(+), 78 deletions(-) diff --git a/house_build/utils/trainer_args.py b/house_build/utils/trainer_args.py index a50e6a2..2a89592 100644 --- 
a/house_build/utils/trainer_args.py +++ b/house_build/utils/trainer_args.py @@ -143,7 +143,7 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA if not isinstance(tr, dict): tr = {} - lr_val = tr.get("agent_learning_rate", 3e-5) + lr_val = tr.get("agent_learning_rate", 1e-5) joint_mode = tr.get("joint_mode", tr.get("joint_action_mode", None)) joint_mode_str = str(joint_mode or "aligned").strip().lower() @@ -153,37 +153,36 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA joint_mode_str = "cross" candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), - "num_train_epochs": _as_int(tr.get("num_train_epochs", 3), 3), - "agent_learning_rate": _as_float(lr_val, 3e-5), - "logging_steps": _as_int(tr.get("logging_steps", 50), 50), - "num_generations": _as_int(tr.get("num_generations", 4), 4), + "num_turns": _as_int(tr.get("num_turns", 4), 4), + "num_train_epochs": _as_int(tr.get("num_train_epochs", 20), 20), + "agent_learning_rate": _as_float(lr_val, 1e-5), + "logging_steps": _as_int(tr.get("logging_steps", 5), 5), + "num_generations": _as_int(tr.get("num_generations", 2), 2), "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), - "temperature": _as_float(sampling_cfg.get("temperature"), 0.2), - "top_p": _as_float(sampling_cfg.get("top_p"), 0.95), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), } candidate.update( { "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), "discount": _as_float(tr.get("discount", tr.get("gamma", 0.9)), 0.9), "joint_mode": joint_mode_str, + "early_termination_threshold": _as_opt_float( + tr.get("early_termination_threshold", -0.1), -0.1 + ), } ) - if 
"early_termination_threshold" in tr: - candidate["early_termination_threshold"] = _as_opt_float( - tr.get("early_termination_threshold", None), None - ) candidate.update( { - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 2), 2), - "train_batch_size": _as_opt_int(tr.get("train_batch_size", None), None), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 1), 1), + "train_batch_size": _as_opt_int(tr.get("train_batch_size", 1), 1), "advantage_normalization": _as_bool( tr.get("advantage_normalization", True), True ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), - "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), + "eval_interval": _as_int(tr.get("eval_interval", 2), 2), + "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), } ) @@ -210,33 +209,33 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), - "num_train_epochs": _as_int(tr.get("num_train_epochs", 40), 40), + "num_turns": _as_int(tr.get("num_turns", 4), 4), + "num_train_epochs": _as_int(tr.get("num_train_epochs", 150), 150), "agent_learning_rate": _as_float(tr.get("agent_learning_rate", 5e-6), 5e-6), "critic_learning_rate": _as_float( tr.get("critic_learning_rate", 5e-6), 5e-6 ), - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 8), 8), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 1), 1), "value_loss_coef": _as_float(tr.get("value_loss_coef", 0.6), 0.6), "advantage_normalization": _as_bool(adv_norm, True), - "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), + "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), "top_k": 
_as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), - "critic_devices": _as_device_spec(tr.get("critic_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), + "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "discount": _as_float(tr.get("discount", 0.9), 0.9), "critic_type": str(tr.get("critic_type", "v")), "early_termination_threshold": _as_opt_float( - tr.get("early_termination_threshold", None), None + tr.get("early_termination_threshold", 0.0), 0.0 ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), - "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), + "eval_interval": _as_int(tr.get("eval_interval", 10), 10), + "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), - "logging_steps": _as_int(tr.get("logging_steps", 1), 1), + "logging_steps": _as_int(tr.get("logging_steps", 40), 40), } try: @@ -260,17 +259,17 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), - "num_train_epochs": _as_int(tr.get("num_train_epochs", 40), 40), + "num_turns": _as_int(tr.get("num_turns", 4), 4), + "num_train_epochs": _as_int(tr.get("num_train_epochs", 150), 150), "agent_learning_rate": _as_float(tr.get("agent_learning_rate", 5e-6), 5e-6), "critic_learning_rate": _as_opt_float( tr.get("critic_learning_rate", 5e-6), 5e-6 ), - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 8), 8), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 1), 1), "value_loss_coef": 
_as_float(tr.get("value_loss_coef", 0.6), 0.6), "value_clip_range": _as_opt_float(tr.get("value_clip_range", 0.05), 0.05), "advantage_normalization": _as_bool(adv_norm, True), - "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), + "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), @@ -278,20 +277,20 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), - "critic_devices": _as_device_spec(tr.get("critic_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), + "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "critic_value_head_hidden_dim": _as_opt_int( tr.get("critic_value_head_hidden_dim", None), None ), "value_head_hidden_dim": _as_opt_int(tr.get("value_head_hidden_dim", None), None), "discount": _as_float(tr.get("discount", 0.9), 0.9), "early_termination_threshold": _as_opt_float( - tr.get("early_termination_threshold", None), None + tr.get("early_termination_threshold", 0.0), 0.0 ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), - "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), + "eval_interval": _as_int(tr.get("eval_interval", 10), 10), + "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), - "logging_steps": _as_int(tr.get("logging_steps", 1), 1), + "logging_steps": _as_int(tr.get("logging_steps", 40), 40), } try: diff --git a/str_build/utils/trainer_args.py b/str_build/utils/trainer_args.py index 27a1e8e..438d8ee 100644 --- 
a/str_build/utils/trainer_args.py +++ b/str_build/utils/trainer_args.py @@ -143,7 +143,7 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA if not isinstance(tr, dict): tr = {} - lr_val = tr.get("agent_learning_rate", 3e-5) + lr_val = tr.get("agent_learning_rate", 5e-6) joint_mode = tr.get("joint_mode", tr.get("joint_action_mode", None)) joint_mode_str = str(joint_mode or "aligned").strip().lower() @@ -153,37 +153,36 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA joint_mode_str = "cross" candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), - "num_train_epochs": _as_int(tr.get("num_train_epochs", 3), 3), - "agent_learning_rate": _as_float(lr_val, 3e-5), - "logging_steps": _as_int(tr.get("logging_steps", 50), 50), - "num_generations": _as_int(tr.get("num_generations", 4), 4), + "num_turns": _as_int(tr.get("num_turns", 4), 4), + "num_train_epochs": _as_int(tr.get("num_train_epochs", 20), 20), + "agent_learning_rate": _as_float(lr_val, 5e-6), + "logging_steps": _as_int(tr.get("logging_steps", 1), 1), + "num_generations": _as_int(tr.get("num_generations", 2), 2), "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), - "temperature": _as_float(sampling_cfg.get("temperature"), 0.2), - "top_p": _as_float(sampling_cfg.get("top_p"), 0.95), + "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), + "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), } candidate.update( { "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), "discount": _as_float(tr.get("discount", 0.9), 0.9), "joint_mode": joint_mode_str, + "early_termination_threshold": _as_opt_float( + tr.get("early_termination_threshold", -0.1), -0.1 + ), } ) - if "early_termination_threshold" in tr: - 
candidate["early_termination_threshold"] = _as_opt_float( - tr.get("early_termination_threshold", None), None - ) candidate.update( { - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 2), 2), - "train_batch_size": _as_opt_int(tr.get("train_batch_size", None), None), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 1), 1), + "train_batch_size": _as_opt_int(tr.get("train_batch_size", 1), 1), "advantage_normalization": _as_bool( tr.get("advantage_normalization", True), True ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), - "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), + "eval_interval": _as_int(tr.get("eval_interval", 2), 2), + "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), } ) @@ -210,33 +209,33 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), - "num_train_epochs": _as_int(tr.get("num_train_epochs", 40), 40), - "agent_learning_rate": _as_float(tr.get("agent_learning_rate", 5e-6), 5e-6), + "num_turns": _as_int(tr.get("num_turns", 4), 4), + "num_train_epochs": _as_int(tr.get("num_train_epochs", 150), 150), + "agent_learning_rate": _as_float(tr.get("agent_learning_rate", 2.5e-6), 2.5e-6), "critic_learning_rate": _as_float( - tr.get("critic_learning_rate", 5e-6), 5e-6 + tr.get("critic_learning_rate", 2.5e-6), 2.5e-6 ), - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 8), 8), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 1), 1), "value_loss_coef": _as_float(tr.get("value_loss_coef", 0.6), 0.6), "advantage_normalization": _as_bool(adv_norm, True), - "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), + "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), "temperature": 
_as_float(sampling_cfg.get("temperature"), 0.6), "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), "num_agents": _as_int(tr.get("num_agents", 2), 2), "num_generations": _as_int(tr.get("num_generations", 1), 1), "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), - "critic_devices": _as_device_spec(tr.get("critic_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), + "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "discount": _as_float(tr.get("discount", 0.9), 0.9), "critic_type": str(tr.get("critic_type", "v")), "early_termination_threshold": _as_opt_float( - tr.get("early_termination_threshold", None), None + tr.get("early_termination_threshold", -0.1), -0.1 ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), - "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), + "eval_interval": _as_int(tr.get("eval_interval", 10), 10), + "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), - "logging_steps": _as_int(tr.get("logging_steps", 1), 1), + "logging_steps": _as_int(tr.get("logging_steps", 20), 20), } try: @@ -260,17 +259,17 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) candidate = { - "num_turns": _as_int(tr.get("num_turns", 1), 1), - "num_train_epochs": _as_int(tr.get("num_train_epochs", 40), 40), - "agent_learning_rate": _as_float(tr.get("agent_learning_rate", 5e-6), 5e-6), + "num_turns": _as_int(tr.get("num_turns", 4), 4), + "num_train_epochs": _as_int(tr.get("num_train_epochs", 150), 150), + "agent_learning_rate": _as_float(tr.get("agent_learning_rate", 2.5e-6), 2.5e-6), "critic_learning_rate": _as_opt_float( - tr.get("critic_learning_rate", 
5e-6), 5e-6 + tr.get("critic_learning_rate", 2.5e-6), 2.5e-6 ), - "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 8), 8), + "rollout_buffer_size": _as_int(tr.get("rollout_buffer_size", 1), 1), "value_loss_coef": _as_float(tr.get("value_loss_coef", 0.6), 0.6), "value_clip_range": _as_opt_float(tr.get("value_clip_range", 0.05), 0.05), "advantage_normalization": _as_bool(adv_norm, True), - "max_new_tokens": _as_int(tr.get("max_new_tokens", 256), 256), + "max_new_tokens": _as_int(tr.get("max_new_tokens", 512), 512), "temperature": _as_float(sampling_cfg.get("temperature"), 0.6), "top_p": _as_float(sampling_cfg.get("top_p"), 0.6), "top_k": _as_opt_int(sampling_cfg.get("top_k"), None), @@ -278,20 +277,20 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon "num_generations": _as_int(tr.get("num_generations", 1), 1), "use_separate_critic": use_separate_critic, "parallel_training": str(tr.get("parallel_training", "none")).strip().lower(), - "agent_devices": _as_device_spec(tr.get("agent_devices", None)), - "critic_devices": _as_device_spec(tr.get("critic_devices", None)), + "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), + "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "critic_value_head_hidden_dim": _as_opt_int( tr.get("critic_value_head_hidden_dim", None), None ), "value_head_hidden_dim": _as_opt_int(tr.get("value_head_hidden_dim", None), None), "discount": _as_float(tr.get("discount", 0.9), 0.9), "early_termination_threshold": _as_opt_float( - tr.get("early_termination_threshold", None), None + tr.get("early_termination_threshold", -0.1), -0.1 ), - "eval_interval": _as_int(tr.get("eval_interval", 16), 16), - "eval_num_samples": _as_int(tr.get("eval_num_samples", 4), 4), + "eval_interval": _as_int(tr.get("eval_interval", 10), 10), + "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), - "logging_steps": 
_as_int(tr.get("logging_steps", 1), 1), + "logging_steps": _as_int(tr.get("logging_steps", 20), 20), } try: From 5a0afc3562610a4d05eee4db308c84f8241b1969 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 14:10:10 -0500 Subject: [PATCH 09/11] ud --- .../configs/house_build_iac_config.yaml | 1 + .../configs/house_build_maac_config.yaml | 1 + .../configs/house_build_magrpo_config.yaml | 1 + house_build/utils/trainer_args.py | 18 ++++++++++++++++++ str_build/configs/str_build_iac_config.yaml | 1 + str_build/configs/str_build_maac_config.yaml | 1 + str_build/configs/str_build_magrpo_config.yaml | 1 + str_build/utils/trainer_args.py | 18 ++++++++++++++++++ 8 files changed, 42 insertions(+) diff --git a/house_build/configs/house_build_iac_config.yaml b/house_build/configs/house_build_iac_config.yaml index f61f97a..82f53ef 100644 --- a/house_build/configs/house_build_iac_config.yaml +++ b/house_build/configs/house_build_iac_config.yaml @@ -35,6 +35,7 @@ external: original_prompt: true previous_response: true lim: 20 + external_prompt_passthrough: false iac: parallel_training: none diff --git a/house_build/configs/house_build_maac_config.yaml b/house_build/configs/house_build_maac_config.yaml index 8821609..627607b 100644 --- a/house_build/configs/house_build_maac_config.yaml +++ b/house_build/configs/house_build_maac_config.yaml @@ -35,6 +35,7 @@ external: original_prompt: true previous_response: true lim: 20 + external_prompt_passthrough: false maac: parallel_training: none diff --git a/house_build/configs/house_build_magrpo_config.yaml b/house_build/configs/house_build_magrpo_config.yaml index 79dc721..87e367a 100644 --- a/house_build/configs/house_build_magrpo_config.yaml +++ b/house_build/configs/house_build_magrpo_config.yaml @@ -31,6 +31,7 @@ external: original_prompt: true previous_response: true lim: 20 + external_prompt_passthrough: false magrpo: parallel_training: none diff --git a/house_build/utils/trainer_args.py b/house_build/utils/trainer_args.py 
index 2a89592..8ff6a66 100644 --- a/house_build/utils/trainer_args.py +++ b/house_build/utils/trainer_args.py @@ -142,6 +142,9 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA tr = cfg.get("magrpo") or {} if not isinstance(tr, dict): tr = {} + ext = cfg.get("external") or {} + if not isinstance(ext, dict): + ext = {} lr_val = tr.get("agent_learning_rate", 1e-5) @@ -184,6 +187,9 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA "eval_interval": _as_int(tr.get("eval_interval", 2), 2), "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), } ) @@ -205,6 +211,9 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC tr = cfg.get("maac") or {} if not isinstance(tr, dict): tr = {} + ext = cfg.get("external") or {} + if not isinstance(ext, dict): + ext = {} adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) @@ -228,6 +237,9 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "discount": _as_float(tr.get("discount", 0.9), 0.9), + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), "critic_type": str(tr.get("critic_type", "v")), "early_termination_threshold": _as_opt_float( tr.get("early_termination_threshold", 0.0), 0.0 @@ -254,6 +266,9 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon tr = cfg.get("iac") or {} if not isinstance(tr, dict): tr = {} + ext = cfg.get("external") or {} + if not isinstance(ext, dict): + ext = {} use_separate_critic = _as_bool(tr.get("use_separate_critic", True), True) adv_norm = 
tr.get("advantage_normalization", tr.get("normalize_advantage", True)) @@ -284,6 +299,9 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon ), "value_head_hidden_dim": _as_opt_int(tr.get("value_head_hidden_dim", None), None), "discount": _as_float(tr.get("discount", 0.9), 0.9), + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), "early_termination_threshold": _as_opt_float( tr.get("early_termination_threshold", 0.0), 0.0 ), diff --git a/str_build/configs/str_build_iac_config.yaml b/str_build/configs/str_build_iac_config.yaml index 5c91f57..a7263ed 100644 --- a/str_build/configs/str_build_iac_config.yaml +++ b/str_build/configs/str_build_iac_config.yaml @@ -36,6 +36,7 @@ external: mode: position_feedback original_prompt: true previous_response: true + external_prompt_passthrough: false iac: parallel_training: none diff --git a/str_build/configs/str_build_maac_config.yaml b/str_build/configs/str_build_maac_config.yaml index 88a9f6c..63cd447 100644 --- a/str_build/configs/str_build_maac_config.yaml +++ b/str_build/configs/str_build_maac_config.yaml @@ -36,6 +36,7 @@ external: mode: position_feedback original_prompt: true previous_response: true + external_prompt_passthrough: false maac: parallel_training: none diff --git a/str_build/configs/str_build_magrpo_config.yaml b/str_build/configs/str_build_magrpo_config.yaml index a17dafb..1a91427 100644 --- a/str_build/configs/str_build_magrpo_config.yaml +++ b/str_build/configs/str_build_magrpo_config.yaml @@ -32,6 +32,7 @@ external: mode: position_feedback original_prompt: true previous_response: true + external_prompt_passthrough: false magrpo: parallel_training: none diff --git a/str_build/utils/trainer_args.py b/str_build/utils/trainer_args.py index 438d8ee..901b9d2 100644 --- a/str_build/utils/trainer_args.py +++ b/str_build/utils/trainer_args.py @@ -142,6 +142,9 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, 
Any]) -> MA tr = cfg.get("magrpo") or {} if not isinstance(tr, dict): tr = {} + ext = cfg.get("external") or {} + if not isinstance(ext, dict): + ext = {} lr_val = tr.get("agent_learning_rate", 5e-6) @@ -184,6 +187,9 @@ def get_trainer_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MA "eval_interval": _as_int(tr.get("eval_interval", 2), 2), "eval_num_samples": _as_int(tr.get("eval_num_samples", 2), 2), "eval_batch_size": _as_int(tr.get("eval_batch_size", 1), 1), + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), } ) @@ -205,6 +211,9 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC tr = cfg.get("maac") or {} if not isinstance(tr, dict): tr = {} + ext = cfg.get("external") or {} + if not isinstance(ext, dict): + ext = {} adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) @@ -228,6 +237,9 @@ def get_maac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> MAACC "agent_devices": _as_device_spec(tr.get("agent_devices", ["cuda:0"])), "critic_devices": _as_device_spec(tr.get("critic_devices", ["cuda:0"])), "discount": _as_float(tr.get("discount", 0.9), 0.9), + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), "critic_type": str(tr.get("critic_type", "v")), "early_termination_threshold": _as_opt_float( tr.get("early_termination_threshold", -0.1), -0.1 @@ -254,6 +266,9 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon tr = cfg.get("iac") or {} if not isinstance(tr, dict): tr = {} + ext = cfg.get("external") or {} + if not isinstance(ext, dict): + ext = {} use_separate_critic = _as_bool(tr.get("use_separate_critic", True), True) adv_norm = tr.get("advantage_normalization", tr.get("normalize_advantage", True)) @@ -284,6 +299,9 @@ def get_iac_args(cfg: Dict[str, Any], *, sampling_cfg: Dict[str, Any]) -> IACCon ), "value_head_hidden_dim": 
_as_opt_int(tr.get("value_head_hidden_dim", None), None), "discount": _as_float(tr.get("discount", 0.9), 0.9), + "external_prompt_passthrough": _as_bool( + ext.get("external_prompt_passthrough", False), False + ), "early_termination_threshold": _as_opt_float( tr.get("early_termination_threshold", -0.1), -0.1 ), From e4036fc683b3e9f74d7c385f7d57c7edab786148 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 17:01:39 -0500 Subject: [PATCH 10/11] ud --- .../configs/house_build_iac_config.yaml | 62 ++++++++++++------- .../configs/house_build_maac_config.yaml | 60 +++++++++++------- .../configs/house_build_magrpo_config.yaml | 55 ++++++++++------ str_build/configs/str_build_iac_config.yaml | 50 +++++++++------ str_build/configs/str_build_maac_config.yaml | 48 ++++++++------ .../configs/str_build_magrpo_config.yaml | 39 +++++++----- 6 files changed, 195 insertions(+), 119 deletions(-) diff --git a/house_build/configs/house_build_iac_config.yaml b/house_build/configs/house_build_iac_config.yaml index 82f53ef..05ec2ed 100644 --- a/house_build/configs/house_build_iac_config.yaml +++ b/house_build/configs/house_build_iac_config.yaml @@ -10,7 +10,7 @@ agent_model: agents: null critic_model: - name: "Qwen/Qwen3-4B-Instruct-2507" + name: Qwen/Qwen3-4B-Instruct-2507 type: qwen max_length: 2048 dtype: bf16 @@ -20,15 +20,42 @@ critics: null dataset: name: house_build type: house_build + train_split: '[:8]' + eval_split: '[8:]' json_path: ../dataset/data.json - train_split: "[:8]" - eval_split: "[8:]" + +prompt: + use_chat_template: true + +task: + player: + hp: 5 + spider: + atk_high: 3 + atk_low: 1 + num: 3 + max_commands: 600 + limited_resource: true + block_agent1: + - white_concrete + - obsidian + - stone_stairs + - stone_bricks + - planks + - air + block_agent2: + - white_concrete + - obsidian + - stone_stairs + - stone_bricks + - planks + - air output: base_dir: output_iac_house_build + verbose: false save_final_model: false save_path: output_iac_house_build - 
verbose: false external: mode: score_feedback @@ -39,10 +66,13 @@ external: iac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 4 + use_separate_critic: true num_train_epochs: 150 agent_learning_rate: 5e-6 critic_learning_rate: 5e-6 @@ -51,7 +81,6 @@ iac: rollout_buffer_size: 1 train_batch_size: 1 max_new_tokens: 512 - use_separate_critic: true discount: 0.9 early_termination_threshold: 0.0 eval_interval: 10 @@ -69,19 +98,6 @@ wandb: entity: OpenMLRL run_name: house_build_iac dir: output_iac_house_build - tags: ["iac", "house_build"] - -prompt: - use_chat_template: true - -task: - block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air] - block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air] - max_commands: 600 - limited_resource: true - player: - hp: 5 - spider: - num: 3 - atk_low: 1 - atk_high: 3 + tags: + - iac + - house_build diff --git a/house_build/configs/house_build_maac_config.yaml b/house_build/configs/house_build_maac_config.yaml index 627607b..b97e05a 100644 --- a/house_build/configs/house_build_maac_config.yaml +++ b/house_build/configs/house_build_maac_config.yaml @@ -10,7 +10,7 @@ agent_model: agents: null critic_model: - name: "Qwen/Qwen3-4B-Instruct-2507" + name: Qwen/Qwen3-4B-Instruct-2507 type: qwen max_length: 2048 dtype: bf16 @@ -20,15 +20,42 @@ critics: null dataset: name: house_build type: house_build + train_split: '[:8]' + eval_split: '[8:]' json_path: ../dataset/data.json - train_split: "[:8]" - eval_split: "[8:]" + +prompt: + use_chat_template: true + +task: + player: + hp: 5 + spider: + atk_high: 3 + atk_low: 1 + num: 3 + max_commands: 600 + limited_resource: true + block_agent1: + - white_concrete + - obsidian + - stone_stairs + - stone_bricks + - planks + - air + block_agent2: + - white_concrete + - obsidian + - stone_stairs + - stone_bricks + - planks + - air output: 
base_dir: output_maac_house_build + verbose: false save_final_model: false save_path: output_maac_house_build - verbose: false external: mode: score_feedback @@ -39,8 +66,10 @@ external: maac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 4 critic_type: v @@ -68,19 +97,6 @@ wandb: entity: OpenMLRL run_name: house_build_maac dir: output_maac_house_build - tags: ["maac", "house_build"] - -prompt: - use_chat_template: true - -task: - block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air] - block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air] - max_commands: 600 - limited_resource: true - player: - hp: 5 - spider: - num: 3 - atk_low: 1 - atk_high: 3 + tags: + - maac + - house_build diff --git a/house_build/configs/house_build_magrpo_config.yaml b/house_build/configs/house_build_magrpo_config.yaml index 87e367a..e1f2e4b 100644 --- a/house_build/configs/house_build_magrpo_config.yaml +++ b/house_build/configs/house_build_magrpo_config.yaml @@ -16,15 +16,42 @@ critics: null dataset: name: house_build type: house_build + train_split: '[:8]' + eval_split: '[8:]' json_path: ../dataset/data.json - train_split: "[:8]" - eval_split: "[8:]" + +prompt: + use_chat_template: true + +task: + player: + hp: 5 + spider: + atk_high: 3 + atk_low: 1 + num: 3 + max_commands: 600 + limited_resource: true + block_agent1: + - white_concrete + - obsidian + - stone_stairs + - stone_bricks + - planks + - air + block_agent2: + - white_concrete + - obsidian + - stone_stairs + - stone_bricks + - planks + - air output: base_dir: output_magrpo_house_build + verbose: false save_final_model: false save_path: output_magrpo_house_build - verbose: false external: mode: score_feedback @@ -35,7 +62,8 @@ external: magrpo: parallel_training: none - agent_devices: ["cuda:0"] + agent_devices: + - cuda:0 num_agents: 2 num_turns: 4 
num_train_epochs: 20 @@ -63,19 +91,6 @@ wandb: entity: OpenMLRL run_name: house_build_magrpo dir: output_magrpo_house_build - tags: ["magrpo", "house_build"] - -prompt: - use_chat_template: true - -task: - block_agent1: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air] - block_agent2: [white_concrete, obsidian, stone_stairs, stone_bricks, planks, air] - max_commands: 600 - limited_resource: true - player: - hp: 5 - spider: - num: 3 - atk_low: 1 - atk_high: 3 + tags: + - magrpo + - house_build diff --git a/str_build/configs/str_build_iac_config.yaml b/str_build/configs/str_build_iac_config.yaml index a7263ed..b197cc8 100644 --- a/str_build/configs/str_build_iac_config.yaml +++ b/str_build/configs/str_build_iac_config.yaml @@ -10,7 +10,7 @@ agent_model: agents: null critic_model: - name: "Qwen/Qwen3-4B-Instruct-2507" + name: Qwen/Qwen3-4B-Instruct-2507 type: qwen max_length: 2048 dtype: bf16 @@ -20,17 +20,32 @@ critics: null dataset: name: str_build type: str_build + train_split: '[:8]' + eval_split: '[8:]' csv_path: ../dataset/data.csv - train_split: "[:8]" - eval_split: "[8:]" - spacing: 2 local_z: 0 + spacing: 2 + +prompt: + use_chat_template: true + provide_graph: false + +task: + max_commands: 300 + block_agent1: + - oak_planks + - stone + - air + block_agent2: + - white_concrete + - obsidian + - air output: base_dir: output_iac_str_build + verbose: false save_final_model: false save_path: output_iac_str_build - verbose: false external: mode: position_feedback @@ -40,19 +55,21 @@ external: iac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 4 + use_separate_critic: true num_train_epochs: 150 - agent_learning_rate: 2.5e-6 - critic_learning_rate: 2.5e-6 + agent_learning_rate: 2.5e-06 + critic_learning_rate: 2.5e-06 value_loss_coef: 0.6 value_clip_range: 0.05 rollout_buffer_size: 1 train_batch_size: 1 max_new_tokens: 512 - 
use_separate_critic: true discount: 0.9 early_termination_threshold: -0.1 eval_interval: 10 @@ -70,13 +87,6 @@ wandb: entity: OpenMLRL run_name: str_build_iac dir: output_iac_str_build - tags: ["iac", "str_build"] - -prompt: - provide_graph: false - use_chat_template: true - -task: - block_agent1: [oak_planks, stone, air] - block_agent2: [white_concrete, obsidian, air] - max_commands: 300 + tags: + - iac + - str_build diff --git a/str_build/configs/str_build_maac_config.yaml b/str_build/configs/str_build_maac_config.yaml index 63cd447..fd44b13 100644 --- a/str_build/configs/str_build_maac_config.yaml +++ b/str_build/configs/str_build_maac_config.yaml @@ -10,7 +10,7 @@ agent_model: agents: null critic_model: - name: "Qwen/Qwen3-4B-Instruct-2507" + name: Qwen/Qwen3-4B-Instruct-2507 type: qwen max_length: 2048 dtype: bf16 @@ -20,17 +20,32 @@ critics: null dataset: name: str_build type: str_build + train_split: '[:8]' + eval_split: '[8:]' csv_path: ../dataset/data.csv - train_split: "[:8]" - eval_split: "[8:]" - spacing: 2 local_z: 0 + spacing: 2 + +prompt: + use_chat_template: true + provide_graph: false + +task: + max_commands: 300 + block_agent1: + - oak_planks + - stone + - air + block_agent2: + - white_concrete + - obsidian + - air output: base_dir: output_maac_str_build + verbose: false save_final_model: false save_path: output_maac_str_build - verbose: false external: mode: position_feedback @@ -40,14 +55,16 @@ external: maac: parallel_training: none - agent_devices: ["cuda:0"] - critic_devices: ["cuda:0"] + agent_devices: + - cuda:0 + critic_devices: + - cuda:0 num_agents: 2 num_turns: 4 critic_type: v num_train_epochs: 150 - agent_learning_rate: 2.5e-6 - critic_learning_rate: 2.5e-6 + agent_learning_rate: 2.5e-06 + critic_learning_rate: 2.5e-06 value_loss_coef: 0.6 rollout_buffer_size: 1 train_batch_size: 1 @@ -69,13 +86,6 @@ wandb: entity: OpenMLRL run_name: str_build_maac dir: output_maac_str_build - tags: ["maac", "str_build"] - -prompt: - provide_graph: 
false - use_chat_template: true - -task: - block_agent1: [oak_planks, stone, air] - block_agent2: [white_concrete, obsidian, air] - max_commands: 300 + tags: + - maac + - str_build diff --git a/str_build/configs/str_build_magrpo_config.yaml b/str_build/configs/str_build_magrpo_config.yaml index 1a91427..b6e25cc 100644 --- a/str_build/configs/str_build_magrpo_config.yaml +++ b/str_build/configs/str_build_magrpo_config.yaml @@ -16,17 +16,32 @@ critics: null dataset: name: str_build type: str_build + train_split: '[:8]' + eval_split: '[8:]' csv_path: ../dataset/data.csv - train_split: "[:8]" - eval_split: "[8:]" - spacing: 2 local_z: 0 + spacing: 2 + +prompt: + use_chat_template: true + provide_graph: false + +task: + max_commands: 300 + block_agent1: + - oak_planks + - stone + - air + block_agent2: + - white_concrete + - obsidian + - air output: base_dir: output_magrpo_str_build + verbose: false save_final_model: false save_path: output_magrpo_str_build - verbose: false external: mode: position_feedback @@ -36,7 +51,8 @@ external: magrpo: parallel_training: none - agent_devices: ["cuda:0"] + agent_devices: + - cuda:0 num_agents: 2 num_turns: 4 num_train_epochs: 20 @@ -64,13 +80,6 @@ wandb: entity: OpenMLRL run_name: str_build_magrpo dir: output_magrpo_str_build - tags: ["magrpo", "str_build"] - -prompt: - provide_graph: false - use_chat_template: true - -task: - block_agent1: [oak_planks, stone, air] - block_agent2: [white_concrete, obsidian, air] - max_commands: 300 + tags: + - magrpo + - str_build From 0110baad97027967af977c8aa9f6f0e23359bbd3 Mon Sep 17 00:00:00 2001 From: N!no Date: Mon, 16 Feb 2026 17:09:17 -0500 Subject: [PATCH 11/11] ud --- house_build/train/train_magrpo.py | 22 ++++++---------------- str_build/train/train_magrpo.py | 22 ++++++---------------- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/house_build/train/train_magrpo.py b/house_build/train/train_magrpo.py index 7204b8a..919e08b 100644 --- a/house_build/train/train_magrpo.py 
+++ b/house_build/train/train_magrpo.py @@ -22,7 +22,7 @@ sys.path.insert(0, COMLRL_ROOT) from datasets import Dataset # type: ignore -from transformers import AutoModelForCausalLM, AutoTokenizer # type: ignore +from transformers import AutoTokenizer # type: ignore import torch # type: ignore from comlrl.trainers.reinforce import MAGRPOTrainer # type: ignore @@ -426,11 +426,7 @@ def main() -> int: ): raise ValueError("agents must be a list of model names.") agent_names = [str(x) for x in agent_names] - model_kwargs: Dict[str, Any] = {} - dtype = _map_dtype(model_cfg.get("dtype") or model_cfg.get("torch_dtype")) - if dtype is not None: - model_kwargs["torch_dtype"] = dtype tokenizer_source = agent_names[0] if agent_names else model_name if not tokenizer_source: @@ -444,16 +440,6 @@ def main() -> int: tok.pad_token = tok.eos_token tokenizer = tokenizers[0] - agents = [] - if agent_names: - for name in agent_names: - agent = AutoModelForCausalLM.from_pretrained(name, **model_kwargs) - agents.append(agent) - else: - for _ in range(num_agents): - agent = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs) - agents.append(agent) - sampling_cfg = get_agent_sampling_config(cfg) magrpo_args = get_trainer_args(cfg, sampling_cfg=sampling_cfg) formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer) @@ -534,8 +520,12 @@ def main() -> int: trainer_kwargs: Dict[str, Any] = { "agent_model": model_name or None, - "agents": agents, + "agents": agent_names, "num_agents": num_agents, + "model_config": { + "torch_dtype": dtype, + "special_tokens": model_cfg.get("special_tokens", {}), + }, "reward_func": reward_func, "formatters": formatters, "args": magrpo_args, diff --git a/str_build/train/train_magrpo.py b/str_build/train/train_magrpo.py index 7df1ab7..186bc43 100644 --- a/str_build/train/train_magrpo.py +++ b/str_build/train/train_magrpo.py @@ -21,7 +21,7 @@ sys.path.insert(0, COMLRL_ROOT) from datasets import Dataset # type: ignore -from 
transformers import AutoModelForCausalLM, AutoTokenizer # type: ignore +from transformers import AutoTokenizer # type: ignore import torch # type: ignore from comlrl.trainers.reinforce import MAGRPOTrainer # type: ignore @@ -293,11 +293,7 @@ def main() -> int: ): raise ValueError("agents must be a list of model names.") agent_names = [str(x) for x in agent_names] - model_kwargs: Dict[str, Any] = {} - dtype = _map_dtype(model_cfg.get("dtype") or model_cfg.get("torch_dtype")) - if dtype is not None: - model_kwargs["torch_dtype"] = dtype tokenizer_source = agent_names[0] if agent_names else model_name if not tokenizer_source: @@ -311,16 +307,6 @@ def main() -> int: tok.pad_token = tok.eos_token tokenizer = tokenizers[0] - agents = [] - if agent_names: - for name in agent_names: - agent = AutoModelForCausalLM.from_pretrained(name, **model_kwargs) - agents.append(agent) - else: - for _ in range(num_agents): - agent = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs) - agents.append(agent) - sampling_cfg = get_agent_sampling_config(cfg) magrpo_args = get_trainer_args(cfg, sampling_cfg=sampling_cfg) formatters = _build_formatters(cfg, num_agents=num_agents, tokenizer=tokenizer) @@ -401,8 +387,12 @@ def main() -> int: trainer_kwargs: Dict[str, Any] = { "agent_model": model_name or None, - "agents": agents, + "agents": agent_names, "num_agents": num_agents, + "model_config": { + "torch_dtype": dtype, + "special_tokens": model_cfg.get("special_tokens", {}), + }, "reward_func": reward_func, "formatters": formatters, "args": magrpo_args,