diff --git a/src/art/rewards/__init__.py b/src/art/rewards/__init__.py index 02c366cee..da14e5769 100644 --- a/src/art/rewards/__init__.py +++ b/src/art/rewards/__init__.py @@ -1,3 +1,3 @@ -from .ruler import ruler, ruler_score_group +from .ruler import DEFAULT_RUBRIC, ruler, ruler_score_group -__all__ = ["ruler", "ruler_score_group"] +__all__ = ["DEFAULT_RUBRIC", "ruler", "ruler_score_group"] diff --git a/src/art/rewards/ruler.py b/src/art/rewards/ruler.py index ecb4b714d..4fa8c5aa0 100644 --- a/src/art/rewards/ruler.py +++ b/src/art/rewards/ruler.py @@ -54,7 +54,7 @@ async def ruler( message_lists: list[list[ChatCompletionMessageParam]], judge_model: str = "openai/o3", extra_litellm_params: dict | None = None, - rubric: str = DEFAULT_RUBRIC, + rubric: str | None = DEFAULT_RUBRIC, tools: list | None = None, *, debug: bool = False, @@ -81,7 +81,8 @@ async def ruler( - "anthropic/claude-3-opus-20240229" - Alternative judge extra_litellm_params: Additional parameters to pass to LiteLLM completion. Can include temperature, max_tokens, etc. - rubric: The grading rubric. The default rubric works well for most tasks. + rubric: The grading rubric, or None to use DEFAULT_RUBRIC. + The default rubric works well for most tasks. tools: Optional list of tool definitions available to the agent. When provided, the judge will see which tools were available when evaluating tool usage. debug: If True, pretty-print the judge's reasoning to help understand scores. @@ -103,6 +104,9 @@ async def ruler( 0.9 """ + if rubric is None: + rubric = DEFAULT_RUBRIC + # Short-circuit for the trivial case if not message_lists: return [] @@ -230,7 +234,7 @@ async def ruler_score_group( group: art.TrajectoryGroup, judge_model: str = "openai/o3", extra_litellm_params: dict | None = None, - rubric: str = DEFAULT_RUBRIC, + rubric: str | None = DEFAULT_RUBRIC, *, swallow_exceptions: bool = False, debug: bool = False, @@ -251,7 +255,8 @@ async def ruler_score_group( group: A TrajectoryGroup containing trajectories to score. judge_model: The model to use for judging. See `ruler` for options. extra_litellm_params: Additional parameters to pass to LiteLLM completion. - rubric: Custom rubric or use the default which works well for most tasks. + rubric: Custom rubric, or None to use DEFAULT_RUBRIC. The default works well + for most tasks. swallow_exceptions: If True, returns None on errors instead of raising. This is recommended for production to handle API failures gracefully. debug: If True, prints the judge's reasoning. @@ -272,6 +277,9 @@ async def ruler_score_group( For complete documentation and examples, see: https://art.openpipe.ai/fundamentals/ruler """ + if rubric is None: + rubric = DEFAULT_RUBRIC + # Validate that we don't have additional histories (not yet supported) for traj in group.trajectories: if len(traj.additional_histories) > 0: