From 9b2996a0ac7975309766bb1e460f5371b3903a7b Mon Sep 17 00:00:00 2001 From: Yufeng He <40085740+he-yufeng@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:29:20 +0800 Subject: [PATCH] fix: log eigenvalue monitor values Signed-off-by: Yufeng He <40085740+he-yufeng@users.noreply.github.com> --- deepspeed/runtime/engine.py | 16 +++++++++------- tests/unit/runtime/test_engine_eigenvalue.py | 16 ++++++++++++++++ 2 files changed, 25 insertions(+), 7 deletions(-) create mode 100644 tests/unit/runtime/test_engine_eigenvalue.py diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 82a7592f14cb..2cdd161a5125 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -205,6 +205,13 @@ def active_timers(self): return self.micro_timers + self.global_timers +def _eigenvalue_summary_events(block_eigenvalue, global_samples): + return [ + (f"Train/Eigenvalues/ModelBlockParam_{i}", ev_value[0], global_samples) + for i, ev_value in enumerate(block_eigenvalue.values()) + ] + + class DeepSpeedEngine(Module): r"""DeepSpeed engine for training.""" @@ -2965,13 +2972,8 @@ def step(self, lr_kwargs=None): if (self.eigenvalue_enabled() and not self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution()): - ev_values = self.block_eigenvalue.values() - for i in range(len(ev_values)): - self.summary_events.append(( - f"Train/Eigenvalues/ModelBlockParam_{i}", - self.ev_values[i][0], - self.global_samples, - )) + self.summary_events.extend( + _eigenvalue_summary_events(self.block_eigenvalue, self.global_samples)) self.monitor.write_events(self.summary_events) # Check flops profiling diff --git a/tests/unit/runtime/test_engine_eigenvalue.py b/tests/unit/runtime/test_engine_eigenvalue.py new file mode 100644 index 000000000000..c7f7e062eeb7 --- /dev/null +++ b/tests/unit/runtime/test_engine_eigenvalue.py @@ -0,0 +1,16 @@ +# Copyright (c) Microsoft Corporation. 
# SPDX-License-Identifier: Apache-2.0

from deepspeed.runtime.engine import _eigenvalue_summary_events


def test_eigenvalue_summary_events_use_block_values():
    """Each event carries the first element of a block's value, tagged by position."""
    samples = 128
    # Values are (eigenvalue, second element) pairs; only index 0 should be logged.
    eigenvalues_by_param = {
        "layer0.weight": (1.25, 0),
        "layer1.weight": (0.5, 1),
    }
    expected_events = [
        ("Train/Eigenvalues/ModelBlockParam_0", 1.25, samples),
        ("Train/Eigenvalues/ModelBlockParam_1", 0.5, samples),
    ]

    actual_events = _eigenvalue_summary_events(eigenvalues_by_param, global_samples=samples)

    assert actual_events == expected_events