diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 4c78e2f..33a3661 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -67,10 +67,11 @@ StaticXgmiPlpd, ValueUnit, ) +from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs from nodescraper.utils import get_exception_traceback -class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): +class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, AmdSmiCollectorArgs]): """Class for collection of inband tool amd-smi data.""" AMD_SMI_EXE = "amd-smi" @@ -87,6 +88,7 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): CMD_STATIC = "static -g all --json" CMD_STATIC_GPU = "static -g {gpu_id} --json" CMD_RAS = "ras --cper --folder={folder}" + CMD_RAS_AFID = "ras --afid --cper-file {cper_file}" def _check_amdsmi_installed(self) -> bool: """Check if amd-smi is installed @@ -331,7 +333,7 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: firmware = self.get_firmware() gpu_list = self.get_gpu_list() statics = self.get_static() - cper_data = self.get_cper_data() + cper_data, cper_afids = self.get_cper_data() except Exception as e: self._log_event( category=EventCategory.APPLICATION, @@ -352,6 +354,7 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: firmware=firmware, static=statics, cper_data=cper_data, + cper_afids=cper_afids, ) except ValidationError as err: self.logger.warning("Validation err: %s", err) @@ -1173,11 +1176,12 @@ def _parse_clock_dict(self, data: dict) -> Optional[dict[str, Union[StaticClockD return clock_dict if clock_dict else None - def get_cper_data(self) -> List[FileModel]: - """Collect CPER data from amd-smi ras command + def get_cper_data(self) -> tuple[List[FileModel], Dict[str, int]]: + """Collect CPER data from amd-smi ras command and extract AFID for each file Returns: - list[FileModel]: List of CPER files or empty list if not supported/available + tuple[list[FileModel], dict[str, int]]: Tuple of (list of CPER files, dict mapping filenames to AFIDs) + Returns empty list and dict if not supported/available """ try: AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper" @@ -1192,14 +1196,14 @@ def get_cper_data(self) -> List[FileModel]: sudo=True, ) if cper_cmd_ret.exit_code != 0: - # Command failed, return empty list - return [] + # Command failed, return empty list and dict + return [], {} cper_cmd = cper_cmd_ret.stdout # search that a CPER is actually created here regex_cper_search = re.findall(r"(\w+\.cper)", cper_cmd) if not regex_cper_search: # Early exit if no CPER files were created - return [] + return [], {} # tar the cper folder self._run_sut_cmd( f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .", @@ -1213,11 +1217,12 @@ def get_cper_data(self) -> List[FileModel]: if hasattr(cper_zip, "contents"): io_bytes = io.BytesIO(cper_zip.contents) # type: ignore[attr-defined] else: - return [] + return [], {} del cper_zip # Free memory after reading the file try: with TarFile.open(fileobj=io_bytes, mode="r:gz") as tar_file: cper_data = [] + cper_afids = {} for member in tar_file.getmembers(): if member.isfile() and member.name.endswith(".cper"): file_content = tar_file.extractfile(member) @@ -1230,6 +1235,12 @@ def get_cper_data(self) -> List[FileModel]: cper_data.append( FileModel(file_contents=file_content_bytes, file_name=member.name) ) + + cper_file_path = f"{AMD_SMI_CPER_FOLDER}/{member.name}" + afid = self._get_cper_afid(cper_file_path) + if afid is not None: + cper_afids[member.name] = afid + # Since we do not log the cper data in the data model create an event informing the user if CPER created if cper_data: self._log_event( @@ -1237,9 +1248,12 @@ def get_cper_data(self) -> List[FileModel]: description="CPER data has been extracted from amd-smi", data={ "cper_count": len(cper_data), + "afid_count": len(cper_afids), }, priority=EventPriority.INFO, + console_log=True, ) + return cper_data, cper_afids except Exception as e: self._log_event( category=EventCategory.APPLICATION, @@ -1250,11 +1264,8 @@ def get_cper_data(self) -> List[FileModel]: priority=EventPriority.ERROR, console_log=True, ) - return [] - return cper_data + return [], {} except Exception as e: - # If any unexpected error occurs during CPER collection, log it and return empty list - # This ensures CPER collection failures don't break the entire data collection self._log_event( category=EventCategory.APPLICATION, description="Error collecting CPER data", @@ -1264,19 +1275,61 @@ def get_cper_data(self) -> List[FileModel]: priority=EventPriority.WARNING, console_log=False, ) - return [] + return [], {} + + def _get_cper_afid(self, cper_file_path: str) -> Optional[int]: + """Get AFID from a CPER file using amd-smi ras --afid --cper-file command + + Args: + cper_file_path (str): Path to the CPER file + + Returns: + Optional[int]: AFID value or None if command fails or no value found + """ + cmd = self.CMD_RAS_AFID.format(cper_file=cper_file_path) + result = self._run_amd_smi(cmd) + + if result is None: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Failed to get AFID from CPER file: {cper_file_path}", + priority=EventPriority.ERROR, + console_log=True, + ) + return None + + try: + afid = int(result.strip()) + self._log_event( + category=EventCategory.APPLICATION, + description=f"Successfully retrieved AFID from CPER file: {cper_file_path}", + data={"afid": afid, "cper_file": cper_file_path}, + priority=EventPriority.INFO, + console_log=True, + ) + return afid + except ValueError: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Failed to parse AFID value from output: {result}", + data={"output": result, "cper_file": cper_file_path}, + priority=EventPriority.ERROR, + console_log=True, + ) + return None def collect_data( self, - args: Any = None, + args: Optional[AmdSmiCollectorArgs] = None, ) -> tuple[TaskResult, Optional[AmdSmiDataModel]]: """Collect AmdSmi data from system Args: - args (Any, optional): optional arguments for data collection. Defaults to None. + args: Optional collector arguments. If cper_file_path is provided, + AFID will be extracted and stored in cper_afids dict. Returns: - tuple[TaskResult, Optional[AmdSmiDataModel]]: task result and collected data model + tuple[TaskResult, Optional[AmdSmiDataModel]]: task result and data model """ if not self._check_amdsmi_installed(): @@ -1300,6 +1353,11 @@ def collect_data( if amd_smi_data is None: return self.result, None + if args and args.cper_file_path: + afid = self._get_cper_afid(args.cper_file_path) + if afid is not None: + amd_smi_data.cper_afids[args.cper_file_path] = afid + return self.result, amd_smi_data except Exception as e: self._log_event( diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py index 67eda94..abfdf75 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py @@ -29,15 +29,18 @@ from .amdsmi_collector import AmdSmiCollector from .amdsmidata import AmdSmiDataModel from .analyzer_args import AmdSmiAnalyzerArgs +from .collector_args import AmdSmiCollectorArgs -class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, AmdSmiAnalyzerArgs]): +class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, AmdSmiCollectorArgs, AmdSmiAnalyzerArgs]): """Plugin for collection and analysis of amdsmi data""" DATA_MODEL = AmdSmiDataModel COLLECTOR = AmdSmiCollector + COLLECTOR_ARGS = AmdSmiCollectorArgs + ANALYZER = AmdSmiAnalyzer ANALYZER_ARGS = AmdSmiAnalyzerArgs diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index aacca2a..d1278bc 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -954,6 +954,7 @@ class AmdSmiDataModel(DataModel): xgmi_metric: Optional[list[XgmiMetrics]] = Field(default_factory=list) xgmi_link: Optional[list[XgmiLinks]] = Field(default_factory=list) cper_data: Optional[list[FileModel]] = Field(default_factory=list) + cper_afids: dict[str, int] = Field(default_factory=dict) amdsmitst_data: AmdSmiTstData = Field(default_factory=AmdSmiTstData) def get_list(self, gpu: int) -> Optional[AmdSmiListItem]: diff --git a/nodescraper/plugins/inband/amdsmi/collector_args.py b/nodescraper/plugins/inband/amdsmi/collector_args.py new file mode 100644 index 0000000..97b5f90 --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/collector_args.py @@ -0,0 +1,34 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from typing import Optional + +from nodescraper.models import CollectorArgs + + +class AmdSmiCollectorArgs(CollectorArgs): + """Collector arguments for AmdSmiPlugin""" + + cper_file_path: Optional[str] = None diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 51e2bca..d6583ba 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -23,7 +23,9 @@ # SOFTWARE. # ############################################################################### +import io import json +import tarfile from typing import Any from unittest.mock import MagicMock @@ -31,6 +33,32 @@ from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.plugins.inband.amdsmi.amdsmi_collector import AmdSmiCollector +from nodescraper.plugins.inband.amdsmi.amdsmidata import AmdSmiDataModel +from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs + + +def test_collector_args_instantiation(): + """Test AmdSmiCollectorArgs can be instantiated with and without cper_file_path""" + args1 = AmdSmiCollectorArgs() + assert args1.cper_file_path is None + + args2 = AmdSmiCollectorArgs(cper_file_path="/path/to/test.cper") + assert args2.cper_file_path == "/path/to/test.cper" + + args3 = AmdSmiCollectorArgs(**{"cper_file_path": "/another/path.cper"}) + assert args3.cper_file_path == "/another/path.cper" + + +def test_data_model_cper_afids_field(): + """Test AmdSmiDataModel accepts cper_afids dict field""" + data1 = AmdSmiDataModel(cper_afids={"file1.cper": 12345, "file2.cper": 67890}) + assert data1.cper_afids == {"file1.cper": 12345, "file2.cper": 67890} + + data2 = AmdSmiDataModel() + assert data2.cper_afids == {} + + data3 = AmdSmiDataModel(**{"cper_afids": {"test.cper": 99999}}) + assert data3.cper_afids == {"test.cper": 99999} def make_cmd_result(stdout: str, stderr: str = "", exit_code: int = 0) -> MagicMock: @@ -483,3 +511,243 @@ def mock_single_json(cmd: str) -> MagicMock: assert isinstance(result, list) assert len(result) == 1 assert result[0]["tool"] == "amdsmi" + + +def test_get_cper_afid_success(conn_mock, system_info, monkeypatch): + """Test successful AFID retrieval from CPER file""" + + def mock_run_sut_cmd(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "ras --afid --cper-file" in cmd: + return make_cmd_result("12345\n") + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_run_sut_cmd) + + afid = c._get_cper_afid("/path/to/test.cper") + + assert afid is not None + assert afid == 12345 + + +def test_get_cper_afid_invalid_output(conn_mock, system_info, monkeypatch): + """Test AFID retrieval with invalid/non-integer output""" + + def mock_run_sut_cmd(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "ras --afid --cper-file" in cmd: + return make_cmd_result("not_a_number\n") + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_run_sut_cmd) + + afid = c._get_cper_afid("/path/to/test.cper") + + assert afid is None + + +def test_get_cper_afid_command_failure(conn_mock, system_info, monkeypatch): + """Test AFID retrieval when command fails""" + + def mock_run_sut_cmd(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "ras --afid --cper-file" in cmd: + return make_cmd_result("", stderr="Error: file not found", exit_code=1) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_run_sut_cmd) + + # _run_amd_smi returns None on non-zero exit code + afid = c._get_cper_afid("/path/to/test.cper") + + assert afid is None + + +def test_collect_data_with_cper_file(conn_mock, system_info, mock_commands): + """Test collect_data with cper_file_path argument""" + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + + # Add mock for AFID command + original_mock = mock_commands + + def extended_mock(cmd: str) -> MagicMock: + if "ras --afid --cper-file" in cmd: + return make_cmd_result("99999\n") + return original_mock(cmd) + + c._run_sut_cmd = extended_mock + + args = AmdSmiCollectorArgs(cper_file_path="/path/to/test.cper") + result, data = c.collect_data(args) + + assert result is not None + assert data is not None + assert "/path/to/test.cper" in data.cper_afids + assert data.cper_afids["/path/to/test.cper"] == 99999 + + +def test_collect_data_without_cper_file(conn_mock, system_info, mock_commands): + """Test collect_data without cper_file_path argument""" + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + c._run_sut_cmd = mock_commands + + result, data = c.collect_data() + + assert result is not None + assert data is not None + assert isinstance(data.cper_afids, dict) + + +def test_get_cper_data_with_afids(conn_mock, system_info, monkeypatch): + """Test get_cper_data returns both CPER files and AFIDs""" + + def mock_run_sut_cmd(cmd: str, sudo: bool = False) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "ras --cper --folder" in cmd: + return make_cmd_result("Created test1.cper\nCreated test2.cper\n") + if "mkdir -p" in cmd or "tar -czf" in cmd: + return make_cmd_result("") + if "ras --afid --cper-file" in cmd: + if "test1.cper" in cmd: + return make_cmd_result("12345\n") + elif "test2.cper" in cmd: + return make_cmd_result("67890\n") + return make_cmd_result("") + + def mock_read_sut_file(path: str, encoding=None, strip=False, log_artifact=False): + tar_buffer = io.BytesIO() + with tarfile.open(fileobj=tar_buffer, mode="w:gz") as tar: + for name in ["test1.cper", "test2.cper"]: + info = tarfile.TarInfo(name=name) + info.size = len(b"fake cper data") + tar.addfile(info, io.BytesIO(b"fake cper data")) + + tar_buffer.seek(0) + mock_artifact = MagicMock() + mock_artifact.contents = tar_buffer.read() + return mock_artifact + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_run_sut_cmd) + monkeypatch.setattr(c, "_read_sut_file", mock_read_sut_file) + + cper_files, cper_afids = c.get_cper_data() + + assert len(cper_files) == 2 + assert cper_files[0].file_name == "test1.cper" + assert cper_files[1].file_name == "test2.cper" + assert cper_afids == {"test1.cper": 12345, "test2.cper": 67890} + + +def test_get_cper_data_no_cper_files(conn_mock, system_info, monkeypatch): + """Test get_cper_data returns empty lists when no CPER files created""" + + def mock_run_sut_cmd(cmd: str, sudo: bool = False) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "ras --cper --folder" in cmd: + return make_cmd_result("No CPER files created\n") + if "mkdir -p" in cmd: + return make_cmd_result("") + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_run_sut_cmd) + + cper_files, cper_afids = c.get_cper_data() + + assert cper_files == [] + assert cper_afids == {} + + +def test_collect_data_with_both_auto_and_custom_cper(conn_mock, system_info, monkeypatch): + """Test that both auto-collected and custom CPER AFIDs are stored in cper_afids""" + + def mock_run_sut_cmd(cmd: str, sudo: bool = False) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "version --json" in cmd: + return make_cmd_result( + make_json_response( + [{"tool": "amdsmi", "amdsmi_library_version": "1.2.3", "rocm_version": "6.1.0"}] + ) + ) + if "list --json" in cmd: + return make_cmd_result(make_json_response([{"gpu": 0, "bdf": "0000:0b:00.0"}])) + if "ras --cper --folder" in cmd: + return make_cmd_result("Created auto1.cper\n") + if "ras --afid --cper-file" in cmd: + if "auto1.cper" in cmd: + return make_cmd_result("11111\n") + elif "custom.cper" in cmd: + return make_cmd_result("99999\n") + if "mkdir -p" in cmd or "tar -czf" in cmd: + return make_cmd_result("") + if "static -g all --json" in cmd: + return make_cmd_result(make_json_response({"gpu_data": []})) + return make_cmd_result("") + + def mock_read_sut_file(path: str, encoding=None, strip=False, log_artifact=False): + tar_buffer = io.BytesIO() + with tarfile.open(fileobj=tar_buffer, mode="w:gz") as tar: + info = tarfile.TarInfo(name="auto1.cper") + info.size = len(b"auto cper data") + tar.addfile(info, io.BytesIO(b"auto cper data")) + tar_buffer.seek(0) + mock_artifact = MagicMock() + mock_artifact.contents = tar_buffer.read() + return mock_artifact + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_run_sut_cmd) + monkeypatch.setattr(c, "_read_sut_file", mock_read_sut_file) + + args = AmdSmiCollectorArgs(cper_file_path="/home/user/custom.cper") + result, data = c.collect_data(args) + + assert result is not None + assert data is not None + assert "auto1.cper" in data.cper_afids + assert data.cper_afids["auto1.cper"] == 11111 + assert "/home/user/custom.cper" in data.cper_afids + assert data.cper_afids["/home/user/custom.cper"] == 99999