diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 5870d7e6b602..0c79e8db4c5a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. +# Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,13 +28,13 @@ import os from functools import reduce from .utils import extract_RJ45_ports_index + from . import module_host_mgmt_initializer from . import utils from .device_data import DeviceDataManager import re - import queue + import select import threading import time - from sonic_platform import modules_mgmt except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -132,9 +132,9 @@ def __init__(self): Chassis.chassis_instance = self - self.modules_mgmt_thread = threading.Thread() - self.modules_changes_queue = queue.Queue() - self.modules_mgmt_task_stopping_event = threading.Event() + self.module_host_mgmt_initializer = module_host_mgmt_initializer.ModuleHostMgmtInitializer() + self.poll_obj = None + self.registered_fds = None logger.log_info("Chassis loaded successfully") @@ -338,8 +338,11 @@ def get_all_sfps(self): Returns: A list of objects derived from SfpBase representing all sfps available on this chassis - """ - self.initialize_sfp() + """ + if DeviceDataManager.is_module_host_management_mode(): + self.module_host_mgmt_initializer.initialize(self) + else: + self.initialize_sfp() return self._sfp_list def get_sfp(self, index): @@ -356,7 +359,10 @@ def get_sfp(self, index): An object dervied from SfpBase representing the specified sfp """ index = index - 1 - self.initialize_single_sfp(index) + if DeviceDataManager.is_module_host_management_mode(): + 
self.module_host_mgmt_initializer.initialize(self) + else: + self.initialize_single_sfp(index) return super(Chassis, self).get_sfp(index) def get_port_or_cage_type(self, index): @@ -406,42 +412,223 @@ def get_change_event(self, timeout=0): indicates that fan 0 has been removed, fan 2 has been inserted and sfp 11 has been removed. """ - if not self.modules_mgmt_thread.is_alive(): - # open new SFP change events thread - self.modules_mgmt_thread = modules_mgmt.ModulesMgmtTask(q=self.modules_changes_queue - , main_thread_stop_event = self.modules_mgmt_task_stopping_event) - # Set the thread as daemon so when pmon/xcvrd are shutting down, modules_mgmt will shut down immedietly. - self.modules_mgmt_thread.daemon = True - self.modules_mgmt_thread.start() - self.initialize_sfp() - wait_for_ever = (timeout == 0) + if DeviceDataManager.is_module_host_management_mode(): + self.module_host_mgmt_initializer.initialize(self) + return self.get_change_event_for_module_host_management_mode(timeout) + else: + self.initialize_sfp() + return self.get_change_event_legacy(timeout) + + def get_change_event_for_module_host_management_mode(self, timeout): + """Get SFP change event when module host management mode is enabled. + + Args: + timeout: Timeout in milliseconds (optional). If timeout == 0, + this method will block until a change is detected. + + Returns: + (bool, dict): + - True if call successful, False if not; - Deprecated, will always return True + - A nested dictionary where key is a device type, + value is a dictionary with key:value pairs in the format of + {'device_id':'device_event'}, + where device_id is the device ID for this device and + device_event, + status='1' represents device inserted, + status='0' represents device removed. + Ex. {'fan':{'0':'0', '2':'1'}, 'sfp':{'11':'0'}} + indicates that fan 0 has been removed, fan 2 + has been inserted and sfp 11 has been removed. 
+ """ + if not self.poll_obj: + self.poll_obj = select.poll() + self.registered_fds = {} + for s in self._sfp_list: + fds = s.get_fds_for_poling() + for fd_type, fd in fds.items(): + self.poll_obj.register(fd, select.POLLERR | select.POLLPRI) + self.registered_fds[fd.fileno()] = (s.sdk_index, fd, fd_type) + + logger.log_debug(f'Registered SFP file descriptors for polling: {self.registered_fds}') + + from . import sfp + + wait_forever = (timeout == 0) + # poll timeout should be no more than 1000ms to ensure fast shutdown flow + timeout = 1000.0 if timeout >= 1000 else float(timeout) + port_dict = {} + error_dict = {} + begin = time.time() + wait_ready_task = sfp.SFP.get_wait_ready_task() + + while True: + fds_events = self.poll_obj.poll(timeout) + for fileno, _ in fds_events: + if fileno not in self.registered_fds: + logger.log_error(f'Unknown file no {fileno} from poll event, registered files are {self.registered_fds}') + continue + + sfp_index, fd, fd_type = self.registered_fds[fileno] + s = self._sfp_list[sfp_index] + fd_value = int(fd.read().strip()) + + # Detecting dummy event + if s.is_dummy_event(fd_type, fd_value): + # Ignore dummy event for the first poll, assume SDK only provide 1 dummy event + logger.log_debug(f'Ignore dummy event {fd_type}:{fd_value} for SFP {sfp_index}') + continue + + logger.log_notice(f'Got SFP event: index={sfp_index}, type={fd_type}, value={fd_value}') + if fd_type == 'hw_present': + # event could be EVENT_NOT_PRESENT or EVENT_PRESENT + event = sfp.EVENT_NOT_PRESENT if fd_value == 0 else sfp.EVENT_PRESENT + s.on_event(event) + elif fd_type == 'present': + if str(fd_value) == sfp.SFP_STATUS_ERROR: + # FW control cable got an error, no need trigger state machine + sfp_status, error_desc = s.get_error_info_from_sdk_error_type() + port_dict[sfp_index + 1] = sfp_status + if error_desc: + error_dict[sfp_index + 1] = error_desc + continue + elif str(fd_value) == sfp.SFP_STATUS_INSERTED: + # FW control cable got present, only case is that 
the cable is recovering + # from an error. FW control cable has no transition from "Not Present" to "Present" + # because "Not Present" cable is always "software control" and should always poll + # hw_present sysfs instead of present sysfs. + port_dict[sfp_index + 1] = sfp.SFP_STATUS_INSERTED + continue + else: + s.on_event(sfp.EVENT_NOT_PRESENT) + else: + # event could be EVENT_POWER_GOOD or EVENT_POWER_BAD + event = sfp.EVENT_POWER_BAD if fd_value == 0 else sfp.EVENT_POWER_GOOD + s.on_event(event) + + if s.in_stable_state(): + s.fill_change_event(port_dict) + s.refresh_poll_obj(self.poll_obj, self.registered_fds) + else: + logger.log_debug(f'SFP {sfp_index} does not reach stable state, state={s.state}') + + ready_sfp_set = wait_ready_task.get_ready_set() + for sfp_index in ready_sfp_set: + s = self._sfp_list[sfp_index] + s.on_event(sfp.EVENT_RESET_DONE) + if s.in_stable_state(): + s.fill_change_event(port_dict) + s.refresh_poll_obj(self.poll_obj, self.registered_fds) + else: + logger.log_error(f'SFP {sfp_index} failed to reach stable state, state={s.state}') + + if port_dict: + logger.log_notice(f'Sending SFP change event: {port_dict}, error event: {error_dict}') + self.reinit_sfps(port_dict) + return True, { + 'sfp': port_dict, + 'sfp_error': error_dict + } + else: + if not wait_forever: + elapse = time.time() - begin + if elapse * 1000 >= timeout: + return True, {'sfp': {}} + + def get_change_event_legacy(self, timeout): + """Get SFP change event when module host management is disabled. + + Args: + timeout (int): polling timeout in ms + + Returns: + (bool, dict): + - True if call successful, False if not; - Deprecated, will always return True + - A nested dictionary where key is a device type, + value is a dictionary with key:value pairs in the format of + {'device_id':'device_event'}, + where device_id is the device ID for this device and + device_event, + status='1' represents device inserted, + status='0' represents device removed. + Ex. 
{'fan':{'0':'0', '2':'1'}, 'sfp':{'11':'0'}} + indicates that fan 0 has been removed, fan 2 + has been inserted and sfp 11 has been removed. + """ + if not self.poll_obj: + self.poll_obj = select.poll() + self.registered_fds = {} + # SDK always sent event for the first time polling. Such event should not be sent to xcvrd. + # Store SFP state before first time polling so that we can detect dummy event. + self.sfp_states_before_first_poll = {} + for s in self._sfp_list: + fd = s.get_fd_for_polling_legacy() + self.poll_obj.register(fd, select.POLLERR | select.POLLPRI) + self.registered_fds[fd.fileno()] = (s.sdk_index, fd) + self.sfp_states_before_first_poll[s.sdk_index] = s.get_module_status() + + logger.log_debug(f'Registered SFP file descriptors for polling: {self.registered_fds}') + + from . import sfp + + wait_forever = (timeout == 0) # poll timeout should be no more than 1000ms to ensure fast shutdown flow timeout = 1000.0 if timeout >= 1000 else float(timeout) port_dict = {} error_dict = {} begin = time.time() - i = 0 + while True: - try: - logger.log_info(f'get_change_event() trying to get changes from queue on iteration {i}') - port_dict = self.modules_changes_queue.get(timeout=timeout / 1000) - logger.log_info(f'get_change_event() iteration {i} port_dict: {port_dict}') - except queue.Empty: - logger.log_info(f"failed to get item from modules changes queue on itertaion {i}") + fds_events = self.poll_obj.poll(timeout) + for fileno, _ in fds_events: + if fileno not in self.registered_fds: + logger.log_error(f'Unknown file no {fileno} from poll event, registered files are {self.registered_fds}') + continue + + sfp_index, fd = self.registered_fds[fileno] + fd.seek(0) + fd.read() + s = self._sfp_list[sfp_index] + sfp_status = s.get_module_status() + + if sfp_index in self.sfp_states_before_first_poll: + # Detecting dummy event + sfp_state_before_poll = self.sfp_states_before_first_poll[sfp_index] + self.sfp_states_before_first_poll.pop(sfp_index) + if 
sfp_state_before_poll == sfp_status: + # Ignore dummy event for the first poll, assume SDK only provide 1 dummy event + logger.log_debug(f'Ignore dummy event {sfp_status} for SFP {sfp_index}') + continue + + logger.log_notice(f'Got SFP event: index={sfp_index}, value={sfp_status}') + if sfp_status == sfp.SFP_STATUS_UNKNOWN: + # in the following sequence, STATUS_UNKNOWN can be returned. + # so we shouldn't raise exception here. + # 1. some sfp module is inserted + # 2. sfp_event gets stuck and fails to fetch the change event instantaneously + # 3. and then the sfp module is removed + # 4. sfp_event starts to try fetching the change event + logger.log_info("unknown module state, maybe the port suffers two adjacent insertion/removal") + continue + + if sfp_status == sfp.SFP_STATUS_ERROR: + s = self._sfp_list[sfp_index] + sfp_status, error_desc = s.get_error_info_from_sdk_error_type() + if error_desc: + error_dict[sfp_index + 1] = error_desc + port_dict[sfp_index + 1] = sfp_status if port_dict: + logger.log_notice(f'Sending SFP change event: {port_dict}, error event: {error_dict}') self.reinit_sfps(port_dict) - result_dict = {'sfp': port_dict} - result_dict['sfp_error'] = error_dict - return True, result_dict + return True, { + 'sfp': port_dict, + 'sfp_error': error_dict + } else: - if not wait_for_ever: + if not wait_forever: elapse = time.time() - begin - logger.log_info(f"get_change_event: wait_for_ever {wait_for_ever} elapse {elapse} iteartion {i}") if elapse * 1000 >= timeout: - logger.log_info(f"elapse {elapse} > timeout {timeout} iteartion {i} returning empty dict") return True, {'sfp': {}} - i += 1 def reinit_sfps(self, port_dict): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py index aeceb15d1983..29445ac0da04 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py 
@@ -242,7 +242,7 @@ def get_cpld_component_list(cls): @classmethod @utils.read_only_cache() - def is_independent_mode(cls): + def is_module_host_management_mode(cls): from sonic_py_common import device_info _, hwsku_dir = device_info.get_paths_to_platform_and_hwsku_dirs() sai_profile_file = os.path.join(hwsku_dir, 'sai.profile') @@ -258,7 +258,7 @@ def wait_platform_ready(cls): """ conditions = [] sysfs_nodes = ['power_mode', 'power_mode_policy', 'present', 'reset', 'status', 'statuserror'] - if cls.is_independent_mode(): + if cls.is_module_host_management_mode(): sysfs_nodes.extend(['control', 'frequency', 'frequency_support', 'hw_present', 'hw_reset', 'power_good', 'power_limit', 'power_on', 'temperature/input']) else: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/module_host_mgmt_initializer.py b/platform/mellanox/mlnx-platform-api/sonic_platform/module_host_mgmt_initializer.py new file mode 100644 index 000000000000..d9bec65987e0 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/module_host_mgmt_initializer.py @@ -0,0 +1,128 @@ +# +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from . 
import utils +from sonic_py_common.logger import Logger + +import atexit +import os +import sys +import threading + +MODULE_READY_MAX_WAIT_TIME = 300 +MODULE_READY_CHECK_INTERVAL = 5 +MODULE_READY_CONTAINER_FILE = '/tmp/module_host_mgmt_ready' +MODULE_READY_HOST_FILE = '/tmp/nv-syncd-shared/module_host_mgmt_ready' +DEDICATE_INIT_DAEMON = 'xcvrd' +initialization_owner = False + +logger = Logger() + + +class ModuleHostMgmtInitializer: + """Responsible for initializing modules for host management mode. + """ + def __init__(self): + self.initialized = False + self.lock = threading.Lock() + + def initialize(self, chassis): + """Initialize all modules. Only applicable for module host management mode. + The real initialization job shall only be done in xcvrd. Only 1 owner is allowed + to do the initialization. Other daemon/CLI shall wait for the initialization to be done. + + Args: + chassis (object): chassis object + """ + global initialization_owner + if self.initialized: + return + + if utils.is_host(): + self.wait_module_ready() + chassis.initialize_sfp() + else: + if self.is_initialization_owner(): + if not self.initialized: + with self.lock: + if not self.initialized: + logger.log_notice('Starting module initialization for module host management...') + initialization_owner = True + self.remove_module_ready_file() + + chassis.initialize_sfp() + + from .sfp import SFP + SFP.initialize_sfp_modules(chassis._sfp_list) + + self.create_module_ready_file() + self.initialized = True + logger.log_notice('Module initialization for module host management done') + else: + self.wait_module_ready() + chassis.initialize_sfp() + + @classmethod + def create_module_ready_file(cls): + """Create module ready file + """ + with open(MODULE_READY_CONTAINER_FILE, 'w'): + pass + + @classmethod + def remove_module_ready_file(cls): + """Remove module ready file + """ + if os.path.exists(MODULE_READY_CONTAINER_FILE): + os.remove(MODULE_READY_CONTAINER_FILE) + + def wait_module_ready(self): + """Wait 
up to MODULE_READY_MAX_WAIT_TIME seconds for all modules to be ready + """ + if utils.is_host(): + module_ready_file = MODULE_READY_HOST_FILE + else: + module_ready_file = MODULE_READY_CONTAINER_FILE + + if os.path.exists(module_ready_file): + self.initialized = True + return + else: + print('Waiting module to be initialized...') + + if utils.wait_until(os.path.exists, MODULE_READY_MAX_WAIT_TIME, MODULE_READY_CHECK_INTERVAL, module_ready_file): + self.initialized = True + else: + logger.log_error('Module initialization timeout', True) + + def is_initialization_owner(self): + """Indicate whether current thread is the owner of doing module initialization + + Returns: + bool: True if current thread is the owner + """ + cmd = os.path.basename(sys.argv[0]) + return DEDICATE_INIT_DAEMON in cmd + +@atexit.register +def clean_up(): + """Remove module ready file when program exits. + When module host management is enabled, xcvrd is the dependency for all other + daemon/CLI who potentially uses SFP API. + """ + if initialization_owner: + ModuleHostMgmtInitializer.remove_module_ready_file() diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py index 90462e9ed0fe..94c3bea12b8c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py @@ -24,16 +24,18 @@ try: import ctypes + import select import subprocess import os import threading + import time from sonic_py_common.logger import Logger from sonic_py_common.general import check_output_pipe from . 
import utils from .device_data import DeviceDataManager from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase from sonic_platform_base.sonic_xcvr.fields import consts - from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436 + from sonic_platform_base.sonic_xcvr.api.public import cmis, sff8636, sff8436 except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -127,7 +129,44 @@ CPU_MASK = PORT_TYPE_MASK & (PORT_TYPE_CPU << PORT_TYPE_OFFSET) # parameters for SFP presence +SFP_STATUS_REMOVED = '0' SFP_STATUS_INSERTED = '1' +SFP_STATUS_ERROR = '2' +SFP_STATUS_UNKNOWN = '-1' + +# SFP status from PMAOS register +# 0x1 plug in +# 0x2 plug out +# 0x3 plug in with error +# 0x4 disabled, at this status SFP eeprom is not accessible, +# and presence status also will be not present, +# so treat it as plug out. +SDK_SFP_STATE_IN = 0x1 +SDK_SFP_STATE_OUT = 0x2 +SDK_SFP_STATE_ERR = 0x3 +SDK_SFP_STATE_DIS = 0x4 +SDK_SFP_STATE_UNKNOWN = 0x5 + +SDK_STATUS_TO_SONIC_STATUS = { + SDK_SFP_STATE_IN: SFP_STATUS_INSERTED, + SDK_SFP_STATE_OUT: SFP_STATUS_REMOVED, + SDK_SFP_STATE_ERR: SFP_STATUS_ERROR, + SDK_SFP_STATE_DIS: SFP_STATUS_REMOVED, + SDK_SFP_STATE_UNKNOWN: SFP_STATUS_UNKNOWN +} + +# SDK error definitions begin + +# SFP errors that will block eeprom accessing +SDK_SFP_BLOCKING_ERRORS = [ + 0x2, # SFP.SFP_ERROR_BIT_I2C_STUCK, + 0x3, # SFP.SFP_ERROR_BIT_BAD_EEPROM, + 0x5, # SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE, + 0x6, # SFP.SFP_ERROR_BIT_HIGH_TEMP, + 0x7, # SFP.SFP_ERROR_BIT_BAD_CABLE +] + +# SDK error definitions end # SFP constants SFP_PAGE_SIZE = 256 # page size of page0h @@ -162,6 +201,60 @@ SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0 SFP_TEMPERATURE_SCALE = 8.0 +# Module host management definitions begin +SFP_SW_CONTROL = 1 +SFP_FW_CONTROL = 0 + +CMIS_MAX_POWER_OFFSET = 201 + +SFF_POWER_CLASS_MASK = 0xE3 +SFF_POWER_CLASS_MAPPING = { + 0: 1.5, # 1.5W + 64: 2, # 2.0W + 128: 2.5, # 2.5W + 192: 3.5, # 3.5W + 193: 4, # 4.0W + 
194: 4.5, # 4.5W + 195: 5 # 5.0W +} +SFF_POWER_CLASS_OFFSET = 129 +SFF_POWER_CLASS_8_INDICATOR = 32 +SFF_POWER_CLASS_8_OFFSET = 107 + +CMIS_MCI_EEPROM_OFFSET = 2 +CMIS_MCI_MASK = 0b00001100 + +STATE_DOWN = 'Down' # Initial state +STATE_INIT = 'Initializing' # Module starts initializing, check module present, also power on the module if need +STATE_RESETTING = 'Resetting' # Module is resetting the firmware +STATE_POWERED_ON = 'Power On' # Module is powered on, module firmware has been loaded, check module power is in good state +STATE_SW_CONTROL = 'Software Control' # Module is under software control +STATE_FW_CONTROL = 'Firmware Control' # Module is under firmware control +STATE_POWER_BAD = 'Power Bad' # Module power_good returns 0 +STATE_POWER_LIMIT_ERROR = 'Exceed Power Limit' # Module power exceeds cage power limit +STATE_NOT_PRESENT = 'Not Present' # Module is not present + +EVENT_START = 'Start' +EVENT_NOT_PRESENT = 'Not Present' +EVENT_RESET = 'Reset' +EVENT_POWER_ON = 'Power On' +EVENT_RESET_DONE = 'Reset Done' +EVENT_POWER_BAD = 'Power Bad' +EVENT_SW_CONTROL = 'Software Control' +EVENT_FW_CONTROL = 'Firmware Control' +EVENT_POWER_LIMIT_EXCEED = 'Power Limit Exceed' +EVENT_POWER_GOOD = 'Power Good' +EVENT_PRESENT = 'Present' + +ACTION_ON_START = 'On Start' +ACTION_ON_RESET = 'On Reset' +ACTION_ON_POWERED = 'On Powered' +ACTION_ON_SW_CONTROL = 'On Software Control' +ACTION_ON_FW_CONTROL = 'On Firmware Control' +ACTION_ON_POWER_LIMIT_ERROR = 'On Power Limit Error' +ACTION_ON_NOT_PRESENT = 'On Not Present' +# Module host management definitions end + # SFP EEPROM limited bytes limited_eeprom = { SFP_TYPE_CMIS: { @@ -252,30 +345,6 @@ def _get_module_info(self, sdk_index): return oper_state, error_type - @classmethod - def get_sfp_index_to_logical_port(cls, force=False): - if not cls.sfp_index_to_logical_port_dict or force: - config_db = utils.DbUtils.get_db_instance('CONFIG_DB') - port_data = config_db.get_table('PORT') - for key, data in port_data.items(): - if 
data['index'] not in cls.sfp_index_to_logical_port_dict: - cls.sfp_index_to_logical_port_dict[int(data['index']) - 1] = key - - @classmethod - def get_logical_port_by_sfp_index(cls, sfp_index): - with cls.sfp_index_to_logical_lock: - cls.get_sfp_index_to_logical_port() - logical_port_name = cls.sfp_index_to_logical_port_dict.get(sfp_index) - if not logical_port_name: - cls.get_sfp_index_to_logical_port(force=True) - else: - config_db = utils.DbUtils.get_db_instance('CONFIG_DB') - current_index = int(config_db.get('CONFIG_DB', f'PORT|{logical_port_name}', 'index')) - if current_index != sfp_index: - cls.get_sfp_index_to_logical_port(force=True) - logical_port_name = cls.sfp_index_to_logical_port_dict.get(sfp_index) - return logical_port_name - class SFP(NvidiaSFPCommon): """Platform-specific SFP class""" @@ -285,12 +354,43 @@ class SFP(NvidiaSFPCommon): SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED = 'PMD type not enabled' SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED = 'PCIE system power slot exceeded' SFP_MLNX_ERROR_DESCRIPTION_RESERVED = 'Reserved' + + SDK_ERRORS_TO_DESCRIPTION = { + 0x1: SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE, + 0x4: SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST, + 0x8: SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED, + 0xc: SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED + } SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE = 0x00010000 SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST = 0x00020000 SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED = 0x00040000 SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED = 0x00080000 SFP_MLNX_ERROR_BIT_RESERVED = 0x80000000 + + SDK_ERRORS_TO_ERROR_BITS = { + 0x0: SfpOptoeBase.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED, + 0x1: SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE, + 0x2: SfpOptoeBase.SFP_ERROR_BIT_I2C_STUCK, + 0x3: SfpOptoeBase.SFP_ERROR_BIT_BAD_EEPROM, + 0x4: SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST, + 0x5: SfpOptoeBase.SFP_ERROR_BIT_UNSUPPORTED_CABLE, + 0x6: SfpOptoeBase.SFP_ERROR_BIT_HIGH_TEMP, + 
0x7: SfpOptoeBase.SFP_ERROR_BIT_BAD_CABLE, + 0x8: SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED, + 0xc: SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED + } + + # Class level state machine object, only applicable for module host management + sm = None + + # Class level wait SFP ready task, the task waits for module to load its firmware after resetting, + # only applicable for module host management + wait_ready_task = None + + # Class level action table which stores the mapping from action name to action function, + # only applicable for module host management + action_table = None def __init__(self, sfp_index, sfp_type=None, slot_id=0, linecard_port_count=0, lc_name=None): super(SFP, self).__init__(sfp_index) @@ -311,6 +411,11 @@ def __init__(self, sfp_index, sfp_type=None, slot_id=0, linecard_port_count=0, l self.slot_id = slot_id self._sfp_type_str = None + # SFP state, only applicable for module host management + self.state = STATE_DOWN + + def __str__(self): + return f'SFP {self.sdk_index}' def reinit(self): """ @@ -318,7 +423,7 @@ def reinit(self): :return: """ self._sfp_type_str = None - self.refresh_xcvr_api() + self._xcvr_api = None def get_presence(self): """ @@ -327,10 +432,6 @@ def get_presence(self): Returns: bool: True if device is present, False if not """ - try: - self.is_sw_control() - except: - return False eeprom_raw = self._read_eeprom(0, 1, log_on_error=False) return eeprom_raw is not None @@ -439,7 +540,7 @@ def get_lpmode(self): if self.is_sw_control(): api = self.get_xcvr_api() return api.get_lpmode() if api else False - elif DeviceDataManager.is_independent_mode(): + elif DeviceDataManager.is_module_host_management_mode(): file_path = SFP_SDK_MODULE_SYSFS_ROOT_TEMPLATE.format(self.sdk_index) + SFP_SYSFS_POWER_MODE power_mode = utils.read_int_from_file(file_path) return power_mode == POWER_MODE_LOW @@ -646,7 +747,7 @@ def set_lpmode(self, lpmode): # If at some point get_lpmode=desired_lpmode, it will return true. 
# If after timeout ends, lpmode will not be desired_lpmode, it will return false. return utils.wait_until(check_lpmode, 2, 1, api=api, lpmode=lpmode) - elif DeviceDataManager.is_independent_mode(): + elif DeviceDataManager.is_module_host_management_mode(): # FW control under CMIS host management mode. # Currently, we don't support set LPM under this mode. # Just return False to indicate set Fail @@ -745,6 +846,31 @@ def get_error_description(self): else: error_description = "Unknow SFP module status ({})".format(oper_status) return error_description + + def get_error_info_from_sdk_error_type(self): + """Translate SDK error type to SONiC error state and error description. Only calls + when sysfs "present" returns "2". + + Returns: + tuple: (error state, error description) + """ + error_type = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/statuserror') + sfp_state_bits = SFP.SDK_ERRORS_TO_ERROR_BITS.get(error_type) + if sfp_state_bits is None: + logger.log_error(f"Unrecognized error {error_type} detected on SFP {self.sdk_index}") + return SFP_STATUS_ERROR, "Unknown error ({})".format(error_type) + + if error_type in SDK_SFP_BLOCKING_ERRORS: + # In SFP at error status case, need to overwrite the sfp_state with the exact error code + sfp_state_bits |= SFP.SFP_ERROR_BIT_BLOCKING + + # An error should be always set along with 'INSERTED' + sfp_state_bits |= SFP.SFP_STATUS_BIT_INSERTED + + # For vendor specific errors, the description should be returned as well + error_description = SFP.SDK_ERRORS_TO_DESCRIPTION.get(error_type) + sfp_state = str(sfp_state_bits) + return sfp_state, error_description def _get_eeprom_path(self): return SFP_EEPROM_ROOT_TEMPLATE.format(self.sdk_index) @@ -976,24 +1102,550 @@ def get_xcvr_api(self): return self._xcvr_api def is_sw_control(self): - if not DeviceDataManager.is_independent_mode(): + if not DeviceDataManager.is_module_host_management_mode(): return False - - db = 
utils.DbUtils.get_db_instance('STATE_DB') - logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index) - if not logical_port: - raise Exception(f'Module {self.sdk_index} is not present or under initialization') - - initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}') - if not initialized: - raise Exception(f'Module {self.sdk_index} is not present or under initialization') - try: return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control', raise_exception=True, log_func=None) == 1 except: # just in case control file does not exist - raise Exception(f'Module {self.sdk_index} is under initialization') + raise Exception(f'control sysfs for SFP {self.sdk_index} does not exist') + + def get_module_status(self): + """Get value of sysfs status. It could return: + SXD_PMPE_MODULE_STATUS_PLUGGED_ENABLED_E = 0x1, + SXD_PMPE_MODULE_STATUS_UNPLUGGED_E = 0x2, + SXD_PMPE_MODULE_STATUS_MODULE_PLUGGED_ERROR_E = 0x3, + SXD_PMPE_MODULE_STATUS_PLUGGED_DISABLED_E = 0x4, + SXD_PMPE_MODULE_STATUS_UNKNOWN_E = 0x5, + + Returns: + str: sonic status of the module + """ + status = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/status') + return SDK_STATUS_TO_SONIC_STATUS[status] + + def get_hw_present(self): + """Get hardware present status, only applicable on host management mode + + Returns: + bool: True if module is in the cage + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present') == 1 + + def get_power_on(self): + """Get power on status, only applicable on host management mode + + Returns: + bool: True if the module is powered on + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on') == 1 + + def set_power(self, on): + """Control the power of this module, only applicable on host management mode + + Args: + on (bool): True if on + """ + value = 1 if on else 0 + 
utils.write_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on', value) + + def get_reset_state(self): + """Get reset state of this module, only applicable on host management mode + + Returns: + bool: True if module is not in reset status + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset') == 1 + + def set_hw_reset(self, value): + """Set the module reset status + + Args: + value (int): 1 for reset, 0 for leaving reset + """ + utils.write_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset', value) + + def get_power_good(self): + """Get power good status of this module, only applicable on host management mode + + Returns: + bool: True if the power is in good status + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good') == 1 + + def set_control_type(self, control_type): + """Set control type for the module + + Args: + control_type (int): 0 for firmware control, currently only 0 is allowed + """ + utils.write_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control', control_type) + + def get_control_type(self): + """Get control type of this module + + Returns: + int: 0 - firmware control, 1 - software control + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control_type') + + def determine_control_type(self): + """Determine control type according to module type + + Returns: + enum: software control or firmware control + """ + api = self.get_xcvr_api() + if not api: + logger.log_error(f'Failed to get api object for SFP {self.sdk_index}, probably module EEPROM is not ready') + return SFP_FW_CONTROL + + if not self.is_supported_for_software_control(api): + return SFP_FW_CONTROL + else: + return SFP_SW_CONTROL + + def is_cmis_api(self, xcvr_api): + """Check if the api type is CMIS + + Args: + xcvr_api (object): xcvr api object + + Returns: + bool: True if the api is of type CMIS + """ + return 
def get_module_max_power(self):
    """Get module max power from EEPROM

    For a CMIS api the raw byte read at CMIS_MAX_POWER_OFFSET is returned as is.
    For a SFF api the power class field is decoded to a value in watts (through
    SFF_POWER_CLASS_MAPPING, or through the dedicated max power byte when the
    power class 8 indicator is set) and then multiplied by 4.

    Returns:
        int: max power in terms of 0.25W (may be a float for SFF power class 8).
             Return -1 if EEPROM data cannot be read or is incorrect; a negative
             value is treated as a failure by check_power_capability.
    """
    xcvr_api = self.get_xcvr_api()
    if self.is_cmis_api(xcvr_api):
        powercap_raw = self.read_eeprom(CMIS_MAX_POWER_OFFSET, 1)
        # NOTE(review): read_eeprom is assumed to yield None/empty data when the
        # EEPROM is not readable - guard before indexing instead of crashing.
        if not powercap_raw:
            logger.log_error(f'SFP {self.sdk_index} failed to read max power from EEPROM')
            return -1
        return powercap_raw[0]
    elif self.is_sff_api(xcvr_api):
        power_class_raw = self.read_eeprom(SFF_POWER_CLASS_OFFSET, 1)
        if not power_class_raw:
            logger.log_error(f'SFP {self.sdk_index} failed to read power class from EEPROM')
            return -1

        power_class_bit = power_class_raw[0] & SFF_POWER_CLASS_MASK
        if power_class_bit in SFF_POWER_CLASS_MAPPING:
            powercap = SFF_POWER_CLASS_MAPPING[power_class_bit]
        elif power_class_bit == SFF_POWER_CLASS_8_INDICATOR:
            # According to standard:
            # Byte 128:
            #     if bit 5 is 1, "Power Class 8 implemented (Max power declared in byte 107)"
            # Byte 107:
            #     "Maximum power consumption of module. Unsigned integer with LSB = 0.1 W."
            power_class_8_byte = self.read_eeprom(SFF_POWER_CLASS_8_OFFSET, 1)
            if not power_class_8_byte:
                logger.log_error(f'SFP {self.sdk_index} failed to read power class 8 max power from EEPROM')
                return -1
            powercap = power_class_8_byte[0] * 0.1
        else:
            logger.log_error(f'SFP {self.sdk_index} got invalid value for power class field: {power_class_bit}')
            return -1

        # powercap is in watts here. Multiplying it by 4 converts it to 0.25 Watt
        # units, the unit documented for the sysfs power limit, so that both values
        # are in the same unit for a meaningful comparison.
        return powercap * 4
    else:
        # Should never hit, just in case
        logger.log_error(f'SFP {self.sdk_index} with api type {xcvr_api} does not support getting max power')
        return -1
def get_frequency_support(self):
    """Tell whether this module reports frequency support.

    Returns:
        bool: True if the ``frequency_support`` sysfs node of this module reads 1
    """
    sysfs_path = f'/sys/module/sx_core/asic0/module{self.sdk_index}/frequency_support'
    support_flag = utils.read_int_from_file(sysfs_path)
    return support_flag == 1
def disable_tx_for_sff_optics(self):
    """Disable TX for SFF optics

    Nothing is done unless the module api is of SFF type and the api reports
    that TX disable is supported.
    """
    xcvr_api = self.get_xcvr_api()

    # Only SFF modules are handled here
    if not self.is_sff_api(xcvr_api):
        return

    # Skip modules that cannot disable TX
    if not xcvr_api.get_tx_disable_support():
        return

    logger.log_info(f'Disabling tx for SFP {self.sdk_index}')
    xcvr_api.tx_disable(True)
@classmethod
def action_on_start(cls, sfp):
    """State machine action that decides the first event for a module.

    Checks, in order, hardware presence, power-on status and reset state of
    the module and raises exactly one follow-up event on it.

    Args:
        sfp (object): SFP object the action is executed for
    """
    # Absent module: nothing else to check
    if not sfp.get_hw_present():
        logger.log_info(f'SFP {sfp.sdk_index} is not present')
        sfp.on_event(EVENT_NOT_PRESENT)
        return

    # Present but not powered: power it on and go through reset
    if not sfp.get_power_on():
        logger.log_info(f'SFP {sfp.sdk_index} is not powered on')
        sfp.set_power(True)
        sfp.set_hw_reset(1)
        sfp.on_event(EVENT_RESET)
        return

    # Powered and get_reset_state() is True: report power on directly
    if sfp.get_reset_state():
        sfp.on_event(EVENT_POWER_ON)
        return

    # Powered but get_reset_state() is False: write hw_reset and go through reset
    logger.log_info(f'SFP {sfp.sdk_index} is in reset state')
    sfp.set_hw_reset(1)
    sfp.on_event(EVENT_RESET)
@classmethod
def get_wait_ready_task(cls):
    """Return the class-wide SFP wait ready task, creating it on first use.

    Returns:
        object: an instance of WaitSfpReadyTask
    """
    existing_task = cls.wait_ready_task
    if existing_task:
        return existing_task

    # Imported lazily, only when the task actually has to be created
    from .wait_sfp_ready_task import WaitSfpReadyTask
    cls.wait_ready_task = WaitSfpReadyTask()
    return cls.wait_ready_task
def refresh_poll_obj(self, poll_obj, all_registered_fds):
    """Refresh polling object and registered fds. This function is usually called when a cable plugin
    event occurs. For example, user plugs out a software control module and replaces with a firmware
    control cable. In such case, poll_obj was polling "hw_present" and "power_good" for software control,
    and it needs to be changed to poll "present" for new control type which is firmware control.

    Args:
        poll_obj (object): poll object
        all_registered_fds (dict): fds that have been registered to poll object. Key is the file
            number, value is a tuple (sdk_index, file object, poll type). Updated in place.
    """
    # find fds registered by this SFP, re-keyed by poll type: {poll_type: (fileno, file object)}
    current_registered_fds = {item[2]: (fileno, item[1]) for fileno, item in all_registered_fds.items() if item[0] == self.sdk_index}
    logger.log_debug(f'SFP {self.sdk_index} registered fds are: {current_registered_fds}')
    if self.state == STATE_FW_CONTROL:
        target_poll_types = ['present']
    else:
        target_poll_types = ['hw_present', 'power_good']

    for target_poll_type in target_poll_types:
        if target_poll_type not in current_registered_fds:
            # need add new fd for polling
            logger.log_debug(f'SFP {self.sdk_index} is registering file descriptor: {target_poll_type}')
            fd = self.get_fd(target_poll_type)
            poll_obj.register(fd, select.POLLERR | select.POLLPRI)
            all_registered_fds[fd.fileno()] = (self.sdk_index, fd, target_poll_type)
        else:
            # the fd is already in polling, keep it
            current_registered_fds.pop(target_poll_type)

    # Whatever is left is no longer needed for the current state
    for _, item in current_registered_fds.items():
        # Deregister poll, close fd
        logger.log_debug(f'SFP {self.sdk_index} is de-registering file descriptor: {item}')
        # unregister directly on the poll object; it has no nested "poll_obj" attribute
        poll_obj.unregister(item[1])
        all_registered_fds.pop(item[0])
        item[1].close()
@classmethod
def initialize_sfp_modules(cls, sfp_list):
    """Drive every module through its state machine at start up. Only applicable
    when module host management is enabled

    Args:
        sfp_list (object): all sfps
    """
    ready_task = cls.get_wait_ready_task()
    ready_task.start()

    # Kick off the state machine of each module
    for module in sfp_list:
        module.on_event(EVENT_START)

    if not ready_task.empty():
        # Wait until wait_ready_task is up
        while not ready_task.is_alive():
            pass

        # Resetting SFP requires a reloading of module firmware, it takes up to 3 seconds
        # according to standard
        max_wait_time = 3.5
        started_at = time.time()
        keep_waiting = True
        while keep_waiting:
            # Modules returned by get_ready_set() get the reset done event
            for sfp_index in ready_task.get_ready_set():
                s = sfp_list[sfp_index]
                logger.log_debug(f'SFP {sfp_index} is recovered from resetting state')
                s.on_event(EVENT_RESET_DONE)

            if time.time() - started_at < max_wait_time:
                time.sleep(0.5)
            else:
                keep_waiting = False

    # Verify that all modules are in a stable state
    for index, s in enumerate(sfp_list):
        if not s.in_stable_state():
            logger.log_error(f'SFP {index} is not in stable state after initializing, state={s.state}')
        logger.log_notice(f'SFP {index} is in state {s.state} after module initialization')
class State:
    """Represent a state in a state machine

    A state has a name, an optional entry action, an optional leave action and a
    transition table mapping an event name to the name of the next state.
    """
    def __init__(self, name):
        self.name = name
        self.entry_action = None
        self.leave_action = None
        # event name -> next state name
        self.transitions = {}

    def set_entry_action(self, action_name):
        """Set an action when entering this state

        Args:
            action_name (str): action name

        Returns:
            object: self
        """
        self.entry_action = action_name
        return self

    def set_leave_action(self, action_name):
        """Set a leave action when leaving the state

        Args:
            action_name (str): action name

        Returns:
            object: self
        """
        self.leave_action = action_name
        return self

    def add_transition(self, event, next_state):
        """Add a transition item to this state

        Args:
            event (str): event name
            next_state (str): next state that the state entity will transit to upon this event.

        Raises:
            RuntimeError: raise if the event is already in the transition table

        Returns:
            object: self
        """
        if event in self.transitions:
            raise RuntimeError(f'event {event} already exists in state {self.name}')

        self.transitions[event] = next_state
        return self

    def on_enter(self, entity):
        """Called when state entity enters the state

        Args:
            entity (obj): state entity
        """
        if self.entry_action:
            logger.log_debug(f'{entity} entered state [{self.name}] and is triggering action [{self.entry_action}]')
            entity.on_action(self.entry_action)
        else:
            logger.log_debug(f'{entity} entered state [{self.name}]')

    def on_leave(self, entity):
        """Called when state entity leaves the state

        Args:
            entity (obj): state entity
        """
        if self.leave_action:
            entity.on_action(self.leave_action)

    def on_event(self, event):
        """Called when state entity has got an event

        Args:
            event (str): event name

        Returns:
            str: next state name. An event that has no transition defined in this
                 state is logged as an error and keeps the current state.
        """
        if event not in self.transitions:
            logger.log_error(f'{event} is not defined in state {self.name}')
            return self.name
        else:
            return self.transitions[event]


class StateMachine:
    """A table of named states that drives state entities on events

    A state entity is any object providing get_state(), change_state(name) and
    on_action(action_name).
    """
    def __init__(self):
        # state name -> State object
        self.states = {}

    def add_state(self, state_name):
        """Register a state to state machine

        Args:
            state_name (str): name of the state

        Raises:
            RuntimeError: raise if state name already exists

        Returns:
            object: the new state object
        """
        if state_name in self.states:
            raise RuntimeError(f'state {state_name} already exists')

        state = State(state_name)
        self.states[state_name] = state
        return state

    def on_event(self, entity, event):
        """Called when an event occurs

        If the event leads to a different state, the leave action of the current
        state runs first, then the entity state is changed, then the entry action
        of the next state runs. Nothing runs when the state does not change.

        Args:
            entity (object): state entity
            event (str): event name

        Raises:
            RuntimeError: raise if the current state is not registered
            RuntimeError: raise if next state is not registered
        """
        current_state_name = entity.get_state()
        if current_state_name not in self.states:
            raise RuntimeError(f'Unknown state {current_state_name}')

        current_state = self.states[current_state_name]
        next_state_name = current_state.on_event(event)
        # Log the state name; the State object itself has no readable string form
        logger.log_debug(f'{entity} has got event [{event}], it is changing from state [{current_state_name}] to [{next_state_name}]')
        if next_state_name not in self.states:
            raise RuntimeError(f'Unknown next state {next_state_name}')
        if next_state_name != current_state_name:
            current_state.on_leave(entity)
            entity.change_state(next_state_name)
            self.states[next_state_name].on_enter(entity)
# Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -81,10 +81,6 @@ def load_tc_config(self): def start(self): self.clean_thermal_data() - if not self.wait_all_sfp_ready(): - logger.log_error('Failed to wait for all SFP ready, will put hw-management-tc to suspend') - self.control_tc(True) - return self.control_tc(False) self.load_tc_config() self._timer.start() @@ -106,25 +102,6 @@ def clean_thermal_data(self): sfp.sdk_index + 1 ) - def wait_all_sfp_ready(self): - logger.log_notice('Waiting for all SFP modules ready...') - max_wait_time = 300 - ready_set = set() - while len(ready_set) != len(self._sfp_list): - for sfp in self._sfp_list: - try: - sfp.is_sw_control() - ready_set.add(sfp) - except: - continue - max_wait_time -= 1 - if max_wait_time == 0: - return False - time.sleep(1) - - logger.log_notice('All SFP modules are ready') - return True - def get_asic_temp(self): temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None) return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py b/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py index a7354ac7b864..77aad4a315c7 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. +# Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. 
class WaitSfpReadyTask(threading.Thread):
    """When bring a module from powered off to powered on, it takes 3 seconds
    for module to load its firmware. This class is designed to perform a wait for
    those modules who are loading firmware.
    """
    WAIT_TIME = 3

    def __init__(self):
        # Set daemon to True so that the thread will be destroyed when daemon exits.
        super().__init__(daemon=True)
        self.running = False

        # Lock to protect the wait list and the ready set
        self.lock = threading.Lock()

        # Event to wake up thread function
        self.event = threading.Event()

        # SFPs to be waited. Key is SFP index, value is the expire time.
        self._wait_dict = {}

        # Indexes of those SFPs who finish loading firmware.
        self._ready_set = set()

    def stop(self):
        """Stop the task, only used in unit test
        """
        self.running = False
        # Wake up the thread function in case it is blocked on the event
        self.event.set()

    def schedule_wait(self, sfp_index):
        """Add a SFP to the wait list

        Args:
            sfp_index (int): the index of the SFP object
        """
        logger.log_debug(f'SFP {sfp_index} is scheduled for waiting reset done')
        with self.lock:
            # Always assign the flag: the thread function only blocks on the event
            # while the wait list is empty, so only that case needs a wake up.
            is_empty = not self._wait_dict

            # The item will be expired in 3 seconds
            self._wait_dict[sfp_index] = time.time() + self.WAIT_TIME

        if is_empty:
            logger.log_debug('An item arrives, wake up WaitSfpReadyTask')
            # wake up the thread
            self.event.set()

    def cancel_wait(self, sfp_index):
        """Cancel a SFP from the wait list

        Args:
            sfp_index (int): the index of the SFP object
        """
        logger.log_debug(f'SFP {sfp_index} is canceled for waiting reset done')
        with self.lock:
            self._wait_dict.pop(sfp_index, None)
            # set.pop() accepts no argument; discard removes the given index if present
            self._ready_set.discard(sfp_index)

    def get_ready_set(self):
        """Get ready set and clear it

        Returns:
            set: a copy of self._ready_set. The caller owns the returned set, it is
                 never shared with this task or with other callers.
        """
        with self.lock:
            ready_set = set(self._ready_set)
            self._ready_set.clear()
        return ready_set

    def empty(self):
        """Indicate if wait_dict is empty

        Returns:
            bool: True if wait_dict is empty
        """
        with self.lock:
            return len(self._wait_dict) == 0

    def run(self):
        """Thread function

        Blocks on the event while nothing is pending. Otherwise, once per second,
        moves every SFP whose expire time has passed from the wait list to the
        ready set.
        """
        self.running = True
        pending_remove_set = set()
        is_empty = True
        while self.running:
            if is_empty:
                logger.log_debug('WaitSfpReadyTask is waiting for task...')
                # If wait_dict is empty, hold the thread until an item coming
                self.event.wait()
                self.event.clear()

            now = time.time()
            with self.lock:
                logger.log_debug(f'Processing wait SFP dict: {self._wait_dict}, now={now}')
                for sfp_index, expire_time in self._wait_dict.items():
                    # If now time is greater than the expire time, remove
                    # the item from wait_dict
                    if now >= expire_time:
                        pending_remove_set.add(sfp_index)

                # Removal is done in a second pass because a dict cannot be
                # changed while it is being iterated
                for sfp_index in pending_remove_set:
                    self._wait_dict.pop(sfp_index)
                    self._ready_set.add(sfp_index)

                is_empty = (len(self._wait_dict) == 0)

            pending_remove_set.clear()
            time.sleep(1)
class TestChangeEvent:
    """Unit tests for Chassis.get_change_event in both legacy mode and module
    host management mode.

    The mode is selected by patching DeviceDataManager.is_module_host_management_mode,
    which is the method Chassis consults to pick the change event implementation.
    """
    @mock.patch('sonic_platform.sfp.SFP.get_fd_for_polling_legacy')
    @mock.patch('select.poll')
    @mock.patch('time.time')
    @mock.patch('sonic_platform.device_data.DeviceDataManager.is_module_host_management_mode', mock.MagicMock(return_value=False))
    @mock.patch('sonic_platform.device_data.DeviceDataManager.get_sfp_count', mock.MagicMock(return_value=1))
    @mock.patch('sonic_platform.chassis.extract_RJ45_ports_index', mock.MagicMock(return_value=[]))
    @mock.patch('sonic_platform.sfp.SFP.get_module_status')
    def test_get_change_event_legacy(self, mock_status, mock_time, mock_create_poll, mock_get_fd):
        """Test steps:
            1. Simulate polling with no event
            2. Simulate polling the first dummy event
            3. Simulate a plug out event
            4. Simulate an error event
        """
        c = chassis.Chassis()
        s = c.get_sfp(1)

        mock_status.return_value = sfp.SFP_STATUS_INSERTED

        # mock poll object
        mock_poll = mock.MagicMock()
        mock_create_poll.return_value = mock_poll
        mock_poll.poll = mock.MagicMock(return_value = [])

        # mock file descriptor for polling
        mock_file = mock.MagicMock()
        mock_get_fd.return_value = mock_file
        mock_file.fileno = mock.MagicMock(return_value = 1)

        timeout = 1000
        # mock time function so that the while loop exit early
        mock_time.side_effect = [0, timeout]

        # no event, expect returning empty change event
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and not change_event['sfp']

        # dummy event, expect returning empty change event
        sfp_index = s.sdk_index + 1
        mock_poll.poll.return_value = [(1, 10)]
        mock_time.side_effect = [0, timeout]
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and not change_event['sfp']

        # plug out event, expect returning remove event
        mock_time.side_effect = [0, timeout]
        mock_status.return_value = sfp.SFP_STATUS_REMOVED
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == sfp.SFP_STATUS_REMOVED

        # error event, expect returning error event
        mock_time.side_effect = [0, timeout]
        mock_status.return_value = sfp.SFP_STATUS_ERROR
        s.get_error_info_from_sdk_error_type = mock.MagicMock(return_value=('2', 'some error'))
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == '2'
        assert 'sfp_error' in change_event and sfp_index in change_event['sfp_error'] and change_event['sfp_error'][sfp_index] == 'some error'

    @mock.patch('sonic_platform.sfp.SFP.get_fd')
    @mock.patch('select.poll')
    @mock.patch('time.time')
    @mock.patch('sonic_platform.device_data.DeviceDataManager.is_module_host_management_mode', mock.MagicMock(return_value=True))
    @mock.patch('sonic_platform.device_data.DeviceDataManager.get_sfp_count', mock.MagicMock(return_value=1))
    @mock.patch('sonic_platform.chassis.extract_RJ45_ports_index', mock.MagicMock(return_value=[]))
    @mock.patch('sonic_platform.module_host_mgmt_initializer.ModuleHostMgmtInitializer.initialize', mock.MagicMock())
    def test_get_change_event_for_module_host_management_mode(self, mock_time, mock_create_poll, mock_get_fd):
        """Test steps:
            1. Simulate polling with no event
            2. Simulate polling the first dummy event. (SDK always return a event when first polling the fd even if there is no change)
            3. Simulate a plug out event, module transfer from sw control to not present
            4. Simulate plugging in a fw control module, module transfer to fw control
            5. Simulate an error event
            6. Simulate a plug out event, module transfer from fw control to not present
            7. Simulate plugging in a sw control module, module transfer to sw control
            8. Simulate a power bad event, module transfer from sw control to power bad
            9. Simulate a power good event, module transfer from power bad to sw control
        """
        c = chassis.Chassis()
        c.initialize_sfp()
        s = c._sfp_list[0]
        s.state = sfp.STATE_SW_CONTROL

        # mock poll object
        mock_poll = mock.MagicMock()
        mock_create_poll.return_value = mock_poll
        mock_poll.poll = mock.MagicMock(return_value = [])

        # mock file descriptors for polling
        mock_hw_present_file = mock.MagicMock()
        mock_power_good_file = mock.MagicMock()
        mock_present_file = mock.MagicMock()
        mock_hw_present_file.read = mock.MagicMock(return_value=sfp.SFP_STATUS_INSERTED)
        mock_hw_present_file.fileno = mock.MagicMock(return_value = 1)
        mock_power_good_file.read = mock.MagicMock(return_value=1)
        mock_power_good_file.fileno = mock.MagicMock(return_value = 2)
        mock_present_file.read = mock.MagicMock(return_value=sfp.SFP_STATUS_INSERTED)
        mock_present_file.fileno = mock.MagicMock(return_value = 3)
        def get_fd(fd_type):
            if fd_type == 'hw_present':
                return mock_hw_present_file
            elif fd_type == 'power_good':
                return mock_power_good_file
            else:
                return mock_present_file
        mock_get_fd.side_effect = get_fd

        timeout = 1000
        # mock time function so that the while loop exit early
        mock_time.side_effect = [0, timeout]

        # no event, expect returning empty change event
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and not change_event['sfp']

        # dummy event, expect returning empty change event
        sfp_index = s.sdk_index + 1
        mock_poll.poll.return_value = [(1, 10)]
        mock_time.side_effect = [0, timeout]
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and not change_event['sfp']

        # plug out event, expect returning remove event
        mock_time.side_effect = [0, timeout]
        mock_hw_present_file.read.return_value = sfp.SFP_STATUS_REMOVED
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == sfp.SFP_STATUS_REMOVED
        assert s.state == sfp.STATE_NOT_PRESENT

        # plug in with a fw control cable, expect returning insert event
        s.get_hw_present = mock.MagicMock(return_value=True)
        s.get_power_on = mock.MagicMock(return_value=True)
        s.get_reset_state = mock.MagicMock(return_value=True)
        s.get_power_good = mock.MagicMock(return_value=True)
        s.determine_control_type = mock.MagicMock(return_value=sfp.SFP_FW_CONTROL)
        s.set_control_type = mock.MagicMock()
        mock_time.side_effect = [0, timeout]
        mock_hw_present_file.read.return_value = sfp.SFP_STATUS_INSERTED
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == sfp.SFP_STATUS_INSERTED
        assert s.state == sfp.STATE_FW_CONTROL
        assert 1 not in c.registered_fds # stop polling hw_present
        assert 2 not in c.registered_fds # stop polling power_good
        assert 3 in c.registered_fds # start polling present because it is firmware control

        # error event, expect returning error
        mock_time.side_effect = [0, timeout]
        mock_poll.poll.return_value = [(3, 10)]
        mock_present_file.read.return_value = sfp.SFP_STATUS_ERROR
        s.get_error_info_from_sdk_error_type = mock.MagicMock(return_value=('2', 'some error'))
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == '2'
        assert 'sfp_error' in change_event and sfp_index in change_event['sfp_error'] and change_event['sfp_error'][sfp_index] == 'some error'

        # plug out the firmware control cable, expect returning remove event
        mock_time.side_effect = [0, timeout]
        mock_present_file.read.return_value = sfp.SFP_STATUS_REMOVED
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == sfp.SFP_STATUS_REMOVED
        assert s.state == sfp.STATE_NOT_PRESENT
        assert 1 in c.registered_fds # start polling hw_present because cable is not present, always assume software control
        assert 2 in c.registered_fds # start polling power_good because cable is not present, always assume software control
        assert 3 not in c.registered_fds # stop polling present

        # plug in a software control cable, expect returning insert event
        mock_time.side_effect = [0, timeout]
        mock_poll.poll.return_value = [(1, 10)]
        mock_hw_present_file.read.return_value = sfp.SFP_STATUS_INSERTED
        s.determine_control_type.return_value = sfp.SFP_SW_CONTROL
        s.check_power_capability = mock.MagicMock(return_value=True)
        s.update_i2c_frequency = mock.MagicMock()
        s.disable_tx_for_sff_optics = mock.MagicMock()
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == sfp.SFP_STATUS_INSERTED
        assert s.state == sfp.STATE_SW_CONTROL

        # power bad event, expect returning error event
        mock_time.side_effect = [0, timeout]
        mock_poll.poll.return_value = [(2, 10)]
        mock_power_good_file.read.return_value = '0'
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == '5'
        assert s.state == sfp.STATE_POWER_BAD

        # power good event, expect returning insert event
        mock_time.side_effect = [0, timeout]
        mock_poll.poll.return_value = [(2, 10)]
        mock_power_good_file.read.return_value = '1'
        _, change_event = c.get_change_event(timeout)
        assert 'sfp' in change_event and sfp_index in change_event['sfp'] and change_event['sfp'][sfp_index] == '1'
        assert s.state == sfp.STATE_SW_CONTROL
a/platform/mellanox/mlnx-platform-api/tests/test_chassis.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_chassis.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. +# Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -124,6 +124,7 @@ def test_fan(self): chassis._fan_drawer_list = [] assert chassis.get_num_fan_drawers() == 2 + @mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode', mock.MagicMock(return_value=False)) def test_sfp(self): # Test get_num_sfps, it should not create any SFP objects DeviceDataManager.get_sfp_count = mock.MagicMock(return_value=3) @@ -169,6 +170,7 @@ def test_sfp(self): assert len(sfp_list) == 3 assert chassis.sfp_initialized_count == 3 + @mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode', mock.MagicMock(return_value=False)) def test_create_sfp_in_multi_thread(self): DeviceDataManager.get_sfp_count = mock.MagicMock(return_value=3) @@ -192,25 +194,6 @@ def test_create_sfp_in_multi_thread(self): assert s.sdk_index == index iteration_num -= 1 - - @mock.patch('sonic_platform.device_data.DeviceDataManager.get_sfp_count', MagicMock(return_value=3)) - def test_change_event(self): - chassis = Chassis() - chassis.modules_mgmt_thread.is_alive = MagicMock(return_value=True) - chassis.modules_changes_queue.get = MagicMock(return_value={1: '1'}) - - # Call get_change_event with timeout=0, wait until an event is detected - status, event_dict = chassis.get_change_event() - assert status is True - assert 'sfp' in event_dict and event_dict['sfp'][1] == '1' - assert len(chassis._sfp_list) == 3 - - # Call get_change_event with timeout=1.0 - chassis.modules_changes_queue.get.return_value = {} - status, event_dict = chassis.get_change_event(timeout=1.0) - assert status is True - assert 'sfp' in event_dict and not event_dict['sfp'] - 
@mock.patch('sonic_platform.chassis.Chassis._wait_reboot_cause_ready', MagicMock(return_value=True)) def test_reboot_cause(self): from sonic_platform import utils diff --git a/platform/mellanox/mlnx-platform-api/tests/test_device_data.py b/platform/mellanox/mlnx-platform-api/tests/test_device_data.py index c172b82a30b7..35179d925861 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_device_data.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_device_data.py @@ -56,9 +56,9 @@ def test_get_bios_component(self): @mock.patch('sonic_platform.device_data.utils.read_key_value_file') def test_is_independent_mode(self, mock_read): mock_read.return_value = {} - assert not DeviceDataManager.is_independent_mode() + assert not DeviceDataManager.is_module_host_management_mode() mock_read.return_value = {'SAI_INDEPENDENT_MODULE_MODE': '1'} - assert DeviceDataManager.is_independent_mode() + assert DeviceDataManager.is_module_host_management_mode() @mock.patch('sonic_py_common.device_info.get_path_to_platform_dir', mock.MagicMock(return_value='/tmp')) @mock.patch('sonic_platform.device_data.utils.load_json_file') diff --git a/platform/mellanox/mlnx-platform-api/tests/test_module_initializer.py b/platform/mellanox/mlnx-platform-api/tests/test_module_initializer.py new file mode 100644 index 000000000000..ad833a70f85c --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_module_initializer.py @@ -0,0 +1,98 @@ +# +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys

if sys.version_info.major == 3:
    from unittest import mock
else:
    import mock

test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

from sonic_platform import chassis
from sonic_platform import module_host_mgmt_initializer


class TestModuleInitializer:
    """Tests for ``module_host_mgmt_initializer.ModuleHostMgmtInitializer``."""

    @mock.patch('os.path.exists')
    @mock.patch('sonic_platform.utils.wait_until')
    @mock.patch('sonic_platform.utils.is_host')
    def test_wait_module_ready(self, mock_is_host, mock_wait, mock_exists):
        """Exercise wait_module_ready() on host/container with the ready file present/absent."""
        mhm = module_host_mgmt_initializer
        target = mhm.ModuleHostMgmtInitializer()

        # Host side, ready file not there yet, waiting succeeds.
        mock_wait.return_value = True
        mock_exists.return_value = False
        mock_is_host.return_value = True
        target.wait_module_ready()
        mock_exists.assert_called_with(mhm.MODULE_READY_HOST_FILE)
        assert target.initialized

        # Container side: the container flavour of the ready file is checked.
        target.initialized = False
        mock_is_host.return_value = False
        target.wait_module_ready()
        mock_exists.assert_called_with(mhm.MODULE_READY_CONTAINER_FILE)

        # Ready file already exists.
        target.initialized = False
        mock_exists.return_value = True
        target.wait_module_ready()
        assert target.initialized

        # Ready file missing and the wait gives up: must stay uninitialized.
        target.initialized = False
        mock_exists.return_value = False
        mock_wait.return_value = False
        target.wait_module_ready()
        assert not target.initialized

    @mock.patch('sonic_platform.chassis.extract_RJ45_ports_index', mock.MagicMock(return_value=[]))
    @mock.patch('sonic_platform.device_data.DeviceDataManager.get_sfp_count', mock.MagicMock(return_value=1))
    @mock.patch('sonic_platform.sfp.SFP.initialize_sfp_modules', mock.MagicMock())
    @mock.patch('sonic_platform.module_host_mgmt_initializer.ModuleHostMgmtInitializer.is_initialization_owner')
    @mock.patch('sonic_platform.module_host_mgmt_initializer.ModuleHostMgmtInitializer.wait_module_ready')
    @mock.patch('sonic_platform.utils.is_host')
    def test_initialize(self, mock_is_host, mock_wait_ready, mock_owner):
        """initialize() only waits unless the caller is the initialization owner."""
        mhm = module_host_mgmt_initializer
        ready_file = mhm.MODULE_READY_CONTAINER_FILE
        chassis_obj = chassis.Chassis()
        target = mhm.ModuleHostMgmtInitializer()

        # Invoked on the host and not the owner: wait for readiness only.
        mock_owner.return_value = False
        mock_is_host.return_value = True
        target.initialize(chassis_obj)
        mock_wait_ready.assert_called_once()
        mock_wait_ready.reset_mock()

        # Invoked inside a container, still not the owner: wait only.
        mock_is_host.return_value = False
        target.initialize(chassis_obj)
        mock_wait_ready.assert_called_once()
        mock_wait_ready.reset_mock()

        # The owner performs the initialization itself and publishes the ready file.
        mock_owner.return_value = True
        target.initialize(chassis_obj)
        mock_wait_ready.assert_not_called()
        assert target.initialized
        assert mhm.initialization_owner
        assert os.path.exists(ready_file)

        # clean_up() is expected to remove the ready file again.
        mhm.clean_up()
        assert not os.path.exists(ready_file)

    def test_is_initialization_owner(self):
        """Inside the unit-test process the initializer must not report itself as owner."""
        target = module_host_mgmt_initializer.ModuleHostMgmtInitializer()
        assert not target.is_initialization_owner()


# Patch metadata of the next file in the flattened diff, kept verbatim:
# diff --git a/platform/mellanox/mlnx-platform-api/tests/test_sfp.py b/platform/mellanox/mlnx-platform-api/tests/test_sfp.py
# index 499983a01e15..32489d94ecd1 100644
# --- a/platform/mellanox/mlnx-platform-api/tests/test_sfp.py
# +++ b/platform/mellanox/mlnx-platform-api/tests/test_sfp.py
# @@ -1,5 +1,5 @@
#  #
# -# Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES.
# +# Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -61,10 +61,7 @@ def test_sfp_index(self, mock_max_port): @mock.patch('sonic_platform.chassis.Chassis.get_num_sfps', mock.MagicMock(return_value=2)) @mock.patch('sonic_platform.chassis.extract_RJ45_ports_index', mock.MagicMock(return_value=[])) def test_sfp_get_error_status(self, mock_get_error_code, mock_control): - chassis = Chassis() - - # Fetch an SFP module to test - sfp = chassis.get_sfp(1) + sfp = SFP(1) mock_control.return_value = False description_dict = sfp._get_error_description_dict() for error in description_dict.keys(): @@ -230,18 +227,14 @@ def test_get_page_and_page_offset(self, mock_get_type_str, mock_eeprom_path, moc assert page == '/tmp/1/data' assert page_offset is 0 - @mock.patch('sonic_platform.sfp.SFP.is_sw_control') @mock.patch('sonic_platform.sfp.SFP._read_eeprom') - def test_sfp_get_presence(self, mock_read, mock_control): + def test_sfp_get_presence(self, mock_read): sfp = SFP(0) mock_read.return_value = None assert not sfp.get_presence() mock_read.return_value = 0 assert sfp.get_presence() - - mock_control.side_effect = RuntimeError('') - assert not sfp.get_presence() @mock.patch('sonic_platform.utils.read_int_from_file') def test_rj45_get_presence(self, mock_read_int): @@ -343,28 +336,14 @@ def test_get_temperature_threshold(self): assert sfp.get_temperature_warning_threshold() == 75.0 assert sfp.get_temperature_critical_threshold() == 85.0 - @mock.patch('sonic_platform.sfp.NvidiaSFPCommon.get_logical_port_by_sfp_index') @mock.patch('sonic_platform.utils.read_int_from_file') @mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode') - @mock.patch('sonic_platform.utils.DbUtils.get_db_instance') - def test_is_sw_control(self, mock_get_db, mock_mode, mock_read, mock_get_logical): + def test_is_sw_control(self, mock_mode, mock_read): sfp = SFP(0) mock_mode.return_value = False assert not sfp.is_sw_control() mock_mode.return_value = True - 
mock_get_logical.return_value = None - with pytest.raises(Exception): - sfp.is_sw_control() - - mock_get_logical.return_value = 'Ethernet0' - mock_db = mock.MagicMock() - mock_get_db.return_value = mock_db - mock_db.exists = mock.MagicMock(return_value=False) - with pytest.raises(Exception): - sfp.is_sw_control() - - mock_db.exists.return_value = True mock_read.return_value = 0 assert not sfp.is_sw_control() mock_read.return_value = 1 @@ -437,3 +416,115 @@ def test_set_lpmode_cmis_host_mangagement(self, mock_control): mock_control.return_value = False assert not sfp.set_lpmode(True) assert not sfp.set_lpmode(False) + + def test_determine_control_type(self): + sfp = SFP(0) + sfp.get_xcvr_api = mock.MagicMock(return_value=None) + assert sfp.determine_control_type() == 0 + + sfp.get_xcvr_api.return_value = 1 # Just make it not None + sfp.is_supported_for_software_control = mock.MagicMock(return_value=True) + assert sfp.determine_control_type() == 1 + + sfp.is_supported_for_software_control.return_value = False + assert sfp.determine_control_type() == 0 + + def test_check_power_capability(self): + sfp = SFP(0) + sfp.get_module_max_power = mock.MagicMock(return_value=-1) + assert not sfp.check_power_capability() + + sfp.get_module_max_power.return_value = 48 + sfp.get_power_limit = mock.MagicMock(return_value=48) + assert sfp.check_power_capability() + + sfp.get_power_limit.return_value = 1 + assert not sfp.check_power_capability() + + def test_get_module_max_power(self): + sfp = SFP(0) + sfp.is_cmis_api = mock.MagicMock(return_value=True) + sfp.read_eeprom = mock.MagicMock(return_value=bytearray([48])) + assert sfp.get_module_max_power() == 48 + + sfp.is_cmis_api.return_value = False + sfp.is_sff_api = mock.MagicMock(return_value=True) + sfp.read_eeprom.return_value = bytearray([128]) + assert sfp.get_module_max_power() == 2.5 * 4 + + sfp.read_eeprom.return_value = bytearray([32]) + assert sfp.get_module_max_power() == 3.2 * 4 + + # Simulate invalid value + 
sfp.read_eeprom.return_value = bytearray([33]) + assert sfp.get_module_max_power() == -1 + + # Simulate unsupported module type + sfp.is_sff_api .return_value = False + assert sfp.get_module_max_power() == -1 + + def test_update_i2c_frequency(self): + sfp = SFP(0) + sfp.get_frequency_support = mock.MagicMock(return_value=False) + sfp.set_frequency = mock.MagicMock() + sfp.update_i2c_frequency() + sfp.set_frequency.assert_not_called() + + sfp.get_frequency_support.return_value = True + sfp.update_i2c_frequency() + sfp.set_frequency.assert_not_called() + + sfp.is_cmis_api = mock.MagicMock(return_value=True) + sfp.read_eeprom = mock.MagicMock(return_value=bytearray([0])) + sfp.update_i2c_frequency() + sfp.set_frequency.assert_called_with(0) + + sfp.is_cmis_api.return_value = False + sfp.is_sff_api = mock.MagicMock(return_value=True) + sfp.update_i2c_frequency() + sfp.set_frequency.assert_called_with(0) + + def test_disable_tx_for_sff_optics(self): + sfp = SFP(0) + mock_api = mock.MagicMock() + sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api) + mock_api.tx_disable = mock.MagicMock() + sfp.disable_tx_for_sff_optics() + mock_api.tx_disable.assert_not_called() + + sfp.is_sff_api = mock.MagicMock(return_value=True) + mock_api.get_tx_disable_support = mock.MagicMock(return_value=True) + sfp.disable_tx_for_sff_optics() + mock_api.tx_disable.assert_called_with(True) + + @mock.patch('sonic_platform.utils.read_int_from_file') + def test_get_error_info_from_sdk_error_type(self, mock_read): + sfp = SFP(0) + # Unknown error + mock_read.return_value = -1 + sfp_state, error_desc = sfp.get_error_info_from_sdk_error_type() + assert sfp_state == '2' + assert 'Unknown error' in error_desc + + mock_read.return_value = 2 + sfp_state, error_desc = sfp.get_error_info_from_sdk_error_type() + assert sfp_state == '11' + assert error_desc is None + + @mock.patch('sonic_platform.chassis.extract_RJ45_ports_index', mock.MagicMock(return_value=[])) + 
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_sfp_count', mock.MagicMock(return_value=1)) + def test_initialize_sfp_modules(self): + c = Chassis() + c.initialize_sfp() + s = c._sfp_list[0] + s.get_hw_present = mock.MagicMock(return_value=True) + s.get_power_on = mock.MagicMock(return_value=False) + s.get_reset_state = mock.MagicMock(return_value=True) + s.get_power_good = mock.MagicMock(return_value=True) + s.determine_control_type = mock.MagicMock(return_value=1) # software control + s.set_control_type = mock.MagicMock() + SFP.initialize_sfp_modules(c._sfp_list) + assert s.in_stable_state() + SFP.wait_ready_task.stop() + SFP.wait_ready_task.join() + SFP.wait_ready_task = None diff --git a/platform/mellanox/mlnx-platform-api/tests/test_sfp_sm.py b/platform/mellanox/mlnx-platform-api/tests/test_sfp_sm.py new file mode 100644 index 000000000000..684fa3af11f8 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_sfp_sm.py @@ -0,0 +1,156 @@ +# +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import os
import sys
if sys.version_info.major == 3:
    from unittest import mock
else:
    import mock

test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

from sonic_platform import sfp
from sonic_platform import utils

origin_read = utils.read_from_file
origin_write = utils.write_file


class TestSfpStateMachine:
    """Drive the SFP state machine against an in-memory stand-in for sysfs.

    ``utils.read_from_file`` / ``utils.write_file`` are swapped for the
    class-level dictionary ``mock_file_content`` for the lifetime of the
    class. The dictionary is shared by all tests and is never cleared.
    """

    PATH_PREFIX = '/sys/module/sx_core/asic0/module0'
    mock_file_content = {}

    @classmethod
    def setup_class(cls):
        """Redirect the file helpers in ``utils`` to the in-memory store."""
        utils.read_from_file, utils.write_file = cls.mock_read, cls.mock_write

    @classmethod
    def teardown_class(cls):
        """Put the real file helpers back."""
        utils.read_from_file, utils.write_file = origin_read, origin_write

    @classmethod
    def _sysfs_path(cls, file_name):
        """Return the full key used in the store for a module0 attribute file."""
        return '{}/{}'.format(cls.PATH_PREFIX, file_name)

    @classmethod
    def _preset(cls, **attrs):
        """Store several module0 attribute values, in keyword order."""
        for attr_name, attr_value in attrs.items():
            cls.mock_value(attr_name, attr_value)

    @classmethod
    def mock_value(cls, file_name, value):
        """Set the content of attribute file *file_name* under PATH_PREFIX."""
        cls.mock_file_content[cls._sysfs_path(file_name)] = value

    @classmethod
    def get_value(cls, file_name):
        """Return the content of attribute file *file_name* (KeyError if never written)."""
        return cls.mock_file_content[cls._sysfs_path(file_name)]

    @classmethod
    def mock_write(cls, file_path, value, *args, **kwargs):
        """Replacement for ``utils.write_file``: remember *value* under *file_path*."""
        cls.mock_file_content[file_path] = value

    @classmethod
    def mock_read(cls, file_path, *args, **kwargs):
        """Replacement for ``utils.read_from_file``: return what was stored for *file_path*."""
        return cls.mock_file_content[file_path]

    def test_no_hw_present(self):
        """Without hw_present the module ends up in NOT_PRESENT."""
        self._preset(hw_present=0)
        module = sfp.SFP(0)
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == sfp.STATE_NOT_PRESENT

    def test_not_powered(self):
        """A present but unpowered module is powered on, reset and queued for waiting."""
        self._preset(hw_present=1, power_on=0)
        module = sfp.SFP(0)
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == sfp.STATE_RESETTING
        assert self.get_value('power_on') == 1
        assert self.get_value('hw_reset') == 1
        assert 0 in sfp.SFP.get_wait_ready_task()._wait_dict
        # Drop the pending wait so it does not leak into the following tests.
        sfp.SFP.get_wait_ready_task()._wait_dict.pop(0)

    def test_in_reset_state(self):
        """A module held in reset is released; removal cancels the pending wait."""
        self._preset(hw_present=1, power_on=1, hw_reset=0)
        module = sfp.SFP(0)
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == sfp.STATE_RESETTING
        assert self.get_value('hw_reset') == 1
        assert 0 in sfp.SFP.get_wait_ready_task()._wait_dict
        module.on_event(sfp.EVENT_NOT_PRESENT)
        assert module.get_state() == sfp.STATE_NOT_PRESENT
        assert 0 not in sfp.SFP.get_wait_ready_task()._wait_dict

    def test_reset_done(self):
        """RESET_DONE moves a resetting firmware-control module to FW_CONTROL."""
        self._preset(hw_present=1, power_on=1, hw_reset=0, power_good=1)
        module = sfp.SFP(0)
        module.determine_control_type = mock.MagicMock(return_value=sfp.SFP_FW_CONTROL)
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == sfp.STATE_RESETTING
        module.on_event(sfp.EVENT_RESET_DONE)
        assert module.get_state() == sfp.STATE_FW_CONTROL

    def test_no_power_good(self):
        """power_good == 0 yields POWER_BAD; removal then yields NOT_PRESENT."""
        self._preset(hw_present=1, power_on=1, hw_reset=1, power_good=0)
        module = sfp.SFP(0)
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == sfp.STATE_POWER_BAD
        module.on_event(sfp.EVENT_NOT_PRESENT)
        assert module.get_state() == sfp.STATE_NOT_PRESENT

    def test_fw_control(self):
        """A healthy firmware-control module reaches FW_CONTROL and records the control type."""
        self._preset(hw_present=1, power_on=1, hw_reset=1, power_good=1)
        module = sfp.SFP(0)
        module.determine_control_type = mock.MagicMock(return_value=sfp.SFP_FW_CONTROL)
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == sfp.STATE_FW_CONTROL
        assert self.get_value('control') == sfp.SFP_FW_CONTROL

    def test_power_exceed(self):
        """Failing the power capability check powers the module down (POWER_LIMIT_ERROR)."""
        self._preset(hw_present=1, power_on=1, hw_reset=1, power_good=1)
        module = sfp.SFP(0)
        module.determine_control_type = mock.MagicMock(return_value=sfp.SFP_SW_CONTROL)
        module.check_power_capability = mock.MagicMock(return_value=False)
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == sfp.STATE_POWER_LIMIT_ERROR
        assert self.get_value('power_on') == 0
        assert self.get_value('hw_reset') == 0
        module.on_event(sfp.EVENT_NOT_PRESENT)
        assert module.get_state() == sfp.STATE_NOT_PRESENT

    def test_sw_control(self):
        """A healthy software-control module within the power budget reaches SW_CONTROL."""
        self._preset(hw_present=1, power_on=1, hw_reset=1, power_good=1)
        module = sfp.SFP(0)
        module.determine_control_type = mock.MagicMock(return_value=sfp.SFP_SW_CONTROL)
        module.check_power_capability = mock.MagicMock(return_value=True)
        module.update_i2c_frequency = mock.MagicMock()
        module.disable_tx_for_sff_optics = mock.MagicMock()
        module.on_event(sfp.EVENT_START)
        assert module.get_state() == sfp.STATE_SW_CONTROL


# Patch metadata and header of the next file in the flattened diff, kept verbatim:
# \ No newline at end of file
# diff --git a/platform/mellanox/mlnx-platform-api/tests/test_statemachine.py b/platform/mellanox/mlnx-platform-api/tests/test_statemachine.py
# new file mode 100644
# index 000000000000..f2193a6866d4
# --- /dev/null
# +++ b/platform/mellanox/mlnx-platform-api/tests/test_statemachine.py
# @@ -0,0 +1,137 @@
#
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import pytest
import sys

from mock import MagicMock
if sys.version_info.major == 3:
    from unittest import mock
else:
    import mock

test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

from sonic_platform import state_machine

STATE_DOWN = 'Down'
STATE_INIT = 'Initializing'
STATE_UP = 'Up'

ACTION_LEAVE_DOWN = 'Leave Down'
ACTION_INIT = 'Initializing'
ACTION_UP = 'Up'

EVENT_START = 'Start'
EVENT_INIT_DONE = 'Initialize Done'
EVENT_STOP = 'Stop'


class StateEntity:
    """Minimal entity handed to the state machine; records every action it is told to run."""

    def __init__(self):
        self.state = STATE_DOWN
        # Most recent action name, and the full history in call order.
        self.current_action = None
        self.triggered_actions = []

    def get_state(self):
        """Return the current state name."""
        return self.state

    def change_state(self, new_state):
        """Store *new_state* as the current state."""
        self.state = new_state

    def on_event(self, event):
        """Accept an event and do nothing with it."""
        pass

    def on_action(self, action_name):
        """Remember *action_name* as the latest action and append it to the history."""
        self.triggered_actions.append(action_name)
        self.current_action = action_name


class TestStateMachine:
    """Tests for ``state_machine.StateMachine`` using a Down -> Initializing -> Up machine."""

    sm = None

    @classmethod
    def setup_class(cls):
        """Build the shared machine once for all tests of the class."""
        machine = state_machine.StateMachine()

        down = machine.add_state(STATE_DOWN)
        down.set_leave_action(ACTION_LEAVE_DOWN) \
            .add_transition(EVENT_START, STATE_INIT)

        initializing = machine.add_state(STATE_INIT)
        initializing.set_entry_action(ACTION_INIT) \
            .add_transition(EVENT_INIT_DONE, STATE_UP) \
            .add_transition(EVENT_STOP, STATE_DOWN)

        up = machine.add_state(STATE_UP)
        up.set_entry_action(ACTION_UP) \
            .add_transition(EVENT_STOP, STATE_DOWN)

        cls.sm = machine

    def test_state_machine(self):
        """Walk the full life cycle and check states and triggered actions."""
        entity = StateEntity()

        # Down --Start--> Initializing: leave action first, then entry action.
        self.sm.on_event(entity, EVENT_START)
        assert entity.triggered_actions == [ACTION_LEAVE_DOWN, ACTION_INIT]
        assert entity.get_state() == STATE_INIT

        # Initializing --Initialize Done--> Up.
        self.sm.on_event(entity, EVENT_INIT_DONE)
        assert entity.current_action == ACTION_UP
        assert entity.get_state() == STATE_UP

        # Up --Stop--> Down.
        self.sm.on_event(entity, EVENT_STOP)
        assert entity.get_state() == STATE_DOWN

        # Start immediately followed by Stop lands back in Down.
        self.sm.on_event(entity, EVENT_START)
        self.sm.on_event(entity, EVENT_STOP)
        assert entity.get_state() == STATE_DOWN

        # Down has no Stop transition; the event must be ignored.
        self.sm.on_event(entity, EVENT_STOP)
        assert entity.get_state() == STATE_DOWN

    def test_unknown_state(self):
        """An entity sitting in a state the machine does not know raises RuntimeError."""
        entity = StateEntity()
        entity.state = 'unknown'
        with pytest.raises(RuntimeError):
            self.sm.on_event(entity, EVENT_START)

    def test_duplicate_state(self):
        """Registering the same state twice raises RuntimeError."""
        machine = state_machine.StateMachine()
        machine.add_state(STATE_DOWN)
        with pytest.raises(RuntimeError):
            machine.add_state(STATE_DOWN)

    def test_duplicate_transition(self):
        """Registering the same event twice on one state raises RuntimeError."""
        machine = state_machine.StateMachine()
        with pytest.raises(RuntimeError):
            machine.add_state(STATE_DOWN) \
                .add_transition(EVENT_START, STATE_INIT) \
                .add_transition(EVENT_START, STATE_INIT)

    def test_unknown_transition_target(self):
        """A transition to an unregistered state is accepted at build time but fails when taken."""
        machine = state_machine.StateMachine()
        machine.add_state(STATE_DOWN) \
            .add_transition(EVENT_START, 'unknown')

        entity = StateEntity()
        with pytest.raises(RuntimeError):
            machine.on_event(entity, EVENT_START)


# Patch metadata of the next file in the flattened diff, kept verbatim:
# \ No newline at end of file
# diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py
# index 8e7509ce9b69..c135395c363b 100644
# --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py
# +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py
# @@ -1,5 +1,5 @@
#  #
# -# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -61,10 +61,8 @@ def test_load_tc_config_mocked(self): @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_asic', mock.MagicMock()) @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.update_module', mock.MagicMock()) - @mock.patch('sonic_platform.thermal_updater.ThermalUpdater.wait_all_sfp_ready') @mock.patch('sonic_platform.utils.write_file') - def test_start_stop(self, mock_write, mock_wait): - mock_wait.return_value = True + def test_start_stop(self, mock_write): mock_sfp = mock.MagicMock() mock_sfp.sdk_index = 1 updater = ThermalUpdater([mock_sfp]) @@ -77,21 +75,6 @@ def test_start_stop(self, mock_write, mock_wait): assert not updater._timer.is_alive() mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1) - mock_wait.return_value = False - mock_write.reset_mock() - updater.start() - mock_write.assert_called_once_with('/run/hw-management/config/suspend', 1) - updater.stop() - - @mock.patch('sonic_platform.thermal_updater.time.sleep', mock.MagicMock()) - def test_wait_all_sfp_ready(self): - mock_sfp = mock.MagicMock() - mock_sfp.is_sw_control = mock.MagicMock(return_value=True) - updater = ThermalUpdater([mock_sfp]) - assert updater.wait_all_sfp_ready() - mock_sfp.is_sw_control.side_effect = Exception('') - assert not updater.wait_all_sfp_ready() - @mock.patch('sonic_platform.utils.read_int_from_file') def test_update_asic(self, mock_read): mock_read.return_value = 8 diff --git a/platform/mellanox/mlnx-platform-api/tests/test_wait_sfp_ready_task.py b/platform/mellanox/mlnx-platform-api/tests/test_wait_sfp_ready_task.py new file mode 100644 index 000000000000..16e361f09327 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_wait_sfp_ready_task.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import sys + +if sys.version_info.major == 3: + from unittest import mock +else: + import mock + +test_path = os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.dirname(test_path) +sys.path.insert(0, modules_path) + +from sonic_platform import wait_sfp_ready_task +from sonic_platform import utils + + +class TestWaitSfpReadyTask: + def test_schedule(self): + task = wait_sfp_ready_task.WaitSfpReadyTask() + task.schedule_wait(0) + assert not task.empty() + task.cancel_wait(0) + assert task.empty() + + def test_run(self): + task = wait_sfp_ready_task.WaitSfpReadyTask() + task.WAIT_TIME = 1 # Fast the test + task.start() + task.schedule_wait(0) + assert utils.wait_until(lambda: 0 in task.get_ready_set(), 4, 0.5), 'sfp does not reach ready in 4 seconds' + assert 0 not in task._wait_dict + assert len(task._ready_set) == 0 + task.stop() + task.join()