From c7d3ded70ca2be3bffb71f48ed9409a5fe73d49b Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Tue, 9 Apr 2024 09:40:21 +0300 Subject: [PATCH] [Mellanox] run module initialization when any SFP related API is called --- .../sonic_platform/chassis.py | 244 +++++- .../module_host_mgmt_initializer.py | 84 +++ .../mlnx-platform-api/sonic_platform/sfp.py | 703 +++++++++++++++++- .../sonic_platform/statemachine.py | 141 ++++ .../sonic_platform/thermal_updater.py | 23 - .../sonic_platform/wait_sfp_ready_task.py | 109 +++ 6 files changed, 1210 insertions(+), 94 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/module_host_mgmt_initializer.py create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/statemachine.py create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/wait_sfp_ready_task.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 5870d7e6b602..86cec7aff341 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -28,13 +28,13 @@ import os from functools import reduce from .utils import extract_RJ45_ports_index + from . import module_host_mgmt_initializer from . import utils from .device_data import DeviceDataManager import re - import queue + import select import threading import time - from sonic_platform import modules_mgmt except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -132,9 +132,9 @@ def __init__(self): Chassis.chassis_instance = self - self.modules_mgmt_thread = threading.Thread() - self.modules_changes_queue = queue.Queue() - self.modules_mgmt_task_stopping_event = threading.Event() + self.module_host_mgmt_initializer = module_host_mgmt_initializer.ModuleHostMgmtInitializer() + self.poll_obj = None + self.registered_fds = None logger.log_info("Chassis loaded successfully") @@ -338,8 +338,11 @@ def get_all_sfps(self): Returns: A list of objects derived from SfpBase representing all sfps available on this chassis - """ - self.initialize_sfp() + """ + if DeviceDataManager.is_independent_mode(): + self.module_host_mgmt_initializer.initialize(self) + else: + self.initialize_sfp() return self._sfp_list def get_sfp(self, index): @@ -356,7 +359,10 @@ def get_sfp(self, index): An object dervied from SfpBase representing the specified sfp """ index = index - 1 - self.initialize_single_sfp(index) + if DeviceDataManager.is_independent_mode(): + self.module_host_mgmt_initializer.initialize(self) + else: + self.initialize_single_sfp(index) return super(Chassis, self).get_sfp(index) def get_port_or_cage_type(self, index): @@ -406,42 +412,218 @@ def get_change_event(self, timeout=0): indicates that fan 0 has been removed, fan 2 has been inserted and sfp 11 has been removed. """ - if not self.modules_mgmt_thread.is_alive(): - # open new SFP change events thread - self.modules_mgmt_thread = modules_mgmt.ModulesMgmtTask(q=self.modules_changes_queue - , main_thread_stop_event = self.modules_mgmt_task_stopping_event) - # Set the thread as daemon so when pmon/xcvrd are shutting down, modules_mgmt will shut down immedietly. - self.modules_mgmt_thread.daemon = True - self.modules_mgmt_thread.start() - self.initialize_sfp() - wait_for_ever = (timeout == 0) + if DeviceDataManager.is_independent_mode(): + self.module_host_mgmt_initializer.initialize(self) + return self.get_change_event_for_module_host_management_mode(timeout) + else: + self.initialize_sfp() + return self.get_change_event_legacy(timeout) + + def get_change_event_for_module_host_management_mode(self, timeout): + """Get SFP change event when module host management mode is enabled. + + Args: + timeout: Timeout in milliseconds (optional). If timeout == 0, + this method will block until a change is detected. + + Returns: + (bool, dict): + - True if call successful, False if not; - Deprecated, will always return True + - A nested dictionary where key is a device type, + value is a dictionary with key:value pairs in the format of + {'device_id':'device_event'}, + where device_id is the device ID for this device and + device_event, + status='1' represents device inserted, + status='0' represents device removed. + Ex. {'fan':{'0':'0', '2':'1'}, 'sfp':{'11':'0'}} + indicates that fan 0 has been removed, fan 2 + has been inserted and sfp 11 has been removed. + """ + if not self.poll_obj: + # SDK always sent event for the first time polling. Such event should not be sent to xcvrd. + # Store SFP state before first time polling so that we can detect dummy event. + self.sfp_states_before_first_poll = {} + self.poll_obj = select.poll() + self.registered_fds = {} + for s in self.get_all_sfps(): + fds = s.get_fds_for_poling() + for fd_type, fd in fds.items(): + self.poll_obj.register(fd, select.POLLERR | select.POLLPRI) + self.registered_fds[fd.fileno()] = (s.sdk_index, fd, fd_type) + self.sfp_states_before_first_poll[s.sdk_index] = s.state + + logger.log_debug(f'Registered SFP file descriptors for polling: {self.registered_fds}') + + from . import sfp + + wait_forever = (timeout == 0) + # poll timeout should be no more than 1000ms to ensure fast shutdown flow + timeout = 1000.0 if timeout >= 1000 else float(timeout) + port_dict = {} + error_dict = {} + begin = time.time() + wait_ready_task = sfp.SFP.get_wait_ready_task() + + while True: + fds_events = self.poll_obj.poll(timeout) + for fileno, _ in fds_events: + if fileno not in self.registered_fds: + logger.log_error(f'Unknown file no {fileno} from poll event, registered files are {self.registered_fds}') + continue + + sfp_index, fd, fd_type = self.registered_fds[fileno] + s = self._sfp_list[sfp_index] + fd_value = int(fd.read().strip()) + + if sfp_index in self.sfp_states_before_first_poll: + # Detecting dummy event + sfp_state_before_first_poll = self.sfp_states_before_first_poll[sfp_index] + self.sfp_states_before_first_poll.pop(sfp_index) + if s.is_dummy_event(sfp_state_before_first_poll, fd_type, fd_value): + # Ignore dummy event for the first poll, assume SDK only provide 1 dummy event + logger.log_debug(f'Ignore dummy event {fd_type}:{fd_value} for SFP {sfp_index}') + continue + + logger.log_notice(f'Got SFP event: index={sfp_index}, type={fd_type}, value={fd_value}') + if fd_type == 'hw_present': + # event could be EVENT_NOT_PRESENT or EVENT_PRESENT + event = sfp.EVENT_NOT_PRESENT if fd_value == 0 else sfp.EVENT_PRESENT + s.on_event(event) + elif fd_type == 'present': + if str(fd_value) == sfp.SFP_STATUS_ERROR: + # FW control cable got an error, no need trigger state machine + sfp_status, error_desc = s.get_error_info_from_sdk_error_type() + port_dict[sfp_index + 1] = sfp_status + if error_desc: + error_dict[sfp_index + 1] = error_desc + continue + elif str(fd_value) == sfp.SFP_STATUS_INSERTED: + # FW control cable got present, only case is that the cable is recovering + # from an error. FW control cable has no transition from "Not Present" to "Present" + # because "Not Present" cable is always "software control" and should always poll + # hw_present sysfs instead of present sysfs. + port_dict[sfp_index + 1] = sfp.SFP_STATUS_INSERTED + continue + else: + s.on_event(sfp.EVENT_NOT_PRESENT) + else: + # event could be EVENT_POWER_GOOD or EVENT_POWER_BAD + event = sfp.EVENT_POWER_BAD if fd_value == 0 else sfp.EVENT_POWER_GOOD + s.on_event(event) + + if s.in_stable_state(): + s.fill_change_event(port_dict) + s.refresh_poll_obj(self.poll_obj, self.registered_fds) + else: + logger.log_debug(f'SFP {sfp_index} does not reach stable state, state={s.state}') + + ready_sfp_set = wait_ready_task.get_ready_set() + for sfp_index in ready_sfp_set: + s = self._sfp_list[sfp_index] + s.on_event(sfp.EVENT_RESET_DONE) + if s.in_stable_state(): + s.fill_change_event(port_dict) + s.refresh_poll_obj(self.poll_obj, self.registered_fds) + else: + logger.log_error(f'SFP {sfp_index} failed to reach stable state, state={s.state}') + + if port_dict: + logger.log_notice(f'Sending SFP change event: {port_dict}, error event: {error_dict}') + self.reinit_sfps(port_dict) + return True, { + 'sfp': port_dict, + 'sfp_error': error_dict + } + else: + if not wait_forever: + elapse = time.time() - begin + if elapse * 1000 >= timeout: + return True, {'sfp': {}} + + def get_change_event_legacy(self, timeout): + """Get SFP change event when module host management is disabled. + + Args: + timeout (int): polling timeout in ms + + Returns: + (bool, dict): + - True if call successful, False if not; - Deprecated, will always return True + - A nested dictionary where key is a device type, + value is a dictionary with key:value pairs in the format of + {'device_id':'device_event'}, + where device_id is the device ID for this device and + device_event, + status='1' represents device inserted, + status='0' represents device removed. + Ex. {'fan':{'0':'0', '2':'1'}, 'sfp':{'11':'0'}} + indicates that fan 0 has been removed, fan 2 + has been inserted and sfp 11 has been removed. + """ + if not self.poll_obj: + self.poll_obj = select.poll() + self.registered_fds = {} + # SDK always sent event for the first time polling. Such event should not be sent to xcvrd. + # Store SFP state before first time polling so that we can detect dummy event. + self.sfp_states_before_first_poll = {} + for s in self.get_all_sfps(): + fd = s.get_fd_for_polling_legacy() + self.poll_obj.register(fd, select.POLLERR | select.POLLPRI) + self.registered_fds[fd.fileno()] = (s.sdk_index, fd) + self.sfp_states_before_first_poll[s.sdk_index] = s.get_present_value() + + logger.log_debug(f'Registered SFP file descriptors for polling: {self.registered_fds}') + + from . import sfp + + wait_forever = (timeout == 0) # poll timeout should be no more than 1000ms to ensure fast shutdown flow timeout = 1000.0 if timeout >= 1000 else float(timeout) port_dict = {} error_dict = {} begin = time.time() - i = 0 + while True: - try: - logger.log_info(f'get_change_event() trying to get changes from queue on iteration {i}') - port_dict = self.modules_changes_queue.get(timeout=timeout / 1000) - logger.log_info(f'get_change_event() iteration {i} port_dict: {port_dict}') - except queue.Empty: - logger.log_info(f"failed to get item from modules changes queue on itertaion {i}") + fds_events = self.poll_obj.poll(timeout) + for fileno, _ in fds_events: + if fileno not in self.registered_fds: + logger.log_error(f'Unknown file no {fileno} from poll event, registered files are {self.registered_fds}') + continue + + sfp_index, fd = self.registered_fds[fileno] + fd.seek(0) + sfp_status = fd.read().strip() + + if sfp_index in self.sfp_states_before_first_poll: + # Detecting dummy event + sfp_state_before_poll = self.sfp_states_before_first_poll[sfp_index] + self.sfp_states_before_first_poll.pop(sfp_index) + if sfp_state_before_poll == sfp_status: + # Ignore dummy event for the first poll, assume SDK only provide 1 dummy event + logger.log_debug(f'Ignore dummy event {sfp_status} for SFP {sfp_index}') + continue + + logger.log_notice(f'Got SFP event: index={sfp_index}, value={sfp_status}') + if sfp_status == sfp.SFP_STATUS_ERROR: + s = self._sfp_list[sfp_index] + sfp_status, error_desc = s.get_error_info_from_sdk_error_type() + if error_desc: + error_dict[sfp_index + 1] = error_desc + port_dict[sfp_index + 1] = sfp_status if port_dict: + logger.log_notice(f'Sending SFP change event: {port_dict}, error event: {error_dict}') self.reinit_sfps(port_dict) - result_dict = {'sfp': port_dict} - result_dict['sfp_error'] = error_dict - return True, result_dict + return True, { + 'sfp': port_dict, + 'sfp_error': error_dict + } else: - if not wait_for_ever: + if not wait_forever: elapse = time.time() - begin - logger.log_info(f"get_change_event: wait_for_ever {wait_for_ever} elapse {elapse} iteartion {i}") if elapse * 1000 >= timeout: - logger.log_info(f"elapse {elapse} > timeout {timeout} iteartion {i} returning empty dict") return True, {'sfp': {}} - i += 1 def reinit_sfps(self, port_dict): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/module_host_mgmt_initializer.py b/platform/mellanox/mlnx-platform-api/sonic_platform/module_host_mgmt_initializer.py new file mode 100644 index 000000000000..fe21156599ae --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/module_host_mgmt_initializer.py @@ -0,0 +1,84 @@ +from .device_data import DeviceDataManager +from . import utils +from sonic_py_common.logger import Logger + +import atexit +import os +import sys +import threading + +MODULE_READY_MAX_WAIT_TIME = 300 +MODULE_READY_CHECK_INTERVAL = 5 +MODULE_READY_CONTAINER_FILE = '/tmp/module_host_mgmt_ready' +MODULE_READY_HOST_FILE = '/tmp/nv-syncd-shared/module_host_mgmt_ready' +DEDICATE_INIT_DAEMON = 'xcvrd' +initialization_owner = False + +logger = Logger() + + +class ModuleHostMgmtInitializer: + def __init__(self): + self.initialized = False + self.lock = threading.Lock() + + def initialize(self, chassis): + global initialization_owner + if self.initialized: + return + + if utils.is_host(): + self.wait_module_ready() + else: + cmd = os.path.basename(sys.argv[0]) + if DEDICATE_INIT_DAEMON in cmd: + if not self.initialized: + with self.lock: + if not self.initialized: + logger.log_notice('Starting module initialization for module host management...') + initialization_owner = True + self.remove_module_ready_file() + + chassis.initialize_sfp() + + from .sfp import SFP + SFP.initialize_sfp_modules(chassis._sfp_list) + + self.create_module_ready_file() + self.initialized = True + logger.log_notice('Module initialization for module host management done') + else: + self.wait_module_ready() + + @classmethod + def create_module_ready_file(cls): + with open(MODULE_READY_CONTAINER_FILE, 'w'): + pass + + @classmethod + def remove_module_ready_file(cls): + if os.path.exists(MODULE_READY_CONTAINER_FILE): + os.remove(MODULE_READY_CONTAINER_FILE) + + def wait_module_ready(self): + if utils.is_host(): + module_ready_file = MODULE_READY_HOST_FILE + else: + module_ready_file = MODULE_READY_CONTAINER_FILE + + if os.path.exists(module_ready_file): + self.initialized = True + return + else: + print('Waiting module to be initialized...') + + if utils.wait_until(os.path.exists, MODULE_READY_MAX_WAIT_TIME, MODULE_READY_CHECK_INTERVAL, module_ready_file): + self.initialized = True + else: + logger.log_error('Module initialization timeout', True) + + +@atexit.register +def clean_up(): + if initialization_owner: + ModuleHostMgmtInitializer.remove_module_ready_file() diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py index 90462e9ed0fe..de0180a51b1b 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py @@ -24,16 +24,18 @@ try: import ctypes + import select import subprocess import os import threading + import time from sonic_py_common.logger import Logger from sonic_py_common.general import check_output_pipe from . import utils from .device_data import DeviceDataManager from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase from sonic_platform_base.sonic_xcvr.fields import consts - from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436 + from sonic_platform_base.sonic_xcvr.api.public import cmis, sff8636, sff8436 except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -127,7 +129,22 @@ CPU_MASK = PORT_TYPE_MASK & (PORT_TYPE_CPU << PORT_TYPE_OFFSET) # parameters for SFP presence +SFP_STATUS_REMOVED = '0' SFP_STATUS_INSERTED = '1' +SFP_STATUS_ERROR = '2' + +# SDK error definitions begin + +# SFP errors that will block eeprom accessing +SDK_SFP_BLOCKING_ERRORS = [ + 0x2, # SFP.SFP_ERROR_BIT_I2C_STUCK, + 0x3, # SFP.SFP_ERROR_BIT_BAD_EEPROM, + 0x5, # SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE, + 0x6, # SFP.SFP_ERROR_BIT_HIGH_TEMP, + 0x7, # SFP.SFP_ERROR_BIT_BAD_CABLE +] + +# SDK error definitions end # SFP constants SFP_PAGE_SIZE = 256 # page size of page0h @@ -162,6 +179,53 @@ SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0 SFP_TEMPERATURE_SCALE = 8.0 +# Module host management definitions begin +SFP_SW_CONTROL = 1 +SFP_FW_CONTROL = 0 + +POWER_CLASS_1_MAX_POWER = 1.5 +POWER_CLASS_2_MAX_POWER = 2 +POWER_CLASS_3_MAX_POWER = 2.5 +POWER_CLASS_4_MAX_POWER = 3.5 +POWER_CLASS_5_MAX_POWER = 4 +POWER_CLASS_6_MAX_POWER = 4.5 +POWER_CLASS_7_MAX_POWER = 5 +POWER_CLASS_INVALID = -1 + +CMIS_MCI_EEPROM_OFFSET = 2 +CMIS_MCI_MASK = 0b00001100 + +STATE_DOWN = 'Down' # Initial state +STATE_INIT = 'Initializing' # Module starts initializing, check module present, also power on the module if need +STATE_RESETTING = 'Resetting' # Module is resetting the firmware +STATE_POWERED_ON = 'Power On' # Module is powered on, module firmware has been loaded, check module power is in good state +STATE_SW_CONTROL = 'Software Control' # Module is under software control +STATE_FW_CONTROL = 'Firmware Control' # Module is under firmware control +STATE_POWER_BAD = 'Power Bad' # Module power_good returns 0 +STATE_POWER_LIMIT_ERROR = 'Exceed Power Limit' # Module power exceeds cage power limit +STATE_NOT_PRESENT = 'Not Present' # Module is not present + +EVENT_START = 'Start' +EVENT_NOT_PRESENT = 'Not Present' +EVENT_RESET = 'Reset' +EVENT_POWER_ON = 'Power On' +EVENT_RESET_DONE = 'Reset Done' +EVENT_POWER_BAD = 'Power Bad' +EVENT_SW_CONTROL = 'Software Control' +EVENT_FW_CONTROL = 'Firmware Control' +EVENT_POWER_LIMIT_EXCEED = 'Power Limit Exceed' +EVENT_POWER_GOOD = 'Power Good' +EVENT_PRESENT = 'Present' + +ACTION_ON_START = 'On Start' +ACTION_ON_RESET = 'On Reset' +ACTION_ON_POWERED = 'On Powered' +ACTION_ON_SW_CONTROL = 'On Software Control' +ACTION_ON_FW_CONTROL = 'On Firmware Control' +ACTION_ON_POWER_LIMIT_ERROR = 'On Power Limit Error' +ACTION_ON_NOT_PRESENT = 'On Not Present' +# Module host management definitions end + # SFP EEPROM limited bytes limited_eeprom = { SFP_TYPE_CMIS: { @@ -252,30 +316,6 @@ def _get_module_info(self, sdk_index): return oper_state, error_type - @classmethod - def get_sfp_index_to_logical_port(cls, force=False): - if not cls.sfp_index_to_logical_port_dict or force: - config_db = utils.DbUtils.get_db_instance('CONFIG_DB') - port_data = config_db.get_table('PORT') - for key, data in port_data.items(): - if data['index'] not in cls.sfp_index_to_logical_port_dict: - cls.sfp_index_to_logical_port_dict[int(data['index']) - 1] = key - - @classmethod - def get_logical_port_by_sfp_index(cls, sfp_index): - with cls.sfp_index_to_logical_lock: - cls.get_sfp_index_to_logical_port() - logical_port_name = cls.sfp_index_to_logical_port_dict.get(sfp_index) - if not logical_port_name: - cls.get_sfp_index_to_logical_port(force=True) - else: - config_db = utils.DbUtils.get_db_instance('CONFIG_DB') - current_index = int(config_db.get('CONFIG_DB', f'PORT|{logical_port_name}', 'index')) - if current_index != sfp_index: - cls.get_sfp_index_to_logical_port(force=True) - logical_port_name = cls.sfp_index_to_logical_port_dict.get(sfp_index) - return logical_port_name - class SFP(NvidiaSFPCommon): """Platform-specific SFP class""" @@ -285,12 +325,43 @@ class SFP(NvidiaSFPCommon): SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED = 'PMD type not enabled' SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED = 'PCIE system power slot exceeded' SFP_MLNX_ERROR_DESCRIPTION_RESERVED = 'Reserved' + + SDK_ERRORS_TO_DESCRIPTION = { + 0x1: SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE, + 0x4: SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST, + 0x8: SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED, + 0xc: SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED + } SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE = 0x00010000 SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST = 0x00020000 SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED = 0x00040000 SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED = 0x00080000 SFP_MLNX_ERROR_BIT_RESERVED = 0x80000000 + + SDK_ERRORS_TO_ERROR_BITS = { + 0x0: SfpOptoeBase.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED, + 0x1: SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE, + 0x2: SfpOptoeBase.SFP_ERROR_BIT_I2C_STUCK, + 0x3: SfpOptoeBase.SFP_ERROR_BIT_BAD_EEPROM, + 0x4: SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST, + 0x5: SfpOptoeBase.SFP_ERROR_BIT_UNSUPPORTED_CABLE, + 0x6: SfpOptoeBase.SFP_ERROR_BIT_HIGH_TEMP, + 0x7: SfpOptoeBase.SFP_ERROR_BIT_BAD_CABLE, + 0x8: SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED, + 0xc: SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED + } + + # Class level state machine object, only applicable for module host management + sm = None + + # Class level wait SFP ready task, the task waits for module to load its firmware after resetting, + # only applicable for module host management + wait_ready_task = None + + # Class level action table which stores the mapping from action name to action function, + # only applicable for module host management + action_table = None def __init__(self, sfp_index, sfp_type=None, slot_id=0, linecard_port_count=0, lc_name=None): super(SFP, self).__init__(sfp_index) @@ -311,6 +382,11 @@ def __init__(self, sfp_index, sfp_type=None, slot_id=0, linecard_port_count=0, l self.slot_id = slot_id self._sfp_type_str = None + # SFP state, only applicable for module host management + self.state = STATE_DOWN + + def __str__(self): + return f'SFP {self.sdk_index}' def reinit(self): """ @@ -327,10 +403,6 @@ def get_presence(self): Returns: bool: True if device is present, False if not """ - try: - self.is_sw_control() - except: - return False eeprom_raw = self._read_eeprom(0, 1, log_on_error=False) return eeprom_raw is not None @@ -745,6 +817,31 @@ def get_error_description(self): else: error_description = "Unknow SFP module status ({})".format(oper_status) return error_description + + def get_error_info_from_sdk_error_type(self): + """Translate SDK error type to SONiC error state and error description. Only calls + when sysfs "present" returns "2". + + Returns: + tuple: (error state, error description) + """ + error_type = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/statuserror') + sfp_state_bits = SFP.SDK_ERRORS_TO_ERROR_BITS.get(error_type) + if sfp_state_bits is None: + logger.log_error(f"Unrecognized error {error_type} detected on SFP {self.sdk_index}") + return SFP_STATUS_ERROR, "Unknown error ({})".format(error_type) + + if error_type in SDK_SFP_BLOCKING_ERRORS: + # In SFP at error status case, need to overwrite the sfp_state with the exact error code + sfp_state_bits |= SFP.SFP_ERROR_BIT_BLOCKING + + # An error should be always set along with 'INSERTED' + sfp_state_bits |= SFP.SFP_STATUS_BIT_INSERTED + + # For vendor specific errors, the description should be returned as well + error_description = SFP.SDK_ERRORS_TO_DESCRIPTION.get(error_type) + sfp_state = str(sfp_state_bits) + return sfp_state, error_description def _get_eeprom_path(self): return SFP_EEPROM_ROOT_TEMPLATE.format(self.sdk_index) @@ -978,22 +1075,548 @@ def get_xcvr_api(self): def is_sw_control(self): if not DeviceDataManager.is_independent_mode(): return False - - db = utils.DbUtils.get_db_instance('STATE_DB') - logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index) - if not logical_port: - raise Exception(f'Module {self.sdk_index} is not present or under initialization') - - initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}') - if not initialized: - raise Exception(f'Module {self.sdk_index} is not present or under initialization') - try: return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control', raise_exception=True, log_func=None) == 1 except: # just in case control file does not exist - raise Exception(f'Module {self.sdk_index} is under initialization') + raise Exception(f'control sysfs for SFP {self.sdk_index} does not exist') + + def get_present_value(self): + """Get present status. + + Returns: + str: content of present sysfs + """ + return utils.read_str_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/present') + + def get_hw_present(self): + """Get hardware present status, only applicable on host management mode + + Returns: + bool: True if module is in the cage + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present') == 1 + + def get_power_on(self): + """Get power on status, only applicable on host management mode + + Returns: + bool: True if the module is powered on + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on') == 1 + + def set_power(self, on): + """Control the power of this module, only applicable on host management mode + + Args: + on (bool): True if on + """ + value = 1 if on else 0 + utils.write_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on', value) + + def get_reset_state(self): + """Get reset state of this module, only applicable on host management mode + + Returns: + bool: True if module is not in reset status + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset') == 1 + + def set_hw_reset(self, value): + """Set the module reset status + + Args: + value (int): 1 for reset, 0 for leaving reset + """ + utils.write_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset', value) + + def get_power_good(self): + """Get power good status of this module, only applicable on host management mode + + Returns: + bool: True if the power is in good status + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good') == 1 + + def set_control_type(self, control_type): + """Set control type for the module + + Args: + control_type (int): 0 for firmware control, currently only 0 is allowed + """ + utils.write_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control', control_type) + + def get_control_type(self): + """Get control type of this module + + Returns: + int: 0 - firmware control, 1 - software control + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control_type') + + def determine_control_type(self): + """Determine control type according to module type + + Returns: + enum: software control or firmware control + """ + api = self.get_xcvr_api() + if not api: + logger.log_error(f'Failed to get api object for SFP {self.sdk_index}, probably module EEPROM is not ready') + return SFP_FW_CONTROL + + if not self.is_supported_for_software_control(api): + return SFP_FW_CONTROL + else: + return SFP_SW_CONTROL + + def is_cmis_api(self, xcvr_api): + """Check if the api type is CMIS + + Args: + xcvr_api (object): xcvr api object + + Returns: + bool: True if the api is of type CMIS + """ + return isinstance(xcvr_api, cmis.CmisApi) + + def is_sff_api(self, xcvr_api): + """Check if the api type is SFF + + Args: + xcvr_api (object): xcvr api object + + Returns: + bool: True if the api is of type SFF + """ + return isinstance(xcvr_api, sff8636.Sff8636Api) or isinstance(xcvr_api, sff8436.Sff8436Api) + + def is_supported_for_software_control(self, xcvr_api): + """Check if the api object supports software control + + Args: + xcvr_api (object): xcvr api object + + Returns: + bool: True if the api object supports software control + """ + return self.is_cmis_api(xcvr_api) or self.is_sff_api(xcvr_api) + + def check_power_capability(self): + """Check module max power with cage power limit + + Returns: + bool: True if max power does not exceed cage power limit + """ + max_power = self.get_module_max_power() + if max_power == POWER_CLASS_INVALID: + return False + + power_limit = self.get_power_limit() + logger.log_info(f'SFP {self.sdk_index}: max_power={max_power}, power_limit={power_limit}') + return max_power <= power_limit + + def get_power_limit(self): + """Get power limit of this module + + Returns: + int: Power limit in unit of 0.25W + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_limit') + + def get_module_max_power(self): + """Get module max power from EEPROM + + Returns: + int: max power in terms of 0.25W. Return POWER_CLASS_INVALID if EEPROM data is incorrect. + """ + xcvr_api = self.get_xcvr_api() + if isinstance(xcvr_api, cmis.CmisApi): + field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.MAX_POWER_FIELD) + powercap_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) + powercap = int.from_bytes(powercap_ba, "big") + return powercap + elif isinstance(xcvr_api, sff8636.Sff8636Api) or isinstance(xcvr_api, sff8436.Sff8436Api): + field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.POWER_CLASS_FIELD) + power_class_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) + power_class_bits = {bit_id: int((power_class_ba[0] >> bit_id) & 0b1) for bit_id in [7, 6, 5, 1, 0]} + if (power_class_bits[7], power_class_bits[6], power_class_bits[1], power_class_bits[0]) == (0, 0, 0, 0): + powercap = POWER_CLASS_1_MAX_POWER + elif (power_class_bits[7], power_class_bits[6], power_class_bits[1], power_class_bits[0]) == (0, 1, 0, 0): + powercap = POWER_CLASS_2_MAX_POWER + elif (power_class_bits[7], power_class_bits[6], power_class_bits[1], power_class_bits[0]) == (1, 0, 0, 0): + powercap = POWER_CLASS_3_MAX_POWER + elif (power_class_bits[7], power_class_bits[6], power_class_bits[1], power_class_bits[0]) == (1, 1, 0, 0): + powercap = POWER_CLASS_4_MAX_POWER + elif (power_class_bits[7], power_class_bits[6], power_class_bits[1], power_class_bits[0]) == (1, 1, 0, 1): + powercap = POWER_CLASS_5_MAX_POWER + elif (power_class_bits[7], power_class_bits[6], power_class_bits[1], power_class_bits[0]) == (1, 1, 1, 0): + powercap = POWER_CLASS_6_MAX_POWER + elif (power_class_bits[7], power_class_bits[6], power_class_bits[1], power_class_bits[0]) == (1, 1, 1, 1): + powercap = POWER_CLASS_7_MAX_POWER + else: + logger.log_error(f'SFP {self.sdk_index} got invalid value for power class field: {power_class_ba}') + return POWER_CLASS_INVALID + + if power_class_bits[5] == 1: + read_power_class_8_byte = xcvr_api.xcvr_eeprom.read_raw(107, 1) + powercap = max(read_power_class_8_byte, powercap) + # Multiplying the sysfs value (0.25 Watt units) by 4 aligns it with the EEPROM max power value (1 Watt units), + # ensuring both are in the same unit for a meaningful comparison + return powercap * 4 # + else: + # Should never hit, just in case + logger.log_error(f'SFP {self.sdk_index} with api type {xcvr_api} does not support getting max power') + return POWER_CLASS_INVALID + + def update_i2c_frequency(self): + """Update I2C frequency for the module. + """ + if self.get_frequency_support(): + api = self.get_xcvr_api() + if self.is_cmis_api(api): + # for CMIS modules, read the module maximum supported clock of Management Comm Interface (MCI) from module EEPROM. + # from byte 2 bits 3-2: + # 00b means module supports up to 400KHz + # 01b means module supports up to 1MHz + logger.log_debug(f"Reading mci max frequency for SFP {self.sdk_index}") + read_mci = api.xcvr_eeprom.read_raw(CMIS_MCI_EEPROM_OFFSET, 1) + logger.log_debug(f"Read mci max frequency {read_mci} for SFP {self.sdk_index}") + frequency = (read_mci & CMIS_MCI_MASK) >> 2 + elif self.is_sff_api(api): + # for SFF modules, frequency is always 400KHz + frequency = 0 + else: + # Should never hit, just in case + logger.log_error(f'SFP {self.sdk_index} with api type {api} does not support updating frequency but frequency_support sysfs return 1') + return + + logger.log_info(f"Read mci max frequency bits {frequency} for SFP {self.sdk_index}") + self.set_frequency(frequency) + + def get_frequency_support(self): + """Get frequency support for this module + + Returns: + bool: True if supported + """ + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/frequency_support') == 1 + + def set_frequency(self, freqeuncy): + """Set module frequency. + + Args: + freqeuncy (int): 0 - up to 400KHz, 1 - up to 1MHz + """ + utils.write_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/frequency', freqeuncy) + + def disable_tx_for_sff_optics(self): + """Disable TX for SFF optics + """ + api = self.get_xcvr_api() + if self.is_sff_api(api) and api.get_tx_disable_support(): + logger.log_info(f'Disabling tx for SFP {self.sdk_index}') + api.tx_disable(True) + + @classmethod + def get_state_machine(cls): + """Get state machine object, create if not exists + + Returns: + object: state machine object + """ + if not cls.sm: + from .statemachine import StateMachine + sm = StateMachine() + sm.add_state(STATE_DOWN).add_transition(EVENT_START, STATE_INIT) + sm.add_state(STATE_INIT).set_entry_action(ACTION_ON_START) \ + .add_transition(EVENT_NOT_PRESENT, STATE_NOT_PRESENT) \ + .add_transition(EVENT_RESET, STATE_RESETTING) \ + .add_transition(EVENT_POWER_ON, STATE_POWERED_ON) + sm.add_state(STATE_RESETTING).set_entry_action(ACTION_ON_RESET) \ + .add_transition(EVENT_RESET_DONE, STATE_POWERED_ON) \ + .add_transition(EVENT_NOT_PRESENT, STATE_NOT_PRESENT) + sm.add_state(STATE_POWERED_ON).set_entry_action(ACTION_ON_POWERED) \ + .add_transition(EVENT_POWER_BAD, STATE_POWER_BAD) \ + .add_transition(EVENT_SW_CONTROL, STATE_SW_CONTROL) \ + .add_transition(EVENT_FW_CONTROL, STATE_FW_CONTROL) + sm.add_state(STATE_SW_CONTROL).set_entry_action(ACTION_ON_SW_CONTROL) \ + .add_transition(EVENT_NOT_PRESENT, STATE_NOT_PRESENT) \ + .add_transition(EVENT_POWER_LIMIT_EXCEED, STATE_POWER_LIMIT_ERROR) \ + .add_transition(EVENT_POWER_BAD, STATE_POWER_BAD) + sm.add_state(STATE_FW_CONTROL).set_entry_action(ACTION_ON_FW_CONTROL) \ + .add_transition(EVENT_NOT_PRESENT, STATE_NOT_PRESENT) + sm.add_state(STATE_POWER_BAD).add_transition(EVENT_POWER_GOOD, STATE_POWERED_ON) \ + .add_transition(EVENT_NOT_PRESENT, STATE_NOT_PRESENT) + sm.add_state(STATE_NOT_PRESENT).set_entry_action(ACTION_ON_NOT_PRESENT) \ + .add_transition(EVENT_PRESENT, STATE_INIT) + sm.add_state(STATE_POWER_LIMIT_ERROR).set_entry_action(ACTION_ON_POWER_LIMIT_ERROR) \ + .add_transition(EVENT_POWER_GOOD, STATE_POWERED_ON) \ + .add_transition(EVENT_NOT_PRESENT, STATE_NOT_PRESENT) + + cls.action_table = {} + cls.action_table[ACTION_ON_START] = cls.action_on_start + cls.action_table[ACTION_ON_RESET] = cls.action_on_reset + cls.action_table[ACTION_ON_POWERED] = cls.action_on_powered + cls.action_table[ACTION_ON_SW_CONTROL] = cls.action_on_sw_control + cls.action_table[ACTION_ON_FW_CONTROL] = cls.action_on_fw_control + cls.action_table[ACTION_ON_NOT_PRESENT] = cls.action_on_not_present + cls.action_table[ACTION_ON_POWER_LIMIT_ERROR] = cls.action_on_power_limit_error + + cls.sm = sm + + return cls.sm + + @classmethod + def action_on_start(cls, sfp): + if not sfp.get_hw_present(): + logger.log_info(f'SFP {sfp.sdk_index} is not present') + sfp.on_event(EVENT_NOT_PRESENT) + return + + if not sfp.get_power_on(): + logger.log_info(f'SFP {sfp.sdk_index} is not powered on') + sfp.set_power(True) + sfp.set_hw_reset(1) + sfp.on_event(EVENT_RESET) + else: + if not sfp.get_reset_state(): + logger.log_info(f'SFP {sfp.sdk_index} is in reset state') + sfp.set_hw_reset(1) + sfp.on_event(EVENT_RESET) + else: + sfp.on_event(EVENT_POWER_ON) + + @classmethod + def action_on_reset(cls, sfp): + logger.log_info(f'SFP {sfp.sdk_index} is scheduled to wait for resetting done') + cls.get_wait_ready_task().schedule_wait(sfp.sdk_index) + + @classmethod + def action_on_powered(cls, sfp): + if not sfp.get_power_good(): + logger.log_info(f'SFP {sfp.sdk_index} is not in power good state') + sfp.on_event(EVENT_POWER_BAD) + return + + control_type = sfp.determine_control_type() + if control_type == SFP_SW_CONTROL: + sfp.on_event(EVENT_SW_CONTROL) + else: + sfp.on_event(EVENT_FW_CONTROL) + + @classmethod + def action_on_sw_control(cls, sfp): + if not sfp.check_power_capability(): + sfp.on_event(EVENT_POWER_LIMIT_EXCEED) + return + + sfp.update_i2c_frequency() + sfp.disable_tx_for_sff_optics() + logger.log_info(f'SFP {sfp.sdk_index} is set to software control') + + @classmethod + def action_on_fw_control(cls, sfp): + logger.log_info(f'SFP {sfp.sdk_index} is set to firmware control') + sfp.set_control_type(SFP_FW_CONTROL) + + @classmethod + def action_on_not_present(cls, sfp): + cls.get_wait_ready_task().cancel_wait(sfp.sdk_index) + + @classmethod + def action_on_power_limit_error(cls, sfp): + logger.log_info(f'SFP {sfp.sdk_index} is powered off due to exceeding power limit') + sfp.set_power(False) + sfp.set_hw_reset(0) + + @classmethod + def get_wait_ready_task(cls): + """Get SFP wait ready task. Create if not exists. + + Returns: + object: an instance of WaitSfpReadyTask + """ + if not cls.wait_ready_task: + from .wait_sfp_ready_task import WaitSfpReadyTask + cls.wait_ready_task = WaitSfpReadyTask() + return cls.wait_ready_task + + def get_state(self): + """Return the current state. + + Returns: + str: current state + """ + return self.state + + def change_state(self, new_state): + """Change from old state to new state + + Args: + new_state (str): new state + """ + self.state = new_state + + def on_action(self, action_name): + """Called when a state machine action is executing + + Args: + action_name (str): action name + """ + SFP.action_table[action_name](self) + + def on_event(self, event): + """Called when a state machine event arrives + + Args: + event (str): State machine event + """ + SFP.get_state_machine().on_event(self, event) + + def in_stable_state(self): + """Indicate whether this module is in a stable state. 'Stable state' means the module is pending on a polling event + from SDK. + + Returns: + bool: True if the module is in a stable state + """ + return self.state in (STATE_NOT_PRESENT, STATE_SW_CONTROL, STATE_FW_CONTROL, STATE_POWER_BAD, STATE_POWER_LIMIT_ERROR) + + def get_fds_for_poling(self): + if self.state == STATE_FW_CONTROL: + return { + 'present': open(f'/sys/module/sx_core/asic0/module{self.sdk_index}/present') + } + elif self.state == STATE_NOT_PRESENT: + return { + 'hw_present': open(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present') + } + else: + return { + 'hw_present': open(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present'), + 'power_good': open(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good') + } + + def get_fd_for_polling_legacy(self): + """Get polling fds for when module host management is disabled + + Returns: + object: file descriptor of present + """ + return open(f'/sys/module/sx_core/asic0/module{self.sdk_index}/present') + + def fill_change_event(self, port_dict): + """Fill change event data based on current state. + + Args: + port_dict (dict): {:} + """ + if self.state == STATE_NOT_PRESENT: + port_dict[self.sdk_index + 1] = SFP_STATUS_REMOVED + elif self.state == STATE_SW_CONTROL: + port_dict[self.sdk_index + 1] = SFP_STATUS_INSERTED + elif self.state == STATE_FW_CONTROL: + port_dict[self.sdk_index + 1] = SFP_STATUS_INSERTED + elif self.state == STATE_POWER_BAD or self.state == STATE_POWER_LIMIT_ERROR: + sfp_state = SFP.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED | SFP.SFP_STATUS_BIT_INSERTED + port_dict[self.sdk_index + 1] = str(sfp_state) + + def refresh_poll_obj(self, poll_obj, all_registered_fds): + """Refresh polling object and registered fds. This function is usually called when a cable plugin + event occurs. For example, user plugs out a software control module and replaces with a firmware + control cable. In such case, poll_obj was polling "hw_present" and "power_good" for software control, + and it needs to be changed to poll "present" for new control type which is firmware control. + + Args: + poll_obj (object): poll object + all_registered_fds (dict): fds that have been registered to poll object + """ + # find fds registered by this SFP + current_registered_fds = {item[2]: (fileno, item[1]) for fileno, item in all_registered_fds if item[0] == self.sdk_index} + logger.log_debug(f'SFP {self.sdk_index} registered fds are: {current_registered_fds}') + if self.state == STATE_FW_CONTROL: + target_poll_types = ['present'] + else: + target_poll_types = ['hw_present', 'power_good'] + + for target_poll_type in target_poll_types: + if target_poll_type not in current_registered_fds: + # need add new fd for polling + logger.log_debug(f'SFP {self.sdk_index} is registering file descriptor: {target_poll_type}') + fd = open(f'/sys/module/sx_core/asic0/module{self.sdk_index}/{target_poll_type}') + poll_obj.register(fd, select.POLLERR | select.POLLPRI) + all_registered_fds[fd.fileno()] = (self.sdk_index, fd, target_poll_type) + else: + # the fd is already in polling + current_registered_fds.pop(target_poll_type) + + for _, item in current_registered_fds: + # Deregister poll, close fd + logger.log_debug(f'SFP {self.sdk_index} is de-registering file descriptor: {item}') + poll_obj.poll_obj.unregister(item[1]) + all_registered_fds.pop(item[0]) + item[1].close() + + def is_dummy_event(self, origin_state, fd_type, fd_value): + """Check whether an event is dummy event + + Args: + origin_state (str): original state before polling + fd_type (str): polling sysfs type + fd_value (int): polling sysfs value + + Returns: + bool: True if the event is a dummy event + """ + if fd_type == 'hw_present' or fd_type == 'present': + if fd_value == int(SFP_STATUS_INSERTED): + return origin_state in (STATE_SW_CONTROL, STATE_FW_CONTROL, STATE_POWER_BAD, STATE_POWER_LIMIT_ERROR) + elif fd_value == int(SFP_STATUS_REMOVED): + return origin_state == STATE_NOT_PRESENT + return False + + @classmethod + def initialize_sfp_modules(cls, sfp_list): + """Initialize all modules. Only applicable when module host management is enabled + + Args: + sfp_list (object): all sfps + """ + wait_ready_task = cls.get_wait_ready_task() + wait_ready_task.start() + + for s in sfp_list: + s.on_event(EVENT_START) + + if not wait_ready_task.empty(): + # Wait until wait_ready_task is up + while not wait_ready_task.is_alive(): + pass + + # Resetting SFP requires a reloading of module firmware, it takes up to 3 seconds + # according to standard + max_wait_time = 3.5 + begin = time.time() + while True: + ready_sfp_set = wait_ready_task.get_ready_set() + for sfp_index in ready_sfp_set: + s = sfp_list[sfp_index] + logger.log_debug(f'SFP {sfp_index} is recovered from resetting state') + s.on_event(EVENT_RESET_DONE) + elapse = time.time() - begin + if elapse < max_wait_time: + time.sleep(0.5) + else: + break + + # Verify that all modules are in a stable state + for index, s in enumerate(sfp_list): + if not s.in_stable_state(): + logger.log_error(f'SFP {index} is not in stable state after initializing, state={s.state}') + logger.log_notice(f'SFP {index} is in state {s.state} after module initialization') class RJ45Port(NvidiaSFPCommon): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/statemachine.py b/platform/mellanox/mlnx-platform-api/sonic_platform/statemachine.py new file mode 100644 index 000000000000..788717454641 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/statemachine.py @@ -0,0 +1,141 @@ +from sonic_py_common.logger import Logger + +logger = Logger() + + +class State: + """Represent a state in a state machine + """ + def __init__(self, name): + self.name = name + self.entry_action = None + self.leave_action = None + self.transitions = {} + + def set_entry_action(self, action_name): + """Set an action when entering this state + + Args: + action_name (str): action name + + Returns: + object: self + """ + self.entry_action = action_name + return self + + def set_leave_action(self, action_name): + """Set a leave action when leaving the state + + Args: + action_name (str): action name + + Returns: + object: self + """ + self.leave_action = action_name + return self + + def add_transition(self, event, next_state): + """Add a transition item to this state + + Args: + event (str): event name + next_state (str): next state that the state entity will transit to upon this event. + + Raises: + RuntimeError: raise if the event is already in the transition table + + Returns: + object: self + """ + if event in self.transitions: + raise RuntimeError(f'event {event} already exists in state {self.name}') + + self.transitions[event] = next_state + return self + + def on_enter(self, entity): + """Called when state entity enters the state + + Args: + entity (obj): state entity + """ + if self.entry_action: + logger.log_debug(f'{entity} entered state [{self.name}] and is triggering action [{self.entry_action}]') + entity.on_action(self.entry_action) + else: + logger.log_debug(f'{entity} entered state [{self.name}]') + + def on_leave(self, entity): + """Called when state entity leaves the state + + Args: + entity (obj): state entity + """ + if self.leave_action: + entity.on_action(self.leave_action) + + def on_event(self, event): + """Called when state entity has got an event + + Args: + event (str): event name + + Returns: + str: next event name + """ + if event not in self.transitions: + logger.log_error(f'{event} is not defined in state {self.name}') + return self.name + else: + return self.transitions[event] + + +class StateMachine: + def __init__(self): + self.states = {} + + def add_state(self, state_name): + """Register a state to state machine + + Args: + state_name (str): name of the state + + Raises: + RuntimeError: raise if state name already exists + + Returns: + object: the new state object + """ + if state_name in self.states: + raise RuntimeError(f'state {state_name} already exists') + + state = State(state_name) + self.states[state_name] = state + return state + + def on_event(self, entity, event): + """Called when an event occurs + + Args: + entity (object): state entity + event (str): event name + + Raises: + RuntimeError: raise if the current state is not registered + RuntimeError: raise if next state is not registered + """ + current_state_name = entity.get_state() + if current_state_name not in self.states: + raise RuntimeError(f'Unknown state {current_state_name}') + + current_state = self.states[current_state_name] + next_state_name = current_state.on_event(event) + logger.log_debug(f'{entity} has got event [{event}], it is changing from state [{current_state}] to [{next_state_name}]') + if next_state_name not in self.states: + raise RuntimeError(f'Unknown next state {next_state_name}') + if next_state_name != current_state_name: + current_state.on_leave(entity) + entity.change_state(next_state_name) + self.states[next_state_name].on_enter(entity) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py index f2f0f75b2fd1..07f6d4b49203 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py @@ -81,10 +81,6 @@ def load_tc_config(self): def start(self): self.clean_thermal_data() - if not self.wait_all_sfp_ready(): - logger.log_error('Failed to wait for all SFP ready, will put hw-management-tc to suspend') - self.control_tc(True) - return self.control_tc(False) self.load_tc_config() self._timer.start() @@ -106,25 +102,6 @@ def clean_thermal_data(self): sfp.sdk_index + 1 ) - def wait_all_sfp_ready(self): - logger.log_notice('Waiting for all SFP modules ready...') - max_wait_time = 300 - ready_set = set() - while len(ready_set) != len(self._sfp_list): - for sfp in self._sfp_list: - try: - sfp.is_sw_control() - ready_set.add(sfp) - except: - continue - max_wait_time -= 1 - if max_wait_time == 0: - return False - time.sleep(1) - - logger.log_notice('All SFP modules are ready') - return True - def get_asic_temp(self): temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None) return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/wait_sfp_ready_task.py b/platform/mellanox/mlnx-platform-api/sonic_platform/wait_sfp_ready_task.py new file mode 100644 index 000000000000..f37f5b375c88 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/wait_sfp_ready_task.py @@ -0,0 +1,109 @@ +import copy +import threading +import time +from sonic_py_common.logger import Logger + +logger = Logger() + + +class WaitSfpReadyTask(threading.Thread): + """When bring a module from powered off to powered on, it takes 3 seconds + for module to load its firmware. This class is designed to perform a wait for + those modules who are loading firmware. + """ + def __init__(self): + # Set daemon to True so that the thread will be destroyed when daemon exits. + super().__init__(daemon=True) + + # Lock to protect the wait list + self.lock = threading.Lock() + + # Event to wake up thread function + self.event = threading.Event() + + # A list of SFP to be waited. Key is SFP index, value is the expire time. + self._wait_dict = {} + + # The queue to store those SFPs who finish loading firmware. + self._ready_set = set() + + def schedule_wait(self, sfp_index): + """Add a SFP to the wait list + + Args: + sfp_index (int): the index of the SFP object + """ + logger.log_debug(f'SFP {sfp_index} is scheduled for waiting reset done') + with self.lock: + if len(self._wait_dict) == 0: + is_empty = True + # The item will be expired in 3 seconds + self._wait_dict[sfp_index] = time.time() + 3 + + if is_empty: + logger.log_debug('An item arrives, wake up WaitSfpReadyTask') + # wake up the thread + self.event.set() + + def cancel_wait(self, sfp_index): + """Cancel a SFP from the wait list + + Args: + sfp_index (int): the index of the SFP object + """ + logger.log_debug(f'SFP {sfp_index} is canceled for waiting reset done') + with self.lock: + if sfp_index in self._wait_dict: + self._wait_dict.pop(sfp_index) + if sfp_index in self._ready_set: + self._ready_set.pop(sfp_index) + + def get_ready_set(self): + """Get ready set and clear it + + Returns: + set: a deep copy of self._ready_set + """ + with self.lock: + ready_set = copy.deepcopy(self._ready_set) + self._ready_set.clear() + return ready_set + + def empty(self): + """Indicate if wait_dict is empty + + Returns: + bool: True if wait_dict is empty + """ + with self.lock: + return len(self._wait_dict) == 0 + + def run(self): + """Thread function + """ + pending_remove_set = set() + is_empty = True + while True: + if is_empty: + logger.log_debug(f'WaitSfpReadyTask is waiting for task...') + # If wait_dict is empty, hold the thread until an item coming + self.event.wait() + self.event.clear() + + now = time.time() + with self.lock: + logger.log_debug(f'Processing wait SFP dict: {self._wait_dict}, now={now}') + for sfp_index, expire_time in self._wait_dict.items(): + # If now time is greater than the expire time, remove + # the item from wait_dict + if now >= expire_time: + pending_remove_set.add(sfp_index) + + for sfp_index in pending_remove_set: + self._wait_dict.pop(sfp_index) + self._ready_set.add(sfp_index) + + is_empty = (len(self._wait_dict) == 0) + + pending_remove_set.clear() + time.sleep(1)