From 4b857575044903448612f15036c2fd43d8863e35 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Thu, 17 Aug 2023 08:15:19 +0000 Subject: [PATCH 01/26] temp commit for independent modules code --- .../sonic_platform/chassis.py | 90 ++- .../sonic_platform/modules_mgmt.py | 537 ++++++++++++++++++ 2 files changed, 601 insertions(+), 26 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index e911c9617e1a..b248e9999413 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -31,7 +31,10 @@ from . import utils from .device_data import DeviceDataManager import re + import queue + import threading import time + from sonic_platform import modules_mgmt except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -123,6 +126,17 @@ def __init__(self): self._RJ45_port_inited = False self._RJ45_port_list = None + self.threads = [] + self.modules_mgmt_thread = threading.Thread() + self.modules_changes_queue = queue.Queue() + self.modules_queue_lock = threading.Lock() + #self.modules_changes_dict = {} + + self.is_independent_modules_system = False + SAI_INDEPENDENT_MODULE_MODE = True + if SAI_INDEPENDENT_MODULE_MODE: + self.is_independent_modules_system = True + logger.log_info("Chassis loaded successfully") def __del__(self): @@ -278,6 +292,12 @@ def initialize_single_sfp(self, index): self.sfp_initialized_count += 1 def initialize_sfp(self): + if not self.modules_mgmt_thread.is_alive(): + # open new SFP change events thread + self.modules_mgmt_thread = modules_mgmt.ModulesMgmtTask(q=self.modules_changes_queue + , l=self.modules_queue_lock) + self.modules_mgmt_thread.start() + self.threads.append(self.modules_mgmt_thread) if not self._sfp_list: sfp_module = self._import_sfp_module() sfp_count = self.get_num_sfps() @@ -332,8 +352,11 @@ def get_sfp(self, index): An object dervied from SfpBase representing the specified sfp """ index = index - 1 - self.initialize_single_sfp(index) - return super(Chassis, self).get_sfp(index) + if utils.is_host(): + self.initialize_single_sfp(index) + return super(Chassis, self).get_sfp(index) + else: + return None def get_port_or_cage_type(self, index): """ @@ -384,36 +407,51 @@ def get_change_event(self, timeout=0): """ self.initialize_sfp() # Initialize SFP event first - if not self.sfp_event: - from .sfp_event import sfp_event - self.sfp_event = sfp_event(self.RJ45_port_list) - self.sfp_event.initialize() - - wait_for_ever = (timeout == 0) - # select timeout should be no more than 1000ms to ensure fast shutdown flow - select_timeout = 1000.0 if timeout >= 1000 else float(timeout) + # if not self.sfp_event: + # from .sfp_event import sfp_event + # self.sfp_event = sfp_event(self.RJ45_port_list) + # self.sfp_event.initialize() + # + # wait_for_ever = (timeout == 0) + # # select timeout should be no more than 1000ms to ensure fast shutdown flow + # select_timeout = 1000.0 if timeout >= 1000 else float(timeout) port_dict = {} error_dict = {} - begin = time.time() + # begin = time.time() while True: - status = self.sfp_event.check_sfp_status(port_dict, error_dict, select_timeout) - if bool(port_dict): - break - - if not wait_for_ever: - elapse = time.time() - begin - if elapse * 1000 > timeout: - break - - if status: + print('get_change_event() acquiring queue lock') + self.modules_queue_lock.acquire() + if self.modules_changes_queue.qsize() > 0: + #with self.modules_changes_queue.mutex: + if True: + try: + print('get_change_event() trying to get changes from queue') + port_dict = self.modules_changes_queue.get(timeout=1) + print ('get_change_event() port_dict: {}'.format(port_dict)) + except queue.Empty: + logger.log_info("failed to get item from modules changes queue") + print("failed to get item from modules changes queue") + print('get_change_event() releasing queue lock') + self.modules_queue_lock.release() + time.sleep(1) + # status = self.sfp_event.check_sfp_status(port_dict, error_dict, select_timeout) + # if bool(port_dict): + # break + # + # if not wait_for_ever: + # elapse = time.time() - begin + # if elapse * 1000 > timeout: + # break + + # if status: if port_dict: self.reinit_sfps(port_dict) - result_dict = {'sfp': port_dict} - if error_dict: + result_dict = {'sfp': port_dict} + # if error_dict: result_dict['sfp_error'] = error_dict - return True, result_dict - else: - return True, {'sfp': {}} + return True, result_dict + # else: + # return True, {'sfp': {}} def reinit_sfps(self, port_dict): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py new file mode 100644 index 000000000000..5d33e1d0faf8 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -0,0 +1,537 @@ +import threading +import time +import queue +import os +import select + +try: + from sonic_py_common.logger import Logger + from sonic_py_common import device_info + from .device_data import DeviceDataManager + from sonic_platform_base.sfp_base import SfpBase + from sonic_platform_base.sonic_xcvr.fields import consts + from . import sfp as sfp_module +except ImportError as e: + raise ImportError (str(e) + "- required module not found") + +# Global logger class instance +logger = Logger() + +STATE_HW_NOT_PRESENT = "Initial state. module is not plugged to cage." +STATE_HW_PRESENT = "Module is plugged to cage" +STATE_MODULE_AVAILABLE = "Module hw present and power is good" +STATE_POWERED = "Module power is already loaded" +STATE_NOT_POWERED = "Module power is not loaded" +STATE_FW_CONTROL = "The module is not CMIS and FW needs to handle" +STATE_SW_CONTROL = "The module is CMIS and SW needs to handle" +STATE_ERROR_HANDLER = "An error occurred - read/write error, power limit or power cap." +STATE_POWER_LIMIT_ERROR = "The cage has not enough power for the plugged module" +STATE_SYSFS_ERROR = "An error occurred while writing/reading SySFS." + +INDEP_PROFILE_FILE = "/{}/independent_mode_support.profile" +SAI_INDEP_MODULE_MODE = "SAI_INDEPENDENT_MODULE_MODE" +SAI_INDEP_MODULE_MODE_DELIMITER = "=" +SAI_INDEP_MODULE_MODE_TRUE_STR = "1" +SYSFS_LEGACY_FD_PREFIX = "/sys/module/sx_netdev/{}/module/" +SYSFS_LEGACY_PRESENCE_FD = "/sys/module/sx_netdev/{}/module/present" +ASIC_NUM = 0 +PORT_BREAKOUT = 8 +SYSFS_INDEPENDENT_FD_PREFIX_WO_MODULE = "/sys/module/sx_core/asic{}".format(ASIC_NUM) +SYSFS_INDEPENDENT_FD_PREFIX = SYSFS_INDEPENDENT_FD_PREFIX_WO_MODULE + "/module{}" +SYSFS_INDEPENDENT_FD_PRESENCE = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "hw_present"]) +SYSFS_INDEPENDENT_FD_POWER_GOOD = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "power_good"]) +SYSFS_INDEPENDENT_FD_POWER_ON = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "power_on"]) +SYSFS_INDEPENDENT_FD_HW_RESET = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "hw_reset"]) +SYSFS_INDEPENDENT_FD_POWER_LIMIT = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "power_limit"]) +SYSFS_INDEPENDENT_FD_FW_CONTROL = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "control"]) +# echo /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz +SYSFS_INDEPENDENT_FD_FREQ = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "frequency"]) +IS_INDEPENDENT_MODULE = 'is_independent_module' +SYSFS_LEGACY_FD_POWER_MODE = '/'.join([SYSFS_LEGACY_FD_PREFIX, "power_mode"]) +SYSFS_LEGACY_FD_POWER_ON = '/'.join([SYSFS_LEGACY_FD_PREFIX, "power_on"]) +SYSFS_LEGACY_FD_HW_RESET = '/'.join([SYSFS_LEGACY_FD_PREFIX, "reset"]) +SYSFS_LEGACY_FD_POWER_LIMIT = '/'.join([SYSFS_LEGACY_FD_PREFIX, "power_mode_policy"]) + +class ModulesMgmtTask(threading.Thread): + RETRY_EEPROM_READING_INTERVAL = 60 + + def __init__(self, namespaces=None, port_mapping=None, main_thread_stop_event=None, sfp_error_event=None, q=None + ,l=None): + threading.Thread.__init__(self) + self.name = "SfpStateUpdateTask" + self.exc = None + self.task_stopping_event = threading.Event() + self.main_thread_stop_event = main_thread_stop_event + self.sfp_error_event = sfp_error_event + #self.port_mapping = copy.deepcopy(port_mapping) + # A set to hold those logical port name who fail to read EEPROM + self.retry_eeprom_set = set() + # To avoid retry EEPROM read too fast, record the last EEPROM read timestamp in this member + self.last_retry_eeprom_time = 0 + # A dict to hold SFP error event, for SFP insert/remove event, it is not necessary to cache them + # because _wrapper_get_presence returns the SFP presence status + self.sfp_error_dict = {} + self.sfp_insert_events = {} + self.sfp_port_dict_initial = {} + self.sfp_port_dict = {} + self.sfp_changes_dict = {} + self.namespaces = namespaces + self.modules_changes_queue = q + self.modules_queue_lock = l + self.is_supported_indep_mods_system = False + self.modules_lock_list = [] + self.waiting_modules_list = [] + self.timer = threading.Thread() + self.timer_queue = queue.Queue() + self.timer_queue_lock = threading.Lock() + self.poll_obj = None + self.fds_mapping_to_obj = {} + + # SFPs state machine + def get_sm_func(self, sm): + SFP_SM_ENUM = {STATE_HW_NOT_PRESENT: self.check_if_hw_present + , STATE_HW_PRESENT: self.checkIfModuleAvailable + , STATE_MODULE_AVAILABLE: self.checkIfPowerOn + , STATE_NOT_POWERED: self.powerOnModule + , STATE_POWERED: self.checkModuleType + , STATE_FW_CONTROL: self.saveModuleControlMode + , STATE_SW_CONTROL: self.saveModuleControlMode + , STATE_ERROR_HANDLER: STATE_ERROR_HANDLER + , STATE_POWER_LIMIT_ERROR: STATE_POWER_LIMIT_ERROR + , STATE_SYSFS_ERROR: STATE_SYSFS_ERROR + } + print ("getting func for state {}".format(sm)) + func = SFP_SM_ENUM[sm] + print ("got func {} for state {}".format(func, sm)) + return SFP_SM_ENUM[sm] + + + def run(self): + # check first if the system supports independent mode and set boolean accordingly + (platform_path, hwsku_dir) = device_info.get_paths_to_platform_and_hwsku_dirs() + #hwsku = device_info.get_hwsku() + independent_file = INDEP_PROFILE_FILE.format(hwsku_dir) + if os.path.isfile(independent_file): + print_and_log("file {} found, checking content for independent mode value".format(independent_file)) + with open(independent_file, "r") as independent_file_fd: + independent_file_content = independent_file_fd.read() + if SAI_INDEP_MODULE_MODE in independent_file_content and \ + SAI_INDEP_MODULE_MODE_DELIMITER in independent_file_content: + independent_file_splitted = independent_file_content.split(SAI_INDEP_MODULE_MODE_DELIMITER) + if (len(independent_file_splitted) > 1): + self.is_supported_indep_mods_system = int(independent_file_splitted[1]) == int(SAI_INDEP_MODULE_MODE_TRUE_STR) + print_and_log("file {} found, system will work in independent mode".format(independent_file)) + print_and_log("value of indep mode var: {} found in file".format(independent_file_splitted[1])) + else: + print_and_log("file {} not found, system stays in legacy mode".format(independent_file)) + + # static init - at first go over all ports and check each one if it's independent module or legacy + self.sfp_changes_dict = {} + # check for each port if the module connected and if it supports independent mode or legacy + num_of_ports = DeviceDataManager.get_sfp_count() + # create the modules sysfs fds poller + self.poll_obj = select.poll() + for port in range(num_of_ports): + #temp_port_dict = {IS_INDEPENDENT_MODULE: False} + # check sysfs per port whether it's independent mode or legacy + temp_module_sm = ModuleStateMachine(port_num=port, initial_state=STATE_HW_NOT_PRESENT + , current_state=STATE_HW_NOT_PRESENT) + module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) + print_and_log("system in indep mode: {} port {}".format(self.is_supported_indep_mods_system, port)) + if self.is_supported_indep_mods_system and os.path.isfile(module_fd_indep_path): + print_and_log("system in indep mode: {} port {} reading file {}".format(self.is_supported_indep_mods_system, port, module_fd_indep_path)) + temp_module_sm.set_is_indep_modules(True) + temp_module_sm.set_module_fd_path(module_fd_indep_path) + module_fd = open(module_fd_indep_path, "r") + temp_module_sm.set_module_fd(module_fd) + else: + module_fd_legacy_path = self.get_sysfs_legacy_ethernet_port_fd(SYSFS_LEGACY_PRESENCE_FD, port) + temp_module_sm.set_module_fd_path(module_fd_legacy_path) + module_fd = open(module_fd_legacy_path, "r") + temp_module_sm.set_module_fd(module_fd) + # add lock to use with timer task updating next state per module object + self.modules_lock_list.append(threading.Lock()) + # register the module's sysfs fd to poller with ERR and PRI attrs + + self.poll_obj.register(module_fd, select.POLLERR | select.POLLPRI) + self.fds_mapping_to_obj[module_fd] = temp_module_sm + temp_module_sm.set_poll_obj(self.poll_obj) + # start SM for this independent module + print_and_log("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) + self.sfp_port_dict_initial[port] = temp_module_sm + self.sfp_port_dict[port] = temp_module_sm + + print ("sfp_port_dict: {}".format(self.sfp_port_dict)) + # loop on listening to changes, gather and put them into shared queue, then continue looping + i = 0 + # need at least 1 module in final state until it makes sense to poll for changes + is_final_state_module = False + while not self.task_stopping_event or not self.main_thread_stop_event: + print_and_log("running iteration {}".format(i)) + for port_num, module_sm_obj in self.sfp_port_dict.items(): + curr_state = module_sm_obj.get_current_state() + func = self.get_sm_func(curr_state) + print_and_log("got returned func {} for state {}".format(func, curr_state)) + next_state = func(port_num, module_sm_obj) + if self.timer.is_alive(): + print_and_log("timer threads is alive, acquiring lock") + self.modules_lock_list[port_num].acquire() + if curr_state != STATE_NOT_POWERED or not module_sm_obj.wait_for_power_on: + module_sm_obj.set_next_state(next_state) + module_sm_obj.advance_state() + if module_sm_obj.get_final_state(): + is_final_state_module = True + if self.timer.is_alive(): + self.modules_lock_list[port_num].release() + is_timer_alive = self.timer.is_alive() + print_and_log("timer thread is_alive {} port {}".format(is_timer_alive, port_num)) + if STATE_NOT_POWERED == curr_state: + if not is_timer_alive: + print_and_log ("curr_state is {} and timer thread is_alive {}, running timer task thread" + .format(curr_state, is_timer_alive)) + # call timer task + self.timer = threading.Timer(1.0, self.timerTask) + self.timer.start() + self.timer_queue.put(module_sm_obj) + if self.timer.is_alive(): + print_and_log ("timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) + self.modules_lock_list[port_num].acquire() + module_sm_obj.set_next_state(next_state) + if self.timer.is_alive(): + print_and_log ("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) + self.modules_lock_list[port_num].release() + for port, module_obj in self.sfp_port_dict_initial.items(): + final_state = module_obj.get_final_state() + if port in self.sfp_port_dict.keys() and final_state: + del self.sfp_port_dict[port] + self.sfp_changes_dict[str(module_obj.port_num)] = '0' if final_state in [STATE_HW_NOT_PRESENT, STATE_ERROR_HANDLER] else '1' + if is_final_state_module: + # poll for changes with 1 second timeout + fds_events = self.poll_obj.poll(1000) + for fd, event in fds_events: + # get modules object from fd according to saved key-value of fd-module obj saved earlier + module_obj = self.fds_mapping_to_obj[fd] + # put again module obj in sfp_port_dict so next loop will work on it + self.sfp_port_dict[module_obj.port_num] = module_obj + # put port number in changes dict to pass back to xcvrd's calling SfpStateUpdateTask thread + #self.sfp_changes_dict[module_obj.port_num] = module_obj + if self.sfp_changes_dict: + print_and_log("putting sfp_changes_dict {} in modules changes queue...".format(self.sfp_changes_dict)) + #with self.modules_changes_queue.mutex: + if True: + try: + self.modules_queue_lock.acquire() + self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) + self.modules_queue_lock.release() + self.sfp_changes_dict = {} + except queue.Full: + print_and_log("failed to put item from modules changes queue, queue is full") + else: + print_and_log("sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) + #time.sleep(3) + i += 1 + if 10 == i: + self.task_stopping_event.set() + print_and_log("sfp_port_dict: {}".format(self.sfp_port_dict)) + for port_num, module_sm_obj in self.sfp_port_dict.items(): + print_and_log("port_num: {} module_sm_obj initial state: {} current_state: {} next_state: {}" + .format(port_num, module_sm_obj.initial_state, module_sm_obj.get_current_state(), module_sm_obj.get_next_state())) + + #while not self.task_stopping_event: + + + def check_if_hw_present(self, port, module_sm_obj): + #module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) + #if os.path.isfile(module_fd_indep_path): + module_fd = module_sm_obj.module_fd + if module_fd: + try: + val = module_fd.read() + #val_int = int(val) + val_int = val + print_and_log("read val {} with type {} from module_fd {} int(val): {}".format(val, type(val), module_fd, val_int)) + val_int = int(val) + if 0 == val_int: + print_and_log("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val)) + module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) + return STATE_HW_NOT_PRESENT + elif 1 == val_int: + print_and_log("returning {} for val {}".format(STATE_HW_PRESENT, val)) + return STATE_HW_PRESENT + except Exception as e: + print_and_log("exception {} for port {}".format(e, port)) + return STATE_ERROR_HANDLER + return STATE_HW_NOT_PRESENT + + def checkIfModuleAvailable(self, port, module_sm_obj): + print_and_log("enter check_if_module_available port {} module_sm_obj {}".format(port, module_sm_obj)) + module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_GOOD.format(port) + #module_fd_indep_path = SYSFS_LEGACY_FD_POWER_MODE.format("Ethernet{}".format(port*PORT_BREAKOUT)) + if os.path.isfile(module_fd_indep_path): + try: + #with open(module_fd_indep_path, "r") as module_fd: + module_fd = open(module_fd_indep_path, "r") + if module_fd: + val = module_fd.read() + val_int = int(val) + if 0 == val_int: + return STATE_HW_NOT_PRESENT + elif 1 == val_int: + #elif 2 == val_int: + self.poll_obj.register(module_fd, select.POLLERR | select.POLLPRI) + self.fds_mapping_to_obj[module_fd] = module_sm_obj + return STATE_MODULE_AVAILABLE + except Exception as e: + print_and_log("exception {} for port {}".format(e, port)) + return STATE_HW_NOT_PRESENT + return STATE_HW_NOT_PRESENT + + def checkIfPowerOn(self, port, module_sm_obj): + #module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) + module_fd_indep_path = SYSFS_LEGACY_FD_POWER_ON.format("Ethernet{}".format(port * PORT_BREAKOUT)) + if os.path.isfile(module_fd_indep_path): + try: + with open(module_fd_indep_path, "r") as module_fd: + val = module_fd.read() + val_int = int(val) + if 0 == val_int: + return STATE_NOT_POWERED + elif 1 == val_int: + return STATE_POWERED + except Exception as e: + return STATE_HW_NOT_PRESENT + + def powerOnModule(self, port, module_sm_obj): + #if module_sm_obj not in self.waiting_modules_list: + if not module_sm_obj.wait_for_power_on: + module_fd_indep_path_po = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) + #module_fd_indep_path_po = SYSFS_LEGACY_FD_POWER_ON.format("Ethernet{}".format(port * PORT_BREAKOUT)) + module_fd_indep_path_r = SYSFS_INDEPENDENT_FD_HW_RESET.format(port) + #module_fd_indep_path_r = SYSFS_LEGACY_FD_HW_RESET.format("Ethernet{}".format(port * PORT_BREAKOUT)) + try: + if os.path.isfile(module_fd_indep_path_po): + print_and_log("powerOnModule powering on via {} for port {}".format(module_fd_indep_path_po, port)) + # echo 1 > /sys/module/sx_core/$asic/$module/power_on + with open(module_fd_indep_path_po, "w") as module_fd: + module_fd.write("1") + if os.path.isfile(module_fd_indep_path_r): + print_and_log("powerOnModule resetting via {} for port {}".format(module_fd_indep_path_r, port)) + # echo 0 > /sys/module/sx_core/$asic/$module/hw_reset + with open(module_fd_indep_path_r, "w") as module_fd: + module_fd.write("0") + # with open(module_fd_indep_path, "r") as module_fd: + # val = module_fd.read() + # if 0 == val: + # return STATE_NOT_POWERED + # elif 1 == val: + # return STATE_POWERED + module_sm_obj.reset_start_time = time.time() + module_sm_obj.wait_for_power_on = True + self.waiting_modules_list.append(module_sm_obj) + + except Exception as e: + print_and_log("exception in powerOnModule {} for port {}".format(e, port)) + return STATE_HW_NOT_PRESENT + return STATE_NOT_POWERED + + def checkModuleType(self, port, module_sm_obj): + print_and_log("enter checkModuleType port {} module_sm_obj {}".format(port, module_sm_obj)) + #sfp_base_module = SfpBase() + sfp = sfp_module.SFP(port) + xcvr_api = sfp.get_xcvr_api() + #if not hasattr(xcvr_api, "xcvr_eeprom"): + if not xcvr_api: + print_and_log("checkModuleType calling sfp reinit for port {} module_sm_obj {}".format(port, module_sm_obj)) + sfp.reinit() + print_and_log("checkModuleType setting as FW control as xcvr_api is empty for port {} module_sm_obj {}".format(port, module_sm_obj)) + return STATE_FW_CONTROL + field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.ID_FIELD) + module_type_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) + #module_type = xcvr_api.xcvr_eeprom.read_raw(consts.ID_FIELD) + module_type = int.from_bytes(module_type_ba, "big") + print_and_log("got module_type {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + if not 24 == module_type: + print_and_log("setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + module_sm_obj.set_final_state = STATE_FW_CONTROL + power_cap = self.checkPowerCapNonCMIS(port, module_sm_obj) + return STATE_FW_CONTROL + else: + print_and_log("checking power cap for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + power_cap = self.checkPowerCap(port, module_sm_obj) + if power_cap is STATE_POWER_LIMIT_ERROR: + return STATE_POWER_LIMIT_ERROR + else: + # read the module maximum supported clock of Management Comm Interface (MCI) from module EEPROM. + read_mci = "mci" + # Then, set it to frequency Sysfs using: + # echo /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz + #indep_fd_freq = SYSFS_INDEPENDENT_FD_FREQ.format(port) + #with open(indep_fd_freq, "r") as freq_fd: + # freq_fd.write(read_mci) + return STATE_SW_CONTROL + + def checkPowerCap(self, port, module_sm_obj): + print_and_log("enter checkPowerCap port {} module_sm_obj {}".format(port, module_sm_obj)) + #sfp_base_module = SfpBase() + sfp = sfp_module.SFP(port) + xcvr_api = sfp.get_xcvr_api() + field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.MAX_POWER_FIELD) + powercap_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) + print_and_log("checkPowerCap got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) + powercap = int.from_bytes(powercap_ba, "big") + print_and_log("checkPowerCap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) + indep_fd_power_limit = self.get_sysfs_ethernet_port_fd(SYSFS_INDEPENDENT_FD_POWER_LIMIT, port) + #indep_fd_power_limit = self.get_sysfs_ethernet_port_legacy_fd(SYSFS_LEGACY_FD_POWER_LIMIT, port) + with open(indep_fd_power_limit, "r") as power_limit_fd: + cage_power_limit = power_limit_fd.read() + print_and_log("checkPowerCap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) + if powercap > int(cage_power_limit): + print_and_log("checkPowerCap powercap {} != cage_power_limit {} for port {} module_sm_obj {}".format(powercap, cage_power_limit, port, module_sm_obj)) + module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) + return STATE_POWER_LIMIT_ERROR + + def checkPowerCapNonCMIS(self, port, module_sm_obj): + print_and_log("enter checkPowerCapNonCMIS port {} module_sm_obj {}".format(port, module_sm_obj)) + sfp = sfp_module.SFP(port) + xcvr_api = sfp.get_xcvr_api() + serial_id = xcvr_api.xcvr_eeprom.read(consts.SERIAL_ID_FIELD) + if serial_id is None: + return None + + ext_id = serial_id[consts.EXT_ID_FIELD] + power_class = ext_id[consts.POWER_CLASS_FIELD] + clei_code = ext_id[consts.CLEI_CODE_FIELD] + cdr_tx = ext_id[consts.CDR_TX_FIELD] + cdr_rx = ext_id[consts.CDR_RX_FIELD] + print_and_log("checkPowerCapNonCMIS got powercap {} for port {} module_sm_obj {} clei {} cdr_tx {} cdr_rx {}" + .format(power_class, port, module_sm_obj, clei_code, cdr_tx, cdr_rx)) + field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.EXT_ID_FIELD) + powercap_ba = xcvr_api.xcvr_eeprom.read_raw(field.get_offset(), field.get_size()) + print_and_log("checkPowerCapNonCMIS got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) + powercap = int.from_bytes(powercap_ba, "big") if type(powercap_ba) is bytearray else powercap_ba + print_and_log("checkPowerCapNonCMIS got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) + + + def saveModuleControlMode(self, port, module_sm_obj): + print_and_log("saveModuleControlMode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) + # bug - need to find root cause and fix + #module_sm_obj.set_final_state(module_sm_obj.get_current_state()) + state = module_sm_obj.get_current_state() + module_sm_obj.final_state = state + if state == STATE_FW_CONTROL: + #"echo 0 > /sys/module/sx_core/$asic/$module/control" + indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) + with open(indep_fd_fw_control, "w") as fw_control_fd: + fw_control_fd.write("0") + print_and_log("saveModuleControlMode set FW control for state {} port {}".format(state, port)) + print_and_log("saveModuleControlMode set current state {} for port {} as final state {}".format( + module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) + + def STATE_ERROR_HANDLER(self): + pass + + def STATE_POWER_LIMIT_ERROR(self): + pass + + def STATE_SYSFS_ERROR(self): + pass + + def timerTask(self): # wakes up every 1 second + empty = False + while not empty: + empty = True + for module in self.waiting_modules_list: + print_and_log("timerTask working on module {}".format(module)) + empty = False + state = module.get_current_state() + if module and state == STATE_NOT_POWERED: + print_and_log("timerTask module {} current_state {} counting seconds sinc reset_start_time".format(module, module.get_current_state())) + if time.time() - module.reset_start_time >= 3: + # set next state as STATE_POWERED state to trigger the function of check module type + print_and_log("timerTask module port {} locking lock of port {}".format(module.port_num, module.port_num)) + self.modules_lock_list[module.port_num].acquire() + print_and_log("timerTask module port {} setting next state to STATE_POWERED".format(module.port_num)) + module.set_next_state = STATE_POWERED + print_and_log("timerTask module port {} advancing next state".format(module.port_num)) + module.advance_state() + print_and_log("timerTask module {} releasing lock of port {}".format(module, module.port_num)) + self.modules_lock_list[module.port_num].release() + print_and_log("timerTask module port {} removing module from waiting_modules_list".format(module.port_num)) + self.waiting_modules_list.remove(module) + time.sleep(1) + def get_sysfs_legacy_ethernet_port_fd(self, sysfs_fd, port): + breakout_port = "Ethernet{}".format(port * PORT_BREAKOUT) + sysfs_eth_port_fd = sysfs_fd.format(breakout_port) + return sysfs_eth_port_fd + + #def get_sysfs_ethernet_port_fd(self, sysfs_fd, port): + # sysfs_eth_port_fd = '/'.join([sysfs_fd, "Ethernet{}".format(port * PORT_BREAKOUT)]) + # return sysfs_eth_port_fd + + def get_sysfs_ethernet_port_fd(self, sysfs_fd, port): + sysfs_eth_port_fd = sysfs_fd.format(port) + return sysfs_eth_port_fd + + +class ModuleStateMachine(object): + + def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state=STATE_HW_NOT_PRESENT + , next_state=STATE_HW_NOT_PRESENT, final_state='', is_indep_module=False + , module_fd_path='', module_fd=None, poll_obj=None, reset_start_time=None): + self.port_num = port_num + self.initial_state = initial_state + self.current_state = current_state + self.next_state = next_state + self.final_state = final_state + self.is_indep_modules = is_indep_module + self.module_fd_path = module_fd_path + self.module_fd = module_fd + self.poll_obj = poll_obj + self.reset_start_time = reset_start_time + self.wait_for_power_on = False + + def set_initial_state(self, state): + self.initial_state = state + + def get_current_state(self): + return self.current_state + + def set_current_state(self, state): + self.current_state = state + + def get_next_state(self): + return self.next_state + + def set_next_state(self, state): + self.next_state = state + + def get_final_state(self): + return self.final_state + + def set_final_state(self, state): + self.final_state = state + + + + def advance_state(self): + self.set_current_state(self.next_state) + self.next_state = '' + + def set_is_indep_modules(self, is_indep_modules): + self.is_indep_modules = is_indep_modules + + def set_module_fd_path(self, module_fd_path): + self.module_fd_path = module_fd_path + + def set_module_fd(self, module_fd): + self.module_fd = module_fd + + def get_poll_obj(self): + return self.poll_obj + + def set_poll_obj(self, poll_obj): + self.poll_obj = poll_obj + +def print_and_log(msg): + logger.log_info(msg) + print(msg) From 90aa94e463a475a8e94f8f1f00f2b19f24362782 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Wed, 20 Sep 2023 19:06:03 +0000 Subject: [PATCH 02/26] update modules mgmt to have new state db table with final decision per port if it is FW or SW control --- .../sonic_platform/chassis.py | 19 ++++---- .../sonic_platform/modules_mgmt.py | 45 ++++++++++++++----- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index b248e9999413..27c7a74ace92 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -292,12 +292,6 @@ def initialize_single_sfp(self, index): self.sfp_initialized_count += 1 def initialize_sfp(self): - if not self.modules_mgmt_thread.is_alive(): - # open new SFP change events thread - self.modules_mgmt_thread = modules_mgmt.ModulesMgmtTask(q=self.modules_changes_queue - , l=self.modules_queue_lock) - self.modules_mgmt_thread.start() - self.threads.append(self.modules_mgmt_thread) if not self._sfp_list: sfp_module = self._import_sfp_module() sfp_count = self.get_num_sfps() @@ -352,11 +346,8 @@ def get_sfp(self, index): An object dervied from SfpBase representing the specified sfp """ index = index - 1 - if utils.is_host(): - self.initialize_single_sfp(index) - return super(Chassis, self).get_sfp(index) - else: - return None + self.initialize_single_sfp(index) + return super(Chassis, self).get_sfp(index) def get_port_or_cage_type(self, index): """ @@ -405,6 +396,12 @@ def get_change_event(self, timeout=0): indicates that fan 0 has been removed, fan 2 has been inserted and sfp 11 has been removed. """ + if not self.modules_mgmt_thread.is_alive(): + # open new SFP change events thread + self.modules_mgmt_thread = modules_mgmt.ModulesMgmtTask(q=self.modules_changes_queue + , l=self.modules_queue_lock) + self.modules_mgmt_thread.start() + self.threads.append(self.modules_mgmt_thread) self.initialize_sfp() # Initialize SFP event first # if not self.sfp_event: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 5d33e1d0faf8..15c05df91111 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -6,11 +6,12 @@ try: from sonic_py_common.logger import Logger - from sonic_py_common import device_info + from sonic_py_common import device_info, multi_asic from .device_data import DeviceDataManager from sonic_platform_base.sfp_base import SfpBase from sonic_platform_base.sonic_xcvr.fields import consts from . import sfp as sfp_module + from swsscommon.swsscommon import SonicV2Connector except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -88,7 +89,7 @@ def __init__(self, namespaces=None, port_mapping=None, main_thread_stop_event=No self.fds_mapping_to_obj = {} # SFPs state machine - def get_sm_func(self, sm): + def get_sm_func(self, sm, port): SFP_SM_ENUM = {STATE_HW_NOT_PRESENT: self.check_if_hw_present , STATE_HW_PRESENT: self.checkIfModuleAvailable , STATE_MODULE_AVAILABLE: self.checkIfPowerOn @@ -100,11 +101,14 @@ def get_sm_func(self, sm): , STATE_POWER_LIMIT_ERROR: STATE_POWER_LIMIT_ERROR , STATE_SYSFS_ERROR: STATE_SYSFS_ERROR } - print ("getting func for state {}".format(sm)) - func = SFP_SM_ENUM[sm] - print ("got func {} for state {}".format(func, sm)) - return SFP_SM_ENUM[sm] - + print_and_log("getting func for state {} for port {}".format(sm, port)) + try: + func = SFP_SM_ENUM[sm] + print_and_log("got func {} for state {} for port {}".format(func, sm, port)) + return func + except KeyError: + print_and_log("exception {} for port {}".format(e, port)) + return None def run(self): # check first if the system supports independent mode and set boolean accordingly @@ -161,7 +165,7 @@ def run(self): self.sfp_port_dict_initial[port] = temp_module_sm self.sfp_port_dict[port] = temp_module_sm - print ("sfp_port_dict: {}".format(self.sfp_port_dict)) + print_and_log("sfp_port_dict: {}".format(self.sfp_port_dict)) # loop on listening to changes, gather and put them into shared queue, then continue looping i = 0 # need at least 1 module in final state until it makes sense to poll for changes @@ -170,7 +174,7 @@ def run(self): print_and_log("running iteration {}".format(i)) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() - func = self.get_sm_func(curr_state) + func = self.get_sm_func(curr_state, port) print_and_log("got returned func {} for state {}".format(func, curr_state)) next_state = func(port_num, module_sm_obj) if self.timer.is_alive(): @@ -194,17 +198,36 @@ def run(self): self.timer.start() self.timer_queue.put(module_sm_obj) if self.timer.is_alive(): - print_and_log ("timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) + print_and_log("timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].acquire() module_sm_obj.set_next_state(next_state) if self.timer.is_alive(): - print_and_log ("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) + print_and_log("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].release() + state_db = None for port, module_obj in self.sfp_port_dict_initial.items(): final_state = module_obj.get_final_state() if port in self.sfp_port_dict.keys() and final_state: del self.sfp_port_dict[port] self.sfp_changes_dict[str(module_obj.port_num)] = '0' if final_state in [STATE_HW_NOT_PRESENT, STATE_ERROR_HANDLER] else '1' + if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: + namespaces = multi_asic.get_front_end_namespaces() + for namespace in namespaces: + print_and_log("getting state_db for port {} namespace {}".format(port, namespace)) + state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) + print_and_log("got state_db for port {} namespace {}".format(port, namespace)) + if state_db is not None: + print_and_log("connecting to state_db for port {} namespace {}".format(port, namespace)) + state_db.connect(state_db.STATE_DB) + if final_state in [STATE_FW_CONTROL]: + control_type = 'FW_CONTROL' + elif final_state in [STATE_SW_CONTROL]: + control_type = 'SW_CONTROL' + table_name = 'TRANSCEIVER_MODULES_MGMT|{}'.format(port) + print_and_log("setting state_db table {} for port {} namespace {} control_type {}".format(table_name, port, namespace, control_type)) + state_db.set(state_db.STATE_DB, table_name, + "control type", control_type) + if is_final_state_module: # poll for changes with 1 second timeout fds_events = self.poll_obj.poll(1000) From c19187de90ce0139be788ccc2e0276627c2e2be7 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Tue, 26 Sep 2023 22:32:47 +0000 Subject: [PATCH 03/26] Fix issues and separate to static module detection and dynamic module detection --- .../sonic_platform/modules_mgmt.py | 71 +++++++++---------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 15c05df91111..801d3c46d3b5 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -11,6 +11,7 @@ from sonic_platform_base.sfp_base import SfpBase from sonic_platform_base.sonic_xcvr.fields import consts from . import sfp as sfp_module + from . import utils from swsscommon.swsscommon import SonicV2Connector except ImportError as e: raise ImportError (str(e) + "- required module not found") @@ -33,8 +34,7 @@ SAI_INDEP_MODULE_MODE = "SAI_INDEPENDENT_MODULE_MODE" SAI_INDEP_MODULE_MODE_DELIMITER = "=" SAI_INDEP_MODULE_MODE_TRUE_STR = "1" -SYSFS_LEGACY_FD_PREFIX = "/sys/module/sx_netdev/{}/module/" -SYSFS_LEGACY_PRESENCE_FD = "/sys/module/sx_netdev/{}/module/present" +SYSFS_LEGACY_PRESENCE_FD = "/sys/module/sx_core/asic0/module{}/present" ASIC_NUM = 0 PORT_BREAKOUT = 8 SYSFS_INDEPENDENT_FD_PREFIX_WO_MODULE = "/sys/module/sx_core/asic{}".format(ASIC_NUM) @@ -48,10 +48,6 @@ # echo /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz SYSFS_INDEPENDENT_FD_FREQ = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "frequency"]) IS_INDEPENDENT_MODULE = 'is_independent_module' -SYSFS_LEGACY_FD_POWER_MODE = '/'.join([SYSFS_LEGACY_FD_PREFIX, "power_mode"]) -SYSFS_LEGACY_FD_POWER_ON = '/'.join([SYSFS_LEGACY_FD_PREFIX, "power_on"]) -SYSFS_LEGACY_FD_HW_RESET = '/'.join([SYSFS_LEGACY_FD_PREFIX, "reset"]) -SYSFS_LEGACY_FD_POWER_LIMIT = '/'.join([SYSFS_LEGACY_FD_PREFIX, "power_mode_policy"]) class ModulesMgmtTask(threading.Thread): RETRY_EEPROM_READING_INTERVAL = 60 @@ -158,7 +154,7 @@ def run(self): # register the module's sysfs fd to poller with ERR and PRI attrs self.poll_obj.register(module_fd, select.POLLERR | select.POLLPRI) - self.fds_mapping_to_obj[module_fd] = temp_module_sm + self.fds_mapping_to_obj[module_fd.fileno()] = temp_module_sm temp_module_sm.set_poll_obj(self.poll_obj) # start SM for this independent module print_and_log("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) @@ -174,9 +170,11 @@ def run(self): print_and_log("running iteration {}".format(i)) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() + print_and_log(f'STATE_LOG {port_num}: curr_state is {curr_state}') func = self.get_sm_func(curr_state, port) print_and_log("got returned func {} for state {}".format(func, curr_state)) next_state = func(port_num, module_sm_obj) + print_and_log(f'STATE_LOG {port_num}: next_state is {next_state}') if self.timer.is_alive(): print_and_log("timer threads is alive, acquiring lock") self.modules_lock_list[port_num].acquire() @@ -184,6 +182,7 @@ def run(self): module_sm_obj.set_next_state(next_state) module_sm_obj.advance_state() if module_sm_obj.get_final_state(): + print_and_log(f'STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') is_final_state_module = True if self.timer.is_alive(): self.modules_lock_list[port_num].release() @@ -254,6 +253,7 @@ def run(self): #time.sleep(3) i += 1 if 10 == i: + print_and_log('stopping the loop for no reason?') self.task_stopping_event.set() print_and_log("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): @@ -264,22 +264,16 @@ def run(self): def check_if_hw_present(self, port, module_sm_obj): - #module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) - #if os.path.isfile(module_fd_indep_path): - module_fd = module_sm_obj.module_fd - if module_fd: + module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) + if os.path.isfile(module_fd_indep_path): try: - val = module_fd.read() - #val_int = int(val) - val_int = val - print_and_log("read val {} with type {} from module_fd {} int(val): {}".format(val, type(val), module_fd, val_int)) - val_int = int(val) + val_int = utils.read_int_from_file(module_fd_indep_path) if 0 == val_int: - print_and_log("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val)) + print_and_log("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val_int)) module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT elif 1 == val_int: - print_and_log("returning {} for val {}".format(STATE_HW_PRESENT, val)) + print_and_log("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) return STATE_HW_PRESENT except Exception as e: print_and_log("exception {} for port {}".format(e, port)) @@ -289,48 +283,54 @@ def check_if_hw_present(self, port, module_sm_obj): def checkIfModuleAvailable(self, port, module_sm_obj): print_and_log("enter check_if_module_available port {} module_sm_obj {}".format(port, module_sm_obj)) module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_GOOD.format(port) - #module_fd_indep_path = SYSFS_LEGACY_FD_POWER_MODE.format("Ethernet{}".format(port*PORT_BREAKOUT)) if os.path.isfile(module_fd_indep_path): try: - #with open(module_fd_indep_path, "r") as module_fd: - module_fd = open(module_fd_indep_path, "r") - if module_fd: - val = module_fd.read() - val_int = int(val) - if 0 == val_int: - return STATE_HW_NOT_PRESENT - elif 1 == val_int: - #elif 2 == val_int: - self.poll_obj.register(module_fd, select.POLLERR | select.POLLPRI) - self.fds_mapping_to_obj[module_fd] = module_sm_obj - return STATE_MODULE_AVAILABLE + val_int = utils.read_int_from_file(module_fd_indep_path) + if 0 == val_int: + print_and_log(f'port {port} power is not good') + return STATE_HW_NOT_PRESENT + elif 1 == val_int: + print_and_log(f'port {port} power is good') + #elif 2 == val_int: + self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) + self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = module_sm_obj + return STATE_MODULE_AVAILABLE except Exception as e: print_and_log("exception {} for port {}".format(e, port)) return STATE_HW_NOT_PRESENT + print_and_log(f'port {port} has no power good file {module_fd_indep_path}') return STATE_HW_NOT_PRESENT def checkIfPowerOn(self, port, module_sm_obj): - #module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) - module_fd_indep_path = SYSFS_LEGACY_FD_POWER_ON.format("Ethernet{}".format(port * PORT_BREAKOUT)) + print_and_log(f'enter checkIfPowerOn for port {port}') + module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) if os.path.isfile(module_fd_indep_path): try: with open(module_fd_indep_path, "r") as module_fd: val = module_fd.read() val_int = int(val) if 0 == val_int: + print_and_log(f'port {port} is not powered') return STATE_NOT_POWERED elif 1 == val_int: + if not module_sm_obj.wait_for_power_on and utils.read_int_from_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port)) == 1: + print_and_log(f'port {port} is powered, but need reset') + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) + module_sm_obj.reset_start_time = time.time() + module_sm_obj.wait_for_power_on = True + self.waiting_modules_list.append(module_sm_obj) + return STATE_NOT_POWERED + print_and_log(f'port {port} is powered, does not need reset') return STATE_POWERED except Exception as e: + print_and_log(f'got exception {e} in checkIfPowerOn') return STATE_HW_NOT_PRESENT def powerOnModule(self, port, module_sm_obj): #if module_sm_obj not in self.waiting_modules_list: if not module_sm_obj.wait_for_power_on: module_fd_indep_path_po = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) - #module_fd_indep_path_po = SYSFS_LEGACY_FD_POWER_ON.format("Ethernet{}".format(port * PORT_BREAKOUT)) module_fd_indep_path_r = SYSFS_INDEPENDENT_FD_HW_RESET.format(port) - #module_fd_indep_path_r = SYSFS_LEGACY_FD_HW_RESET.format("Ethernet{}".format(port * PORT_BREAKOUT)) try: if os.path.isfile(module_fd_indep_path_po): print_and_log("powerOnModule powering on via {} for port {}".format(module_fd_indep_path_po, port)) @@ -404,7 +404,6 @@ def checkPowerCap(self, port, module_sm_obj): powercap = int.from_bytes(powercap_ba, "big") print_and_log("checkPowerCap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) indep_fd_power_limit = self.get_sysfs_ethernet_port_fd(SYSFS_INDEPENDENT_FD_POWER_LIMIT, port) - #indep_fd_power_limit = self.get_sysfs_ethernet_port_legacy_fd(SYSFS_LEGACY_FD_POWER_LIMIT, port) with open(indep_fd_power_limit, "r") as power_limit_fd: cage_power_limit = power_limit_fd.read() print_and_log("checkPowerCap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) From b71e69efbe3680eaad6f2cbbc24e12188a30d286 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Sun, 1 Oct 2023 08:25:55 +0000 Subject: [PATCH 04/26] fix issues and change waiting list to Set --- .../sonic_platform/modules_mgmt.py | 354 +++++++++++++----- 1 file changed, 264 insertions(+), 90 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 801d3c46d3b5..339bf25eabb8 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -3,6 +3,7 @@ import queue import os import select +import traceback try: from sonic_py_common.logger import Logger @@ -49,6 +50,9 @@ SYSFS_INDEPENDENT_FD_FREQ = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "frequency"]) IS_INDEPENDENT_MODULE = 'is_independent_module' +STATE_DB_TABLE_NAME_PREFIX = 'TRANSCEIVER_MODULES_MGMT|{}' + + class ModulesMgmtTask(threading.Thread): RETRY_EEPROM_READING_INTERVAL = 60 @@ -72,12 +76,13 @@ def __init__(self, namespaces=None, port_mapping=None, main_thread_stop_event=No self.sfp_port_dict_initial = {} self.sfp_port_dict = {} self.sfp_changes_dict = {} + self.sfp_delete_list_from_port_dict = [] self.namespaces = namespaces self.modules_changes_queue = q self.modules_queue_lock = l self.is_supported_indep_mods_system = False self.modules_lock_list = [] - self.waiting_modules_list = [] + self.waiting_modules_list = Set() self.timer = threading.Thread() self.timer_queue = queue.Queue() self.timer_queue_lock = threading.Lock() @@ -102,7 +107,7 @@ def get_sm_func(self, sm, port): func = SFP_SM_ENUM[sm] print_and_log("got func {} for state {} for port {}".format(func, sm, port)) return func - except KeyError: + except KeyError as e: print_and_log("exception {} for port {}".format(e, port)) return None @@ -130,7 +135,8 @@ def run(self): # check for each port if the module connected and if it supports independent mode or legacy num_of_ports = DeviceDataManager.get_sfp_count() # create the modules sysfs fds poller - self.poll_obj = select.poll() + #self.poll_obj = select.poll() + self.poll_obj = [] for port in range(num_of_ports): #temp_port_dict = {IS_INDEPENDENT_MODULE: False} # check sysfs per port whether it's independent mode or legacy @@ -145,52 +151,68 @@ def run(self): module_fd = open(module_fd_indep_path, "r") temp_module_sm.set_module_fd(module_fd) else: - module_fd_legacy_path = self.get_sysfs_legacy_ethernet_port_fd(SYSFS_LEGACY_PRESENCE_FD, port) + module_fd_legacy_path = SYSFS_LEGACY_PRESENCE_FD.format(port) temp_module_sm.set_module_fd_path(module_fd_legacy_path) module_fd = open(module_fd_legacy_path, "r") temp_module_sm.set_module_fd(module_fd) # add lock to use with timer task updating next state per module object self.modules_lock_list.append(threading.Lock()) # register the module's sysfs fd to poller with ERR and PRI attrs - - self.poll_obj.register(module_fd, select.POLLERR | select.POLLPRI) - self.fds_mapping_to_obj[module_fd.fileno()] = temp_module_sm - temp_module_sm.set_poll_obj(self.poll_obj) + #self.poll_obj.register(module_fd, select.POLLERR | select.POLLPRI) + #self.fds_mapping_to_obj[module_fd.fileno()] = temp_module_sm + #temp_module_sm.set_poll_obj(self.poll_obj) # start SM for this independent module print_and_log("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) self.sfp_port_dict_initial[port] = temp_module_sm self.sfp_port_dict[port] = temp_module_sm - print_and_log("sfp_port_dict: {}".format(self.sfp_port_dict)) - # loop on listening to changes, gather and put them into shared queue, then continue looping i = 0 - # need at least 1 module in final state until it makes sense to poll for changes + # need at least 1 module in final state until it makes sense to send changes dict is_final_state_module = False - while not self.task_stopping_event or not self.main_thread_stop_event: - print_and_log("running iteration {}".format(i)) + all_static_detection_done = False + print_and_log("sfp_port_dict before starting static detection: {}".format(self.sfp_port_dict)) + # loop on different state for ports in static detection until all done + while (not self.task_stopping_event or not self.main_thread_stop_event) and not all_static_detection_done: + print_and_log("static detection running iteration {}".format(i)) + waiting_list_len = len(self.waiting_modules_list) + sfp_port_dict_keys_len = len(self.sfp_port_dict.keys()) + # if all ports in waiting list - sleep one second rather than looping over and over again on same state + if waiting_list_len == sfp_port_dict_keys_len: + print_and_log("static detection length of waiting list {} and sfp port dict keys {} is the same, sleeping 1 second..." + .format(waiting_list_len, sfp_port_dict_keys_len)) + time.sleep(1) + else: + print_and_log("static detectionlength of waiting list {} and sfp port dict keys {} is different, NOT sleeping 1 second" + .format(waiting_list_len, sfp_port_dict_keys_len)) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() - print_and_log(f'STATE_LOG {port_num}: curr_state is {curr_state}') - func = self.get_sm_func(curr_state, port) - print_and_log("got returned func {} for state {}".format(func, curr_state)) - next_state = func(port_num, module_sm_obj) - print_and_log(f'STATE_LOG {port_num}: next_state is {next_state}') + print_and_log(f'static detection STATE_LOG {port_num}: curr_state is {curr_state}') + func = self.get_sm_func(curr_state, port_num) + print_and_log("static detectiongot returned func {} for state {}".format(func, curr_state)) + try: + if not isinstance(func, str): + next_state = func(port_num, module_sm_obj) + except TypeError as e: + print_and_log("static detection exception {} for port {} traceback:\n{}".format(e, port_num, traceback.format_exc())) + continue + print_and_log(f'static detection STATE_LOG {port_num}: next_state is {next_state}') if self.timer.is_alive(): - print_and_log("timer threads is alive, acquiring lock") + #print_and_log("static detection timer threads is alive, acquiring lock") self.modules_lock_list[port_num].acquire() + # for STATE_NOT_POWERED we dont advance to next state, timerTask is doing it into STATE_POWERED if curr_state != STATE_NOT_POWERED or not module_sm_obj.wait_for_power_on: module_sm_obj.set_next_state(next_state) module_sm_obj.advance_state() if module_sm_obj.get_final_state(): - print_and_log(f'STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') + print_and_log(f'static detection STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') is_final_state_module = True if self.timer.is_alive(): self.modules_lock_list[port_num].release() is_timer_alive = self.timer.is_alive() - print_and_log("timer thread is_alive {} port {}".format(is_timer_alive, port_num)) + #print_and_log("static detection timer thread is_alive {} port {}".format(is_timer_alive, port_num)) if STATE_NOT_POWERED == curr_state: if not is_timer_alive: - print_and_log ("curr_state is {} and timer thread is_alive {}, running timer task thread" + print_and_log ("static detection curr_state is {} and timer thread is_alive {}, running timer task thread" .format(curr_state, is_timer_alive)) # call timer task self.timer = threading.Timer(1.0, self.timerTask) @@ -203,65 +225,175 @@ def run(self): if self.timer.is_alive(): print_and_log("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].release() + state_db = None for port, module_obj in self.sfp_port_dict_initial.items(): final_state = module_obj.get_final_state() if port in self.sfp_port_dict.keys() and final_state: - del self.sfp_port_dict[port] self.sfp_changes_dict[str(module_obj.port_num)] = '0' if final_state in [STATE_HW_NOT_PRESENT, STATE_ERROR_HANDLER] else '1' if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: namespaces = multi_asic.get_front_end_namespaces() for namespace in namespaces: - print_and_log("getting state_db for port {} namespace {}".format(port, namespace)) + print_and_log("static detection getting state_db for port {} namespace {}".format(port, namespace)) state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - print_and_log("got state_db for port {} namespace {}".format(port, namespace)) + print_and_log("static detection got state_db for port {} namespace {}".format(port, namespace)) if state_db is not None: - print_and_log("connecting to state_db for port {} namespace {}".format(port, namespace)) + print_and_log("static detection connecting to state_db for port {} namespace {}".format(port, namespace)) state_db.connect(state_db.STATE_DB) if final_state in [STATE_FW_CONTROL]: control_type = 'FW_CONTROL' elif final_state in [STATE_SW_CONTROL]: control_type = 'SW_CONTROL' - table_name = 'TRANSCEIVER_MODULES_MGMT|{}'.format(port) - print_and_log("setting state_db table {} for port {} namespace {} control_type {}".format(table_name, port, namespace, control_type)) - state_db.set(state_db.STATE_DB, table_name, - "control type", control_type) + table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) + print_and_log("static detection setting state_db table {} for port {} namespace {} control_type {}" + .format(table_name, port, namespace, control_type)) + state_db.set(state_db.STATE_DB, table_name, "control type", control_type) + del self.sfp_port_dict[port] + + if len(self.sfp_changes_dict) > 0: + print_and_log("static detection putting sfp_changes_dict {} in modules changes queue..." + .format(self.sfp_changes_dict)) + try: + self.modules_queue_lock.acquire() + self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) + self.modules_queue_lock.release() + self.sfp_changes_dict = {} + except queue.Full: + print_and_log("failed to put item from modules changes queue, queue is full") + else: + print_and_log("static detection sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) + i += 1 + print_and_log("sfp_port_dict: {}".format(self.sfp_port_dict)) + for port_num, module_sm_obj in self.sfp_port_dict.items(): + print_and_log("static detection port_num: {} initial state: {} current_state: {} next_state: {}" + .format(port_num, module_sm_obj.initial_state, module_sm_obj.get_current_state() + , module_sm_obj.get_next_state())) + sfp_port_dict_keys_len = len(self.sfp_port_dict.keys()) + if sfp_port_dict_keys_len == 0: + print_and_log("static detection len of keys of sfp_port_dict is 0: {}".format(sfp_port_dict_keys_len)) + all_static_detection_done = True + else: + print_and_log("static detection len of keys of sfp_port_dict is not 0: {}".format(sfp_port_dict_keys_len)) + print_and_log("static detection all_static_detection_done: {}".format(all_static_detection_done)) + + + print_and_log("sfp_port_dict before dynamic detection: {}".format(self.sfp_port_dict)) + # loop on listening to changes, gather and put them into shared queue, then continue looping + i = 0 + # need at least 1 module in final state until it makes sense to send changes dict + is_final_state_module = False + while not self.task_stopping_event or not self.main_thread_stop_event: + print_and_log("dynamic detection running iteration {}".format(i)) + # poll for changes with 1 second timeout + #fds_events = self.poll_obj.poll(1000) + fds_events = select.select(self.poll_obj, [], [], 1000) + print_and_log("dynamic detection polled obj checking fds_events iteration {}".format(i)) + for fd, event in fds_events: + # get modules object from fd according to saved key-value of fd-module obj saved earlier + print_and_log("dynamic detection working on fd {} event {}".format(fd, event)) + module_obj = self.fds_mapping_to_obj[fd] + print_and_log("dynamic detection got module_obj {} with port {} from fd number {} path {}" + .format(module_obj, module_obj.port_num, fd, module_obj.module_fd_path)) + # put again module obj in sfp_port_dict so next loop will work on it + module_obj.reset_all_states() + self.sfp_port_dict[module_obj.port_num] = module_obj + # put port number in changes dict to pass back to xcvrd's calling SfpStateUpdateTask thread + #self.sfp_changes_dict[module_obj.port_num] = module_obj + + for port_num, module_sm_obj in self.sfp_port_dict.items(): + curr_state = module_sm_obj.get_current_state() + print_and_log(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') + func = self.get_sm_func(curr_state, port) + print_and_log("dynamic detection got returned func {} for state {}".format(func, curr_state)) + try: + next_state = func(port_num, module_sm_obj) + except TypeError as e: + print_and_log("dynamic detection exception {} for port {}".format(e, port_num)) + continue + print_and_log(f'dynamic detection STATE_LOG {port_num}: next_state is {next_state}') + if self.timer.is_alive(): + #print_and_log("dynamic detection timer threads is alive, acquiring lock") + self.modules_lock_list[port_num].acquire() + if curr_state != STATE_NOT_POWERED or not module_sm_obj.wait_for_power_on: + module_sm_obj.set_next_state(next_state) + module_sm_obj.advance_state() + if module_sm_obj.get_final_state(): + #print_and_log(f'dynamic detection STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') + is_final_state_module = True + if self.timer.is_alive(): + self.modules_lock_list[port_num].release() + is_timer_alive = self.timer.is_alive() + #print_and_log("dynamic detection timer thread is_alive {} port {}".format(is_timer_alive, port_num)) + if STATE_NOT_POWERED == curr_state: + if not is_timer_alive: + print_and_log("dynamic detection curr_state is {} and timer thread is_alive {}, running timer task thread" + .format(curr_state, is_timer_alive)) + # call timer task + self.timer = threading.Timer(1.0, self.timerTask) + self.timer.start() + self.timer_queue.put(module_sm_obj) + if self.timer.is_alive(): + print_and_log("dynamic detection timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) + self.modules_lock_list[port_num].acquire() + module_sm_obj.set_next_state(next_state) + if self.timer.is_alive(): + print_and_log( + "dynamic detection timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) + self.modules_lock_list[port_num].release() if is_final_state_module: - # poll for changes with 1 second timeout - fds_events = self.poll_obj.poll(1000) - for fd, event in fds_events: - # get modules object from fd according to saved key-value of fd-module obj saved earlier - module_obj = self.fds_mapping_to_obj[fd] - # put again module obj in sfp_port_dict so next loop will work on it - self.sfp_port_dict[module_obj.port_num] = module_obj - # put port number in changes dict to pass back to xcvrd's calling SfpStateUpdateTask thread - #self.sfp_changes_dict[module_obj.port_num] = module_obj + state_db = None + for port, module_obj in self.sfp_port_dict.items(): + final_state = module_obj.get_final_state() + if final_state: + #del self.sfp_port_dict[port] + # add port to delete list that we will iterate on later and delete the ports from sfp_port_dict + self.sfp_delete_list_from_port_dict.append(port) + self.sfp_changes_dict[str(module_obj.port_num)] = '0' if final_state in [STATE_HW_NOT_PRESENT, + STATE_ERROR_HANDLER] else '1' + if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: + namespaces = multi_asic.get_front_end_namespaces() + for namespace in namespaces: + print_and_log("dynamic detection getting state_db for port {} namespace {}".format(port, namespace)) + state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) + print_and_log("dynamic detection got state_db for port {} namespace {}".format(port, namespace)) + if state_db is not None: + print_and_log( + "dynamic detection connecting to state_db for port {} namespace {}".format(port, namespace)) + state_db.connect(state_db.STATE_DB) + if final_state in [STATE_FW_CONTROL]: + control_type = 'FW_CONTROL' + elif final_state in [STATE_SW_CONTROL]: + control_type = 'SW_CONTROL' + table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) + print_and_log( + "dynamic detection setting state_db table {} for port {} namespace {} control_type {}" + .format(table_name, port, namespace, control_type)) + state_db.set(state_db.STATE_DB, table_name,"control type", control_type) + + print_and_log("dynamic detection sfp_port_dict before deletion: {}".format(self.sfp_port_dict)) + for port in self.sfp_delete_list_from_port_dict: + print_and_log("dynamic detection deleting port {} from sfp_port_dict".format(port)) + del self.sfp_port_dict[port] + self.sfp_delete_list_from_port_dict = [] + print_and_log("dynamic detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict)) if self.sfp_changes_dict: - print_and_log("putting sfp_changes_dict {} in modules changes queue...".format(self.sfp_changes_dict)) - #with self.modules_changes_queue.mutex: - if True: - try: - self.modules_queue_lock.acquire() - self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) - self.modules_queue_lock.release() - self.sfp_changes_dict = {} - except queue.Full: - print_and_log("failed to put item from modules changes queue, queue is full") + print_and_log("dynamic detection putting sfp_changes_dict {} in modules changes queue...".format(self.sfp_changes_dict)) + try: + self.modules_queue_lock.acquire() + self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) + self.modules_queue_lock.release() + self.sfp_changes_dict = {} + except queue.Full: + print_and_log("failed to put item from modules changes queue, queue is full") else: print_and_log("sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) - #time.sleep(3) i += 1 - if 10 == i: - print_and_log('stopping the loop for no reason?') - self.task_stopping_event.set() print_and_log("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): print_and_log("port_num: {} module_sm_obj initial state: {} current_state: {} next_state: {}" .format(port_num, module_sm_obj.initial_state, module_sm_obj.get_current_state(), module_sm_obj.get_next_state())) - #while not self.task_stopping_event: - def check_if_hw_present(self, port, module_sm_obj): module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) @@ -276,7 +408,8 @@ def check_if_hw_present(self, port, module_sm_obj): print_and_log("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) return STATE_HW_PRESENT except Exception as e: - print_and_log("exception {} for port {}".format(e, port)) + print_and_log("exception {} for port {}, setting as final state STATE_ERROR_HANDLER".format(e, port)) + module_sm_obj.set_final_state(STATE_ERROR_HANDLER) return STATE_ERROR_HANDLER return STATE_HW_NOT_PRESENT @@ -292,7 +425,8 @@ def checkIfModuleAvailable(self, port, module_sm_obj): elif 1 == val_int: print_and_log(f'port {port} power is good') #elif 2 == val_int: - self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) + #self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) + self.poll_obj.append(module_sm_obj.module_fd) self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = module_sm_obj return STATE_MODULE_AVAILABLE except Exception as e: @@ -313,13 +447,21 @@ def checkIfPowerOn(self, port, module_sm_obj): print_and_log(f'port {port} is not powered') return STATE_NOT_POWERED elif 1 == val_int: - if not module_sm_obj.wait_for_power_on and utils.read_int_from_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port)) == 1: - print_and_log(f'port {port} is powered, but need reset') - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) - module_sm_obj.reset_start_time = time.time() - module_sm_obj.wait_for_power_on = True - self.waiting_modules_list.append(module_sm_obj) - return STATE_NOT_POWERED + if not module_sm_obj.wait_for_power_on and \ + utils.read_int_from_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port)) == 1: + sfp = sfp_module.SFP(port) + xcvr_api = sfp.get_xcvr_api() + # only if xcvr_api is None or if it is not active optics cables need reset + if not xcvr_api or xcvr_api.is_flat_memory(): + print_and_log(f'port {port} is powered, but need reset') + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) + module_sm_obj.reset_start_time = time.time() + module_sm_obj.wait_for_power_on = True + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 1) + module_sm_obj.reset_start_time = time.time() + module_sm_obj.wait_for_power_on = True + self.waiting_modules_list.append(module_sm_obj) + return STATE_NOT_POWERED print_and_log(f'port {port} is powered, does not need reset') return STATE_POWERED except Exception as e: @@ -342,16 +484,9 @@ def powerOnModule(self, port, module_sm_obj): # echo 0 > /sys/module/sx_core/$asic/$module/hw_reset with open(module_fd_indep_path_r, "w") as module_fd: module_fd.write("0") - # with open(module_fd_indep_path, "r") as module_fd: - # val = module_fd.read() - # if 0 == val: - # return STATE_NOT_POWERED - # elif 1 == val: - # return STATE_POWERED module_sm_obj.reset_start_time = time.time() module_sm_obj.wait_for_power_on = True self.waiting_modules_list.append(module_sm_obj) - except Exception as e: print_and_log("exception in powerOnModule {} for port {}".format(e, port)) return STATE_HW_NOT_PRESENT @@ -359,10 +494,8 @@ def powerOnModule(self, port, module_sm_obj): def checkModuleType(self, port, module_sm_obj): print_and_log("enter checkModuleType port {} module_sm_obj {}".format(port, module_sm_obj)) - #sfp_base_module = SfpBase() sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() - #if not hasattr(xcvr_api, "xcvr_eeprom"): if not xcvr_api: print_and_log("checkModuleType calling sfp reinit for port {} module_sm_obj {}".format(port, module_sm_obj)) sfp.reinit() @@ -371,14 +504,31 @@ def checkModuleType(self, port, module_sm_obj): field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.ID_FIELD) module_type_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) #module_type = xcvr_api.xcvr_eeprom.read_raw(consts.ID_FIELD) + if module_type_ba is None: + print_and_log("checkModuleType module_type is None for port {} - checking if we didnt retry yet".format(port)) + # if we didnt do this retry yet - do it once + if not module_sm_obj.eeprom_poweron_reset_retry: + print_and_log("checkModuleType module_type is None retrying by falling back to STATE_NOT_POWERED" + "for port {}".format(port)) + module_sm_obj.eeprom_poweron_reset_retry = True + self.add_port_to_wait_reset(module_sm_obj) + return STATE_NOT_POWERED + else: + print_and_log("checkModuleType module_type is None and already retried - setting as STATE_ERROR_HANDLER" + "for port {}".format(port)) + return STATE_ERROR_HANDLER module_type = int.from_bytes(module_type_ba, "big") - print_and_log("got module_type {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + print_and_log("checkModuleType got module_type {} in check_module_type port {}".format(port, module_type)) if not 24 == module_type: - print_and_log("setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + print_and_log("check_module_type port {} setting STATE_FW_CONTROL due to module ID {}".format(port, module_type)) module_sm_obj.set_final_state = STATE_FW_CONTROL - power_cap = self.checkPowerCapNonCMIS(port, module_sm_obj) + #power_cap = self.checkPowerCapNonCMIS(port, module_sm_obj) return STATE_FW_CONTROL else: + if xcvr_api.is_flat_memory(): + print_and_log("check_module_type port {} setting STATE_FW_CONTROL module ID {} due to flat_mem device" + .format(module_type, port)) + return STATE_FW_CONTROL print_and_log("checking power cap for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) power_cap = self.checkPowerCap(port, module_sm_obj) if power_cap is STATE_POWER_LIMIT_ERROR: @@ -429,13 +579,16 @@ def checkPowerCapNonCMIS(self, port, module_sm_obj): .format(power_class, port, module_sm_obj, clei_code, cdr_tx, cdr_rx)) field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.EXT_ID_FIELD) powercap_ba = xcvr_api.xcvr_eeprom.read_raw(field.get_offset(), field.get_size()) - print_and_log("checkPowerCapNonCMIS got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) + print_and_log("checkPowerCapNonCMIS got powercap bytearray {} for port {} module_sm_obj {}" + .format(powercap_ba, port, module_sm_obj)) powercap = int.from_bytes(powercap_ba, "big") if type(powercap_ba) is bytearray else powercap_ba - print_and_log("checkPowerCapNonCMIS got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) + print_and_log("checkPowerCapNonCMIS got powercap {} for port {} module_sm_obj {}" + .format(powercap, port, module_sm_obj)) def saveModuleControlMode(self, port, module_sm_obj): - print_and_log("saveModuleControlMode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) + print_and_log("saveModuleControlMode setting current state {} for port {} as final state" + .format(module_sm_obj.get_current_state(), port)) # bug - need to find root cause and fix #module_sm_obj.set_final_state(module_sm_obj.get_current_state()) state = module_sm_obj.get_current_state() @@ -446,6 +599,16 @@ def saveModuleControlMode(self, port, module_sm_obj): with open(indep_fd_fw_control, "w") as fw_control_fd: fw_control_fd.write("0") print_and_log("saveModuleControlMode set FW control for state {} port {}".format(state, port)) + module_fd_legacy_path = SYSFS_LEGACY_PRESENCE_FD.format(port) + module_sm_obj.set_module_fd_path(module_fd_legacy_path) + module_fd = open(module_fd_legacy_path, "r") + module_sm_obj.set_module_fd(module_fd) + print_and_log("saveModuleControlMode changed module fd to legacy present for port {}".format(port)) + print_and_log("saveModuleControlMode registering sysfs fd {} number {} path {} for port {}".format( + module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) + self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) + self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = module_sm_obj + module_sm_obj.set_poll_obj(self.poll_obj) print_and_log("saveModuleControlMode set current state {} for port {} as final state {}".format( module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) @@ -467,13 +630,14 @@ def timerTask(self): # wakes up every 1 second empty = False state = module.get_current_state() if module and state == STATE_NOT_POWERED: - print_and_log("timerTask module {} current_state {} counting seconds sinc reset_start_time".format(module, module.get_current_state())) + print_and_log("timerTask module {} current_state {} counting seconds since reset_start_time" + .format(module, module.get_current_state())) if time.time() - module.reset_start_time >= 3: # set next state as STATE_POWERED state to trigger the function of check module type print_and_log("timerTask module port {} locking lock of port {}".format(module.port_num, module.port_num)) self.modules_lock_list[module.port_num].acquire() print_and_log("timerTask module port {} setting next state to STATE_POWERED".format(module.port_num)) - module.set_next_state = STATE_POWERED + module.set_next_state(STATE_POWERED) print_and_log("timerTask module port {} advancing next state".format(module.port_num)) module.advance_state() print_and_log("timerTask module {} releasing lock of port {}".format(module, module.port_num)) @@ -481,25 +645,29 @@ def timerTask(self): # wakes up every 1 second print_and_log("timerTask module port {} removing module from waiting_modules_list".format(module.port_num)) self.waiting_modules_list.remove(module) time.sleep(1) - def get_sysfs_legacy_ethernet_port_fd(self, sysfs_fd, port): + + def get_sysfs_netdev_legacy_ethernet_port_fd(self, sysfs_fd, port): breakout_port = "Ethernet{}".format(port * PORT_BREAKOUT) sysfs_eth_port_fd = sysfs_fd.format(breakout_port) return sysfs_eth_port_fd - #def get_sysfs_ethernet_port_fd(self, sysfs_fd, port): - # sysfs_eth_port_fd = '/'.join([sysfs_fd, "Ethernet{}".format(port * PORT_BREAKOUT)]) - # return sysfs_eth_port_fd - def get_sysfs_ethernet_port_fd(self, sysfs_fd, port): sysfs_eth_port_fd = sysfs_fd.format(port) return sysfs_eth_port_fd + def add_port_to_wait_reset(self, module_sm_obj): + module_sm_obj.reset_start_time = time.time() + module_sm_obj.wait_for_power_on = True + self.waiting_modules_list.append(module_sm_obj) + + class ModuleStateMachine(object): def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state=STATE_HW_NOT_PRESENT - , next_state=STATE_HW_NOT_PRESENT, final_state='', is_indep_module=False - , module_fd_path='', module_fd=None, poll_obj=None, reset_start_time=None): + , next_state=STATE_HW_NOT_PRESENT, final_state='', is_indep_module=False + , module_fd_path='', module_fd=None, poll_obj=None, reset_start_time=None + , eeprom_poweron_reset_retry=False): self.port_num = port_num self.initial_state = initial_state self.current_state = current_state @@ -511,6 +679,7 @@ def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state self.poll_obj = poll_obj self.reset_start_time = reset_start_time self.wait_for_power_on = False + self.eeprom_poweron_reset_retry = eeprom_poweron_reset_retry def set_initial_state(self, state): self.initial_state = state @@ -533,8 +702,6 @@ def get_final_state(self): def set_final_state(self, state): self.final_state = state - - def advance_state(self): self.set_current_state(self.next_state) self.next_state = '' @@ -554,6 +721,13 @@ def get_poll_obj(self): def set_poll_obj(self, poll_obj): self.poll_obj = poll_obj + def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT): + self.initial_state = def_state + self.current_state = def_state + self.next_state = def_state + self.final_state = '' + + def print_and_log(msg): logger.log_info(msg) print(msg) From f10550149d9e55443af89972c4f5fb9642e432ad Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Sun, 8 Oct 2023 22:26:51 +0000 Subject: [PATCH 05/26] added fixes and retries to power on and hw reset workaround for FW issue of eeprom access blocked for passive cables --- .../sonic_platform/chassis.py | 44 +- .../sonic_platform/modules_mgmt.py | 480 ++++++++++-------- 2 files changed, 266 insertions(+), 258 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 27c7a74ace92..f008c99092f9 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -380,11 +380,11 @@ def get_change_event(self, timeout=0): Args: timeout: Timeout in milliseconds (optional). If timeout == 0, - this method will block until a change is detected. + this method will block until a change is detected. - Deprecated Returns: (bool, dict): - - True if call successful, False if not; + - True if call successful, False if not; - Deprecated, will always return True - A nested dictionary where key is a device type, value is a dictionary with key:value pairs in the format of {'device_id':'device_event'}, @@ -403,52 +403,30 @@ def get_change_event(self, timeout=0): self.modules_mgmt_thread.start() self.threads.append(self.modules_mgmt_thread) self.initialize_sfp() - # Initialize SFP event first - # if not self.sfp_event: - # from .sfp_event import sfp_event - # self.sfp_event = sfp_event(self.RJ45_port_list) - # self.sfp_event.initialize() - # - # wait_for_ever = (timeout == 0) - # # select timeout should be no more than 1000ms to ensure fast shutdown flow - # select_timeout = 1000.0 if timeout >= 1000 else float(timeout) port_dict = {} error_dict = {} - # begin = time.time() + i = 0 while True: - print('get_change_event() acquiring queue lock') + logger.log_warning('get_change_event() acquiring queue lock iteration {}'.format(i)) self.modules_queue_lock.acquire() if self.modules_changes_queue.qsize() > 0: - #with self.modules_changes_queue.mutex: if True: try: - print('get_change_event() trying to get changes from queue') + logger.log_warning('get_change_event() trying to get changes from queue') port_dict = self.modules_changes_queue.get(timeout=1) - print ('get_change_event() port_dict: {}'.format(port_dict)) + logger.log_warning ('get_change_event() port_dict: {}'.format(port_dict)) except queue.Empty: - logger.log_info("failed to get item from modules changes queue") - print("failed to get item from modules changes queue") - print('get_change_event() releasing queue lock') + logger.log_warning("failed to get item from modules changes queue") + logger.log_warning('get_change_event() releasing queue lock iteration {}'.format(i)) self.modules_queue_lock.release() - time.sleep(1) - # status = self.sfp_event.check_sfp_status(port_dict, error_dict, select_timeout) - # if bool(port_dict): - # break - # - # if not wait_for_ever: - # elapse = time.time() - begin - # if elapse * 1000 > timeout: - # break - - # if status: + if port_dict: self.reinit_sfps(port_dict) result_dict = {'sfp': port_dict} - # if error_dict: result_dict['sfp_error'] = error_dict return True, result_dict - # else: - # return True, {'sfp': {}} + time.sleep(1) + i += 1 def reinit_sfps(self, port_dict): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 339bf25eabb8..67dd3bb3359a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -35,7 +35,7 @@ SAI_INDEP_MODULE_MODE = "SAI_INDEPENDENT_MODULE_MODE" SAI_INDEP_MODULE_MODE_DELIMITER = "=" SAI_INDEP_MODULE_MODE_TRUE_STR = "1" -SYSFS_LEGACY_PRESENCE_FD = "/sys/module/sx_core/asic0/module{}/present" +SYSFS_LEGACY_FD_PRESENCE = "/sys/module/sx_core/asic0/module{}/present" ASIC_NUM = 0 PORT_BREAKOUT = 8 SYSFS_INDEPENDENT_FD_PREFIX_WO_MODULE = "/sys/module/sx_core/asic{}".format(ASIC_NUM) @@ -52,6 +52,7 @@ STATE_DB_TABLE_NAME_PREFIX = 'TRANSCEIVER_MODULES_MGMT|{}' +MAX_EEPROM_ERROR_RESET_RETRIES = 4 class ModulesMgmtTask(threading.Thread): RETRY_EEPROM_READING_INTERVAL = 60 @@ -59,18 +60,10 @@ class ModulesMgmtTask(threading.Thread): def __init__(self, namespaces=None, port_mapping=None, main_thread_stop_event=None, sfp_error_event=None, q=None ,l=None): threading.Thread.__init__(self) - self.name = "SfpStateUpdateTask" - self.exc = None + self.name = "ModulesMgmtTask" self.task_stopping_event = threading.Event() self.main_thread_stop_event = main_thread_stop_event self.sfp_error_event = sfp_error_event - #self.port_mapping = copy.deepcopy(port_mapping) - # A set to hold those logical port name who fail to read EEPROM - self.retry_eeprom_set = set() - # To avoid retry EEPROM read too fast, record the last EEPROM read timestamp in this member - self.last_retry_eeprom_time = 0 - # A dict to hold SFP error event, for SFP insert/remove event, it is not necessary to cache them - # because _wrapper_get_presence returns the SFP presence status self.sfp_error_dict = {} self.sfp_insert_events = {} self.sfp_port_dict_initial = {} @@ -82,12 +75,14 @@ def __init__(self, namespaces=None, port_mapping=None, main_thread_stop_event=No self.modules_queue_lock = l self.is_supported_indep_mods_system = False self.modules_lock_list = [] - self.waiting_modules_list = Set() + # A set to hold those modules waiting 3 seconds since power on and hw reset + self.waiting_modules_list = set() self.timer = threading.Thread() self.timer_queue = queue.Queue() self.timer_queue_lock = threading.Lock() self.poll_obj = None self.fds_mapping_to_obj = {} + self.fds_events_count_dict = {} # SFPs state machine def get_sm_func(self, sm, port): @@ -102,13 +97,13 @@ def get_sm_func(self, sm, port): , STATE_POWER_LIMIT_ERROR: STATE_POWER_LIMIT_ERROR , STATE_SYSFS_ERROR: STATE_SYSFS_ERROR } - print_and_log("getting func for state {} for port {}".format(sm, port)) + logger.log_warning("getting func for state {} for port {}".format(sm, port)) try: func = SFP_SM_ENUM[sm] - print_and_log("got func {} for state {} for port {}".format(func, sm, port)) + logger.log_warning("got func {} for state {} for port {}".format(func, sm, port)) return func except KeyError as e: - print_and_log("exception {} for port {}".format(e, port)) + logger.log_warning("exception {} for port {}".format(e, port)) return None def run(self): @@ -117,7 +112,7 @@ def run(self): #hwsku = device_info.get_hwsku() independent_file = INDEP_PROFILE_FILE.format(hwsku_dir) if os.path.isfile(independent_file): - print_and_log("file {} found, checking content for independent mode value".format(independent_file)) + logger.log_warning("file {} found, checking content for independent mode value".format(independent_file)) with open(independent_file, "r") as independent_file_fd: independent_file_content = independent_file_fd.read() if SAI_INDEP_MODULE_MODE in independent_file_content and \ @@ -125,33 +120,33 @@ def run(self): independent_file_splitted = independent_file_content.split(SAI_INDEP_MODULE_MODE_DELIMITER) if (len(independent_file_splitted) > 1): self.is_supported_indep_mods_system = int(independent_file_splitted[1]) == int(SAI_INDEP_MODULE_MODE_TRUE_STR) - print_and_log("file {} found, system will work in independent mode".format(independent_file)) - print_and_log("value of indep mode var: {} found in file".format(independent_file_splitted[1])) + logger.log_warning("file {} found, system will work in independent mode".format(independent_file)) + logger.log_warning("value of indep mode var: {} found in file".format(independent_file_splitted[1])) else: - print_and_log("file {} not found, system stays in legacy mode".format(independent_file)) + logger.log_warning("file {} not found, system stays in legacy mode".format(independent_file)) # static init - at first go over all ports and check each one if it's independent module or legacy self.sfp_changes_dict = {} # check for each port if the module connected and if it supports independent mode or legacy num_of_ports = DeviceDataManager.get_sfp_count() # create the modules sysfs fds poller - #self.poll_obj = select.poll() - self.poll_obj = [] + self.poll_obj = select.poll() + #self.poll_obj = [] for port in range(num_of_ports): #temp_port_dict = {IS_INDEPENDENT_MODULE: False} # check sysfs per port whether it's independent mode or legacy temp_module_sm = ModuleStateMachine(port_num=port, initial_state=STATE_HW_NOT_PRESENT , current_state=STATE_HW_NOT_PRESENT) module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) - print_and_log("system in indep mode: {} port {}".format(self.is_supported_indep_mods_system, port)) + logger.log_warning("system in indep mode: {} port {}".format(self.is_supported_indep_mods_system, port)) if self.is_supported_indep_mods_system and os.path.isfile(module_fd_indep_path): - print_and_log("system in indep mode: {} port {} reading file {}".format(self.is_supported_indep_mods_system, port, module_fd_indep_path)) + logger.log_warning("system in indep mode: {} port {} reading file {}".format(self.is_supported_indep_mods_system, port, module_fd_indep_path)) temp_module_sm.set_is_indep_modules(True) temp_module_sm.set_module_fd_path(module_fd_indep_path) module_fd = open(module_fd_indep_path, "r") temp_module_sm.set_module_fd(module_fd) else: - module_fd_legacy_path = SYSFS_LEGACY_PRESENCE_FD.format(port) + module_fd_legacy_path = self.get_sysfs_legacy_ethernet_port_fd(SYSFS_LEGACY_FD_PRESENCE, port) temp_module_sm.set_module_fd_path(module_fd_legacy_path) module_fd = open(module_fd_legacy_path, "r") temp_module_sm.set_module_fd(module_fd) @@ -160,9 +155,9 @@ def run(self): # register the module's sysfs fd to poller with ERR and PRI attrs #self.poll_obj.register(module_fd, select.POLLERR | select.POLLPRI) #self.fds_mapping_to_obj[module_fd.fileno()] = temp_module_sm - #temp_module_sm.set_poll_obj(self.poll_obj) + temp_module_sm.set_poll_obj(self.poll_obj) # start SM for this independent module - print_and_log("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) + logger.log_warning("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) self.sfp_port_dict_initial[port] = temp_module_sm self.sfp_port_dict[port] = temp_module_sm @@ -170,60 +165,60 @@ def run(self): # need at least 1 module in final state until it makes sense to send changes dict is_final_state_module = False all_static_detection_done = False - print_and_log("sfp_port_dict before starting static detection: {}".format(self.sfp_port_dict)) + logger.log_warning("sfp_port_dict before starting static detection: {}".format(self.sfp_port_dict)) # loop on different state for ports in static detection until all done while (not self.task_stopping_event or not self.main_thread_stop_event) and not all_static_detection_done: - print_and_log("static detection running iteration {}".format(i)) + logger.log_warning("static detection running iteration {}".format(i)) waiting_list_len = len(self.waiting_modules_list) sfp_port_dict_keys_len = len(self.sfp_port_dict.keys()) - # if all ports in waiting list - sleep one second rather than looping over and over again on same state if waiting_list_len == sfp_port_dict_keys_len: - print_and_log("static detection length of waiting list {} and sfp port dict keys {} is the same, sleeping 1 second..." - .format(waiting_list_len, sfp_port_dict_keys_len)) + logger.log_warning("static detection length of waiting list {}: {} and sfp port dict keys {}:{} is the same, sleeping 1 second..." + .format(waiting_list_len, self.waiting_modules_list, sfp_port_dict_keys_len, self.sfp_port_dict.keys())) time.sleep(1) else: - print_and_log("static detectionlength of waiting list {} and sfp port dict keys {} is different, NOT sleeping 1 second" - .format(waiting_list_len, sfp_port_dict_keys_len)) + logger.log_warning("static detectionlength of waiting list {}: {} and sfp port dict keys {}: {} is different, NOT sleeping 1 second" + .format(waiting_list_len, self.waiting_modules_list, sfp_port_dict_keys_len, self.sfp_port_dict.keys())) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() - print_and_log(f'static detection STATE_LOG {port_num}: curr_state is {curr_state}') + logger.log_warning(f'static detection STATE_LOG {port_num}: curr_state is {curr_state}') func = self.get_sm_func(curr_state, port_num) - print_and_log("static detectiongot returned func {} for state {}".format(func, curr_state)) + logger.log_warning("static detectiongot returned func {} for state {}".format(func, curr_state)) try: if not isinstance(func, str): next_state = func(port_num, module_sm_obj) except TypeError as e: - print_and_log("static detection exception {} for port {} traceback:\n{}".format(e, port_num, traceback.format_exc())) + logger.log_warning("static detection exception {} for port {} traceback:\n{}".format(e, port_num, traceback.format_exc())) + module_sm_obj.set_final_state(STATE_ERROR_HANDLER) continue - print_and_log(f'static detection STATE_LOG {port_num}: next_state is {next_state}') + logger.log_warning(f'static detection STATE_LOG {port_num}: next_state is {next_state}') if self.timer.is_alive(): - #print_and_log("static detection timer threads is alive, acquiring lock") + logger.log_info("static detection timer threads is alive, acquiring lock") self.modules_lock_list[port_num].acquire() # for STATE_NOT_POWERED we dont advance to next state, timerTask is doing it into STATE_POWERED if curr_state != STATE_NOT_POWERED or not module_sm_obj.wait_for_power_on: module_sm_obj.set_next_state(next_state) module_sm_obj.advance_state() if module_sm_obj.get_final_state(): - print_and_log(f'static detection STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') + logger.log_warning(f'static detection STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') is_final_state_module = True if self.timer.is_alive(): self.modules_lock_list[port_num].release() is_timer_alive = self.timer.is_alive() - #print_and_log("static detection timer thread is_alive {} port {}".format(is_timer_alive, port_num)) + logger.log_info("static detection timer thread is_alive {} port {}".format(is_timer_alive, port_num)) if STATE_NOT_POWERED == curr_state: if not is_timer_alive: - print_and_log ("static detection curr_state is {} and timer thread is_alive {}, running timer task thread" + logger.log_warning ("static detection curr_state is {} and timer thread is_alive {}, running timer task thread" .format(curr_state, is_timer_alive)) # call timer task self.timer = threading.Timer(1.0, self.timerTask) self.timer.start() self.timer_queue.put(module_sm_obj) if self.timer.is_alive(): - print_and_log("timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) + logger.log_warning("timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].acquire() module_sm_obj.set_next_state(next_state) if self.timer.is_alive(): - print_and_log("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) + logger.log_warning("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].release() state_db = None @@ -234,24 +229,24 @@ def run(self): if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: namespaces = multi_asic.get_front_end_namespaces() for namespace in namespaces: - print_and_log("static detection getting state_db for port {} namespace {}".format(port, namespace)) + logger.log_warning("static detection getting state_db for port {} namespace {}".format(port, namespace)) state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - print_and_log("static detection got state_db for port {} namespace {}".format(port, namespace)) + logger.log_warning("static detection got state_db for port {} namespace {}".format(port, namespace)) if state_db is not None: - print_and_log("static detection connecting to state_db for port {} namespace {}".format(port, namespace)) + logger.log_warning("static detection connecting to state_db for port {} namespace {}".format(port, namespace)) state_db.connect(state_db.STATE_DB) if final_state in [STATE_FW_CONTROL]: control_type = 'FW_CONTROL' elif final_state in [STATE_SW_CONTROL]: control_type = 'SW_CONTROL' table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) - print_and_log("static detection setting state_db table {} for port {} namespace {} control_type {}" + logger.log_warning("static detection setting state_db table {} for port {} namespace {} control_type {}" .format(table_name, port, namespace, control_type)) state_db.set(state_db.STATE_DB, table_name, "control type", control_type) del self.sfp_port_dict[port] if len(self.sfp_changes_dict) > 0: - print_and_log("static detection putting sfp_changes_dict {} in modules changes queue..." + logger.log_warning("static detection putting sfp_changes_dict {} in modules changes queue..." .format(self.sfp_changes_dict)) try: self.modules_queue_lock.acquire() @@ -259,85 +254,111 @@ def run(self): self.modules_queue_lock.release() self.sfp_changes_dict = {} except queue.Full: - print_and_log("failed to put item from modules changes queue, queue is full") + logger.log_warning("failed to put item from modules changes queue, queue is full") else: - print_and_log("static detection sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) + logger.log_warning("static detection sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) i += 1 - print_and_log("sfp_port_dict: {}".format(self.sfp_port_dict)) + logger.log_warning("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): - print_and_log("static detection port_num: {} initial state: {} current_state: {} next_state: {}" + logger.log_warning("static detection port_num: {} initial state: {} current_state: {} next_state: {}" .format(port_num, module_sm_obj.initial_state, module_sm_obj.get_current_state() , module_sm_obj.get_next_state())) sfp_port_dict_keys_len = len(self.sfp_port_dict.keys()) if sfp_port_dict_keys_len == 0: - print_and_log("static detection len of keys of sfp_port_dict is 0: {}".format(sfp_port_dict_keys_len)) + logger.log_warning("static detection len of keys of sfp_port_dict is 0: {}".format(sfp_port_dict_keys_len)) all_static_detection_done = True else: - print_and_log("static detection len of keys of sfp_port_dict is not 0: {}".format(sfp_port_dict_keys_len)) - print_and_log("static detection all_static_detection_done: {}".format(all_static_detection_done)) + logger.log_warning("static detection len of keys of sfp_port_dict is not 0: {}".format(sfp_port_dict_keys_len)) + logger.log_warning("static detection all_static_detection_done: {}".format(all_static_detection_done)) - print_and_log("sfp_port_dict before dynamic detection: {}".format(self.sfp_port_dict)) + logger.log_warning("sfp_port_dict before dynamic detection: {}".format(self.sfp_port_dict)) # loop on listening to changes, gather and put them into shared queue, then continue looping i = 0 # need at least 1 module in final state until it makes sense to send changes dict is_final_state_module = False + # initialize fds events count to 0 + for fd_fileno in self.fds_mapping_to_obj: + module_obj = self.fds_mapping_to_obj[fd_fileno] + self.fds_events_count_dict[module_obj.port_num] = 0 + dummy_read = False while not self.task_stopping_event or not self.main_thread_stop_event: - print_and_log("dynamic detection running iteration {}".format(i)) + logger.log_warning("dynamic detection running iteration {}".format(i)) + # dummy read all sysfs fds before polling them + if not dummy_read: + for fd_fileno in self.fds_mapping_to_obj: + module_obj = self.fds_mapping_to_obj[fd_fileno] + module_obj.module_fd = open(module_obj.module_fd_path, "r") + try: + logger.log_warning("dynamic detection dummy reading from fd path {} for port {}" + .format(module_obj.module_fd_path, module_obj.port_num)) + val = module_obj.module_fd.read() + val_int = None + if len(val) > 0: + val_int = int(val) + logger.log_warning("dynamic detection dummy read {} int {} for port {} before polling" + .format(val, val_int, module_obj.port_num)) + except Exception as e: + logger.log_warning("dynamic detection exception on dummy read {} for port {} traceback:\n{}" + .format(e, module_obj.port_num, traceback.format_exc())) + dummy_read = True + logger.log_warning("dynamic detection sleeping 1 second before polling...") + time.sleep(1) # poll for changes with 1 second timeout - #fds_events = self.poll_obj.poll(1000) - fds_events = select.select(self.poll_obj, [], [], 1000) - print_and_log("dynamic detection polled obj checking fds_events iteration {}".format(i)) + fds_events = self.poll_obj.poll(1000) + logger.log_warning("dynamic detection polled obj checking fds_events iteration {}".format(i)) for fd, event in fds_events: # get modules object from fd according to saved key-value of fd-module obj saved earlier - print_and_log("dynamic detection working on fd {} event {}".format(fd, event)) + logger.log_warning("dynamic detection working on fd {} event {}".format(fd, event)) module_obj = self.fds_mapping_to_obj[fd] - print_and_log("dynamic detection got module_obj {} with port {} from fd number {} path {}" - .format(module_obj, module_obj.port_num, fd, module_obj.module_fd_path)) - # put again module obj in sfp_port_dict so next loop will work on it - module_obj.reset_all_states() - self.sfp_port_dict[module_obj.port_num] = module_obj - # put port number in changes dict to pass back to xcvrd's calling SfpStateUpdateTask thread - #self.sfp_changes_dict[module_obj.port_num] = module_obj - + self.fds_events_count_dict[module_obj.port_num] += 1 + logger.log_warning("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" + .format(module_obj, module_obj.port_num, fd, module_obj.module_fd_path, self.fds_events_count_dict[module_obj.port_num])) + if module_obj.port_num not in self.sfp_port_dict.keys(): + logger.log_warning("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states".format(module_obj.port_num, self.sfp_port_dict.keys())) + module_obj.reset_all_states() + # put again module obj in sfp_port_dict so next loop will work on it + self.sfp_port_dict[module_obj.port_num] = module_obj + logger.log_warning("dynamic detection sleeping 2 second...") + time.sleep(2) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() - print_and_log(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') + logger.log_warning(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') func = self.get_sm_func(curr_state, port) - print_and_log("dynamic detection got returned func {} for state {}".format(func, curr_state)) + logger.log_warning("dynamic detection got returned func {} for state {}".format(func, curr_state)) try: - next_state = func(port_num, module_sm_obj) + next_state = func(port_num, module_sm_obj, dynamic=True) except TypeError as e: - print_and_log("dynamic detection exception {} for port {}".format(e, port_num)) + logger.log_warning("exception {} for port {}".format(e, port_num)) continue - print_and_log(f'dynamic detection STATE_LOG {port_num}: next_state is {next_state}') + logger.log_warning(f'dynamic detection STATE_LOG {port_num}: next_state is {next_state}') if self.timer.is_alive(): - #print_and_log("dynamic detection timer threads is alive, acquiring lock") + logger.log_info("dynamic detection timer threads is alive, acquiring lock") self.modules_lock_list[port_num].acquire() if curr_state != STATE_NOT_POWERED or not module_sm_obj.wait_for_power_on: module_sm_obj.set_next_state(next_state) module_sm_obj.advance_state() if module_sm_obj.get_final_state(): - #print_and_log(f'dynamic detection STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') + logger.log_info(f'dynamic detection STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') is_final_state_module = True if self.timer.is_alive(): self.modules_lock_list[port_num].release() is_timer_alive = self.timer.is_alive() - #print_and_log("dynamic detection timer thread is_alive {} port {}".format(is_timer_alive, port_num)) + logger.log_info("dynamic detection timer thread is_alive {} port {}".format(is_timer_alive, port_num)) if STATE_NOT_POWERED == curr_state: if not is_timer_alive: - print_and_log("dynamic detection curr_state is {} and timer thread is_alive {}, running timer task thread" + logger.log_warning("dynamic detection curr_state is {} and timer thread is_alive {}, running timer task thread" .format(curr_state, is_timer_alive)) # call timer task self.timer = threading.Timer(1.0, self.timerTask) self.timer.start() self.timer_queue.put(module_sm_obj) if self.timer.is_alive(): - print_and_log("dynamic detection timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) + logger.log_warning("dynamic detection timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].acquire() module_sm_obj.set_next_state(next_state) if self.timer.is_alive(): - print_and_log( + logger.log_warning( "dynamic detection timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].release() @@ -354,11 +375,11 @@ def run(self): if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: namespaces = multi_asic.get_front_end_namespaces() for namespace in namespaces: - print_and_log("dynamic detection getting state_db for port {} namespace {}".format(port, namespace)) + logger.log_warning("dynamic detection getting state_db for port {} namespace {}".format(port, namespace)) state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - print_and_log("dynamic detection got state_db for port {} namespace {}".format(port, namespace)) + logger.log_warning("dynamic detection got state_db for port {} namespace {}".format(port, namespace)) if state_db is not None: - print_and_log( + logger.log_warning( "dynamic detection connecting to state_db for port {} namespace {}".format(port, namespace)) state_db.connect(state_db.STATE_DB) if final_state in [STATE_FW_CONTROL]: @@ -366,170 +387,192 @@ def run(self): elif final_state in [STATE_SW_CONTROL]: control_type = 'SW_CONTROL' table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) - print_and_log( + logger.log_warning( "dynamic detection setting state_db table {} for port {} namespace {} control_type {}" .format(table_name, port, namespace, control_type)) state_db.set(state_db.STATE_DB, table_name,"control type", control_type) - print_and_log("dynamic detection sfp_port_dict before deletion: {}".format(self.sfp_port_dict)) + logger.log_warning("dynamic detection sfp_port_dict before deletion: {}".format(self.sfp_port_dict)) for port in self.sfp_delete_list_from_port_dict: - print_and_log("dynamic detection deleting port {} from sfp_port_dict".format(port)) del self.sfp_port_dict[port] self.sfp_delete_list_from_port_dict = [] - print_and_log("dynamic detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict)) + logger.log_warning("dynamic detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict)) if self.sfp_changes_dict: - print_and_log("dynamic detection putting sfp_changes_dict {} in modules changes queue...".format(self.sfp_changes_dict)) - try: - self.modules_queue_lock.acquire() - self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) - self.modules_queue_lock.release() - self.sfp_changes_dict = {} - except queue.Full: - print_and_log("failed to put item from modules changes queue, queue is full") + logger.log_warning("dynamic detection putting sfp_changes_dict {} in modules changes queue...".format(self.sfp_changes_dict)) + #with self.modules_changes_queue.mutex: + if True: + try: + self.modules_queue_lock.acquire() + self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) + self.modules_queue_lock.release() + self.sfp_changes_dict = {} + except queue.Full: + logger.log_warning("failed to put item from modules changes queue, queue is full") else: - print_and_log("sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) + logger.log_warning("sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) i += 1 - print_and_log("sfp_port_dict: {}".format(self.sfp_port_dict)) + logger.log_warning("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): - print_and_log("port_num: {} module_sm_obj initial state: {} current_state: {} next_state: {}" + logger.log_warning("port_num: {} module_sm_obj initial state: {} current_state: {} next_state: {}" .format(port_num, module_sm_obj.initial_state, module_sm_obj.get_current_state(), module_sm_obj.get_next_state())) - def check_if_hw_present(self, port, module_sm_obj): + def check_if_hw_present(self, port, module_sm_obj, dynamic=False): module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) if os.path.isfile(module_fd_indep_path): try: val_int = utils.read_int_from_file(module_fd_indep_path) if 0 == val_int: - print_and_log("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val_int)) + logger.log_warning("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val_int)) module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT elif 1 == val_int: - print_and_log("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) + logger.log_warning("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) return STATE_HW_PRESENT except Exception as e: - print_and_log("exception {} for port {}, setting as final state STATE_ERROR_HANDLER".format(e, port)) - module_sm_obj.set_final_state(STATE_ERROR_HANDLER) - return STATE_ERROR_HANDLER + if not dynamic: + logger.log_warning("exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) + module_sm_obj.set_final_state(STATE_ERROR_HANDLER) + return STATE_ERROR_HANDLER + else: + module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) + logger.log_warning("falling back to legacy sysfs {} for port {} dynamic: {}".format(port, module_fd_legacy_path, dynamic)) + if os.path.isfile(module_fd_legacy_path): + logger.log_warning("reading legacy sysfs {} for port {} dynamic: {}".format(port, module_fd_legacy_path, dynamic)) + try: + val_int = utils.read_int_from_file(module_fd_legacy_path) + if 0 == val_int: + logger.log_warning("returning {} for val {} legacy sysfd {} port {} dynamic {}".format(STATE_HW_NOT_PRESENT, val_int, module_fd_legacy_path, port, dynamic)) + module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) + return STATE_HW_NOT_PRESENT + elif 1 == val_int: + logger.log_warning("returning {} for val {} legacy sysfd {} port {} dynamic {}".format(STATE_HW_PRESENT, val_int, module_fd_legacy_path, port, dynamic)) + return STATE_HW_PRESENT + except Exception as e: + logger.log_warning("check_if_hw_present dynamic exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) + module_sm_obj.set_final_state(STATE_ERROR_HANDLER) + return STATE_ERROR_HANDLER return STATE_HW_NOT_PRESENT - def checkIfModuleAvailable(self, port, module_sm_obj): - print_and_log("enter check_if_module_available port {} module_sm_obj {}".format(port, module_sm_obj)) + def checkIfModuleAvailable(self, port, module_sm_obj, dynamic=False): + logger.log_warning("enter check_if_module_available port {} module_sm_obj {}".format(port, module_sm_obj)) module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_GOOD.format(port) if os.path.isfile(module_fd_indep_path): try: val_int = utils.read_int_from_file(module_fd_indep_path) if 0 == val_int: - print_and_log(f'port {port} power is not good') + logger.log_warning(f'port {port} power is not good') + self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) + self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = module_sm_obj return STATE_HW_NOT_PRESENT elif 1 == val_int: - print_and_log(f'port {port} power is good') - #elif 2 == val_int: - #self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) - self.poll_obj.append(module_sm_obj.module_fd) - self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = module_sm_obj + logger.log_warning(f'port {port} power is good') return STATE_MODULE_AVAILABLE except Exception as e: - print_and_log("exception {} for port {}".format(e, port)) + logger.log_warning("exception {} for port {}".format(e, port)) return STATE_HW_NOT_PRESENT - print_and_log(f'port {port} has no power good file {module_fd_indep_path}') + logger.log_warning(f'port {port} has no power good file {module_fd_indep_path}') return STATE_HW_NOT_PRESENT - def checkIfPowerOn(self, port, module_sm_obj): - print_and_log(f'enter checkIfPowerOn for port {port}') + def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): + logger.log_warning(f'enter checkIfPowerOn for port {port}') module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) if os.path.isfile(module_fd_indep_path): try: - with open(module_fd_indep_path, "r") as module_fd: - val = module_fd.read() - val_int = int(val) - if 0 == val_int: - print_and_log(f'port {port} is not powered') - return STATE_NOT_POWERED - elif 1 == val_int: - if not module_sm_obj.wait_for_power_on and \ - utils.read_int_from_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port)) == 1: - sfp = sfp_module.SFP(port) - xcvr_api = sfp.get_xcvr_api() - # only if xcvr_api is None or if it is not active optics cables need reset - if not xcvr_api or xcvr_api.is_flat_memory(): - print_and_log(f'port {port} is powered, but need reset') - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) - module_sm_obj.reset_start_time = time.time() - module_sm_obj.wait_for_power_on = True - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 1) - module_sm_obj.reset_start_time = time.time() - module_sm_obj.wait_for_power_on = True - self.waiting_modules_list.append(module_sm_obj) - return STATE_NOT_POWERED - print_and_log(f'port {port} is powered, does not need reset') - return STATE_POWERED + val = utils.read_int_from_file(module_fd_indep_path) + val_int = int(val) + if 0 == val_int: + logger.log_warning(f'port {port} is not powered') + return STATE_NOT_POWERED + elif 1 == val_int: + if not module_sm_obj.wait_for_power_on and \ + utils.read_int_from_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port)) == 1: + sfp = sfp_module.SFP(port) + xcvr_api = sfp.get_xcvr_api() + # only if xcvr_api is None or if it is not active optics cables need reset + if not xcvr_api or xcvr_api.is_flat_memory(): + logger.log_warning(f'port {port} is powered, but need reset') + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) + module_sm_obj.reset_start_time = time.time() + module_sm_obj.wait_for_power_on = True + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 1) + module_sm_obj.reset_start_time = time.time() + module_sm_obj.wait_for_power_on = True + self.waiting_modules_list.add(module_sm_obj.port_num) + return STATE_NOT_POWERED + logger.log_warning(f'port {port} is powered, does not need reset') + return STATE_POWERED except Exception as e: - print_and_log(f'got exception {e} in checkIfPowerOn') + logger.log_warning(f'got exception {e} in checkIfPowerOn') return STATE_HW_NOT_PRESENT - def powerOnModule(self, port, module_sm_obj): + def powerOnModule(self, port, module_sm_obj, dynamic=False): #if module_sm_obj not in self.waiting_modules_list: if not module_sm_obj.wait_for_power_on: module_fd_indep_path_po = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) module_fd_indep_path_r = SYSFS_INDEPENDENT_FD_HW_RESET.format(port) try: if os.path.isfile(module_fd_indep_path_po): - print_and_log("powerOnModule powering on via {} for port {}".format(module_fd_indep_path_po, port)) + logger.log_warning("powerOnModule powering on via {} for port {}".format(module_fd_indep_path_po, port)) # echo 1 > /sys/module/sx_core/$asic/$module/power_on with open(module_fd_indep_path_po, "w") as module_fd: module_fd.write("1") if os.path.isfile(module_fd_indep_path_r): - print_and_log("powerOnModule resetting via {} for port {}".format(module_fd_indep_path_r, port)) + logger.log_warning("powerOnModule resetting via {} for port {}".format(module_fd_indep_path_r, port)) # echo 0 > /sys/module/sx_core/$asic/$module/hw_reset with open(module_fd_indep_path_r, "w") as module_fd: module_fd.write("0") module_sm_obj.reset_start_time = time.time() module_sm_obj.wait_for_power_on = True - self.waiting_modules_list.append(module_sm_obj) + self.waiting_modules_list.add(module_sm_obj.port_num) except Exception as e: - print_and_log("exception in powerOnModule {} for port {}".format(e, port)) + logger.log_warning("exception in powerOnModule {} for port {}".format(e, port)) return STATE_HW_NOT_PRESENT return STATE_NOT_POWERED - def checkModuleType(self, port, module_sm_obj): - print_and_log("enter checkModuleType port {} module_sm_obj {}".format(port, module_sm_obj)) + def checkModuleType(self, port, module_sm_obj, dynamic=False): + logger.log_warning("enter checkModuleType port {} module_sm_obj {}".format(port, module_sm_obj)) sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() if not xcvr_api: - print_and_log("checkModuleType calling sfp reinit for port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_warning("checkModuleType calling sfp reinit for port {} module_sm_obj {}".format(port, module_sm_obj)) sfp.reinit() - print_and_log("checkModuleType setting as FW control as xcvr_api is empty for port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_warning("checkModuleType setting as FW control as xcvr_api is empty for port {} module_sm_obj {}".format(port, module_sm_obj)) return STATE_FW_CONTROL field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.ID_FIELD) module_type_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) - #module_type = xcvr_api.xcvr_eeprom.read_raw(consts.ID_FIELD) if module_type_ba is None: - print_and_log("checkModuleType module_type is None for port {} - checking if we didnt retry yet".format(port)) - # if we didnt do this retry yet - do it once - if not module_sm_obj.eeprom_poweron_reset_retry: - print_and_log("checkModuleType module_type is None retrying by falling back to STATE_NOT_POWERED" - "for port {}".format(port)) - module_sm_obj.eeprom_poweron_reset_retry = True + logger.log_warning("checkModuleType module_type is None for port {} - checking if we didnt retry yet max number of retries: {}".format(port, MAX_EEPROM_ERROR_RESET_RETRIES)) + # if we didnt do this retry yet - do it up to 3 times - workaround for FW issue blocking upper page access + if module_sm_obj.eeprom_poweron_reset_retries < MAX_EEPROM_ERROR_RESET_RETRIES: + logger.log_warning("checkModuleType module_type is None retrying by falling back to STATE_NOT_POWERED eeprom reset retries {}" + " for port {}".format(module_sm_obj.eeprom_poweron_reset_retries, port)) + if module_sm_obj.eeprom_poweron_reset_retries % 2 == 0: + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "0") + logger.log_warning("checkModuleType sleeping 1 second...") + time.sleep(1) + else: + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "1") self.add_port_to_wait_reset(module_sm_obj) + module_sm_obj.eeprom_poweron_reset_retries += 1 return STATE_NOT_POWERED else: - print_and_log("checkModuleType module_type is None and already retried - setting as STATE_ERROR_HANDLER" + logger.log_warning("checkModuleType module_type is None and already retried - setting as STATE_ERROR_HANDLER" "for port {}".format(port)) + module_sm_obj.set_final_state(STATE_ERROR_HANDLER) return STATE_ERROR_HANDLER module_type = int.from_bytes(module_type_ba, "big") - print_and_log("checkModuleType got module_type {} in check_module_type port {}".format(port, module_type)) + logger.log_warning("got module_type {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) if not 24 == module_type: - print_and_log("check_module_type port {} setting STATE_FW_CONTROL due to module ID {}".format(port, module_type)) + logger.log_warning("setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) module_sm_obj.set_final_state = STATE_FW_CONTROL - #power_cap = self.checkPowerCapNonCMIS(port, module_sm_obj) return STATE_FW_CONTROL else: if xcvr_api.is_flat_memory(): - print_and_log("check_module_type port {} setting STATE_FW_CONTROL module ID {} due to flat_mem device" + logger.log_warning("check_module_type port {} setting STATE_FW_CONTROL module ID {} due to flat_mem device" .format(module_type, port)) return STATE_FW_CONTROL - print_and_log("checking power cap for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + logger.log_warning("checking power cap for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) power_cap = self.checkPowerCap(port, module_sm_obj) if power_cap is STATE_POWER_LIMIT_ERROR: return STATE_POWER_LIMIT_ERROR @@ -543,52 +586,28 @@ def checkModuleType(self, port, module_sm_obj): # freq_fd.write(read_mci) return STATE_SW_CONTROL - def checkPowerCap(self, port, module_sm_obj): - print_and_log("enter checkPowerCap port {} module_sm_obj {}".format(port, module_sm_obj)) + def checkPowerCap(self, port, module_sm_obj, dynamic=False): + logger.log_warning("enter checkPowerCap port {} module_sm_obj {}".format(port, module_sm_obj)) #sfp_base_module = SfpBase() sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.MAX_POWER_FIELD) powercap_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) - print_and_log("checkPowerCap got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) + logger.log_warning("checkPowerCap got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) powercap = int.from_bytes(powercap_ba, "big") - print_and_log("checkPowerCap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) + logger.log_warning("checkPowerCap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) indep_fd_power_limit = self.get_sysfs_ethernet_port_fd(SYSFS_INDEPENDENT_FD_POWER_LIMIT, port) - with open(indep_fd_power_limit, "r") as power_limit_fd: - cage_power_limit = power_limit_fd.read() - print_and_log("checkPowerCap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) + #with open(indep_fd_power_limit, "r") as power_limit_fd: + # cage_power_limit = power_limit_fd.read() + cage_power_limit = utils.read_int_from_file(indep_fd_power_limit) + logger.log_warning("checkPowerCap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) if powercap > int(cage_power_limit): - print_and_log("checkPowerCap powercap {} != cage_power_limit {} for port {} module_sm_obj {}".format(powercap, cage_power_limit, port, module_sm_obj)) + logger.log_warning("checkPowerCap powercap {} != cage_power_limit {} for port {} module_sm_obj {}".format(powercap, cage_power_limit, port, module_sm_obj)) module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) return STATE_POWER_LIMIT_ERROR - def checkPowerCapNonCMIS(self, port, module_sm_obj): - print_and_log("enter checkPowerCapNonCMIS port {} module_sm_obj {}".format(port, module_sm_obj)) - sfp = sfp_module.SFP(port) - xcvr_api = sfp.get_xcvr_api() - serial_id = xcvr_api.xcvr_eeprom.read(consts.SERIAL_ID_FIELD) - if serial_id is None: - return None - - ext_id = serial_id[consts.EXT_ID_FIELD] - power_class = ext_id[consts.POWER_CLASS_FIELD] - clei_code = ext_id[consts.CLEI_CODE_FIELD] - cdr_tx = ext_id[consts.CDR_TX_FIELD] - cdr_rx = ext_id[consts.CDR_RX_FIELD] - print_and_log("checkPowerCapNonCMIS got powercap {} for port {} module_sm_obj {} clei {} cdr_tx {} cdr_rx {}" - .format(power_class, port, module_sm_obj, clei_code, cdr_tx, cdr_rx)) - field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.EXT_ID_FIELD) - powercap_ba = xcvr_api.xcvr_eeprom.read_raw(field.get_offset(), field.get_size()) - print_and_log("checkPowerCapNonCMIS got powercap bytearray {} for port {} module_sm_obj {}" - .format(powercap_ba, port, module_sm_obj)) - powercap = int.from_bytes(powercap_ba, "big") if type(powercap_ba) is bytearray else powercap_ba - print_and_log("checkPowerCapNonCMIS got powercap {} for port {} module_sm_obj {}" - .format(powercap, port, module_sm_obj)) - - - def saveModuleControlMode(self, port, module_sm_obj): - print_and_log("saveModuleControlMode setting current state {} for port {} as final state" - .format(module_sm_obj.get_current_state(), port)) + def saveModuleControlMode(self, port, module_sm_obj, dynamic=False): + logger.log_warning("saveModuleControlMode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) # bug - need to find root cause and fix #module_sm_obj.set_final_state(module_sm_obj.get_current_state()) state = module_sm_obj.get_current_state() @@ -598,18 +617,18 @@ def saveModuleControlMode(self, port, module_sm_obj): indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) with open(indep_fd_fw_control, "w") as fw_control_fd: fw_control_fd.write("0") - print_and_log("saveModuleControlMode set FW control for state {} port {}".format(state, port)) - module_fd_legacy_path = SYSFS_LEGACY_PRESENCE_FD.format(port) + logger.log_warning("saveModuleControlMode set FW control for state {} port {}".format(state, port)) + module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) module_sm_obj.set_module_fd_path(module_fd_legacy_path) module_fd = open(module_fd_legacy_path, "r") module_sm_obj.set_module_fd(module_fd) - print_and_log("saveModuleControlMode changed module fd to legacy present for port {}".format(port)) - print_and_log("saveModuleControlMode registering sysfs fd {} number {} path {} for port {}".format( - module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) + logger.log_warning("saveModuleControlMode changed module fd to legacy present for port {}".format(port)) + logger.log_warning("saveModuleControlMode registering sysfs fd {} number {} path {} for port {}" + .format(module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = module_sm_obj module_sm_obj.set_poll_obj(self.poll_obj) - print_and_log("saveModuleControlMode set current state {} for port {} as final state {}".format( + logger.log_warning("saveModuleControlMode set current state {} for port {} as final state {}".format( module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) def STATE_ERROR_HANDLER(self): @@ -622,31 +641,42 @@ def STATE_SYSFS_ERROR(self): pass def timerTask(self): # wakes up every 1 second + logger.log_warning("timerTask entered run state") empty = False + i = 0 while not empty: + logger.log_warning("timerTask while loop itartion {}".format(i)) empty = True - for module in self.waiting_modules_list: - print_and_log("timerTask working on module {}".format(module)) + port_list_to_delete = [] + for port in self.waiting_modules_list: + logger.log_warning("timerTask working on port {}".format(port)) empty = False + module = self.sfp_port_dict[port] + logger.log_warning("timerTask got module with port_num {} from port {}".format(module.port_num, port)) state = module.get_current_state() if module and state == STATE_NOT_POWERED: - print_and_log("timerTask module {} current_state {} counting seconds since reset_start_time" + logger.log_warning("timerTask module {} current_state {} counting seconds since reset_start_time" .format(module, module.get_current_state())) if time.time() - module.reset_start_time >= 3: # set next state as STATE_POWERED state to trigger the function of check module type - print_and_log("timerTask module port {} locking lock of port {}".format(module.port_num, module.port_num)) + logger.log_warning("timerTask module port {} locking lock of port {}".format(module.port_num, module.port_num)) self.modules_lock_list[module.port_num].acquire() - print_and_log("timerTask module port {} setting next state to STATE_POWERED".format(module.port_num)) + logger.log_warning("timerTask module port {} setting next state to STATE_POWERED".format(module.port_num)) module.set_next_state(STATE_POWERED) - print_and_log("timerTask module port {} advancing next state".format(module.port_num)) + logger.log_warning("timerTask module port {} advancing next state".format(module.port_num)) module.advance_state() - print_and_log("timerTask module {} releasing lock of port {}".format(module, module.port_num)) + logger.log_warning("timerTask module port {} releasing lock of port {}".format(port, module.port_num)) self.modules_lock_list[module.port_num].release() - print_and_log("timerTask module port {} removing module from waiting_modules_list".format(module.port_num)) - self.waiting_modules_list.remove(module) + logger.log_warning("timerTask module port {} adding to delete list to remove from waiting_modules_list".format(module.port_num)) + port_list_to_delete.append(module.port_num) + logger.log_warning("timerTask deleting ports {} from waiting_modules_list...".format(port_list_to_delete)) + for port in port_list_to_delete: + logger.log_warning("timerTask deleting port {} from waiting_modules_list".format(port)) + self.waiting_modules_list.remove(port) + logger.log_warning("timerTask waiting_modules_list after deletion: {}".format(self.waiting_modules_list)) time.sleep(1) - - def get_sysfs_netdev_legacy_ethernet_port_fd(self, sysfs_fd, port): + i += 1 + def get_sysfs_legacy_ethernet_port_fd(self, sysfs_fd, port): breakout_port = "Ethernet{}".format(port * PORT_BREAKOUT) sysfs_eth_port_fd = sysfs_fd.format(breakout_port) return sysfs_eth_port_fd @@ -657,17 +687,20 @@ def get_sysfs_ethernet_port_fd(self, sysfs_fd, port): def add_port_to_wait_reset(self, module_sm_obj): module_sm_obj.reset_start_time = time.time() + logger.log_warning("add_port_to_wait_reset reset_start_time {}".format(module_sm_obj.reset_start_time)) module_sm_obj.wait_for_power_on = True - self.waiting_modules_list.append(module_sm_obj) - + logger.log_warning("add_port_to_wait_reset wait_for_power_on {}".format(module_sm_obj.wait_for_power_on)) + self.waiting_modules_list.add(module_sm_obj.port_num) + logger.log_warning("add_port_to_wait_reset waiting_list after adding: {}".format(self.waiting_modules_list)) class ModuleStateMachine(object): def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state=STATE_HW_NOT_PRESENT , next_state=STATE_HW_NOT_PRESENT, final_state='', is_indep_module=False - , module_fd_path='', module_fd=None, poll_obj=None, reset_start_time=None - , eeprom_poweron_reset_retry=False): + , module_fd_path='', module_fd=None, poll_obj=None, reset_start_time=None + , eeprom_poweron_reset_retries=1): + self.port_num = port_num self.initial_state = initial_state self.current_state = current_state @@ -679,7 +712,7 @@ def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state self.poll_obj = poll_obj self.reset_start_time = reset_start_time self.wait_for_power_on = False - self.eeprom_poweron_reset_retry = eeprom_poweron_reset_retry + self.eeprom_poweron_reset_retries = eeprom_poweron_reset_retries def set_initial_state(self, state): self.initial_state = state @@ -721,13 +754,10 @@ def get_poll_obj(self): def set_poll_obj(self, poll_obj): self.poll_obj = poll_obj - def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT): + def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT, retries=1): self.initial_state = def_state self.current_state = def_state self.next_state = def_state self.final_state = '' - - -def print_and_log(msg): - logger.log_info(msg) - print(msg) + self.wait_for_power_on = False + self.eeprom_poweron_reset_retries = retries From 87fe6e818254e6d16abcfa2c2f3174989fb602f9 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Wed, 11 Oct 2023 00:16:25 +0000 Subject: [PATCH 06/26] added mci max freq. write commmented out due to bug in SDK added common code in functions added power_good sysfs countinga and poll dummy read added chassis thread destructor commented code --- .../sonic_platform/chassis.py | 15 +- .../sonic_platform/modules_mgmt.py | 319 +++++++++--------- 2 files changed, 173 insertions(+), 161 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index f008c99092f9..1e8eb84e0156 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -130,12 +130,7 @@ def __init__(self): self.modules_mgmt_thread = threading.Thread() self.modules_changes_queue = queue.Queue() self.modules_queue_lock = threading.Lock() - #self.modules_changes_dict = {} - - self.is_independent_modules_system = False - SAI_INDEPENDENT_MODULE_MODE = True - if SAI_INDEPENDENT_MODULE_MODE: - self.is_independent_modules_system = True + self.modules_mgmt_task_stopping_event = threading.Event() logger.log_info("Chassis loaded successfully") @@ -147,6 +142,11 @@ def __del__(self): if self.sfp_module.SFP.shared_sdk_handle: self.sfp_module.deinitialize_sdk_handle(self.sfp_module.SFP.shared_sdk_handle) + #self.modules_mgmt_task_stopping_event.set() + #logger.log_warning('set modules_mgmt_task_stopping_event {self.modules_mgmt_task_stopping_event}') + #self.modules_mgmt_thread.join(timeout=10) + #logger.log_warning('joined modules_mgmt_thread thread') + @property def RJ45_port_list(self): if not self._RJ45_port_inited: @@ -399,7 +399,8 @@ def get_change_event(self, timeout=0): if not self.modules_mgmt_thread.is_alive(): # open new SFP change events thread self.modules_mgmt_thread = modules_mgmt.ModulesMgmtTask(q=self.modules_changes_queue - , l=self.modules_queue_lock) + , l=self.modules_queue_lock + , main_thread_stop_event = self.modules_mgmt_task_stopping_event) self.modules_mgmt_thread.start() self.threads.append(self.modules_mgmt_thread) self.initialize_sfp() diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 67dd3bb3359a..981592bcf817 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -57,13 +57,10 @@ class ModulesMgmtTask(threading.Thread): RETRY_EEPROM_READING_INTERVAL = 60 - def __init__(self, namespaces=None, port_mapping=None, main_thread_stop_event=None, sfp_error_event=None, q=None - ,l=None): + def __init__(self, namespaces=None, main_thread_stop_event=None, q=None, l=None): threading.Thread.__init__(self) self.name = "ModulesMgmtTask" - self.task_stopping_event = threading.Event() self.main_thread_stop_event = main_thread_stop_event - self.sfp_error_event = sfp_error_event self.sfp_error_dict = {} self.sfp_insert_events = {} self.sfp_port_dict_initial = {} @@ -83,6 +80,7 @@ def __init__(self, namespaces=None, port_mapping=None, main_thread_stop_event=No self.poll_obj = None self.fds_mapping_to_obj = {} self.fds_events_count_dict = {} + self.delete_ports_from_state_db_list = [] # SFPs state machine def get_sm_func(self, sm, port): @@ -152,9 +150,6 @@ def run(self): temp_module_sm.set_module_fd(module_fd) # add lock to use with timer task updating next state per module object self.modules_lock_list.append(threading.Lock()) - # register the module's sysfs fd to poller with ERR and PRI attrs - #self.poll_obj.register(module_fd, select.POLLERR | select.POLLPRI) - #self.fds_mapping_to_obj[module_fd.fileno()] = temp_module_sm temp_module_sm.set_poll_obj(self.poll_obj) # start SM for this independent module logger.log_warning("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) @@ -166,8 +161,8 @@ def run(self): is_final_state_module = False all_static_detection_done = False logger.log_warning("sfp_port_dict before starting static detection: {}".format(self.sfp_port_dict)) - # loop on different state for ports in static detection until all done - while (not self.task_stopping_event or not self.main_thread_stop_event) and not all_static_detection_done: + # static detection - loop on different state for all ports until all done + while not self.main_thread_stop_event and not all_static_detection_done: logger.log_warning("static detection running iteration {}".format(i)) waiting_list_len = len(self.waiting_modules_list) sfp_port_dict_keys_len = len(self.sfp_port_dict.keys()) @@ -182,7 +177,7 @@ def run(self): curr_state = module_sm_obj.get_current_state() logger.log_warning(f'static detection STATE_LOG {port_num}: curr_state is {curr_state}') func = self.get_sm_func(curr_state, port_num) - logger.log_warning("static detectiongot returned func {} for state {}".format(func, curr_state)) + logger.log_warning("static detection got returned func {} for state {}".format(func, curr_state)) try: if not isinstance(func, str): next_state = func(port_num, module_sm_obj) @@ -221,42 +216,10 @@ def run(self): logger.log_warning("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].release() - state_db = None - for port, module_obj in self.sfp_port_dict_initial.items(): - final_state = module_obj.get_final_state() - if port in self.sfp_port_dict.keys() and final_state: - self.sfp_changes_dict[str(module_obj.port_num)] = '0' if final_state in [STATE_HW_NOT_PRESENT, STATE_ERROR_HANDLER] else '1' - if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: - namespaces = multi_asic.get_front_end_namespaces() - for namespace in namespaces: - logger.log_warning("static detection getting state_db for port {} namespace {}".format(port, namespace)) - state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - logger.log_warning("static detection got state_db for port {} namespace {}".format(port, namespace)) - if state_db is not None: - logger.log_warning("static detection connecting to state_db for port {} namespace {}".format(port, namespace)) - state_db.connect(state_db.STATE_DB) - if final_state in [STATE_FW_CONTROL]: - control_type = 'FW_CONTROL' - elif final_state in [STATE_SW_CONTROL]: - control_type = 'SW_CONTROL' - table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) - logger.log_warning("static detection setting state_db table {} for port {} namespace {} control_type {}" - .format(table_name, port, namespace, control_type)) - state_db.set(state_db.STATE_DB, table_name, "control type", control_type) - del self.sfp_port_dict[port] - - if len(self.sfp_changes_dict) > 0: - logger.log_warning("static detection putting sfp_changes_dict {} in modules changes queue..." - .format(self.sfp_changes_dict)) - try: - self.modules_queue_lock.acquire() - self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) - self.modules_queue_lock.release() - self.sfp_changes_dict = {} - except queue.Full: - logger.log_warning("failed to put item from modules changes queue, queue is full") - else: - logger.log_warning("static detection sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) + if is_final_state_module: + self.add_ports_state_to_state_db() + self.delete_ports_from_dict() + self.send_changes_to_shared_queue() i += 1 logger.log_warning("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): @@ -271,56 +234,72 @@ def run(self): logger.log_warning("static detection len of keys of sfp_port_dict is not 0: {}".format(sfp_port_dict_keys_len)) logger.log_warning("static detection all_static_detection_done: {}".format(all_static_detection_done)) - logger.log_warning("sfp_port_dict before dynamic detection: {}".format(self.sfp_port_dict)) - # loop on listening to changes, gather and put them into shared queue, then continue looping + # dynamic detection - loop on polling changes, run state machine for them and put them into shared queue i = 0 # need at least 1 module in final state until it makes sense to send changes dict is_final_state_module = False # initialize fds events count to 0 for fd_fileno in self.fds_mapping_to_obj: module_obj = self.fds_mapping_to_obj[fd_fileno] - self.fds_events_count_dict[module_obj.port_num] = 0 + # for debug purposes + self.fds_events_count_dict[module_obj.port_num] = { 'presence' : 0 , 'power_good' : 0 } dummy_read = False - while not self.task_stopping_event or not self.main_thread_stop_event: + while not self.main_thread_stop_event: logger.log_warning("dynamic detection running iteration {}".format(i)) - # dummy read all sysfs fds before polling them + # dummy read all sysfs fds before polling them due to linux kernel implementation of poll if not dummy_read: for fd_fileno in self.fds_mapping_to_obj: - module_obj = self.fds_mapping_to_obj[fd_fileno] - module_obj.module_fd = open(module_obj.module_fd_path, "r") + # dummy read present / hw_present / power_good sysfs + module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] + module_fd = self.fds_mapping_to_obj[fd_fileno]['fd'] + fd_name = self.fds_mapping_to_obj[fd_fileno]['fd_name'] + if fd_name in ['presence']: + module_fd_path = module_obj.module_fd_path + elif fd_name in ['power_good']: + module_fd_path = module_obj.module_power_good_fd_path try: logger.log_warning("dynamic detection dummy reading from fd path {} for port {}" - .format(module_obj.module_fd_path, module_obj.port_num)) - val = module_obj.module_fd.read() + .format(module_fd_path, module_obj.port_num)) + val = module_fd.read() + module_fd.seek(0) val_int = None if len(val) > 0: val_int = int(val) - logger.log_warning("dynamic detection dummy read {} int {} for port {} before polling" + logger.log_warning("dynamic detection dummy read presence {} int {} for port {} before polling" .format(val, val_int, module_obj.port_num)) except Exception as e: - logger.log_warning("dynamic detection exception on dummy read {} for port {} traceback:\n{}" + logger.log_warning("dynamic detection exception on dummy read presence {} for port {} traceback:\n{}" .format(e, module_obj.port_num, traceback.format_exc())) dummy_read = True - logger.log_warning("dynamic detection sleeping 1 second before polling...") - time.sleep(1) # poll for changes with 1 second timeout fds_events = self.poll_obj.poll(1000) logger.log_warning("dynamic detection polled obj checking fds_events iteration {}".format(i)) for fd, event in fds_events: # get modules object from fd according to saved key-value of fd-module obj saved earlier logger.log_warning("dynamic detection working on fd {} event {}".format(fd, event)) - module_obj = self.fds_mapping_to_obj[fd] - self.fds_events_count_dict[module_obj.port_num] += 1 + #module_obj = self.fds_mapping_to_obj[fd] + module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] + module_fd = self.fds_mapping_to_obj[fd_fileno]['fd'] + fd_name = self.fds_mapping_to_obj[fd_fileno]['fd_name'] + if fd_name in ['presence']: + module_fd_path = module_obj.module_fd_path + elif fd_name in ['power_good']: + module_fd_path = module_obj.module_power_good_fd_path + self.fds_events_count_dict[module_obj.port_num][fd_name] += 1 + val = module_fd.read() + module_fd.seek(0) logger.log_warning("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" - .format(module_obj, module_obj.port_num, fd, module_obj.module_fd_path, self.fds_events_count_dict[module_obj.port_num])) + .format(module_obj, module_obj.port_num, fd, module_fd_path, self.fds_events_count_dict[module_obj.port_num])) if module_obj.port_num not in self.sfp_port_dict.keys(): logger.log_warning("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states".format(module_obj.port_num, self.sfp_port_dict.keys())) module_obj.reset_all_states() # put again module obj in sfp_port_dict so next loop will work on it self.sfp_port_dict[module_obj.port_num] = module_obj - logger.log_warning("dynamic detection sleeping 2 second...") - time.sleep(2) + self.delete_ports_from_state_db_list.append(module_obj.port_num) + self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_list) + logger.log_warning("dynamic detection sleeping 1 second...") + time.sleep(1) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() logger.log_warning(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') @@ -363,53 +342,9 @@ def run(self): self.modules_lock_list[port_num].release() if is_final_state_module: - state_db = None - for port, module_obj in self.sfp_port_dict.items(): - final_state = module_obj.get_final_state() - if final_state: - #del self.sfp_port_dict[port] - # add port to delete list that we will iterate on later and delete the ports from sfp_port_dict - self.sfp_delete_list_from_port_dict.append(port) - self.sfp_changes_dict[str(module_obj.port_num)] = '0' if final_state in [STATE_HW_NOT_PRESENT, - STATE_ERROR_HANDLER] else '1' - if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: - namespaces = multi_asic.get_front_end_namespaces() - for namespace in namespaces: - logger.log_warning("dynamic detection getting state_db for port {} namespace {}".format(port, namespace)) - state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - logger.log_warning("dynamic detection got state_db for port {} namespace {}".format(port, namespace)) - if state_db is not None: - logger.log_warning( - "dynamic detection connecting to state_db for port {} namespace {}".format(port, namespace)) - state_db.connect(state_db.STATE_DB) - if final_state in [STATE_FW_CONTROL]: - control_type = 'FW_CONTROL' - elif final_state in [STATE_SW_CONTROL]: - control_type = 'SW_CONTROL' - table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) - logger.log_warning( - "dynamic detection setting state_db table {} for port {} namespace {} control_type {}" - .format(table_name, port, namespace, control_type)) - state_db.set(state_db.STATE_DB, table_name,"control type", control_type) - - logger.log_warning("dynamic detection sfp_port_dict before deletion: {}".format(self.sfp_port_dict)) - for port in self.sfp_delete_list_from_port_dict: - del self.sfp_port_dict[port] - self.sfp_delete_list_from_port_dict = [] - logger.log_warning("dynamic detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict)) - if self.sfp_changes_dict: - logger.log_warning("dynamic detection putting sfp_changes_dict {} in modules changes queue...".format(self.sfp_changes_dict)) - #with self.modules_changes_queue.mutex: - if True: - try: - self.modules_queue_lock.acquire() - self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) - self.modules_queue_lock.release() - self.sfp_changes_dict = {} - except queue.Full: - logger.log_warning("failed to put item from modules changes queue, queue is full") - else: - logger.log_warning("sfp_changes_dict {} is empty...".format(self.sfp_changes_dict)) + self.add_ports_state_to_state_db(dynamic=True) + self.delete_ports_from_dict(dynamic=True) + self.send_changes_to_shared_queue(dynamic=True) i += 1 logger.log_warning("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): @@ -418,7 +353,10 @@ def run(self): def check_if_hw_present(self, port, module_sm_obj, dynamic=False): - module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) + if self.is_supported_indep_mods_system: + module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) + else: + module_fd_indep_path = SYSFS_LEGACY_FD_PRESENCE.format(port) if os.path.isfile(module_fd_indep_path): try: val_int = utils.read_int_from_file(module_fd_indep_path) @@ -427,31 +365,15 @@ def check_if_hw_present(self, port, module_sm_obj, dynamic=False): module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT elif 1 == val_int: + if not self.is_supported_indep_mods_system: + module_sm_obj.set_final_state(STATE_HW_PRESENT) logger.log_warning("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) return STATE_HW_PRESENT except Exception as e: - if not dynamic: - logger.log_warning("exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) - module_sm_obj.set_final_state(STATE_ERROR_HANDLER) - return STATE_ERROR_HANDLER - else: - module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) - logger.log_warning("falling back to legacy sysfs {} for port {} dynamic: {}".format(port, module_fd_legacy_path, dynamic)) - if os.path.isfile(module_fd_legacy_path): - logger.log_warning("reading legacy sysfs {} for port {} dynamic: {}".format(port, module_fd_legacy_path, dynamic)) - try: - val_int = utils.read_int_from_file(module_fd_legacy_path) - if 0 == val_int: - logger.log_warning("returning {} for val {} legacy sysfd {} port {} dynamic {}".format(STATE_HW_NOT_PRESENT, val_int, module_fd_legacy_path, port, dynamic)) - module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) - return STATE_HW_NOT_PRESENT - elif 1 == val_int: - logger.log_warning("returning {} for val {} legacy sysfd {} port {} dynamic {}".format(STATE_HW_PRESENT, val_int, module_fd_legacy_path, port, dynamic)) - return STATE_HW_PRESENT - except Exception as e: - logger.log_warning("check_if_hw_present dynamic exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) - module_sm_obj.set_final_state(STATE_ERROR_HANDLER) - return STATE_ERROR_HANDLER + logger.log_warning("exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) + module_sm_obj.set_final_state(STATE_ERROR_HANDLER) + return STATE_ERROR_HANDLER + module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT def checkIfModuleAvailable(self, port, module_sm_obj, dynamic=False): @@ -459,11 +381,20 @@ def checkIfModuleAvailable(self, port, module_sm_obj, dynamic=False): module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_GOOD.format(port) if os.path.isfile(module_fd_indep_path): try: - val_int = utils.read_int_from_file(module_fd_indep_path) + # not using utils.read_int_from_file since need to catch the exception here if no such file or it is + # not accesible. utils.read_int_from_file will return 0 in such a case + module_power_good_fd = open(module_fd_indep_path, "r") + val = module_power_good_fd.read() + val_int = int(val) + module_sm_obj.module_power_good_fd_path = module_fd_indep_path + module_sm_obj.module_power_good_fd = module_power_good_fd + # registering power good sysfs even if not good, so we can get an event from poller upon changes + self.poll_obj.register(module_sm_obj.module_power_good_fd, select.POLLERR | select.POLLPRI) + self.fds_mapping_to_obj[module_sm_obj.module_power_good_fd.fileno()] = { 'module_obj' : module_sm_obj + , 'fd':module_sm_obj.module_power_good_fd, 'fd_name' : 'power_good'} if 0 == val_int: logger.log_warning(f'port {port} power is not good') - self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) - self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = module_sm_obj + module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT elif 1 == val_int: logger.log_warning(f'port {port} power is good') @@ -472,6 +403,7 @@ def checkIfModuleAvailable(self, port, module_sm_obj, dynamic=False): logger.log_warning("exception {} for port {}".format(e, port)) return STATE_HW_NOT_PRESENT logger.log_warning(f'port {port} has no power good file {module_fd_indep_path}') + module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): @@ -504,6 +436,7 @@ def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): return STATE_POWERED except Exception as e: logger.log_warning(f'got exception {e} in checkIfPowerOn') + module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT def powerOnModule(self, port, module_sm_obj, dynamic=False): @@ -563,7 +496,8 @@ def checkModuleType(self, port, module_sm_obj, dynamic=False): return STATE_ERROR_HANDLER module_type = int.from_bytes(module_type_ba, "big") logger.log_warning("got module_type {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) - if not 24 == module_type: + # QSFP-DD ID is 24, OSFP ID is 25 - only these 2 are supported currently as independent module - SW controlled + if module_type not in [24, 25]: logger.log_warning("setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) module_sm_obj.set_final_state = STATE_FW_CONTROL return STATE_FW_CONTROL @@ -572,18 +506,26 @@ def checkModuleType(self, port, module_sm_obj, dynamic=False): logger.log_warning("check_module_type port {} setting STATE_FW_CONTROL module ID {} due to flat_mem device" .format(module_type, port)) return STATE_FW_CONTROL - logger.log_warning("checking power cap for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + logger.log_warning("checking power cap for {} in check_module_type port {} module_sm_obj {}" + .format(module_type, port, module_sm_obj)) power_cap = self.checkPowerCap(port, module_sm_obj) if power_cap is STATE_POWER_LIMIT_ERROR: + module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) return STATE_POWER_LIMIT_ERROR else: # read the module maximum supported clock of Management Comm Interface (MCI) from module EEPROM. - read_mci = "mci" + # from byte 2 bits 3-2: + # 00b means module supports up to 400KHz + # 01b means module supports up to 1MHz + logger.log_warning(f"check_module_type reading mci max frequency for port {port}") + read_mci = xcvr_api.xcvr_eeprom.read_raw(2, 1) + logger.log_warning(f"check_module_type read mci max frequency {read_mci} for port {port}") + mci_bits = read_mci & 0b00001100 + logger.log_warning(f"check_module_type read mci max frequency bits {mci_bits} for port {port}") # Then, set it to frequency Sysfs using: - # echo /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz - #indep_fd_freq = SYSFS_INDEPENDENT_FD_FREQ.format(port) - #with open(indep_fd_freq, "r") as freq_fd: - # freq_fd.write(read_mci) + # echo > /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz + indep_fd_freq = SYSFS_INDEPENDENT_FD_FREQ.format(port) + utils.write_file(indep_fd_freq, mci_bits) return STATE_SW_CONTROL def checkPowerCap(self, port, module_sm_obj, dynamic=False): @@ -623,23 +565,16 @@ def saveModuleControlMode(self, port, module_sm_obj, dynamic=False): module_fd = open(module_fd_legacy_path, "r") module_sm_obj.set_module_fd(module_fd) logger.log_warning("saveModuleControlMode changed module fd to legacy present for port {}".format(port)) + # register the module's sysfs fd to poller with ERR and PRI attrs logger.log_warning("saveModuleControlMode registering sysfs fd {} number {} path {} for port {}" .format(module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) - self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = module_sm_obj + self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = { 'module_obj' : module_sm_obj + , 'fd': module_sm_obj.module_fd, 'fd_name' : 'presence' } module_sm_obj.set_poll_obj(self.poll_obj) logger.log_warning("saveModuleControlMode set current state {} for port {} as final state {}".format( module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) - def STATE_ERROR_HANDLER(self): - pass - - def STATE_POWER_LIMIT_ERROR(self): - pass - - def STATE_SYSFS_ERROR(self): - pass - def timerTask(self): # wakes up every 1 second logger.log_warning("timerTask entered run state") empty = False @@ -693,6 +628,80 @@ def add_port_to_wait_reset(self, module_sm_obj): self.waiting_modules_list.add(module_sm_obj.port_num) logger.log_warning("add_port_to_wait_reset waiting_list after adding: {}".format(self.waiting_modules_list)) + def add_ports_state_to_state_db(self, dynamic=False): + state_db = None + detection_method = 'dynamic' if dynamic else 'static' + for port, module_obj in self.sfp_port_dict.items(): + final_state = module_obj.get_final_state() + if final_state: + # add port to delete list that we will iterate on later and delete the ports from sfp_port_dict + self.sfp_delete_list_from_port_dict.append(port) + if final_state in [STATE_HW_NOT_PRESENT, STATE_POWER_LIMIT_ERROR, STATE_ERROR_HANDLER]: + ctrl_type_db_value = '0' + else: + ctrl_type_db_value = '1' + self.sfp_changes_dict[str(module_obj.port_num)] = ctrl_type_db_value + if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: + namespaces = multi_asic.get_front_end_namespaces() + for namespace in namespaces: + logger.log_warning(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") + state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) + logger.log_warning(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") + logger.log_warning(f"{detection_method} detection got state_db for port {port} namespace {namespace}") + if state_db is not None: + logger.log_warning( + f"{detection_method} detection connecting to state_db for port {port} namespace {namespace}") + state_db.connect(state_db.STATE_DB) + if final_state in [STATE_FW_CONTROL]: + control_type = 'FW_CONTROL' + elif final_state in [STATE_SW_CONTROL]: + control_type = 'SW_CONTROL' + table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) + logger.log_warning(f"{detection_method} detection setting state_db table {table_name} for port {port}" + f" namespace {namespace} control_type {control_type}") + state_db.set(state_db.STATE_DB, table_name, "control_type", control_type) + + def delete_ports_state_from_state_db(self, ports, dynamic=True): + state_db = None + detection_method = 'dynamic' if dynamic else 'static' + for port in ports: + namespaces = multi_asic.get_front_end_namespaces() + for namespace in namespaces: + logger.log_warning(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") + state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) + logger.log_warning(f"{detection_method} detection got state_db for port {port} namespace {namespace}") + if state_db is not None: + logger.log_warning( + f"{detection_method} detection connecting to state_db for port {port} namespace {namespace}") + state_db.connect(state_db.STATE_DB) + table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) + logger.log_warning(f"{detection_method} detection deleting state_db table {table_name} " + f"for port {port} namespace {namespace}") + state_db.delete(state_db.STATE_DB, table_name) + + def delete_ports_from_dict(self, dynamic=False): + detection_method = 'dynamic' if dynamic else 'static' + logger.log_warning(f"{detection_method} detection sfp_port_dict before deletion: {self.sfp_port_dict}") + for port in self.sfp_delete_list_from_port_dict: + del self.sfp_port_dict[port] + self.sfp_delete_list_from_port_dict = [] + logger.log_warning("dynamic detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict)) + + def send_changes_to_shared_queue(self, dynamic=False): + detection_method = 'dynamic' if dynamic else 'static' + if self.sfp_changes_dict: + logger.log_warning(f"{detection_method} detection putting sfp_changes_dict {self.sfp_changes_dict} " + f"in modules changes queue...") + try: + self.modules_queue_lock.acquire() + self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) + self.modules_queue_lock.release() + self.sfp_changes_dict = {} + except queue.Full: + logger.log_warning(f"{detection_method} failed to put item from modules changes queue, queue is full") + else: + logger.log_warning(f"{detection_method} sfp_changes_dict {self.sfp_changes_dict} is empty...") + class ModuleStateMachine(object): @@ -713,6 +722,8 @@ def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state self.reset_start_time = reset_start_time self.wait_for_power_on = False self.eeprom_poweron_reset_retries = eeprom_poweron_reset_retries + self.module_power_good_fd_path = module_fd_path + self.module_power_good_fd = module_fd def set_initial_state(self, state): self.initial_state = state From e222cf07c85e0146ad76ae91a89dfe83f4f24d74 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Wed, 11 Oct 2023 00:25:14 +0000 Subject: [PATCH 07/26] change all logger from warning to info --- .../sonic_platform/chassis.py | 14 +- .../sonic_platform/modules_mgmt.py | 232 +++++++++--------- 2 files changed, 123 insertions(+), 123 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 1e8eb84e0156..d334fc10d8b0 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -143,9 +143,9 @@ def __del__(self): self.sfp_module.deinitialize_sdk_handle(self.sfp_module.SFP.shared_sdk_handle) #self.modules_mgmt_task_stopping_event.set() - #logger.log_warning('set modules_mgmt_task_stopping_event {self.modules_mgmt_task_stopping_event}') + #logger.log_info('set modules_mgmt_task_stopping_event {self.modules_mgmt_task_stopping_event}') #self.modules_mgmt_thread.join(timeout=10) - #logger.log_warning('joined modules_mgmt_thread thread') + #logger.log_info('joined modules_mgmt_thread thread') @property def RJ45_port_list(self): @@ -408,17 +408,17 @@ def get_change_event(self, timeout=0): error_dict = {} i = 0 while True: - logger.log_warning('get_change_event() acquiring queue lock iteration {}'.format(i)) + logger.log_info('get_change_event() acquiring queue lock iteration {}'.format(i)) self.modules_queue_lock.acquire() if self.modules_changes_queue.qsize() > 0: if True: try: - logger.log_warning('get_change_event() trying to get changes from queue') + logger.log_info('get_change_event() trying to get changes from queue') port_dict = self.modules_changes_queue.get(timeout=1) - logger.log_warning ('get_change_event() port_dict: {}'.format(port_dict)) + logger.log_info ('get_change_event() port_dict: {}'.format(port_dict)) except queue.Empty: - logger.log_warning("failed to get item from modules changes queue") - logger.log_warning('get_change_event() releasing queue lock iteration {}'.format(i)) + logger.log_info("failed to get item from modules changes queue") + logger.log_info('get_change_event() releasing queue lock iteration {}'.format(i)) self.modules_queue_lock.release() if port_dict: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 981592bcf817..b84459fb9286 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -95,13 +95,13 @@ def get_sm_func(self, sm, port): , STATE_POWER_LIMIT_ERROR: STATE_POWER_LIMIT_ERROR , STATE_SYSFS_ERROR: STATE_SYSFS_ERROR } - logger.log_warning("getting func for state {} for port {}".format(sm, port)) + logger.log_info("getting func for state {} for port {}".format(sm, port)) try: func = SFP_SM_ENUM[sm] - logger.log_warning("got func {} for state {} for port {}".format(func, sm, port)) + logger.log_info("got func {} for state {} for port {}".format(func, sm, port)) return func except KeyError as e: - logger.log_warning("exception {} for port {}".format(e, port)) + logger.log_info("exception {} for port {}".format(e, port)) return None def run(self): @@ -110,7 +110,7 @@ def run(self): #hwsku = device_info.get_hwsku() independent_file = INDEP_PROFILE_FILE.format(hwsku_dir) if os.path.isfile(independent_file): - logger.log_warning("file {} found, checking content for independent mode value".format(independent_file)) + logger.log_info("file {} found, checking content for independent mode value".format(independent_file)) with open(independent_file, "r") as independent_file_fd: independent_file_content = independent_file_fd.read() if SAI_INDEP_MODULE_MODE in independent_file_content and \ @@ -118,10 +118,10 @@ def run(self): independent_file_splitted = independent_file_content.split(SAI_INDEP_MODULE_MODE_DELIMITER) if (len(independent_file_splitted) > 1): self.is_supported_indep_mods_system = int(independent_file_splitted[1]) == int(SAI_INDEP_MODULE_MODE_TRUE_STR) - logger.log_warning("file {} found, system will work in independent mode".format(independent_file)) - logger.log_warning("value of indep mode var: {} found in file".format(independent_file_splitted[1])) + logger.log_info("file {} found, system will work in independent mode".format(independent_file)) + logger.log_info("value of indep mode var: {} found in file".format(independent_file_splitted[1])) else: - logger.log_warning("file {} not found, system stays in legacy mode".format(independent_file)) + logger.log_info("file {} not found, system stays in legacy mode".format(independent_file)) # static init - at first go over all ports and check each one if it's independent module or legacy self.sfp_changes_dict = {} @@ -136,9 +136,9 @@ def run(self): temp_module_sm = ModuleStateMachine(port_num=port, initial_state=STATE_HW_NOT_PRESENT , current_state=STATE_HW_NOT_PRESENT) module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) - logger.log_warning("system in indep mode: {} port {}".format(self.is_supported_indep_mods_system, port)) + logger.log_info("system in indep mode: {} port {}".format(self.is_supported_indep_mods_system, port)) if self.is_supported_indep_mods_system and os.path.isfile(module_fd_indep_path): - logger.log_warning("system in indep mode: {} port {} reading file {}".format(self.is_supported_indep_mods_system, port, module_fd_indep_path)) + logger.log_info("system in indep mode: {} port {} reading file {}".format(self.is_supported_indep_mods_system, port, module_fd_indep_path)) temp_module_sm.set_is_indep_modules(True) temp_module_sm.set_module_fd_path(module_fd_indep_path) module_fd = open(module_fd_indep_path, "r") @@ -152,7 +152,7 @@ def run(self): self.modules_lock_list.append(threading.Lock()) temp_module_sm.set_poll_obj(self.poll_obj) # start SM for this independent module - logger.log_warning("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) + logger.log_info("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) self.sfp_port_dict_initial[port] = temp_module_sm self.sfp_port_dict[port] = temp_module_sm @@ -160,32 +160,32 @@ def run(self): # need at least 1 module in final state until it makes sense to send changes dict is_final_state_module = False all_static_detection_done = False - logger.log_warning("sfp_port_dict before starting static detection: {}".format(self.sfp_port_dict)) + logger.log_info("sfp_port_dict before starting static detection: {}".format(self.sfp_port_dict)) # static detection - loop on different state for all ports until all done while not self.main_thread_stop_event and not all_static_detection_done: - logger.log_warning("static detection running iteration {}".format(i)) + logger.log_info("static detection running iteration {}".format(i)) waiting_list_len = len(self.waiting_modules_list) sfp_port_dict_keys_len = len(self.sfp_port_dict.keys()) if waiting_list_len == sfp_port_dict_keys_len: - logger.log_warning("static detection length of waiting list {}: {} and sfp port dict keys {}:{} is the same, sleeping 1 second..." + logger.log_info("static detection length of waiting list {}: {} and sfp port dict keys {}:{} is the same, sleeping 1 second..." .format(waiting_list_len, self.waiting_modules_list, sfp_port_dict_keys_len, self.sfp_port_dict.keys())) time.sleep(1) else: - logger.log_warning("static detectionlength of waiting list {}: {} and sfp port dict keys {}: {} is different, NOT sleeping 1 second" + logger.log_info("static detectionlength of waiting list {}: {} and sfp port dict keys {}: {} is different, NOT sleeping 1 second" .format(waiting_list_len, self.waiting_modules_list, sfp_port_dict_keys_len, self.sfp_port_dict.keys())) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() - logger.log_warning(f'static detection STATE_LOG {port_num}: curr_state is {curr_state}') + logger.log_info(f'static detection STATE_LOG {port_num}: curr_state is {curr_state}') func = self.get_sm_func(curr_state, port_num) - logger.log_warning("static detection got returned func {} for state {}".format(func, curr_state)) + logger.log_info("static detection got returned func {} for state {}".format(func, curr_state)) try: if not isinstance(func, str): next_state = func(port_num, module_sm_obj) except TypeError as e: - logger.log_warning("static detection exception {} for port {} traceback:\n{}".format(e, port_num, traceback.format_exc())) + logger.log_info("static detection exception {} for port {} traceback:\n{}".format(e, port_num, traceback.format_exc())) module_sm_obj.set_final_state(STATE_ERROR_HANDLER) continue - logger.log_warning(f'static detection STATE_LOG {port_num}: next_state is {next_state}') + logger.log_info(f'static detection STATE_LOG {port_num}: next_state is {next_state}') if self.timer.is_alive(): logger.log_info("static detection timer threads is alive, acquiring lock") self.modules_lock_list[port_num].acquire() @@ -194,7 +194,7 @@ def run(self): module_sm_obj.set_next_state(next_state) module_sm_obj.advance_state() if module_sm_obj.get_final_state(): - logger.log_warning(f'static detection STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') + logger.log_info(f'static detection STATE_LOG {port_num}: enter final state {module_sm_obj.get_final_state()}') is_final_state_module = True if self.timer.is_alive(): self.modules_lock_list[port_num].release() @@ -202,18 +202,18 @@ def run(self): logger.log_info("static detection timer thread is_alive {} port {}".format(is_timer_alive, port_num)) if STATE_NOT_POWERED == curr_state: if not is_timer_alive: - logger.log_warning ("static detection curr_state is {} and timer thread is_alive {}, running timer task thread" + logger.log_info ("static detection curr_state is {} and timer thread is_alive {}, running timer task thread" .format(curr_state, is_timer_alive)) # call timer task self.timer = threading.Timer(1.0, self.timerTask) self.timer.start() self.timer_queue.put(module_sm_obj) if self.timer.is_alive(): - logger.log_warning("timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) + logger.log_info("timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].acquire() module_sm_obj.set_next_state(next_state) if self.timer.is_alive(): - logger.log_warning("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) + logger.log_info("timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].release() if is_final_state_module: @@ -221,20 +221,20 @@ def run(self): self.delete_ports_from_dict() self.send_changes_to_shared_queue() i += 1 - logger.log_warning("sfp_port_dict: {}".format(self.sfp_port_dict)) + logger.log_info("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): - logger.log_warning("static detection port_num: {} initial state: {} current_state: {} next_state: {}" + logger.log_info("static detection port_num: {} initial state: {} current_state: {} next_state: {}" .format(port_num, module_sm_obj.initial_state, module_sm_obj.get_current_state() , module_sm_obj.get_next_state())) sfp_port_dict_keys_len = len(self.sfp_port_dict.keys()) if sfp_port_dict_keys_len == 0: - logger.log_warning("static detection len of keys of sfp_port_dict is 0: {}".format(sfp_port_dict_keys_len)) + logger.log_info("static detection len of keys of sfp_port_dict is 0: {}".format(sfp_port_dict_keys_len)) all_static_detection_done = True else: - logger.log_warning("static detection len of keys of sfp_port_dict is not 0: {}".format(sfp_port_dict_keys_len)) - logger.log_warning("static detection all_static_detection_done: {}".format(all_static_detection_done)) + logger.log_info("static detection len of keys of sfp_port_dict is not 0: {}".format(sfp_port_dict_keys_len)) + logger.log_info("static detection all_static_detection_done: {}".format(all_static_detection_done)) - logger.log_warning("sfp_port_dict before dynamic detection: {}".format(self.sfp_port_dict)) + logger.log_info("sfp_port_dict before dynamic detection: {}".format(self.sfp_port_dict)) # dynamic detection - loop on polling changes, run state machine for them and put them into shared queue i = 0 # need at least 1 module in final state until it makes sense to send changes dict @@ -246,7 +246,7 @@ def run(self): self.fds_events_count_dict[module_obj.port_num] = { 'presence' : 0 , 'power_good' : 0 } dummy_read = False while not self.main_thread_stop_event: - logger.log_warning("dynamic detection running iteration {}".format(i)) + logger.log_info("dynamic detection running iteration {}".format(i)) # dummy read all sysfs fds before polling them due to linux kernel implementation of poll if not dummy_read: for fd_fileno in self.fds_mapping_to_obj: @@ -259,25 +259,25 @@ def run(self): elif fd_name in ['power_good']: module_fd_path = module_obj.module_power_good_fd_path try: - logger.log_warning("dynamic detection dummy reading from fd path {} for port {}" + logger.log_info("dynamic detection dummy reading from fd path {} for port {}" .format(module_fd_path, module_obj.port_num)) val = module_fd.read() module_fd.seek(0) val_int = None if len(val) > 0: val_int = int(val) - logger.log_warning("dynamic detection dummy read presence {} int {} for port {} before polling" + logger.log_info("dynamic detection dummy read presence {} int {} for port {} before polling" .format(val, val_int, module_obj.port_num)) except Exception as e: - logger.log_warning("dynamic detection exception on dummy read presence {} for port {} traceback:\n{}" + logger.log_info("dynamic detection exception on dummy read presence {} for port {} traceback:\n{}" .format(e, module_obj.port_num, traceback.format_exc())) dummy_read = True # poll for changes with 1 second timeout fds_events = self.poll_obj.poll(1000) - logger.log_warning("dynamic detection polled obj checking fds_events iteration {}".format(i)) + logger.log_info("dynamic detection polled obj checking fds_events iteration {}".format(i)) for fd, event in fds_events: # get modules object from fd according to saved key-value of fd-module obj saved earlier - logger.log_warning("dynamic detection working on fd {} event {}".format(fd, event)) + logger.log_info("dynamic detection working on fd {} event {}".format(fd, event)) #module_obj = self.fds_mapping_to_obj[fd] module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] module_fd = self.fds_mapping_to_obj[fd_fileno]['fd'] @@ -289,28 +289,28 @@ def run(self): self.fds_events_count_dict[module_obj.port_num][fd_name] += 1 val = module_fd.read() module_fd.seek(0) - logger.log_warning("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" + logger.log_info("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" .format(module_obj, module_obj.port_num, fd, module_fd_path, self.fds_events_count_dict[module_obj.port_num])) if module_obj.port_num not in self.sfp_port_dict.keys(): - logger.log_warning("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states".format(module_obj.port_num, self.sfp_port_dict.keys())) + logger.log_info("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states".format(module_obj.port_num, self.sfp_port_dict.keys())) module_obj.reset_all_states() # put again module obj in sfp_port_dict so next loop will work on it self.sfp_port_dict[module_obj.port_num] = module_obj self.delete_ports_from_state_db_list.append(module_obj.port_num) self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_list) - logger.log_warning("dynamic detection sleeping 1 second...") + logger.log_info("dynamic detection sleeping 1 second...") time.sleep(1) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() - logger.log_warning(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') + logger.log_info(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') func = self.get_sm_func(curr_state, port) - logger.log_warning("dynamic detection got returned func {} for state {}".format(func, curr_state)) + logger.log_info("dynamic detection got returned func {} for state {}".format(func, curr_state)) try: next_state = func(port_num, module_sm_obj, dynamic=True) except TypeError as e: - logger.log_warning("exception {} for port {}".format(e, port_num)) + logger.log_info("exception {} for port {}".format(e, port_num)) continue - logger.log_warning(f'dynamic detection STATE_LOG {port_num}: next_state is {next_state}') + logger.log_info(f'dynamic detection STATE_LOG {port_num}: next_state is {next_state}') if self.timer.is_alive(): logger.log_info("dynamic detection timer threads is alive, acquiring lock") self.modules_lock_list[port_num].acquire() @@ -326,18 +326,18 @@ def run(self): logger.log_info("dynamic detection timer thread is_alive {} port {}".format(is_timer_alive, port_num)) if STATE_NOT_POWERED == curr_state: if not is_timer_alive: - logger.log_warning("dynamic detection curr_state is {} and timer thread is_alive {}, running timer task thread" + logger.log_info("dynamic detection curr_state is {} and timer thread is_alive {}, running timer task thread" .format(curr_state, is_timer_alive)) # call timer task self.timer = threading.Timer(1.0, self.timerTask) self.timer.start() self.timer_queue.put(module_sm_obj) if self.timer.is_alive(): - logger.log_warning("dynamic detection timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) + logger.log_info("dynamic detection timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].acquire() module_sm_obj.set_next_state(next_state) if self.timer.is_alive(): - logger.log_warning( + logger.log_info( "dynamic detection timer thread is_alive {}, releasing module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].release() @@ -346,9 +346,9 @@ def run(self): self.delete_ports_from_dict(dynamic=True) self.send_changes_to_shared_queue(dynamic=True) i += 1 - logger.log_warning("sfp_port_dict: {}".format(self.sfp_port_dict)) + logger.log_info("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): - logger.log_warning("port_num: {} module_sm_obj initial state: {} current_state: {} next_state: {}" + logger.log_info("port_num: {} module_sm_obj initial state: {} current_state: {} next_state: {}" .format(port_num, module_sm_obj.initial_state, module_sm_obj.get_current_state(), module_sm_obj.get_next_state())) @@ -361,23 +361,23 @@ def check_if_hw_present(self, port, module_sm_obj, dynamic=False): try: val_int = utils.read_int_from_file(module_fd_indep_path) if 0 == val_int: - logger.log_warning("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val_int)) + logger.log_info("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val_int)) module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT elif 1 == val_int: if not self.is_supported_indep_mods_system: module_sm_obj.set_final_state(STATE_HW_PRESENT) - logger.log_warning("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) + logger.log_info("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) return STATE_HW_PRESENT except Exception as e: - logger.log_warning("exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) + logger.log_info("exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) module_sm_obj.set_final_state(STATE_ERROR_HANDLER) return STATE_ERROR_HANDLER module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT def checkIfModuleAvailable(self, port, module_sm_obj, dynamic=False): - logger.log_warning("enter check_if_module_available port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("enter check_if_module_available port {} module_sm_obj {}".format(port, module_sm_obj)) module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_GOOD.format(port) if os.path.isfile(module_fd_indep_path): try: @@ -393,28 +393,28 @@ def checkIfModuleAvailable(self, port, module_sm_obj, dynamic=False): self.fds_mapping_to_obj[module_sm_obj.module_power_good_fd.fileno()] = { 'module_obj' : module_sm_obj , 'fd':module_sm_obj.module_power_good_fd, 'fd_name' : 'power_good'} if 0 == val_int: - logger.log_warning(f'port {port} power is not good') + logger.log_info(f'port {port} power is not good') module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT elif 1 == val_int: - logger.log_warning(f'port {port} power is good') + logger.log_info(f'port {port} power is good') return STATE_MODULE_AVAILABLE except Exception as e: - logger.log_warning("exception {} for port {}".format(e, port)) + logger.log_info("exception {} for port {}".format(e, port)) return STATE_HW_NOT_PRESENT - logger.log_warning(f'port {port} has no power good file {module_fd_indep_path}') + logger.log_info(f'port {port} has no power good file {module_fd_indep_path}') module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): - logger.log_warning(f'enter checkIfPowerOn for port {port}') + logger.log_info(f'enter checkIfPowerOn for port {port}') module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) if os.path.isfile(module_fd_indep_path): try: val = utils.read_int_from_file(module_fd_indep_path) val_int = int(val) if 0 == val_int: - logger.log_warning(f'port {port} is not powered') + logger.log_info(f'port {port} is not powered') return STATE_NOT_POWERED elif 1 == val_int: if not module_sm_obj.wait_for_power_on and \ @@ -423,7 +423,7 @@ def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): xcvr_api = sfp.get_xcvr_api() # only if xcvr_api is None or if it is not active optics cables need reset if not xcvr_api or xcvr_api.is_flat_memory(): - logger.log_warning(f'port {port} is powered, but need reset') + logger.log_info(f'port {port} is powered, but need reset') utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) module_sm_obj.reset_start_time = time.time() module_sm_obj.wait_for_power_on = True @@ -432,10 +432,10 @@ def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): module_sm_obj.wait_for_power_on = True self.waiting_modules_list.add(module_sm_obj.port_num) return STATE_NOT_POWERED - logger.log_warning(f'port {port} is powered, does not need reset') + logger.log_info(f'port {port} is powered, does not need reset') return STATE_POWERED except Exception as e: - logger.log_warning(f'got exception {e} in checkIfPowerOn') + logger.log_info(f'got exception {e} in checkIfPowerOn') module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT @@ -446,12 +446,12 @@ def powerOnModule(self, port, module_sm_obj, dynamic=False): module_fd_indep_path_r = SYSFS_INDEPENDENT_FD_HW_RESET.format(port) try: if os.path.isfile(module_fd_indep_path_po): - logger.log_warning("powerOnModule powering on via {} for port {}".format(module_fd_indep_path_po, port)) + logger.log_info("powerOnModule powering on via {} for port {}".format(module_fd_indep_path_po, port)) # echo 1 > /sys/module/sx_core/$asic/$module/power_on with open(module_fd_indep_path_po, "w") as module_fd: module_fd.write("1") if os.path.isfile(module_fd_indep_path_r): - logger.log_warning("powerOnModule resetting via {} for port {}".format(module_fd_indep_path_r, port)) + logger.log_info("powerOnModule resetting via {} for port {}".format(module_fd_indep_path_r, port)) # echo 0 > /sys/module/sx_core/$asic/$module/hw_reset with open(module_fd_indep_path_r, "w") as module_fd: module_fd.write("0") @@ -459,30 +459,30 @@ def powerOnModule(self, port, module_sm_obj, dynamic=False): module_sm_obj.wait_for_power_on = True self.waiting_modules_list.add(module_sm_obj.port_num) except Exception as e: - logger.log_warning("exception in powerOnModule {} for port {}".format(e, port)) + logger.log_info("exception in powerOnModule {} for port {}".format(e, port)) return STATE_HW_NOT_PRESENT return STATE_NOT_POWERED def checkModuleType(self, port, module_sm_obj, dynamic=False): - logger.log_warning("enter checkModuleType port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("enter checkModuleType port {} module_sm_obj {}".format(port, module_sm_obj)) sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() if not xcvr_api: - logger.log_warning("checkModuleType calling sfp reinit for port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("checkModuleType calling sfp reinit for port {} module_sm_obj {}".format(port, module_sm_obj)) sfp.reinit() - logger.log_warning("checkModuleType setting as FW control as xcvr_api is empty for port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("checkModuleType setting as FW control as xcvr_api is empty for port {} module_sm_obj {}".format(port, module_sm_obj)) return STATE_FW_CONTROL field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.ID_FIELD) module_type_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) if module_type_ba is None: - logger.log_warning("checkModuleType module_type is None for port {} - checking if we didnt retry yet max number of retries: {}".format(port, MAX_EEPROM_ERROR_RESET_RETRIES)) + logger.log_info("checkModuleType module_type is None for port {} - checking if we didnt retry yet max number of retries: {}".format(port, MAX_EEPROM_ERROR_RESET_RETRIES)) # if we didnt do this retry yet - do it up to 3 times - workaround for FW issue blocking upper page access if module_sm_obj.eeprom_poweron_reset_retries < MAX_EEPROM_ERROR_RESET_RETRIES: - logger.log_warning("checkModuleType module_type is None retrying by falling back to STATE_NOT_POWERED eeprom reset retries {}" + logger.log_info("checkModuleType module_type is None retrying by falling back to STATE_NOT_POWERED eeprom reset retries {}" " for port {}".format(module_sm_obj.eeprom_poweron_reset_retries, port)) if module_sm_obj.eeprom_poweron_reset_retries % 2 == 0: utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "0") - logger.log_warning("checkModuleType sleeping 1 second...") + logger.log_info("checkModuleType sleeping 1 second...") time.sleep(1) else: utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "1") @@ -490,23 +490,23 @@ def checkModuleType(self, port, module_sm_obj, dynamic=False): module_sm_obj.eeprom_poweron_reset_retries += 1 return STATE_NOT_POWERED else: - logger.log_warning("checkModuleType module_type is None and already retried - setting as STATE_ERROR_HANDLER" + logger.log_info("checkModuleType module_type is None and already retried - setting as STATE_ERROR_HANDLER" "for port {}".format(port)) module_sm_obj.set_final_state(STATE_ERROR_HANDLER) return STATE_ERROR_HANDLER module_type = int.from_bytes(module_type_ba, "big") - logger.log_warning("got module_type {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + logger.log_info("got module_type {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) # QSFP-DD ID is 24, OSFP ID is 25 - only these 2 are supported currently as independent module - SW controlled if module_type not in [24, 25]: - logger.log_warning("setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + logger.log_info("setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) module_sm_obj.set_final_state = STATE_FW_CONTROL return STATE_FW_CONTROL else: if xcvr_api.is_flat_memory(): - logger.log_warning("check_module_type port {} setting STATE_FW_CONTROL module ID {} due to flat_mem device" + logger.log_info("check_module_type port {} setting STATE_FW_CONTROL module ID {} due to flat_mem device" .format(module_type, port)) return STATE_FW_CONTROL - logger.log_warning("checking power cap for {} in check_module_type port {} module_sm_obj {}" + logger.log_info("checking power cap for {} in check_module_type port {} module_sm_obj {}" .format(module_type, port, module_sm_obj)) power_cap = self.checkPowerCap(port, module_sm_obj) if power_cap is STATE_POWER_LIMIT_ERROR: @@ -517,11 +517,11 @@ def checkModuleType(self, port, module_sm_obj, dynamic=False): # from byte 2 bits 3-2: # 00b means module supports up to 400KHz # 01b means module supports up to 1MHz - logger.log_warning(f"check_module_type reading mci max frequency for port {port}") + logger.log_info(f"check_module_type reading mci max frequency for port {port}") read_mci = xcvr_api.xcvr_eeprom.read_raw(2, 1) - logger.log_warning(f"check_module_type read mci max frequency {read_mci} for port {port}") + logger.log_info(f"check_module_type read mci max frequency {read_mci} for port {port}") mci_bits = read_mci & 0b00001100 - logger.log_warning(f"check_module_type read mci max frequency bits {mci_bits} for port {port}") + logger.log_info(f"check_module_type read mci max frequency bits {mci_bits} for port {port}") # Then, set it to frequency Sysfs using: # echo > /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz indep_fd_freq = SYSFS_INDEPENDENT_FD_FREQ.format(port) @@ -529,27 +529,27 @@ def checkModuleType(self, port, module_sm_obj, dynamic=False): return STATE_SW_CONTROL def checkPowerCap(self, port, module_sm_obj, dynamic=False): - logger.log_warning("enter checkPowerCap port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("enter checkPowerCap port {} module_sm_obj {}".format(port, module_sm_obj)) #sfp_base_module = SfpBase() sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.MAX_POWER_FIELD) powercap_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) - logger.log_warning("checkPowerCap got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) + logger.log_info("checkPowerCap got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) powercap = int.from_bytes(powercap_ba, "big") - logger.log_warning("checkPowerCap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) + logger.log_info("checkPowerCap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) indep_fd_power_limit = self.get_sysfs_ethernet_port_fd(SYSFS_INDEPENDENT_FD_POWER_LIMIT, port) #with open(indep_fd_power_limit, "r") as power_limit_fd: # cage_power_limit = power_limit_fd.read() cage_power_limit = utils.read_int_from_file(indep_fd_power_limit) - logger.log_warning("checkPowerCap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) + logger.log_info("checkPowerCap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) if powercap > int(cage_power_limit): - logger.log_warning("checkPowerCap powercap {} != cage_power_limit {} for port {} module_sm_obj {}".format(powercap, cage_power_limit, port, module_sm_obj)) + logger.log_info("checkPowerCap powercap {} != cage_power_limit {} for port {} module_sm_obj {}".format(powercap, cage_power_limit, port, module_sm_obj)) module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) return STATE_POWER_LIMIT_ERROR def saveModuleControlMode(self, port, module_sm_obj, dynamic=False): - logger.log_warning("saveModuleControlMode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) + logger.log_info("saveModuleControlMode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) # bug - need to find root cause and fix #module_sm_obj.set_final_state(module_sm_obj.get_current_state()) state = module_sm_obj.get_current_state() @@ -559,56 +559,56 @@ def saveModuleControlMode(self, port, module_sm_obj, dynamic=False): indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) with open(indep_fd_fw_control, "w") as fw_control_fd: fw_control_fd.write("0") - logger.log_warning("saveModuleControlMode set FW control for state {} port {}".format(state, port)) + logger.log_info("saveModuleControlMode set FW control for state {} port {}".format(state, port)) module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) module_sm_obj.set_module_fd_path(module_fd_legacy_path) module_fd = open(module_fd_legacy_path, "r") module_sm_obj.set_module_fd(module_fd) - logger.log_warning("saveModuleControlMode changed module fd to legacy present for port {}".format(port)) + logger.log_info("saveModuleControlMode changed module fd to legacy present for port {}".format(port)) # register the module's sysfs fd to poller with ERR and PRI attrs - logger.log_warning("saveModuleControlMode registering sysfs fd {} number {} path {} for port {}" + logger.log_info("saveModuleControlMode registering sysfs fd {} number {} path {} for port {}" .format(module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = { 'module_obj' : module_sm_obj , 'fd': module_sm_obj.module_fd, 'fd_name' : 'presence' } module_sm_obj.set_poll_obj(self.poll_obj) - logger.log_warning("saveModuleControlMode set current state {} for port {} as final state {}".format( + logger.log_info("saveModuleControlMode set current state {} for port {} as final state {}".format( module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) def timerTask(self): # wakes up every 1 second - logger.log_warning("timerTask entered run state") + logger.log_info("timerTask entered run state") empty = False i = 0 while not empty: - logger.log_warning("timerTask while loop itartion {}".format(i)) + logger.log_info("timerTask while loop itartion {}".format(i)) empty = True port_list_to_delete = [] for port in self.waiting_modules_list: - logger.log_warning("timerTask working on port {}".format(port)) + logger.log_info("timerTask working on port {}".format(port)) empty = False module = self.sfp_port_dict[port] - logger.log_warning("timerTask got module with port_num {} from port {}".format(module.port_num, port)) + logger.log_info("timerTask got module with port_num {} from port {}".format(module.port_num, port)) state = module.get_current_state() if module and state == STATE_NOT_POWERED: - logger.log_warning("timerTask module {} current_state {} counting seconds since reset_start_time" + logger.log_info("timerTask module {} current_state {} counting seconds since reset_start_time" .format(module, module.get_current_state())) if time.time() - module.reset_start_time >= 3: # set next state as STATE_POWERED state to trigger the function of check module type - logger.log_warning("timerTask module port {} locking lock of port {}".format(module.port_num, module.port_num)) + logger.log_info("timerTask module port {} locking lock of port {}".format(module.port_num, module.port_num)) self.modules_lock_list[module.port_num].acquire() - logger.log_warning("timerTask module port {} setting next state to STATE_POWERED".format(module.port_num)) + logger.log_info("timerTask module port {} setting next state to STATE_POWERED".format(module.port_num)) module.set_next_state(STATE_POWERED) - logger.log_warning("timerTask module port {} advancing next state".format(module.port_num)) + logger.log_info("timerTask module port {} advancing next state".format(module.port_num)) module.advance_state() - logger.log_warning("timerTask module port {} releasing lock of port {}".format(port, module.port_num)) + logger.log_info("timerTask module port {} releasing lock of port {}".format(port, module.port_num)) self.modules_lock_list[module.port_num].release() - logger.log_warning("timerTask module port {} adding to delete list to remove from waiting_modules_list".format(module.port_num)) + logger.log_info("timerTask module port {} adding to delete list to remove from waiting_modules_list".format(module.port_num)) port_list_to_delete.append(module.port_num) - logger.log_warning("timerTask deleting ports {} from waiting_modules_list...".format(port_list_to_delete)) + logger.log_info("timerTask deleting ports {} from waiting_modules_list...".format(port_list_to_delete)) for port in port_list_to_delete: - logger.log_warning("timerTask deleting port {} from waiting_modules_list".format(port)) + logger.log_info("timerTask deleting port {} from waiting_modules_list".format(port)) self.waiting_modules_list.remove(port) - logger.log_warning("timerTask waiting_modules_list after deletion: {}".format(self.waiting_modules_list)) + logger.log_info("timerTask waiting_modules_list after deletion: {}".format(self.waiting_modules_list)) time.sleep(1) i += 1 def get_sysfs_legacy_ethernet_port_fd(self, sysfs_fd, port): @@ -622,11 +622,11 @@ def get_sysfs_ethernet_port_fd(self, sysfs_fd, port): def add_port_to_wait_reset(self, module_sm_obj): module_sm_obj.reset_start_time = time.time() - logger.log_warning("add_port_to_wait_reset reset_start_time {}".format(module_sm_obj.reset_start_time)) + logger.log_info("add_port_to_wait_reset reset_start_time {}".format(module_sm_obj.reset_start_time)) module_sm_obj.wait_for_power_on = True - logger.log_warning("add_port_to_wait_reset wait_for_power_on {}".format(module_sm_obj.wait_for_power_on)) + logger.log_info("add_port_to_wait_reset wait_for_power_on {}".format(module_sm_obj.wait_for_power_on)) self.waiting_modules_list.add(module_sm_obj.port_num) - logger.log_warning("add_port_to_wait_reset waiting_list after adding: {}".format(self.waiting_modules_list)) + logger.log_info("add_port_to_wait_reset waiting_list after adding: {}".format(self.waiting_modules_list)) def add_ports_state_to_state_db(self, dynamic=False): state_db = None @@ -644,12 +644,12 @@ def add_ports_state_to_state_db(self, dynamic=False): if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: namespaces = multi_asic.get_front_end_namespaces() for namespace in namespaces: - logger.log_warning(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") + logger.log_info(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - logger.log_warning(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") - logger.log_warning(f"{detection_method} detection got state_db for port {port} namespace {namespace}") + logger.log_info(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") + logger.log_info(f"{detection_method} detection got state_db for port {port} namespace {namespace}") if state_db is not None: - logger.log_warning( + logger.log_info( f"{detection_method} detection connecting to state_db for port {port} namespace {namespace}") state_db.connect(state_db.STATE_DB) if final_state in [STATE_FW_CONTROL]: @@ -657,7 +657,7 @@ def add_ports_state_to_state_db(self, dynamic=False): elif final_state in [STATE_SW_CONTROL]: control_type = 'SW_CONTROL' table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) - logger.log_warning(f"{detection_method} detection setting state_db table {table_name} for port {port}" + logger.log_info(f"{detection_method} detection setting state_db table {table_name} for port {port}" f" namespace {namespace} control_type {control_type}") state_db.set(state_db.STATE_DB, table_name, "control_type", control_type) @@ -667,30 +667,30 @@ def delete_ports_state_from_state_db(self, ports, dynamic=True): for port in ports: namespaces = multi_asic.get_front_end_namespaces() for namespace in namespaces: - logger.log_warning(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") + logger.log_info(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - logger.log_warning(f"{detection_method} detection got state_db for port {port} namespace {namespace}") + logger.log_info(f"{detection_method} detection got state_db for port {port} namespace {namespace}") if state_db is not None: - logger.log_warning( + logger.log_info( f"{detection_method} detection connecting to state_db for port {port} namespace {namespace}") state_db.connect(state_db.STATE_DB) table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) - logger.log_warning(f"{detection_method} detection deleting state_db table {table_name} " + logger.log_info(f"{detection_method} detection deleting state_db table {table_name} " f"for port {port} namespace {namespace}") state_db.delete(state_db.STATE_DB, table_name) def delete_ports_from_dict(self, dynamic=False): detection_method = 'dynamic' if dynamic else 'static' - logger.log_warning(f"{detection_method} detection sfp_port_dict before deletion: {self.sfp_port_dict}") + logger.log_info(f"{detection_method} detection sfp_port_dict before deletion: {self.sfp_port_dict}") for port in self.sfp_delete_list_from_port_dict: del self.sfp_port_dict[port] self.sfp_delete_list_from_port_dict = [] - logger.log_warning("dynamic detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict)) + logger.log_info("dynamic detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict)) def send_changes_to_shared_queue(self, dynamic=False): detection_method = 'dynamic' if dynamic else 'static' if self.sfp_changes_dict: - logger.log_warning(f"{detection_method} detection putting sfp_changes_dict {self.sfp_changes_dict} " + logger.log_info(f"{detection_method} detection putting sfp_changes_dict {self.sfp_changes_dict} " f"in modules changes queue...") try: self.modules_queue_lock.acquire() @@ -698,9 +698,9 @@ def send_changes_to_shared_queue(self, dynamic=False): self.modules_queue_lock.release() self.sfp_changes_dict = {} except queue.Full: - logger.log_warning(f"{detection_method} failed to put item from modules changes queue, queue is full") + logger.log_info(f"{detection_method} failed to put item from modules changes queue, queue is full") else: - logger.log_warning(f"{detection_method} sfp_changes_dict {self.sfp_changes_dict} is empty...") + logger.log_info(f"{detection_method} sfp_changes_dict {self.sfp_changes_dict} is empty...") class ModuleStateMachine(object): From 0b2bea5f44c05ddd74007d43f29dbbc62806c316 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Wed, 11 Oct 2023 15:57:37 +0000 Subject: [PATCH 08/26] fix comments of internal CR part 1 --- .../sonic_platform/chassis.py | 31 ++-- .../sonic_platform/modules_mgmt.py | 144 ++++++++---------- 2 files changed, 74 insertions(+), 101 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index d334fc10d8b0..f0cd1a0e53e1 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -126,10 +126,8 @@ def __init__(self): self._RJ45_port_inited = False self._RJ45_port_list = None - self.threads = [] self.modules_mgmt_thread = threading.Thread() self.modules_changes_queue = queue.Queue() - self.modules_queue_lock = threading.Lock() self.modules_mgmt_task_stopping_event = threading.Event() logger.log_info("Chassis loaded successfully") @@ -142,11 +140,6 @@ def __del__(self): if self.sfp_module.SFP.shared_sdk_handle: self.sfp_module.deinitialize_sdk_handle(self.sfp_module.SFP.shared_sdk_handle) - #self.modules_mgmt_task_stopping_event.set() - #logger.log_info('set modules_mgmt_task_stopping_event {self.modules_mgmt_task_stopping_event}') - #self.modules_mgmt_thread.join(timeout=10) - #logger.log_info('joined modules_mgmt_thread thread') - @property def RJ45_port_list(self): if not self._RJ45_port_inited: @@ -399,27 +392,23 @@ def get_change_event(self, timeout=0): if not self.modules_mgmt_thread.is_alive(): # open new SFP change events thread self.modules_mgmt_thread = modules_mgmt.ModulesMgmtTask(q=self.modules_changes_queue - , l=self.modules_queue_lock , main_thread_stop_event = self.modules_mgmt_task_stopping_event) self.modules_mgmt_thread.start() - self.threads.append(self.modules_mgmt_thread) self.initialize_sfp() + wait_for_ever = (timeout == 0) + # select timeout should be no more than 1000ms to ensure fast shutdown flow + timeout = 1000.0 if timeout >= 1000 else float(timeout) port_dict = {} error_dict = {} + begin = time.time() i = 0 while True: - logger.log_info('get_change_event() acquiring queue lock iteration {}'.format(i)) - self.modules_queue_lock.acquire() - if self.modules_changes_queue.qsize() > 0: - if True: - try: - logger.log_info('get_change_event() trying to get changes from queue') - port_dict = self.modules_changes_queue.get(timeout=1) - logger.log_info ('get_change_event() port_dict: {}'.format(port_dict)) - except queue.Empty: - logger.log_info("failed to get item from modules changes queue") - logger.log_info('get_change_event() releasing queue lock iteration {}'.format(i)) - self.modules_queue_lock.release() + try: + logger.log_info(f'get_change_event() trying to get changes from queue on iteration {i}') + port_dict = self.modules_changes_queue.get(timeout=timeout) + logger.log_info (f'get_change_event() iteration {i} port_dict: {port_dict}') + except queue.Empty: + logger.log_info(f"failed to get item from modules changes queue on itertaion {i}") if port_dict: self.reinit_sfps(port_dict) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index b84459fb9286..7400c0c5bcd7 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -31,23 +31,22 @@ STATE_POWER_LIMIT_ERROR = "The cage has not enough power for the plugged module" STATE_SYSFS_ERROR = "An error occurred while writing/reading SySFS." -INDEP_PROFILE_FILE = "/{}/independent_mode_support.profile" +SAI_PROFILE_FILE = "/{}/sai.profile" SAI_INDEP_MODULE_MODE = "SAI_INDEPENDENT_MODULE_MODE" SAI_INDEP_MODULE_MODE_DELIMITER = "=" SAI_INDEP_MODULE_MODE_TRUE_STR = "1" SYSFS_LEGACY_FD_PRESENCE = "/sys/module/sx_core/asic0/module{}/present" ASIC_NUM = 0 -PORT_BREAKOUT = 8 SYSFS_INDEPENDENT_FD_PREFIX_WO_MODULE = "/sys/module/sx_core/asic{}".format(ASIC_NUM) SYSFS_INDEPENDENT_FD_PREFIX = SYSFS_INDEPENDENT_FD_PREFIX_WO_MODULE + "/module{}" -SYSFS_INDEPENDENT_FD_PRESENCE = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "hw_present"]) -SYSFS_INDEPENDENT_FD_POWER_GOOD = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "power_good"]) -SYSFS_INDEPENDENT_FD_POWER_ON = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "power_on"]) -SYSFS_INDEPENDENT_FD_HW_RESET = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "hw_reset"]) -SYSFS_INDEPENDENT_FD_POWER_LIMIT = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "power_limit"]) -SYSFS_INDEPENDENT_FD_FW_CONTROL = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "control"]) +SYSFS_INDEPENDENT_FD_PRESENCE = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "hw_present") +SYSFS_INDEPENDENT_FD_POWER_GOOD = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "power_good") +SYSFS_INDEPENDENT_FD_POWER_ON = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "power_on") +SYSFS_INDEPENDENT_FD_HW_RESET = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "hw_reset") +SYSFS_INDEPENDENT_FD_POWER_LIMIT = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "power_limit") +SYSFS_INDEPENDENT_FD_FW_CONTROL = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "control") # echo /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz -SYSFS_INDEPENDENT_FD_FREQ = '/'.join([SYSFS_INDEPENDENT_FD_PREFIX, "frequency"]) +SYSFS_INDEPENDENT_FD_FREQ = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "frequency") IS_INDEPENDENT_MODULE = 'is_independent_module' STATE_DB_TABLE_NAME_PREFIX = 'TRANSCEIVER_MODULES_MGMT|{}' @@ -57,7 +56,7 @@ class ModulesMgmtTask(threading.Thread): RETRY_EEPROM_READING_INTERVAL = 60 - def __init__(self, namespaces=None, main_thread_stop_event=None, q=None, l=None): + def __init__(self, namespaces=None, main_thread_stop_event=None, q=None): threading.Thread.__init__(self) self.name = "ModulesMgmtTask" self.main_thread_stop_event = main_thread_stop_event @@ -69,14 +68,11 @@ def __init__(self, namespaces=None, main_thread_stop_event=None, q=None, l=None) self.sfp_delete_list_from_port_dict = [] self.namespaces = namespaces self.modules_changes_queue = q - self.modules_queue_lock = l self.is_supported_indep_mods_system = False self.modules_lock_list = [] # A set to hold those modules waiting 3 seconds since power on and hw reset self.waiting_modules_list = set() self.timer = threading.Thread() - self.timer_queue = queue.Queue() - self.timer_queue_lock = threading.Lock() self.poll_obj = None self.fds_mapping_to_obj = {} self.fds_events_count_dict = {} @@ -85,12 +81,12 @@ def __init__(self, namespaces=None, main_thread_stop_event=None, q=None, l=None) # SFPs state machine def get_sm_func(self, sm, port): SFP_SM_ENUM = {STATE_HW_NOT_PRESENT: self.check_if_hw_present - , STATE_HW_PRESENT: self.checkIfModuleAvailable - , STATE_MODULE_AVAILABLE: self.checkIfPowerOn - , STATE_NOT_POWERED: self.powerOnModule - , STATE_POWERED: self.checkModuleType - , STATE_FW_CONTROL: self.saveModuleControlMode - , STATE_SW_CONTROL: self.saveModuleControlMode + , STATE_HW_PRESENT: self.check_if_module_available + , STATE_MODULE_AVAILABLE: self.check_if_power_on + , STATE_NOT_POWERED: self.power_on_module + , STATE_POWERED: self.check_module_type + , STATE_FW_CONTROL: self.save_module_control_mode + , STATE_SW_CONTROL: self.save_module_control_mode , STATE_ERROR_HANDLER: STATE_ERROR_HANDLER , STATE_POWER_LIMIT_ERROR: STATE_POWER_LIMIT_ERROR , STATE_SYSFS_ERROR: STATE_SYSFS_ERROR @@ -101,25 +97,30 @@ def get_sm_func(self, sm, port): logger.log_info("got func {} for state {} for port {}".format(func, sm, port)) return func except KeyError as e: - logger.log_info("exception {} for port {}".format(e, port)) + logger.log_error("exception {} for port {} sm {}".format(e, port, sm)) return None def run(self): # check first if the system supports independent mode and set boolean accordingly (platform_path, hwsku_dir) = device_info.get_paths_to_platform_and_hwsku_dirs() - #hwsku = device_info.get_hwsku() - independent_file = INDEP_PROFILE_FILE.format(hwsku_dir) + logger.log_info("hwsku_dir {} found, continue to check sai.profile file".format(hwsku_dir)) + independent_file = SAI_PROFILE_FILE.format(hwsku_dir) if os.path.isfile(independent_file): logger.log_info("file {} found, checking content for independent mode value".format(independent_file)) with open(independent_file, "r") as independent_file_fd: - independent_file_content = independent_file_fd.read() - if SAI_INDEP_MODULE_MODE in independent_file_content and \ - SAI_INDEP_MODULE_MODE_DELIMITER in independent_file_content: - independent_file_splitted = independent_file_content.split(SAI_INDEP_MODULE_MODE_DELIMITER) - if (len(independent_file_splitted) > 1): - self.is_supported_indep_mods_system = int(independent_file_splitted[1]) == int(SAI_INDEP_MODULE_MODE_TRUE_STR) - logger.log_info("file {} found, system will work in independent mode".format(independent_file)) - logger.log_info("value of indep mode var: {} found in file".format(independent_file_splitted[1])) + found = False + independent_file_content = ' ' + logger.log_info("file {} found, checking content for independent mode value".format(independent_file)) + while independent_file_content and not found: + independent_file_content = independent_file_fd.readline() + if SAI_INDEP_MODULE_MODE in independent_file_content and \ + SAI_INDEP_MODULE_MODE_DELIMITER in independent_file_content: + independent_file_splitted = independent_file_content.split(SAI_INDEP_MODULE_MODE_DELIMITER) + if (len(independent_file_splitted) > 1): + self.is_supported_indep_mods_system = int(independent_file_splitted[1]) == int(SAI_INDEP_MODULE_MODE_TRUE_STR) + logger.log_info("file {} found, system will work in independent mode".format(independent_file)) + logger.log_info("value of indep mode var: {} found in file".format(independent_file_splitted[1])) + found = True else: logger.log_info("file {} not found, system stays in legacy mode".format(independent_file)) @@ -129,9 +130,7 @@ def run(self): num_of_ports = DeviceDataManager.get_sfp_count() # create the modules sysfs fds poller self.poll_obj = select.poll() - #self.poll_obj = [] for port in range(num_of_ports): - #temp_port_dict = {IS_INDEPENDENT_MODULE: False} # check sysfs per port whether it's independent mode or legacy temp_module_sm = ModuleStateMachine(port_num=port, initial_state=STATE_HW_NOT_PRESENT , current_state=STATE_HW_NOT_PRESENT) @@ -144,13 +143,12 @@ def run(self): module_fd = open(module_fd_indep_path, "r") temp_module_sm.set_module_fd(module_fd) else: - module_fd_legacy_path = self.get_sysfs_legacy_ethernet_port_fd(SYSFS_LEGACY_FD_PRESENCE, port) + module_fd_legacy_path = self.get_sysfs_ethernet_port_fd(SYSFS_LEGACY_FD_PRESENCE, port) temp_module_sm.set_module_fd_path(module_fd_legacy_path) module_fd = open(module_fd_legacy_path, "r") temp_module_sm.set_module_fd(module_fd) # add lock to use with timer task updating next state per module object self.modules_lock_list.append(threading.Lock()) - temp_module_sm.set_poll_obj(self.poll_obj) # start SM for this independent module logger.log_info("adding temp_module_sm {} to sfp_port_dict".format(temp_module_sm)) self.sfp_port_dict_initial[port] = temp_module_sm @@ -180,7 +178,8 @@ def run(self): logger.log_info("static detection got returned func {} for state {}".format(func, curr_state)) try: if not isinstance(func, str): - next_state = func(port_num, module_sm_obj) + if func is not None: + next_state = func(port_num, module_sm_obj) except TypeError as e: logger.log_info("static detection exception {} for port {} traceback:\n{}".format(e, port_num, traceback.format_exc())) module_sm_obj.set_final_state(STATE_ERROR_HANDLER) @@ -207,7 +206,6 @@ def run(self): # call timer task self.timer = threading.Timer(1.0, self.timerTask) self.timer.start() - self.timer_queue.put(module_sm_obj) if self.timer.is_alive(): logger.log_info("timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].acquire() @@ -254,9 +252,9 @@ def run(self): module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] module_fd = self.fds_mapping_to_obj[fd_fileno]['fd'] fd_name = self.fds_mapping_to_obj[fd_fileno]['fd_name'] - if fd_name in ['presence']: + if 'presence' == fd_name: module_fd_path = module_obj.module_fd_path - elif fd_name in ['power_good']: + elif 'power_good' == fd_name: module_fd_path = module_obj.module_power_good_fd_path try: logger.log_info("dynamic detection dummy reading from fd path {} for port {}" @@ -278,13 +276,12 @@ def run(self): for fd, event in fds_events: # get modules object from fd according to saved key-value of fd-module obj saved earlier logger.log_info("dynamic detection working on fd {} event {}".format(fd, event)) - #module_obj = self.fds_mapping_to_obj[fd] module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] module_fd = self.fds_mapping_to_obj[fd_fileno]['fd'] fd_name = self.fds_mapping_to_obj[fd_fileno]['fd_name'] - if fd_name in ['presence']: + if 'presence' == fd_name: module_fd_path = module_obj.module_fd_path - elif fd_name in ['power_good']: + elif 'power_good' == fd_name: module_fd_path = module_obj.module_power_good_fd_path self.fds_events_count_dict[module_obj.port_num][fd_name] += 1 val = module_fd.read() @@ -298,15 +295,14 @@ def run(self): self.sfp_port_dict[module_obj.port_num] = module_obj self.delete_ports_from_state_db_list.append(module_obj.port_num) self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_list) - logger.log_info("dynamic detection sleeping 1 second...") - time.sleep(1) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() logger.log_info(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') func = self.get_sm_func(curr_state, port) logger.log_info("dynamic detection got returned func {} for state {}".format(func, curr_state)) try: - next_state = func(port_num, module_sm_obj, dynamic=True) + if func is not None: + next_state = func(port_num, module_sm_obj, dynamic=True) except TypeError as e: logger.log_info("exception {} for port {}".format(e, port_num)) continue @@ -331,7 +327,6 @@ def run(self): # call timer task self.timer = threading.Timer(1.0, self.timerTask) self.timer.start() - self.timer_queue.put(module_sm_obj) if self.timer.is_alive(): logger.log_info("dynamic detection timer thread is_alive {}, locking module obj".format(self.timer.is_alive())) self.modules_lock_list[port_num].acquire() @@ -353,6 +348,7 @@ def run(self): def check_if_hw_present(self, port, module_sm_obj, dynamic=False): + logger.log_info("enter check_if_hw_present port {} module_sm_obj {}".format(port, module_sm_obj)) if self.is_supported_indep_mods_system: module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) else: @@ -376,7 +372,7 @@ def check_if_hw_present(self, port, module_sm_obj, dynamic=False): module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT - def checkIfModuleAvailable(self, port, module_sm_obj, dynamic=False): + def check_if_module_available(self, port, module_sm_obj, dynamic=False): logger.log_info("enter check_if_module_available port {} module_sm_obj {}".format(port, module_sm_obj)) module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_GOOD.format(port) if os.path.isfile(module_fd_indep_path): @@ -406,13 +402,12 @@ def checkIfModuleAvailable(self, port, module_sm_obj, dynamic=False): module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT - def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): + def check_if_power_on(self, port, module_sm_obj, dynamic=False): logger.log_info(f'enter checkIfPowerOn for port {port}') module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) if os.path.isfile(module_fd_indep_path): try: - val = utils.read_int_from_file(module_fd_indep_path) - val_int = int(val) + val_int = utils.read_int_from_file(module_fd_indep_path) if 0 == val_int: logger.log_info(f'port {port} is not powered') return STATE_NOT_POWERED @@ -426,7 +421,6 @@ def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): logger.log_info(f'port {port} is powered, but need reset') utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) module_sm_obj.reset_start_time = time.time() - module_sm_obj.wait_for_power_on = True utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 1) module_sm_obj.reset_start_time = time.time() module_sm_obj.wait_for_power_on = True @@ -439,8 +433,8 @@ def checkIfPowerOn(self, port, module_sm_obj, dynamic=False): module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT - def powerOnModule(self, port, module_sm_obj, dynamic=False): - #if module_sm_obj not in self.waiting_modules_list: + def power_on_module(self, port, module_sm_obj, dynamic=False): + logger.log_info(f'enter checkIfPowerOn for port {port}') if not module_sm_obj.wait_for_power_on: module_fd_indep_path_po = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) module_fd_indep_path_r = SYSFS_INDEPENDENT_FD_HW_RESET.format(port) @@ -449,12 +443,12 @@ def powerOnModule(self, port, module_sm_obj, dynamic=False): logger.log_info("powerOnModule powering on via {} for port {}".format(module_fd_indep_path_po, port)) # echo 1 > /sys/module/sx_core/$asic/$module/power_on with open(module_fd_indep_path_po, "w") as module_fd: - module_fd.write("1") + utils.write_file(module_fd, "1") if os.path.isfile(module_fd_indep_path_r): logger.log_info("powerOnModule resetting via {} for port {}".format(module_fd_indep_path_r, port)) # echo 0 > /sys/module/sx_core/$asic/$module/hw_reset with open(module_fd_indep_path_r, "w") as module_fd: - module_fd.write("0") + utils.write_file(module_fd, "0") module_sm_obj.reset_start_time = time.time() module_sm_obj.wait_for_power_on = True self.waiting_modules_list.add(module_sm_obj.port_num) @@ -463,28 +457,32 @@ def powerOnModule(self, port, module_sm_obj, dynamic=False): return STATE_HW_NOT_PRESENT return STATE_NOT_POWERED - def checkModuleType(self, port, module_sm_obj, dynamic=False): + def check_module_type(self, port, module_sm_obj, dynamic=False): logger.log_info("enter checkModuleType port {} module_sm_obj {}".format(port, module_sm_obj)) sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() if not xcvr_api: - logger.log_info("checkModuleType calling sfp reinit for port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("checkModuleType calling sfp reinit for port {} module_sm_obj {}" + .format(port, module_sm_obj)) sfp.reinit() - logger.log_info("checkModuleType setting as FW control as xcvr_api is empty for port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("checkModuleType setting as FW control as xcvr_api is empty for port {} module_sm_obj {}" + .format(port, module_sm_obj)) return STATE_FW_CONTROL field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.ID_FIELD) module_type_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) if module_type_ba is None: - logger.log_info("checkModuleType module_type is None for port {} - checking if we didnt retry yet max number of retries: {}".format(port, MAX_EEPROM_ERROR_RESET_RETRIES)) + logger.log_info("checkModuleType module_type is None for port {} - checking if we didnt retry yet max " + "number of retries: {}".format(port, MAX_EEPROM_ERROR_RESET_RETRIES)) # if we didnt do this retry yet - do it up to 3 times - workaround for FW issue blocking upper page access if module_sm_obj.eeprom_poweron_reset_retries < MAX_EEPROM_ERROR_RESET_RETRIES: - logger.log_info("checkModuleType module_type is None retrying by falling back to STATE_NOT_POWERED eeprom reset retries {}" - " for port {}".format(module_sm_obj.eeprom_poweron_reset_retries, port)) - if module_sm_obj.eeprom_poweron_reset_retries % 2 == 0: + logger.log_info(f"checkModuleType module_type is None retrying by falling back to STATE_NOT_POWERED " + f"eeprom reset retries {module_sm_obj.eeprom_poweron_reset_retries} for port {port}") + #if module_sm_obj.eeprom_poweron_reset_retries % 2 == 0: + if True: utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "0") logger.log_info("checkModuleType sleeping 1 second...") - time.sleep(1) - else: + #time.sleep(1) + #else: utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "1") self.add_port_to_wait_reset(module_sm_obj) module_sm_obj.eeprom_poweron_reset_retries += 1 @@ -528,7 +526,7 @@ def checkModuleType(self, port, module_sm_obj, dynamic=False): utils.write_file(indep_fd_freq, mci_bits) return STATE_SW_CONTROL - def checkPowerCap(self, port, module_sm_obj, dynamic=False): + def check_power_cap(self, port, module_sm_obj, dynamic=False): logger.log_info("enter checkPowerCap port {} module_sm_obj {}".format(port, module_sm_obj)) #sfp_base_module = SfpBase() sfp = sfp_module.SFP(port) @@ -548,7 +546,7 @@ def checkPowerCap(self, port, module_sm_obj, dynamic=False): module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) return STATE_POWER_LIMIT_ERROR - def saveModuleControlMode(self, port, module_sm_obj, dynamic=False): + def save_module_control_mode(self, port, module_sm_obj, dynamic=False): logger.log_info("saveModuleControlMode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) # bug - need to find root cause and fix #module_sm_obj.set_final_state(module_sm_obj.get_current_state()) @@ -571,7 +569,6 @@ def saveModuleControlMode(self, port, module_sm_obj, dynamic=False): self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = { 'module_obj' : module_sm_obj , 'fd': module_sm_obj.module_fd, 'fd_name' : 'presence' } - module_sm_obj.set_poll_obj(self.poll_obj) logger.log_info("saveModuleControlMode set current state {} for port {} as final state {}".format( module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) @@ -611,10 +608,6 @@ def timerTask(self): # wakes up every 1 second logger.log_info("timerTask waiting_modules_list after deletion: {}".format(self.waiting_modules_list)) time.sleep(1) i += 1 - def get_sysfs_legacy_ethernet_port_fd(self, sysfs_fd, port): - breakout_port = "Ethernet{}".format(port * PORT_BREAKOUT) - sysfs_eth_port_fd = sysfs_fd.format(breakout_port) - return sysfs_eth_port_fd def get_sysfs_ethernet_port_fd(self, sysfs_fd, port): sysfs_eth_port_fd = sysfs_fd.format(port) @@ -693,9 +686,7 @@ def send_changes_to_shared_queue(self, dynamic=False): logger.log_info(f"{detection_method} detection putting sfp_changes_dict {self.sfp_changes_dict} " f"in modules changes queue...") try: - self.modules_queue_lock.acquire() self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) - self.modules_queue_lock.release() self.sfp_changes_dict = {} except queue.Full: logger.log_info(f"{detection_method} failed to put item from modules changes queue, queue is full") @@ -707,7 +698,7 @@ class ModuleStateMachine(object): def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state=STATE_HW_NOT_PRESENT , next_state=STATE_HW_NOT_PRESENT, final_state='', is_indep_module=False - , module_fd_path='', module_fd=None, poll_obj=None, reset_start_time=None + , module_fd_path='', module_fd=None, reset_start_time=None , eeprom_poweron_reset_retries=1): self.port_num = port_num @@ -718,7 +709,6 @@ def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state self.is_indep_modules = is_indep_module self.module_fd_path = module_fd_path self.module_fd = module_fd - self.poll_obj = poll_obj self.reset_start_time = reset_start_time self.wait_for_power_on = False self.eeprom_poweron_reset_retries = eeprom_poweron_reset_retries @@ -759,12 +749,6 @@ def set_module_fd_path(self, module_fd_path): def set_module_fd(self, module_fd): self.module_fd = module_fd - def get_poll_obj(self): - return self.poll_obj - - def set_poll_obj(self, poll_obj): - self.poll_obj = poll_obj - def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT, retries=1): self.initial_state = def_state self.current_state = def_state From 97f9f478bd7a17d623e132964f3c9446c8e1fe58 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Thu, 12 Oct 2023 13:00:32 +0000 Subject: [PATCH 09/26] fix CR comments part 2 --- .../sonic_platform/modules_mgmt.py | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 7400c0c5bcd7..ed6242e696cb 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -77,6 +77,7 @@ def __init__(self, namespaces=None, main_thread_stop_event=None, q=None): self.fds_mapping_to_obj = {} self.fds_events_count_dict = {} self.delete_ports_from_state_db_list = [] + self.setName("ModulesMgmtTask") # SFPs state machine def get_sm_func(self, sm, port): @@ -158,9 +159,10 @@ def run(self): # need at least 1 module in final state until it makes sense to send changes dict is_final_state_module = False all_static_detection_done = False - logger.log_info("sfp_port_dict before starting static detection: {}".format(self.sfp_port_dict)) + logger.log_info(f"sfp_port_dict before starting static detection: {self.sfp_port_dict} main_thread_stop_event: " + f"{self.main_thread_stop_event.is_set()} all_static_detection_done: {all_static_detection_done}") # static detection - loop on different state for all ports until all done - while not self.main_thread_stop_event and not all_static_detection_done: + while not self.main_thread_stop_event.is_set() and not all_static_detection_done: logger.log_info("static detection running iteration {}".format(i)) waiting_list_len = len(self.waiting_modules_list) sfp_port_dict_keys_len = len(self.sfp_port_dict.keys()) @@ -232,18 +234,19 @@ def run(self): logger.log_info("static detection len of keys of sfp_port_dict is not 0: {}".format(sfp_port_dict_keys_len)) logger.log_info("static detection all_static_detection_done: {}".format(all_static_detection_done)) - logger.log_info("sfp_port_dict before dynamic detection: {}".format(self.sfp_port_dict)) + logger.log_info(f"sfp_port_dict before dynamic detection: {self.sfp_port_dict} " + f"main_thread_stop_event.is_set(): {self.main_thread_stop_event.is_set()}") # dynamic detection - loop on polling changes, run state machine for them and put them into shared queue i = 0 # need at least 1 module in final state until it makes sense to send changes dict is_final_state_module = False # initialize fds events count to 0 for fd_fileno in self.fds_mapping_to_obj: - module_obj = self.fds_mapping_to_obj[fd_fileno] + module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] # for debug purposes self.fds_events_count_dict[module_obj.port_num] = { 'presence' : 0 , 'power_good' : 0 } dummy_read = False - while not self.main_thread_stop_event: + while not self.main_thread_stop_event.is_set(): logger.log_info("dynamic detection running iteration {}".format(i)) # dummy read all sysfs fds before polling them due to linux kernel implementation of poll if not dummy_read: @@ -403,13 +406,13 @@ def check_if_module_available(self, port, module_sm_obj, dynamic=False): return STATE_HW_NOT_PRESENT def check_if_power_on(self, port, module_sm_obj, dynamic=False): - logger.log_info(f'enter checkIfPowerOn for port {port}') + logger.log_info(f'enter check_if_power_on for port {port}') module_fd_indep_path = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) if os.path.isfile(module_fd_indep_path): try: val_int = utils.read_int_from_file(module_fd_indep_path) if 0 == val_int: - logger.log_info(f'port {port} is not powered') + logger.log_info(f'check_if_power_on port {port} is not powered') return STATE_NOT_POWERED elif 1 == val_int: if not module_sm_obj.wait_for_power_on and \ @@ -418,23 +421,22 @@ def check_if_power_on(self, port, module_sm_obj, dynamic=False): xcvr_api = sfp.get_xcvr_api() # only if xcvr_api is None or if it is not active optics cables need reset if not xcvr_api or xcvr_api.is_flat_memory(): - logger.log_info(f'port {port} is powered, but need reset') - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) - module_sm_obj.reset_start_time = time.time() + logger.log_info(f'check_if_power_on port {port} is powered, but need reset') utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 1) + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) module_sm_obj.reset_start_time = time.time() module_sm_obj.wait_for_power_on = True self.waiting_modules_list.add(module_sm_obj.port_num) return STATE_NOT_POWERED - logger.log_info(f'port {port} is powered, does not need reset') + logger.log_info(f'check_if_power_on port {port} is powered, does not need reset') return STATE_POWERED except Exception as e: - logger.log_info(f'got exception {e} in checkIfPowerOn') + logger.log_info(f'check_if_power_on got exception {e}') module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) return STATE_HW_NOT_PRESENT def power_on_module(self, port, module_sm_obj, dynamic=False): - logger.log_info(f'enter checkIfPowerOn for port {port}') + logger.log_info(f'enter power_on_module for port {port}') if not module_sm_obj.wait_for_power_on: module_fd_indep_path_po = SYSFS_INDEPENDENT_FD_POWER_ON.format(port) module_fd_indep_path_r = SYSFS_INDEPENDENT_FD_HW_RESET.format(port) @@ -458,45 +460,42 @@ def power_on_module(self, port, module_sm_obj, dynamic=False): return STATE_NOT_POWERED def check_module_type(self, port, module_sm_obj, dynamic=False): - logger.log_info("enter checkModuleType port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("enter check_module_type port {} module_sm_obj {}".format(port, module_sm_obj)) sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() if not xcvr_api: - logger.log_info("checkModuleType calling sfp reinit for port {} module_sm_obj {}" + logger.log_info("check_module_type calling sfp reinit for port {} module_sm_obj {}" .format(port, module_sm_obj)) sfp.reinit() - logger.log_info("checkModuleType setting as FW control as xcvr_api is empty for port {} module_sm_obj {}" + logger.log_info("check_module_type setting as FW control as xcvr_api is empty for port {} module_sm_obj {}" .format(port, module_sm_obj)) return STATE_FW_CONTROL field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.ID_FIELD) module_type_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) if module_type_ba is None: - logger.log_info("checkModuleType module_type is None for port {} - checking if we didnt retry yet max " + logger.log_info("check_module_type module_type is None for port {} - checking if we didnt retry yet max " "number of retries: {}".format(port, MAX_EEPROM_ERROR_RESET_RETRIES)) # if we didnt do this retry yet - do it up to 3 times - workaround for FW issue blocking upper page access if module_sm_obj.eeprom_poweron_reset_retries < MAX_EEPROM_ERROR_RESET_RETRIES: - logger.log_info(f"checkModuleType module_type is None retrying by falling back to STATE_NOT_POWERED " + logger.log_info(f"check_module_type module_type is None retrying by falling back to STATE_NOT_POWERED " f"eeprom reset retries {module_sm_obj.eeprom_poweron_reset_retries} for port {port}") - #if module_sm_obj.eeprom_poweron_reset_retries % 2 == 0: - if True: - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "0") - logger.log_info("checkModuleType sleeping 1 second...") - #time.sleep(1) - #else: - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "1") + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "1") + utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "0") self.add_port_to_wait_reset(module_sm_obj) module_sm_obj.eeprom_poweron_reset_retries += 1 return STATE_NOT_POWERED else: - logger.log_info("checkModuleType module_type is None and already retried - setting as STATE_ERROR_HANDLER" + logger.log_info("check_module_type module_type is None and already retried - setting as STATE_ERROR_HANDLER" "for port {}".format(port)) module_sm_obj.set_final_state(STATE_ERROR_HANDLER) return STATE_ERROR_HANDLER module_type = int.from_bytes(module_type_ba, "big") - logger.log_info("got module_type {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + logger.log_info("check_module_type got module_type {} in check_module_type port {} module_sm_obj {}" + .format(module_type, port, module_sm_obj)) # QSFP-DD ID is 24, OSFP ID is 25 - only these 2 are supported currently as independent module - SW controlled if module_type not in [24, 25]: - logger.log_info("setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}".format(module_type, port, module_sm_obj)) + logger.log_info("check_module_type setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}" + .format(module_type, port, module_sm_obj)) module_sm_obj.set_final_state = STATE_FW_CONTROL return STATE_FW_CONTROL else: @@ -504,9 +503,9 @@ def check_module_type(self, port, module_sm_obj, dynamic=False): logger.log_info("check_module_type port {} setting STATE_FW_CONTROL module ID {} due to flat_mem device" .format(module_type, port)) return STATE_FW_CONTROL - logger.log_info("checking power cap for {} in check_module_type port {} module_sm_obj {}" + logger.log_info("check_module_type checking power cap for {} in check_module_type port {} module_sm_obj {}" .format(module_type, port, module_sm_obj)) - power_cap = self.checkPowerCap(port, module_sm_obj) + power_cap = self.check_power_cap(port, module_sm_obj) if power_cap is STATE_POWER_LIMIT_ERROR: module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) return STATE_POWER_LIMIT_ERROR @@ -527,27 +526,27 @@ def check_module_type(self, port, module_sm_obj, dynamic=False): return STATE_SW_CONTROL def check_power_cap(self, port, module_sm_obj, dynamic=False): - logger.log_info("enter checkPowerCap port {} module_sm_obj {}".format(port, module_sm_obj)) + logger.log_info("enter check_power_cap port {} module_sm_obj {}".format(port, module_sm_obj)) #sfp_base_module = SfpBase() sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.MAX_POWER_FIELD) powercap_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) - logger.log_info("checkPowerCap got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) + logger.log_info("check_power_cap got powercap bytearray {} for port {} module_sm_obj {}".format(powercap_ba, port, module_sm_obj)) powercap = int.from_bytes(powercap_ba, "big") - logger.log_info("checkPowerCap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) + logger.log_info("check_power_cap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) indep_fd_power_limit = self.get_sysfs_ethernet_port_fd(SYSFS_INDEPENDENT_FD_POWER_LIMIT, port) #with open(indep_fd_power_limit, "r") as power_limit_fd: # cage_power_limit = power_limit_fd.read() cage_power_limit = utils.read_int_from_file(indep_fd_power_limit) - logger.log_info("checkPowerCap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) + logger.log_info("check_power_cap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) if powercap > int(cage_power_limit): - logger.log_info("checkPowerCap powercap {} != cage_power_limit {} for port {} module_sm_obj {}".format(powercap, cage_power_limit, port, module_sm_obj)) + logger.log_info("check_power_cap powercap {} != cage_power_limit {} for port {} module_sm_obj {}".format(powercap, cage_power_limit, port, module_sm_obj)) module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) return STATE_POWER_LIMIT_ERROR def save_module_control_mode(self, port, module_sm_obj, dynamic=False): - logger.log_info("saveModuleControlMode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) + logger.log_info("save_module_control_mode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) # bug - need to find root cause and fix #module_sm_obj.set_final_state(module_sm_obj.get_current_state()) state = module_sm_obj.get_current_state() @@ -557,19 +556,19 @@ def save_module_control_mode(self, port, module_sm_obj, dynamic=False): indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) with open(indep_fd_fw_control, "w") as fw_control_fd: fw_control_fd.write("0") - logger.log_info("saveModuleControlMode set FW control for state {} port {}".format(state, port)) + logger.log_info("save_module_control_mode set FW control for state {} port {}".format(state, port)) module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) module_sm_obj.set_module_fd_path(module_fd_legacy_path) module_fd = open(module_fd_legacy_path, "r") module_sm_obj.set_module_fd(module_fd) - logger.log_info("saveModuleControlMode changed module fd to legacy present for port {}".format(port)) + logger.log_info("save_module_control_mode changed module fd to legacy present for port {}".format(port)) # register the module's sysfs fd to poller with ERR and PRI attrs - logger.log_info("saveModuleControlMode registering sysfs fd {} number {} path {} for port {}" + logger.log_info("save_module_control_mode registering sysfs fd {} number {} path {} for port {}" .format(module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = { 'module_obj' : module_sm_obj , 'fd': module_sm_obj.module_fd, 'fd_name' : 'presence' } - logger.log_info("saveModuleControlMode set current state {} for port {} as final state {}".format( + logger.log_info("save_module_control_mode set current state {} for port {} as final state {}".format( module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) def timerTask(self): # wakes up every 1 second From 9fbcbe3818184c2136e98492f5b5a5df6659d104 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Mon, 16 Oct 2023 08:02:40 +0000 Subject: [PATCH 10/26] remove workaround using hw_reset for xcvr_api returned None since it does not return None anymore --- .../sonic_platform/modules_mgmt.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index ed6242e696cb..d6bce8a891eb 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -415,20 +415,7 @@ def check_if_power_on(self, port, module_sm_obj, dynamic=False): logger.log_info(f'check_if_power_on port {port} is not powered') return STATE_NOT_POWERED elif 1 == val_int: - if not module_sm_obj.wait_for_power_on and \ - utils.read_int_from_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port)) == 1: - sfp = sfp_module.SFP(port) - xcvr_api = sfp.get_xcvr_api() - # only if xcvr_api is None or if it is not active optics cables need reset - if not xcvr_api or xcvr_api.is_flat_memory(): - logger.log_info(f'check_if_power_on port {port} is powered, but need reset') - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 1) - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), 0) - module_sm_obj.reset_start_time = time.time() - module_sm_obj.wait_for_power_on = True - self.waiting_modules_list.add(module_sm_obj.port_num) - return STATE_NOT_POWERED - logger.log_info(f'check_if_power_on port {port} is powered, does not need reset') + logger.log_info(f'check_if_power_on port {port} is powered') return STATE_POWERED except Exception as e: logger.log_info(f'check_if_power_on got exception {e}') From fa65a6a86eb6dc7cf395ccbf39c06da6f6712cc6 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Tue, 17 Oct 2023 14:27:44 +0000 Subject: [PATCH 11/26] fix CR comments part 3 and add critical fixes --- .../sonic_platform/chassis.py | 9 ++-- .../sonic_platform/modules_mgmt.py | 50 +++++++++---------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index f0cd1a0e53e1..c78a265b1875 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -373,7 +373,7 @@ def get_change_event(self, timeout=0): Args: timeout: Timeout in milliseconds (optional). If timeout == 0, - this method will block until a change is detected. - Deprecated + this method will block until a change is detected. Returns: (bool, dict): @@ -415,7 +415,10 @@ def get_change_event(self, timeout=0): result_dict = {'sfp': port_dict} result_dict['sfp_error'] = error_dict return True, result_dict - time.sleep(1) + else: + elapse = time.time() - begin + if elapse >= timeout: + return True, {'sfp': {}} i += 1 def reinit_sfps(self, port_dict): @@ -428,7 +431,7 @@ def reinit_sfps(self, port_dict): for index, status in port_dict.items(): if status == sfp.SFP_STATUS_INSERTED: try: - self._sfp_list[index - 1].reinit() + self._sfp_list[int(index) - 1].reinit() except Exception as e: logger.log_error("Fail to re-initialize SFP {} - {}".format(index, repr(e))) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index d6bce8a891eb..53482eecacf9 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -47,6 +47,7 @@ SYSFS_INDEPENDENT_FD_FW_CONTROL = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "control") # echo /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz SYSFS_INDEPENDENT_FD_FREQ = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "frequency") +SYSFS_INDEPENDENT_FD_FREQ_SUPPORT = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "frequency_support") IS_INDEPENDENT_MODULE = 'is_independent_module' STATE_DB_TABLE_NAME_PREFIX = 'TRANSCEIVER_MODULES_MGMT|{}' @@ -60,8 +61,6 @@ def __init__(self, namespaces=None, main_thread_stop_event=None, q=None): threading.Thread.__init__(self) self.name = "ModulesMgmtTask" self.main_thread_stop_event = main_thread_stop_event - self.sfp_error_dict = {} - self.sfp_insert_events = {} self.sfp_port_dict_initial = {} self.sfp_port_dict = {} self.sfp_changes_dict = {} @@ -431,16 +430,12 @@ def power_on_module(self, port, module_sm_obj, dynamic=False): if os.path.isfile(module_fd_indep_path_po): logger.log_info("powerOnModule powering on via {} for port {}".format(module_fd_indep_path_po, port)) # echo 1 > /sys/module/sx_core/$asic/$module/power_on - with open(module_fd_indep_path_po, "w") as module_fd: - utils.write_file(module_fd, "1") + utils.write_file(module_fd_indep_path_po, "1") if os.path.isfile(module_fd_indep_path_r): logger.log_info("powerOnModule resetting via {} for port {}".format(module_fd_indep_path_r, port)) # echo 0 > /sys/module/sx_core/$asic/$module/hw_reset - with open(module_fd_indep_path_r, "w") as module_fd: - utils.write_file(module_fd, "0") - module_sm_obj.reset_start_time = time.time() - module_sm_obj.wait_for_power_on = True - self.waiting_modules_list.add(module_sm_obj.port_num) + utils.write_file(module_fd_indep_path_r, "0") + self.add_port_to_wait_reset(module_sm_obj) except Exception as e: logger.log_info("exception in powerOnModule {} for port {}".format(e, port)) return STATE_HW_NOT_PRESENT @@ -497,19 +492,23 @@ def check_module_type(self, port, module_sm_obj, dynamic=False): module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) return STATE_POWER_LIMIT_ERROR else: - # read the module maximum supported clock of Management Comm Interface (MCI) from module EEPROM. - # from byte 2 bits 3-2: - # 00b means module supports up to 400KHz - # 01b means module supports up to 1MHz - logger.log_info(f"check_module_type reading mci max frequency for port {port}") - read_mci = xcvr_api.xcvr_eeprom.read_raw(2, 1) - logger.log_info(f"check_module_type read mci max frequency {read_mci} for port {port}") - mci_bits = read_mci & 0b00001100 - logger.log_info(f"check_module_type read mci max frequency bits {mci_bits} for port {port}") - # Then, set it to frequency Sysfs using: - # echo > /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz - indep_fd_freq = SYSFS_INDEPENDENT_FD_FREQ.format(port) - utils.write_file(indep_fd_freq, mci_bits) + # first read the frequency support - if it's 1 then continue, if it's 0 no need to do anything + module_fd_freq_support_path = SYSFS_INDEPENDENT_FD_FREQ_SUPPORT.format(port) + val_int = utils.read_int_from_file(module_fd_freq_support_path) + if 1 == val_int: + # read the module maximum supported clock of Management Comm Interface (MCI) from module EEPROM. + # from byte 2 bits 3-2: + # 00b means module supports up to 400KHz + # 01b means module supports up to 1MHz + logger.log_info(f"check_module_type reading mci max frequency for port {port}") + read_mci = xcvr_api.xcvr_eeprom.read_raw(2, 1) + logger.log_info(f"check_module_type read mci max frequency {read_mci} for port {port}") + mci_bits = read_mci & 0b00001100 + logger.log_info(f"check_module_type read mci max frequency bits {mci_bits} for port {port}") + # Then, set it to frequency Sysfs using: + # echo > /sys/module/sx_core/$asic/$module/frequency // val: 0 - up to 400KHz, 1 - up to 1MHz + indep_fd_freq = SYSFS_INDEPENDENT_FD_FREQ.format(port) + utils.write_file(indep_fd_freq, mci_bits) return STATE_SW_CONTROL def check_power_cap(self, port, module_sm_obj, dynamic=False): @@ -541,9 +540,10 @@ def save_module_control_mode(self, port, module_sm_obj, dynamic=False): if state == STATE_FW_CONTROL: #"echo 0 > /sys/module/sx_core/$asic/$module/control" indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) - with open(indep_fd_fw_control, "w") as fw_control_fd: - fw_control_fd.write("0") + utils.write_file(indep_fd_fw_control, "0") logger.log_info("save_module_control_mode set FW control for state {} port {}".format(state, port)) + # update the presence sysfs fd to legacy FD presence, first close the previous fd + os.close(module_sm_obj.module_fd.fileno()) module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) module_sm_obj.set_module_fd_path(module_fd_legacy_path) module_fd = open(module_fd_legacy_path, "r") @@ -619,7 +619,7 @@ def add_ports_state_to_state_db(self, dynamic=False): ctrl_type_db_value = '0' else: ctrl_type_db_value = '1' - self.sfp_changes_dict[str(module_obj.port_num)] = ctrl_type_db_value + self.sfp_changes_dict[str(module_obj.port_num + 1)] = ctrl_type_db_value if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: namespaces = multi_asic.get_front_end_namespaces() for namespace in namespaces: From c29d5787753846f02a61d4e13c4f994b5db03c7a Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Thu, 19 Oct 2023 11:11:36 +0300 Subject: [PATCH 12/26] fix test_change_event test and align code to it --- .../sonic_platform/chassis.py | 13 +++++++------ .../sonic_platform/modules_mgmt.py | 18 +++++++----------- .../mlnx-platform-api/tests/test_chassis.py | 15 +++------------ 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index c78a265b1875..5c7d628cd469 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -396,7 +396,7 @@ def get_change_event(self, timeout=0): self.modules_mgmt_thread.start() self.initialize_sfp() wait_for_ever = (timeout == 0) - # select timeout should be no more than 1000ms to ensure fast shutdown flow + # poll timeout should be no more than 1000ms to ensure fast shutdown flow timeout = 1000.0 if timeout >= 1000 else float(timeout) port_dict = {} error_dict = {} @@ -405,8 +405,8 @@ def get_change_event(self, timeout=0): while True: try: logger.log_info(f'get_change_event() trying to get changes from queue on iteration {i}') - port_dict = self.modules_changes_queue.get(timeout=timeout) - logger.log_info (f'get_change_event() iteration {i} port_dict: {port_dict}') + port_dict = self.modules_changes_queue.get(timeout=timeout / 1000) + logger.log_info(f'get_change_event() iteration {i} port_dict: {port_dict}') except queue.Empty: logger.log_info(f"failed to get item from modules changes queue on itertaion {i}") @@ -416,9 +416,10 @@ def get_change_event(self, timeout=0): result_dict['sfp_error'] = error_dict return True, result_dict else: - elapse = time.time() - begin - if elapse >= timeout: - return True, {'sfp': {}} + if not wait_for_ever: + elapse = time.time() - begin + if elapse >= timeout: + return True, {'sfp': {}} i += 1 def reinit_sfps(self, port_dict): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 53482eecacf9..7e50c470e89f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -55,7 +55,6 @@ MAX_EEPROM_ERROR_RESET_RETRIES = 4 class ModulesMgmtTask(threading.Thread): - RETRY_EEPROM_READING_INTERVAL = 60 def __init__(self, namespaces=None, main_thread_stop_event=None, q=None): threading.Thread.__init__(self) @@ -478,7 +477,7 @@ def check_module_type(self, port, module_sm_obj, dynamic=False): if module_type not in [24, 25]: logger.log_info("check_module_type setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}" .format(module_type, port, module_sm_obj)) - module_sm_obj.set_final_state = STATE_FW_CONTROL + module_sm_obj.set_final_state(STATE_FW_CONTROL) return STATE_FW_CONTROL else: if xcvr_api.is_flat_memory(): @@ -513,7 +512,6 @@ def check_module_type(self, port, module_sm_obj, dynamic=False): def check_power_cap(self, port, module_sm_obj, dynamic=False): logger.log_info("enter check_power_cap port {} module_sm_obj {}".format(port, module_sm_obj)) - #sfp_base_module = SfpBase() sfp = sfp_module.SFP(port) xcvr_api = sfp.get_xcvr_api() field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.MAX_POWER_FIELD) @@ -522,8 +520,6 @@ def check_power_cap(self, port, module_sm_obj, dynamic=False): powercap = int.from_bytes(powercap_ba, "big") logger.log_info("check_power_cap got powercap {} for port {} module_sm_obj {}".format(powercap, port, module_sm_obj)) indep_fd_power_limit = self.get_sysfs_ethernet_port_fd(SYSFS_INDEPENDENT_FD_POWER_LIMIT, port) - #with open(indep_fd_power_limit, "r") as power_limit_fd: - # cage_power_limit = power_limit_fd.read() cage_power_limit = utils.read_int_from_file(indep_fd_power_limit) logger.log_info("check_power_cap got cage_power_limit {} for port {} module_sm_obj {}".format(cage_power_limit, port, module_sm_obj)) if powercap > int(cage_power_limit): @@ -533,12 +529,10 @@ def check_power_cap(self, port, module_sm_obj, dynamic=False): def save_module_control_mode(self, port, module_sm_obj, dynamic=False): logger.log_info("save_module_control_mode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) - # bug - need to find root cause and fix - #module_sm_obj.set_final_state(module_sm_obj.get_current_state()) state = module_sm_obj.get_current_state() module_sm_obj.final_state = state if state == STATE_FW_CONTROL: - #"echo 0 > /sys/module/sx_core/$asic/$module/control" + # echo 0 > /sys/module/sx_core/$asic/$module/control indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) utils.write_file(indep_fd_fw_control, "0") logger.log_info("save_module_control_mode set FW control for state {} port {}".format(state, port)) @@ -685,7 +679,7 @@ class ModuleStateMachine(object): def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state=STATE_HW_NOT_PRESENT , next_state=STATE_HW_NOT_PRESENT, final_state='', is_indep_module=False , module_fd_path='', module_fd=None, reset_start_time=None - , eeprom_poweron_reset_retries=1): + , eeprom_poweron_reset_retries=1, module_power_good_fd_path=None, module_power_good_fd=None): self.port_num = port_num self.initial_state = initial_state @@ -698,8 +692,8 @@ def __init__(self, port_num=0, initial_state=STATE_HW_NOT_PRESENT, current_state self.reset_start_time = reset_start_time self.wait_for_power_on = False self.eeprom_poweron_reset_retries = eeprom_poweron_reset_retries - self.module_power_good_fd_path = module_fd_path - self.module_power_good_fd = module_fd + self.module_power_good_fd_path = module_power_good_fd_path + self.module_power_good_fd = module_power_good_fd def set_initial_state(self, state): self.initial_state = state @@ -742,3 +736,5 @@ def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT, retries=1): self.final_state = '' self.wait_for_power_on = False self.eeprom_poweron_reset_retries = retries + os.close(self.module_fd.fileno()) + os.close(self.module_power_good_fd.fileno()) diff --git a/platform/mellanox/mlnx-platform-api/tests/test_chassis.py b/platform/mellanox/mlnx-platform-api/tests/test_chassis.py index cffdd437695f..aca1a240cdb1 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_chassis.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_chassis.py @@ -167,20 +167,11 @@ def test_sfp(self): assert len(sfp_list) == 3 assert chassis.sfp_initialized_count == 3 - @mock.patch('sonic_platform.sfp_event.sfp_event.check_sfp_status', MagicMock()) - @mock.patch('sonic_platform.sfp_event.sfp_event.__init__', MagicMock(return_value=None)) - @mock.patch('sonic_platform.sfp_event.sfp_event.initialize', MagicMock()) @mock.patch('sonic_platform.device_data.DeviceDataManager.get_sfp_count', MagicMock(return_value=3)) def test_change_event(self): - from sonic_platform.sfp_event import sfp_event - - return_port_dict = {1: '1'} - def mock_check_sfp_status(self, port_dict, error_dict, timeout): - port_dict.update(return_port_dict) - return True if port_dict else False - - sfp_event.check_sfp_status = mock_check_sfp_status chassis = Chassis() + chassis.modules_mgmt_thread.is_alive = MagicMock(return_value=True) + chassis.modules_changes_queue.get = MagicMock(return_value={1: '1'}) # Call get_change_event with timeout=0, wait until an event is detected status, event_dict = chassis.get_change_event() @@ -189,7 +180,7 @@ def mock_check_sfp_status(self, port_dict, error_dict, timeout): assert len(chassis._sfp_list) == 3 # Call get_change_event with timeout=1.0 - return_port_dict = {} + chassis.modules_changes_queue.get.return_value = {} status, event_dict = chassis.get_change_event(timeout=1.0) assert status is True assert 'sfp' in event_dict and not event_dict['sfp'] From fc3912a2030cba1014fef905017d913975d47c63 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Wed, 25 Oct 2023 20:50:47 +0000 Subject: [PATCH 13/26] Add copyright header and try except for fds poller --- .../sonic_platform/chassis.py | 2 + .../sonic_platform/modules_mgmt.py | 53 +++++++++++++------ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 5c7d628cd469..bc4f2f6c9db2 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -418,7 +418,9 @@ def get_change_event(self, timeout=0): else: if not wait_for_ever: elapse = time.time() - begin + logger.log_info(f"get_change_event: wait_for_ever {wait_for_ever} elapse {elapse} iteartion {i}") if elapse >= timeout: + logger.log_info(f"elapse {elapse} > timeout {timeout} iteartion {i} returning empty dict") return True, {'sfp': {}} i += 1 diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 7e50c470e89f..cfb8d50deafe 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -1,3 +1,20 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import threading import time import queue @@ -268,8 +285,9 @@ def run(self): logger.log_info("dynamic detection dummy read presence {} int {} for port {} before polling" .format(val, val_int, module_obj.port_num)) except Exception as e: - logger.log_info("dynamic detection exception on dummy read presence {} for port {} traceback:\n{}" - .format(e, module_obj.port_num, traceback.format_exc())) + logger.log_info(f"dynamic detection exception on dummy read presence {e} for port " + f"{module_obj.port_num} fd name {module_fd.name} " + f"traceback:\n{traceback.format_exc()}") dummy_read = True # poll for changes with 1 second timeout fds_events = self.poll_obj.poll(1000) @@ -285,16 +303,22 @@ def run(self): elif 'power_good' == fd_name: module_fd_path = module_obj.module_power_good_fd_path self.fds_events_count_dict[module_obj.port_num][fd_name] += 1 - val = module_fd.read() - module_fd.seek(0) - logger.log_info("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" - .format(module_obj, module_obj.port_num, fd, module_fd_path, self.fds_events_count_dict[module_obj.port_num])) - if module_obj.port_num not in self.sfp_port_dict.keys(): - logger.log_info("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states".format(module_obj.port_num, self.sfp_port_dict.keys())) - module_obj.reset_all_states() - # put again module obj in sfp_port_dict so next loop will work on it - self.sfp_port_dict[module_obj.port_num] = module_obj - self.delete_ports_from_state_db_list.append(module_obj.port_num) + try: + val = module_fd.read() + module_fd.seek(0) + logger.log_info("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" + .format(module_obj, module_obj.port_num, fd, module_fd_path + , self.fds_events_count_dict[module_obj.port_num])) + if module_obj.port_num not in self.sfp_port_dict.keys(): + logger.log_info("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states" + .format(module_obj.port_num, self.sfp_port_dict.keys())) + module_obj.reset_all_states() + # put again module obj in sfp_port_dict so next loop will work on it + self.sfp_port_dict[module_obj.port_num] = module_obj + self.delete_ports_from_state_db_list.append(module_obj.port_num) + except Exception as e: + logger.log_info("dynamic detection exception on read presence {} for port {} fd name {} traceback:\n{}" + .format(e, module_obj.port_num, module_fd.name, traceback.format_exc())) self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_list) for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() @@ -477,7 +501,6 @@ def check_module_type(self, port, module_sm_obj, dynamic=False): if module_type not in [24, 25]: logger.log_info("check_module_type setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}" .format(module_type, port, module_sm_obj)) - module_sm_obj.set_final_state(STATE_FW_CONTROL) return STATE_FW_CONTROL else: if xcvr_api.is_flat_memory(): @@ -530,7 +553,7 @@ def check_power_cap(self, port, module_sm_obj, dynamic=False): def save_module_control_mode(self, port, module_sm_obj, dynamic=False): logger.log_info("save_module_control_mode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) state = module_sm_obj.get_current_state() - module_sm_obj.final_state = state + module_sm_obj.set_final_state(state) if state == STATE_FW_CONTROL: # echo 0 > /sys/module/sx_core/$asic/$module/control indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) @@ -658,7 +681,7 @@ def delete_ports_from_dict(self, dynamic=False): for port in self.sfp_delete_list_from_port_dict: del self.sfp_port_dict[port] self.sfp_delete_list_from_port_dict = [] - logger.log_info("dynamic detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict)) + logger.log_info("{} detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict, detection_method)) def send_changes_to_shared_queue(self, dynamic=False): detection_method = 'dynamic' if dynamic else 'static' From 68245510ad5ef4bda3a2847439b1bbb5bb826fca Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Thu, 26 Oct 2023 09:51:17 +0000 Subject: [PATCH 14/26] fix DPB bug of port not exists --- platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index bc4f2f6c9db2..a505de2cc07a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -419,7 +419,7 @@ def get_change_event(self, timeout=0): if not wait_for_ever: elapse = time.time() - begin logger.log_info(f"get_change_event: wait_for_ever {wait_for_ever} elapse {elapse} iteartion {i}") - if elapse >= timeout: + if elapse * 1000 >= timeout: logger.log_info(f"elapse {elapse} > timeout {timeout} iteartion {i} returning empty dict") return True, {'sfp': {}} i += 1 From 21586c21c7176e6fd61cfb5f2bba3af7691dcdc0 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Sun, 29 Oct 2023 11:32:20 +0000 Subject: [PATCH 15/26] fix get_sfp returns None --- .../mellanox/mlnx-platform-api/sonic_platform/device_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py index 4b4eed5bbb6f..56708f51721b 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -222,7 +222,8 @@ def is_psu_hotswapable(cls): @classmethod @utils.read_only_cache() def get_sfp_count(cls): - return utils.read_int_from_file('/run/hw-management/config/sfp_counter') + sfp_count = utils.read_int_from_file('/run/hw-management/config/sfp_counter') + return sfp_count if sfp_count > 0 else len(glob.glob('/sys/module/sx_core/asic0/module*')) @classmethod def get_linecard_sfp_count(cls, lc_index): From f966b80e7acb438054ca9380ec30621568689afd Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Thu, 2 Nov 2023 13:02:34 +0200 Subject: [PATCH 16/26] Junchao: Fix issues found in module detection flow --- .../sonic_platform/modules_mgmt.py | 37 +++---------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index cfb8d50deafe..1a69dcdbf1c8 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -26,8 +26,8 @@ from sonic_py_common.logger import Logger from sonic_py_common import device_info, multi_asic from .device_data import DeviceDataManager - from sonic_platform_base.sfp_base import SfpBase from sonic_platform_base.sonic_xcvr.fields import consts + from sonic_platform_base.sonic_xcvr.api.public import cmis from . import sfp as sfp_module from . import utils from swsscommon.swsscommon import SonicV2Connector @@ -374,10 +374,7 @@ def run(self): def check_if_hw_present(self, port, module_sm_obj, dynamic=False): logger.log_info("enter check_if_hw_present port {} module_sm_obj {}".format(port, module_sm_obj)) - if self.is_supported_indep_mods_system: - module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) - else: - module_fd_indep_path = SYSFS_LEGACY_FD_PRESENCE.format(port) + module_fd_indep_path = module_sm_obj.module_fd_path if os.path.isfile(module_fd_indep_path): try: val_int = utils.read_int_from_file(module_fd_indep_path) @@ -475,40 +472,18 @@ def check_module_type(self, port, module_sm_obj, dynamic=False): logger.log_info("check_module_type setting as FW control as xcvr_api is empty for port {} module_sm_obj {}" .format(port, module_sm_obj)) return STATE_FW_CONTROL - field = xcvr_api.xcvr_eeprom.mem_map.get_field(consts.ID_FIELD) - module_type_ba = xcvr_api.xcvr_eeprom.reader(field.get_offset(), field.get_size()) - if module_type_ba is None: - logger.log_info("check_module_type module_type is None for port {} - checking if we didnt retry yet max " - "number of retries: {}".format(port, MAX_EEPROM_ERROR_RESET_RETRIES)) - # if we didnt do this retry yet - do it up to 3 times - workaround for FW issue blocking upper page access - if module_sm_obj.eeprom_poweron_reset_retries < MAX_EEPROM_ERROR_RESET_RETRIES: - logger.log_info(f"check_module_type module_type is None retrying by falling back to STATE_NOT_POWERED " - f"eeprom reset retries {module_sm_obj.eeprom_poweron_reset_retries} for port {port}") - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "1") - utils.write_file(SYSFS_INDEPENDENT_FD_HW_RESET.format(port), "0") - self.add_port_to_wait_reset(module_sm_obj) - module_sm_obj.eeprom_poweron_reset_retries += 1 - return STATE_NOT_POWERED - else: - logger.log_info("check_module_type module_type is None and already retried - setting as STATE_ERROR_HANDLER" - "for port {}".format(port)) - module_sm_obj.set_final_state(STATE_ERROR_HANDLER) - return STATE_ERROR_HANDLER - module_type = int.from_bytes(module_type_ba, "big") - logger.log_info("check_module_type got module_type {} in check_module_type port {} module_sm_obj {}" - .format(module_type, port, module_sm_obj)) # QSFP-DD ID is 24, OSFP ID is 25 - only these 2 are supported currently as independent module - SW controlled - if module_type not in [24, 25]: + if not isinstance(xcvr_api, cmis.CmisApi): logger.log_info("check_module_type setting STATE_FW_CONTROL for {} in check_module_type port {} module_sm_obj {}" - .format(module_type, port, module_sm_obj)) + .format(xcvr_api, port, module_sm_obj)) return STATE_FW_CONTROL else: if xcvr_api.is_flat_memory(): logger.log_info("check_module_type port {} setting STATE_FW_CONTROL module ID {} due to flat_mem device" - .format(module_type, port)) + .format(xcvr_api, port)) return STATE_FW_CONTROL logger.log_info("check_module_type checking power cap for {} in check_module_type port {} module_sm_obj {}" - .format(module_type, port, module_sm_obj)) + .format(xcvr_api, port, module_sm_obj)) power_cap = self.check_power_cap(port, module_sm_obj) if power_cap is STATE_POWER_LIMIT_ERROR: module_sm_obj.set_final_state(STATE_POWER_LIMIT_ERROR) From ca7d9863d7a942866a2fbb40c63a8591c232548e Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Fri, 3 Nov 2023 11:34:12 +0200 Subject: [PATCH 17/26] Junchao: Fix issue: wrong FD is used for dynamic detection --- .../mlnx-platform-api/sonic_platform/modules_mgmt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 1a69dcdbf1c8..63ee25d2f238 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -295,9 +295,9 @@ def run(self): for fd, event in fds_events: # get modules object from fd according to saved key-value of fd-module obj saved earlier logger.log_info("dynamic detection working on fd {} event {}".format(fd, event)) - module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] - module_fd = self.fds_mapping_to_obj[fd_fileno]['fd'] - fd_name = self.fds_mapping_to_obj[fd_fileno]['fd_name'] + module_obj = self.fds_mapping_to_obj[fd]['module_obj'] + module_fd = self.fds_mapping_to_obj[fd]['fd'] + fd_name = self.fds_mapping_to_obj[fd]['fd_name'] if 'presence' == fd_name: module_fd_path = module_obj.module_fd_path elif 'power_good' == fd_name: From e555c06de55939ea15133ce2a8752a070d57bc44 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Mon, 6 Nov 2023 05:22:41 +0200 Subject: [PATCH 18/26] Junchao: Fix issue: dummy read flow should use poll --- .../sonic_platform/modules_mgmt.py | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 63ee25d2f238..73137e8a37d2 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -90,6 +90,7 @@ def __init__(self, namespaces=None, main_thread_stop_event=None, q=None): self.timer = threading.Thread() self.poll_obj = None self.fds_mapping_to_obj = {} + self.port_to_fds = {} self.fds_events_count_dict = {} self.delete_ports_from_state_db_list = [] self.setName("ModulesMgmtTask") @@ -265,7 +266,8 @@ def run(self): logger.log_info("dynamic detection running iteration {}".format(i)) # dummy read all sysfs fds before polling them due to linux kernel implementation of poll if not dummy_read: - for fd_fileno in self.fds_mapping_to_obj: + fds_events = self.poll_obj.poll(1000) + for fd_fileno, event in fds_events: # dummy read present / hw_present / power_good sysfs module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] module_fd = self.fds_mapping_to_obj[fd_fileno]['fd'] @@ -285,7 +287,7 @@ def run(self): logger.log_info("dynamic detection dummy read presence {} int {} for port {} before polling" .format(val, val_int, module_obj.port_num)) except Exception as e: - logger.log_info(f"dynamic detection exception on dummy read presence {e} for port " + logger.log_error(f"dynamic detection exception on dummy read presence {e} for port " f"{module_obj.port_num} fd name {module_fd.name} " f"traceback:\n{traceback.format_exc()}") dummy_read = True @@ -312,12 +314,13 @@ def run(self): if module_obj.port_num not in self.sfp_port_dict.keys(): logger.log_info("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states" .format(module_obj.port_num, self.sfp_port_dict.keys())) + self.deregister_fd_from_polling(module_obj.port_num) module_obj.reset_all_states() # put again module obj in sfp_port_dict so next loop will work on it self.sfp_port_dict[module_obj.port_num] = module_obj self.delete_ports_from_state_db_list.append(module_obj.port_num) except Exception as e: - logger.log_info("dynamic detection exception on read presence {} for port {} fd name {} traceback:\n{}" + logger.log_error("dynamic detection exception on read presence {} for port {} fd name {} traceback:\n{}" .format(e, module_obj.port_num, module_fd.name, traceback.format_exc())) self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_list) for port_num, module_sm_obj in self.sfp_port_dict.items(): @@ -406,10 +409,7 @@ def check_if_module_available(self, port, module_sm_obj, dynamic=False): val_int = int(val) module_sm_obj.module_power_good_fd_path = module_fd_indep_path module_sm_obj.module_power_good_fd = module_power_good_fd - # registering power good sysfs even if not good, so we can get an event from poller upon changes - self.poll_obj.register(module_sm_obj.module_power_good_fd, select.POLLERR | select.POLLPRI) - self.fds_mapping_to_obj[module_sm_obj.module_power_good_fd.fileno()] = { 'module_obj' : module_sm_obj - , 'fd':module_sm_obj.module_power_good_fd, 'fd_name' : 'power_good'} + if 0 == val_int: logger.log_info(f'port {port} power is not good') module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) @@ -541,15 +541,34 @@ def save_module_control_mode(self, port, module_sm_obj, dynamic=False): module_fd = open(module_fd_legacy_path, "r") module_sm_obj.set_module_fd(module_fd) logger.log_info("save_module_control_mode changed module fd to legacy present for port {}".format(port)) + else: + # registering power good sysfs even if not good, so we can get an event from poller upon changes + self.register_fd_for_polling(module_sm_obj, module_sm_obj.module_power_good_fd, 'power_good') # register the module's sysfs fd to poller with ERR and PRI attrs logger.log_info("save_module_control_mode registering sysfs fd {} number {} path {} for port {}" .format(module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) - self.poll_obj.register(module_sm_obj.module_fd, select.POLLERR | select.POLLPRI) - self.fds_mapping_to_obj[module_sm_obj.module_fd.fileno()] = { 'module_obj' : module_sm_obj - , 'fd': module_sm_obj.module_fd, 'fd_name' : 'presence' } + self.register_fd_for_polling(module_sm_obj, module_sm_obj.module_fd, 'presence') logger.log_info("save_module_control_mode set current state {} for port {} as final state {}".format( module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) + def register_fd_for_polling(self, module_sm_obj, fd, fd_name): + self.fds_mapping_to_obj[fd.fileno()] = {'module_obj' : module_sm_obj, + 'fd': fd, + 'fd_name' : fd_name} + if module_sm_obj.port_num not in self.port_to_fds: + self.port_to_fds[module_sm_obj.port_num] = [fd] + else: + self.port_to_fds[module_sm_obj.port_num].append(fd) + self.poll_obj.register(fd, select.POLLERR | select.POLLPRI) + + def deregister_fd_from_polling(self, port): + if port in self.port_to_fds: + fds = self.port_to_fds[port] + for fd in fds: + self.fds_mapping_to_obj.pop(fd) + self.poll_obj.unregister(fd) + self.port_to_fds.pop(port) + def timerTask(self): # wakes up every 1 second logger.log_info("timerTask entered run state") empty = False @@ -580,7 +599,7 @@ def timerTask(self): # wakes up every 1 second logger.log_info("timerTask module port {} adding to delete list to remove from waiting_modules_list".format(module.port_num)) port_list_to_delete.append(module.port_num) logger.log_info("timerTask deleting ports {} from waiting_modules_list...".format(port_list_to_delete)) - for port in port_list_to_delete: + for port in port_list_to_delete: logger.log_info("timerTask deleting port {} from waiting_modules_list".format(port)) self.waiting_modules_list.remove(port) logger.log_info("timerTask waiting_modules_list after deletion: {}".format(self.waiting_modules_list)) From f1ce61aaae6227a8c6e294fd54a16fda5babffe4 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Tue, 7 Nov 2023 11:27:53 +0200 Subject: [PATCH 19/26] Junchao: fix typo in modules_mgmt --- .../mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 73137e8a37d2..6b1ed43a2c11 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -565,7 +565,7 @@ def deregister_fd_from_polling(self, port): if port in self.port_to_fds: fds = self.port_to_fds[port] for fd in fds: - self.fds_mapping_to_obj.pop(fd) + self.fds_mapping_to_obj.pop(fd.fileno()) self.poll_obj.unregister(fd) self.port_to_fds.pop(port) From 100fd69d546c79f578353f3d09c0516c7851b771 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Tue, 7 Nov 2023 12:09:38 +0200 Subject: [PATCH 20/26] Junchao: workaround for dummy read issue --- .../sonic_platform/modules_mgmt.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 6b1ed43a2c11..7903b364b552 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -265,6 +265,7 @@ def run(self): while not self.main_thread_stop_event.is_set(): logger.log_info("dynamic detection running iteration {}".format(i)) # dummy read all sysfs fds before polling them due to linux kernel implementation of poll + """ if not dummy_read: fds_events = self.poll_obj.poll(1000) for fd_fileno, event in fds_events: @@ -291,6 +292,7 @@ def run(self): f"{module_obj.port_num} fd name {module_fd.name} " f"traceback:\n{traceback.format_exc()}") dummy_read = True + """ # poll for changes with 1 second timeout fds_events = self.poll_obj.poll(1000) logger.log_info("dynamic detection polled obj checking fds_events iteration {}".format(i)) @@ -308,6 +310,8 @@ def run(self): try: val = module_fd.read() module_fd.seek(0) + if self.is_dummy_event(int(val), module_obj): + continue logger.log_info("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" .format(module_obj, module_obj.port_num, fd, module_fd_path , self.fds_events_count_dict[module_obj.port_num])) @@ -374,6 +378,12 @@ def run(self): logger.log_info("port_num: {} module_sm_obj initial state: {} current_state: {} next_state: {}" .format(port_num, module_sm_obj.initial_state, module_sm_obj.get_current_state(), module_sm_obj.get_next_state())) + def is_dummy_event(self, val, module_sm_obj): + if val == 1: + return module_sm_obj.final_state in (STATE_HW_PRESENT, STATE_SW_CONTROL, STATE_FW_CONTROL) + elif val == 0: + return module_sm_obj.final_state in (STATE_HW_NOT_PRESENT,) + return False def check_if_hw_present(self, port, module_sm_obj, dynamic=False): logger.log_info("enter check_if_hw_present port {} module_sm_obj {}".format(port, module_sm_obj)) @@ -535,7 +545,7 @@ def save_module_control_mode(self, port, module_sm_obj, dynamic=False): utils.write_file(indep_fd_fw_control, "0") logger.log_info("save_module_control_mode set FW control for state {} port {}".format(state, port)) # update the presence sysfs fd to legacy FD presence, first close the previous fd - os.close(module_sm_obj.module_fd.fileno()) + module_sm_obj.module_fd.close() module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) module_sm_obj.set_module_fd_path(module_fd_legacy_path) module_fd = open(module_fd_legacy_path, "r") @@ -753,5 +763,5 @@ def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT, retries=1): self.final_state = '' self.wait_for_power_on = False self.eeprom_poweron_reset_retries = retries - os.close(self.module_fd.fileno()) - os.close(self.module_power_good_fd.fileno()) + self.module_fd.close() + self.module_power_good_fd.close() From 875e73eb7c66c17d7b612fd95aac7dfbf2371449 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Mon, 13 Nov 2023 10:45:33 +0000 Subject: [PATCH 21/26] fix log typo and dynamic issue, remove old dummy --- .../sonic_platform/modules_mgmt.py | 43 ++++--------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 7903b364b552..7bb37156d9c3 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -261,38 +261,8 @@ def run(self): module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] # for debug purposes self.fds_events_count_dict[module_obj.port_num] = { 'presence' : 0 , 'power_good' : 0 } - dummy_read = False while not self.main_thread_stop_event.is_set(): logger.log_info("dynamic detection running iteration {}".format(i)) - # dummy read all sysfs fds before polling them due to linux kernel implementation of poll - """ - if not dummy_read: - fds_events = self.poll_obj.poll(1000) - for fd_fileno, event in fds_events: - # dummy read present / hw_present / power_good sysfs - module_obj = self.fds_mapping_to_obj[fd_fileno]['module_obj'] - module_fd = self.fds_mapping_to_obj[fd_fileno]['fd'] - fd_name = self.fds_mapping_to_obj[fd_fileno]['fd_name'] - if 'presence' == fd_name: - module_fd_path = module_obj.module_fd_path - elif 'power_good' == fd_name: - module_fd_path = module_obj.module_power_good_fd_path - try: - logger.log_info("dynamic detection dummy reading from fd path {} for port {}" - .format(module_fd_path, module_obj.port_num)) - val = module_fd.read() - module_fd.seek(0) - val_int = None - if len(val) > 0: - val_int = int(val) - logger.log_info("dynamic detection dummy read presence {} int {} for port {} before polling" - .format(val, val_int, module_obj.port_num)) - except Exception as e: - logger.log_error(f"dynamic detection exception on dummy read presence {e} for port " - f"{module_obj.port_num} fd name {module_fd.name} " - f"traceback:\n{traceback.format_exc()}") - dummy_read = True - """ # poll for changes with 1 second timeout fds_events = self.poll_obj.poll(1000) logger.log_info("dynamic detection polled obj checking fds_events iteration {}".format(i)) @@ -310,23 +280,28 @@ def run(self): try: val = module_fd.read() module_fd.seek(0) - if self.is_dummy_event(int(val), module_obj): - continue logger.log_info("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" .format(module_obj, module_obj.port_num, fd, module_fd_path , self.fds_events_count_dict[module_obj.port_num])) + if self.is_dummy_event(int(val), module_obj): + logger.log_info(f"dynamic detection dummy event port {module_obj.port_num} from fd number {fd}") + continue if module_obj.port_num not in self.sfp_port_dict.keys(): logger.log_info("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states" .format(module_obj.port_num, self.sfp_port_dict.keys())) self.deregister_fd_from_polling(module_obj.port_num) - module_obj.reset_all_states() # put again module obj in sfp_port_dict so next loop will work on it self.sfp_port_dict[module_obj.port_num] = module_obj self.delete_ports_from_state_db_list.append(module_obj.port_num) except Exception as e: logger.log_error("dynamic detection exception on read presence {} for port {} fd name {} traceback:\n{}" .format(e, module_obj.port_num, module_fd.name, traceback.format_exc())) + for port in self.delete_ports_from_state_db_list: + logger.log_info(f"dynamic detection resetting all states for port {module_obj.port_num}") + module_obj = self.sfp_port_dict[port] + module_obj.reset_all_states() self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_list) + self.delete_ports_from_state_db_list = [] for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() logger.log_info(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') @@ -685,7 +660,7 @@ def delete_ports_from_dict(self, dynamic=False): for port in self.sfp_delete_list_from_port_dict: del self.sfp_port_dict[port] self.sfp_delete_list_from_port_dict = [] - logger.log_info("{} detection sfp_port_dict after deletion: {}".format(self.sfp_port_dict, detection_method)) + logger.log_info("{} detection sfp_port_dict after deletion: {}".format(detection_method, self.sfp_port_dict)) def send_changes_to_shared_queue(self, dynamic=False): detection_method = 'dynamic' if dynamic else 'static' From d43b6837da93bb612694bf7863e901ecf6e3067a Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Wed, 15 Nov 2023 21:08:42 +0000 Subject: [PATCH 22/26] add hw_present fd registration to poll for shutdown port --- .../sonic_platform/modules_mgmt.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 7bb37156d9c3..a4eb3b672ae7 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -94,6 +94,7 @@ def __init__(self, namespaces=None, main_thread_stop_event=None, q=None): self.fds_events_count_dict = {} self.delete_ports_from_state_db_list = [] self.setName("ModulesMgmtTask") + self.register_hw_present_fds = [] # SFPs state machine def get_sm_func(self, sm, port): @@ -280,9 +281,9 @@ def run(self): try: val = module_fd.read() module_fd.seek(0) - logger.log_info("dynamic detection got module_obj {} with port {} from fd number {} path {} count {}" + logger.log_info("dynamic detection got module_obj {} with port {} from fd number {} path {} val {} count {}" .format(module_obj, module_obj.port_num, fd, module_fd_path - , self.fds_events_count_dict[module_obj.port_num])) + , val, self.fds_events_count_dict[module_obj.port_num])) if self.is_dummy_event(int(val), module_obj): logger.log_info(f"dynamic detection dummy event port {module_obj.port_num} from fd number {fd}") continue @@ -297,7 +298,7 @@ def run(self): logger.log_error("dynamic detection exception on read presence {} for port {} fd name {} traceback:\n{}" .format(e, module_obj.port_num, module_fd.name, traceback.format_exc())) for port in self.delete_ports_from_state_db_list: - logger.log_info(f"dynamic detection resetting all states for port {module_obj.port_num}") + logger.log_info(f"dynamic detection resetting all states for port {port}") module_obj = self.sfp_port_dict[port] module_obj.reset_all_states() self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_list) @@ -347,6 +348,7 @@ def run(self): self.add_ports_state_to_state_db(dynamic=True) self.delete_ports_from_dict(dynamic=True) self.send_changes_to_shared_queue(dynamic=True) + self.register_hw_present_ports(True, self.register_hw_present_fds) i += 1 logger.log_info("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): @@ -613,6 +615,7 @@ def add_ports_state_to_state_db(self, dynamic=False): self.sfp_delete_list_from_port_dict.append(port) if final_state in [STATE_HW_NOT_PRESENT, STATE_POWER_LIMIT_ERROR, STATE_ERROR_HANDLER]: ctrl_type_db_value = '0' + self.register_hw_present_fds.append(module_obj) else: ctrl_type_db_value = '1' self.sfp_changes_dict[str(module_obj.port_num + 1)] = ctrl_type_db_value @@ -675,6 +678,17 @@ def send_changes_to_shared_queue(self, dynamic=False): else: logger.log_info(f"{detection_method} sfp_changes_dict {self.sfp_changes_dict} is empty...") + def register_hw_present_ports(self, dynamic=False, module_obj_list=[]): + detection_method = 'dynamic' if dynamic else 'static' + logger.log_info(f"{detection_method} detection enter register_presence_closed_ports") + for module_obj in module_obj_list: + port = module_obj.port_num + module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) + module_obj.set_module_fd_path(module_fd_indep_path) + module_fd = open(module_fd_indep_path, "r") + module_obj.set_module_fd(module_fd) + logger.log_info(f"{detection_method} registering fd {module_fd} fd name {module_fd.name} for port {port}") + self.register_fd_for_polling(module_obj, module_fd, 'presence') class ModuleStateMachine(object): From a52f918f073680cb001f7c02b4eeda586bdd8ad6 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Wed, 22 Nov 2023 10:09:32 +0000 Subject: [PATCH 23/26] fix plug out-in not detected when independent mode disabled and fix some issues --- .../sonic_platform/modules_mgmt.py | 74 +++++++++++-------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index a4eb3b672ae7..4419e6ca21c3 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -237,6 +237,7 @@ def run(self): self.add_ports_state_to_state_db() self.delete_ports_from_dict() self.send_changes_to_shared_queue() + self.register_presece_closed_ports(False, self.register_hw_present_fds) i += 1 logger.log_info("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): @@ -293,7 +294,8 @@ def run(self): self.deregister_fd_from_polling(module_obj.port_num) # put again module obj in sfp_port_dict so next loop will work on it self.sfp_port_dict[module_obj.port_num] = module_obj - self.delete_ports_from_state_db_list.append(module_obj.port_num) + if '0' == val: + self.delete_ports_from_state_db_list.append(module_obj.port_num) except Exception as e: logger.log_error("dynamic detection exception on read presence {} for port {} fd name {} traceback:\n{}" .format(e, module_obj.port_num, module_fd.name, traceback.format_exc())) @@ -348,7 +350,8 @@ def run(self): self.add_ports_state_to_state_db(dynamic=True) self.delete_ports_from_dict(dynamic=True) self.send_changes_to_shared_queue(dynamic=True) - self.register_hw_present_ports(True, self.register_hw_present_fds) + self.register_presece_closed_ports(True, self.register_hw_present_fds) + self.register_hw_present_fds = [] i += 1 logger.log_info("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): @@ -370,13 +373,16 @@ def check_if_hw_present(self, port, module_sm_obj, dynamic=False): val_int = utils.read_int_from_file(module_fd_indep_path) if 0 == val_int: logger.log_info("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val_int)) - module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) - return STATE_HW_NOT_PRESENT + retval_state = STATE_HW_NOT_PRESENT + module_sm_obj.set_final_state(retval_state) + return retval_state elif 1 == val_int: - if not self.is_supported_indep_mods_system: - module_sm_obj.set_final_state(STATE_HW_PRESENT) logger.log_info("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) - return STATE_HW_PRESENT + retval_state = STATE_HW_PRESENT + if not self.is_supported_indep_mods_system: + module_sm_obj.set_final_state(retval_state) + self.register_fd_for_polling(module_sm_obj, module_sm_obj.module_fd, 'presence') + return retval_state except Exception as e: logger.log_info("exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) module_sm_obj.set_final_state(STATE_ERROR_HANDLER) @@ -513,27 +519,33 @@ def check_power_cap(self, port, module_sm_obj, dynamic=False): return STATE_POWER_LIMIT_ERROR def save_module_control_mode(self, port, module_sm_obj, dynamic=False): - logger.log_info("save_module_control_mode setting current state {} for port {} as final state".format(module_sm_obj.get_current_state(), port)) + detection_method = 'dynamic' if dynamic else 'static' + logger.log_info("{} detection save_module_control_mode setting current state {} for port {} as final state" + .format(detection_method, module_sm_obj.get_current_state(), port)) state = module_sm_obj.get_current_state() module_sm_obj.set_final_state(state) - if state == STATE_FW_CONTROL: - # echo 0 > /sys/module/sx_core/$asic/$module/control - indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) - utils.write_file(indep_fd_fw_control, "0") - logger.log_info("save_module_control_mode set FW control for state {} port {}".format(state, port)) - # update the presence sysfs fd to legacy FD presence, first close the previous fd - module_sm_obj.module_fd.close() - module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) - module_sm_obj.set_module_fd_path(module_fd_legacy_path) - module_fd = open(module_fd_legacy_path, "r") - module_sm_obj.set_module_fd(module_fd) - logger.log_info("save_module_control_mode changed module fd to legacy present for port {}".format(port)) - else: - # registering power good sysfs even if not good, so we can get an event from poller upon changes - self.register_fd_for_polling(module_sm_obj, module_sm_obj.module_power_good_fd, 'power_good') - # register the module's sysfs fd to poller with ERR and PRI attrs - logger.log_info("save_module_control_mode registering sysfs fd {} number {} path {} for port {}" - .format(module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) + try: + if state == STATE_FW_CONTROL: + # echo 0 > /sys/module/sx_core/$asic/$module/control + indep_fd_fw_control = SYSFS_INDEPENDENT_FD_FW_CONTROL.format(port) + utils.write_file(indep_fd_fw_control, "0") + logger.log_info("save_module_control_mode set FW control for state {} port {}".format(state, port)) + # update the presence sysfs fd to legacy FD presence, first close the previous fd + module_sm_obj.module_fd.close() + module_fd_legacy_path = SYSFS_LEGACY_FD_PRESENCE.format(port) + module_sm_obj.set_module_fd_path(module_fd_legacy_path) + module_fd = open(module_fd_legacy_path, "r") + module_sm_obj.set_module_fd(module_fd) + logger.log_info("save_module_control_mode changed module fd to legacy present for port {}".format(port)) + else: + # registering power good sysfs even if not good, so we can get an event from poller upon changes + self.register_fd_for_polling(module_sm_obj, module_sm_obj.module_power_good_fd, 'power_good') + # register the module's sysfs fd to poller with ERR and PRI attrs + logger.log_info("save_module_control_mode registering sysfs fd {} number {} path {} for port {}" + .format(module_sm_obj.module_fd, module_sm_obj.module_fd.fileno(), module_sm_obj.set_module_fd_path, port)) + except Exception as e: + logger.log_error("{} detection exception on read presence {} for port {} fd name {} traceback:\n{}" + .format(detection_method, e, port, module_sm_obj.module_fd.name, traceback.format_exc())) self.register_fd_for_polling(module_sm_obj, module_sm_obj.module_fd, 'presence') logger.log_info("save_module_control_mode set current state {} for port {} as final state {}".format( module_sm_obj.get_current_state(), port, module_sm_obj.get_final_state())) @@ -678,12 +690,15 @@ def send_changes_to_shared_queue(self, dynamic=False): else: logger.log_info(f"{detection_method} sfp_changes_dict {self.sfp_changes_dict} is empty...") - def register_hw_present_ports(self, dynamic=False, module_obj_list=[]): + def register_presece_closed_ports(self, dynamic=False, module_obj_list=[]): detection_method = 'dynamic' if dynamic else 'static' logger.log_info(f"{detection_method} detection enter register_presence_closed_ports") for module_obj in module_obj_list: port = module_obj.port_num - module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) + if self.is_supported_indep_mods_system: + module_fd_indep_path = SYSFS_INDEPENDENT_FD_PRESENCE.format(port) + else: + module_fd_indep_path = SYSFS_LEGACY_FD_PRESENCE.format(port) module_obj.set_module_fd_path(module_fd_indep_path) module_fd = open(module_fd_indep_path, "r") module_obj.set_module_fd(module_fd) @@ -753,4 +768,5 @@ def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT, retries=1): self.wait_for_power_on = False self.eeprom_poweron_reset_retries = retries self.module_fd.close() - self.module_power_good_fd.close() + if self.module_power_good_fd: + self.module_power_good_fd.close() From b926a3f3905be824e7d98a15a7878e8951395d2d Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Sun, 26 Nov 2023 17:27:35 +0000 Subject: [PATCH 24/26] fix dynamic detection not resetting states upon cable plug out --- .../mlnx-platform-api/sonic_platform/modules_mgmt.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 4419e6ca21c3..161b387b2ed5 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -285,11 +285,14 @@ def run(self): logger.log_info("dynamic detection got module_obj {} with port {} from fd number {} path {} val {} count {}" .format(module_obj, module_obj.port_num, fd, module_fd_path , val, self.fds_events_count_dict[module_obj.port_num])) + # workaround for garbage received after the 0 or 1 value of sysfs i.e. 0#012 or 1#012 + if len(val) > 1: + val = val[0] if self.is_dummy_event(int(val), module_obj): logger.log_info(f"dynamic detection dummy event port {module_obj.port_num} from fd number {fd}") continue if module_obj.port_num not in self.sfp_port_dict.keys(): - logger.log_info("dynamic detection port {} not found in sfp_port_dict keys: {} resetting all states" + logger.log_info("dynamic detection port {} not found in sfp_port_dict keys: {} adding it" .format(module_obj.port_num, self.sfp_port_dict.keys())) self.deregister_fd_from_polling(module_obj.port_num) # put again module obj in sfp_port_dict so next loop will work on it From ccec845012378c3a513436cf3d12002c49bafdef Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Mon, 27 Nov 2023 17:32:04 +0000 Subject: [PATCH 25/26] fix dynamic detection cable plug in flow --- .../sonic_platform/modules_mgmt.py | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 161b387b2ed5..0ae92dd5e421 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -92,7 +92,7 @@ def __init__(self, namespaces=None, main_thread_stop_event=None, q=None): self.fds_mapping_to_obj = {} self.port_to_fds = {} self.fds_events_count_dict = {} - self.delete_ports_from_state_db_list = [] + self.delete_ports_from_state_db_dict = {} self.setName("ModulesMgmtTask") self.register_hw_present_fds = [] @@ -239,6 +239,7 @@ def run(self): self.send_changes_to_shared_queue() self.register_presece_closed_ports(False, self.register_hw_present_fds) i += 1 + self.register_hw_present_fds = [] logger.log_info("sfp_port_dict: {}".format(self.sfp_port_dict)) for port_num, module_sm_obj in self.sfp_port_dict.items(): logger.log_info("static detection port_num: {} initial state: {} current_state: {} next_state: {}" @@ -297,17 +298,16 @@ def run(self): self.deregister_fd_from_polling(module_obj.port_num) # put again module obj in sfp_port_dict so next loop will work on it self.sfp_port_dict[module_obj.port_num] = module_obj - if '0' == val: - self.delete_ports_from_state_db_list.append(module_obj.port_num) + self.delete_ports_from_state_db_dict[module_obj.port_num] = val except Exception as e: logger.log_error("dynamic detection exception on read presence {} for port {} fd name {} traceback:\n{}" .format(e, module_obj.port_num, module_fd.name, traceback.format_exc())) - for port in self.delete_ports_from_state_db_list: - logger.log_info(f"dynamic detection resetting all states for port {port}") + for port, val in self.delete_ports_from_state_db_dict.items(): + logger.log_info(f"dynamic detection resetting all states for port {port} close_presence_ports {val}") module_obj = self.sfp_port_dict[port] - module_obj.reset_all_states() - self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_list) - self.delete_ports_from_state_db_list = [] + module_obj.reset_all_states(close_presence_ports=val) + self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_dict.keys()) + self.delete_ports_from_state_db_dict = {} for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() logger.log_info(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') @@ -354,6 +354,9 @@ def run(self): self.delete_ports_from_dict(dynamic=True) self.send_changes_to_shared_queue(dynamic=True) self.register_presece_closed_ports(True, self.register_hw_present_fds) + if not self.sfp_port_dict and is_final_state_module: + is_final_state_module = False + logger.log_info(f"sft_port_dict is empty {self.sfp_port_dict}, set is_final_state_module to {is_final_state_module}") self.register_hw_present_fds = [] i += 1 logger.log_info("sfp_port_dict: {}".format(self.sfp_port_dict)) @@ -369,7 +372,8 @@ def is_dummy_event(self, val, module_sm_obj): return False def check_if_hw_present(self, port, module_sm_obj, dynamic=False): - logger.log_info("enter check_if_hw_present port {} module_sm_obj {}".format(port, module_sm_obj)) + detection_method = 'dynamic' if dynamic else 'static' + logger.log_info(f"{detection_method} detection enter check_if_hw_present port {port} module_sm_obj {module_sm_obj}") module_fd_indep_path = module_sm_obj.module_fd_path if os.path.isfile(module_fd_indep_path): try: @@ -377,20 +381,20 @@ def check_if_hw_present(self, port, module_sm_obj, dynamic=False): if 0 == val_int: logger.log_info("returning {} for val {}".format(STATE_HW_NOT_PRESENT, val_int)) retval_state = STATE_HW_NOT_PRESENT - module_sm_obj.set_final_state(retval_state) + module_sm_obj.set_final_state(retval_state, detection_method) return retval_state elif 1 == val_int: logger.log_info("returning {} for val {}".format(STATE_HW_PRESENT, val_int)) retval_state = STATE_HW_PRESENT if not self.is_supported_indep_mods_system: - module_sm_obj.set_final_state(retval_state) + module_sm_obj.set_final_state(retval_state, detection_method) self.register_fd_for_polling(module_sm_obj, module_sm_obj.module_fd, 'presence') return retval_state except Exception as e: logger.log_info("exception {} for port {} setting final state STATE_ERROR_HANDLER".format(e, port)) module_sm_obj.set_final_state(STATE_ERROR_HANDLER) return STATE_ERROR_HANDLER - module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT) + module_sm_obj.set_final_state(STATE_HW_NOT_PRESENT, detection_method) return STATE_HW_NOT_PRESENT def check_if_module_available(self, port, module_sm_obj, dynamic=False): @@ -623,6 +627,7 @@ def add_port_to_wait_reset(self, module_sm_obj): def add_ports_state_to_state_db(self, dynamic=False): state_db = None detection_method = 'dynamic' if dynamic else 'static' + logger.log_info(f"{detection_method} detection enter add_ports_state_to_state_db") for port, module_obj in self.sfp_port_dict.items(): final_state = module_obj.get_final_state() if final_state: @@ -630,6 +635,7 @@ def add_ports_state_to_state_db(self, dynamic=False): self.sfp_delete_list_from_port_dict.append(port) if final_state in [STATE_HW_NOT_PRESENT, STATE_POWER_LIMIT_ERROR, STATE_ERROR_HANDLER]: ctrl_type_db_value = '0' + logger.log_info(f"{detection_method} detection adding port {port} to register_hw_present_fds") self.register_hw_present_fds.append(module_obj) else: ctrl_type_db_value = '1' @@ -688,6 +694,7 @@ def send_changes_to_shared_queue(self, dynamic=False): try: self.modules_changes_queue.put(self.sfp_changes_dict, timeout=1) self.sfp_changes_dict = {} + logger.log_info(f"{detection_method} sfp_changes_dict after put changes: {self.sfp_changes_dict}") except queue.Full: logger.log_info(f"{detection_method} failed to put item from modules changes queue, queue is full") else: @@ -747,7 +754,8 @@ def set_next_state(self, state): def get_final_state(self): return self.final_state - def set_final_state(self, state): + def set_final_state(self, state, detection_method='static'): + logger.log_info(f"{detection_method} set_final_state setting {state} port {self.port_num}") self.final_state = state def advance_state(self): @@ -763,13 +771,14 @@ def set_module_fd_path(self, module_fd_path): def set_module_fd(self, module_fd): self.module_fd = module_fd - def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT, retries=1): + def reset_all_states(self, def_state=STATE_HW_NOT_PRESENT, retries=1, close_presence_ports='0'): self.initial_state = def_state self.current_state = def_state self.next_state = def_state self.final_state = '' self.wait_for_power_on = False self.eeprom_poweron_reset_retries = retries - self.module_fd.close() + if '0' == close_presence_ports: + self.module_fd.close() if self.module_power_good_fd: self.module_power_good_fd.close() From 4210b1b029beb26824278691bbe3459cc7a99a84 Mon Sep 17 00:00:00 2001 From: Doron Barashi Date: Mon, 4 Dec 2023 22:04:58 +0000 Subject: [PATCH 26/26] Update modules_mgmt thread to be a daemon and deleted the use of the new table added to Redis DB --- .../sonic_platform/chassis.py | 2 + .../sonic_platform/modules_mgmt.py | 63 ++++--------------- 2 files changed, 13 insertions(+), 52 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 459791442a46..47bbf622816a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -390,6 +390,8 @@ def get_change_event(self, timeout=0): # open new SFP change events thread self.modules_mgmt_thread = modules_mgmt.ModulesMgmtTask(q=self.modules_changes_queue , main_thread_stop_event = self.modules_mgmt_task_stopping_event) + # Set the thread as daemon so when pmon/xcvrd are shutting down, modules_mgmt will shut down immedietly. + self.modules_mgmt_thread.daemon = True self.modules_mgmt_thread.start() self.initialize_sfp() wait_for_ever = (timeout == 0) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py index 0ae92dd5e421..470b39acb3df 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/modules_mgmt.py @@ -67,8 +67,6 @@ SYSFS_INDEPENDENT_FD_FREQ_SUPPORT = os.path.join(SYSFS_INDEPENDENT_FD_PREFIX, "frequency_support") IS_INDEPENDENT_MODULE = 'is_independent_module' -STATE_DB_TABLE_NAME_PREFIX = 'TRANSCEIVER_MODULES_MGMT|{}' - MAX_EEPROM_ERROR_RESET_RETRIES = 4 class ModulesMgmtTask(threading.Thread): @@ -92,7 +90,7 @@ def __init__(self, namespaces=None, main_thread_stop_event=None, q=None): self.fds_mapping_to_obj = {} self.port_to_fds = {} self.fds_events_count_dict = {} - self.delete_ports_from_state_db_dict = {} + self.delete_ports_and_reset_states_dict = {} self.setName("ModulesMgmtTask") self.register_hw_present_fds = [] @@ -234,7 +232,7 @@ def run(self): self.modules_lock_list[port_num].release() if is_final_state_module: - self.add_ports_state_to_state_db() + self.map_ports_final_state() self.delete_ports_from_dict() self.send_changes_to_shared_queue() self.register_presece_closed_ports(False, self.register_hw_present_fds) @@ -298,16 +296,15 @@ def run(self): self.deregister_fd_from_polling(module_obj.port_num) # put again module obj in sfp_port_dict so next loop will work on it self.sfp_port_dict[module_obj.port_num] = module_obj - self.delete_ports_from_state_db_dict[module_obj.port_num] = val + self.delete_ports_and_reset_states_dict[module_obj.port_num] = val except Exception as e: logger.log_error("dynamic detection exception on read presence {} for port {} fd name {} traceback:\n{}" .format(e, module_obj.port_num, module_fd.name, traceback.format_exc())) - for port, val in self.delete_ports_from_state_db_dict.items(): + for port, val in self.delete_ports_and_reset_states_dict.items(): logger.log_info(f"dynamic detection resetting all states for port {port} close_presence_ports {val}") module_obj = self.sfp_port_dict[port] module_obj.reset_all_states(close_presence_ports=val) - self.delete_ports_state_from_state_db(self.delete_ports_from_state_db_dict.keys()) - self.delete_ports_from_state_db_dict = {} + self.delete_ports_and_reset_states_dict = {} for port_num, module_sm_obj in self.sfp_port_dict.items(): curr_state = module_sm_obj.get_current_state() logger.log_info(f'dynamic detection STATE_LOG {port_num}: curr_state is {curr_state}') @@ -350,7 +347,7 @@ def run(self): self.modules_lock_list[port_num].release() if is_final_state_module: - self.add_ports_state_to_state_db(dynamic=True) + self.map_ports_final_state(dynamic=True) self.delete_ports_from_dict(dynamic=True) self.send_changes_to_shared_queue(dynamic=True) self.register_presece_closed_ports(True, self.register_hw_present_fds) @@ -624,59 +621,21 @@ def add_port_to_wait_reset(self, module_sm_obj): self.waiting_modules_list.add(module_sm_obj.port_num) logger.log_info("add_port_to_wait_reset waiting_list after adding: {}".format(self.waiting_modules_list)) - def add_ports_state_to_state_db(self, dynamic=False): - state_db = None + def map_ports_final_state(self, dynamic=False): detection_method = 'dynamic' if dynamic else 'static' - logger.log_info(f"{detection_method} detection enter add_ports_state_to_state_db") + logger.log_info(f"{detection_method} detection enter map_ports_final_state") for port, module_obj in self.sfp_port_dict.items(): final_state = module_obj.get_final_state() if final_state: # add port to delete list that we will iterate on later and delete the ports from sfp_port_dict self.sfp_delete_list_from_port_dict.append(port) if final_state in [STATE_HW_NOT_PRESENT, STATE_POWER_LIMIT_ERROR, STATE_ERROR_HANDLER]: - ctrl_type_db_value = '0' + port_status = '0' logger.log_info(f"{detection_method} detection adding port {port} to register_hw_present_fds") self.register_hw_present_fds.append(module_obj) else: - ctrl_type_db_value = '1' - self.sfp_changes_dict[str(module_obj.port_num + 1)] = ctrl_type_db_value - if final_state in [STATE_SW_CONTROL, STATE_FW_CONTROL]: - namespaces = multi_asic.get_front_end_namespaces() - for namespace in namespaces: - logger.log_info(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") - state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - logger.log_info(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") - logger.log_info(f"{detection_method} detection got state_db for port {port} namespace {namespace}") - if state_db is not None: - logger.log_info( - f"{detection_method} detection connecting to state_db for port {port} namespace {namespace}") - state_db.connect(state_db.STATE_DB) - if final_state in [STATE_FW_CONTROL]: - control_type = 'FW_CONTROL' - elif final_state in [STATE_SW_CONTROL]: - control_type = 'SW_CONTROL' - table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) - logger.log_info(f"{detection_method} detection setting state_db table {table_name} for port {port}" - f" namespace {namespace} control_type {control_type}") - state_db.set(state_db.STATE_DB, table_name, "control_type", control_type) - - def delete_ports_state_from_state_db(self, ports, dynamic=True): - state_db = None - detection_method = 'dynamic' if dynamic else 'static' - for port in ports: - namespaces = multi_asic.get_front_end_namespaces() - for namespace in namespaces: - logger.log_info(f"{detection_method} detection getting state_db for port {port} namespace {namespace}") - state_db = SonicV2Connector(use_unix_socket_path=False, namespace=namespace) - logger.log_info(f"{detection_method} detection got state_db for port {port} namespace {namespace}") - if state_db is not None: - logger.log_info( - f"{detection_method} detection connecting to state_db for port {port} namespace {namespace}") - state_db.connect(state_db.STATE_DB) - table_name = STATE_DB_TABLE_NAME_PREFIX.format(port) - logger.log_info(f"{detection_method} detection deleting state_db table {table_name} " - f"for port {port} namespace {namespace}") - state_db.delete(state_db.STATE_DB, table_name) + port_status = '1' + self.sfp_changes_dict[str(module_obj.port_num + 1)] = port_status def delete_ports_from_dict(self, dynamic=False): detection_method = 'dynamic' if dynamic else 'static'