From b3e7152c648b111070c144a01ce482ec7f3f593c Mon Sep 17 00:00:00 2001 From: Jonathan Toppins Date: Wed, 2 Oct 2019 18:23:34 -0400 Subject: [PATCH 79/96] [netdrv] bnxt_en: Add FW fatal devlink_health_reporter Message-id: Patchwork-id: 276494 O-Subject: [RHEL-8.2 PATCH 72/78] bnxt_en: Add FW fatal devlink_health_reporter. Bugzilla: 1724766 RH-Acked-by: John Linville RH-Acked-by: Jarod Wilson Health show command example and output: $ devlink health show pci/0000:af:00.0 reporter fw_fatal pci/0000:af:00.0: name fw_fatal state healthy error 1 recover 1 grace_period 0 auto_recover true Fatal events from firmware or missing periodic heartbeats will be reported and recovery will be handled. We also turn on the support flags when we register with the firmware to enable this health and recovery feature in the firmware. Cc: Jiri Pirko Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller (cherry picked from commit acfb50e4e773c9a5755a3c265c7c20d37a8642e5) Bugzilla: 1724766 Build Info: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=23809532 Tested: build, boot, basic ping Signed-off-by: Jonathan Toppins Signed-off-by: Bruno Meneguele --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 80 ++++++++++++++++++++++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 7 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 56 ++++++++++++++++ 3 files changed, 141 insertions(+), 2 deletions(-) Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt.c =================================================================== --- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt.c 2020-02-06 16:23:20.864465843 +0100 +++ src/drivers/net/ethernet/broadcom/bnxt/bnxt.c 2020-02-06 16:23:21.000464594 +0100 @@ -1990,7 +1990,9 @@ goto async_event_process_exit; set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event); break; - case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: + case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: { + u32 data1 = le32_to_cpu(cmpl->event_data1); + bp->fw_reset_timestamp = jiffies; bp->fw_reset_min_dsecs = cmpl->timestamp_lo; if (!bp->fw_reset_min_dsecs) @@ -1998,8 +2000,16 @@ bp->fw_reset_max_dsecs = le16_to_cpu(cmpl->timestamp_hi); if (!bp->fw_reset_max_dsecs) bp->fw_reset_max_dsecs = BNXT_DFLT_FW_RST_MAX_DSECS; + if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) { + netdev_warn(bp->dev, "Firmware fatal reset event received\n"); + set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state); + } else { + netdev_warn(bp->dev, "Firmware non-fatal reset event received, max wait time %d msec\n", + bp->fw_reset_max_dsecs * 100); + } set_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event); break; + } case ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY: { struct bnxt_fw_health *fw_health = bp->fw_health; u32 data1 = le32_to_cpu(cmpl->event_data1); @@ -4419,6 +4429,7 @@ { struct hwrm_func_drv_rgtr_output *resp = bp->hwrm_cmd_resp_addr; struct hwrm_func_drv_rgtr_input req = {0}; + u32 flags; int rc; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1); @@ -4428,7 +4439,11 @@ FUNC_DRV_RGTR_REQ_ENABLES_VER); req.os_type = cpu_to_le16(FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX); - req.flags = cpu_to_le32(FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE); + flags = FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE | + FUNC_DRV_RGTR_REQ_FLAGS_HOT_RESET_SUPPORT; + if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY) + flags |= FUNC_DRV_RGTR_REQ_FLAGS_ERROR_RECOVERY_SUPPORT; + req.flags = cpu_to_le32(flags); req.ver_maj_8b = DRV_VER_MAJ; req.ver_min_8b = DRV_VER_MIN; req.ver_upd_8b = DRV_VER_UPD; @@ -9931,6 +9946,38 @@ bnxt_queue_sp_work(bp); } +static void bnxt_fw_health_check(struct bnxt *bp) +{ + struct bnxt_fw_health *fw_health = bp->fw_health; + u32 val; + + if (!fw_health || !fw_health->enabled || + test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) + return; + + if (fw_health->tmr_counter) { + fw_health->tmr_counter--; + return; + } + + val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG); + if (val == fw_health->last_fw_heartbeat) + goto fw_reset; + + fw_health->last_fw_heartbeat = val; + + val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); + if (val != fw_health->last_fw_reset_cnt) + goto fw_reset; + + fw_health->tmr_counter = fw_health->tmr_multiplier; + return; + +fw_reset: + set_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); +} + static void bnxt_timer(struct timer_list *t) { struct bnxt *bp = from_timer(bp, t, timer); @@ -9942,6 +9989,9 @@ if (atomic_read(&bp->intr_sem) != 0) goto bnxt_restart_timer; + if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY) + bnxt_fw_health_check(bp); + if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS) && bp->stats_coal_ticks) { set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event); @@ -10008,6 +10058,26 @@ bp->ctx = NULL; } +static bool is_bnxt_fw_ok(struct bnxt *bp) +{ + struct bnxt_fw_health *fw_health = bp->fw_health; + bool no_heartbeat = false, has_reset = false; + u32 val; + + val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG); + if (val == fw_health->last_fw_heartbeat) + no_heartbeat = true; + + val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); + if (val != fw_health->last_fw_reset_cnt) + has_reset = true; + + if (!no_heartbeat && has_reset) + return true; + + return false; +} + /* rtnl_lock is acquired before calling this function */ static void bnxt_force_fw_reset(struct bnxt *bp) { @@ -10212,6 +10282,12 @@ if (test_and_clear_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event)) bnxt_devlink_health_report(bp, BNXT_FW_RESET_NOTIFY_SP_EVENT); + if (test_and_clear_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event)) { + if (!is_bnxt_fw_ok(bp)) + bnxt_devlink_health_report(bp, + BNXT_FW_EXCEPTION_SP_EVENT); + } + smp_mb__before_atomic(); clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state); } Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt.h =================================================================== --- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt.h 2020-02-06 16:23:20.864465843 +0100 +++ src/drivers/net/ethernet/broadcom/bnxt/bnxt.h 2020-02-06 16:23:21.001464585 +0100 @@ -472,6 +472,11 @@ ((le32_to_cpu((rx_tpa_end_ext)->rx_tpa_end_cmp_dup_acks) & \ RX_TPA_END_CMP_AGG_BUFS_P5) >> RX_TPA_END_CMP_AGG_BUFS_SHIFT_P5) +#define EVENT_DATA1_RESET_NOTIFY_FATAL(data1) \ + (((data1) & \ + ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_MASK) ==\ + ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL) + #define EVENT_DATA1_RECOVERY_MASTER_FUNC(data1) \ !!((data1) & \ ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_MASTER_FUNC) @@ -1372,6 +1377,7 @@ u32 fw_reset_seq_delay_msec[16]; struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_reset_reporter; + struct devlink_health_reporter *fw_fatal_reporter; }; struct bnxt_fw_reporter_ctx { @@ -1728,6 +1734,7 @@ #define BNXT_UPDATE_PHY_SP_EVENT 16 #define BNXT_RING_COAL_NOW_SP_EVENT 17 #define BNXT_FW_RESET_NOTIFY_SP_EVENT 18 +#define BNXT_FW_EXCEPTION_SP_EVENT 19 struct delayed_work fw_reset_task; int fw_reset_state; Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c =================================================================== --- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 2020-02-06 16:23:20.308470946 +0100 +++ src/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 2020-02-06 16:23:21.001464585 +0100 @@ -83,6 +83,31 @@ .recover = bnxt_fw_reset_recover, }; +static int bnxt_fw_fatal_recover(struct devlink_health_reporter *reporter, + void *priv_ctx) +{ + struct bnxt *bp = devlink_health_reporter_priv(reporter); + struct bnxt_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; + unsigned long event; + + if (!priv_ctx) + return -EOPNOTSUPP; + + event = fw_reporter_ctx->sp_event; + if (event == BNXT_FW_RESET_NOTIFY_SP_EVENT) + bnxt_fw_reset(bp); + else if (event == BNXT_FW_EXCEPTION_SP_EVENT) + bnxt_fw_exception(bp); + + return 0; +} + +static const +struct devlink_health_reporter_ops bnxt_dl_fw_fatal_reporter_ops = { + .name = "fw_fatal", + .recover = bnxt_fw_fatal_recover, +}; + static void bnxt_dl_fw_reporters_create(struct bnxt *bp) { struct bnxt_fw_health *health = bp->fw_health; @@ -108,6 +133,16 @@ PTR_ERR(health->fw_reset_reporter)); health->fw_reset_reporter = NULL; } + + health->fw_fatal_reporter = + devlink_health_reporter_create(bp->dl, + &bnxt_dl_fw_fatal_reporter_ops, + 0, true, bp); + if (IS_ERR(health->fw_fatal_reporter)) { + netdev_warn(bp->dev, "Failed to create FW fatal health reporter, rc = %ld\n", + PTR_ERR(health->fw_fatal_reporter)); + health->fw_fatal_reporter = NULL; + } } static void bnxt_dl_fw_reporters_destroy(struct bnxt *bp) @@ -122,6 +157,9 @@ if (health->fw_reset_reporter) devlink_health_reporter_destroy(health->fw_reset_reporter); + + if (health->fw_fatal_reporter) + devlink_health_reporter_destroy(health->fw_fatal_reporter); } void bnxt_devlink_health_report(struct bnxt *bp, unsigned long event) @@ -135,6 +173,15 @@ fw_reporter_ctx.sp_event = event; switch (event) { case BNXT_FW_RESET_NOTIFY_SP_EVENT: + if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { + if (!fw_health->fw_fatal_reporter) + return; + + devlink_health_report(fw_health->fw_fatal_reporter, + "FW fatal async event received", + &fw_reporter_ctx); + return; + } if (!fw_health->fw_reset_reporter) return; @@ -142,6 +189,15 @@ "FW non-fatal reset event received", &fw_reporter_ctx); return; + + case BNXT_FW_EXCEPTION_SP_EVENT: + if (!fw_health->fw_fatal_reporter) + return; + + devlink_health_report(fw_health->fw_fatal_reporter, + "FW fatal error reported", + &fw_reporter_ctx); + return; } }