You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
312 lines
10 KiB
312 lines
10 KiB
5 years ago
|
From b3e7152c648b111070c144a01ce482ec7f3f593c Mon Sep 17 00:00:00 2001
|
||
|
From: Jonathan Toppins <jtoppins@redhat.com>
|
||
|
Date: Wed, 2 Oct 2019 18:23:34 -0400
|
||
|
Subject: [PATCH 79/96] [netdrv] bnxt_en: Add FW fatal devlink_health_reporter
|
||
|
|
||
|
Message-id: <f7f97c323916640b6204ae069cfe0aaf36db26da.1570027456.git.jtoppins@redhat.com>
|
||
|
Patchwork-id: 276494
|
||
|
O-Subject: [RHEL-8.2 PATCH 72/78] bnxt_en: Add FW fatal devlink_health_reporter.
|
||
|
Bugzilla: 1724766
|
||
|
RH-Acked-by: John Linville <linville@redhat.com>
|
||
|
RH-Acked-by: Jarod Wilson <jarod@redhat.com>
|
||
|
|
||
|
Health show command example and output:
|
||
|
|
||
|
$ devlink health show pci/0000:af:00.0 reporter fw_fatal
|
||
|
|
||
|
pci/0000:af:00.0:
|
||
|
name fw_fatal
|
||
|
state healthy error 1 recover 1 grace_period 0 auto_recover true
|
||
|
|
||
|
Fatal events from firmware or missing periodic heartbeats will
|
||
|
be reported and recovery will be handled.
|
||
|
|
||
|
We also turn on the support flags when we register with the firmware to
|
||
|
enable this health and recovery feature in the firmware.
|
||
|
|
||
|
Cc: Jiri Pirko <jiri@mellanox.com>
|
||
|
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
|
||
|
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
|
||
|
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||
|
(cherry picked from commit acfb50e4e773c9a5755a3c265c7c20d37a8642e5)
|
||
|
Bugzilla: 1724766
|
||
|
Build Info: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=23809532
|
||
|
Tested: build, boot, basic ping
|
||
|
Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
|
||
|
Signed-off-by: Bruno Meneguele <bmeneg@redhat.com>
|
||
|
---
|
||
|
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 80 ++++++++++++++++++++++-
|
||
|
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 7 ++
|
||
|
drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 56 ++++++++++++++++
|
||
|
3 files changed, 141 insertions(+), 2 deletions(-)
|
||
|
|
||
|
Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt.c
|
||
|
===================================================================
|
||
|
--- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt.c 2020-02-06 16:23:20.864465843 +0100
|
||
|
+++ src/drivers/net/ethernet/broadcom/bnxt/bnxt.c 2020-02-06 16:23:21.000464594 +0100
|
||
|
@@ -1990,7 +1990,9 @@
|
||
|
goto async_event_process_exit;
|
||
|
set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event);
|
||
|
break;
|
||
|
- case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY:
|
||
|
+ case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: {
|
||
|
+ u32 data1 = le32_to_cpu(cmpl->event_data1);
|
||
|
+
|
||
|
bp->fw_reset_timestamp = jiffies;
|
||
|
bp->fw_reset_min_dsecs = cmpl->timestamp_lo;
|
||
|
if (!bp->fw_reset_min_dsecs)
|
||
|
@@ -1998,8 +2000,16 @@
|
||
|
bp->fw_reset_max_dsecs = le16_to_cpu(cmpl->timestamp_hi);
|
||
|
if (!bp->fw_reset_max_dsecs)
|
||
|
bp->fw_reset_max_dsecs = BNXT_DFLT_FW_RST_MAX_DSECS;
|
||
|
+ if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) {
|
||
|
+ netdev_warn(bp->dev, "Firmware fatal reset event received\n");
|
||
|
+ set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state);
|
||
|
+ } else {
|
||
|
+ netdev_warn(bp->dev, "Firmware non-fatal reset event received, max wait time %d msec\n",
|
||
|
+ bp->fw_reset_max_dsecs * 100);
|
||
|
+ }
|
||
|
set_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event);
|
||
|
break;
|
||
|
+ }
|
||
|
case ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY: {
|
||
|
struct bnxt_fw_health *fw_health = bp->fw_health;
|
||
|
u32 data1 = le32_to_cpu(cmpl->event_data1);
|
||
|
@@ -4419,6 +4429,7 @@
|
||
|
{
|
||
|
struct hwrm_func_drv_rgtr_output *resp = bp->hwrm_cmd_resp_addr;
|
||
|
struct hwrm_func_drv_rgtr_input req = {0};
|
||
|
+ u32 flags;
|
||
|
int rc;
|
||
|
|
||
|
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
|
||
|
@@ -4428,7 +4439,11 @@
|
||
|
FUNC_DRV_RGTR_REQ_ENABLES_VER);
|
||
|
|
||
|
req.os_type = cpu_to_le16(FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX);
|
||
|
- req.flags = cpu_to_le32(FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE);
|
||
|
+ flags = FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE |
|
||
|
+ FUNC_DRV_RGTR_REQ_FLAGS_HOT_RESET_SUPPORT;
|
||
|
+ if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY)
|
||
|
+ flags |= FUNC_DRV_RGTR_REQ_FLAGS_ERROR_RECOVERY_SUPPORT;
|
||
|
+ req.flags = cpu_to_le32(flags);
|
||
|
req.ver_maj_8b = DRV_VER_MAJ;
|
||
|
req.ver_min_8b = DRV_VER_MIN;
|
||
|
req.ver_upd_8b = DRV_VER_UPD;
|
||
|
@@ -9931,6 +9946,38 @@
|
||
|
bnxt_queue_sp_work(bp);
|
||
|
}
|
||
|
|
||
|
+static void bnxt_fw_health_check(struct bnxt *bp)
|
||
|
+{
|
||
|
+ struct bnxt_fw_health *fw_health = bp->fw_health;
|
||
|
+ u32 val;
|
||
|
+
|
||
|
+ if (!fw_health || !fw_health->enabled ||
|
||
|
+ test_bit(BNXT_STATE_IN_FW_RESET, &bp->state))
|
||
|
+ return;
|
||
|
+
|
||
|
+ if (fw_health->tmr_counter) {
|
||
|
+ fw_health->tmr_counter--;
|
||
|
+ return;
|
||
|
+ }
|
||
|
+
|
||
|
+ val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
|
||
|
+ if (val == fw_health->last_fw_heartbeat)
|
||
|
+ goto fw_reset;
|
||
|
+
|
||
|
+ fw_health->last_fw_heartbeat = val;
|
||
|
+
|
||
|
+ val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
|
||
|
+ if (val != fw_health->last_fw_reset_cnt)
|
||
|
+ goto fw_reset;
|
||
|
+
|
||
|
+ fw_health->tmr_counter = fw_health->tmr_multiplier;
|
||
|
+ return;
|
||
|
+
|
||
|
+fw_reset:
|
||
|
+ set_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event);
|
||
|
+ bnxt_queue_sp_work(bp);
|
||
|
+}
|
||
|
+
|
||
|
static void bnxt_timer(struct timer_list *t)
|
||
|
{
|
||
|
struct bnxt *bp = from_timer(bp, t, timer);
|
||
|
@@ -9942,6 +9989,9 @@
|
||
|
if (atomic_read(&bp->intr_sem) != 0)
|
||
|
goto bnxt_restart_timer;
|
||
|
|
||
|
+ if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY)
|
||
|
+ bnxt_fw_health_check(bp);
|
||
|
+
|
||
|
if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS) &&
|
||
|
bp->stats_coal_ticks) {
|
||
|
set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event);
|
||
|
@@ -10008,6 +10058,26 @@
|
||
|
bp->ctx = NULL;
|
||
|
}
|
||
|
|
||
|
+static bool is_bnxt_fw_ok(struct bnxt *bp)
|
||
|
+{
|
||
|
+ struct bnxt_fw_health *fw_health = bp->fw_health;
|
||
|
+ bool no_heartbeat = false, has_reset = false;
|
||
|
+ u32 val;
|
||
|
+
|
||
|
+ val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
|
||
|
+ if (val == fw_health->last_fw_heartbeat)
|
||
|
+ no_heartbeat = true;
|
||
|
+
|
||
|
+ val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
|
||
|
+ if (val != fw_health->last_fw_reset_cnt)
|
||
|
+ has_reset = true;
|
||
|
+
|
||
|
+ if (!no_heartbeat && has_reset)
|
||
|
+ return true;
|
||
|
+
|
||
|
+ return false;
|
||
|
+}
|
||
|
+
|
||
|
/* rtnl_lock is acquired before calling this function */
|
||
|
static void bnxt_force_fw_reset(struct bnxt *bp)
|
||
|
{
|
||
|
@@ -10212,6 +10282,12 @@
|
||
|
if (test_and_clear_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event))
|
||
|
bnxt_devlink_health_report(bp, BNXT_FW_RESET_NOTIFY_SP_EVENT);
|
||
|
|
||
|
+ if (test_and_clear_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event)) {
|
||
|
+ if (!is_bnxt_fw_ok(bp))
|
||
|
+ bnxt_devlink_health_report(bp,
|
||
|
+ BNXT_FW_EXCEPTION_SP_EVENT);
|
||
|
+ }
|
||
|
+
|
||
|
smp_mb__before_atomic();
|
||
|
clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
|
||
|
}
|
||
|
Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt.h
|
||
|
===================================================================
|
||
|
--- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt.h 2020-02-06 16:23:20.864465843 +0100
|
||
|
+++ src/drivers/net/ethernet/broadcom/bnxt/bnxt.h 2020-02-06 16:23:21.001464585 +0100
|
||
|
@@ -472,6 +472,11 @@
|
||
|
((le32_to_cpu((rx_tpa_end_ext)->rx_tpa_end_cmp_dup_acks) & \
|
||
|
RX_TPA_END_CMP_AGG_BUFS_P5) >> RX_TPA_END_CMP_AGG_BUFS_SHIFT_P5)
|
||
|
|
||
|
+#define EVENT_DATA1_RESET_NOTIFY_FATAL(data1) \
|
||
|
+ (((data1) & \
|
||
|
+ ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_MASK) ==\
|
||
|
+ ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL)
|
||
|
+
|
||
|
#define EVENT_DATA1_RECOVERY_MASTER_FUNC(data1) \
|
||
|
!!((data1) & \
|
||
|
ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_MASTER_FUNC)
|
||
|
@@ -1372,6 +1377,7 @@
|
||
|
u32 fw_reset_seq_delay_msec[16];
|
||
|
struct devlink_health_reporter *fw_reporter;
|
||
|
struct devlink_health_reporter *fw_reset_reporter;
|
||
|
+ struct devlink_health_reporter *fw_fatal_reporter;
|
||
|
};
|
||
|
|
||
|
struct bnxt_fw_reporter_ctx {
|
||
|
@@ -1728,6 +1734,7 @@
|
||
|
#define BNXT_UPDATE_PHY_SP_EVENT 16
|
||
|
#define BNXT_RING_COAL_NOW_SP_EVENT 17
|
||
|
#define BNXT_FW_RESET_NOTIFY_SP_EVENT 18
|
||
|
+#define BNXT_FW_EXCEPTION_SP_EVENT 19
|
||
|
|
||
|
struct delayed_work fw_reset_task;
|
||
|
int fw_reset_state;
|
||
|
Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
|
||
|
===================================================================
|
||
|
--- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 2020-02-06 16:23:20.308470946 +0100
|
||
|
+++ src/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 2020-02-06 16:23:21.001464585 +0100
|
||
|
@@ -83,6 +83,31 @@
|
||
|
.recover = bnxt_fw_reset_recover,
|
||
|
};
|
||
|
|
||
|
+static int bnxt_fw_fatal_recover(struct devlink_health_reporter *reporter,
|
||
|
+ void *priv_ctx)
|
||
|
+{
|
||
|
+ struct bnxt *bp = devlink_health_reporter_priv(reporter);
|
||
|
+ struct bnxt_fw_reporter_ctx *fw_reporter_ctx = priv_ctx;
|
||
|
+ unsigned long event;
|
||
|
+
|
||
|
+ if (!priv_ctx)
|
||
|
+ return -EOPNOTSUPP;
|
||
|
+
|
||
|
+ event = fw_reporter_ctx->sp_event;
|
||
|
+ if (event == BNXT_FW_RESET_NOTIFY_SP_EVENT)
|
||
|
+ bnxt_fw_reset(bp);
|
||
|
+ else if (event == BNXT_FW_EXCEPTION_SP_EVENT)
|
||
|
+ bnxt_fw_exception(bp);
|
||
|
+
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+
|
||
|
+static const
|
||
|
+struct devlink_health_reporter_ops bnxt_dl_fw_fatal_reporter_ops = {
|
||
|
+ .name = "fw_fatal",
|
||
|
+ .recover = bnxt_fw_fatal_recover,
|
||
|
+};
|
||
|
+
|
||
|
static void bnxt_dl_fw_reporters_create(struct bnxt *bp)
|
||
|
{
|
||
|
struct bnxt_fw_health *health = bp->fw_health;
|
||
|
@@ -108,6 +133,16 @@
|
||
|
PTR_ERR(health->fw_reset_reporter));
|
||
|
health->fw_reset_reporter = NULL;
|
||
|
}
|
||
|
+
|
||
|
+ health->fw_fatal_reporter =
|
||
|
+ devlink_health_reporter_create(bp->dl,
|
||
|
+ &bnxt_dl_fw_fatal_reporter_ops,
|
||
|
+ 0, true, bp);
|
||
|
+ if (IS_ERR(health->fw_fatal_reporter)) {
|
||
|
+ netdev_warn(bp->dev, "Failed to create FW fatal health reporter, rc = %ld\n",
|
||
|
+ PTR_ERR(health->fw_fatal_reporter));
|
||
|
+ health->fw_fatal_reporter = NULL;
|
||
|
+ }
|
||
|
}
|
||
|
|
||
|
static void bnxt_dl_fw_reporters_destroy(struct bnxt *bp)
|
||
|
@@ -122,6 +157,9 @@
|
||
|
|
||
|
if (health->fw_reset_reporter)
|
||
|
devlink_health_reporter_destroy(health->fw_reset_reporter);
|
||
|
+
|
||
|
+ if (health->fw_fatal_reporter)
|
||
|
+ devlink_health_reporter_destroy(health->fw_fatal_reporter);
|
||
|
}
|
||
|
|
||
|
void bnxt_devlink_health_report(struct bnxt *bp, unsigned long event)
|
||
|
@@ -135,6 +173,15 @@
|
||
|
fw_reporter_ctx.sp_event = event;
|
||
|
switch (event) {
|
||
|
case BNXT_FW_RESET_NOTIFY_SP_EVENT:
|
||
|
+ if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) {
|
||
|
+ if (!fw_health->fw_fatal_reporter)
|
||
|
+ return;
|
||
|
+
|
||
|
+ devlink_health_report(fw_health->fw_fatal_reporter,
|
||
|
+ "FW fatal async event received",
|
||
|
+ &fw_reporter_ctx);
|
||
|
+ return;
|
||
|
+ }
|
||
|
if (!fw_health->fw_reset_reporter)
|
||
|
return;
|
||
|
|
||
|
@@ -142,6 +189,15 @@
|
||
|
"FW non-fatal reset event received",
|
||
|
&fw_reporter_ctx);
|
||
|
return;
|
||
|
+
|
||
|
+ case BNXT_FW_EXCEPTION_SP_EVENT:
|
||
|
+ if (!fw_health->fw_fatal_reporter)
|
||
|
+ return;
|
||
|
+
|
||
|
+ devlink_health_report(fw_health->fw_fatal_reporter,
|
||
|
+ "FW fatal error reported",
|
||
|
+ &fw_reporter_ctx);
|
||
|
+ return;
|
||
|
}
|
||
|
}
|
||
|
|