You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
361 lines
13 KiB
361 lines
13 KiB
From beae62dd1772b395964f8e73f82c202f1ad346d9 Mon Sep 17 00:00:00 2001
|
|
From: Alaa Hleihel <ahleihel@redhat.com>
|
|
Date: Sun, 10 May 2020 14:51:54 -0400
|
|
Subject: [PATCH 030/312] [netdrv] net/mlx5e: Report and recover from CQE error
|
|
on ICOSQ
|
|
|
|
Message-id: <20200510145245.10054-32-ahleihel@redhat.com>
|
|
Patchwork-id: 306571
|
|
Patchwork-instance: patchwork
|
|
O-Subject: [RHEL8.3 BZ 1789378 v2 31/82] net/mlx5e: Report and recover from CQE error on ICOSQ
|
|
Bugzilla: 1790198 1789378
|
|
RH-Acked-by: Kamal Heib <kheib@redhat.com>
|
|
RH-Acked-by: Jarod Wilson <jarod@redhat.com>
|
|
RH-Acked-by: Tony Camuso <tcamuso@redhat.com>
|
|
RH-Acked-by: Jonathan Toppins <jtoppins@redhat.com>
|
|
|
|
Bugzilla: http://bugzilla.redhat.com/1789378
|
|
Bugzilla: http://bugzilla.redhat.com/1790198
|
|
Upstream: v5.4-rc1
|
|
Conflicts:
|
|
- drivers/net/ethernet/mellanox/mlx5/core/en_main.c
|
|
- drivers/net/ethernet/mellanox/mlx5/core/en.h
|
|
Dropped hunks that were previously applied for fixing incremental build.
|
|
|
|
- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
|
|
Adapt mlx5e_rx_reporter_recover parameters to current API due to already
|
|
backported commit:
|
|
e7a981050a7f ("devlink: propagate extack down to health reporter ops")
|
|
---> .recover callback now expects to get extack as well.
|
|
|
|
commit be5323c8379f488f1de53206edeaf80fc20d7686
|
|
Author: Aya Levin <ayal@mellanox.com>
|
|
Date: Tue Jun 25 17:44:28 2019 +0300
|
|
|
|
net/mlx5e: Report and recover from CQE error on ICOSQ
|
|
|
|
Add support for report and recovery from error on completion on ICOSQ.
|
|
Deactivate RQ and flush, then deactivate ICOSQ. Set the queue back to
|
|
ready state (firmware) and reset the ICOSQ and the RQ (software
|
|
resources). Finally, activate the ICOSQ and the RQ.
|
|
|
|
Signed-off-by: Aya Levin <ayal@mellanox.com>
|
|
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
|
|
Acked-by: Jiri Pirko <jiri@mellanox.com>
|
|
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
|
|
|
|
Signed-off-by: Alaa Hleihel <ahleihel@redhat.com>
|
|
Signed-off-by: Frantisek Hrbata <fhrbata@redhat.com>
|
|
---
|
|
drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 ++
|
|
.../net/ethernet/mellanox/mlx5/core/en/health.h | 1 +
|
|
.../ethernet/mellanox/mlx5/core/en/reporter_rx.c | 110 ++++++++++++++++++++-
|
|
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 18 +++-
|
|
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +
|
|
drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 3 +
|
|
drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 2 +
|
|
7 files changed, 137 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
|
|
index 21926cb209f9..f0ba350579ae 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
|
|
@@ -559,6 +559,8 @@ struct mlx5e_icosq {
|
|
/* control path */
|
|
struct mlx5_wq_ctrl wq_ctrl;
|
|
struct mlx5e_channel *channel;
|
|
+
|
|
+ struct work_struct recover_work;
|
|
} ____cacheline_aligned_in_smp;
|
|
|
|
struct mlx5e_wqe_frag_info {
|
|
@@ -1037,6 +1039,10 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
|
|
void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
|
|
void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
|
|
struct mlx5e_params *params);
|
|
+int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state);
|
|
+void mlx5e_activate_rq(struct mlx5e_rq *rq);
|
|
+void mlx5e_deactivate_rq(struct mlx5e_rq *rq);
|
|
+void mlx5e_free_rx_descs(struct mlx5e_rq *rq);
|
|
void mlx5e_activate_icosq(struct mlx5e_icosq *icosq);
|
|
void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq);
|
|
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
|
|
index a751c5316baf..8acd9dc520cf 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
|
|
@@ -18,6 +18,7 @@ int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg);
|
|
|
|
int mlx5e_reporter_rx_create(struct mlx5e_priv *priv);
|
|
void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv);
|
|
+void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq);
|
|
|
|
#define MLX5E_REPORTER_PER_Q_MAX_LEN 256
|
|
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
|
|
index 7cd767f0b8c7..661de567ca6c 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
|
|
@@ -27,6 +27,110 @@ static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state)
|
|
return err;
|
|
}
|
|
|
|
+static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq)
|
|
+{
|
|
+ unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
|
|
+
|
|
+ while (time_before(jiffies, exp_time)) {
|
|
+ if (icosq->cc == icosq->pc)
|
|
+ return 0;
|
|
+
|
|
+ msleep(20);
|
|
+ }
|
|
+
|
|
+ netdev_err(icosq->channel->netdev,
|
|
+ "Wait for ICOSQ 0x%x flush timeout (cc = 0x%x, pc = 0x%x)\n",
|
|
+ icosq->sqn, icosq->cc, icosq->pc);
|
|
+
|
|
+ return -ETIMEDOUT;
|
|
+}
|
|
+
|
|
+static void mlx5e_reset_icosq_cc_pc(struct mlx5e_icosq *icosq)
|
|
+{
|
|
+ WARN_ONCE(icosq->cc != icosq->pc, "ICOSQ 0x%x: cc (0x%x) != pc (0x%x)\n",
|
|
+ icosq->sqn, icosq->cc, icosq->pc);
|
|
+ icosq->cc = 0;
|
|
+ icosq->pc = 0;
|
|
+}
|
|
+
|
|
+static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx)
|
|
+{
|
|
+ struct mlx5_core_dev *mdev;
|
|
+ struct mlx5e_icosq *icosq;
|
|
+ struct net_device *dev;
|
|
+ struct mlx5e_rq *rq;
|
|
+ u8 state;
|
|
+ int err;
|
|
+
|
|
+ icosq = ctx;
|
|
+ rq = &icosq->channel->rq;
|
|
+ mdev = icosq->channel->mdev;
|
|
+ dev = icosq->channel->netdev;
|
|
+ err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state);
|
|
+ if (err) {
|
|
+ netdev_err(dev, "Failed to query ICOSQ 0x%x state. err = %d\n",
|
|
+ icosq->sqn, err);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (state != MLX5_SQC_STATE_ERR)
|
|
+ goto out;
|
|
+
|
|
+ mlx5e_deactivate_rq(rq);
|
|
+ err = mlx5e_wait_for_icosq_flush(icosq);
|
|
+ if (err)
|
|
+ goto out;
|
|
+
|
|
+ mlx5e_deactivate_icosq(icosq);
|
|
+
|
|
+ /* At this point, both the rq and the icosq are disabled */
|
|
+
|
|
+ err = mlx5e_health_sq_to_ready(icosq->channel, icosq->sqn);
|
|
+ if (err)
|
|
+ goto out;
|
|
+
|
|
+ mlx5e_reset_icosq_cc_pc(icosq);
|
|
+ mlx5e_free_rx_descs(rq);
|
|
+ clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state);
|
|
+ mlx5e_activate_icosq(icosq);
|
|
+ mlx5e_activate_rq(rq);
|
|
+
|
|
+ rq->stats->recover++;
|
|
+ return 0;
|
|
+out:
|
|
+ clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state);
|
|
+ return err;
|
|
+}
|
|
+
|
|
+void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq)
|
|
+{
|
|
+ struct mlx5e_priv *priv = icosq->channel->priv;
|
|
+ char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
|
|
+ struct mlx5e_err_ctx err_ctx = {};
|
|
+
|
|
+ err_ctx.ctx = icosq;
|
|
+ err_ctx.recover = mlx5e_rx_reporter_err_icosq_cqe_recover;
|
|
+ sprintf(err_str, "ERR CQE on ICOSQ: 0x%x", icosq->sqn);
|
|
+
|
|
+ mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
|
|
+}
|
|
+
|
|
+static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
|
|
+{
|
|
+ return err_ctx->recover(err_ctx->ctx);
|
|
+}
|
|
+
|
|
+static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter,
|
|
+ void *context,
|
|
+ struct netlink_ext_ack *extack)
|
|
+{
|
|
+ struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
|
|
+ struct mlx5e_err_ctx *err_ctx = context;
|
|
+
|
|
+ return err_ctx ? mlx5e_rx_reporter_recover_from_ctx(err_ctx) :
|
|
+ mlx5e_health_recover_channels(priv);
|
|
+}
|
|
+
|
|
static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq,
|
|
struct devlink_fmsg *fmsg)
|
|
{
|
|
@@ -168,9 +272,12 @@ static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter,
|
|
|
|
static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = {
|
|
.name = "rx",
|
|
+ .recover = mlx5e_rx_reporter_recover,
|
|
.diagnose = mlx5e_rx_reporter_diagnose,
|
|
};
|
|
|
|
+#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500
|
|
+
|
|
int mlx5e_reporter_rx_create(struct mlx5e_priv *priv)
|
|
{
|
|
struct devlink *devlink = priv_to_devlink(priv->mdev);
|
|
@@ -178,7 +285,8 @@ int mlx5e_reporter_rx_create(struct mlx5e_priv *priv)
|
|
|
|
reporter = devlink_health_reporter_create(devlink,
|
|
&mlx5_rx_reporter_ops,
|
|
- 0, false, priv);
|
|
+ MLX5E_REPORTER_RX_GRACEFUL_PERIOD,
|
|
+ true, priv);
|
|
if (IS_ERR(reporter)) {
|
|
netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n",
|
|
PTR_ERR(reporter));
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
|
|
index 7dde1be49f35..430fb04ea96f 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
|
|
@@ -691,8 +691,7 @@ static int mlx5e_create_rq(struct mlx5e_rq *rq,
|
|
return err;
|
|
}
|
|
|
|
-static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state,
|
|
- int next_state)
|
|
+int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state)
|
|
{
|
|
struct mlx5_core_dev *mdev = rq->mdev;
|
|
|
|
@@ -803,7 +802,7 @@ int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
|
|
return -ETIMEDOUT;
|
|
}
|
|
|
|
-static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
|
|
+void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
|
|
{
|
|
__be16 wqe_ix_be;
|
|
u16 wqe_ix;
|
|
@@ -882,7 +881,7 @@ int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_params *params,
|
|
return err;
|
|
}
|
|
|
|
-static void mlx5e_activate_rq(struct mlx5e_rq *rq)
|
|
+void mlx5e_activate_rq(struct mlx5e_rq *rq)
|
|
{
|
|
set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
|
|
mlx5e_trigger_irq(&rq->channel->icosq);
|
|
@@ -897,6 +896,7 @@ void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
|
|
void mlx5e_close_rq(struct mlx5e_rq *rq)
|
|
{
|
|
cancel_work_sync(&rq->dim.work);
|
|
+ cancel_work_sync(&rq->channel->icosq.recover_work);
|
|
mlx5e_destroy_rq(rq);
|
|
mlx5e_free_rx_descs(rq);
|
|
mlx5e_free_rq(rq);
|
|
@@ -1013,6 +1013,14 @@ static int mlx5e_alloc_icosq_db(struct mlx5e_icosq *sq, int numa)
|
|
return 0;
|
|
}
|
|
|
|
+static void mlx5e_icosq_err_cqe_work(struct work_struct *recover_work)
|
|
+{
|
|
+ struct mlx5e_icosq *sq = container_of(recover_work, struct mlx5e_icosq,
|
|
+ recover_work);
|
|
+
|
|
+ mlx5e_reporter_icosq_cqe_err(sq);
|
|
+}
|
|
+
|
|
static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
|
|
struct mlx5e_sq_param *param,
|
|
struct mlx5e_icosq *sq)
|
|
@@ -1035,6 +1043,8 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
|
|
if (err)
|
|
goto err_sq_wq_destroy;
|
|
|
|
+ INIT_WORK(&sq->recover_work, mlx5e_icosq_err_cqe_work);
|
|
+
|
|
return 0;
|
|
|
|
err_sq_wq_destroy:
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
|
|
index a22b3a3db253..ce4d357188df 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
|
|
@@ -616,6 +616,8 @@ void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
|
|
if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) {
|
|
netdev_WARN_ONCE(cq->channel->netdev,
|
|
"Bad OP in ICOSQ CQE: 0x%x\n", get_cqe_opcode(cqe));
|
|
+ if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
|
|
+ queue_work(cq->channel->priv->wq, &sq->recover_work);
|
|
break;
|
|
}
|
|
do {
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
|
|
index 3d993e2e7bea..79b3ec005f43 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
|
|
@@ -161,6 +161,7 @@ static const struct counter_desc sw_stats_desc[] = {
|
|
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_waive) },
|
|
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_congst_umr) },
|
|
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_err) },
|
|
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_recover) },
|
|
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_events) },
|
|
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_poll) },
|
|
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_arm) },
|
|
@@ -272,6 +273,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw)
|
|
s->rx_cache_waive += rq_stats->cache_waive;
|
|
s->rx_congst_umr += rq_stats->congst_umr;
|
|
s->rx_arfs_err += rq_stats->arfs_err;
|
|
+ s->rx_recover += rq_stats->recover;
|
|
s->ch_events += ch_stats->events;
|
|
s->ch_poll += ch_stats->poll;
|
|
s->ch_arm += ch_stats->arm;
|
|
@@ -1484,6 +1486,7 @@ static const struct counter_desc rq_stats_desc[] = {
|
|
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_waive) },
|
|
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, congst_umr) },
|
|
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_err) },
|
|
+ { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, recover) },
|
|
};
|
|
|
|
static const struct counter_desc sq_stats_desc[] = {
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
|
|
index a4a43613d026..ab1c3366ff7d 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
|
|
@@ -167,6 +167,7 @@ struct mlx5e_sw_stats {
|
|
u64 rx_cache_waive;
|
|
u64 rx_congst_umr;
|
|
u64 rx_arfs_err;
|
|
+ u64 rx_recover;
|
|
u64 ch_events;
|
|
u64 ch_poll;
|
|
u64 ch_arm;
|
|
@@ -302,6 +303,7 @@ struct mlx5e_rq_stats {
|
|
u64 cache_waive;
|
|
u64 congst_umr;
|
|
u64 arfs_err;
|
|
+ u64 recover;
|
|
};
|
|
|
|
struct mlx5e_sq_stats {
|
|
--
|
|
2.13.6
|
|
|