You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
248 lines
8.4 KiB
248 lines
8.4 KiB
From e945df9ee0cc44e01807d66995d4aa0e458a52aa Mon Sep 17 00:00:00 2001
|
|
From: Alaa Hleihel <ahleihel@redhat.com>
|
|
Date: Sun, 10 May 2020 14:51:57 -0400
|
|
Subject: [PATCH 033/312] [netdrv] net/mlx5e: Report and recover from CQE with
|
|
error on RQ
|
|
|
|
Message-id: <20200510145245.10054-35-ahleihel@redhat.com>
|
|
Patchwork-id: 306575
|
|
Patchwork-instance: patchwork
|
|
O-Subject: [RHEL8.3 BZ 1789378 v2 34/82] net/mlx5e: Report and recover from CQE with error on RQ
|
|
Bugzilla: 1790198 1789378
|
|
RH-Acked-by: Kamal Heib <kheib@redhat.com>
|
|
RH-Acked-by: Jarod Wilson <jarod@redhat.com>
|
|
RH-Acked-by: Tony Camuso <tcamuso@redhat.com>
|
|
RH-Acked-by: Jonathan Toppins <jtoppins@redhat.com>
|
|
|
|
Bugzilla: http://bugzilla.redhat.com/1789378
|
|
Bugzilla: http://bugzilla.redhat.com/1790198
|
|
Upstream: v5.4-rc1
|
|
|
|
commit 8276ea1353a4968a212f04ddf16659223e5408d9
|
|
Author: Aya Levin <ayal@mellanox.com>
|
|
Date: Wed Jun 26 23:21:40 2019 +0300
|
|
|
|
net/mlx5e: Report and recover from CQE with error on RQ
|
|
|
|
Add support for reporting and recovering from an error completion on an RQ by
|
|
setting the queue back to the ready state. Handle only errors with a
|
|
syndrome indicating the RQ might enter error state and could be
|
|
recovered.
|
|
|
|
Signed-off-by: Aya Levin <ayal@mellanox.com>
|
|
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
|
|
Acked-by: Jiri Pirko <jiri@mellanox.com>
|
|
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
|
|
|
|
Signed-off-by: Alaa Hleihel <ahleihel@redhat.com>
|
|
Signed-off-by: Frantisek Hrbata <fhrbata@redhat.com>
|
|
---
|
|
drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 +
|
|
.../net/ethernet/mellanox/mlx5/core/en/health.h | 9 +++
|
|
.../ethernet/mellanox/mlx5/core/en/reporter_rx.c | 69 ++++++++++++++++++++++
|
|
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +++
|
|
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 11 ++++
|
|
5 files changed, 101 insertions(+)
|
|
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
|
|
index f0ba350579ae..ada39a3f83a9 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
|
|
@@ -308,6 +308,7 @@ struct mlx5e_dcbx_dp {
|
|
|
|
enum {
|
|
MLX5E_RQ_STATE_ENABLED,
|
|
+ MLX5E_RQ_STATE_RECOVERING,
|
|
MLX5E_RQ_STATE_AM,
|
|
MLX5E_RQ_STATE_NO_CSUM_COMPLETE,
|
|
MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */
|
|
@@ -680,6 +681,8 @@ struct mlx5e_rq {
|
|
struct zero_copy_allocator zca;
|
|
struct xdp_umem *umem;
|
|
|
|
+ struct work_struct recover_work;
|
|
+
|
|
/* control */
|
|
struct mlx5_wq_ctrl wq_ctrl;
|
|
__be32 mkey_be;
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
|
|
index 52e9ca37cf46..d3693fa547ac 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
|
|
@@ -8,6 +8,14 @@
|
|
|
|
#define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)
|
|
|
|
+static inline bool cqe_syndrome_needs_recover(u8 syndrome)
|
|
+{
|
|
+ return syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR ||
|
|
+ syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR ||
|
|
+ syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR ||
|
|
+ syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
|
|
+}
|
|
+
|
|
int mlx5e_reporter_tx_create(struct mlx5e_priv *priv);
|
|
void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv);
|
|
void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq);
|
|
@@ -21,6 +29,7 @@ int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg);
|
|
int mlx5e_reporter_rx_create(struct mlx5e_priv *priv);
|
|
void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv);
|
|
void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq);
|
|
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq);
|
|
void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq);
|
|
|
|
#define MLX5E_REPORTER_PER_Q_MAX_LEN 256
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
|
|
index 4e933db759b2..6c72b592315b 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
|
|
@@ -115,6 +115,75 @@ void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq)
|
|
mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
|
|
}
|
|
|
|
+static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
|
|
+{
|
|
+ struct net_device *dev = rq->netdev;
|
|
+ int err;
|
|
+
|
|
+ err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST);
|
|
+ if (err) {
|
|
+ netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn);
|
|
+ return err;
|
|
+ }
|
|
+ err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
|
|
+ if (err) {
|
|
+ netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn);
|
|
+ return err;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx)
|
|
+{
|
|
+ struct mlx5_core_dev *mdev;
|
|
+ struct net_device *dev;
|
|
+ struct mlx5e_rq *rq;
|
|
+ u8 state;
|
|
+ int err;
|
|
+
|
|
+ rq = ctx;
|
|
+ mdev = rq->mdev;
|
|
+ dev = rq->netdev;
|
|
+ err = mlx5e_query_rq_state(mdev, rq->rqn, &state);
|
|
+ if (err) {
|
|
+ netdev_err(dev, "Failed to query RQ 0x%x state. err = %d\n",
|
|
+ rq->rqn, err);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (state != MLX5_RQC_STATE_ERR)
|
|
+ goto out;
|
|
+
|
|
+ mlx5e_deactivate_rq(rq);
|
|
+ mlx5e_free_rx_descs(rq);
|
|
+
|
|
+ err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR);
|
|
+ if (err)
|
|
+ goto out;
|
|
+
|
|
+ clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
|
|
+ mlx5e_activate_rq(rq);
|
|
+ rq->stats->recover++;
|
|
+ return 0;
|
|
+out:
|
|
+ clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
|
|
+ return err;
|
|
+}
|
|
+
|
|
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq)
|
|
+{
|
|
+ struct mlx5e_priv *priv = rq->channel->priv;
|
|
+ char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
|
|
+ struct mlx5e_err_ctx err_ctx = {};
|
|
+
|
|
+ err_ctx.ctx = rq;
|
|
+ err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover;
|
|
+ sprintf(err_str, "ERR CQE on RQ: 0x%x", rq->rqn);
|
|
+
|
|
+ mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
|
|
+}
|
|
+
|
|
static int mlx5e_rx_reporter_timeout_recover(void *ctx)
|
|
{
|
|
struct mlx5e_icosq *icosq;
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
|
|
index c3eba55e8a21..13c1151bf60c 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
|
|
@@ -353,6 +353,13 @@ static void mlx5e_free_di_list(struct mlx5e_rq *rq)
|
|
kvfree(rq->wqe.di);
|
|
}
|
|
|
|
+static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work)
|
|
+{
|
|
+ struct mlx5e_rq *rq = container_of(recover_work, struct mlx5e_rq, recover_work);
|
|
+
|
|
+ mlx5e_reporter_rq_cqe_err(rq);
|
|
+}
|
|
+
|
|
static int mlx5e_alloc_rq(struct mlx5e_channel *c,
|
|
struct mlx5e_params *params,
|
|
struct mlx5e_xsk_param *xsk,
|
|
@@ -389,6 +396,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
|
|
rq->stats = &c->priv->channel_stats[c->ix].xskrq;
|
|
else
|
|
rq->stats = &c->priv->channel_stats[c->ix].rq;
|
|
+ INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work);
|
|
|
|
rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
|
|
if (IS_ERR(rq->xdp_prog)) {
|
|
@@ -898,6 +906,7 @@ void mlx5e_close_rq(struct mlx5e_rq *rq)
|
|
{
|
|
cancel_work_sync(&rq->dim.work);
|
|
cancel_work_sync(&rq->channel->icosq.recover_work);
|
|
+ cancel_work_sync(&rq->recover_work);
|
|
mlx5e_destroy_rq(rq);
|
|
mlx5e_free_rx_descs(rq);
|
|
mlx5e_free_rq(rq);
|
|
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
|
|
index 1c3da221ee69..64d6ecbece80 100644
|
|
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
|
|
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
|
|
@@ -1131,6 +1131,15 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
|
|
return skb;
|
|
}
|
|
|
|
+static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
+{
|
|
+ struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
|
|
+
|
|
+ if (cqe_syndrome_needs_recover(err_cqe->syndrome) &&
|
|
+ !test_and_set_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state))
|
|
+ queue_work(rq->channel->priv->wq, &rq->recover_work);
|
|
+}
|
|
+
|
|
void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
{
|
|
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
|
|
@@ -1144,6 +1153,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
|
|
|
|
if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
|
|
+ trigger_report(rq, cqe);
|
|
rq->stats->wqe_err++;
|
|
goto free_wqe;
|
|
}
|
|
@@ -1329,6 +1339,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
wi->consumed_strides += cstrides;
|
|
|
|
if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
|
|
+ trigger_report(rq, cqe);
|
|
rq->stats->wqe_err++;
|
|
goto mpwrq_cqe_out;
|
|
}
|
|
--
|
|
2.13.6
|
|
|