Compare commits
No commits in common. 'c9' and 'c9-beta' have entirely different histories.
@ -1,93 +0,0 @@
|
|||||||
commit 73d8177ce0d2fcb7693cacee4778d0845ebd3788
|
|
||||||
Author: sathya priya kumar <SathyaPriya.K@amd.com>
|
|
||||||
Date: Thu Jun 13 05:29:09 2024 +0000
|
|
||||||
|
|
||||||
rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits
|
|
||||||
|
|
||||||
Rework the smca_smu2_mce_desc handling introduced by commit ced615c in a simpler way.
|
|
||||||
|
|
||||||
Update existing array with extended error descriptions instead
|
|
||||||
of creating a new array, simplifying the code.
|
|
||||||
|
|
||||||
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@amd.com>
|
|
||||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
|
||||||
|
|
||||||
---
|
|
||||||
mce-amd-smca.c | 29 +++--------------------------
|
|
||||||
ras-mce-handler.h | 1 -
|
|
||||||
2 files changed, 3 insertions(+), 27 deletions(-)
|
|
||||||
|
|
||||||
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-07-18 11:14:26.008582740 -0400
|
|
||||||
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-07-18 11:15:05.510270132 -0400
|
|
||||||
@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d
|
|
||||||
"An ECC or parity error in an SMU RAM instance",
|
|
||||||
};
|
|
||||||
|
|
||||||
-static const char * smca_smu2_mce_desc[64] = {
|
|
||||||
+static const char * const smca_smu2_mce_desc[] = {
|
|
||||||
"High SRAM ECC or parity error",
|
|
||||||
"Low SRAM ECC or parity error",
|
|
||||||
"Data Cache Bank A ECC or parity error",
|
|
||||||
@@ -410,14 +410,13 @@ static const char * smca_smu2_mce_desc[6
|
|
||||||
"Instruction Tag Cache Bank B ECC or parity error",
|
|
||||||
"System Hub Read Buffer ECC or parity error",
|
|
||||||
"PHY RAS ECC Error",
|
|
||||||
-};
|
|
||||||
-
|
|
||||||
-static const char * smca_smu2_ext_mce_desc[] = {
|
|
||||||
+ [12 ... 57] = "Reserved",
|
|
||||||
"A correctable error from a GFX Sub-IP",
|
|
||||||
"A fatal error from a GFX Sub-IP",
|
|
||||||
"Reserved",
|
|
||||||
"Reserved",
|
|
||||||
"A poison error from a GFX Sub-IP",
|
|
||||||
+ "Reserved",
|
|
||||||
};
|
|
||||||
|
|
||||||
static const char * const smca_mp5_mce_desc[] = {
|
|
||||||
@@ -824,27 +823,6 @@ static struct smca_bank_name smca_names[
|
|
||||||
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
|
|
||||||
};
|
|
||||||
|
|
||||||
-void smca_smu2_ext_err_desc(void)
|
|
||||||
-{
|
|
||||||
- int i, j;
|
|
||||||
- int smu2_bits = 62;
|
|
||||||
-
|
|
||||||
- /*
|
|
||||||
- * MCA_CTL_SMU error stings are defined for b'58:59 and b'62
|
|
||||||
- * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU
|
|
||||||
- *
|
|
||||||
- * b'0:11 can be decoded from existing array smca_smu2_mce_desc.
|
|
||||||
- * b'12:57 are Reserved and b'58:62 are appended to the
|
|
||||||
- * smca_smu2_mce_desc.
|
|
||||||
- */
|
|
||||||
- for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) {
|
|
||||||
- for ( ; i < 58; i++)
|
|
||||||
- smca_smu2_mce_desc[i] = "Reserved";
|
|
||||||
-
|
|
||||||
- smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j];
|
|
||||||
- }
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
void amd_decode_errcode(struct mce_event *e)
|
|
||||||
{
|
|
||||||
|
|
||||||
@@ -936,7 +914,6 @@ unsigned short xec = (e->status >> 16) &
|
|
||||||
mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
|
|
||||||
(ipid_high & MCI_IPID_MCATYPE) >> 16);
|
|
||||||
|
|
||||||
- smca_smu2_ext_err_desc();
|
|
||||||
fixup_hwid(m, &mcatype_hwid);
|
|
||||||
|
|
||||||
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
|
|
||||||
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-07-18 11:14:26.008582740 -0400
|
|
||||||
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-07-18 11:14:28.987559165 -0400
|
|
||||||
@@ -121,7 +121,6 @@ int set_intel_imc_log(enum cputype cputy
|
|
||||||
/* Undertake AMD SMCA Error Decoding */
|
|
||||||
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
|
|
||||||
void amd_decode_errcode(struct mce_event *e);
|
|
||||||
-void smca_smu2_ext_err_desc(void);
|
|
||||||
|
|
||||||
/* Per-CPU-type decoders for Intel CPUs */
|
|
||||||
void p4_decode_model(struct mce_event *e);
|
|
@ -1,34 +0,0 @@
|
|||||||
commit 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e
|
|
||||||
Author: Aristeu Rozanski <arozansk@redhat.com>
|
|
||||||
Date: Tue Apr 9 10:06:30 2024 -0400
|
|
||||||
|
|
||||||
mce-amd-smca: update smca_hwid to use smca_bank_types
|
|
||||||
|
|
||||||
bank_type is used as smca_bank_types everywhere, there's no point in
|
|
||||||
declaring it as unsigned int. It also upsets covscan:
|
|
||||||
|
|
||||||
3. rasdaemon-0.6.7/mce-amd-smca.c:914: assignment: Assigning: "bank_type" = "s_hwid->bank_type".
|
|
||||||
7. rasdaemon-0.6.7/mce-amd-smca.c:926: cond_at_most: Checking "bank_type >= 64U" implies that "bank_type" and "s_hwid->bank_type" may be up to 63 on the false branch.
|
|
||||||
14. rasdaemon-0.6.7/mce-amd-smca.c:942: overrun-local: Overrunning array "smca_mce_descs" of 38 16-byte elements at element index 63 (byte offset 1023) using index "bank_type" (which evaluates to 63).
|
|
||||||
# 940| /* Only print the descriptor of valid extended error code */
|
|
||||||
# 941| if (xec < smca_mce_descs[bank_type].num_descs)
|
|
||||||
# 942|-> mce_snprintf(e->mcastatus_msg,
|
|
||||||
# 943| "%s. Ext Err Code: %d",
|
|
||||||
# 944| smca_mce_descs[bank_type].descs[xec],
|
|
||||||
|
|
||||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
|
||||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
|
||||||
|
|
||||||
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
|
||||||
index 7521ff7..6632663 100644
|
|
||||||
--- a/mce-amd-smca.c
|
|
||||||
+++ b/mce-amd-smca.c
|
|
||||||
@@ -706,7 +706,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
|
|
||||||
};
|
|
||||||
|
|
||||||
struct smca_hwid {
|
|
||||||
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
|
|
||||||
+ enum smca_bank_types bank_type;
|
|
||||||
uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
|
|
||||||
};
|
|
||||||
|
|
@ -1,22 +0,0 @@
|
|||||||
commit 885e546add918457c453bd3f753ac7df90b39e36
|
|
||||||
Author: weidongkl <weidongkl@sina.com>
|
|
||||||
Date: Tue Sep 19 16:29:21 2023 +0800
|
|
||||||
|
|
||||||
Add a space between "diskerror_event" and "store"
|
|
||||||
|
|
||||||
Signed-off-by: weidongkl <weidongkl@sina.com>
|
|
||||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
|
||||||
|
|
||||||
diff --git a/ras-record.c b/ras-record.c
|
|
||||||
index a5f99ae..6b050bb 100644
|
|
||||||
--- a/ras-record.c
|
|
||||||
+++ b/ras-record.c
|
|
||||||
@@ -484,7 +484,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev
|
|
||||||
|
|
||||||
if (!priv || !priv->stmt_diskerror_event)
|
|
||||||
return 0;
|
|
||||||
- log(TERM, LOG_INFO, "diskerror_eventstore: %p\n", priv->stmt_diskerror_event);
|
|
||||||
+ log(TERM, LOG_INFO, "diskerror_event store: %p\n", priv->stmt_diskerror_event);
|
|
||||||
|
|
||||||
sqlite3_bind_text(priv->stmt_diskerror_event, 1, ev->timestamp, -1, NULL);
|
|
||||||
sqlite3_bind_text(priv->stmt_diskerror_event, 2, ev->dev, -1, NULL);
|
|
@ -1,24 +0,0 @@
|
|||||||
commit 9bd84aef87978b806178a73ed33c39d6c442fc1f
|
|
||||||
Author: weidong <weidongkl@sina.com>
|
|
||||||
Date: Tue Aug 8 08:59:12 2023 +0000
|
|
||||||
|
|
||||||
add ':' before error output
|
|
||||||
|
|
||||||
All prints except disk are preceded by a colon
|
|
||||||
|
|
||||||
Signed-off-by: weidong <weidongkl@sina.com>
|
|
||||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
|
||||||
|
|
||||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
|
||||||
index dc326d3..13078c2 100755
|
|
||||||
--- a/util/ras-mc-ctl.in
|
|
||||||
+++ b/util/ras-mc-ctl.in
|
|
||||||
@@ -1469,7 +1469,7 @@ sub errors
|
|
||||||
$out .= "\n";
|
|
||||||
}
|
|
||||||
if ($out ne "") {
|
|
||||||
- print "Disk errors\n$out\n";
|
|
||||||
+ print "Disk errors:\n$out\n";
|
|
||||||
} else {
|
|
||||||
print "No disk errors.\n\n";
|
|
||||||
}
|
|
@ -1,117 +0,0 @@
|
|||||||
commit 9c86f6255f67a8bae28cd46c54500fc16bfc7a30
|
|
||||||
Author: Yang Shi <shy828301@gmail.com>
|
|
||||||
Date: Mon Apr 4 16:34:05 2022 -0700
|
|
||||||
|
|
||||||
rasdaemon: use the new block_rq_error tracepoint
|
|
||||||
|
|
||||||
Since Linux 5.18-rc1 a new block tracepoint called block_rq_error is
|
|
||||||
available specifically for tracing disk error events. Currently
|
|
||||||
rasdaemon is using block_rq_complete which also traces successful cases.
|
|
||||||
It incurs excessive tracing logs and some overhead since the event is
|
|
||||||
triggered quite often.
|
|
||||||
|
|
||||||
Use the new tracepoint for disk error reporting, and the new trace point
|
|
||||||
has the same format as block_rq_complete.
|
|
||||||
|
|
||||||
Signed-off-by: Yang Shi <shy828301@gmail.com>
|
|
||||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
|
||||||
|
|
||||||
---
|
|
||||||
ras-events.c | 53 ++++++++++-------------------------------------------
|
|
||||||
ras-record.c | 2 +-
|
|
||||||
2 files changed, 11 insertions(+), 44 deletions(-)
|
|
||||||
|
|
||||||
--- rasdaemon-0.6.7.orig/ras-events.c 2024-05-14 11:05:40.020599541 -0400
|
|
||||||
+++ rasdaemon-0.6.7/ras-events.c 2024-05-14 11:06:38.831067957 -0400
|
|
||||||
@@ -27,6 +27,7 @@ * Foundation, Inc., 51 Franklin Street,
|
|
||||||
#include <sys/poll.h>
|
|
||||||
#include <signal.h>
|
|
||||||
#include <sys/signalfd.h>
|
|
||||||
+#include <linux/version.h>
|
|
||||||
#include "libtrace/kbuffer.h"
|
|
||||||
#include "libtrace/event-parse.h"
|
|
||||||
#include "ras-mc-handler.h"
|
|
||||||
@@ -229,7 +230,7 @@ if (rc < 0) {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef HAVE_DISKERROR
|
|
||||||
- rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable);
|
|
||||||
+ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_error", enable);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef HAVE_MEMORY_FAILURE
|
|
||||||
@@ -241,37 +242,6 @@ free_ras:
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
-/*
|
|
||||||
- * Set kernel filter. libtrace doesn't provide an API for setting filters
|
|
||||||
- * in kernel, we have to implement it here.
|
|
||||||
- */
|
|
||||||
-static int filter_ras_mc_event(struct ras_events *ras, char *group, char *event,
|
|
||||||
- const char *filter_str)
|
|
||||||
-{
|
|
||||||
- int fd, rc;
|
|
||||||
- char fname[MAX_PATH + 1];
|
|
||||||
-
|
|
||||||
- snprintf(fname, sizeof(fname), "events/%s/%s/filter", group, event);
|
|
||||||
- fd = open_trace(ras, fname, O_RDWR | O_APPEND);
|
|
||||||
- if (fd < 0) {
|
|
||||||
- log(ALL, LOG_WARNING, "Can't open filter file\n");
|
|
||||||
- return errno;
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- rc = write(fd, filter_str ,strlen(filter_str));
|
|
||||||
- if (rc < 0) {
|
|
||||||
- log(ALL, LOG_WARNING, "Can't write to filter file\n");
|
|
||||||
- close(fd);
|
|
||||||
- return rc;
|
|
||||||
- }
|
|
||||||
- close(fd);
|
|
||||||
- if (!rc) {
|
|
||||||
- log(ALL, LOG_WARNING, "Nothing was written on filter file\n");
|
|
||||||
- return EIO;
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- return 0;
|
|
||||||
-}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Tracing read code
|
|
||||||
@@ -901,17 +871,14 @@ (void)open("/sys/kernel/debug/ras/daemon
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef HAVE_DISKERROR
|
|
||||||
- rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0");
|
|
||||||
- if (!rc) {
|
|
||||||
- rc = add_event_handler(ras, pevent, page_size, "block",
|
|
||||||
- "block_rq_complete", ras_diskerror_event_handler,
|
|
||||||
- NULL, DISKERROR_EVENT);
|
|
||||||
- if (!rc)
|
|
||||||
- num_events++;
|
|
||||||
- else
|
|
||||||
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
|
|
||||||
- "block", "block_rq_complete");
|
|
||||||
- }
|
|
||||||
+ rc = add_event_handler(ras, pevent, page_size, "block",
|
|
||||||
+ "block_rq_error", ras_diskerror_event_handler,
|
|
||||||
+ NULL, DISKERROR_EVENT);
|
|
||||||
+ if (!rc)
|
|
||||||
+ num_events++;
|
|
||||||
+ else
|
|
||||||
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
|
|
||||||
+ "block", "block_rq_error");
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef HAVE_MEMORY_FAILURE
|
|
||||||
--- rasdaemon-0.6.7.orig/ras-record.c 2024-05-14 11:07:24.573654494 -0400
|
|
||||||
+++ rasdaemon-0.6.7/ras-record.c 2024-05-14 11:07:07.626807674 -0400
|
|
||||||
@@ -456,7 +456,7 @@ return 0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
|
||||||
- * Table and functions to handle block:block_rq_complete
|
|
||||||
+ * Table and functions to handle block:block_rq_error
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifdef HAVE_DISKERROR
|
|
@ -1,94 +0,0 @@
|
|||||||
commit ced615cf8146f51b5d6fe7a29107a2adc77407ca
|
|
||||||
Author: Sathya Priya Kumar <sathyapriya.k@amd.com>
|
|
||||||
Date: Thu Jan 11 01:20:07 2024 -0600
|
|
||||||
|
|
||||||
rasdaemon: Add error decoding for MCA_CTL_SMU extended bits
|
|
||||||
|
|
||||||
Enable error decoding support for the newly added extended
|
|
||||||
error bit descriptions from MCA_CTL_SMU.
|
|
||||||
b'0:11 can be decoded from existing array smca_smu2_mce_desc.
|
|
||||||
Define a function to append the newly defined b'58:62 to the
|
|
||||||
smca_smu2_mce_desc. This avoids having to maintain the Reserved bits
|
|
||||||
from b'12:57 in the code.
|
|
||||||
|
|
||||||
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@amd.com>
|
|
||||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
|
||||||
|
|
||||||
---
|
|
||||||
mce-amd-smca.c | 33 ++++++++++++++++++++++++++++++++-
|
|
||||||
ras-mce-handler.h | 1 +
|
|
||||||
2 files changed, 33 insertions(+), 1 deletion(-)
|
|
||||||
|
|
||||||
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-06-28 10:34:16.453522865 -0400
|
|
||||||
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-06-28 10:34:46.049124270 -0400
|
|
||||||
@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d
|
|
||||||
"An ECC or parity error in an SMU RAM instance",
|
|
||||||
};
|
|
||||||
|
|
||||||
-static const char * const smca_smu2_mce_desc[] = {
|
|
||||||
+static const char * smca_smu2_mce_desc[64] = {
|
|
||||||
"High SRAM ECC or parity error",
|
|
||||||
"Low SRAM ECC or parity error",
|
|
||||||
"Data Cache Bank A ECC or parity error",
|
|
||||||
@@ -409,6 +409,15 @@ static const char * const smca_smu2_mce_
|
|
||||||
"Instruction Tag Cache Bank A ECC or parity error",
|
|
||||||
"Instruction Tag Cache Bank B ECC or parity error",
|
|
||||||
"System Hub Read Buffer ECC or parity error",
|
|
||||||
+ "PHY RAS ECC Error",
|
|
||||||
+};
|
|
||||||
+
|
|
||||||
+static const char * smca_smu2_ext_mce_desc[] = {
|
|
||||||
+ "A correctable error from a GFX Sub-IP",
|
|
||||||
+ "A fatal error from a GFX Sub-IP",
|
|
||||||
+ "Reserved",
|
|
||||||
+ "Reserved",
|
|
||||||
+ "A poison error from a GFX Sub-IP",
|
|
||||||
};
|
|
||||||
|
|
||||||
static const char * const smca_mp5_mce_desc[] = {
|
|
||||||
@@ -815,6 +824,27 @@ static struct smca_bank_name smca_names[
|
|
||||||
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
|
|
||||||
};
|
|
||||||
|
|
||||||
+void smca_smu2_ext_err_desc(void)
|
|
||||||
+{
|
|
||||||
+ int i, j;
|
|
||||||
+ int smu2_bits = 62;
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * MCA_CTL_SMU error stings are defined for b'58:59 and b'62
|
|
||||||
+ * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU
|
|
||||||
+ *
|
|
||||||
+ * b'0:11 can be decoded from existing array smca_smu2_mce_desc.
|
|
||||||
+ * b'12:57 are Reserved and b'58:62 are appended to the
|
|
||||||
+ * smca_smu2_mce_desc.
|
|
||||||
+ */
|
|
||||||
+ for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) {
|
|
||||||
+ for ( ; i < 58; i++)
|
|
||||||
+ smca_smu2_mce_desc[i] = "Reserved";
|
|
||||||
+
|
|
||||||
+ smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j];
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
void amd_decode_errcode(struct mce_event *e)
|
|
||||||
{
|
|
||||||
|
|
||||||
@@ -906,6 +936,7 @@ unsigned short xec = (e->status >> 16) &
|
|
||||||
mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
|
|
||||||
(ipid_high & MCI_IPID_MCATYPE) >> 16);
|
|
||||||
|
|
||||||
+ smca_smu2_ext_err_desc();
|
|
||||||
fixup_hwid(m, &mcatype_hwid);
|
|
||||||
|
|
||||||
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
|
|
||||||
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-06-28 10:34:16.453522865 -0400
|
|
||||||
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-06-28 10:34:17.795508302 -0400
|
|
||||||
@@ -121,6 +121,7 @@ int set_intel_imc_log(enum cputype cputy
|
|
||||||
/* Undertake AMD SMCA Error Decoding */
|
|
||||||
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
|
|
||||||
void amd_decode_errcode(struct mce_event *e);
|
|
||||||
+void smca_smu2_ext_err_desc(void);
|
|
||||||
|
|
||||||
/* Per-CPU-type decoders for Intel CPUs */
|
|
||||||
void p4_decode_model(struct mce_event *e);
|
|
Loading…
Reference in new issue