Compare commits

..

No commits in common. 'c9' and 'i10c-beta' have entirely different histories.

2
.gitignore vendored

@ -1 +1 @@
SOURCES/rasdaemon-0.6.7.tar.bz2
SOURCES/rasdaemon-0.8.0.tar.bz2

@ -1 +1 @@
8ae34f40b676a0843be6647854b950f45161e7d4 SOURCES/rasdaemon-0.6.7.tar.bz2
e69a5639d698e85ce9698e9ba6db1eeb13c7a857 SOURCES/rasdaemon-0.8.0.tar.bz2

@ -1,163 +0,0 @@
commit 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Fri Jun 30 10:36:53 2023 +0000
rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types.
Add HWID and McaType values for new SMCA bank types
and error decoding for those new SMCA banks.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7c88a46..fc51b5a 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -61,6 +61,7 @@ enum smca_bank_types {
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_PSP_V2,
@@ -76,6 +77,8 @@ enum smca_bank_types {
SMCA_SHUB, /* System Hub Unit */
SMCA_SATA, /* SATA Unit */
SMCA_USB, /* USB Unit */
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
@@ -325,6 +328,16 @@ static const char * const smca_umc2_mce_desc[] = {
"LM32 MP errors",
};
+static const char * const smca_mall_mce_desc[] = {
+ "Counter overflow error",
+ "Counter underflow error",
+ "Write Data Parity Error",
+ "Read Response Parity Error",
+ "Cache Tag ECC Error Macro 0",
+ "Cache Tag ECC Error Macro 1",
+ "Cache Data ECC Error"
+};
+
static const char * const smca_pb_mce_desc[] = {
"An ECC error in the Parameter Block RAM array"
};
@@ -524,6 +537,57 @@ static const char * const smca_usb_mce_desc[] = {
"AXI Slave Response error",
};
+static const char * const smca_usrdp_mce_desc[] = {
+ "Mst CMD Error",
+ "Mst Rx FIFO Error",
+ "Mst Deskew Error",
+ "Mst Detect Timeout Error",
+ "Mst FlowControl Error",
+ "Mst DataValid FIFO Error",
+ "Mac LinkState Error",
+ "Deskew Error",
+ "Init Timeout Error",
+ "Init Attempt Error",
+ "Recovery Timeout Error",
+ "Recovery Attempt Error",
+ "Eye Training Timeout Error",
+ "Data Startup Limit Error",
+ "LS0 Exit Error",
+ "PLL powerState Update Timeout Error",
+ "Rx FIFO Error",
+ "Lcu Error",
+ "Conv CECC Error",
+ "Conv UECC Error",
+ "Reserved",
+ "Rx DataLoss Error",
+ "Replay CECC Error",
+ "Replay UECC Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "FC Init Timeout Error",
+ "FC Init Attempt Error",
+ "Replay Timeout Error",
+ "Replay Attempt Error",
+ "Replay Underflow Error",
+ "Replay Overflow Error",
+};
+
+static const char * const smca_usrcp_mce_desc[] = {
+ "Packet Type Error",
+ "Rx FIFO Error",
+ "Deskew Error",
+ "Rx Detect Timeout Error",
+ "Data Parity Error",
+ "Data Loss Error",
+ "Lcu Error",
+ "HB1 Handshake Timeout Error",
+ "HB2 Handshake Timeout Error",
+ "Clk Sleep Rsp Timeout Error",
+ "Clk Wake Rsp Timeout Error",
+ "Reset Attack Error",
+ "Remote Link Fatal Error",
+};
+
static const char * const smca_gmipcs_mce_desc[] = {
"Data Loss Error",
"Training Error",
@@ -579,6 +643,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
+ [SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
@@ -595,6 +660,8 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
[SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
[SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
+ [SMCA_USR_DP] = { smca_usrdp_mce_desc, ARRAY_SIZE(smca_usrdp_mce_desc) },
+ [SMCA_USR_CP] = { smca_usrcp_mce_desc, ARRAY_SIZE(smca_usrcp_mce_desc) },
[SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
/* All the PHY bank types have the same error descriptions, for now. */
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
@@ -631,6 +698,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_UMC, 0x00000096 },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
{ SMCA_UMC_V2, 0x00010096 },
+ /* Memory Attached Last Level Cache */
+ { SMCA_MA_LLC, 0x0004002E },
/* Parameter Block MCA type */
{ SMCA_PB, 0x00000005 },
@@ -664,6 +733,11 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, 0x00000080 },
{ SMCA_SATA, 0x000000A8 },
{ SMCA_USB, 0x000000AA },
+
+ /* Ultra Short Reach Data and Control Plane Controller */
+ { SMCA_USR_DP, 0x00000170 },
+ { SMCA_USR_CP, 0x00000180 },
+
{ SMCA_GMI_PCS, 0x00000241 },
/* Ext Global Memory Interconnect PHY MCA type */
@@ -692,6 +766,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
+ [SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
[SMCA_PB] = { "Parameter Block" },
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
@@ -704,6 +779,8 @@ static struct smca_bank_name smca_names[] = {
[SMCA_SHUB] = { "System Hub Unit" },
[SMCA_SATA] = { "SATA Unit" },
[SMCA_USB] = { "USB Unit" },
+ [SMCA_USR_DP] = { "Ultra Short Reach Data Plane Controller" },
+ [SMCA_USR_CP] = { "Ultra Short Reach Control Plane Controller" },
[SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },

@ -1,32 +0,0 @@
commit 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4
Author: Matt Whitlock <whitslack@users.noreply.github.com>
Date: Wed Jun 9 10:25:18 2021 -0400
configure.ac: fix SYSCONFDEFDIR default value
configure.ac was using AC_ARG_WITH incorrectly, yielding a generated configure script like:
# Check whether --with-sysconfdefdir was given.
if test "${with_sysconfdefdir+set}" = set; then :
withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval
else
"/etc/sysconfig"
fi
This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command.
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/configure.ac b/configure.ac
index f7d1947..33b81fe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR])
AC_ARG_WITH(sysconfdefdir,
AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]),
[SYSCONFDEFDIR=$withval],
- ["/etc/sysconfig"])
+ [SYSCONFDEFDIR=/etc/sysconfig])
AC_SUBST([SYSCONFDEFDIR])
AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database])

@ -1,28 +0,0 @@
commit 28ea956acc2dab7c18b4701f9657afb9ab3ddc79
Author: Muralidhara M K <muralimk@amd.com>
Date: Mon Jul 12 05:18:43 2021 -0500
rasdaemon: set SMCA maximum number of banks to 64
Newer AMD systems with SMCA banks support up to 64 MCA banks per CPU.
This patch is based on the commit below upstreamed into the kernel:
a0bc32b3cacf ("x86/mce: Increase maximum number of banks to 64")
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index e0cf512..3c346f4 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -75,6 +75,9 @@ enum smca_bank_types {
N_SMCA_BANK_TYPES
};
+/* Maximum number of MCA banks per CPU. */
+#define MAX_NR_BANKS 64
+
/* SMCA Extended error strings */
/* Load Store */
static const char * const smca_ls_mce_desc[] = {

@ -1,63 +0,0 @@
commit 2b37a26dcec389723f75d69d3da9c2f15f6c317d
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 12:41:27 2021 +0200
ci.yml: Fix the job for it to run on a single arch
There were some issues on the previous content. Fix them, in
order to allow it to build on a single architecture.
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5b3e757..747a844 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,34 +1,23 @@
name: CI
-# Should run only on branches and PR, as "on_tag.yml" will handle tags
on:
+ workflow_dispatch:
push:
- branches: master test
pull_request:
- branches: master
jobs:
-
-#
-# Linux
-#
Ubuntu:
name: Ubuntu
- runs-on: ubuntu-20.04
- strategy:
- matrix:
- arch: [x64_64, aarch64, armv7, ppc64le]
+ runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
- with:
- arch: ${{ matrix.arch }}
- - name: prepare
- run: |
- sudo apt-get update
- sudo apt-get install -y build-essential sqlite3
- - name: build
- run: |
- autoreconf -vfi
- ./configure --enable-all
- make
- sudo make install
+ - uses: actions/checkout@v2
+ - name: prepare
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential sqlite3
+ - name: build
+ run: |
+ autoreconf -vfi
+ ./configure --enable-all
+ make
+ sudo make install

@ -1,44 +0,0 @@
commit 2b6a54b0d31e02e657171fd27f4e31d996756bc6
Author: DmNosachev <quartz64@gmail.com>
Date: Thu Jul 22 10:25:38 2021 +0300
labels/supermicro: added Supermicro X10DRL, X11SPM
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 1e7761f..990fc9e 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -88,6 +88,16 @@ Vendor: Supermicro
P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1;
P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1;
P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1;
+
+ Model: X10DRL-i
+ P1-DIMMA1: 0.0.0;
+ P1-DIMMB1: 0.1.0;
+ P1-DIMMC1: 0.2.0;
+ P1-DIMMD1: 0.3.0;
+ P2-DIMME1: 1.0.0;
+ P2-DIMMF1: 1.1.0;
+ P2-DIMMG1: 1.2.0;
+ P2-DIMMH1: 1.3.0;
Model: X11DDW-NT, X11DDW-L
P1-DIMMA1: 0.0.0;
@@ -102,6 +112,14 @@ Vendor: Supermicro
P2-DIMMD1: 3.0.0;
P2-DIMME1: 3.1.0;
P2-DIMMF1: 3.2.0;
+
+ Model: X11SPM-F, X11SPM-TF, X11SPM-TPF
+ DIMMA1: 0.0.0;
+ DIMMB1: 0.1.0;
+ DIMMC1: 0.2.0;
+ DIMMD1: 1.0.0;
+ DIMME1: 1.1.0;
+ DIMMF1: 1.2.0;
Model: B1DRi
P1_DIMMA1: 0.0.0;

@ -1,105 +0,0 @@
commit 2d15882a0cbfce0b905039bebc811ac8311cd739
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Fri Jun 30 11:19:42 2023 +0000
rasdaemon: Handle reassigned bit definitions for UMC bank
On some AMD systems some of the existing bit definitions in the
CTL register of SMCA bank type are reassigned without defining
new HWID and McaType. Consequently, the errors whose bit
definitions have been reassigned in the CTL register are being
erroneously decoded.
Add new error description structure to compensate for the
reassigned bit definitions, by new software defined SMCA bank
type by utilizing the hardware-reserved values for HWID.
The new SMCA bank type will only be employed for UMC error
decoding on affected models and the existing error description
structure for UMC bank type is still valid.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index fc51b5a..54060ee 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -60,6 +60,7 @@ enum smca_bank_types {
SMCA_CS_V2_QUIRK,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_QUIRK,
SMCA_UMC_V2,
SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
@@ -313,6 +314,25 @@ static const char * const smca_umc_mce_desc[] = {
"Read CRC Error",
};
+static const char * const smca_umc_quirk_mce_desc[] = {
+ "DRAM On Die ECC error",
+ "Data poison error",
+ "SDP parity error",
+ "Reserved",
+ "Address/Command parity error",
+ "HBM Write data parity error",
+ "Consolidated SRAM ECC error",
+ "Reserved",
+ "Reserved",
+ "Rdb SRAM ECC error",
+ "Thermal throttling",
+ "HBM Read Data Parity error",
+ "Reserved",
+ "UMC FW Error",
+ "SRAM Parity Error",
+ "HBM CRC Error",
+};
+
static const char * const smca_umc2_mce_desc[] = {
"DRAM ECC error",
"Data poison error",
@@ -642,6 +662,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_UMC_QUIRK] = { smca_umc_quirk_mce_desc, ARRAY_SIZE(smca_umc_quirk_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
[SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
@@ -696,6 +717,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, 0x00000096 },
+ { SMCA_UMC_QUIRK, 0x00020000 },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
{ SMCA_UMC_V2, 0x00010096 },
/* Memory Attached Last Level Cache */
@@ -764,7 +786,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_L3_CACHE] = { "L3 Cache" },
[SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
- [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_UMC ... SMCA_UMC_QUIRK] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
[SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
[SMCA_PB] = { "Parameter Block" },
@@ -843,6 +865,10 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
if (*hwid_mcatype == 0x0002002E)
*hwid_mcatype = 0x00010000;
break;
+ case 0x90 ... 0x9F:
+ if ((*hwid_mcatype & 0xFF) == 0x00000096)
+ *hwid_mcatype = 0x00020000;
+ break;
default:
break;
}
@@ -908,7 +934,7 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
smca_mce_descs[bank_type].descs[xec],
xec);
- if (bank_type == SMCA_UMC && xec == 0) {
+ if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) {
channel = find_umc_channel(e);
csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",

@ -1,524 +0,0 @@
commit 30158ef8d7aebc3e5201bf39b73ce7644f8e419e
Author: Avadhut Naik <avadnaik@amd.com>
Date: Tue Apr 18 18:24:21 2023 +0000
rasdaemon: Update SMCA bank error descriptions
Update, reword some existing SMCA bank type error descriptions to extend
SMCA error decoding functionality for modern AMD processors. Additionally,
also add new error descriptions for missing SMCA bank types.
Signed-off-by: Avadhut Naik <avadnaik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 27ca8aa..7ec787a 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -66,12 +66,19 @@ enum smca_bank_types {
SMCA_SMU, /* System Management Unit */
SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_MPDMA, /* MPDMA Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
SMCA_PCIE_V2,
SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_NBIF, /*NBIF Unit */
+ SMCA_SHUB, /* System Hub Unit */
+ SMCA_SATA, /* SATA Unit */
+ SMCA_USB, /* USB Unit */
+ SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
+ SMCA_GMI_PHY, /* GMI PHY Unit */
N_SMCA_BANK_TYPES
};
@@ -85,7 +92,6 @@ enum smca_bank_types {
#define NONCPU_NODE_INDEX 8
/* SMCA Extended error strings */
-/* Load Store */
static const char * const smca_ls_mce_desc[] = {
"Load queue parity",
"Store queue parity",
@@ -109,6 +115,7 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 5",
"L2 fill data error",
};
+
static const char * const smca_ls2_mce_desc[] = {
"An ECC error was detected on a data cache read by a probe or victimization",
"An ECC error or L2 poison was detected on a data cache read by a load",
@@ -133,92 +140,104 @@ static const char * const smca_ls2_mce_desc[] = {
"A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
"A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
"A hardware assertion error was reported",
- "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
+ "A parity error was detected in an STLF, SCB EMEM entry, store data mask or SRB store data by any access",
};
-/* Instruction Fetch */
+
static const char * const smca_if_mce_desc[] = {
"microtag probe port parity error",
"IC microtag or full tag multi-hit error",
"IC full tag parity",
"IC data array parity",
- "Decoupling queue phys addr parity error",
+ "PRQ Parity Error",
"L0 ITLB parity error",
- "L1 ITLB parity error",
- "L2 ITLB parity error",
+ "L1-TLB parity error",
+ "L2-TLB parity error",
"BPQ snoop parity on Thread 0",
"BPQ snoop parity on Thread 1",
- "L1 BTB multi-match error",
- "L2 BTB multi-match error",
+ "BP L1-BTB Multi-Hit Error",
+ "BP L2-BTB Multi-Hit Error",
"L2 Cache Response Poison error",
- "System Read Data error",
+ "L2 Cache Error Response",
+ "Hardware Assertion Error",
+ "L1-TLB Multi-Hit",
+ "L2-TLB Multi-Hit",
+ "BSR Parity Error",
+ "CT MCE",
};
-/* L2 Cache */
+
static const char * const smca_l2_mce_desc[] = {
- "L2M tag multi-way-hit error",
- "L2M tag ECC error",
- "L2M data ECC error",
- "HW assert",
+ "L2M Tag Multiple-Way-Hit error",
+ "L2M Tag or State Array ECC Error",
+ "L2M Data Array ECC Error",
+ "Hardware Assert Error",
+ "SDP Read Response Parity Error",
};
-/* Decoder Unit */
+
static const char * const smca_de_mce_desc[] = {
- "uop cache tag parity error",
- "uop cache data parity error",
- "Insn buffer parity error",
- "uop queue parity error",
- "Insn dispatch queue parity error",
- "Fetch address FIFO parity",
- "Patch RAM data parity",
- "Patch RAM sequencer parity",
- "uop buffer parity"
-};
-/* Execution Unit */
+ "Micro-op cache tag array parity error",
+ "Micro-op cache data array parity error",
+ "IBB Register File parity error",
+ "Micro-op queue parity error",
+ "Instruction dispatch queue parity error",
+ "Fetch address FIFO parity error",
+ "Patch RAM data parity error",
+ "Patch RAM sequencer parity error",
+ "Micro-op buffer parity error",
+ "Hardware Assertion MCA Error",
+};
+
static const char * const smca_ex_mce_desc[] = {
"Watchdog timeout error",
- "Phy register file parity",
- "Flag register file parity",
- "Immediate displacement register file parity",
- "Address generator payload parity",
- "EX payload parity",
- "Checkpoint queue parity",
- "Retire dispatch queue parity",
+ "Physical register file parity error",
+ "Flag register file parity error",
+ "Immediate displacement register file parity error",
+ "Address generator payload parity error",
+ "EX payload parity error",
+ "Checkpoint queue parity error",
+ "Retire dispatch queue parity error",
"Retire status queue parity error",
- "Scheduling queue parity error",
+ "Scheduler queue parity error",
"Branch buffer queue parity error",
+ "Hardware Assertion error",
+ "Spec Map parity error",
+ "Retire Map parity error",
};
-/* Floating Point Unit */
+
static const char * const smca_fp_mce_desc[] = {
- "Physical register file parity",
- "Freelist parity error",
- "Schedule queue parity",
+ "Physical register file (PRF) parity error",
+ "Freelist (FL) parity error",
+ "Schedule queue parity error",
"NSQ parity error",
- "Retire queue parity",
- "Status register file parity",
+ "Retire queue (RQ) parity error",
+ "Status register file (SRF) parity error",
"Hardware assertion",
+ "Physical K mask register file (KRF) parity error",
};
-/* L3 Cache */
+
static const char * const smca_l3_mce_desc[] = {
"Shadow tag macro ECC error",
"Shadow tag macro multi-way-hit error",
"L3M tag ECC error",
"L3M tag multi-way-hit error",
"L3M data ECC error",
- "XI parity, L3 fill done channel error",
- "L3 victim queue parity",
- "L3 HW assert",
+ "SDP Parity Error from XI",
+ "L3 victim queue Data Fabric error",
+ "L3 Hardware Assertion",
+ "XI WCB Parity Poison Creation event",
};
-/* Coherent Slave Unit */
+
static const char * const smca_cs_mce_desc[] = {
- "Illegal request from transport layer",
+ "Illegal request",
"Address violation",
"Security violation",
- "Illegal response from transport layer",
+ "Illegal response",
"Unexpected response",
- "Parity error on incoming request or probe response data",
- "Parity error on incoming read response data",
- "Atomic request parity",
- "ECC error on probe filter access",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic request parity error",
+ "Probe Filter ECC Error",
};
-/* Coherent Slave Unit V2 */
+
static const char * const smca_cs2_mce_desc[] = {
"Illegal Request",
"Address Violation",
@@ -234,15 +253,22 @@ static const char * const smca_cs2_mce_desc[] = {
"SDP read response had an unexpected RETRY error",
"Counter overflow error",
"Counter underflow error",
+ "Illegal Request on the no data channel",
+ "Address Violation on the no data channel",
+ "Security Violation on the no data channel",
+ "Hardware Assert Error",
};
-/* Power, Interrupt, etc.. */
+
static const char * const smca_pie_mce_desc[] = {
- "HW assert",
- "Internal PIE register security violation",
- "Error on GMI link",
- "Poison data written to internal PIE register",
+ "Hardware assert",
+ "Register security violation",
+ "Link error",
+ "Poison data consumption",
+ "A deferred error was detected in the DF",
+ "Watch Dog Timer",
+ "An SRAM ECC error was detected in the CNLI block",
};
-/* Unified Memory Controller */
+
static const char * const smca_umc_mce_desc[] = {
"DRAM ECC error",
"Data poison error on DRAM",
@@ -250,6 +276,12 @@ static const char * const smca_umc_mce_desc[] = {
"Advanced peripheral bus error",
"Command/address parity error",
"Write data CRC error",
+ "DCQ SRAM ECC error",
+ "AES SRAM ECC error",
+ "ECS Row Error",
+ "ECS Error",
+ "UMC Throttling Error",
+ "Read CRC Error",
};
static const char * const smca_umc2_mce_desc[] = {
@@ -267,15 +299,14 @@ static const char * const smca_umc2_mce_desc[] = {
"LM32 MP errors",
};
-/* Parameter Block */
static const char * const smca_pb_mce_desc[] = {
- "Parameter Block RAM ECC error",
+ "An ECC error in the Parameter Block RAM array"
};
-/* Platform Security Processor */
+
static const char * const smca_psp_mce_desc[] = {
- "PSP RAM ECC or parity error",
+ "An ECC or parity error in a PSP RAM instance",
};
-/* Platform Security Processor V2 */
+
static const char * const smca_psp2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -296,11 +327,11 @@ static const char * const smca_psp2_mce_desc[] = {
"TLB Bank 1 parity error",
"System Hub Read Buffer ECC or parity error",
};
-/* System Management Unit */
+
static const char * const smca_smu_mce_desc[] = {
- "SMU RAM ECC or parity error",
+ "An ECC or parity error in an SMU RAM instance",
};
-/* System Management Unit V2 */
+
static const char * const smca_smu2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -314,7 +345,7 @@ static const char * const smca_smu2_mce_desc[] = {
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
};
-/* Microprocessor 5 Unit */
+
static const char * const smca_mp5_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -327,15 +358,68 @@ static const char * const smca_mp5_mce_desc[] = {
"Instruction Tag Cache Bank A ECC or parity error",
"Instruction Tag Cache Bank B ECC or parity error",
};
-/* Northbridge IO Unit */
+
+static const char * const smca_mpdma_mce_desc[] = {
+ "Main SRAM [31:0] bank ECC or parity error",
+ "Main SRAM [63:32] bank ECC or parity error",
+ "Main SRAM [95:64] bank ECC or parity error",
+ "Main SRAM [127:96] bank ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "System Hub Read Buffer ECC or parity error",
+ "MPDMA TVF DVSEC Memory ECC or parity error",
+ "MPDMA TVF MMIO Mailbox0 ECC or parity error",
+ "MPDMA TVF MMIO Mailbox1 ECC or parity error",
+ "MPDMA TVF Doorbell Memory ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 0 ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 1 ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 2 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 0 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 1 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 2 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 3 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 4 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 5 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 6 ECC or parity error",
+ "SDP Watchdog Timer expired",
+ "MPDMA PTE Command FIFO ECC or parity error",
+ "MPDMA PTE Hub Data FIFO ECC or parity error",
+ "MPDMA PTE Internal Data FIFO ECC or parity error",
+ "MPDMA PTE Command Memory DMA ECC or parity error",
+ "MPDMA PTE Command Memory Internal ECC or parity error",
+};
+
static const char * const smca_nbio_mce_desc[] = {
"ECC or Parity error",
"PCIE error",
- "SDP ErrEvent error",
- "SDP Egress Poison Error",
- "IOHC Internal Poison Error",
+ "External SDP ErrEvent error",
+ "SDP Egress Poison error",
+ "Internal Poison error",
+ "Internal system fatal error event",
};
-/* PCI Express Unit */
+
static const char * const smca_pcie_mce_desc[] = {
"CCIX PER Message logging",
"CCIX Read Response with Status: Non-Data Error",
@@ -345,7 +429,7 @@ static const char * const smca_pcie_mce_desc[] = {
};
static const char * const smca_pcie2_mce_desc[] = {
- "SDP Parity Error logging",
+ "SDP Data Parity Error logging",
};
static const char * const smca_xgmipcs_mce_desc[] = {
@@ -387,11 +471,66 @@ static const char * const smca_xgmiphy_mce_desc[] = {
"PHY APB error",
};
-static const char * const smca_waflphy_mce_desc[] = {
- "RAM ECC Error",
- "ARC instruction buffer parity error",
- "ARC data buffer parity error",
- "PHY APB error",
+static const char * const smca_nbif_mce_desc[] = {
+ "Timeout error from GMI",
+ "SRAM ECC error",
+ "NTB Error Event",
+ "SDP Parity error",
+};
+
+static const char * const smca_sata_mce_desc[] = {
+ "Parity error for port 0",
+ "Parity error for port 1",
+ "Parity error for port 2",
+ "Parity error for port 3",
+ "Parity error for port 4",
+ "Parity error for port 5",
+ "Parity error for port 6",
+ "Parity error for port 7",
+};
+
+static const char * const smca_usb_mce_desc[] = {
+ "Parity error or ECC error for S0 RAM0",
+ "Parity error or ECC error for S0 RAM1",
+ "Parity error or ECC error for S0 RAM2",
+ "Parity error for PHY RAM0",
+ "Parity error for PHY RAM1",
+ "AXI Slave Response error",
+};
+
+static const char * const smca_gmipcs_mce_desc[] = {
+ "Data Loss Error",
+ "Training Error",
+ "Replay Parity Error",
+ "Rx Fifo Underflow Error",
+ "Rx Fifo Overflow Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "Tx Fifo Underflow Error",
+ "Replay Buffer Parity Error",
+ "Tx Overflow Error",
+ "Replay Fifo Overflow Error",
+ "Replay Fifo Underflow Error",
+ "Elastic Fifo Overflow Error",
+ "Deskew Error",
+ "Offline Error",
+ "Data Startup Limit Error",
+ "FC Init Timeout Error",
+ "Recovery Timeout Error",
+ "Ready Serial Timeout Error",
+ "Ready Serial Attempt Error",
+ "Recovery Attempt Error",
+ "Recovery Relock Attempt Error",
+ "Deskew Abort Error",
+ "Rx Buffer Error",
+ "Rx LFDS Fifo Overflow Error",
+ "Rx LFDS Fifo Underflow Error",
+ "LinkSub Tx Timeout Error",
+ "LinkSub Rx Timeout Error",
+ "Rx CMD Packet Error",
+ "LFDS Training Timeout Error",
+ "LFDS FC Init Timeout Error",
+ "Data Loss Error",
};
struct smca_mce_desc {
@@ -419,12 +558,21 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
[SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)},
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
+ [SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) },
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
[SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
[SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
+ /* NBIF and SHUB have the same error descriptions, for now. */
+ [SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
+ [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
+ [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
+ [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
+ [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
+ /* All the PHY bank types have the same error descriptions, for now. */
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
- [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) },
+ [SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
+ [SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
};
struct smca_hwid {
@@ -470,6 +618,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Microprocessor 5 Unit MCA type */
{ SMCA_MP5, 0x00020001 },
+ /* MPDMA MCA Type */
+ { SMCA_MPDMA, 0x00030001 },
+
/* Northbridge IO Unit MCA type */
{ SMCA_NBIO, 0x00000018 },
@@ -480,11 +631,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Ext Global Memory Interconnect PCS MCA type */
{ SMCA_XGMI_PCS, 0x00000050 },
+ { SMCA_NBIF, 0x0000006C },
+
+ { SMCA_SHUB, 0x00000080 },
+ { SMCA_SATA, 0x000000A8 },
+ { SMCA_USB, 0x000000AA },
+ { SMCA_GMI_PCS, 0x00000241 },
+
/* Ext Global Memory Interconnect PHY MCA type */
{ SMCA_XGMI_PHY, 0x00000259 },
/* WAFL PHY MCA type */
{ SMCA_WAFL_PHY, 0x00000267 },
+
+ { SMCA_GMI_PHY, 0x00000269 },
};
struct smca_bank_name {
@@ -508,12 +668,18 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
[SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_MPDMA] = { "MPDMA Unit" },
[SMCA_NBIO] = { "Northbridge IO Unit" },
[SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" },
[SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_NBIF] = { "NBIF Unit" },
+ [SMCA_SHUB] = { "System Hub Unit" },
+ [SMCA_SATA] = { "SATA Unit" },
+ [SMCA_USB] = { "USB Unit" },
+ [SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
-
+ [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
static void amd_decode_errcode(struct mce_event *e)

@ -1,43 +0,0 @@
commit 50565005b10fe909c66f1c90f2feb95712427c7d
Author: DmNosachev <quartz64@gmail.com>
Date: Tue Jun 29 14:07:54 2021 +0300
labels/supermicro: added Supermicro X11DDW-NT(-L)
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 86e4617..373de07 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -69,7 +69,7 @@ Vendor: Supermicro
P2_DIMM4B: 2.0.1;
P2_DIMM4B: 2.1.1;
- Model: X11DPH-i
+ Model: X11DPH-i, X11DPH-T, X11DPH-TQ
P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1;
P1-DIMMB1: 0.1.0;
P1-DIMMC1: 0.2.0;
@@ -91,4 +91,18 @@ Vendor: Supermicro
P2-DIMME1: 1.0.0; P2-DIMME2: 1.0.1;
P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1;
P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1;
- P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1;
\ No newline at end of file
+ P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1;
+
+ Model: X11DDW-NT, X11DDW-L
+ P1-DIMMA1: 0.0.0;
+ P1-DIMMB1: 0.1.0;
+ P1-DIMMC1: 0.2.0;
+ P1-DIMMD1: 1.0.0;
+ P1-DIMME1: 1.1.0;
+ P1-DIMMF1: 1.2.0;
+ P2-DIMMA1: 2.0.0;
+ P2-DIMMB1: 2.1.0;
+ P2-DIMMC1: 2.2.0;
+ P2-DIMMD1: 3.0.0;
+ P2-DIMME1: 3.1.0;
+ P2-DIMMF1: 3.2.0;
\ No newline at end of file

@ -1,37 +0,0 @@
commit 6bc43db1b6b3d73805179c21d1dd5521e8dc0f74
Author: DmNosachev <quartz64@gmail.com>
Date: Fri Jul 2 13:13:46 2021 +0300
labels/supermicro: added Supermicro X11SCA(-F)
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index b924a32..1e7761f 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -10,11 +10,7 @@
#
Vendor: Supermicro
- Model: A2SDi-8C-HLN4F
- DIMMA1: 0.0.0; DIMMA2: 0.0.1;
- DIMMB1: 0.1.0; DIMMB2: 0.1.1;
-
- Model: A2SDi-8C+-HLN4F
+ Model: A2SDi-8C-HLN4F, A2SDi-8C+-HLN4F
DIMMA1: 0.0.0; DIMMA2: 0.0.1;
DIMMB1: 0.1.0; DIMMB2: 0.1.1;
@@ -115,4 +111,8 @@ Vendor: Supermicro
P2_DIMME1: 1.0.0;
P2_DIMMF1: 1.1.0;
P2_DIMMG1: 1.2.0;
- P2_DIMMH1: 1.3.0;
\ No newline at end of file
+ P2_DIMMH1: 1.3.0;
+
+ Model: X11SCA, X11SCA-F
+ DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0;
+ DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1;
\ No newline at end of file

@ -1,610 +0,0 @@
commit 738bafafdcb2e8b0ced32fff31b13754d571090b
Author: Jason Tian <jason@os.amperecomputing.com>
Date: Fri May 28 11:35:43 2021 +0800
Add error handling for Ampere-specific errors.
Save Ampere-specific errors' decode into sqlite3 data
base and log PCIe segment, bus/device/function number
into BMC SEL.
Signed-off-by: Jason Tian <jason@os.amperecomputing.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/non-standard-ampere.c b/non-standard-ampere.c
index 8cceb26..05b5252 100644
--- a/non-standard-ampere.c
+++ b/non-standard-ampere.c
@@ -216,6 +216,13 @@ static const char * const err_bert_sub_type[] = {
"PMPRO Fatal",
};
+/*
+ * Human-readable labels for the per-payload-type sqlite3 tables, indexed
+ * by payload type (0..3). In the code visible in this patch they are only
+ * used for the "create sql ... fail" diagnostic. The table is read-only,
+ * so both the pointers and the strings are const.
+ */
+static const char * const sqlite3_table_list[] = {
+	"amp_payload0_event_tab",
+	"amp_payload1_event_tab",
+	"amp_payload2_event_tab",
+	"amp_payload3_event_tab",
+};
+
struct amp_ras_type_info {
int id;
const char *name;
@@ -352,6 +359,359 @@ static const char *oem_subtype_name(const struct amp_ras_type_info *info,
return "unknown";
}
+#ifdef HAVE_SQLITE3
+/*
+ * Column layout of the sqlite3 table for Ampere-specific error payload
+ * type 0. Entries are listed in the same order as the
+ * AMP_PAYLOAD0_FIELD_* indices that record_amp_payload0_err() passes as
+ * the bind position, so the two lists must be kept in sync.
+ */
+static const struct db_fields amp_payload0_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "type", .type = "TEXT" },
+ { .name = "subtype", .type = "TEXT" },
+ { .name = "instance", .type = "INTEGER" },
+ { .name = "socket_num", .type = "INTEGER" },
+ { .name = "status_reg", .type = "INTEGER" },
+ { .name = "addr_reg", .type = "INTEGER" },
+ { .name = "misc0", .type = "INTEGER" },
+ { .name = "misc1", .type = "INTEGER" },
+ { .name = "misc2", .type = "INTEGER" },
+ { .name = "misc3", .type = "INTEGER" },
+};
+
+/* Table descriptor (table name plus column list) for payload type 0. */
+static const struct db_table_descriptor amp_payload0_event_tab = {
+ .name = "amp_payload0_event",
+ .fields = amp_payload0_event_fields,
+ .num_fields = ARRAY_SIZE(amp_payload0_event_fields),
+};
+
+/*
+ * Column layout of the sqlite3 table for Ampere-specific error payload
+ * type 1. Entries are listed in the same order as the
+ * AMP_PAYLOAD1_FIELD_* indices that record_amp_payload1_err() passes as
+ * the bind position, so the two lists must be kept in sync.
+ */
+static const struct db_fields amp_payload1_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "type", .type = "TEXT" },
+ { .name = "subtype", .type = "TEXT" },
+ { .name = "instance", .type = "INTEGER" },
+ { .name = "socket_num", .type = "INTEGER" },
+ { .name = "uncore_err_status", .type = "INTEGER" },
+ { .name = "uncore_err_mask", .type = "INTEGER" },
+ { .name = "uncore_err_sev", .type = "INTEGER" },
+ { .name = "core_err_status", .type = "INTEGER" },
+ { .name = "core_err_mask", .type = "INTEGER" },
+ { .name = "root_err_cmd", .type = "INTEGER" },
+ { .name = "root_err_status", .type = "INTEGER" },
+ { .name = "src_id", .type = "INTEGER" },
+ { .name = "reserved1", .type = "INTEGER" },
+ /*
+ * NOTE(review): "reserverd2" looks like a misspelling of "reserved2".
+ * It is the column name written to the database, so renaming it changes
+ * the schema - confirm against existing databases before fixing.
+ */
+ { .name = "reserverd2", .type = "INTEGER" },
+};
+
+/* Table descriptor (table name plus column list) for payload type 1. */
+static const struct db_table_descriptor amp_payload1_event_tab = {
+ .name = "amp_payload1_event",
+ .fields = amp_payload1_event_fields,
+ .num_fields = ARRAY_SIZE(amp_payload1_event_fields),
+};
+
+/*
+ * Column layout of the sqlite3 table for Ampere-specific error payload
+ * type 2. Entries are listed in the same order as the
+ * AMP_PAYLOAD2_FIELD_* indices that record_amp_payload2_err() passes as
+ * the bind position, so the two lists must be kept in sync.
+ *
+ * Every column name must be unique: sqlite3 refuses to create a table
+ * that declares the same column twice, hence the last column is
+ * "reserved3" (matching AMP_PAYLOAD2_FIELD_RESERVED3).
+ */
+static const struct db_fields amp_payload2_event_fields[] = {
+	{ .name = "id",			.type = "INTEGER PRIMARY KEY" },
+	{ .name = "timestamp",		.type = "TEXT" },
+	{ .name = "type",		.type = "TEXT" },
+	{ .name = "subtype",		.type = "TEXT" },
+	{ .name = "instance",		.type = "INTEGER" },
+	{ .name = "socket_num",		.type = "INTEGER" },
+	{ .name = "ce_report_reg",	.type = "INTEGER" },
+	{ .name = "ce_location",	.type = "INTEGER" },
+	{ .name = "ce_addr",		.type = "INTEGER" },
+	{ .name = "ue_report_reg",	.type = "INTEGER" },
+	{ .name = "ue_location",	.type = "INTEGER" },
+	{ .name = "ue_addr",		.type = "INTEGER" },
+	{ .name = "reserved1",		.type = "INTEGER" },
+	{ .name = "reserved2",		.type = "INTEGER" },
+	{ .name = "reserved3",		.type = "INTEGER" },
+};
+
+/* Table descriptor (table name plus column list) for payload type 2. */
+static const struct db_table_descriptor amp_payload2_event_tab = {
+	.name = "amp_payload2_event",
+	.fields = amp_payload2_event_fields,
+	.num_fields = ARRAY_SIZE(amp_payload2_event_fields),
+};
+
+/*
+ * Column layout of the sqlite3 table for Ampere-specific error payload
+ * type 3. Entries are listed in the same order as the
+ * AMP_PAYLOAD3_FIELD_* indices that record_amp_payload3_err() passes as
+ * the bind position, so the two lists must be kept in sync.
+ */
+static const struct db_fields amp_payload3_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "type", .type = "TEXT" },
+ { .name = "subtype", .type = "TEXT" },
+ { .name = "instance", .type = "INTEGER" },
+ { .name = "socket_num", .type = "INTEGER" },
+ { .name = "fw_spec_data0", .type = "INTEGER" },
+ { .name = "fw_spec_data1", .type = "INTEGER" },
+ { .name = "fw_spec_data2", .type = "INTEGER" },
+ { .name = "fw_spec_data3", .type = "INTEGER" },
+ { .name = "fw_spec_data4", .type = "INTEGER" },
+ { .name = "fw_spec_data5", .type = "INTEGER" },
+};
+
+/* Table descriptor (table name plus column list) for payload type 3. */
+static const struct db_table_descriptor amp_payload3_event_tab = {
+ .name = "amp_payload3_event",
+ .fields = amp_payload3_event_fields,
+ .num_fields = ARRAY_SIZE(amp_payload3_event_fields),
+};
+
+/*
+ * Bind a single value to position 'id' of the decoder's prepared
+ * statement. 'data_type' selects the binding: 'data' is used for the two
+ * integer kinds and 'text' for the text kind; any other kind is ignored.
+ * The text is bound with a NULL destructor, so the caller has to keep the
+ * string alive until the statement has been executed.
+ */
+static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder,
+			    enum amp_oem_data_type data_type,
+			    int id, int64_t data, const char *text)
+{
+	if (data_type == AMP_OEM_DATA_TYPE_INT)
+		sqlite3_bind_int(ev_decoder->stmt_dec_record, id, data);
+	else if (data_type == AMP_OEM_DATA_TYPE_INT64)
+		sqlite3_bind_int64(ev_decoder->stmt_dec_record, id, data);
+	else if (data_type == AMP_OEM_DATA_TYPE_TEXT)
+		sqlite3_bind_text(ev_decoder->stmt_dec_record, id,
+				  text, -1, NULL);
+}
+
+/*
+ * Execute the decoder's prepared statement with the values bound so far,
+ * then reset it and clear its bindings so it can be reused for the next
+ * event. 'name' only labels the log messages.
+ *
+ * Returns SQLITE_OK when all three operations succeed, otherwise the code
+ * of the first operation that failed. All three operations are always
+ * attempted so the statement is left reusable even after a failed step.
+ */
+static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder,
+			      const char *name)
+{
+	int rc;
+	int ret = SQLITE_OK;
+
+	rc = sqlite3_step(ev_decoder->stmt_dec_record);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE) {
+		log(TERM, LOG_ERR,
+		    "Failed to do %s step on sqlite: error = %d\n", name, rc);
+		ret = rc;
+	}
+
+	rc = sqlite3_reset(ev_decoder->stmt_dec_record);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE) {
+		log(TERM, LOG_ERR,
+		    "Failed to reset %s on sqlite: error = %d\n", name, rc);
+		/* Keep the earliest failure; do not let a later call mask it. */
+		if (ret == SQLITE_OK)
+			ret = rc;
+	}
+
+	rc = sqlite3_clear_bindings(ev_decoder->stmt_dec_record);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE) {
+		log(TERM, LOG_ERR,
+		    "Failed to clear bindings %s on sqlite: error = %d\n",
+		    name, rc);
+		if (ret == SQLITE_OK)
+			ret = rc;
+	}
+
+	return ret;
+}
+
+/*
+ * Store one Ampere-specific payload type 0 record in sqlite3: bind the
+ * type/subtype strings and the register values of 'err' to the decoder's
+ * prepared statement, then run it through store_amp_err_data(). The
+ * timestamp column is not bound here. Does nothing when ev_decoder is
+ * NULL.
+ */
+static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload0_type_sec *err)
+{
+	if (!ev_decoder)
+		return;
+
+	/* Textual classification of the error. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+			AMP_PAYLOAD0_FIELD_TYPE, 0, type_str);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+			AMP_PAYLOAD0_FIELD_SUB_TYPE, 0, subtype_str);
+
+	/* Both location columns are derived from the 'instance' field. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD0_FIELD_INS,
+			INSTANCE(err->instance), NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD0_FIELD_SOCKET_NUM,
+			SOCKET_NUM(err->instance), NULL);
+
+	/* Raw error registers. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD0_FIELD_STATUS_REG,
+			err->err_status, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD0_FIELD_ADDR_REG, err->err_addr, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD0_FIELD_MISC0, err->err_misc_0, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD0_FIELD_MISC1, err->err_misc_1, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD0_FIELD_MISC2, err->err_misc_2, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD0_FIELD_MISC3, err->err_misc_3, NULL);
+
+	store_amp_err_data(ev_decoder, "amp_payload0_event_tab");
+}
+
+/*
+ * Store one Ampere-specific payload type 1 record in sqlite3: bind the
+ * type/subtype strings and the status/mask values of 'err' to the
+ * decoder's prepared statement, then run it through
+ * store_amp_err_data(). The timestamp column is not bound here. Does
+ * nothing when ev_decoder is NULL.
+ */
+static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload1_type_sec *err)
+{
+	if (!ev_decoder)
+		return;
+
+	/* Textual classification of the error. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+			AMP_PAYLOAD1_FIELD_TYPE, 0, type_str);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+			AMP_PAYLOAD1_FIELD_SUB_TYPE, 0, subtype_str);
+
+	/* Both location columns are derived from the 'instance' field. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_INS,
+			INSTANCE(err->instance), NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_SOCKET_NUM,
+			SOCKET_NUM(err->instance), NULL);
+
+	/* Uncorrectable-error status, mask and severity. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_UNCORE_ERR_STATUS,
+			err->uncore_status, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_UNCORE_ERR_MASK,
+			err->uncore_mask, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_UNCORE_ERR_SEV,
+			err->uncore_sev, NULL);
+
+	/* Correctable-error status and mask. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_CORE_ERR_STATUS,
+			err->core_status, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_CORE_ERR_MASK,
+			err->core_mask, NULL);
+
+	/* Root error command/status and error source id. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_ROOT_ERR_CMD,
+			err->root_err_cmd, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_ROOT_ERR_STATUS,
+			err->root_status, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_SRC_ID, err->src_id, NULL);
+
+	/* Reserved fields are stored as-is. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD1_FIELD_RESERVED1, err->reserved1, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD1_FIELD_RESERVED2, err->reserved2, NULL);
+
+	store_amp_err_data(ev_decoder, "amp_payload1_event_tab");
+}
+
+/*
+ * Store one Ampere-specific payload type 2 record in sqlite3: bind the
+ * type/subtype strings and the CE/UE values of 'err' to the decoder's
+ * prepared statement, then run it through store_amp_err_data(). The
+ * timestamp column is not bound here. Does nothing when ev_decoder is
+ * NULL.
+ */
+static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload2_type_sec *err)
+{
+	if (!ev_decoder)
+		return;
+
+	/* Textual classification of the error. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+			AMP_PAYLOAD2_FIELD_TYPE, 0, type_str);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+			AMP_PAYLOAD2_FIELD_SUB_TYPE, 0, subtype_str);
+
+	/* Both location columns are derived from the 'instance' field. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_INS,
+			INSTANCE(err->instance), NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_SOCKET_NUM,
+			SOCKET_NUM(err->instance), NULL);
+
+	/* "ce_*" columns: report register, location and address. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_CE_REPORT_REG,
+			err->ce_register, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_CE_LOACATION,
+			err->ce_location, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_CE_ADDR, err->ce_addr, NULL);
+
+	/* "ue_*" columns: report register, location and address. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_UE_REPORT_REG,
+			err->ue_register, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_UE_LOCATION,
+			err->ue_location, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_UE_ADDR, err->ue_addr, NULL);
+
+	/* Reserved fields are stored as-is. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD2_FIELD_RESERVED1, err->reserved1, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD2_FIELD_RESERVED2, err->reserved2, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD2_FIELD_RESERVED3, err->reserved3, NULL);
+
+	store_amp_err_data(ev_decoder, "amp_payload2_event_tab");
+}
+
+/*
+ * Store one Ampere-specific payload type 3 record in sqlite3: bind the
+ * type/subtype strings and the six firmware-specific data words of 'err'
+ * to the decoder's prepared statement, then run it through
+ * store_amp_err_data(). The timestamp column is not bound here. Does
+ * nothing when ev_decoder is NULL.
+ */
+static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload3_type_sec *err)
+{
+	if (!ev_decoder)
+		return;
+
+	/* Textual classification of the error. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+			AMP_PAYLOAD3_FIELD_TYPE, 0, type_str);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+			AMP_PAYLOAD3_FIELD_SUB_TYPE, 0, subtype_str);
+
+	/* Both location columns are derived from the 'instance' field. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD3_FIELD_INS,
+			INSTANCE(err->instance), NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD3_FIELD_SOCKET_NUM,
+			SOCKET_NUM(err->instance), NULL);
+
+	/* Word 0 is bound as a plain int, words 1..5 as 64-bit values. */
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+			AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0,
+			err->fw_speci_data0, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1,
+			err->fw_speci_data1, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2,
+			err->fw_speci_data2, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3,
+			err->fw_speci_data3, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4,
+			err->fw_speci_data4, NULL);
+	record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+			AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5,
+			err->fw_speci_data5, NULL);
+
+	store_amp_err_data(ev_decoder, "amp_payload3_event_tab");
+}
+
+#else
+/*
+ * No-op stand-ins used when rasdaemon is built without sqlite3 support.
+ * They keep the callers free of #ifdefs. The void functions have empty
+ * bodies: returning a value from a void function is a constraint
+ * violation in C and is rejected by recent compilers.
+ */
+static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder,
+			    enum amp_oem_data_type data_type,
+			    int id, int64_t data, const char *text)
+{
+}
+
+static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload0_type_sec *err)
+{
+}
+
+static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload1_type_sec *err)
+{
+}
+
+static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload2_type_sec *err)
+{
+}
+
+static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload3_type_sec *err)
+{
+}
+
+/* Same signature as the sqlite3 variant; always reports success (0). */
+static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder,
+			      const char *name)
+{
+	return 0;
+}
+#endif
/*decode ampere specific error payload type 0, the CPU's data is save*/
/*to sqlite by ras-arm-handler, others are saved by this function.*/
@@ -434,6 +794,7 @@ void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder,
*p = '\0';
}
+ record_amp_payload0_err(ev_decoder, type_str, subtype_str, err);
i = 0;
p = NULL;
end = NULL;
@@ -517,6 +878,7 @@ static void decode_amp_payload1_err_regs(struct ras_ns_ev_decoder *ev_decoder,
*p = '\0';
}
+ record_amp_payload1_err(ev_decoder, type_str, subtype_str, err);
i = 0;
p = NULL;
end = NULL;
@@ -601,6 +963,7 @@ static void decode_amp_payload2_err_regs(struct ras_ns_ev_decoder *ev_decoder,
*p = '\0';
}
+ record_amp_payload2_err(ev_decoder, type_str, subtype_str, err);
i = 0;
p = NULL;
end = NULL;
@@ -673,6 +1036,7 @@ static void decode_amp_payload3_err_regs(struct ras_ns_ev_decoder *ev_decoder,
*p = '\0';
}
+ record_amp_payload3_err(ev_decoder, type_str, subtype_str, err);
i = 0;
p = NULL;
end = NULL;
@@ -687,6 +1051,38 @@ static int decode_amp_oem_type_error(struct ras_events *ras,
{
int payload_type = PAYLOAD_TYPE(event->error[0]);
+#ifdef HAVE_SQLITE3
+ struct db_table_descriptor db_tab;
+ int id = 0;
+
+ if (payload_type == PAYLOAD_TYPE_0) {
+ db_tab = amp_payload0_event_tab;
+ id = AMP_PAYLOAD0_FIELD_TIMESTAMP;
+ } else if (payload_type == PAYLOAD_TYPE_1) {
+ db_tab = amp_payload1_event_tab;
+ id = AMP_PAYLOAD1_FIELD_TIMESTAMP;
+ } else if (payload_type == PAYLOAD_TYPE_2) {
+ db_tab = amp_payload2_event_tab;
+ id = AMP_PAYLOAD2_FIELD_TIMESTAMP;
+ } else if (payload_type == PAYLOAD_TYPE_3) {
+ db_tab = amp_payload3_event_tab;
+ id = AMP_PAYLOAD3_FIELD_TIMESTAMP;
+ } else
+ return -1;
+
+ if (!ev_decoder->stmt_dec_record) {
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
+ &db_tab) != SQLITE_OK) {
+ trace_seq_printf(s,
+ "create sql %s fail\n",
+ sqlite3_table_list[payload_type]);
+ return -1;
+ }
+ }
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ id, 0, event->timestamp);
+#endif
+
if (payload_type == PAYLOAD_TYPE_0) {
const struct amp_payload0_type_sec *err =
(struct amp_payload0_type_sec *)event->error;
diff --git a/non-standard-ampere.h b/non-standard-ampere.h
index aacf3a8..f463c53 100644
--- a/non-standard-ampere.h
+++ b/non-standard-ampere.h
@@ -102,6 +102,79 @@ struct amp_payload3_type_sec {
uint64_t fw_speci_data5;
};
+/*
+ * Kind of value handed to record_amp_data(); selects which sqlite3 bind
+ * call (int, int64 or text) is used for the column.
+ */
+enum amp_oem_data_type {
+ AMP_OEM_DATA_TYPE_INT,
+ AMP_OEM_DATA_TYPE_INT64,
+ AMP_OEM_DATA_TYPE_TEXT,
+};
+
+/*
+ * Column indices for the payload type 0 table, in the same order as
+ * amp_payload0_event_fields[]. They are passed as the bind position to
+ * record_amp_data(). AMP_PAYLOAD0_FIELD_ID (0) is never bound in this
+ * patch; presumably the id column is filled in by the INSERT statement
+ * itself - TODO confirm in ras_mc_add_vendor_table().
+ */
+enum {
+ AMP_PAYLOAD0_FIELD_ID,
+ AMP_PAYLOAD0_FIELD_TIMESTAMP,
+ AMP_PAYLOAD0_FIELD_TYPE,
+ AMP_PAYLOAD0_FIELD_SUB_TYPE,
+ AMP_PAYLOAD0_FIELD_INS,
+ AMP_PAYLOAD0_FIELD_SOCKET_NUM,
+ AMP_PAYLOAD0_FIELD_STATUS_REG,
+ AMP_PAYLOAD0_FIELD_ADDR_REG,
+ AMP_PAYLOAD0_FIELD_MISC0,
+ AMP_PAYLOAD0_FIELD_MISC1,
+ AMP_PAYLOAD0_FIELD_MISC2,
+ AMP_PAYLOAD0_FIELD_MISC3,
+};
+
+/*
+ * Column indices for the payload type 1 table, in the same order as
+ * amp_payload1_event_fields[]. They are passed as the bind position to
+ * record_amp_data(). AMP_PAYLOAD1_FIELD_ID (0) is never bound in this
+ * patch; presumably the id column is filled in by the INSERT statement
+ * itself - TODO confirm in ras_mc_add_vendor_table().
+ */
+enum {
+ AMP_PAYLOAD1_FIELD_ID,
+ AMP_PAYLOAD1_FIELD_TIMESTAMP,
+ AMP_PAYLOAD1_FIELD_TYPE,
+ AMP_PAYLOAD1_FIELD_SUB_TYPE,
+ AMP_PAYLOAD1_FIELD_INS,
+ AMP_PAYLOAD1_FIELD_SOCKET_NUM,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_STATUS,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_MASK,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_SEV,
+ AMP_PAYLOAD1_FIELD_CORE_ERR_STATUS,
+ AMP_PAYLOAD1_FIELD_CORE_ERR_MASK,
+ AMP_PAYLOAD1_FIELD_ROOT_ERR_CMD,
+ AMP_PAYLOAD1_FIELD_ROOT_ERR_STATUS,
+ AMP_PAYLOAD1_FIELD_SRC_ID,
+ AMP_PAYLOAD1_FIELD_RESERVED1,
+ AMP_PAYLOAD1_FIELD_RESERVED2,
+};
+
+/*
+ * Column indices for the payload type 2 table, in the same order as
+ * amp_payload2_event_fields[]. They are passed as the bind position to
+ * record_amp_data(). AMP_PAYLOAD2_FIELD_ID (0) is never bound in this
+ * patch; presumably the id column is filled in by the INSERT statement
+ * itself - TODO confirm in ras_mc_add_vendor_table().
+ */
+enum {
+ AMP_PAYLOAD2_FIELD_ID,
+ AMP_PAYLOAD2_FIELD_TIMESTAMP,
+ AMP_PAYLOAD2_FIELD_TYPE,
+ AMP_PAYLOAD2_FIELD_SUB_TYPE,
+ AMP_PAYLOAD2_FIELD_INS,
+ AMP_PAYLOAD2_FIELD_SOCKET_NUM,
+ AMP_PAYLOAD2_FIELD_CE_REPORT_REG,
+ /*
+ * NOTE(review): "LOACATION" looks like a misspelling of "LOCATION";
+ * the name is used by record_amp_payload2_err(), so any rename has to
+ * update that call as well.
+ */
+ AMP_PAYLOAD2_FIELD_CE_LOACATION,
+ AMP_PAYLOAD2_FIELD_CE_ADDR,
+ AMP_PAYLOAD2_FIELD_UE_REPORT_REG,
+ AMP_PAYLOAD2_FIELD_UE_LOCATION,
+ AMP_PAYLOAD2_FIELD_UE_ADDR,
+ AMP_PAYLOAD2_FIELD_RESERVED1,
+ AMP_PAYLOAD2_FIELD_RESERVED2,
+ AMP_PAYLOAD2_FIELD_RESERVED3,
+};
+
+/*
+ * Column indices for the payload type 3 table, in the same order as
+ * amp_payload3_event_fields[]. They are passed as the bind position to
+ * record_amp_data(). AMP_PAYLOAD3_FIELD_ID (0) is never bound in this
+ * patch; presumably the id column is filled in by the INSERT statement
+ * itself - TODO confirm in ras_mc_add_vendor_table().
+ */
+enum {
+ AMP_PAYLOAD3_FIELD_ID,
+ AMP_PAYLOAD3_FIELD_TIMESTAMP,
+ AMP_PAYLOAD3_FIELD_TYPE,
+ AMP_PAYLOAD3_FIELD_SUB_TYPE,
+ AMP_PAYLOAD3_FIELD_INS,
+ AMP_PAYLOAD3_FIELD_SOCKET_NUM,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5
+};
+
void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
const struct amp_payload0_type_sec *err);
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
index 8ddd439..6f4cb2b 100644
--- a/ras-aer-handler.c
+++ b/ras-aer-handler.c
@@ -67,6 +67,9 @@ int ras_aer_event_handler(struct trace_seq *s,
struct tm *tm;
struct ras_aer_event ev;
char buf[BUF_LEN];
+ char ipmi_add_sel[105];
+ uint8_t sel_data[5];
+ int seg, bus, dev, fn;
/*
* Newer kernels (3.10-rc1 or upper) provide an uptime clock.
@@ -129,15 +132,19 @@ int ras_aer_event_handler(struct trace_seq *s,
switch (severity_val) {
case HW_EVENT_AER_UNCORRECTED_NON_FATAL:
ev.error_type = "Uncorrected (Non-Fatal)";
+ sel_data[0] = 0xca;
break;
case HW_EVENT_AER_UNCORRECTED_FATAL:
ev.error_type = "Uncorrected (Fatal)";
+ sel_data[0] = 0xca;
break;
case HW_EVENT_AER_CORRECTED:
ev.error_type = "Corrected";
+ sel_data[0] = 0xbf;
break;
default:
ev.error_type = "Unknown severity";
+ sel_data[0] = 0xbf;
}
trace_seq_puts(s, ev.error_type);
@@ -151,5 +158,29 @@ int ras_aer_event_handler(struct trace_seq *s,
ras_report_aer_event(ras, &ev);
#endif
+#ifdef HAVE_AMP_NS_DECODE
+ /*
+ * Get the PCIe AER error source seg/bus/dev/fn and save it into the
+ * BMC OEM SEL. "ipmitool raw 0x0a 0x44" is the IPMI Add SEL Entry
+ * command; please refer to IPMI specification chapter 31.6. 0xcd3a
+ * is the manufacturer ID (Ampere), byte 12 is the sensor number (CE
+ * is 0xBF, UE is 0xCA), bytes 13~14 are the segment number, byte 15
+ * is the bus number, byte 16[7:3] is the device number and byte
+ * 16[2:0] is the function number.
+ */
+ sscanf(ev.dev_name, "%x:%x:%x.%x", &seg, &bus, &dev, &fn);
+
+ sel_data[1] = seg & 0xff;
+ sel_data[2] = (seg & 0xff00) >> 8;
+ sel_data[3] = bus;
+ sel_data[4] = (((dev & 0x1f) << 3) | (fn & 0x7));
+
+ sprintf(ipmi_add_sel,
+ "ipmitool raw 0x0a 0x44 0x00 0x00 0xc0 0x00 0x00 0x00 0x00 0x3a 0xcd 0x00 0xc0 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x",
+ sel_data[0], sel_data[1], sel_data[2], sel_data[3], sel_data[4]);
+
+ system(ipmi_add_sel);
+#endif
+
return 0;
}

@ -1,93 +0,0 @@
commit 73d8177ce0d2fcb7693cacee4778d0845ebd3788
Author: sathya priya kumar <SathyaPriya.K@amd.com>
Date: Thu Jun 13 05:29:09 2024 +0000
rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits
Optimize smca_smu2_mce_desc in better way from the commit ced615c.
Update existing array with extended error descriptions instead
of creating new array, simplifying the code.
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
mce-amd-smca.c | 29 +++--------------------------
ras-mce-handler.h | 1 -
2 files changed, 3 insertions(+), 27 deletions(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-07-18 11:14:26.008582740 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-07-18 11:15:05.510270132 -0400
@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d
"An ECC or parity error in an SMU RAM instance",
};
-static const char * smca_smu2_mce_desc[64] = {
+static const char * const smca_smu2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
"Data Cache Bank A ECC or parity error",
@@ -410,14 +410,13 @@ static const char * smca_smu2_mce_desc[6
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
"PHY RAS ECC Error",
-};
-
-static const char * smca_smu2_ext_mce_desc[] = {
+ [12 ... 57] = "Reserved",
"A correctable error from a GFX Sub-IP",
"A fatal error from a GFX Sub-IP",
"Reserved",
"Reserved",
"A poison error from a GFX Sub-IP",
+ "Reserved",
};
static const char * const smca_mp5_mce_desc[] = {
@@ -824,27 +823,6 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
-void smca_smu2_ext_err_desc(void)
-{
- int i, j;
- int smu2_bits = 62;
-
- /*
- * MCA_CTL_SMU error stings are defined for b'58:59 and b'62
- * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU
- *
- * b'0:11 can be decoded from existing array smca_smu2_mce_desc.
- * b'12:57 are Reserved and b'58:62 are appended to the
- * smca_smu2_mce_desc.
- */
- for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) {
- for ( ; i < 58; i++)
- smca_smu2_mce_desc[i] = "Reserved";
-
- smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j];
- }
-}
-
void amd_decode_errcode(struct mce_event *e)
{
@@ -936,7 +914,6 @@ unsigned short xec = (e->status >> 16) &
mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
(ipid_high & MCI_IPID_MCATYPE) >> 16);
- smca_smu2_ext_err_desc();
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-07-18 11:14:26.008582740 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-07-18 11:14:28.987559165 -0400
@@ -121,7 +121,6 @@ int set_intel_imc_log(enum cputype cputy
/* Undertake AMD SMCA Error Decoding */
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
void amd_decode_errcode(struct mce_event *e);
-void smca_smu2_ext_err_desc(void);
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);

@ -1,24 +0,0 @@
commit 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4
Author: Muralidhara M K <muralimk@amd.com>
Date: Wed Jul 28 01:52:12 2021 -0500
rasdaemon: Support MCE for AMD CPU family 19h
Add support for family 19h x86 CPUs from AMD.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
index 805004a..f2b53d4 100644
--- a/ras-mce-handler.c
+++ b/ras-mce-handler.c
@@ -208,7 +208,7 @@ static int detect_cpu(struct ras_events *ras)
mce->cputype = CPU_AMD_SMCA;
goto ret;
}
- if (mce->family > 23) {
+ if (mce->family > 25) {
log(ALL, LOG_INFO,
"Can't parse MCE for this AMD CPU yet %d\n",
mce->family);

@ -1,26 +0,0 @@
commit 7ccf12f5ae26a055926d175d908c7930293438c4
Author: DmNosachev <quartz64@gmail.com>
Date: Fri Jul 23 17:28:33 2021 +0300
labels/supermicro: added Supermicro X11SCW
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 990fc9e..aea7c3c 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -133,4 +133,10 @@ Vendor: Supermicro
Model: X11SCA, X11SCA-F
DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0;
- DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1;
\ No newline at end of file
+ DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1;
+
+ Model: X11SCW-F
+ DIMMA1: 0.1.0;
+ DIMMA2: 0.0.0;
+ DIMMB1: 0.1.1;
+ DIMMB2: 0.0.1;
\ No newline at end of file

@ -1,34 +0,0 @@
commit 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Tue Apr 9 10:06:30 2024 -0400
mce-amd-smca: update smca_hwid to use smca_bank_types
bank_type is used as smca_bank_types everywhere, there's no point in
declaring it as unsigned int. It also upsets covscan:
3. rasdaemon-0.6.7/mce-amd-smca.c:914: assignment: Assigning: "bank_type" = "s_hwid->bank_type".
7. rasdaemon-0.6.7/mce-amd-smca.c:926: cond_at_most: Checking "bank_type >= 64U" implies that "bank_type" and "s_hwid->bank_type" may be up to 63 on the false branch.
14. rasdaemon-0.6.7/mce-amd-smca.c:942: overrun-local: Overrunning array "smca_mce_descs" of 38 16-byte elements at element index 63 (byte offset 1023) using index "bank_type" (which evaluates to 63).
# 940| /* Only print the descriptor of valid extended error code */
# 941| if (xec < smca_mce_descs[bank_type].num_descs)
# 942|-> mce_snprintf(e->mcastatus_msg,
# 943| "%s. Ext Err Code: %d",
# 944| smca_mce_descs[bank_type].descs[xec],
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7521ff7..6632663 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -706,7 +706,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
};
struct smca_hwid {
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
+ enum smca_bank_types bank_type;
uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
};

@ -1,22 +0,0 @@
commit 885e546add918457c453bd3f753ac7df90b39e36
Author: weidongkl <weidongkl@sina.com>
Date: Tue Sep 19 16:29:21 2023 +0800
Add a space between "diskerror_event" and "store"
Signed-off-by: weidongkl <weidongkl@sina.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-record.c b/ras-record.c
index a5f99ae..6b050bb 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -484,7 +484,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev
if (!priv || !priv->stmt_diskerror_event)
return 0;
- log(TERM, LOG_INFO, "diskerror_eventstore: %p\n", priv->stmt_diskerror_event);
+ log(TERM, LOG_INFO, "diskerror_event store: %p\n", priv->stmt_diskerror_event);
sqlite3_bind_text(priv->stmt_diskerror_event, 1, ev->timestamp, -1, NULL);
sqlite3_bind_text(priv->stmt_diskerror_event, 2, ev->dev, -1, NULL);

@ -1,411 +0,0 @@
commit 932118b04a04104dfac6b8536419803f236e6118
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Mon May 22 22:13:17 2023 +0000
rasdaemon: Add support for post-processing MCA errors
Currently, the rasdaemon performs detailed error decoding of received
MCA errors on the system only while it is running, either as a daemon
or in the foreground.
As such, error decoding cannot be undertaken for any MCA errors received
while the rasdaemon wasn't running. Additionally, if error decoding
modules like edac_mce_amd have not been loaded either, error records in the
dmesg buffer might correspond to raw values in associated MSRs, compelling
users to undertake decoding manually. The scenario seems more plausible on
AMD systems with Scalable MCA (SMCA) with plans in place to remove SMCA
Extended Error Descriptions from the edac_mce_amd module in an effort to
offload SMCA Error Decoding to the rasdaemon.
As such, add support to post-process and decode MCA Errors received on AMD
SMCA systems from raw MSR values. Support for post-processing and decoding
of MCA Errors received on CPUs of other vendors can be added in the future,
as needed.
Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
mce-amd-smca.c | 8 ++-
ras-events.h | 1
ras-mce-handler.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++----
ras-mce-handler.h | 4 +
ras-record.h | 10 ++++
rasdaemon.c | 94 +++++++++++++++++++++++++++++++++++++++++++++-
6 files changed, 216 insertions(+), 11 deletions(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2023-10-27 12:44:58.549049019 -0400
@@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
-static void amd_decode_errcode(struct mce_event *e)
+void amd_decode_errcode(struct mce_event *e)
{
decode_amd_errcode(e);
@@ -782,7 +782,7 @@ *hwid_mcatype = 0x00010000;
}
/* Decode extended errors according to Scalable MCA specification */
-static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
+void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
@@ -827,7 +827,9 @@ for (i = 0; i < ARRAY_SIZE(smca_hwid_mca
/* Only print the descriptor of valid extended error code */
if (xec < smca_mce_descs[bank_type].num_descs)
mce_snprintf(e->mcastatus_msg,
- " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+ "%s. Ext Err Code: %d",
+ smca_mce_descs[bank_type].descs[xec],
+ xec);
if (bank_type == SMCA_UMC && xec == 0) {
channel = find_umc_channel(e);
--- rasdaemon-0.6.7.orig/ras-events.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-events.h 2023-10-27 12:44:58.549049019 -0400
@@ -100,6 +100,7 @@ enum ghes_severity {
/* Function prototypes */
int toggle_ras_mc_event(int enable);
+int ras_offline_mce_event(struct ras_mc_offline_event *event);
int handle_ras_events(int record_events);
#endif
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.c 2023-10-27 12:45:27.159776011 -0400
@@ -63,10 +63,8 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};
-static enum cputype select_intel_cputype(struct ras_events *ras)
+static enum cputype select_intel_cputype(struct mce_priv *mce)
{
- struct mce_priv *mce = ras->mce_priv;
-
if (mce->family == 15) {
if (mce->model == 6)
return CPU_TULSA;
@@ -140,9 +138,8 @@ if (mce->model > 0x1a) {
return mce->family == 6 ? CPU_P6OLD : CPU_GENERIC;
}
-static int detect_cpu(struct ras_events *ras)
+static int detect_cpu(struct mce_priv *mce)
{
- struct mce_priv *mce = ras->mce_priv;
FILE *f;
int ret = 0;
char *line = NULL;
@@ -221,7 +218,7 @@ ret = 0;
}
goto ret;
} else if (!strcmp(mce->vendor,"GenuineIntel")) {
- mce->cputype = select_intel_cputype(ras);
+ mce->cputype = select_intel_cputype(mce);
} else {
ret = EINVAL;
}
@@ -246,7 +243,7 @@ int register_mce_handler(struct ras_even
mce = ras->mce_priv;
- rc = detect_cpu(ras);
+ rc = detect_cpu(mce);
if (rc) {
if (mce->processor_flags)
free (mce->processor_flags);
@@ -383,6 +380,105 @@ #if 0
*/
}
+static int report_mce_offline(struct trace_seq *s,
+ struct mce_event *mce,
+ struct mce_priv *priv)
+{
+ time_t now;
+ struct tm *tm;
+
+ time(&now);
+ tm = localtime(&now);
+
+ if (tm)
+ strftime(mce->timestamp, sizeof(mce->timestamp),
+ "%Y-%m-%d %H:%M:%S %z", tm);
+ trace_seq_printf(s, "%s,", mce->timestamp);
+
+ if (*mce->bank_name)
+ trace_seq_printf(s, " %s,", mce->bank_name);
+ else
+ trace_seq_printf(s, " bank=%x,", mce->bank);
+
+ if (*mce->mcastatus_msg)
+ trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg);
+
+ if (*mce->mcistatus_msg)
+ trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg);
+
+ if (*mce->mc_location)
+ trace_seq_printf(s, " Locn: %s,", mce->mc_location);
+
+ if (*mce->error_msg)
+ trace_seq_printf(s, " Error Msg: %s\n", mce->error_msg);
+
+ return 0;
+}
+
+int ras_offline_mce_event(struct ras_mc_offline_event *event)
+{
+ int rc = 0;
+ struct trace_seq s;
+ struct mce_event *mce = NULL;
+ struct mce_priv *priv = NULL;
+
+ mce = (struct mce_event *)calloc(1, sizeof(struct mce_event));
+ if (!mce) {
+ log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n");
+ return errno;
+ }
+
+ priv = (struct mce_priv *)calloc(1, sizeof(struct mce_priv));
+ if (!priv) {
+ log(TERM, LOG_ERR, "Can't allocate memory for mce_priv struct\n");
+ free(mce);
+ return errno;
+ }
+
+ if (event->smca) {
+ priv->cputype = CPU_AMD_SMCA;
+ priv->family = event->family;
+ priv->model = event->model;
+ } else {
+ rc = detect_cpu(priv);
+ if (rc) {
+ log(TERM, LOG_ERR, "Failed to detect CPU\n");
+ goto free_mce;
+ }
+ }
+
+ mce->status = event->status;
+ mce->bank = event->bank;
+
+ switch (priv->cputype) {
+ case CPU_AMD_SMCA:
+ mce->synd = event->synd;
+ mce->ipid = event->ipid;
+ if (!mce->ipid || !mce->status) {
+ log(TERM, LOG_ERR, "%s MSR required.\n",
+ mce->ipid ? "Status" : "Ipid");
+ rc = -EINVAL;
+ goto free_mce;
+ }
+ decode_smca_error(mce, priv);
+ amd_decode_errcode(mce);
+ break;
+ default:
+ break;
+ }
+
+ trace_seq_init(&s);
+ report_mce_offline(&s, mce, priv);
+ trace_seq_do_printf(&s);
+ fflush(stdout);
+ trace_seq_destroy(&s);
+
+free_mce:
+ free(priv);
+ free(mce);
+ return rc;
+}
+
int ras_mce_event_handler(struct trace_seq *s,
struct pevent_record *record,
struct event_format *event, void *context)
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2023-10-27 12:44:58.550049010 -0400
@@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_s
/* enables intel iMC logs */
int set_intel_imc_log(enum cputype cputype, unsigned ncpus);
+/* Undertake AMD SMCA Error Decoding */
+void decode_smca_error(struct mce_event *e, struct mce_priv *m);
+void amd_decode_errcode(struct mce_event *e);
+
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);
void core2_decode_model(struct mce_event *e);
--- rasdaemon-0.6.7.orig/ras-record.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-record.h 2023-10-27 12:44:58.550049010 -0400
@@ -21,6 +21,7 @@ * Foundation, Inc., 51 Franklin Street,
#define __RAS_RECORD_H
#include <stdint.h>
+#include <stdbool.h>
#include "config.h"
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
@@ -39,6 +40,15 @@ struct ras_mc_event {
const char *driver_detail;
};
+struct ras_mc_offline_event {
+ unsigned int family, model;
+ bool smca;
+ uint8_t bank;
+ uint64_t ipid;
+ uint64_t synd;
+ uint64_t status;
+};
+
struct ras_aer_event {
char timestamp[64];
const char *error_type;
--- rasdaemon-0.6.7.orig/rasdaemon.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/rasdaemon.c 2023-10-27 12:44:58.550049010 -0400
@@ -41,8 +41,21 @@ struct arguments {
int record_events;
int enable_ras;
int foreground;
+ int offline;
};
+enum OFFLINE_ARG_KEYS {
+ SMCA = 0x100,
+ MODEL,
+ FAMILY,
+ BANK_NUM,
+ IPID_REG,
+ STATUS_REG,
+ SYNDROME_REG
+};
+
+struct ras_mc_offline_event event;
+
static error_t parse_opt(int k, char *arg, struct argp_state *state)
{
struct arguments *args = state->input;
@@ -62,18 +75,84 @@ static error_t parse_opt(int k, char *ar
case 'f':
args->foreground++;
break;
+#ifdef HAVE_MCE
+ case 'p':
+ if (state->argc < 4)
+ argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR);
+ args->offline++;
+ break;
+#endif
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
+#ifdef HAVE_MCE
+static error_t parse_opt_offline(int key, char *arg,
+ struct argp_state *state)
+{
+ switch (key) {
+ case SMCA:
+ event.smca = true;
+ break;
+ case MODEL:
+ event.model = strtoul(state->argv[state->next], NULL, 0);
+ break;
+ case FAMILY:
+ event.family = strtoul(state->argv[state->next], NULL, 0);
+ break;
+ case BANK_NUM:
+ event.bank = atoi(state->argv[state->next]);
+ break;
+ case IPID_REG:
+ event.ipid = strtoull(state->argv[state->next], NULL, 0);
+ break;
+ case STATUS_REG:
+ event.status = strtoull(state->argv[state->next], NULL, 0);
+ break;
+ case SYNDROME_REG:
+ event.synd = strtoull(state->argv[state->next], NULL, 0);
+ break;
+ default:
+ return ARGP_ERR_UNKNOWN;
+ }
+ return 0;
+}
+#endif
+
long user_hz;
int main(int argc, char *argv[])
{
struct arguments args;
int idx = -1;
+
+#ifdef HAVE_MCE
+ const struct argp_option offline_options[] = {
+ {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+ {"model", MODEL, 0, 0, "CPU Model"},
+ {"family", FAMILY, 0, 0, "CPU Family"},
+ {"bank", BANK_NUM, 0, 0, "Bank Number"},
+ {"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"},
+ {"status", STATUS_REG, 0, 0, "Status Register"},
+ {"synd", SYNDROME_REG, 0, 0, "Syndrome Register"},
+ {0, 0, 0, 0, 0, 0},
+ };
+
+ struct argp offline_argp = {
+ .options = offline_options,
+ .parser = parse_opt_offline,
+ .doc = TOOL_DESCRIPTION,
+ .args_doc = ARGS_DOC,
+ };
+
+ struct argp_child offline_parser[] = {
+ {&offline_argp, 0, "Post-Processing Options:", 0},
+ {0, 0, 0, 0},
+ };
+#endif
+
const struct argp_option options[] = {
{"enable", 'e', 0, 0, "enable RAS events and exit", 0},
{"disable", 'd', 0, 0, "disable RAS events and exit", 0},
@@ -81,6 +160,10 @@ {"disable", 'd', 0, 0, "disable RAS even
{"record", 'r', 0, 0, "record events via sqlite3", 0},
#endif
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
+#ifdef HAVE_MCE
+ {"post-processing", 'p', 0, 0,
+ "Post-processing MCE's with raw register values"},
+#endif
{ 0, 0, 0, 0, 0, 0 }
};
@@ -89,7 +172,9 @@ { 0, 0, 0, 0, 0, 0 }
.parser = parse_opt,
.doc = TOOL_DESCRIPTION,
.args_doc = ARGS_DOC,
-
+#ifdef HAVE_MCE
+ .children = offline_parser,
+#endif
};
memset (&args, 0, sizeof(args));
@@ -111,6 +196,13 @@ enable = (args.enable_ras > 0) ? 1 : 0;
return 0;
}
+#ifdef HAVE_MCE
+ if (args.offline) {
+ ras_offline_mce_event(&event);
+ return 0;
+ }
+#endif
+
openlog(TOOL_NAME, 0, LOG_DAEMON);
if (!args.foreground)
if (daemon(0,0))

@ -1,51 +0,0 @@
commit 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b
Author: Muralidhara M K <muralimk@amd.com>
Date: Tue Jul 27 06:36:45 2021 -0500
rasdaemon: ras-mc-ctl: Fix script to parse dimm sizes
Removes trailing spaces at the end of a line from
file location and fixes --layout option to parse dimm nodes
to get the size of each dimm from ras-mc-ctl.
The issue is reported at https://github.com/mchehab/rasdaemon/issues/43,
where '> ras-mc-ctl --layout' reports all 0s.
With this change, the layout option prints the correct dimm sizes:
> sudo ras-mc-ctl --layout
+-----------------------------------------------+
| mc0 |
| csrow0 | csrow1 | csrow2 | csrow3 |
----------+-----------------------------------------------+
...
channel7: | 16384 MB | 0 MB | 0 MB | 0 MB |
channel6: | 16384 MB | 0 MB | 0 MB | 0 MB |
...
----------+-----------------------------------------------+
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Cc: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lkml.kernel.org/r/20210810183855.129076-1-nchatrad@amd.com/
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 1e3aeb7..b22dd60 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -246,6 +246,7 @@ sub parse_dimm_nodes
if (($file =~ /max_location$/)) {
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @temp = split(/ /, $location);
@@ -288,6 +289,7 @@ sub parse_dimm_nodes
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @pos;

@ -1,40 +0,0 @@
commit 9a5baed97b21af31064d9995ffcfaac0e9d7983e
Author: DmNosachev <quartz64@gmail.com>
Date: Tue Jun 29 13:37:48 2021 +0300
labels/supermicro: supermicro db syntax
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index bfaed93..47ea05f 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -18,17 +18,17 @@ Vendor: Supermicro
DIMMA1: 0.0.0; DIMMA2: 0.0.1;
DIMMB1: 0.1.0; DIMMB2: 0.1.1;
- Product: X10SRA-F
- DIMMA1: 0.0.0
- DIMMA2: 0.0.1
- DIMMB1: 0.1.0
- DIMMB2: 0.1.1
- DIMMC1: 1.0.0
- DIMMC2: 1.0.1
- DIMMD1: 1.1.0
- DIMMD2: 1.1.1
+ Model: X10SRA-F
+ DIMMA1: 0.0.0;
+ DIMMA2: 0.0.1;
+ DIMMB1: 0.1.0;
+ DIMMB2: 0.1.1;
+ DIMMC1: 1.0.0;
+ DIMMC2: 1.0.1;
+ DIMMD1: 1.1.0;
+ DIMMD2: 1.1.1;
- Product: H8DGU
+ Model: H8DGU
P1_DIMM1A: 0.2.0;
P1_DIMM1A: 0.3.0;
P2_DIMM1A: 3.2.0;

@ -1,230 +0,0 @@
commit 9acef39f13833f7d53ef96abc5a72e79384260f4
Author: Naveen Krishna Chatradhi <nchatrad@amd.com>
Date: Tue Jun 1 11:01:17 2021 +0530
rasdaemon: Add new SMCA bank types with error decoding
Upcoming systems with Scalable Machine Check Architecture (SMCA) have
new MCA banks added.
This patch adds the (HWID, MCATYPE) tuple, name and error decoding for
those new SMCA banks.
While at it, optimize the string names in smca_bank_name[].
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7c619fd..e0cf512 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -47,7 +47,7 @@
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
- SMCA_LS_V2, /* Load Store */
+ SMCA_LS_V2,
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -56,17 +56,22 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
- SMCA_CS_V2, /* Coherent Slave V2 */
+ SMCA_CS_V2,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_V2,
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
- SMCA_PSP_V2, /* Platform Security Processor V2 */
+ SMCA_PSP_V2,
SMCA_SMU, /* System Management Unit */
- SMCA_SMU_V2, /* System Management Unit V2 */
+ SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
+ SMCA_PCIE_V2,
+ SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_XGMI_PHY, /* xGMI PHY Unit */
+ SMCA_WAFL_PHY, /* WAFL PHY Unit */
N_SMCA_BANK_TYPES
};
@@ -237,6 +242,22 @@ static const char * const smca_umc_mce_desc[] = {
"Command/address parity error",
"Write data CRC error",
};
+
+static const char * const smca_umc2_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error",
+ "SDP parity error",
+ "Reserved",
+ "Address/Command parity error",
+ "Write data parity error",
+ "DCQ SRAM ECC error",
+ "Reserved",
+ "Read data parity error",
+ "Rdb SRAM ECC error",
+ "RdRsp SRAM ECC error",
+ "LM32 MP errors",
+};
+
/* Parameter Block */
static const char * const smca_pb_mce_desc[] = {
"Parameter Block RAM ECC error",
@@ -314,6 +335,55 @@ static const char * const smca_pcie_mce_desc[] = {
"CCIX Non-okay write response with data error",
};
+static const char * const smca_pcie2_mce_desc[] = {
+ "SDP Parity Error logging",
+};
+
+static const char * const smca_xgmipcs_mce_desc[] = {
+ "Data Loss Error",
+ "Training Error",
+ "Flow Control Acknowledge Error",
+ "Rx Fifo Underflow Error",
+ "Rx Fifo Overflow Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "Tx Vcid Data Error",
+ "Replay Buffer Parity Error",
+ "Data Parity Error",
+ "Replay Fifo Overflow Error",
+ "Replay Fifo Underflow Error",
+ "Elastic Fifo Overflow Error",
+ "Deskew Error",
+ "Flow Control CRC Error",
+ "Data Startup Limit Error",
+ "FC Init Timeout Error",
+ "Recovery Timeout Error",
+ "Ready Serial Timeout Error",
+ "Ready Serial Attempt Error",
+ "Recovery Attempt Error",
+ "Recovery Relock Attempt Error",
+ "Replay Attempt Error",
+ "Sync Header Error",
+ "Tx Replay Timeout Error",
+ "Rx Replay Timeout Error",
+ "LinkSub Tx Timeout Error",
+ "LinkSub Rx Timeout Error",
+ "Rx CMD Pocket Error",
+};
+
+static const char * const smca_xgmiphy_mce_desc[] = {
+ "RAM ECC Error",
+ "ARC instruction buffer parity error",
+ "ARC data buffer parity error",
+ "PHY APB error",
+};
+
+static const char * const smca_waflphy_mce_desc[] = {
+ "RAM ECC Error",
+ "ARC instruction buffer parity error",
+ "ARC data buffer parity error",
+ "PHY APB error",
+};
struct smca_mce_desc {
const char * const *descs;
@@ -333,6 +403,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
@@ -341,6 +412,10 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
+ [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
+ [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
+ [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
+ [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) },
};
struct smca_hwid {
@@ -369,6 +444,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, 0x00000096 },
+ /* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
+ { SMCA_UMC_V2, 0x00010096 },
/* Parameter Block MCA type */
{ SMCA_PB, 0x00000005 },
@@ -389,6 +466,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* PCI Express Unit MCA type */
{ SMCA_PCIE, 0x00000046 },
+ { SMCA_PCIE_V2, 0x00010046 },
+
+ /* Ext Global Memory Interconnect PCS MCA type */
+ { SMCA_XGMI_PCS, 0x00000050 },
+
+ /* Ext Global Memory Interconnect PHY MCA type */
+ { SMCA_XGMI_PHY, 0x00000259 },
+
+ /* WAFL PHY MCA type */
+ { SMCA_WAFL_PHY, 0x00000267 },
};
struct smca_bank_name {
@@ -396,27 +483,28 @@ struct smca_bank_name {
};
static struct smca_bank_name smca_names[] = {
- [SMCA_LS] = { "Load Store Unit" },
- [SMCA_LS_V2] = { "Load Store Unit" },
- [SMCA_IF] = { "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "L2 Cache" },
- [SMCA_DE] = { "Decode Unit" },
- [SMCA_RESERVED] = { "Reserved" },
- [SMCA_EX] = { "Execution Unit" },
- [SMCA_FP] = { "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "L3 Cache" },
- [SMCA_CS] = { "Coherent Slave" },
- [SMCA_CS_V2] = { "Coherent Slave" },
- [SMCA_PIE] = { "Power, Interrupts, etc." },
- [SMCA_UMC] = { "Unified Memory Controller" },
- [SMCA_PB] = { "Parameter Block" },
- [SMCA_PSP] = { "Platform Security Processor" },
- [SMCA_PSP_V2] = { "Platform Security Processor" },
- [SMCA_SMU] = { "System Management Unit" },
- [SMCA_SMU_V2] = { "System Management Unit" },
- [SMCA_MP5] = { "Microprocessor 5 Unit" },
- [SMCA_NBIO] = { "Northbridge IO Unit" },
- [SMCA_PCIE] = { "PCI Express Unit" },
+ [SMCA_LS ... SMCA_LS_V2] = { "Load Store Unit" },
+ [SMCA_IF] = { "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "L2 Cache" },
+ [SMCA_DE] = { "Decode Unit" },
+ [SMCA_RESERVED] = { "Reserved" },
+ [SMCA_EX] = { "Execution Unit" },
+ [SMCA_FP] = { "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "L3 Cache" },
+ [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" },
+ [SMCA_PIE] = { "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_UMC_V2] = { "Unified Memory Controller V2" },
+ [SMCA_PB] = { "Parameter Block" },
+ [SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
+ [SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
+ [SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "Northbridge IO Unit" },
+ [SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" },
+ [SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
+ [SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
+
};
static void amd_decode_errcode(struct mce_event *e)

@ -1,24 +0,0 @@
commit 9bd84aef87978b806178a73ed33c39d6c442fc1f
Author: weidong <weidongkl@sina.com>
Date: Tue Aug 8 08:59:12 2023 +0000
add ':' before error output
In every section except disk errors, the error output is preceded by a colon.
Signed-off-by: weidong <weidongkl@sina.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index dc326d3..13078c2 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1469,7 +1469,7 @@ sub errors
$out .= "\n";
}
if ($out ne "") {
- print "Disk errors\n$out\n";
+ print "Disk errors:\n$out\n";
} else {
print "No disk errors.\n\n";
}

@ -1,117 +0,0 @@
commit 9c86f6255f67a8bae28cd46c54500fc16bfc7a30
Author: Yang Shi <shy828301@gmail.com>
Date: Mon Apr 4 16:34:05 2022 -0700
rasdaemon: use the new block_rq_error tracepoint
Since Linux 5.18-rc1, a new block tracepoint called block_rq_error is
available, dedicated to tracing disk error events. Currently
rasdaemon uses block_rq_complete, which also traces successful completions.
This produces excessive trace logs and some overhead, since the event is
triggered quite often.
Use the new tracepoint for disk error reporting, and the new trace point
has the same format as block_rq_complete.
Signed-off-by: Yang Shi <shy828301@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
ras-events.c | 53 ++++++++++-------------------------------------------
ras-record.c | 2 +-
2 files changed, 11 insertions(+), 44 deletions(-)
--- rasdaemon-0.6.7.orig/ras-events.c 2024-05-14 11:05:40.020599541 -0400
+++ rasdaemon-0.6.7/ras-events.c 2024-05-14 11:06:38.831067957 -0400
@@ -27,6 +27,7 @@ * Foundation, Inc., 51 Franklin Street,
#include <sys/poll.h>
#include <signal.h>
#include <sys/signalfd.h>
+#include <linux/version.h>
#include "libtrace/kbuffer.h"
#include "libtrace/event-parse.h"
#include "ras-mc-handler.h"
@@ -229,7 +230,7 @@ if (rc < 0) {
#endif
#ifdef HAVE_DISKERROR
- rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable);
+ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_error", enable);
#endif
#ifdef HAVE_MEMORY_FAILURE
@@ -241,37 +242,6 @@ free_ras:
return rc;
}
-/*
- * Set kernel filter. libtrace doesn't provide an API for setting filters
- * in kernel, we have to implement it here.
- */
-static int filter_ras_mc_event(struct ras_events *ras, char *group, char *event,
- const char *filter_str)
-{
- int fd, rc;
- char fname[MAX_PATH + 1];
-
- snprintf(fname, sizeof(fname), "events/%s/%s/filter", group, event);
- fd = open_trace(ras, fname, O_RDWR | O_APPEND);
- if (fd < 0) {
- log(ALL, LOG_WARNING, "Can't open filter file\n");
- return errno;
- }
-
- rc = write(fd, filter_str ,strlen(filter_str));
- if (rc < 0) {
- log(ALL, LOG_WARNING, "Can't write to filter file\n");
- close(fd);
- return rc;
- }
- close(fd);
- if (!rc) {
- log(ALL, LOG_WARNING, "Nothing was written on filter file\n");
- return EIO;
- }
-
- return 0;
-}
/*
* Tracing read code
@@ -901,17 +871,14 @@ (void)open("/sys/kernel/debug/ras/daemon
#endif
#ifdef HAVE_DISKERROR
- rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0");
- if (!rc) {
- rc = add_event_handler(ras, pevent, page_size, "block",
- "block_rq_complete", ras_diskerror_event_handler,
- NULL, DISKERROR_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "block", "block_rq_complete");
- }
+ rc = add_event_handler(ras, pevent, page_size, "block",
+ "block_rq_error", ras_diskerror_event_handler,
+ NULL, DISKERROR_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "block", "block_rq_error");
#endif
#ifdef HAVE_MEMORY_FAILURE
--- rasdaemon-0.6.7.orig/ras-record.c 2024-05-14 11:07:24.573654494 -0400
+++ rasdaemon-0.6.7/ras-record.c 2024-05-14 11:07:07.626807674 -0400
@@ -456,7 +456,7 @@ return 0;
#endif
/*
- * Table and functions to handle block:block_rq_complete
+ * Table and functions to handle block:block_rq_error
*/
#ifdef HAVE_DISKERROR

@ -1,159 +0,0 @@
commit aa36c96cd52d775570dae989dd95a060f1149077
Author: Avadhut Naik <avadnaik@amd.com>
Date: Mon Apr 24 20:35:56 2023 +0000
rasdaemon: Handle reassigned bit definitions for CS SMCA
Currently, on AMD systems with Scalable MCA (SMCA), each machine check
error of a SMCA bank type has an associated bit position in the bank's
control (CTL) register used for enabling / disabling reporting of that
error. An error's bit position in the CTL register is also used
during error decoding for offsetting into the corresponding bank's error
description structure. As new errors are being added in newer AMD systems
for existing SMCA bank types, the underlying SMCA architecture guarantees
that the bit positions of existing errors are not altered.
However, on some AMD systems viz. Genoa, some of the existing bit
definitions in the CTL register of the Coherent Slave (CS) SMCA bank type
are reassigned without defining new HWID and McaType. Consequently, the
very errors whose bit definitions have been reassigned in the CTL register
are being erroneously decoded.
As a solution, create a new software defined SMCA bank type by utilizing
one of the hardware-reserved values for HWID. The new SMCA bank type will
only be employed for CS error decoding on affected CPU models.
Additionally, since the existing error description structure for the CS
SMCA bank type is still valid, add new error description structure to
compensate for the reassigned bit definitions.
Signed-off-by: Avadhut Naik <avadnaik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7ec787a..e81f732 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -57,6 +57,7 @@ enum smca_bank_types {
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
SMCA_CS_V2,
+ SMCA_CS_V2_QUIRK,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
@@ -259,6 +260,31 @@ static const char * const smca_cs2_mce_desc[] = {
"Hardware Assert Error",
};
+/*
+ * Per Genoa's revision guide, erratum 1384, existing bit definitions
+ * are reassigned for SMCA CS bank type.
+ */
+static const char * const smca_cs2_quirk_mce_desc[] = {
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "SDP read response had no match in the CS queue",
+ "SDP read response had an unexpected RETRY error",
+ "Counter overflow error",
+ "Counter underflow error",
+ "Probe Filter Protocol Error",
+ "Probe Filter ECC Error",
+ "Illegal Request on the no data channel",
+ "Address Violation on the no data channel",
+ "Security Violation on the no data channel",
+ "Hardware Assert Error",
+};
+
static const char * const smca_pie_mce_desc[] = {
"Hardware assert",
"Register security violation",
@@ -549,6 +575,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
+ [SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
@@ -597,6 +624,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Data Fabric MCA types */
{ SMCA_CS, 0x0000002E },
{ SMCA_CS_V2, 0x0002002E },
+ {SMCA_CS_V2_QUIRK, 0x00010000 },
{ SMCA_PIE, 0x0001002E },
/* Unified Memory Controller MCA type */
@@ -660,7 +688,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_EX] = { "Execution Unit" },
[SMCA_FP] = { "Floating Point Unit" },
[SMCA_L3_CACHE] = { "L3 Cache" },
- [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" },
+ [SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
@@ -723,8 +751,38 @@ static int find_hbm_channel(struct mce_event *e)
return (umc % 2) ? tmp + 4 : tmp;
}
+static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
+{
+ if (m->family == 0x19) {
+ switch (m->model) {
+ /*
+ * Per Genoa's revision guide, erratum 1384, some SMCA Extended
+ * Error Codes and SMCA Control bits are incorrect for SMCA CS
+ * bank type.
+ */
+ case 0x10 ... 0x1F:
+ case 0x60 ... 0x7B:
+ case 0xA0 ... 0xAF:
+ if (*hwid_mcatype == 0x0002002E)
+ *hwid_mcatype = 0x00010000;
+ break;
+ default:
+ break;
+ }
+ } else if (m->family == 0x1A) {
+ switch (m->model) {
+ case 0x40 ... 0x4F:
+ if (*hwid_mcatype == 0x0002002E)
+ *hwid_mcatype = 0x00010000;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
/* Decode extended errors according to Scalable MCA specification */
-static void decode_smca_error(struct mce_event *e)
+static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
{
enum smca_bank_types bank_type;
const char *ip_name;
@@ -735,6 +793,8 @@ static void decode_smca_error(struct mce_event *e)
unsigned int csrow = -1, channel = -1;
unsigned int i;
+ fixup_hwid(m, &mcatype_hwid);
+
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
if (mcatype_hwid == s_hwid->mcatype_hwid) {
@@ -801,7 +861,7 @@ int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
if (mcgstatus & MCG_STATUS_MCIP)
mce_snprintf(e->mcgstatus_msg, "MCIP");
- decode_smca_error(e);
+ decode_smca_error(e, ras->mce_priv);
amd_decode_errcode(e);
return 0;
}

@ -1,107 +0,0 @@
commit aecf33aa70331670c06db6b652712b476e24051c
Author: Muralidhara M K <muralimk@amd.com>
Date: Mon Jul 12 05:40:46 2021 -0500
rasdaemon: Enumerate memory on noncpu nodes
Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
banks) connected via xGMI links to the CPUs.
The node id information is available in InstanceHI[47:44] of
the IPID register.
The UMC PHYs on Aldebaran nodes are enumerated as csrows.
The UMC channels connected to HBMs are enumerated as ranks.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 3c346f4..f3379fc 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -78,6 +78,12 @@ enum smca_bank_types {
/* Maximum number of MCA banks per CPU. */
#define MAX_NR_BANKS 64
+/*
+ * On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
+ * via xGMI links, the NON CPU Nodes are enumerated from index 8
+ */
+#define NONCPU_NODE_INDEX 8
+
/* SMCA Extended error strings */
/* Load Store */
static const char * const smca_ls_mce_desc[] = {
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
{
return EXTRACT(e->ipid, 0, 31) >> 20;
}
+
+/*
+ * The HBM memory managed by the UMCCH of the noncpu node
+ * can be calculated based on the [15:12]bits of IPID
+ */
+static int find_hbm_channel(struct mce_event *e)
+{
+ int umc, tmp;
+
+ umc = EXTRACT(e->ipid, 0, 31) >> 20;
+
+ /*
+ * The HBM channel managed by the UMC of the noncpu node
+ * can be calculated based on the [15:12]bits of IPID as follows
+ */
+ tmp = ((e->ipid >> 12) & 0xf);
+
+ return (umc % 2) ? tmp + 4 : tmp;
+}
+
/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e)
{
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
unsigned short xec = (e->status >> 16) & 0x3f;
const struct smca_hwid *s_hwid;
uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
unsigned int csrow = -1, channel = -1;
unsigned int i;
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
bank_type = s_hwid->bank_type;
break;
}
+ if (mcatype_instancehi >= NONCPU_NODE_INDEX)
+ bank_type = SMCA_UMC_V2;
}
- if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
+ if (i >= MAX_NR_BANKS) {
strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
return;
}
- if (bank_type >= N_SMCA_BANK_TYPES) {
+ if (bank_type >= MAX_NR_BANKS) {
strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
return;
}
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
channel, csrow);
}
+
+ if (bank_type == SMCA_UMC_V2 && xec == 0) {
+ /* The UMCPHY is reported as csrow in case of noncpu nodes */
+ csrow = find_umc_channel(e) / 2;
+ /* UMCCH is managing the HBM memory */
+ channel = find_hbm_channel(e);
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
+
}
int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)

@ -1,30 +0,0 @@
commit b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4
Author: DmNosachev <quartz64@gmail.com>
Date: Tue Jun 29 13:48:55 2021 +0300
labels/supermicro: added Supermicro X10DRI(-T)
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 47ea05f..86e4617 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -81,4 +81,14 @@ Vendor: Supermicro
P2-DIMMC1: 2.2.0;
P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1;
P2-DIMME1: 3.1.0;
- P2-DIMMF1: 3.2.0;
\ No newline at end of file
+ P2-DIMMF1: 3.2.0;
+
+ Model: X10DRI, X10DRI-T
+ P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1;
+ P1-DIMMB1: 0.1.0; P1-DIMMB2: 0.1.1;
+ P1-DIMMC1: 0.2.0; P1-DIMMC2: 0.2.1;
+ P1-DIMMD1: 0.3.0; P1-DIMMD2: 0.3.1;
+ P2-DIMME1: 1.0.0; P2-DIMME2: 1.0.1;
+ P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1;
+ P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1;
+ P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1;
\ No newline at end of file

@ -1,208 +0,0 @@
commit b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Thu Aug 31 02:23:48 2023 -0500
rasdaemon: Fix SMCA bank type decoding
On AMD systems with Scalable MCA (SMCA), the (HWID, MCATYPE) tuple from
the MCA_IPID MSR, bits 43:32 and 63:48 respectively, are used for SMCA
bank type decoding. On occurrence of an SMCA error, the cached tuples are
compared against the tuple read from the MCA_IPID MSR to determine the
SMCA bank type.
Currently however, all high 32 bits of the MCA_IPID register are cached in
the rasdaemon for all SMCA bank types. Bits 47:44 which do not play a part
in bank type decoding are zeroed out. Likewise, when an SMCA error occurs,
all high 32 bits of the MCA_IPID register are read and compared against
the cached values in smca_hwid_mcatypes array.
This can lead to erroneous bank type decoding since the bits 47:44 are
not guaranteed to be zero. They are either reserved or, on some modern
AMD systems viz. Genoa, denote the InstanceIdHi value. The bits therefore,
should not be associated with SMCA bank type decoding.
Import the HWID_MCATYPE macro from the kernel to ensure that only the
relevant fields i.e. (HWID, MCATYPE) tuples are used for SMCA bank type
decoding on occurrence of an SMCA error.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index a20f03c..55620e2 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -90,6 +90,12 @@ enum smca_bank_types {
/* Maximum number of MCA banks per CPU. */
#define MAX_NR_BANKS 64
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF
+
+/* Obtain HWID_MCATYPE Tuple on SMCA Systems */
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
/*
* On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
* via xGMI links, the NON CPU Nodes are enumerated from index 8
@@ -699,76 +705,76 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, mcatype_hwid } */
/* ZN Core (HWID=0xB0) MCA types */
- { SMCA_LS, 0x000000B0 },
- { SMCA_LS_V2, 0x001000B0 },
- { SMCA_IF, 0x000100B0 },
- { SMCA_L2_CACHE, 0x000200B0 },
- { SMCA_DE, 0x000300B0 },
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
- { SMCA_EX, 0x000500B0 },
- { SMCA_FP, 0x000600B0 },
- { SMCA_L3_CACHE, 0x000700B0 },
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
/* Data Fabric MCA types */
- { SMCA_CS, 0x0000002E },
- { SMCA_CS_V2, 0x0002002E },
- {SMCA_CS_V2_QUIRK, 0x00010000 },
- { SMCA_PIE, 0x0001002E },
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_CS_V2_QUIRK, HWID_MCATYPE(0x0, 0x1) },
/* Unified Memory Controller MCA type */
- { SMCA_UMC, 0x00000096 },
- { SMCA_UMC_QUIRK, 0x00020000 },
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_QUIRK, HWID_MCATYPE(0x0, 0x2) },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
- { SMCA_UMC_V2, 0x00010096 },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
/* Memory Attached Last Level Cache */
- { SMCA_MA_LLC, 0x0004002E },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Parameter Block MCA type */
- { SMCA_PB, 0x00000005 },
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
/* Platform Security Processor MCA type */
- { SMCA_PSP, 0x000000FF },
- { SMCA_PSP_V2, 0x000100FF },
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
/* System Management Unit MCA type */
- { SMCA_SMU, 0x00000001 },
- { SMCA_SMU_V2, 0x00010001 },
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
/* Microprocessor 5 Unit MCA type */
- { SMCA_MP5, 0x00020001 },
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
/* MPDMA MCA Type */
- { SMCA_MPDMA, 0x00030001 },
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
/* Northbridge IO Unit MCA type */
- { SMCA_NBIO, 0x00000018 },
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
/* PCI Express Unit MCA type */
- { SMCA_PCIE, 0x00000046 },
- { SMCA_PCIE_V2, 0x00010046 },
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
/* Ext Global Memory Interconnect PCS MCA type */
- { SMCA_XGMI_PCS, 0x00000050 },
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
- { SMCA_NBIF, 0x0000006C },
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
- { SMCA_SHUB, 0x00000080 },
- { SMCA_SATA, 0x000000A8 },
- { SMCA_USB, 0x000000AA },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
/* Ultra Short Reach Data and Control Plane Controller */
- { SMCA_USR_DP, 0x00000170 },
- { SMCA_USR_CP, 0x00000180 },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
- { SMCA_GMI_PCS, 0x00000241 },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
/* Ext Global Memory Interconnect PHY MCA type */
- { SMCA_XGMI_PHY, 0x00000259 },
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
/* WAFL PHY MCA type */
- { SMCA_WAFL_PHY, 0x00000267 },
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
- { SMCA_GMI_PHY, 0x00000269 },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};
struct smca_bank_name {
@@ -862,12 +868,12 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
case 0x10 ... 0x1F:
case 0x60 ... 0x7B:
case 0xA0 ... 0xAF:
- if (*hwid_mcatype == 0x0002002E)
- *hwid_mcatype = 0x00010000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
break;
case 0x90 ... 0x9F:
- if ((*hwid_mcatype & 0xFF) == 0x00000096)
- *hwid_mcatype = 0x00020000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x96, 0x0))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x2);
break;
default:
break;
@@ -875,8 +881,8 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
} else if (m->family == 0x1A) {
switch (m->model) {
case 0x40 ... 0x4F:
- if (*hwid_mcatype == 0x0002002E)
- *hwid_mcatype = 0x00010000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
break;
default:
break;
@@ -889,13 +895,17 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
+ uint32_t mcatype_hwid = 0;
unsigned short xec = (e->status >> 16) & 0x3f;
const struct smca_hwid *s_hwid;
- uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ uint32_t ipid_high = EXTRACT(e->ipid, 32, 63);
uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
unsigned int csrow = -1, channel = -1;
unsigned int i;
+ mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
+ (ipid_high & MCI_IPID_MCATYPE) >> 16);
+
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {

@ -1,37 +0,0 @@
commit c785d309dcbdeb7ecd219975244f3944a8d047e9
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Thu Jul 27 10:18:12 2023 +0000
rasdaemon: Identify the Die Number in multi-die systems
Some AMD systems have 4 dies in each socket, and the Die ID indicates
whether the error occurred on a CPU die or a GPU die.
The respective Die is also used for FRU identification.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 54060ee..a20f03c 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -935,10 +935,15 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
xec);
if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) {
- channel = find_umc_channel(e);
- csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
- mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
- channel, csrow);
+ if ((m->family == 0x19) && (m->model >= 0x90 && m->model <= 0x9f)) {
+ /* MCA_IPID[InstanceIdHi] gives the AMD Node Die ID */
+ mce_snprintf(e->mc_location, "memory_die_id=%d", mcatype_instancehi / 4);
+ } else {
+ channel = find_umc_channel(e);
+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
}
if (bank_type == SMCA_UMC_V2 && xec == 0) {

@ -1,94 +0,0 @@
commit ced615cf8146f51b5d6fe7a29107a2adc77407ca
Author: Sathya Priya Kumar <sathyapriya.k@amd.com>
Date: Thu Jan 11 01:20:07 2024 -0600
rasdaemon: Add error decoding for MCA_CTL_SMU extended bits
Enable error decoding support for the newly added extended
error bit descriptions from MCA_CTL_SMU.
b'0:11 can be decoded from existing array smca_smu2_mce_desc.
Define a function to append the newly defined b'58:62 to the
smca_smu2_mce_desc. This avoids having to maintain the Reserved bits
b'12:57 in the code.
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
mce-amd-smca.c | 33 ++++++++++++++++++++++++++++++++-
ras-mce-handler.h | 1 +
2 files changed, 33 insertions(+), 1 deletion(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-06-28 10:34:16.453522865 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-06-28 10:34:46.049124270 -0400
@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d
"An ECC or parity error in an SMU RAM instance",
};
-static const char * const smca_smu2_mce_desc[] = {
+static const char * smca_smu2_mce_desc[64] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
"Data Cache Bank A ECC or parity error",
@@ -409,6 +409,15 @@ static const char * const smca_smu2_mce_
"Instruction Tag Cache Bank A ECC or parity error",
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
+ "PHY RAS ECC Error",
+};
+
+static const char * smca_smu2_ext_mce_desc[] = {
+ "A correctable error from a GFX Sub-IP",
+ "A fatal error from a GFX Sub-IP",
+ "Reserved",
+ "Reserved",
+ "A poison error from a GFX Sub-IP",
};
static const char * const smca_mp5_mce_desc[] = {
@@ -815,6 +824,27 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
+void smca_smu2_ext_err_desc(void)
+{
+ int i, j;
+ int smu2_bits = 62;
+
+ /*
+ * MCA_CTL_SMU error strings are defined for b'58:59 and b'62
+ * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU
+ *
+ * b'0:11 can be decoded from existing array smca_smu2_mce_desc.
+ * b'12:57 are Reserved and b'58:62 are appended to the
+ * smca_smu2_mce_desc.
+ */
+ for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) {
+ for ( ; i < 58; i++)
+ smca_smu2_mce_desc[i] = "Reserved";
+
+ smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j];
+ }
+}
+
void amd_decode_errcode(struct mce_event *e)
{
@@ -906,6 +936,7 @@ unsigned short xec = (e->status >> 16) &
mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
(ipid_high & MCI_IPID_MCATYPE) >> 16);
+ smca_smu2_ext_err_desc();
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-06-28 10:34:16.453522865 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-06-28 10:34:17.795508302 -0400
@@ -121,6 +121,7 @@ int set_intel_imc_log(enum cputype cputy
/* Undertake AMD SMCA Error Decoding */
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
void amd_decode_errcode(struct mce_event *e);
+void smca_smu2_ext_err_desc(void);
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);

@ -1,42 +0,0 @@
commit d0e0bb3d73c4bc5060da20270a089857bba2a64c
Author: Justin Vreeland <vreeland.justin@gmail.com>
Date: Tue Nov 2 19:51:50 2021 -0700
Update ras-mc-ctl manpage to match current options
Signed-off-by: Justin Vreeland <vreeland.justin@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/man/ras-mc-ctl.8.in b/man/ras-mc-ctl.8.in
index 26230e0..a605122 100644
--- a/man/ras-mc-ctl.8.in
+++ b/man/ras-mc-ctl.8.in
@@ -79,9 +79,27 @@ Specify an alternate location for the labels database.
Specify a delay of \fBtime\fR seconds before registering DIMM labels.
Only meaninful if used together with --register-labels.
.TP
-.BI "--layout
+.BI "--layout"
Prints the memory layout as detected by the EDAC driver. Useful to check
if the EDAC driver is properly detecting the memory controller architecture.
+.TP
+.BI "--summary"
+Presents a summary of the logged errors.
+.TP
+.BI "--errors"
+Shows the errors stored in the error database.
+.TP
+.BI "--error-count"
+Shows the corrected and uncorrected error counts using sysfs.
+.TP
+.BI "--vendor-errors-summary="platform-id
+Presents a summary of the vendor-specific logged errors.
+.TP
+.BI "--vendor-errors="platform-id
+Shows the vendor-specific errors stored in the error database.
+.TP
+.BI "--vendor-platforms"
+Shows the supported platforms with platform-ids for the vendor-specific errors.
.SH MAINBOARD CONFIGURATION
.PP

@ -1,27 +0,0 @@
commit dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 12:55:54 2021 +0200
Add support for multi-arch builds
Allow building rasdaemon on several architectures:
- x86_64
- arm 64
- ppc 64 LE
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 747a844..898687c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,6 +9,9 @@ jobs:
Ubuntu:
name: Ubuntu
runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ arch: [x64_64, aarch64, ppc64le]
steps:
- uses: actions/checkout@v2
- name: prepare

@ -1,31 +0,0 @@
commit ec443ec0add059fa897f844349e1a2345d81713c
Author: DmNosachev <quartz64@gmail.com>
Date: Tue Jun 29 11:33:10 2021 +0300
labels/supermicro: added x11dph-i labels
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 3fd6fee..bfaed93 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -68,3 +68,17 @@ Vendor: Supermicro
P1_DIMM4B: 1.1.1;
P2_DIMM4B: 2.0.1;
P2_DIMM4B: 2.1.1;
+
+ Model: X11DPH-i
+ P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1;
+ P1-DIMMB1: 0.1.0;
+ P1-DIMMC1: 0.2.0;
+ P1-DIMMD1: 1.0.0; P1-DIMMD2: 1.0.1;
+ P1-DIMME1: 1.1.0;
+ P1-DIMMF1: 1.2.0;
+ P2-DIMMA1: 2.0.0; P2-DIMMA2: 2.0.1;
+ P2-DIMMB1: 2.1.0;
+ P2-DIMMC1: 2.2.0;
+ P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1;
+ P2-DIMME1: 3.1.0;
+ P2-DIMMF1: 3.2.0;
\ No newline at end of file

@ -1,48 +0,0 @@
commit f7cdd720297cd17e405a7170c04df89d1d9536f8
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 12:35:55 2021 +0200
Add a github workflow for CI automation
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..5b3e757
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,34 @@
+name: CI
+
+# Should run only on branches and PR, as "on_tag.yml" will handle tags
+on:
+ push:
+ branches: master test
+ pull_request:
+ branches: master
+
+jobs:
+
+#
+# Linux
+#
+ Ubuntu:
+ name: Ubuntu
+ runs-on: ubuntu-20.04
+ strategy:
+ matrix:
+ arch: [x64_64, aarch64, armv7, ppc64le]
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ arch: ${{ matrix.arch }}
+ - name: prepare
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential sqlite3
+ - name: build
+ run: |
+ autoreconf -vfi
+ ./configure --enable-all
+ make
+ sudo make install

@ -1,30 +0,0 @@
commit fc1dd37d422fc907416afd028514fff59b63ae12
Author: DmNosachev <quartz64@gmail.com>
Date: Wed Jun 30 16:49:18 2021 +0300
labels/supermicro: added Supermicro B1DRi
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 373de07..b924a32 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -105,4 +105,14 @@ Vendor: Supermicro
P2-DIMMC1: 2.2.0;
P2-DIMMD1: 3.0.0;
P2-DIMME1: 3.1.0;
- P2-DIMMF1: 3.2.0;
\ No newline at end of file
+ P2-DIMMF1: 3.2.0;
+
+ Model: B1DRi
+ P1_DIMMA1: 0.0.0;
+ P1_DIMMB1: 0.1.0;
+ P1_DIMMC1: 0.2.0;
+ P1_DIMMD1: 0.3.0;
+ P2_DIMME1: 1.0.0;
+ P2_DIMMF1: 1.1.0;
+ P2_DIMMG1: 1.2.0;
+ P2_DIMMH1: 1.3.0;
\ No newline at end of file

@ -1,28 +0,0 @@
commit fcdffdcb28ece67ed78e3575a3dce45d9dd4f015
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 10:37:52 2021 +0200
rasdaemon.spec.in: Fix the description on this example file
While this is used just to test if building it is OK, better
to keep the logs nice ;-)
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in
index 6ef223f..afa4359 100644
--- a/misc/rasdaemon.spec.in
+++ b/misc/rasdaemon.spec.in
@@ -61,10 +61,10 @@ rm INSTALL %{buildroot}/usr/include/*.h
%changelog
* Wed May 26 2021 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> 0.6.7-1
-- Bump to version 0.6.5 with several fixes and additions
+- Bump to version 0.6.7 with several fixes and additions
* Tue Jul 21 2020 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> 0.6.6-1
-- Bump to version 0.6.5 with several fixes, new hip08 events and memory prediction analysis
+- Bump to version 0.6.6 with several fixes, new hip08 events and memory prediction analysis
* Wed Nov 20 2019 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> 0.6.5-1
- Bump to version 0.6.5 with several fixes and improves PCIe events record

@ -1,263 +0,0 @@
Add labels directory from upstream
Labels directory doesn't get exported by tarball releases.
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
---
labels/asus | 20 +++++++
labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
labels/supermicro | 70 ++++++++++++++++++++++++
3 files changed, 242 insertions(+)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.7/labels/asus 2022-02-08 15:44:53.563362010 -0500
@@ -0,0 +1,20 @@
+# RASDAEMON Motherboard DIMM labels Database file.
+#
+# Vendor-name and model-name are found from the program 'dmidecode'
+# labels are found from the silk screen on the motherboard.
+#
+#Vendor: <vendor-name>
+# Product: <product-name>
+# Model: <model-name>
+# <label>: <mc>.<top>.<mid>.<low>
+#
+#
+#Vendor: <vendor-name>
+# Model: <model-name>
+# <label>: <mc>.<row>.<channel>
+#
+
+Vendor: ASUSTeK COMPUTER INC.
+ Model: PRIME X570-PRO
+ DIMM_A1: 0.0.1, 0.1.1; DIMM_A2: 0.2.1, 0.3.1;
+ DIMM_B1: 0.0.0, 0.1.0; DIMM_B2: 0.2.0, 0.3.0;
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.7/labels/dell 2022-02-08 15:44:53.564361999 -0500
@@ -0,0 +1,152 @@
+# RASDAEMON Motherboard DIMM labels Database file.
+#
+# Vendor-name and model-name are found from the program 'dmidecode'
+# labels are found from the silk screen on the motherboard.
+#
+#Vendor: <vendor-name>
+# Product: <product-name>
+# Model: <model-name>
+# <label>: <mc>.<top>.<mid>.<low>
+#
+
+Vendor: Dell Inc.
+# 1-socket
+ Product: PowerEdge R220, PowerEdge R330, PowerEdge T330, PowerEdge R230, PowerEdge T130, PowerEdge T30
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1;
+ DIMM_A3: 0.1.0; DIMM_A4: 0.1.1;
+
+ Product: PowerEdge T110 II, PowerEdge T20
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0;
+
+ DIMM_B1: 0.0.1; DIMM_B2: 0.1.1;
+
+ Product: PowerEdge R320, PowerEdge T320
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
+ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1;
+
+# 2-socket
+ Product: PowerEdge R610
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2;
+ DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2;
+ DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2;
+
+ Product: PowerEdge T710, PowerEdge R710
+ DIMM_A3: 0.0.0; DIMM_A2: 0.1.0; DIMM_A1: 0.2.0;
+ DIMM_A6: 0.0.1; DIMM_A5: 0.1.1; DIMM_A4: 0.2.1;
+ DIMM_A9: 0.0.2; DIMM_A8: 0.1.2; DIMM_A7: 0.2.2;
+
+ DIMM_B3: 1.0.0; DIMM_B2: 1.1.0; DIMM_B1: 1.2.0;
+ DIMM_B6: 1.0.1; DIMM_B5: 1.1.1; DIMM_B4: 1.2.1;
+ DIMM_B9: 1.0.2; DIMM_B8: 1.1.2; DIMM_B7: 1.2.2;
+
+ Product: PowerEdge R620, PowerEdge T620, PowerEdge R720xd, PowerEdge R730xd, PowerEdge T630, PowerEdge R730, PowerEdge R630, PowerEdge T620, PowerEdge M620, PowerEdge FC620, PowerEdge M630, PowerEdge FC630
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
+
+ Product: PowerEdge R640, PowerEdge R740, PowerEdge R740xd, PowerEdge T640
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1;
+
+ Product: PowerEdge M520, PowerEdge R420, PowerEdge T420
+ DIMM_A1: 0.1.0; DIMM_A2: 0.2.0; DIMM_A3: 0.3.0;
+ DIMM_A4: 0.1.1; DIMM_A5: 0.2.1; DIMM_A6: 0.3.1;
+
+ DIMM_B1: 1.1.0; DIMM_B2: 1.2.0; DIMM_B3: 1.3.0;
+ DIMM_B4: 1.1.1; DIMM_B5: 1.2.1; DIMM_B6: 1.3.1;
+
+ Product: PowerEdge FC420, PowerEdge M420
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0;
+
+ Product: PowerEdge C6320, PowerEdge C4130
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+
+ Product: PowerEdge C6320p
+ A1: 0.0.0; B1: 0.1.0; C1: 0.2.0;
+ D1: 1.0.0; E1: 1.1.0; F1: 1.2.0;
+
+ Product: PowerEdge C6420
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 1.0.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 3.0.1;
+
+ Product: PowerEdge R430, PowerEdge T430, PowerEdge R530
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+
+ Product: PowerEdge FC430
+ DIMM_A1: 0.1.0; DIMM_A2: 0.0.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+
+ DIMM_B1: 1.1.0; DIMM_B2: 1.0.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+
+# 4-socket
+ Product: PowerEdge M820, PowerEdge R830, PowerEdge M830, PowerEdge R930, PowerEdge FC830
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
+
+ DIMM_C1: 2.0.0; DIMM_C2: 2.1.0; DIMM_C3: 2.2.0; DIMM_C4: 2.3.0;
+ DIMM_C5: 2.0.1; DIMM_C6: 2.1.1; DIMM_C7: 2.2.1; DIMM_C8: 2.3.1;
+ DIMM_C9: 2.0.2; DIMM_C10: 2.1.2; DIMM_C11: 2.2.2; DIMM_C12: 2.3.2;
+
+ DIMM_D1: 3.0.0; DIMM_D2: 3.1.0; DIMM_D3: 3.2.0; DIMM_D4: 3.3.0;
+ DIMM_D5: 3.0.1; DIMM_D6: 3.1.1; DIMM_D7: 3.2.1; DIMM_D8: 3.3.1;
+ DIMM_D9: 3.0.2; DIMM_D10: 3.1.2; DIMM_D11: 3.2.2; DIMM_D12: 3.3.2;
+
+ Product: PowerEdge FM120x4
+ DIMM_A_A1: 0.1.0; DIMM_A_A2: 0.2.0;
+
+ DIMM_B_A1: 1.1.0; DIMM_B_A2: 1.2.0;
+
+ DIMM_C_A1: 2.1.0; DIMM_C_A2: 2.2.0;
+
+ DIMM_D_A1: 3.1.0; DIMM_D_A2: 3.2.0;
+
+ Product: PowerEdge R940
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1;
+
+ C1: 4.0.0; C2: 4.1.0; C3: 4.2.0; C4: 5.0.0; C5: 5.1.0; C6: 5.2.0;
+ C7: 4.0.1; C8: 4.1.1; C9: 4.2.1; C10: 5.0.1; C11: 5.1.1; C12: 5.2.1;
+
+ D1: 6.0.0; D2: 6.1.0; D3: 6.2.0; D4: 7.0.0; D5: 7.1.0; D6: 7.2.0;
+ D7: 6.0.1; D8: 6.1.1; D9: 6.2.1; D10: 7.0.1; D11: 7.1.1; D12: 7.2.1;
+
+ Product: PowerEdge R440, PowerEdge R540
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 1.0.1; A10: 1.1.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+
+ Product: PowerEdge M640, PowerEdge FC640
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 1.0.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 3.0.1;
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.7/labels/supermicro 2022-02-08 15:44:53.564361999 -0500
@@ -0,0 +1,70 @@
+# RASDAEMON Motherboard DIMM labels Database file.
+#
+# Vendor-name and model-name are found from the program 'dmidecode'
+# labels are found from the silk screen on the motherboard.
+#
+#Vendor: <vendor-name>
+# Product: <product-name>
+# Model: <model-name>
+# <label>: <mc>.<top>.<mid>.<low>
+#
+
+Vendor: Supermicro
+ Model: A2SDi-8C-HLN4F
+ DIMMA1: 0.0.0; DIMMA2: 0.0.1;
+ DIMMB1: 0.1.0; DIMMB2: 0.1.1;
+
+ Model: A2SDi-8C+-HLN4F
+ DIMMA1: 0.0.0; DIMMA2: 0.0.1;
+ DIMMB1: 0.1.0; DIMMB2: 0.1.1;
+
+ Product: X10SRA-F
+ DIMMA1: 0.0.0
+ DIMMA2: 0.0.1
+ DIMMB1: 0.1.0
+ DIMMB2: 0.1.1
+ DIMMC1: 1.0.0
+ DIMMC2: 1.0.1
+ DIMMD1: 1.1.0
+ DIMMD2: 1.1.1
+
+ Product: H8DGU
+ P1_DIMM1A: 0.2.0;
+ P1_DIMM1A: 0.3.0;
+ P2_DIMM1A: 3.2.0;
+ P2_DIMM1A: 3.3.0;
+
+ P1_DIMM2A: 0.2.1;
+ P1_DIMM2A: 0.3.1;
+ P2_DIMM2A: 3.2.1;
+ P2_DIMM2A: 3.3.1;
+
+ P1_DIMM3A: 1.2.0;
+ P1_DIMM3A: 1.3.0;
+ P2_DIMM3A: 2.2.0;
+ P2_DIMM3A: 2.3.0;
+
+ P1_DIMM4A: 1.2.1;
+ P1_DIMM4A: 1.3.1;
+ P2_DIMM4A: 2.2.1;
+ P2_DIMM4A: 2.3.1;
+
+ P1_DIMM1B: 0.0.0;
+ P1_DIMM1B: 0.2.0;
+ P2_DIMM1B: 3.0.0;
+ P2_DIMM1B: 3.1.0;
+
+ P1_DIMM2B: 0.0.1;
+ P1_DIMM2B: 0.1.1;
+ P2_DIMM2B: 3.0.1;
+ P2_DIMM2B: 3.1.1;
+
+ P1_DIMM3B: 1.0.0;
+ P1_DIMM3B: 1.1.0;
+ P2_DIMM3B: 2.0.0;
+ P2_DIMM3B: 2.1.0;
+
+ P1_DIMM4B: 1.0.1;
+ P1_DIMM4B: 1.1.1;
+ P2_DIMM4B: 2.0.1;
+ P2_DIMM4B: 2.1.1;

@ -1,58 +1,25 @@
Name: rasdaemon
Version: 0.6.7
Release: 15%{?dist}
Version: 0.8.0
Release: 5%{?dist}
Summary: Utility to receive RAS error tracings
License: GPL-2.0-only
Group: Applications/System
License: GPLv2
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
Patch0: labels.patch
Patch1: fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch
Patch2: f7cdd720297cd17e405a7170c04df89d1d9536f8.patch
Patch3: 2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch
Patch4: dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch
Patch5: 738bafafdcb2e8b0ced32fff31b13754d571090b.patch
Patch6: 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch
Patch7: 9acef39f13833f7d53ef96abc5a72e79384260f4.patch
Patch8: 28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch
Patch9: aecf33aa70331670c06db6b652712b476e24051c.patch
Patch10: 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch
Patch11: ec443ec0add059fa897f844349e1a2345d81713c.patch
Patch12: 9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch
Patch13: b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch
Patch14: 50565005b10fe909c66f1c90f2feb95712427c7d.patch
Patch15: fc1dd37d422fc907416afd028514fff59b63ae12.patch
Patch16: 6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch
Patch17: 2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch
Patch18: 7ccf12f5ae26a055926d175d908c7930293438c4.patch
Patch19: 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch
Patch20: d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch
Patch21: 30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch
Patch22: aa36c96cd52d775570dae989dd95a060f1149077.patch
Patch23: 932118b04a04104dfac6b8536419803f236e6118.patch
Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch
Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch
Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch
Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch
Patch28: 9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch
Patch29: 9bd84aef87978b806178a73ed33c39d6c442fc1f.patch
Patch30: 885e546add918457c453bd3f753ac7df90b39e36.patch
Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch
Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch
Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch
ExcludeArch: s390 s390x
BuildRequires: make
BuildRequires: gcc
BuildRequires: autoconf automake libtool
BuildRequires: gettext-devel
BuildRequires: perl-generators
BuildRequires: sqlite-devel
BuildRequires: systemd
BuildRequires: autoconf
BuildRequires: automake
BuildRequires: libtool
BuildRequires: libtraceevent-devel
Provides: bundled(kernel-event-lib)
Requires: hwdata
Requires: perl-DBD-SQLite
Requires: libtraceevent
%ifarch %{ix86} x86_64
Requires: dmidecode
%endif
@ -73,125 +40,79 @@ an utility for reporting current error counts from the EDAC sysfs files.
%prep
%setup -q
%patch0 -p1
%patch1 -p1
%patch2 -p1
%patch3 -p1
%patch4 -p1
%patch5 -p1
%patch6 -p1
%patch7 -p1
%patch8 -p1
%patch9 -p1
%patch10 -p1
%patch11 -p1
%patch12 -p1
%patch13 -p1
%patch14 -p1
%patch15 -p1
%patch16 -p1
%patch17 -p1
%patch18 -p1
%patch19 -p1
%patch20 -p1
%patch21 -p1
%patch22 -p1
%patch23 -p1
%patch24 -p1
%patch25 -p1
%patch26 -p1
%patch27 -p1
%patch28 -p1
%patch29 -p1
%patch30 -p1
%patch31 -p1
%patch32 -p1
%patch33 -p1
# The tarball is locked in time the first time aclocal was ran and will keep
# requiring an older version of automake
autoreconf -vfi
%build
%ifarch %{arm} aarch64
%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-arm --enable-hisi-ns-decode
%configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm \
--enable-mce --enable-extlog --enable-devlink --enable-diskerror \
--enable-memory-failure --enable-abrt-report --enable-hisi-ns-decode \
--enable-memory-ce-pfa --enable-amp-ns-decode --enable-cpu-fault-isolation \
--with-sysconfdefdir=%{_sysconfdir}/sysconfig
%else
%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report
%configure --enable-sqlite3 --enable-aer \
--enable-mce --enable-extlog --enable-devlink --enable-diskerror \
--enable-memory-failure --enable-abrt-report --enable-cpu-fault-isolation \
--with-sysconfdefdir=%{_sysconfdir}/sysconfig
%endif
make %{?_smp_mflags}
%install
make install DESTDIR=%{buildroot}
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service
install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
install -D -p -m 0655 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name}
rm INSTALL %{buildroot}/usr/include/*.h
mkdir -p %{buildroot}/%{_sharedstatedir}/rasdaemon
install -d -p -m 0755 %{buildroot}/%{_sharedstatedir}/rasdaemon
mkdir -p %{buildroot}/%{_sysconfdir}/sysconfig
install -D -p -m 0644 misc/rasdaemon.env %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon
sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon
%files
%doc AUTHORS ChangeLog COPYING README TODO
%doc AUTHORS ChangeLog COPYING README.md TODO
%{_sbindir}/rasdaemon
%{_sbindir}/ras-mc-ctl
%{_mandir}/*/*
%{_unitdir}/*.service
%{_sharedstatedir}/rasdaemon
%{_sysconfdir}/ras/dimm_labels.d
%{_sysconfdir}/sysconfig/rasdaemon
%config(noreplace) %{_sysconfdir}/sysconfig/%{name}
%changelog
* Thu Jul 18 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-14
- rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819]
* Tue Nov 26 2024 MSVSphere Packaging Team <packager@msvsphere-os.ru> - 0.8.0-5
- Rebuilt for MSVSphere 10
* Fri Jun 28 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-13
- rasdaemon: Add error decoding for MCA_CTL_SMU extended bits [RHEL-35718]
* Mon Jun 24 2024 Troy Dawson <tdawson@redhat.com> - 0.8.0-5
- Bump release for June 2024 mass rebuild
* Thu Jun 20 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-12
- mce-amd-smca: update smca_hwid to use smca_bank_types [RHEL-24170]
* Fri Jan 26 2024 Fedora Release Engineering <releng@fedoraproject.org> - 0.8.0-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_40_Mass_Rebuild
* Wed May 08 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-11
- Fix excessive block messages [RHEL-8708]
* Mon Jan 22 2024 Fedora Release Engineering <releng@fedoraproject.org> - 0.8.0-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_40_Mass_Rebuild
* Wed Jan 10 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-10
- Update License string to use SPDX [RHELMISC-1262]
* Fri Jul 21 2023 Fedora Release Engineering <releng@fedoraproject.org> - 0.8.0-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_39_Mass_Rebuild
* Thu Oct 26 2023 Aristeu Rozanski <aris@redhat.com> 0.6.7-9
- Update SMCA support for AMD processors [RHEL-11092]
* Tue May 03 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-8
- Update ras-mc-ctl manpage to match current options [2079132]
* Sat Feb 18 2023 Mauro Carvalho Chehab <mchehab@kernel.org> 0.8.0
- Bump to version 0.8.0 using libtraceevent.
* Mon May 02 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-7
- Fix issue printing memory module sizes [2080596]
* Sat Jan 21 2023 Mauro Carvalho Chehab <mchehab@kernel.org> 0.7.0
- Bump to version 0.7.0 with several fixes and additions
* Thu Mar 31 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-6
- Merging 2065729 fixes into 9.1 branch [2067499]
* Fri Jan 20 2023 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.8-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_38_Mass_Rebuild
* Thu Mar 24 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-5
- Trying to guess what's going on on the testing side [2065729]
* Sat Jul 23 2022 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.8-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_37_Mass_Rebuild
* Thu Mar 24 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-4
- Adding simple test to stop being gated [2065729]
* Fri Apr 01 2022 Mauro Carvalho Chehab <mchehab@kernel.org> 0.6.8-1
- Fix sysconfdir issues and upgrade to version 0.6.8
* Thu Mar 24 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-3
- Adding gating.yaml [2065729]
* Fri Jan 21 2022 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.7-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_36_Mass_Rebuild
* Fri Mar 18 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-2
- Adding missing rasdaemon environment configuration to /etc/sysconfig/rasdaemon [2065729]
* Fri Jul 23 2021 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.7-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_35_Mass_Rebuild
* Tue Feb 08 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-1
- Bumped to 0.6.7
- Backported patches that sit on top of 0.6.7 without being released
Related: rhbz#2052190
* Tue Aug 10 2021 Mohan Boddu <mboddu@redhat.com> - 0.6.4-6
- Rebuilt for IMA sigs, glibc 2.34, aarch64 flags
Related: rhbz#1991688
* Fri Apr 16 2021 Mohan Boddu <mboddu@redhat.com> - 0.6.4-5
- Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937
* Wed May 26 2021 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> 0.6.7-1
- Bump to version 0.6.7 with several fixes and additions
* Wed Jan 27 2021 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.4-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_34_Mass_Rebuild
@ -202,7 +123,6 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
* Thu Jan 30 2020 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.4-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild
* Thu Oct 10 2019 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.4-1
- Bump to version 0.6.4 with some DB changes for hip08 and some fixes
@ -306,4 +226,3 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
* Mon May 20 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.3.0-1
- Package created

Loading…
Cancel
Save