299 lines
8.9 KiB
Diff
299 lines
8.9 KiB
Diff
|
|
From 6e5f83712ee2e7af1272b8064dd965d423b97ce2 Mon Sep 17 00:00:00 2001
|
||
|
|
From: Junhao He <hejunhao3@huawei.com>
|
||
|
|
Date: Sat, 31 Aug 2024 17:52:02 +0800
|
||
|
|
Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon
|
||
|
|
|
||
|
|
When a hardware error occurs in a cell of the HBM memory, the internal
|
||
|
|
SRAM of the memory controller is used to replace the faulty memory; this
|
||
|
|
method is ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS
|
||
|
|
RAS, and the rasdaemon records it and runs the ACLS to replace the faulty
|
||
|
|
memory.
|
||
|
|
|
||
|
|
HBM ACLS can repair one cell (258-bit) memory at a time. The HBM can
|
||
|
|
check which HBM cell the physical address belongs to and filter invalid
|
||
|
|
HBM addresses. Multiple RAS errors are reported if memory errors occur
|
||
|
|
in different HBM cells.
|
||
|
|
|
||
|
|
The feature depends on the Linux kernel CONFIG_HISI_MEM_RAS and
|
||
|
|
CONFIG_PAGE_EJECT.
|
||
|
|
|
||
|
|
Signed-off-by: Junhao He <hejunhao3@huawei.com>
|
||
|
|
---
|
||
|
|
configure.ac | 11 +++
|
||
|
|
misc/rasdaemon.env | 7 +-
|
||
|
|
non-standard-hisilicon.c | 196 +++++++++++++++++++++++++++++++++++++++
|
||
|
|
3 files changed, 213 insertions(+), 1 deletion(-)
|
||
|
|
|
||
|
|
diff --git a/configure.ac b/configure.ac
|
||
|
|
index d098fcf..30c90d2 100644
|
||
|
|
--- a/configure.ac
|
||
|
|
+++ b/configure.ac
|
||
|
|
@@ -171,6 +171,16 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x
|
||
|
|
AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes])
|
||
|
|
AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"])
|
||
|
|
|
||
|
|
+AC_ARG_ENABLE([hisi_hbm_memory_acls],
|
||
|
|
+ AS_HELP_STRING([--enable-hisi-hbm-memory-acls], [enable HiSilicon HBM Memory ACLS]))
|
||
|
|
+
|
||
|
|
+AS_IF([test "x$enable_hisi_hbm_memory_acls" = "xyes" || test "x$enable_all" == "xyes"], [
|
||
|
|
+ AC_DEFINE(HAVE_HISI_HBM_MEMORY_ACLS,1,"have HiSilicon HBM Memory ACLS")
|
||
|
|
+ AC_SUBST([WITH_HISI_HBM_MEMORY_ACLS])
|
||
|
|
+])
|
||
|
|
+AM_CONDITIONAL([WITH_HISI_HBM_MEMORY_ACLS], [test x$enable_hisi_hbm_memory_acls = xyes || test x$enable_all == xyes])
|
||
|
|
+AM_COND_IF([WITH_HISI_HBM_MEMORY_ACLS], [USE_HISI_HBM_MEMORY_ACLS="yes"], [USE_HISI_HBM_MEMORY_ACLS="no"])
|
||
|
|
+
|
||
|
|
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
|
||
|
|
|
||
|
|
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
|
||
|
|
@@ -212,4 +222,5 @@ compile time options summary
|
||
|
|
Memory CE PFA : $USE_MEMORY_CE_PFA
|
||
|
|
AMP RAS errors : $USE_AMP_NS_DECODE
|
||
|
|
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
|
||
|
|
+ HISI HBM Memory ACLS: $USE_HISI_HBM_MEMORY_ACLS
|
||
|
|
EOF
|
||
|
|
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
|
||
|
|
index ca12a1a..516c4ac 100644
|
||
|
|
--- a/misc/rasdaemon.env
|
||
|
|
+++ b/misc/rasdaemon.env
|
||
|
|
@@ -46,4 +46,9 @@ CPU_ISOLATION_CYCLE="24h"
|
||
|
|
CPU_ISOLATION_LIMIT="10"
|
||
|
|
|
||
|
|
# Disable specified events by config
|
||
|
|
-DISABLE="block:block_rq_complete"
|
||
|
|
\ No newline at end of file
|
||
|
|
+DISABLE="block:block_rq_complete"
|
||
|
|
+
|
||
|
|
+# Support the HBM Memory ACLS (Adaptive Cache Line Sparing) on HiSilicon platform (yes|no).
|
||
|
|
+HISI_HBM_MEMORY_ACLS="no"
|
||
|
|
+# Tell rasdaemon to isolate the error page when HiSilicon HBM ACLS fails to repair it (yes|no).
|
||
|
|
+HISI_HBM_ISOLATION_PAGE="no"
|
||
|
|
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||
|
|
index 7296d28..2b176cd 100644
|
||
|
|
--- a/non-standard-hisilicon.c
|
||
|
|
+++ b/non-standard-hisilicon.c
|
||
|
|
@@ -19,6 +19,48 @@
|
||
|
|
#define HISI_BUF_LEN 2048
|
||
|
|
#define HISI_PCIE_INFO_BUF_LEN 256
|
||
|
|
|
||
|
|
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||
|
|
+#include <dirent.h>
|
||
|
|
+#include <errno.h>
|
||
|
|
+#include <fcntl.h>
|
||
|
|
+#include <unistd.h>
|
||
|
|
+#include <stdbool.h>
|
||
|
|
+
|
||
|
|
+#define HISI_HBM_MEM_RAS_NAME "HISI0521"
|
||
|
|
+#define HISI_HBM_UNKNOWN 0
|
||
|
|
+#define HISI_HBM_HBM_MEMORY 1
|
||
|
|
+#define HISI_HBM_DDR_MEMORY 2
|
||
|
|
+
|
||
|
|
+#define HISI_TYPE_UINT32_WIDTH 32
|
||
|
|
+/* Specify the HiSilicon HBMC HBM repair request type */
|
||
|
|
+#define HISI_HBM_REPAIR_REQ_TYPE 0
|
||
|
|
+#define HISI_HBM_CE_ACLS BIT(0)
|
||
|
|
+#define HISI_HBM_ACLS_ADDL 1
|
||
|
|
+#define HISI_HBM_ACLS_ADDH 2
|
||
|
|
+#define HISI_HBM_ACLS_ARRAY_SIZE 12
|
||
|
|
+#define HISI_HBMC_SUBMOD_HBM_REPAIR 6
|
||
|
|
+
|
||
|
|
+static bool hisi_hbm_acls_en;
|
||
|
|
+static bool hisi_hbm_isolation_page_en;
|
||
|
|
+
|
||
|
|
+static void hisi_hbm_param_init(void)
|
||
|
|
+{
|
||
|
|
+ char *env;
|
||
|
|
+
|
||
|
|
+ env = getenv("HISI_HBM_MEMORY_ACLS");
|
||
|
|
+ if (env && strcasecmp(env, "yes") == 0) {
|
||
|
|
+ log(TERM, LOG_INFO, "HiSilicon HBM Memory ACLS is enabled\n");
|
||
|
|
+ hisi_hbm_acls_en = true;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ env = getenv("HISI_HBM_ISOLATION_PAGE");
|
||
|
|
+ if (env && strcasecmp(env, "yes") == 0) {
|
||
|
|
+ log(TERM, LOG_INFO, "HiSilicon HBM ACLS page isolation is enabled\n");
|
||
|
|
+ hisi_hbm_isolation_page_en = true;
|
||
|
|
+ }
|
||
|
|
+}
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
struct hisi_common_error_section {
|
||
|
|
uint32_t val_bits;
|
||
|
|
uint8_t version;
|
||
|
|
@@ -358,6 +400,151 @@ static int add_hisi_common_table(struct ras_events *ras,
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||
|
|
+static int write_file(char *path, const char *name, unsigned long long value)
|
||
|
|
+{
|
||
|
|
+ char fname[MAX_PATH];
|
||
|
|
+ char buf[20];
|
||
|
|
+ int ret;
|
||
|
|
+ int fd;
|
||
|
|
+
|
||
|
|
+ snprintf(fname, MAX_PATH, "%s/%s", path, name);
|
||
|
|
+
|
||
|
|
+ fd = open(fname, O_WRONLY);
|
||
|
|
+ if (fd < 0) {
|
||
|
|
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n",
|
||
|
|
+ fname, strerror(errno));
|
||
|
|
+ return -errno;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ snprintf(buf, sizeof(buf), "0x%llx\n", value);
|
||
|
|
+ ret = write(fd, buf, strlen(buf));
|
||
|
|
+ if (ret <= 0)
|
||
|
|
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to set %s (0x%llx): %s\n",
|
||
|
|
+ fname, value, strerror(errno));
|
||
|
|
+
|
||
|
|
+ close(fd);
|
||
|
|
+ return ret > 0 ? 0 : -errno;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err, char *path)
|
||
|
|
+{
|
||
|
|
+ unsigned long long paddr;
|
||
|
|
+ int ret;
|
||
|
|
+
|
||
|
|
+ paddr = err->reg_array[HISI_HBM_ACLS_ADDH];
|
||
|
|
+ paddr <<= HISI_TYPE_UINT32_WIDTH;
|
||
|
|
+ paddr += err->reg_array[HISI_HBM_ACLS_ADDL];
|
||
|
|
+
|
||
|
|
+ ret = write_file(path, "acls_query", paddr);
|
||
|
|
+ if (ret < 0)
|
||
|
|
+ return ret;
|
||
|
|
+
|
||
|
|
+ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr);
|
||
|
|
+ if (ret < 0)
|
||
|
|
+ return ret;
|
||
|
|
+
|
||
|
|
+ ret = write_file(path, "acls_repair", paddr);
|
||
|
|
+ if (ret < 0 && hisi_hbm_isolation_page_en) {
|
||
|
|
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Keep page offline\n");
|
||
|
|
+ /* not much we can do about errors here */
|
||
|
|
+ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr);
|
||
|
|
+ return ret;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ ret = write_file("/sys/kernel/page_eject", "online_page", paddr);
|
||
|
|
+ return ret < 0 ? ret : 0;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+static int hisi_hbmc_get_memory_type(char *path)
|
||
|
|
+{
|
||
|
|
+ int type = HISI_HBM_UNKNOWN;
|
||
|
|
+ char fname[MAX_PATH];
|
||
|
|
+ char buf[128];
|
||
|
|
+ FILE *file;
|
||
|
|
+
|
||
|
|
+ snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type");
|
||
|
|
+ file = fopen(fname, "r");
|
||
|
|
+ if (!file) {
|
||
|
|
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n",
|
||
|
|
+ fname, strerror(errno));
|
||
|
|
+ return -errno;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ if (!fgets(buf, sizeof(buf), file)) {
|
||
|
|
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to read %s\n", fname);
|
||
|
|
+ goto err;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ /* Remove the last '\n' */
|
||
|
|
+ buf[strlen(buf) - 1] = 0;
|
||
|
|
+
|
||
|
|
+ if (strcmp(buf, "HBM") == 0)
|
||
|
|
+ type = HISI_HBM_HBM_MEMORY;
|
||
|
|
+ else if (strcmp(buf, "DDR") == 0)
|
||
|
|
+ type = HISI_HBM_DDR_MEMORY;
|
||
|
|
+
|
||
|
|
+err:
|
||
|
|
+ fclose(file);
|
||
|
|
+ return type;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+static void hisi_hbm_acls_handler(const struct hisi_common_error_section *err)
|
||
|
|
+{
|
||
|
|
+ char *sys_dev_path = "/sys/devices/platform";
|
||
|
|
+ char path[MAX_PATH];
|
||
|
|
+ struct dirent *dent;
|
||
|
|
+ DIR *dir;
|
||
|
|
+ int ret;
|
||
|
|
+
|
||
|
|
+ dir = opendir(sys_dev_path);
|
||
|
|
+ if (!dir) {
|
||
|
|
+ log(TERM, LOG_WARNING, "HiSilicon Memory RAS: can't read '%s': %s\n",
|
||
|
|
+ sys_dev_path, strerror(errno));
|
||
|
|
+ return;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ while ((dent = readdir(dir))) {
|
||
|
|
+ if (!strstr(dent->d_name, HISI_HBM_MEM_RAS_NAME))
|
||
|
|
+ continue;
|
||
|
|
+
|
||
|
|
+ snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, dent->d_name);
|
||
|
|
+
|
||
|
|
+ if (hisi_hbmc_get_memory_type(path) == HISI_HBM_HBM_MEMORY &&
|
||
|
|
+ err->reg_array[HISI_HBM_REPAIR_REQ_TYPE] & HISI_HBM_CE_ACLS) {
|
||
|
|
+ /*
|
||
|
|
+ * ENXIO means the memory @paddr does not belong to
|
||
|
|
+ * the HBMC, try the next one.
|
||
|
|
+ */
|
||
|
|
+ ret = hisi_hbmc_hbm_acls(err, path);
|
||
|
|
+ if (ret != -ENXIO)
|
||
|
|
+ break;
|
||
|
|
+ }
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ closedir(dir);
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+static bool hisi_hbm_valid_acls_ras(const struct hisi_common_error_section *err)
|
||
|
|
+{
|
||
|
|
+ if (err->module_id >= sizeof(module_name)/sizeof(char *))
|
||
|
|
+ return false;
|
||
|
|
+
|
||
|
|
+ if (strcmp(module_name[err->module_id], "HBMC") != 0 ||
|
||
|
|
+ err->submodule_id != HISI_HBMC_SUBMOD_HBM_REPAIR)
|
||
|
|
+ return false;
|
||
|
|
+
|
||
|
|
+ if (!(err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE)) ||
|
||
|
|
+ err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) {
|
||
|
|
+ log(TERM, LOG_WARNING, "HiSilicon Memory RAS: No valid address array length (%u)\n",
|
||
|
|
+ err->reg_array_size);
|
||
|
|
+ return false;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ return true;
|
||
|
|
+}
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
static int decode_hisi_common_section(struct ras_events *ras,
|
||
|
|
struct ras_ns_ev_decoder *ev_decoder,
|
||
|
|
struct trace_seq *s,
|
||
|
|
@@ -393,6 +580,11 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||
|
|
step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
|
||
|
|
}
|
||
|
|
|
||
|
|
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||
|
|
+ if (hisi_hbm_acls_en && hisi_hbm_valid_acls_ras(err))
|
||
|
|
+ hisi_hbm_acls_handler(err);
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
@@ -410,4 +602,8 @@ static void __attribute__((constructor)) hisi_ns_init(void)
|
||
|
|
|
||
|
|
for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++)
|
||
|
|
register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]);
|
||
|
|
+
|
||
|
|
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||
|
|
+ hisi_hbm_param_init();
|
||
|
|
+#endif
|
||
|
|
}
|
||
|
|
--
|
||
|
|
2.33.0
|
||
|
|
|