Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 3 additions & 50 deletions drivers/edac/i10nm_base.c
Original file line number Diff line number Diff line change
Expand Up @@ -1012,54 +1012,6 @@ static struct notifier_block i10nm_mce_dec = {
.priority = MCE_PRIO_EDAC,
};

#ifdef CONFIG_EDAC_DEBUG
/*
* Debug feature.
* Exercise the address decode logic by writing an address to
* /sys/kernel/debug/edac/i10nm_test/addr.
*/
static struct dentry *i10nm_test;

/*
 * Setter behind the debugfs "addr" test file.
 *
 * Fabricates a machine-check record for the address written by the user
 * (@val) and hands it to skx_mce_check_error(), so the address decode
 * logic runs without a real hardware error. @data is not used, and the
 * notifier's return value is ignored.
 *
 * Always returns 0.
 */
static int debugfs_u64_set(void *data, u64 val)
{
	struct mce fake;

	pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val);

	memset(&fake, 0, sizeof(fake));
	fake.addr = val;
	/*
	 * Status is ADDRV + MemRd + Unknown channel, with the corrected
	 * error count field set to one.
	 */
	fake.status = (MCI_STATUS_ADDRV + 0x90) | BIT_ULL(MCI_STATUS_CEC_SHIFT);

	skx_mce_check_error(NULL, 0, &fake);

	return 0;
}
/* Setter-only attribute: the getter slot is NULL. */
DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");

/*
 * Create the "i10nm_test" debugfs directory and, inside it, the "addr"
 * file (mode 0200) backed by fops_u64_wo.
 *
 * If the directory cannot be created nothing else is attempted. If the
 * file cannot be created the directory is removed again and i10nm_test
 * is reset to NULL, so no half-built hierarchy is left behind.
 */
static void setup_i10nm_debug(void)
{
	i10nm_test = edac_debugfs_create_dir("i10nm_test");

	if (i10nm_test &&
	    !edac_debugfs_create_file("addr", 0200, i10nm_test,
				      NULL, &fops_u64_wo)) {
		/* A directory without its "addr" file is of no use. */
		debugfs_remove(i10nm_test);
		i10nm_test = NULL;
	}
}

/*
 * Undo setup_i10nm_debug(): pass the i10nm_test dentry to
 * debugfs_remove_recursive().
 *
 * i10nm_test is NULL when setup failed.
 * NOTE(review): presumably debugfs_remove_recursive() is a no-op for a
 * NULL dentry - confirm against the debugfs API.
 */
static void teardown_i10nm_debug(void)
{
debugfs_remove_recursive(i10nm_test);
}
#else
/*
 * CONFIG_EDAC_DEBUG is not set: the debugfs test hook is compiled out,
 * so setup and teardown are empty inline functions and callers need no
 * #ifdef of their own.
 */
static inline void setup_i10nm_debug(void) {}
static inline void teardown_i10nm_debug(void) {}
#endif /*CONFIG_EDAC_DEBUG*/

static int __init i10nm_init(void)
{
u8 mc = 0, src_id = 0, node_id = 0;
Expand Down Expand Up @@ -1088,6 +1040,7 @@ static int __init i10nm_init(void)
return -ENODEV;

cfg = (struct res_config *)id->driver_data;
skx_set_res_cfg(cfg);
res_cfg = cfg;

rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm);
Expand Down Expand Up @@ -1158,7 +1111,7 @@ static int __init i10nm_init(void)

opstate_init();
mce_register_decode_chain(&i10nm_mce_dec);
setup_i10nm_debug();
skx_setup_debug("i10nm_test");

if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
skx_set_decode(i10nm_mc_decode, show_retry_rd_err_log);
Expand Down Expand Up @@ -1186,7 +1139,7 @@ static void __exit i10nm_exit(void)
enable_retry_rd_err_log(false);
}

teardown_i10nm_debug();
skx_teardown_debug();
mce_unregister_decode_chain(&i10nm_mce_dec);
skx_adxl_put();
skx_remove();
Expand Down
52 changes: 2 additions & 50 deletions drivers/edac/skx_base.c
Original file line number Diff line number Diff line change
Expand Up @@ -587,54 +587,6 @@ static struct notifier_block skx_mce_dec = {
.priority = MCE_PRIO_EDAC,
};

#ifdef CONFIG_EDAC_DEBUG
/*
* Debug feature.
* Exercise the address decode logic by writing an address to
* /sys/kernel/debug/edac/skx_test/addr.
*/
static struct dentry *skx_test;

/*
 * Setter behind the debugfs "addr" test file.
 *
 * Builds a synthetic machine-check record whose address is the value
 * written by the user (@val) and feeds it to skx_mce_check_error() to
 * exercise the address decode logic. @data is not used, and the
 * notifier's return value is ignored.
 *
 * Always returns 0.
 */
static int debugfs_u64_set(void *data, u64 val)
{
	struct mce fake;

	pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val);

	memset(&fake, 0, sizeof(fake));
	fake.addr = val;
	/*
	 * Status is ADDRV + MemRd + Unknown channel, with the corrected
	 * error count field set to one.
	 */
	fake.status = (MCI_STATUS_ADDRV + 0x90) | BIT_ULL(MCI_STATUS_CEC_SHIFT);

	skx_mce_check_error(NULL, 0, &fake);

	return 0;
}
/* Setter-only attribute: the getter slot is NULL. */
DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");

/*
 * Create the "skx_test" debugfs directory and, inside it, the "addr"
 * file (mode 0200) backed by fops_u64_wo.
 *
 * If the directory cannot be created nothing else is attempted. If the
 * file cannot be created the directory is removed again and skx_test is
 * reset to NULL, so no half-built hierarchy is left behind.
 */
static void setup_skx_debug(void)
{
	skx_test = edac_debugfs_create_dir("skx_test");

	if (skx_test &&
	    !edac_debugfs_create_file("addr", 0200, skx_test,
				      NULL, &fops_u64_wo)) {
		/* A directory without its "addr" file is of no use. */
		debugfs_remove(skx_test);
		skx_test = NULL;
	}
}

/*
 * Undo setup_skx_debug(): pass the skx_test dentry to
 * debugfs_remove_recursive().
 *
 * skx_test is NULL when setup failed.
 * NOTE(review): presumably debugfs_remove_recursive() is a no-op for a
 * NULL dentry - confirm against the debugfs API.
 */
static void teardown_skx_debug(void)
{
debugfs_remove_recursive(skx_test);
}
#else
/*
 * CONFIG_EDAC_DEBUG is not set: the debugfs test hook is compiled out,
 * so setup and teardown are empty inline functions and callers need no
 * #ifdef of their own.
 */
static inline void setup_skx_debug(void) {}
static inline void teardown_skx_debug(void) {}
#endif /*CONFIG_EDAC_DEBUG*/

/*
* skx_init:
* make sure we are running on the correct cpu model
Expand Down Expand Up @@ -728,7 +680,7 @@ static int __init skx_init(void)
/* Ensure that the OPSTATE is set correctly for POLL or NMI */
opstate_init();

setup_skx_debug();
skx_setup_debug("skx_test");

mce_register_decode_chain(&skx_mce_dec);

Expand All @@ -742,7 +694,7 @@ static void __exit skx_exit(void)
{
edac_dbg(2, "\n");
mce_unregister_decode_chain(&skx_mce_dec);
teardown_skx_debug();
skx_teardown_debug();
if (nvdimm_count)
skx_adxl_put();
skx_remove();
Expand Down
104 changes: 86 additions & 18 deletions drivers/edac/skx_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ static skx_show_retry_log_f skx_show_retry_rd_err_log;
static u64 skx_tolm, skx_tohm;
static LIST_HEAD(dev_edac_list);
static bool skx_mem_cfg_2lm;
static struct res_config *skx_res_cfg;

int skx_adxl_get(void)
{
Expand Down Expand Up @@ -119,7 +120,7 @@ void skx_adxl_put(void)
}
EXPORT_SYMBOL_GPL(skx_adxl_put);

static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem)
static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
{
struct skx_dev *d;
int i, len = 0;
Expand All @@ -135,8 +136,24 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me
return false;
}

/*
* GNR with a Flat2LM memory configuration may mistakenly classify
* a near-memory error (DDR5) as a far-memory error (CXL), resulting
* in the incorrect selection of decoded ADXL components.
* To address this, prefetch the decoded far-memory controller ID
* and adjust the error source to near-memory if the far-memory
* controller ID is invalid.
*/
if (skx_res_cfg && skx_res_cfg->type == GNR && err_src == ERR_SRC_2LM_FM) {
res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
if (res->imc == -1) {
err_src = ERR_SRC_2LM_NM;
edac_dbg(0, "Adjust the error source to near-memory.\n");
}
}

res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
if (error_in_1st_level_mem) {
if (err_src == ERR_SRC_2LM_NM) {
res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ?
(int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1;
res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ?
Expand Down Expand Up @@ -191,6 +208,12 @@ void skx_set_mem_cfg(bool mem_cfg_2lm)
}
EXPORT_SYMBOL_GPL(skx_set_mem_cfg);

/*
 * Register the driver's resource configuration with the common code.
 *
 * Only the pointer @cfg is stored (in the file-local skx_res_cfg); the
 * structure is not copied. skx_adxl_decode() later reads
 * skx_res_cfg->type, and tolerates the pointer never having been set.
 */
void skx_set_res_cfg(struct res_config *cfg)
{
skx_res_cfg = cfg;
}
EXPORT_SYMBOL_GPL(skx_set_res_cfg);

void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
{
driver_decode = decode;
Expand Down Expand Up @@ -620,40 +643,38 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
optype, skx_msg);
}

static bool skx_error_in_1st_level_mem(const struct mce *m)
static enum error_source skx_error_source(const struct mce *m)
{
u32 errcode;

if (!skx_mem_cfg_2lm)
return false;
u32 errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;

errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
if (errcode != MCACOD_MEM_CTL_ERR && errcode != MCACOD_EXT_MEM_ERR)
return ERR_SRC_NOT_MEMORY;

return errcode == MCACOD_EXT_MEM_ERR;
}

static bool skx_error_in_mem(const struct mce *m)
{
u32 errcode;
if (!skx_mem_cfg_2lm)
return ERR_SRC_1LM;

errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
if (errcode == MCACOD_EXT_MEM_ERR)
return ERR_SRC_2LM_NM;

return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR);
return ERR_SRC_2LM_FM;
}

int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
enum error_source err_src;
struct decoded_addr res;
struct mem_ctl_info *mci;
char *type;

if (mce->kflags & MCE_HANDLED_CEC)
return NOTIFY_DONE;

err_src = skx_error_source(mce);

/* Ignore unless this is memory related with an address */
if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
if (err_src == ERR_SRC_NOT_MEMORY || !(mce->status & MCI_STATUS_ADDRV))
return NOTIFY_DONE;

memset(&res, 0, sizeof(res));
Expand All @@ -667,7 +688,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
/* Try driver decoder first */
if (!(driver_decode && driver_decode(&res))) {
/* Then try firmware decoder (ACPI DSM methods) */
if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))))
if (!(adxl_component_count && skx_adxl_decode(&res, err_src)))
return NOTIFY_DONE;
}

Expand Down Expand Up @@ -739,6 +760,53 @@ void skx_remove(void)
}
EXPORT_SYMBOL_GPL(skx_remove);

#ifdef CONFIG_EDAC_DEBUG
/*
* Debug feature.
* Exercise the address decode logic by writing an address to
* /sys/kernel/debug/edac/{skx,i10nm}_test/addr.
*/
static struct dentry *skx_test;

/*
 * Setter behind the debugfs "addr" test file.
 *
 * Builds a synthetic machine-check record whose address is the value
 * written by the user (@val) and feeds it to skx_mce_check_error() to
 * exercise the address decode logic. @data is not used, and the
 * notifier's return value is ignored.
 *
 * Always returns 0.
 */
static int debugfs_u64_set(void *data, u64 val)
{
	struct mce fake;

	pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val);

	memset(&fake, 0, sizeof(fake));
	fake.addr = val;
	/*
	 * Status is ADDRV + MemRd + Unknown channel, with the corrected
	 * error count field set to one.
	 */
	fake.status = (MCI_STATUS_ADDRV + 0x90) | BIT_ULL(MCI_STATUS_CEC_SHIFT);

	skx_mce_check_error(NULL, 0, &fake);

	return 0;
}
/* Setter-only attribute: the getter slot is NULL. */
DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");

/*
 * Create the debugfs test hook shared by the skx and i10nm drivers.
 *
 * A directory called @name is created through edac_debugfs_create_dir()
 * and an "addr" file (mode 0200) backed by fops_u64_wo is placed in it.
 *
 * If the directory cannot be created nothing else is attempted. If the
 * file cannot be created the directory is removed again and skx_test is
 * reset to NULL, so no half-built hierarchy is left behind.
 */
void skx_setup_debug(const char *name)
{
	skx_test = edac_debugfs_create_dir(name);

	if (skx_test &&
	    !edac_debugfs_create_file("addr", 0200, skx_test,
				      NULL, &fops_u64_wo)) {
		/* A directory without its "addr" file is of no use. */
		debugfs_remove(skx_test);
		skx_test = NULL;
	}
}
EXPORT_SYMBOL_GPL(skx_setup_debug);

/*
 * Undo skx_setup_debug(): pass the skx_test dentry to
 * debugfs_remove_recursive().
 *
 * skx_test is NULL when setup failed.
 * NOTE(review): presumably debugfs_remove_recursive() is a no-op for a
 * NULL dentry - confirm against the debugfs API.
 */
void skx_teardown_debug(void)
{
debugfs_remove_recursive(skx_test);
}
EXPORT_SYMBOL_GPL(skx_teardown_debug);
#endif /*CONFIG_EDAC_DEBUG*/

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Tony Luck");
MODULE_DESCRIPTION("MC Driver for Intel server processors");
16 changes: 16 additions & 0 deletions drivers/edac/skx_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,13 @@ enum {
INDEX_MAX
};

/*
 * Classification of a machine-check error as produced by
 * skx_error_source() in skx_common.c and consumed by skx_adxl_decode().
 */
enum error_source {
/* Memory error while the 2LM memory configuration flag is not set. */
ERR_SRC_1LM,
/* 2LM configuration, error code is MCACOD_EXT_MEM_ERR: near memory. */
ERR_SRC_2LM_NM,
/* 2LM configuration, any other memory error code: far memory. */
ERR_SRC_2LM_FM,
/* Error code is neither MCACOD_MEM_CTL_ERR nor MCACOD_EXT_MEM_ERR. */
ERR_SRC_NOT_MEMORY,
};

#define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL)
#define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL)
#define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM)
Expand Down Expand Up @@ -235,6 +242,7 @@ int skx_adxl_get(void);
void skx_adxl_put(void);
void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log);
void skx_set_mem_cfg(bool mem_cfg_2lm);
void skx_set_res_cfg(struct res_config *cfg);

int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
int skx_get_node_id(struct skx_dev *d, u8 *id);
Expand All @@ -260,4 +268,12 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,

void skx_remove(void);

/*
 * debugfs test hook used to exercise the address decode logic.
 *
 * With CONFIG_EDAC_DEBUG the real functions are provided by
 * skx_common.c; skx_setup_debug() takes the name of the debugfs
 * directory to create. Without it both calls are empty inline
 * functions, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_EDAC_DEBUG
void skx_setup_debug(const char *name);
void skx_teardown_debug(void);
#else
static inline void skx_setup_debug(const char *name) {}
static inline void skx_teardown_debug(void) {}
#endif

#endif /* _SKX_COMM_EDAC_H */