From b70ef01016850de87b9a28a6af19fed8801df076 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 25 Jun 2009 19:32:38 +0200 Subject: EDAC: move MCE error descriptions to EDAC core This is in preparation of adding AMD-specific MCE decoding functionality to the EDAC core. The error decoding macros originate from the AMD64 EDAC driver albeit in a simplified and cleaned up version here. While at it, add macros to generate the error description strings and use them in the error type decoders directly which removes a bunch of code and makes the decoding functions much more readable. Also, fix strings and shorten macro names. Remove superfluous htlink_msgs. Signed-off-by: Borislav Petkov --- drivers/edac/Makefile | 6 +- drivers/edac/amd64_edac.c | 140 +++++++++++++++++-------------- drivers/edac/amd64_edac.h | 17 +--- drivers/edac/amd64_edac_err_types.c | 161 ------------------------------------ drivers/edac/edac_mce_amd.c | 101 ++++++++++++++++++++++ drivers/edac/edac_mce_amd.h | 29 +++++++ 6 files changed, 214 insertions(+), 240 deletions(-) delete mode 100644 drivers/edac/amd64_edac_err_types.c create mode 100644 drivers/edac/edac_mce_amd.c create mode 100644 drivers/edac/edac_mce_amd.h diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 98aa4a7db41..cfa033ce53a 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -17,6 +17,10 @@ ifdef CONFIG_PCI edac_core-objs += edac_pci.o edac_pci_sysfs.o endif +ifdef CONFIG_CPU_SUP_AMD +edac_core-objs += edac_mce_amd.o +endif + obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o obj-$(CONFIG_EDAC_CPC925) += cpc925_edac.o obj-$(CONFIG_EDAC_I5000) += i5000_edac.o @@ -32,7 +36,7 @@ obj-$(CONFIG_EDAC_X38) += x38_edac.o obj-$(CONFIG_EDAC_I82860) += i82860_edac.o obj-$(CONFIG_EDAC_R82600) += r82600_edac.o -amd64_edac_mod-y := amd64_edac_err_types.o amd64_edac.o +amd64_edac_mod-y := amd64_edac.o amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e2a10bcba7a..b9e84bc9176 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -18,6 +18,63 @@ struct amd64_pvt; static struct mem_ctl_info *mci_lookup[MAX_NUMNODES]; static struct amd64_pvt *pvt_lookup[MAX_NUMNODES]; +/* + * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only + * for DDR2 DRAM mapping. + */ +u32 revf_quad_ddr2_shift[] = { + 0, /* 0000b NULL DIMM (128mb) */ + 28, /* 0001b 256mb */ + 29, /* 0010b 512mb */ + 29, /* 0011b 512mb */ + 29, /* 0100b 512mb */ + 30, /* 0101b 1gb */ + 30, /* 0110b 1gb */ + 31, /* 0111b 2gb */ + 31, /* 1000b 2gb */ + 32, /* 1001b 4gb */ + 32, /* 1010b 4gb */ + 33, /* 1011b 8gb */ + 0, /* 1100b future */ + 0, /* 1101b future */ + 0, /* 1110b future */ + 0 /* 1111b future */ +}; + +/* + * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing + * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- + * or higher value'. + * + *FIXME: Produce a better mapping/linearisation. + */ + +struct scrubrate scrubrates[] = { + { 0x01, 1600000000UL}, + { 0x02, 800000000UL}, + { 0x03, 400000000UL}, + { 0x04, 200000000UL}, + { 0x05, 100000000UL}, + { 0x06, 50000000UL}, + { 0x07, 25000000UL}, + { 0x08, 12284069UL}, + { 0x09, 6274509UL}, + { 0x0A, 3121951UL}, + { 0x0B, 1560975UL}, + { 0x0C, 781440UL}, + { 0x0D, 390720UL}, + { 0x0E, 195300UL}, + { 0x0F, 97650UL}, + { 0x10, 48854UL}, + { 0x11, 24427UL}, + { 0x12, 12213UL}, + { 0x13, 6101UL}, + { 0x14, 3051UL}, + { 0x15, 1523UL}, + { 0x16, 761UL}, + { 0x00, 0UL}, /* scrubbing off */ +}; + /* * Memory scrubber control interface. For K8, memory scrubbing is handled by * hardware and can involve L2 cache, dcache as well as the main memory. With @@ -1101,8 +1158,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u32 page, offset; /* Extract the syndrome parts and form a 16-bit syndrome */ - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* CHIPKILL enabled */ if (info->nbcfg & K8_NBCFG_CHIPKILL) { @@ -1701,8 +1758,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, if (csrow >= 0) { error_address_to_page_and_offset(sys_addr, &page, &offset); - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* * Is CHIPKILL on? If so, then we can attempt to use the @@ -2155,36 +2212,22 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code; - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "GART TLB event: transaction type(%s), " - "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]); + "cache level(%s)\n", TT_MSG(ec), LL_MSG(ec)); } static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code; - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "cache hierarchy error: memory transaction type(%s), " "transaction type(%s), cache level(%s)\n", - rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]); + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } @@ -2264,21 +2307,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, static void amd64_decode_bus_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code, ext_ec; - u32 ec_pp; /* error code participating processor (2p) */ - u32 ec_to; /* error code timed out (1b) */ - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_ii; /* error code memory or I/O (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl); - err_code = EXTRACT_ERROR_CODE(info->nbsl); - - ec_ll = EXTRACT_LL_CODE(err_code); - ec_ii = EXTRACT_II_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); - ec_to = EXTRACT_TO_CODE(err_code); - ec_pp = EXTRACT_PP_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); + u32 xec = EXT_ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "BUS ERROR:\n" @@ -2286,20 +2316,17 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, " participating processor(%s)\n" " memory transaction type(%s)\n" " cache level(%s) Error Found by: %s\n", - to_msgs[ec_to], - ii_msgs[ec_ii], - pp_msgs[ec_pp], - rrrr_msgs[ec_rrrr], - ll_msgs[ec_ll], + TO_MSG(ec), II_MSG(ec), PP_MSG(ec), RRRR_MSG(ec), LL_MSG(ec), (info->nbsh & K8_NBSH_ERR_SCRUBER) ? "Scrubber" : "Normal Operation"); - /* If this was an 'observed' error, early out */ - if (ec_pp == K8_NBSL_PP_OBS) - return; /* We aren't the node involved */ + + /* Bail early out if this was an 'observed' error */ + if (PP(ec) == K8_NBSL_PP_OBS) + return; /* Parse out the extended error code for ECC events */ - switch (ext_ec) { + switch (xec) { /* F10 changed to one Extended ECC error code */ case F10_NBSL_EXT_ERR_RES: /* Reserved field */ case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */ @@ -2379,7 +2406,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci, (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False"); - err_code = EXTRACT_ERROR_CODE(regs->nbsl); + err_code = ERROR_CODE(regs->nbsl); /* Determine which error type: * 1) GART errors - non-fatal, developmental events @@ -2387,7 +2414,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci, * 3) BUS errors * 4) Unknown error */ - if (TEST_TLB_ERROR(err_code)) { + if (TLB_ERROR(err_code)) { /* * GART errors are intended to help graphics driver developers * to detect bad GART PTEs. It is recommended by AMD to disable @@ -2411,10 +2438,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci, debugf1("GART TLB error\n"); amd64_decode_gart_tlb_error(mci, info); - } else if (TEST_MEM_ERROR(err_code)) { + } else if (MEM_ERROR(err_code)) { debugf1("Memory/Cache error\n"); amd64_decode_mem_cache_error(mci, info); - } else if (TEST_BUS_ERROR(err_code)) { + } else if (BUS_ERROR(err_code)) { debugf1("Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, info); } else { @@ -2424,21 +2451,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci, err_code); } - ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl); + ext_ec = EXT_ERROR_CODE(regs->nbsl); amd64_mc_printk(mci, KERN_ERR, "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]); - if (((ext_ec >= F10_NBSL_EXT_ERR_CRC && - ext_ec <= F10_NBSL_EXT_ERR_TGT) || - (ext_ec == F10_NBSL_EXT_ERR_RMW)) && - EXTRACT_LDT_LINK(info->nbsh)) { - - amd64_mc_printk(mci, KERN_ERR, - "Error on hypertransport link: %s\n", - htlink_msgs[ - EXTRACT_LDT_LINK(info->nbsh)]); - } - /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ba73015af8e..1ddef8d15d5 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -72,6 +72,7 @@ #include #include #include "edac_core.h" +#include "edac_mce_amd.h" #define amd64_printk(level, fmt, arg...) \ edac_printk(level, "amd64", fmt, ##arg) @@ -303,9 +304,6 @@ enum { #define K8_NBSL 0x48 -#define EXTRACT_HIGH_SYNDROME(x) (((x) >> 24) & 0xff) -#define EXTRACT_EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) - /* Family F10h: Normalized Extended Error Codes */ #define F10_NBSL_EXT_ERR_RES 0x0 #define F10_NBSL_EXT_ERR_CRC 0x1 @@ -348,17 +346,6 @@ enum { #define K8_NBSL_EXT_ERR_CHIPKILL_ECC 0x8 #define K8_NBSL_EXT_ERR_DRAM_PARITY 0xD -#define EXTRACT_ERROR_CODE(x) ((x) & 0xffff) -#define TEST_TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) -#define TEST_MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) -#define TEST_BUS_ERROR(x) (((x) & 0xF800) == 0x0800) -#define EXTRACT_TT_CODE(x) (((x) >> 2) & 0x3) -#define EXTRACT_II_CODE(x) (((x) >> 2) & 0x3) -#define EXTRACT_LL_CODE(x) (((x) >> 0) & 0x3) -#define EXTRACT_RRRR_CODE(x) (((x) >> 4) & 0xf) -#define EXTRACT_TO_CODE(x) (((x) >> 8) & 0x1) -#define EXTRACT_PP_CODE(x) (((x) >> 9) & 0x3) - /* * The following are for BUS type errors AFTER values have been normalized by * shifting right @@ -386,9 +373,7 @@ enum { #define K8_NBSH_CORE1 BIT(1) #define K8_NBSH_CORE0 BIT(0) -#define EXTRACT_LDT_LINK(x) (((x) >> 4) & 0x7) #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) -#define EXTRACT_LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define K8_NBEAL 0x50 diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c deleted file mode 100644 index f212ff12a9d..00000000000 --- a/drivers/edac/amd64_edac_err_types.c +++ /dev/null @@ -1,161 +0,0 @@ -#include "amd64_edac.h" - -/* - * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only - * for DDR2 DRAM mapping. - */ -u32 revf_quad_ddr2_shift[] = { - 0, /* 0000b NULL DIMM (128mb) */ - 28, /* 0001b 256mb */ - 29, /* 0010b 512mb */ - 29, /* 0011b 512mb */ - 29, /* 0100b 512mb */ - 30, /* 0101b 1gb */ - 30, /* 0110b 1gb */ - 31, /* 0111b 2gb */ - 31, /* 1000b 2gb */ - 32, /* 1001b 4gb */ - 32, /* 1010b 4gb */ - 33, /* 1011b 8gb */ - 0, /* 1100b future */ - 0, /* 1101b future */ - 0, /* 1110b future */ - 0 /* 1111b future */ -}; - -/* - * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing - * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- - * or higher value'. - * - *FIXME: Produce a better mapping/linearisation. - */ - -struct scrubrate scrubrates[] = { - { 0x01, 1600000000UL}, - { 0x02, 800000000UL}, - { 0x03, 400000000UL}, - { 0x04, 200000000UL}, - { 0x05, 100000000UL}, - { 0x06, 50000000UL}, - { 0x07, 25000000UL}, - { 0x08, 12284069UL}, - { 0x09, 6274509UL}, - { 0x0A, 3121951UL}, - { 0x0B, 1560975UL}, - { 0x0C, 781440UL}, - { 0x0D, 390720UL}, - { 0x0E, 195300UL}, - { 0x0F, 97650UL}, - { 0x10, 48854UL}, - { 0x11, 24427UL}, - { 0x12, 12213UL}, - { 0x13, 6101UL}, - { 0x14, 3051UL}, - { 0x15, 1523UL}, - { 0x16, 761UL}, - { 0x00, 0UL}, /* scrubbing off */ -}; - -/* - * string representation for the different MCA reported error types, see F3x48 - * or MSR0000_0411. - */ -const char *tt_msgs[] = { /* transaction type */ - "instruction", - "data", - "generic", - "reserved" -}; - -const char *ll_msgs[] = { /* cache level */ - "L0", - "L1", - "L2", - "L3/generic" -}; - -const char *rrrr_msgs[] = { - "generic", - "generic read", - "generic write", - "data read", - "data write", - "inst fetch", - "prefetch", - "evict", - "snoop", - "reserved RRRR= 9", - "reserved RRRR= 10", - "reserved RRRR= 11", - "reserved RRRR= 12", - "reserved RRRR= 13", - "reserved RRRR= 14", - "reserved RRRR= 15" -}; - -const char *pp_msgs[] = { /* participating processor */ - "local node originated (SRC)", - "local node responded to request (RES)", - "local node observed as 3rd party (OBS)", - "generic" -}; - -const char *to_msgs[] = { - "no timeout", - "timed out" -}; - -const char *ii_msgs[] = { /* memory or i/o */ - "mem access", - "reserved", - "i/o access", - "generic" -}; - -/* Map the 5 bits of Extended Error code to the string table. */ -const char *ext_msgs[] = { /* extended error */ - "K8 ECC error/F10 reserved", /* 0_0000b */ - "CRC error", /* 0_0001b */ - "sync error", /* 0_0010b */ - "mst abort", /* 0_0011b */ - "tgt abort", /* 0_0100b */ - "GART error", /* 0_0101b */ - "RMW error", /* 0_0110b */ - "Wdog timer error", /* 0_0111b */ - "F10-ECC/K8-Chipkill error", /* 0_1000b */ - "DEV Error", /* 0_1001b */ - "Link Data error", /* 0_1010b */ - "Link or L3 Protocol error", /* 0_1011b */ - "NB Array error", /* 0_1100b */ - "DRAM Parity error", /* 0_1101b */ - "Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */ - "Res 0x0ff error", /* 0_1111b */ - "Res 0x100 error", /* 1_0000b */ - "Res 0x101 error", /* 1_0001b */ - "Res 0x102 error", /* 1_0010b */ - "Res 0x103 error", /* 1_0011b */ - "Res 0x104 error", /* 1_0100b */ - "Res 0x105 error", /* 1_0101b */ - "Res 0x106 error", /* 1_0110b */ - "Res 0x107 error", /* 1_0111b */ - "Res 0x108 error", /* 1_1000b */ - "Res 0x109 error", /* 1_1001b */ - "Res 0x10A error", /* 1_1010b */ - "Res 0x10B error", /* 1_1011b */ - "L3 Cache Data error", /* 1_1100b */ - "L3 CacheTag error", /* 1_1101b */ - "L3 Cache LRU error", /* 1_1110b */ - "Res 0x1FF error" /* 1_1111b */ -}; - -const char *htlink_msgs[] = { - "none", - "1", - "2", - "1 2", - "3", - "1 3", - "2 3", - "1 2 3" -}; diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c new file mode 100644 index 00000000000..cf8465450b3 --- /dev/null +++ b/drivers/edac/edac_mce_amd.c @@ -0,0 +1,101 @@ +#include +#include "edac_mce_amd.h" + +/* + * string representation for the different MCA reported error types, see F3x48 + * or MSR0000_0411. + */ +const char *tt_msgs[] = { /* transaction type */ + "instruction", + "data", + "generic", + "reserved" +}; +EXPORT_SYMBOL_GPL(tt_msgs); + +const char *ll_msgs[] = { /* cache level */ + "L0", + "L1", + "L2", + "L3/generic" +}; +EXPORT_SYMBOL_GPL(ll_msgs); + +const char *rrrr_msgs[] = { + "generic", + "generic read", + "generic write", + "data read", + "data write", + "inst fetch", + "prefetch", + "evict", + "snoop", + "reserved RRRR= 9", + "reserved RRRR= 10", + "reserved RRRR= 11", + "reserved RRRR= 12", + "reserved RRRR= 13", + "reserved RRRR= 14", + "reserved RRRR= 15" +}; +EXPORT_SYMBOL_GPL(rrrr_msgs); + +const char *pp_msgs[] = { /* participating processor */ + "local node originated (SRC)", + "local node responded to request (RES)", + "local node observed as 3rd party (OBS)", + "generic" +}; +EXPORT_SYMBOL_GPL(pp_msgs); + +const char *to_msgs[] = { + "no timeout", + "timed out" +}; +EXPORT_SYMBOL_GPL(to_msgs); + +const char *ii_msgs[] = { /* memory or i/o */ + "mem access", + "reserved", + "i/o access", + "generic" +}; +EXPORT_SYMBOL_GPL(ii_msgs); + +/* Map the 5 bits of Extended Error code to the string table. */ +const char *ext_msgs[] = { /* extended error */ + "K8 ECC error/F10 reserved", /* 0_0000b */ + "CRC error", /* 0_0001b */ + "sync error", /* 0_0010b */ + "mst abort", /* 0_0011b */ + "tgt abort", /* 0_0100b */ + "GART error", /* 0_0101b */ + "RMW error", /* 0_0110b */ + "Wdog timer error", /* 0_0111b */ + "F10-ECC/K8-Chipkill error", /* 0_1000b */ + "DEV Error", /* 0_1001b */ + "Link Data error", /* 0_1010b */ + "Link or L3 Protocol error", /* 0_1011b */ + "NB Array error", /* 0_1100b */ + "DRAM Parity error", /* 0_1101b */ + "Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */ + "Res 0x0ff error", /* 0_1111b */ + "Res 0x100 error", /* 1_0000b */ + "Res 0x101 error", /* 1_0001b */ + "Res 0x102 error", /* 1_0010b */ + "Res 0x103 error", /* 1_0011b */ + "Res 0x104 error", /* 1_0100b */ + "Res 0x105 error", /* 1_0101b */ + "Res 0x106 error", /* 1_0110b */ + "Res 0x107 error", /* 1_0111b */ + "Res 0x108 error", /* 1_1000b */ + "Res 0x109 error", /* 1_1001b */ + "Res 0x10A error", /* 1_1010b */ + "Res 0x10B error", /* 1_1011b */ + "L3 Cache Data error", /* 1_1100b */ + "L3 CacheTag error", /* 1_1101b */ + "L3 Cache LRU error", /* 1_1110b */ + "Res 0x1FF error" /* 1_1111b */ +}; +EXPORT_SYMBOL_GPL(ext_msgs); diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h new file mode 100644 index 00000000000..81f9dcf9990 --- /dev/null +++ b/drivers/edac/edac_mce_amd.h @@ -0,0 +1,29 @@ +#define ERROR_CODE(x) ((x) & 0xffff) +#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) +#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) +#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) + +#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) +#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) +#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) + +#define TT(x) (((x) >> 2) & 0x3) +#define TT_MSG(x) tt_msgs[TT(x)] +#define II(x) (((x) >> 2) & 0x3) +#define II_MSG(x) ii_msgs[II(x)] +#define LL(x) (((x) >> 0) & 0x3) +#define LL_MSG(x) ll_msgs[LL(x)] +#define RRRR(x) (((x) >> 4) & 0xf) +#define RRRR_MSG(x) rrrr_msgs[RRRR(x)] +#define TO(x) (((x) >> 8) & 0x1) +#define TO_MSG(x) to_msgs[TO(x)] +#define PP(x) (((x) >> 9) & 0x3) +#define PP_MSG(x) pp_msgs[PP(x)] + +extern const char *tt_msgs[]; +extern const char *ll_msgs[]; +extern const char *rrrr_msgs[]; +extern const char *pp_msgs[]; +extern const char *to_msgs[]; +extern const char *ii_msgs[]; +extern const char *ext_msgs[]; -- cgit v1.2.3 From 1c43f2e24d059913bce58887f1d6e4267aaed284 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 15:47:51 +0200 Subject: EDAC: beef up ErrorCodeExt error signatures Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 71 +++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index cf8465450b3..918567e8cfd 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -63,39 +63,42 @@ const char *ii_msgs[] = { /* memory or i/o */ }; EXPORT_SYMBOL_GPL(ii_msgs); -/* Map the 5 bits of Extended Error code to the string table. */ -const char *ext_msgs[] = { /* extended error */ - "K8 ECC error/F10 reserved", /* 0_0000b */ - "CRC error", /* 0_0001b */ - "sync error", /* 0_0010b */ - "mst abort", /* 0_0011b */ - "tgt abort", /* 0_0100b */ - "GART error", /* 0_0101b */ - "RMW error", /* 0_0110b */ - "Wdog timer error", /* 0_0111b */ - "F10-ECC/K8-Chipkill error", /* 0_1000b */ - "DEV Error", /* 0_1001b */ - "Link Data error", /* 0_1010b */ - "Link or L3 Protocol error", /* 0_1011b */ - "NB Array error", /* 0_1100b */ - "DRAM Parity error", /* 0_1101b */ - "Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */ - "Res 0x0ff error", /* 0_1111b */ - "Res 0x100 error", /* 1_0000b */ - "Res 0x101 error", /* 1_0001b */ - "Res 0x102 error", /* 1_0010b */ - "Res 0x103 error", /* 1_0011b */ - "Res 0x104 error", /* 1_0100b */ - "Res 0x105 error", /* 1_0101b */ - "Res 0x106 error", /* 1_0110b */ - "Res 0x107 error", /* 1_0111b */ - "Res 0x108 error", /* 1_1000b */ - "Res 0x109 error", /* 1_1001b */ - "Res 0x10A error", /* 1_1010b */ - "Res 0x10B error", /* 1_1011b */ - "L3 Cache Data error", /* 1_1100b */ - "L3 CacheTag error", /* 1_1101b */ - "L3 Cache LRU error", /* 1_1110b */ - "Res 0x1FF error" /* 1_1111b */ +/* + * Map the 4 or 5 (family-specific) bits of Extended Error code to the + * string table. + */ +const char *ext_msgs[] = { + "K8 ECC error", /* 0_0000b */ + "CRC error on link", /* 0_0001b */ + "Sync error packets on link", /* 0_0010b */ + "Master Abort during link operation", /* 0_0011b */ + "Target Abort during link operation", /* 0_0100b */ + "Invalid GART PTE entry during table walk", /* 0_0101b */ + "Unsupported atomic RMW command received", /* 0_0110b */ + "WDT error: NB transaction timeout", /* 0_0111b */ + "ECC/ChipKill ECC error", /* 0_1000b */ + "SVM DEV Error", /* 0_1001b */ + "Link Data error", /* 0_1010b */ + "Link/L3/Probe Filter Protocol error", /* 0_1011b */ + "NB Internal Arrays Parity error", /* 0_1100b */ + "DRAM Address/Control Parity error", /* 0_1101b */ + "Link Transmission error", /* 0_1110b */ + "GART/DEV Table Walk Data error" /* 0_1111b */ + "Res 0x100 error", /* 1_0000b */ + "Res 0x101 error", /* 1_0001b */ + "Res 0x102 error", /* 1_0010b */ + "Res 0x103 error", /* 1_0011b */ + "Res 0x104 error", /* 1_0100b */ + "Res 0x105 error", /* 1_0101b */ + "Res 0x106 error", /* 1_0110b */ + "Res 0x107 error", /* 1_0111b */ + "Res 0x108 error", /* 1_1000b */ + "Res 0x109 error", /* 1_1001b */ + "Res 0x10A error", /* 1_1010b */ + "Res 0x10B error", /* 1_1011b */ + "ECC error in L3 Cache Data", /* 1_1100b */ + "L3 Cache Tag error", /* 1_1101b */ + "L3 Cache LRU Parity error", /* 1_1110b */ + "Probe Filter error" /* 1_1111b */ }; EXPORT_SYMBOL_GPL(ext_msgs); -- cgit v1.2.3 From ef44cc4c2245d3c43f3c11c7bff6239852eef498 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 14:45:48 +0200 Subject: amd64_edac: cleanup amd64_process_error_info * mv amd64_error_info_regs -> err_regs * remove redundant info ptr Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 44 ++++++++++++++++++++------------------------ drivers/edac/amd64_edac.h | 10 +++++----- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index b9e84bc9176..c9b88d82970 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -750,7 +750,7 @@ static void find_csrow_limits(struct mem_ctl_info *mci, int csrow, * specific. */ static u64 extract_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt = mci->pvt_info; @@ -1106,7 +1106,7 @@ static int k8_early_channel_count(struct amd64_pvt *pvt) /* extract the ERROR ADDRESS for the K8 CPUs */ static u64 k8_get_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { return (((u64) (info->nbeah & 0xff)) << 32) + (info->nbeal & ~0x03); @@ -1149,7 +1149,7 @@ static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram) } static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 SystemAddress) { struct mem_ctl_info *src_mci; @@ -1368,7 +1368,7 @@ static void amd64_teardown(struct amd64_pvt *pvt) } static u64 f10_get_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { return (((u64) (info->nbeah & 0xffff)) << 32) + (info->nbeal & ~0x01); @@ -1745,7 +1745,7 @@ static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, * The @sys_addr is usually an error address received from the hardware. */ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 sys_addr) { struct amd64_pvt *pvt = mci->pvt_info; @@ -2102,7 +2102,7 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome) * - 0: if no valid error is indicated */ static int amd64_get_error_info_regs(struct mem_ctl_info *mci, - struct amd64_error_info_regs *regs) + struct err_regs *regs) { struct amd64_pvt *pvt; struct pci_dev *misc_f3_ctl; @@ -2151,10 +2151,10 @@ err_reg: * - 0: if no error is found */ static int amd64_get_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt; - struct amd64_error_info_regs regs; + struct err_regs regs; pvt = mci->pvt_info; @@ -2210,7 +2210,7 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, } static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); @@ -2220,7 +2220,7 @@ static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, } static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); @@ -2236,7 +2236,7 @@ static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, * ADDRESS and process. */ static void amd64_handle_ce(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt = mci->pvt_info; u64 SystemAddress; @@ -2259,7 +2259,7 @@ static void amd64_handle_ce(struct mem_ctl_info *mci, /* Handle any Un-correctable Errors (UEs) */ static void amd64_handle_ue(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { int csrow; u64 SystemAddress; @@ -2305,7 +2305,7 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); @@ -2356,22 +2356,18 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, } int amd64_process_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *regs, int handle_errors) { struct amd64_pvt *pvt; - struct amd64_error_info_regs *regs; u32 err_code, ext_ec; int gart_tlb_error = 0; pvt = mci->pvt_info; - /* If caller doesn't want us to process the error, return */ if (!handle_errors) return 1; - regs = info; - debugf1("NorthBridge ERROR: mci(0x%p)\n", mci); debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n", pvt->mc_node_id, regs->nbeah, regs->nbeal); @@ -2437,13 +2433,13 @@ int amd64_process_error_info(struct mem_ctl_info *mci, gart_tlb_error = 1; debugf1("GART TLB error\n"); - amd64_decode_gart_tlb_error(mci, info); + amd64_decode_gart_tlb_error(mci, regs); } else if (MEM_ERROR(err_code)) { debugf1("Memory/Cache error\n"); - amd64_decode_mem_cache_error(mci, info); + amd64_decode_mem_cache_error(mci, regs); } else if (BUS_ERROR(err_code)) { debugf1("Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, info); + amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! */ amd64_mc_printk(mci, KERN_WARNING, @@ -2480,10 +2476,10 @@ EXPORT_SYMBOL_GPL(amd64_process_error_info); */ static void amd64_check(struct mem_ctl_info *mci) { - struct amd64_error_info_regs info; + struct err_regs regs; - if (amd64_get_error_info(mci, &info)) - amd64_process_error_info(mci, &info, 1); + if (amd64_get_error_info(mci, ®s)) + amd64_process_error_info(mci, ®s, 1); } /* diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 1ddef8d15d5..bde8f78551f 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -449,7 +449,7 @@ enum amd64_chipset_families { * * Depends on entry into the modules */ -struct amd64_error_info_regs { +struct err_regs { u32 nbcfg; u32 nbsh; u32 nbsl; @@ -527,7 +527,7 @@ struct amd64_pvt { u32 online_spare; /* On-Line spare Reg */ /* temp storage for when input is received from sysfs */ - struct amd64_error_info_regs ctl_error_info; + struct err_regs ctl_error_info; /* place to store error injection parameters prior to issue */ struct error_injection injection; @@ -586,11 +586,11 @@ struct low_ops { int (*early_channel_count)(struct amd64_pvt *pvt); u64 (*get_error_address)(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info); + struct err_regs *info); void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram); void (*read_dram_ctl_register)(struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 SystemAddr); int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map); }; @@ -623,7 +623,7 @@ static inline struct low_ops *family_ops(int index) #define F11_MIN_SCRUB_RATE_BITS 0x6 int amd64_process_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, int handle_errors); int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); -- cgit v1.2.3 From 5110dbdeab546268dda2e4c6a83448639b2fc5ae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 25 Jun 2009 19:51:04 +0200 Subject: amd64_edac: cleanup/complete NB MCE decoding * don't dump info which mcheck already does * update to newest BKDG * mv amd64_process_error_info -> amd64_decode_nb_mce * shorten error struct names * remove redundant info ptr in amd64_process_error_info * remove unused ErrorCodeExt[19:16] (MCx_STATUS) defines Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 125 ++++++++++++++++-------------------------- drivers/edac/amd64_edac.h | 26 +++------ drivers/edac/amd64_edac_dbg.c | 2 +- drivers/edac/edac_mce_amd.h | 2 + 4 files changed, 56 insertions(+), 99 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c9b88d82970..5af87d44c80 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2355,62 +2355,47 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, "Error Overflow set"); } -int amd64_process_error_info(struct mem_ctl_info *mci, - struct err_regs *regs, - int handle_errors) +void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, + int handle_errors) { - struct amd64_pvt *pvt; - u32 err_code, ext_ec; - int gart_tlb_error = 0; - - pvt = mci->pvt_info; + struct amd64_pvt *pvt = mci->pvt_info; + int ecc; + u32 ec = ERROR_CODE(regs->nbsl); + u32 xec = EXT_ERROR_CODE(regs->nbsl); if (!handle_errors) - return 1; + return; - debugf1("NorthBridge ERROR: mci(0x%p)\n", mci); - debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n", - pvt->mc_node_id, regs->nbeah, regs->nbeal); - debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n", - regs->nbsh, regs->nbsl); - debugf1(" Valid Error=%s Overflow=%s\n", - (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False", - (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False"); - debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n", - (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ? - "True" : "False", - (regs->nbsh & K8_NBSH_ERR_ENABLE) ? - "True" : "False"); - debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n", - (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ? - "True" : "False", - (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ? - "True" : "False", - (regs->nbsh & K8_NBSH_PCC) ? - "True" : "False"); - debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n", - (regs->nbsh & K8_NBSH_CECC) ? - "True" : "False", - (regs->nbsh & K8_NBSH_UECC) ? - "True" : "False", - (regs->nbsh & K8_NBSH_ERR_SCRUBER) ? - "True" : "False"); - debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n", - (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False"); + pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id); + /* + * F10h, revD can disable ErrCpu[3:0] so check that first and also the + * value encoding has changed so interpret those differently + */ + if ((boot_cpu_data.x86 == 0x10) && + (boot_cpu_data.x86_model > 8)) { + if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + } else { + pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); + } - err_code = ERROR_CODE(regs->nbsl); + pr_emerg(" Error: %sorrected", + ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); + pr_cont(", Report Error: %s", + ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); + pr_cont(", MiscV: %svalid, CPU context corrupt: %s", + ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), + ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); - /* Determine which error type: - * 1) GART errors - non-fatal, developmental events - * 2) MEMORY errors - * 3) BUS errors - * 4) Unknown error - */ - if (TLB_ERROR(err_code)) { + /* do the two bits[14:13] together */ + ecc = regs->nbsh & (0x3 << 13); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers * to detect bad GART PTEs. It is recommended by AMD to disable @@ -2423,52 +2408,34 @@ int amd64_process_error_info(struct mem_ctl_info *mci, * [1] section 13.10.1 on BIOS and Kernel Developers Guide for * AMD NPT family 0Fh processors */ - if (report_gart_errors == 0) - return 1; - - /* - * Only if GART error reporting is requested should we generate - * any logs. - */ - gart_tlb_error = 1; + if (!report_gart_errors) + return; - debugf1("GART TLB error\n"); + pr_emerg("GART TLB error\n"); amd64_decode_gart_tlb_error(mci, regs); - } else if (MEM_ERROR(err_code)) { - debugf1("Memory/Cache error\n"); + } else if (MEM_ERROR(ec)) { + pr_emerg("Memory/Cache error\n"); amd64_decode_mem_cache_error(mci, regs); - } else if (BUS_ERROR(err_code)) { - debugf1("Bus (Link/DRAM) error\n"); + } else if (BUS_ERROR(ec)) { + pr_emerg("Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! */ amd64_mc_printk(mci, KERN_WARNING, "%s(): unknown MCE error 0x%x\n", __func__, - err_code); + ec); } - ext_ec = EXT_ERROR_CODE(regs->nbsl); - amd64_mc_printk(mci, KERN_ERR, - "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]); + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. * If it was a GART error, skip that process. */ - if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) { - amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n"); - if (!gart_tlb_error) - edac_mc_handle_ue_no_info(mci, "UE bit is set\n"); - } - - if (regs->nbsh & K8_NBSH_PCC) - amd64_mc_printk(mci, KERN_CRIT, - "PCC (processor context corrupt) set\n"); - - return 1; + if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) + edac_mc_handle_ue_no_info(mci, "UE bit is set"); } -EXPORT_SYMBOL_GPL(amd64_process_error_info); /* * The main polling 'check' function, called FROM the edac core to perform the @@ -2479,7 +2446,7 @@ static void amd64_check(struct mem_ctl_info *mci) struct err_regs regs; if (amd64_get_error_info(mci, ®s)) - amd64_process_error_info(mci, ®s, 1); + amd64_decode_nb_mce(mci, ®s, 1); } /* diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index bde8f78551f..ecab0c9fd14 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -306,16 +306,7 @@ enum { /* Family F10h: Normalized Extended Error Codes */ #define F10_NBSL_EXT_ERR_RES 0x0 -#define F10_NBSL_EXT_ERR_CRC 0x1 -#define F10_NBSL_EXT_ERR_SYNC 0x2 -#define F10_NBSL_EXT_ERR_MST 0x3 -#define F10_NBSL_EXT_ERR_TGT 0x4 -#define F10_NBSL_EXT_ERR_GART 0x5 -#define F10_NBSL_EXT_ERR_RMW 0x6 -#define F10_NBSL_EXT_ERR_WDT 0x7 #define F10_NBSL_EXT_ERR_ECC 0x8 -#define F10_NBSL_EXT_ERR_DEV 0x9 -#define F10_NBSL_EXT_ERR_LINK_DATA 0xA /* Next two are overloaded values */ #define F10_NBSL_EXT_ERR_LINK_PROTO 0xB @@ -360,18 +351,15 @@ enum { #define K8_NBSH_VALID_BIT BIT(31) #define K8_NBSH_OVERFLOW BIT(30) -#define K8_NBSH_UNCORRECTED_ERR BIT(29) -#define K8_NBSH_ERR_ENABLE BIT(28) -#define K8_NBSH_MISC_ERR_VALID BIT(27) +#define K8_NBSH_UC_ERR BIT(29) +#define K8_NBSH_ERR_EN BIT(28) +#define K8_NBSH_MISCV BIT(27) #define K8_NBSH_VALID_ERROR_ADDR BIT(26) #define K8_NBSH_PCC BIT(25) +#define K8_NBSH_ERR_CPU_VAL BIT(24) #define K8_NBSH_CECC BIT(14) #define K8_NBSH_UECC BIT(13) #define K8_NBSH_ERR_SCRUBER BIT(8) -#define K8_NBSH_CORE3 BIT(3) -#define K8_NBSH_CORE2 BIT(2) -#define K8_NBSH_CORE1 BIT(1) -#define K8_NBSH_CORE0 BIT(0) #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) @@ -622,8 +610,8 @@ static inline struct low_ops *family_ops(int index) #define F10_MIN_SCRUB_RATE_BITS 0x5 #define F11_MIN_SCRUB_RATE_BITS 0x6 -int amd64_process_error_info(struct mem_ctl_info *mci, - struct err_regs *info, - int handle_errors); +void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info, + int handle_errors); + int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index 0a41b248a4a..bcb4e2eba3d 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, /* Process the Mapping request */ /* TODO: Add race prevention */ - amd64_process_error_info(mci, &pvt->ctl_error_info, 1); + amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1); return count; } diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 81f9dcf9990..39971cdabb5 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -1,5 +1,7 @@ #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) +#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] + #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) -- cgit v1.2.3 From b7225e4fc19ce27a594cb2b868ef151bf82f8f93 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 16:05:53 +0200 Subject: amd64_edac: remove memory and GART TLB error decoders Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 5af87d44c80..75842f08db8 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2209,28 +2209,6 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, return 1; } -static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, - struct err_regs *info) -{ - u32 ec = ERROR_CODE(info->nbsl); - - amd64_mc_printk(mci, KERN_ERR, - "GART TLB event: transaction type(%s), " - "cache level(%s)\n", TT_MSG(ec), LL_MSG(ec)); -} - -static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, - struct err_regs *info) -{ - u32 ec = ERROR_CODE(info->nbsl); - - amd64_mc_printk(mci, KERN_ERR, - "cache hierarchy error: memory transaction type(%s), " - "transaction type(%s), cache level(%s)\n", - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); -} - - /* * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR * ADDRESS and process. @@ -2411,19 +2389,19 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, if (!report_gart_errors) return; - pr_emerg("GART TLB error\n"); - amd64_decode_gart_tlb_error(mci, regs); + pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg("Memory/Cache error\n"); - amd64_decode_mem_cache_error(mci, regs); + pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," + " Cache Level: %s", + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg("Bus (Link/DRAM) error\n"); + pr_emerg(" Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! */ amd64_mc_printk(mci, KERN_WARNING, - "%s(): unknown MCE error 0x%x\n", __func__, - ec); + "%s(): unknown MCE error 0x%x\n", __func__, ec); } pr_emerg("%s.\n", EXT_ERR_MSG(xec)); -- cgit v1.2.3 From ecaf5606de65cdd04de5f526185fe28fb0df654e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 16:32:01 +0200 Subject: amd64_edac: cleanup amd64_decode_bus_error Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 75842f08db8..82f48ee90f1 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2283,42 +2283,26 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info) + struct err_regs *info, int ecc_type) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); - amd64_mc_printk(mci, KERN_ERR, - "BUS ERROR:\n" - " time-out(%s) mem or i/o(%s)\n" - " participating processor(%s)\n" - " memory transaction type(%s)\n" - " cache level(%s) Error Found by: %s\n", - TO_MSG(ec), II_MSG(ec), PP_MSG(ec), RRRR_MSG(ec), LL_MSG(ec), - (info->nbsh & K8_NBSH_ERR_SCRUBER) ? - "Scrubber" : "Normal Operation"); + pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", + RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); /* Bail early out if this was an 'observed' error */ if (PP(ec) == K8_NBSL_PP_OBS) return; - /* Parse out the extended error code for ECC events */ - switch (xec) { - /* F10 changed to one Extended ECC error code */ - case F10_NBSL_EXT_ERR_RES: /* Reserved field */ - case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */ - break; - - default: - amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error " - "handling for this error\n"); + /* Do only ECC errors */ + if (xec && xec != F10_NBSL_EXT_ERR_ECC) return; - } - if (info->nbsh & K8_NBSH_CECC) + if (ecc_type == 2) amd64_handle_ce(mci, info); - else if (info->nbsh & K8_NBSH_UECC) + else if (ecc_type == 1) amd64_handle_ue(mci, info); /* @@ -2329,8 +2313,7 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, * catastrophic. */ if (info->nbsh & K8_NBSH_OVERFLOW) - edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR - "Error Overflow set"); + edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, @@ -2397,7 +2380,7 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { pr_emerg(" Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, regs); + amd64_decode_bus_error(mci, regs, ecc); } else { /* shouldn't reach here! */ amd64_mc_printk(mci, KERN_WARNING, -- cgit v1.2.3 From 549d042df240dfb4203bab40ad44f9336751b7d6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 24 Jul 2009 13:51:42 +0200 Subject: x86, mce: pass mce info to EDAC for decoding Move NB decoder along with required defines to EDAC MCE core. Add registration routines for further decoding of the MCE info in the AMD64 EDAC module. CC: Andi Kleen Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 7 +++ drivers/edac/amd64_edac.c | 98 ++++++++------------------------- drivers/edac/amd64_edac.h | 36 ------------ drivers/edac/amd64_edac_dbg.c | 2 +- drivers/edac/edac_mce_amd.c | 115 +++++++++++++++++++++++++++++++++++++++ drivers/edac/edac_mce_amd.h | 38 +++++++++++++ 6 files changed, 185 insertions(+), 111 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 01213048f62..b82866f6adf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -183,6 +183,11 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +void __weak decode_mce(struct mce *m) +{ + return; +} + static void print_mce(struct mce *m) { printk(KERN_EMERG @@ -205,6 +210,8 @@ static void print_mce(struct mce *m) printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + decode_mce(m); } static void print_mce_head(void) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 82f48ee90f1..2080b1e2e8a 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2282,8 +2282,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } } -static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info, int ecc_type) +static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, + struct err_regs *info, int ecc_type) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); @@ -2316,86 +2316,23 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } -void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, - int handle_errors) +void amd64_decode_bus_error(int node_id, struct err_regs *regs, + int ecc_type) { - struct amd64_pvt *pvt = mci->pvt_info; - int ecc; - u32 ec = ERROR_CODE(regs->nbsl); - u32 xec = EXT_ERROR_CODE(regs->nbsl); - - if (!handle_errors) - return; - - pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id); - - /* - * F10h, revD can disable ErrCpu[3:0] so check that first and also the - * value encoding has changed so interpret those differently - */ - if ((boot_cpu_data.x86 == 0x10) && - (boot_cpu_data.x86_model > 8)) { - if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); - } else { - pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); - } - - pr_emerg(" Error: %sorrected", - ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); - pr_cont(", Report Error: %s", - ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); - pr_cont(", MiscV: %svalid, CPU context corrupt: %s", - ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), - ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); - - /* do the two bits[14:13] together */ - ecc = regs->nbsh & (0x3 << 13); - if (ecc) - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); - - pr_cont("\n"); - - if (TLB_ERROR(ec)) { - /* - * GART errors are intended to help graphics driver developers - * to detect bad GART PTEs. It is recommended by AMD to disable - * GART table walk error reporting by default[1] (currently - * being disabled in mce_cpu_quirks()) and according to the - * comment in mce_cpu_quirks(), such GART errors can be - * incorrectly triggered. We may see these errors anyway and - * unless requested by the user, they won't be reported. - * - * [1] section 13.10.1 on BIOS and Kernel Developers Guide for - * AMD NPT family 0Fh processors - */ - if (!report_gart_errors) - return; - - pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", - TT_MSG(ec), LL_MSG(ec)); - } else if (MEM_ERROR(ec)) { - pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," - " Cache Level: %s", - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); - } else if (BUS_ERROR(ec)) { - pr_emerg(" Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, regs, ecc); - } else { - /* shouldn't reach here! */ - amd64_mc_printk(mci, KERN_WARNING, - "%s(): unknown MCE error 0x%x\n", __func__, ec); - } + struct mem_ctl_info *mci = mci_lookup[node_id]; - pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + __amd64_decode_bus_error(mci, regs, ecc_type); /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. * If it was a GART error, skip that process. + * + * FIXME: this should go somewhere else, if at all. */ if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) edac_mc_handle_ue_no_info(mci, "UE bit is set"); + } /* @@ -2406,8 +2343,10 @@ static void amd64_check(struct mem_ctl_info *mci) { struct err_regs regs; - if (amd64_get_error_info(mci, ®s)) - amd64_decode_nb_mce(mci, ®s, 1); + if (amd64_get_error_info(mci, ®s)) { + struct amd64_pvt *pvt = mci->pvt_info; + amd_decode_nb_mce(pvt->mc_node_id, ®s, 1); + } } /* @@ -3103,6 +3042,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt) mci_lookup[node_id] = mci; pvt_lookup[node_id] = NULL; + + /* register stuff with EDAC MCE */ + if (report_gart_errors) + amd_report_gart_errors(true); + + amd_register_ecc_decoder(amd64_decode_bus_error); + return 0; err_add_mc: @@ -3169,6 +3115,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev) mci_lookup[pvt->mc_node_id] = NULL; + /* unregister from EDAC MCE */ + amd_report_gart_errors(false); + amd_unregister_ecc_decoder(amd64_decode_bus_error); + /* Free the EDAC CORE resources */ edac_mc_free(mci); } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ecab0c9fd14..8ea07e2715d 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -346,24 +346,8 @@ enum { #define K8_NBSL_PP_OBS 0x2 #define K8_NBSL_PP_GENERIC 0x3 - -#define K8_NBSH 0x4C - -#define K8_NBSH_VALID_BIT BIT(31) -#define K8_NBSH_OVERFLOW BIT(30) -#define K8_NBSH_UC_ERR BIT(29) -#define K8_NBSH_ERR_EN BIT(28) -#define K8_NBSH_MISCV BIT(27) -#define K8_NBSH_VALID_ERROR_ADDR BIT(26) -#define K8_NBSH_PCC BIT(25) -#define K8_NBSH_ERR_CPU_VAL BIT(24) -#define K8_NBSH_CECC BIT(14) -#define K8_NBSH_UECC BIT(13) -#define K8_NBSH_ERR_SCRUBER BIT(8) - #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) - #define K8_NBEAL 0x50 #define K8_NBEAH 0x54 #define K8_SCRCTRL 0x58 @@ -428,23 +412,6 @@ enum amd64_chipset_families { F11_CPUS, }; -/* - * Structure to hold: - * - * 1) dynamically read status and error address HW registers - * 2) sysfs entered values - * 3) MCE values - * - * Depends on entry into the modules - */ -struct err_regs { - u32 nbcfg; - u32 nbsh; - u32 nbsl; - u32 nbeah; - u32 nbeal; -}; - /* Error injection control structure */ struct error_injection { u32 section; @@ -610,8 +577,5 @@ static inline struct low_ops *family_ops(int index) #define F10_MIN_SCRUB_RATE_BITS 0x5 #define F11_MIN_SCRUB_RATE_BITS 0x6 -void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info, - int handle_errors); - int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index bcb4e2eba3d..59cf2cf6e11 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, /* Process the Mapping request */ /* TODO: Add race prevention */ - amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1); + amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1); return count; } diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 918567e8cfd..444c2cc4472 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -1,6 +1,31 @@ #include #include "edac_mce_amd.h" +static bool report_gart_errors; +static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type); + +void amd_report_gart_errors(bool v) +{ + report_gart_errors = v; +} +EXPORT_SYMBOL_GPL(amd_report_gart_errors); + +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)) +{ + nb_bus_decoder = f; +} +EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); + +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)) +{ + if (nb_bus_decoder) { + WARN_ON(nb_bus_decoder != f); + + nb_bus_decoder = NULL; + } +} +EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); + /* * string representation for the different MCA reported error types, see F3x48 * or MSR0000_0411. @@ -102,3 +127,93 @@ const char *ext_msgs[] = { "Probe Filter error" /* 1_1111b */ }; EXPORT_SYMBOL_GPL(ext_msgs); + +void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) +{ + int ecc; + u32 ec = ERROR_CODE(regs->nbsl); + u32 xec = EXT_ERROR_CODE(regs->nbsl); + + if (!handle_errors) + return; + + pr_emerg(" Northbridge Error, node %d", node_id); + + /* + * F10h, revD can disable ErrCpu[3:0] so check that first and also the + * value encoding has changed so interpret those differently + */ + if ((boot_cpu_data.x86 == 0x10) && + (boot_cpu_data.x86_model > 8)) { + if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + } else { + pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); + } + + pr_emerg(" Error: %sorrected", + ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); + pr_cont(", Report Error: %s", + ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); + pr_cont(", MiscV: %svalid, CPU context corrupt: %s", + ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), + ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); + + /* do the two bits[14:13] together */ + ecc = regs->nbsh & (0x3 << 13); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (TLB_ERROR(ec)) { + /* + * GART errors are intended to help graphics driver developers + * to detect bad GART PTEs. It is recommended by AMD to disable + * GART table walk error reporting by default[1] (currently + * being disabled in mce_cpu_quirks()) and according to the + * comment in mce_cpu_quirks(), such GART errors can be + * incorrectly triggered. We may see these errors anyway and + * unless requested by the user, they won't be reported. + * + * [1] section 13.10.1 on BIOS and Kernel Developers Guide for + * AMD NPT family 0Fh processors + */ + if (!report_gart_errors) + return; + + pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + TT_MSG(ec), LL_MSG(ec)); + } else if (MEM_ERROR(ec)) { + pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," + " Cache Level: %s", + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); + } else if (BUS_ERROR(ec)) { + pr_emerg(" Bus (Link/DRAM) error\n"); + if (nb_bus_decoder) + nb_bus_decoder(node_id, regs, ecc); + } else { + /* shouldn't reach here! */ + pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); + } + + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); +} +EXPORT_SYMBOL_GPL(amd_decode_nb_mce); + +void decode_mce(struct mce *m) +{ + struct err_regs regs; + int node; + + if (m->bank != 4) + return; + + regs.nbsl = (u32) m->status; + regs.nbsh = (u32)(m->status >> 32); + regs.nbeal = (u32) m->addr; + regs.nbeah = (u32)(m->addr >> 32); + node = topology_cpu_node_id(m->extcpu); + + amd_decode_nb_mce(node, ®s, 1); +} diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 39971cdabb5..9114dc62782 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -1,3 +1,8 @@ +#ifndef _EDAC_MCE_AMD_H +#define _EDAC_MCE_AMD_H + +#include + #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) #define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] @@ -22,6 +27,20 @@ #define PP(x) (((x) >> 9) & 0x3) #define PP_MSG(x) pp_msgs[PP(x)] +#define K8_NBSH 0x4C + +#define K8_NBSH_VALID_BIT BIT(31) +#define K8_NBSH_OVERFLOW BIT(30) +#define K8_NBSH_UC_ERR BIT(29) +#define K8_NBSH_ERR_EN BIT(28) +#define K8_NBSH_MISCV BIT(27) +#define K8_NBSH_VALID_ERROR_ADDR BIT(26) +#define K8_NBSH_PCC BIT(25) +#define K8_NBSH_ERR_CPU_VAL BIT(24) +#define K8_NBSH_CECC BIT(14) +#define K8_NBSH_UECC BIT(13) +#define K8_NBSH_ERR_SCRUBER BIT(8) + extern const char *tt_msgs[]; extern const char *ll_msgs[]; extern const char *rrrr_msgs[]; @@ -29,3 +48,22 @@ extern const char *pp_msgs[]; extern const char *to_msgs[]; extern const char *ii_msgs[]; extern const char *ext_msgs[]; + +/* + * relevant NB regs + */ +struct err_regs { + u32 nbcfg; + u32 nbsh; + u32 nbsl; + u32 nbeah; + u32 nbeal; +}; + + +void amd_report_gart_errors(bool); +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_decode_nb_mce(int, struct err_regs *, int); + +#endif /* _EDAC_MCE_AMD_H */ -- cgit v1.2.3 From b69b29de65fe4078b125acc9dea34be82f7c362c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 27 Jul 2009 16:21:14 +0200 Subject: EDAC, AMD: carve out MCi_STATUS decoding The MCi_STATUS registers have most field definitions in common so decode them in the general path. Do not pass ecc_type along and compute it in __amd64_decode_bus_error instead. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 8 +++---- drivers/edac/edac_mce_amd.c | 57 ++++++++++++++++++++++----------------------- drivers/edac/edac_mce_amd.h | 4 ++-- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 2080b1e2e8a..c81ca2cf8dc 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2283,10 +2283,11 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info, int ecc_type) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); + int ecc_type = info->nbsh & (0x3 << 13); pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); @@ -2316,12 +2317,11 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } -void amd64_decode_bus_error(int node_id, struct err_regs *regs, - int ecc_type) +void amd64_decode_bus_error(int node_id, struct err_regs *regs) { struct mem_ctl_info *mci = mci_lookup[node_id]; - __amd64_decode_bus_error(mci, regs, ecc_type); + __amd64_decode_bus_error(mci, regs); /* * Check the UE bit of the NB status high register, if set generate some diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 444c2cc4472..0ba92d65db4 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -2,7 +2,7 @@ #include "edac_mce_amd.h" static bool report_gart_errors; -static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type); +static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); void amd_report_gart_errors(bool v) { @@ -10,13 +10,13 @@ void amd_report_gart_errors(bool v) } EXPORT_SYMBOL_GPL(amd_report_gart_errors); -void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)) +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)) { nb_bus_decoder = f; } EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); -void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)) +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)) { if (nb_bus_decoder) { WARN_ON(nb_bus_decoder != f); @@ -130,7 +130,6 @@ EXPORT_SYMBOL_GPL(ext_msgs); void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { - int ecc; u32 ec = ERROR_CODE(regs->nbsl); u32 xec = EXT_ERROR_CODE(regs->nbsl); @@ -151,21 +150,6 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); } - pr_emerg(" Error: %sorrected", - ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); - pr_cont(", Report Error: %s", - ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); - pr_cont(", MiscV: %svalid, CPU context corrupt: %s", - ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), - ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); - - /* do the two bits[14:13] together */ - ecc = regs->nbsh & (0x3 << 13); - if (ecc) - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); - - pr_cont("\n"); - if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers @@ -191,7 +175,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) } else if (BUS_ERROR(ec)) { pr_emerg(" Bus (Link/DRAM) error\n"); if (nb_bus_decoder) - nb_bus_decoder(node_id, regs, ecc); + nb_bus_decoder(node_id, regs); } else { /* shouldn't reach here! */ pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); @@ -204,16 +188,31 @@ EXPORT_SYMBOL_GPL(amd_decode_nb_mce); void decode_mce(struct mce *m) { struct err_regs regs; - int node; + int node, ecc; - if (m->bank != 4) - return; + pr_emerg("MC%d_STATUS:\n", m->bank); - regs.nbsl = (u32) m->status; - regs.nbsh = (u32)(m->status >> 32); - regs.nbeal = (u32) m->addr; - regs.nbeah = (u32)(m->addr >> 32); - node = topology_cpu_node_id(m->extcpu); + pr_emerg(" Error: %sorrected, Report: %s, MiscV: %svalid, " + "CPU context corrupt: %s", + ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), + ((m->status & MCI_STATUS_EN) ? "yes" : "no"), + ((m->status & MCI_STATUS_MISCV) ? "" : "in"), + ((m->status & MCI_STATUS_PCC) ? "yes" : "no")); - amd_decode_nb_mce(node, ®s, 1); + /* do the two bits[14:13] together */ + ecc = m->status & (3ULL << 45); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (m->bank == 4) { + regs.nbsl = (u32) m->status; + regs.nbsh = (u32)(m->status >> 32); + regs.nbeal = (u32) m->addr; + regs.nbeah = (u32)(m->addr >> 32); + node = per_cpu(cpu_llc_id, m->extcpu); + + amd_decode_nb_mce(node, ®s, 1); + } } diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 9114dc62782..df23ee065f7 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -62,8 +62,8 @@ struct err_regs { void amd_report_gart_errors(bool); -void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)); -void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); void amd_decode_nb_mce(int, struct err_regs *, int); #endif /* _EDAC_MCE_AMD_H */ -- cgit v1.2.3 From d93cc222adf3532ddb442648f8db00c15d1dc4c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 10:56:15 +0200 Subject: EDAC, AMD: carve out decoding of MCi_STATUS ErrorCode This is the MCE error code from the MCi_STATUS banks, bits [15:0] which describe what type of error was encountered: GART TLB, Memory or Bus error. The semantics of those bits are identical across all MCE banks so decode those separately, irrespectively of MCE type. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 4 ---- drivers/edac/edac_mce_amd.c | 37 ++++++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c81ca2cf8dc..173dc4a8416 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2289,10 +2289,6 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, u32 xec = EXT_ERROR_CODE(info->nbsl); int ecc_type = info->nbsh & (0x3 << 13); - pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", - RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); - - /* Bail early out if this was an 'observed' error */ if (PP(ec) == K8_NBSL_PP_OBS) return; diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 0ba92d65db4..81f812eb3ae 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -150,6 +150,16 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); } + + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + + if (BUS_ERROR(ec) && nb_bus_decoder) + nb_bus_decoder(node_id, regs); +} +EXPORT_SYMBOL_GPL(amd_decode_nb_mce); + +static inline void amd_decode_err_code(unsigned int ec) +{ if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers @@ -166,33 +176,28 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) if (!report_gart_errors) return; - pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + pr_emerg(" Transaction: %s, Cache Level %s\n", TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," - " Cache Level: %s", + pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s", RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg(" Bus (Link/DRAM) error\n"); - if (nb_bus_decoder) - nb_bus_decoder(node_id, regs); - } else { - /* shouldn't reach here! */ - pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); - } - - pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, " + "Participating Processor: %s\n", + RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), + PP_MSG(ec)); + } else + pr_warning("Huh? Unknown MCE error 0x%x\n", ec); } -EXPORT_SYMBOL_GPL(amd_decode_nb_mce); void decode_mce(struct mce *m) { struct err_regs regs; int node, ecc; - pr_emerg("MC%d_STATUS:\n", m->bank); + pr_emerg("MC%d_STATUS: ", m->bank); - pr_emerg(" Error: %sorrected, Report: %s, MiscV: %svalid, " + pr_cont("%sorrected error, report: %s, MiscV: %svalid, " "CPU context corrupt: %s", ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), ((m->status & MCI_STATUS_EN) ? "yes" : "no"), @@ -206,6 +211,8 @@ void decode_mce(struct mce *m) pr_cont("\n"); + amd_decode_err_code(m->status & 0xffff); + if (m->bank == 4) { regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); -- cgit v1.2.3 From 51966241360874e85d1e4d93c9fcdd2ef917b0fb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 13:50:43 +0200 Subject: EDAC, AMD: decode data cache MCEs Those get reported in MC0_STATUS, see Table 92, F10h BKDG (31116, rev. 3.28) for more details. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 56 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 81f812eb3ae..fe8ccebd967 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -128,6 +128,49 @@ const char *ext_msgs[] = { }; EXPORT_SYMBOL_GPL(ext_msgs); +static void amd_decode_dc_mce(u64 mc0_status) +{ + u32 ec = mc0_status & 0xffff; + u32 xec = (mc0_status >> 16) & 0xf; + + pr_emerg(" Data Cache Error"); + + if (xec == 1 && TLB_ERROR(ec)) + pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); + else if (xec == 0) { + if (mc0_status & (1ULL << 40)) + pr_cont(" during Data Scrub.\n"); + else if (TLB_ERROR(ec)) + pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); + else if (MEM_ERROR(ec)) { + u8 ll = ec & 0x3; + u8 tt = (ec >> 2) & 0x3; + u8 rrrr = (ec >> 4) & 0xf; + + /* see F10h BKDG (31116), Table 92. */ + if (ll == 0x1) { + if (tt != 0x1) + goto wrong_dc_mce; + + pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec)); + + } else if (ll == 0x2 && rrrr == 0x3) + pr_cont(" during L1 linefill from L2.\n"); + else + goto wrong_dc_mce; + } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf) + pr_cont(" during system linefill.\n"); + else + goto wrong_dc_mce; + } else + goto wrong_dc_mce; + + return; + +wrong_dc_mce: + pr_warning("Corrupted DC MCE info?\n"); +} + void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { u32 ec = ERROR_CODE(regs->nbsl); @@ -211,9 +254,12 @@ void decode_mce(struct mce *m) pr_cont("\n"); - amd_decode_err_code(m->status & 0xffff); + switch (m->bank) { + case 0: + amd_decode_dc_mce(m->status); + break; - if (m->bank == 4) { + case 4: regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); regs.nbeal = (u32) m->addr; @@ -221,5 +267,11 @@ void decode_mce(struct mce *m) node = per_cpu(cpu_llc_id, m->extcpu); amd_decode_nb_mce(node, ®s, 1); + break; + + default: + break; } + + amd_decode_err_code(m->status & 0xffff); } -- cgit v1.2.3 From ab5535e70fb35b8046b6ace50259fe212e074a4f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:06:26 +0200 Subject: EDAC, AMD: decode instruction cache MCEs See Fam10h BKDG (31116, rev. 3.28), Table 95 Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index fe8ccebd967..b30a8306b14 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -171,6 +171,63 @@ wrong_dc_mce: pr_warning("Corrupted DC MCE info?\n"); } +static void amd_decode_ic_mce(u64 mc1_status) +{ + u32 ec = mc1_status & 0xffff; + u32 xec = (mc1_status >> 16) & 0xf; + + pr_emerg(" Instruction Cache Error"); + + if (xec == 1 && TLB_ERROR(ec)) + pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); + else if (xec == 0) { + if (TLB_ERROR(ec)) + pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); + else if (BUS_ERROR(ec)) { + if (boot_cpu_data.x86 == 0xf && + (mc1_status & (1ULL << 58))) + pr_cont(" during system linefill.\n"); + else + pr_cont(" during attempted NB data read.\n"); + } else if (MEM_ERROR(ec)) { + u8 ll = ec & 0x3; + u8 rrrr = (ec >> 4) & 0xf; + + if (ll == 0x2) + pr_cont(" during a linefill from L2.\n"); + else if (ll == 0x1) { + + switch (rrrr) { + case 0x5: + pr_cont(": Parity error during " + "data load.\n"); + break; + + case 0x7: + pr_cont(": Copyback Parity/Victim" + " error.\n"); + break; + + case 0x8: + pr_cont(": Tag Snoop error.\n"); + break; + + default: + goto wrong_ic_mce; + break; + } + } + } else + goto wrong_ic_mce; + } else + goto wrong_ic_mce; + + return; + +wrong_ic_mce: + pr_warning("Corrupted IC MCE info?\n"); +} + void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { u32 ec = ERROR_CODE(regs->nbsl); @@ -259,6 +316,10 @@ void decode_mce(struct mce *m) amd_decode_dc_mce(m->status); break; + case 1: + amd_decode_ic_mce(m->status); + break; + case 4: regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); -- cgit v1.2.3 From 56cad2d6fb832a876ab8bda4b01e5d0722dc754b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:14:24 +0200 Subject: EDAC, AMD: decode bus unit MCEs ... according to Table 69, Fam10h BKDG (31116, rev. 3.28). Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index b30a8306b14..e1f32c36248 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -228,6 +228,48 @@ wrong_ic_mce: pr_warning("Corrupted IC MCE info?\n"); } +static void amd_decode_bu_mce(u64 mc2_status) +{ + u32 ec = mc2_status & 0xffff; + u32 xec = (mc2_status >> 16) & 0xf; + + pr_emerg(" Bus Unit Error"); + + if (xec == 0x1) + pr_cont(" in the write data buffers.\n"); + else if (xec == 0x3) + pr_cont(" in the victim data buffers.\n"); + else if (xec == 0x2 && MEM_ERROR(ec)) + pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); + else if (xec == 0x0) { + if (TLB_ERROR(ec)) + pr_cont(": %s error in a Page Descriptor Cache or " + "Guest TLB.\n", TT_MSG(ec)); + else if (BUS_ERROR(ec)) + pr_cont(": %s/ECC error in data read from NB: %s.\n", + RRRR_MSG(ec), PP_MSG(ec)); + else if (MEM_ERROR(ec)) { + u8 rrrr = (ec >> 4) & 0xf; + + if (rrrr >= 0x7) + pr_cont(": %s error during data copyback.\n", + RRRR_MSG(ec)); + else if (rrrr <= 0x1) + pr_cont(": %s parity/ECC error during data " + "access from L2.\n", RRRR_MSG(ec)); + else + goto wrong_bu_mce; + } else + goto wrong_bu_mce; + } else + goto wrong_bu_mce; + + return; + +wrong_bu_mce: + pr_warning("Corrupted BU MCE info?\n"); +} + void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { u32 ec = ERROR_CODE(regs->nbsl); @@ -320,6 +362,10 @@ void decode_mce(struct mce *m) amd_decode_ic_mce(m->status); break; + case 2: + amd_decode_bu_mce(m->status); + break; + case 4: regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); -- cgit v1.2.3 From f9350efd6f37ef60d2334739edb76ef1f8ee0183 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:17:30 +0200 Subject: EDAC, AMD: decode load store MCEs See Fam10h BKDG (31116, rev. 3.28), Table 100. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index e1f32c36248..22848285536 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -270,6 +270,27 @@ wrong_bu_mce: pr_warning("Corrupted BU MCE info?\n"); } +static void amd_decode_ls_mce(u64 mc3_status) +{ + u32 ec = mc3_status & 0xffff; + u32 xec = (mc3_status >> 16) & 0xf; + + pr_emerg(" Load Store Error"); + + if (xec == 0x0) { + u8 rrrr = (ec >> 4) & 0xf; + + if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4)) + goto wrong_ls_mce; + + pr_cont(" during %s.\n", RRRR_MSG(ec)); + } + return; + +wrong_ls_mce: + pr_warning("Corrupted LS MCE info?\n"); +} + void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { u32 ec = ERROR_CODE(regs->nbsl); @@ -366,6 +387,10 @@ void decode_mce(struct mce *m) amd_decode_bu_mce(m->status); break; + case 3: + amd_decode_ls_mce(m->status); + break; + case 4: regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); -- cgit v1.2.3 From 53bd5fedca7d0c28b35b02cab5f4e27bf8d7fabe Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:20:46 +0200 Subject: EDAC, AMD: decode FR MCEs See Fam10h BKDG (31116, rev. 3.28), Table 101. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 22848285536..c8ca7136dac 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -321,6 +321,15 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) } EXPORT_SYMBOL_GPL(amd_decode_nb_mce); +static void amd_decode_fr_mce(u64 mc5_status) +{ + /* we have only one error signature so match all fields at once. */ + if ((mc5_status & 0xffff) == 0x0f0f) + pr_emerg(" FR Error: CPU Watchdog timer expire.\n"); + else + pr_warning("Corrupted FR MCE info?\n"); +} + static inline void amd_decode_err_code(unsigned int ec) { if (TLB_ERROR(ec)) { @@ -401,6 +410,10 @@ void decode_mce(struct mce *m) amd_decode_nb_mce(node, ®s, 1); break; + case 5: + amd_decode_fr_mce(m->status); + break; + default: break; } -- cgit v1.2.3 From 22223c9b417be5fd0ab2cf9ad17eb7bd1e19f7b9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:47:10 +0200 Subject: x86, mce: do not compile mcelog message on AMD Now that decoding is done in-kernel, suppress mcelog message part. CC: Andi Kleen Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b82866f6adf..9bfe9d2ea61 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -222,7 +222,10 @@ static void print_mce_head(void) static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD)) + "Run through mcelog --ascii to decode and contact your hardware vendor\n" +#endif + ); } #define PANIC_TIMEOUT 5 /* 5 seconds */ -- cgit v1.2.3