Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp: x86, mce: do not compile mcelog message on AMD EDAC, AMD: decode FR MCEs EDAC, AMD: decode load store MCEs EDAC, AMD: decode bus unit MCEs EDAC, AMD: decode instruction cache MCEs EDAC, AMD: decode data cache MCEs EDAC, AMD: carve out decoding of MCi_STATUS ErrorCode EDAC, AMD: carve out MCi_STATUS decoding x86, mce: pass mce info to EDAC for decoding amd64_edac: cleanup amd64_decode_bus_error amd64_edac: remove memory and GART TLB error decoders amd64_edac: cleanup/complete NB MCE decoding amd64_edac: cleanup amd64_process_error_info EDAC: beef up ErrorCodeExt error signatures EDAC: move MCE error descriptions to EDAC core
author: Linus Torvalds <torvalds@linux-foundation.org> 2009-09-14 17:38:38 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-09-14 17:38:38 -0700
commit: f65ac45e20b03081ed64f41ce91bb982f8ac258d (patch)
tree: 615e966b6c792ccd840f994f38591ff5d3d85f72
parent: 4142e0d1def2c0176c27fd2e810243045a62eb6d (diff)
parent: 22223c9b417be5fd0ab2cf9ad17eb7bd1e19f7b9 (diff)
8 files changed, 619 insertions, 452 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62..9bfe9d2ea61 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -183,6 +183,11 @@ void mce_log(struct mce *mce)
 	set_bit(0, &mce_need_notify);
 }
 
+void __weak decode_mce(struct mce *m)
+{
+	return;
+}
+
 static void print_mce(struct mce *m)
 {
 	printk(KERN_EMERG
@@ -205,6 +210,8 @@ static void print_mce(struct mce *m)
 	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
 			m->cpuvendor, m->cpuid, m->time, m->socketid,
 			m->apicid);
+
+	decode_mce(m);
 }
 
 static void print_mce_head(void)
@@ -215,7 +222,10 @@ static void print_mce_head(void)
 static void print_mce_tail(void)
 {
 	printk(KERN_EMERG "This is not a software problem!\n"
-	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
+	       "Run through mcelog --ascii to decode and contact your hardware vendor\n"
+#endif
+	       );
 }
 
 #define PANIC_TIMEOUT 5 /* 5 seconds */
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 98aa4a7db41..cfa033ce53a 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -17,6 +17,10 @@ ifdef CONFIG_PCI
 edac_core-objs	+= edac_pci.o edac_pci_sysfs.o
 endif
 
+ifdef CONFIG_CPU_SUP_AMD
+edac_core-objs  += edac_mce_amd.o
+endif
+
 obj-$(CONFIG_EDAC_AMD76X)		+= amd76x_edac.o
 obj-$(CONFIG_EDAC_CPC925)		+= cpc925_edac.o
 obj-$(CONFIG_EDAC_I5000)		+= i5000_edac.o
@@ -32,7 +36,7 @@ obj-$(CONFIG_EDAC_X38)			+= x38_edac.o
 obj-$(CONFIG_EDAC_I82860)		+= i82860_edac.o
 obj-$(CONFIG_EDAC_R82600)		+= r82600_edac.o
 
-amd64_edac_mod-y :=  amd64_edac_err_types.o amd64_edac.o
+amd64_edac_mod-y := amd64_edac.o
 amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o
 amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o
 
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e2a10bcba7a..173dc4a8416 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -19,6 +19,63 @@ static struct mem_ctl_info *mci_lookup[MAX_NUMNODES];
 static struct amd64_pvt *pvt_lookup[MAX_NUMNODES];
 
 /*
+ * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
+ * for DDR2 DRAM mapping.
+ */
+u32 revf_quad_ddr2_shift[] = {
+	0,	/* 0000b NULL DIMM (128mb) */
+	28,	/* 0001b 256mb */
+	29,	/* 0010b 512mb */
+	29,	/* 0011b 512mb */
+	29,	/* 0100b 512mb */
+	30,	/* 0101b 1gb */
+	30,	/* 0110b 1gb */
+	31,	/* 0111b 2gb */
+	31,	/* 1000b 2gb */
+	32,	/* 1001b 4gb */
+	32,	/* 1010b 4gb */
+	33,	/* 1011b 8gb */
+	0,	/* 1100b future */
+	0,	/* 1101b future */
+	0,	/* 1110b future */
+	0	/* 1111b future */
+};
+
+/*
+ * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
+ * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
+ * or higher value'.
+ *
+ *FIXME: Produce a better mapping/linearisation.
+ */
+
+struct scrubrate scrubrates[] = {
+	{ 0x01, 1600000000UL},
+	{ 0x02, 800000000UL},
+	{ 0x03, 400000000UL},
+	{ 0x04, 200000000UL},
+	{ 0x05, 100000000UL},
+	{ 0x06, 50000000UL},
+	{ 0x07, 25000000UL},
+	{ 0x08, 12284069UL},
+	{ 0x09, 6274509UL},
+	{ 0x0A, 3121951UL},
+	{ 0x0B, 1560975UL},
+	{ 0x0C, 781440UL},
+	{ 0x0D, 390720UL},
+	{ 0x0E, 195300UL},
+	{ 0x0F, 97650UL},
+	{ 0x10, 48854UL},
+	{ 0x11, 24427UL},
+	{ 0x12, 12213UL},
+	{ 0x13, 6101UL},
+	{ 0x14, 3051UL},
+	{ 0x15, 1523UL},
+	{ 0x16, 761UL},
+	{ 0x00, 0UL},        /* scrubbing off */
+};
+
+/*
  * Memory scrubber control interface. For K8, memory scrubbing is handled by
  * hardware and can involve L2 cache, dcache as well as the main memory. With
  * F10, this is extended to L3 cache scrubbing on CPU models sporting that
@@ -693,7 +750,7 @@ static void find_csrow_limits(struct mem_ctl_info *mci, int csrow,
  * specific.
  */
 static u64 extract_error_address(struct mem_ctl_info *mci,
-				 struct amd64_error_info_regs *info)
+				 struct err_regs *info)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
 
@@ -1049,7 +1106,7 @@ static int k8_early_channel_count(struct amd64_pvt *pvt)
 
 /* extract the ERROR ADDRESS for the K8 CPUs */
 static u64 k8_get_error_address(struct mem_ctl_info *mci,
-				struct amd64_error_info_regs *info)
+				struct err_regs *info)
 {
 	return (((u64) (info->nbeah & 0xff)) << 32) +
 			(info->nbeal & ~0x03);
@@ -1092,7 +1149,7 @@ static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
 }
 
 static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
-					struct amd64_error_info_regs *info,
+					struct err_regs *info,
 					u64 SystemAddress)
 {
 	struct mem_ctl_info *src_mci;
@@ -1101,8 +1158,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
 	u32 page, offset;
 
 	/* Extract the syndrome parts and form a 16-bit syndrome */
-	syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
-	syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+	syndrome  = HIGH_SYNDROME(info->nbsl) << 8;
+	syndrome |= LOW_SYNDROME(info->nbsh);
 
 	/* CHIPKILL enabled */
 	if (info->nbcfg & K8_NBCFG_CHIPKILL) {
@@ -1311,7 +1368,7 @@ static void amd64_teardown(struct amd64_pvt *pvt)
 }
 
 static u64 f10_get_error_address(struct mem_ctl_info *mci,
-			struct amd64_error_info_regs *info)
+			struct err_regs *info)
 {
 	return (((u64) (info->nbeah & 0xffff)) << 32) +
 			(info->nbeal & ~0x01);
@@ -1688,7 +1745,7 @@ static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
  * The @sys_addr is usually an error address received from the hardware.
  */
 static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
-				     struct amd64_error_info_regs *info,
+				     struct err_regs *info,
 				     u64 sys_addr)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
@@ -1701,8 +1758,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
 	if (csrow >= 0) {
 		error_address_to_page_and_offset(sys_addr, &page, &offset);
 
-		syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
-		syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+		syndrome  = HIGH_SYNDROME(info->nbsl) << 8;
+		syndrome |= LOW_SYNDROME(info->nbsh);
 
 		/*
 		 * Is CHIPKILL on? If so, then we can attempt to use the
@@ -2045,7 +2102,7 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
  *	- 0: if no valid error is indicated
  */
 static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
-				     struct amd64_error_info_regs *regs)
+				     struct err_regs *regs)
 {
 	struct amd64_pvt *pvt;
 	struct pci_dev *misc_f3_ctl;
@@ -2094,10 +2151,10 @@ err_reg:
  *	- 0: if no error is found
  */
 static int amd64_get_error_info(struct mem_ctl_info *mci,
-				struct amd64_error_info_regs *info)
+				struct err_regs *info)
 {
 	struct amd64_pvt *pvt;
-	struct amd64_error_info_regs regs;
+	struct err_regs regs;
 
 	pvt = mci->pvt_info;
 
@@ -2152,48 +2209,12 @@ static int amd64_get_error_info(struct mem_ctl_info *mci,
 	return 1;
 }
 
-static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
-					 struct amd64_error_info_regs *info)
-{
-	u32 err_code;
-	u32 ec_tt;		/* error code transaction type (2b) */
-	u32 ec_ll;		/* error code cache level (2b) */
-
-	err_code = EXTRACT_ERROR_CODE(info->nbsl);
-	ec_ll = EXTRACT_LL_CODE(err_code);
-	ec_tt = EXTRACT_TT_CODE(err_code);
-
-	amd64_mc_printk(mci, KERN_ERR,
-		     "GART TLB event: transaction type(%s), "
-		     "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
-}
-
-static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
-				      struct amd64_error_info_regs *info)
-{
-	u32 err_code;
-	u32 ec_rrrr;		/* error code memory transaction (4b) */
-	u32 ec_tt;		/* error code transaction type (2b) */
-	u32 ec_ll;		/* error code cache level (2b) */
-
-	err_code = EXTRACT_ERROR_CODE(info->nbsl);
-	ec_ll = EXTRACT_LL_CODE(err_code);
-	ec_tt = EXTRACT_TT_CODE(err_code);
-	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
-
-	amd64_mc_printk(mci, KERN_ERR,
-		     "cache hierarchy error: memory transaction type(%s), "
-		     "transaction type(%s), cache level(%s)\n",
-		     rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
-}
-
-
 /*
  * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
  * ADDRESS and process.
  */
 static void amd64_handle_ce(struct mem_ctl_info *mci,
-			    struct amd64_error_info_regs *info)
+			    struct err_regs *info)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
 	u64 SystemAddress;
@@ -2216,7 +2237,7 @@ static void amd64_handle_ce(struct mem_ctl_info *mci,
 
 /* Handle any Un-correctable Errors (UEs) */
 static void amd64_handle_ue(struct mem_ctl_info *mci,
-			    struct amd64_error_info_regs *info)
+			    struct err_regs *info)
 {
 	int csrow;
 	u64 SystemAddress;
@@ -2261,59 +2282,24 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
 	}
 }
 
-static void amd64_decode_bus_error(struct mem_ctl_info *mci,
-				   struct amd64_error_info_regs *info)
+static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
+					    struct err_regs *info)
 {
-	u32 err_code, ext_ec;
-	u32 ec_pp;		/* error code participating processor (2p) */
-	u32 ec_to;		/* error code timed out (1b) */
-	u32 ec_rrrr;		/* error code memory transaction (4b) */
-	u32 ec_ii;		/* error code memory or I/O (2b) */
-	u32 ec_ll;		/* error code cache level (2b) */
+	u32 ec  = ERROR_CODE(info->nbsl);
+	u32 xec = EXT_ERROR_CODE(info->nbsl);
+	int ecc_type = info->nbsh & (0x3 << 13);
 
-	ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
-	err_code = EXTRACT_ERROR_CODE(info->nbsl);
-
-	ec_ll = EXTRACT_LL_CODE(err_code);
-	ec_ii = EXTRACT_II_CODE(err_code);
-	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
-	ec_to = EXTRACT_TO_CODE(err_code);
-	ec_pp = EXTRACT_PP_CODE(err_code);
-
-	amd64_mc_printk(mci, KERN_ERR,
-		"BUS ERROR:\n"
-		"  time-out(%s) mem or i/o(%s)\n"
-		"  participating processor(%s)\n"
-		"  memory transaction type(%s)\n"
-		"  cache level(%s) Error Found by: %s\n",
-		to_msgs[ec_to],
-		ii_msgs[ec_ii],
-		pp_msgs[ec_pp],
-		rrrr_msgs[ec_rrrr],
-		ll_msgs[ec_ll],
-		(info->nbsh & K8_NBSH_ERR_SCRUBER) ?
-			"Scrubber" : "Normal Operation");
-
-	/* If this was an 'observed' error, early out */
-	if (ec_pp == K8_NBSL_PP_OBS)
-		return;		/* We aren't the node involved */
-
-	/* Parse out the extended error code for ECC events */
-	switch (ext_ec) {
-	/* F10 changed to one Extended ECC error code */
-	case F10_NBSL_EXT_ERR_RES:		/* Reserved field */
-	case F10_NBSL_EXT_ERR_ECC:		/* F10 ECC ext err code */
-		break;
+	/* Bail early out if this was an 'observed' error */
+	if (PP(ec) == K8_NBSL_PP_OBS)
+		return;
 
-	default:
-		amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
-					       "handling for this error\n");
+	/* Do only ECC errors */
+	if (xec && xec != F10_NBSL_EXT_ERR_ECC)
 		return;
-	}
 
-	if (info->nbsh & K8_NBSH_CECC)
+	if (ecc_type == 2)
 		amd64_handle_ce(mci, info);
-	else if (info->nbsh & K8_NBSH_UECC)
+	else if (ecc_type == 1)
 		amd64_handle_ue(mci, info);
 
 	/*
@@ -2324,139 +2310,26 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
 	 * catastrophic.
 	 */
 	if (info->nbsh & K8_NBSH_OVERFLOW)
-		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
-					  "Error Overflow set");
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
 }
 
-int amd64_process_error_info(struct mem_ctl_info *mci,
-			     struct amd64_error_info_regs *info,
-			     int handle_errors)
+void amd64_decode_bus_error(int node_id, struct err_regs *regs)
 {
-	struct amd64_pvt *pvt;
-	struct amd64_error_info_regs *regs;
-	u32 err_code, ext_ec;
-	int gart_tlb_error = 0;
-
-	pvt = mci->pvt_info;
-
-	/* If caller doesn't want us to process the error, return */
-	if (!handle_errors)
-		return 1;
-
-	regs = info;
-
-	debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
-	debugf1("  MC node(%d) Error-Address(0x%.8x-%.8x)\n",
-		pvt->mc_node_id, regs->nbeah, regs->nbeal);
-	debugf1("  nbsh(0x%.8x) nbsl(0x%.8x)\n",
-		regs->nbsh, regs->nbsl);
-	debugf1("  Valid Error=%s Overflow=%s\n",
-		(regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
-		(regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
-	debugf1("  Err Uncorrected=%s MCA Error Reporting=%s\n",
-		(regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_ERR_ENABLE) ?
-			"True" : "False");
-	debugf1("  MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
-		(regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_PCC) ?
-			"True" : "False");
-	debugf1("  CECC=%s UECC=%s Found by Scruber=%s\n",
-		(regs->nbsh & K8_NBSH_CECC) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_UECC) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
-			"True" : "False");
-	debugf1("  CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
-		(regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
-		(regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
-		(regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
-		(regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
-
-
-	err_code = EXTRACT_ERROR_CODE(regs->nbsl);
-
-	/* Determine which error type:
-	 *	1) GART errors - non-fatal, developmental events
-	 *	2) MEMORY errors
-	 *	3) BUS errors
-	 *	4) Unknown error
-	 */
-	if (TEST_TLB_ERROR(err_code)) {
-		/*
-		 * GART errors are intended to help graphics driver developers
-		 * to detect bad GART PTEs. It is recommended by AMD to disable
-		 * GART table walk error reporting by default[1] (currently
-		 * being disabled in mce_cpu_quirks()) and according to the
-		 * comment in mce_cpu_quirks(), such GART errors can be
-		 * incorrectly triggered. We may see these errors anyway and
-		 * unless requested by the user, they won't be reported.
-		 *
-		 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
-		 *     AMD NPT family 0Fh processors
-		 */
-		if (report_gart_errors == 0)
-			return 1;
-
-		/*
-		 * Only if GART error reporting is requested should we generate
-		 * any logs.
-		 */
-		gart_tlb_error = 1;
-
-		debugf1("GART TLB error\n");
-		amd64_decode_gart_tlb_error(mci, info);
-	} else if (TEST_MEM_ERROR(err_code)) {
-		debugf1("Memory/Cache error\n");
-		amd64_decode_mem_cache_error(mci, info);
-	} else if (TEST_BUS_ERROR(err_code)) {
-		debugf1("Bus (Link/DRAM) error\n");
-		amd64_decode_bus_error(mci, info);
-	} else {
-		/* shouldn't reach here! */
-		amd64_mc_printk(mci, KERN_WARNING,
-			     "%s(): unknown MCE error 0x%x\n", __func__,
-			     err_code);
-	}
-
-	ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
-	amd64_mc_printk(mci, KERN_ERR,
-		"ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
+	struct mem_ctl_info *mci = mci_lookup[node_id];
 
-	if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
-			ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
-			(ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
-			EXTRACT_LDT_LINK(info->nbsh)) {
-
-		amd64_mc_printk(mci, KERN_ERR,
-			"Error on hypertransport link: %s\n",
-			htlink_msgs[
-			EXTRACT_LDT_LINK(info->nbsh)]);
-	}
+	__amd64_decode_bus_error(mci, regs);
 
 	/*
 	 * Check the UE bit of the NB status high register, if set generate some
 	 * logs. If NOT a GART error, then process the event as a NO-INFO event.
 	 * If it was a GART error, skip that process.
+	 *
+	 * FIXME: this should go somewhere else, if at all.
 	 */
-	if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
-		amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
-		if (!gart_tlb_error)
-			edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
-	}
-
-	if (regs->nbsh & K8_NBSH_PCC)
-		amd64_mc_printk(mci, KERN_CRIT,
-			"PCC (processor context corrupt) set\n");
+	if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
+		edac_mc_handle_ue_no_info(mci, "UE bit is set");
 
-	return 1;
 }
-EXPORT_SYMBOL_GPL(amd64_process_error_info);
 
 /*
  * The main polling 'check' function, called FROM the edac core to perform the
@@ -2464,10 +2337,12 @@ EXPORT_SYMBOL_GPL(amd64_process_error_info);
  */
 static void amd64_check(struct mem_ctl_info *mci)
 {
-	struct amd64_error_info_regs info;
+	struct err_regs regs;
 
-	if (amd64_get_error_info(mci, &info))
-		amd64_process_error_info(mci, &info, 1);
+	if (amd64_get_error_info(mci, &regs)) {
+		struct amd64_pvt *pvt = mci->pvt_info;
+		amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
+	}
 }
 
 /*
@@ -3163,6 +3038,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
 
 	mci_lookup[node_id] = mci;
 	pvt_lookup[node_id] = NULL;
+
+	/* register stuff with EDAC MCE */
+	if (report_gart_errors)
+		amd_report_gart_errors(true);
+
+	amd_register_ecc_decoder(amd64_decode_bus_error);
+
 	return 0;
 
 err_add_mc:
@@ -3229,6 +3111,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
 
 	mci_lookup[pvt->mc_node_id] = NULL;
 
+	/* unregister from EDAC MCE */
+	amd_report_gart_errors(false);
+	amd_unregister_ecc_decoder(amd64_decode_bus_error);
+
 	/* Free the EDAC CORE resources */
 	edac_mc_free(mci);
 }
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index ba73015af8e..8ea07e2715d 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -72,6 +72,7 @@
 #include <linux/edac.h>
 #include <asm/msr.h>
 #include "edac_core.h"
+#include "edac_mce_amd.h"
 
 #define amd64_printk(level, fmt, arg...) \
 	edac_printk(level, "amd64", fmt, ##arg)
@@ -303,21 +304,9 @@ enum {
 #define K8_NBSL				0x48
 
 
-#define EXTRACT_HIGH_SYNDROME(x)	(((x) >> 24) & 0xff)
-#define EXTRACT_EXT_ERROR_CODE(x)	(((x) >> 16) & 0x1f)
-
 /* Family F10h: Normalized Extended Error Codes */
 #define F10_NBSL_EXT_ERR_RES		0x0
-#define F10_NBSL_EXT_ERR_CRC		0x1
-#define F10_NBSL_EXT_ERR_SYNC		0x2
-#define F10_NBSL_EXT_ERR_MST		0x3
-#define F10_NBSL_EXT_ERR_TGT		0x4
-#define F10_NBSL_EXT_ERR_GART		0x5
-#define F10_NBSL_EXT_ERR_RMW		0x6
-#define F10_NBSL_EXT_ERR_WDT		0x7
 #define F10_NBSL_EXT_ERR_ECC		0x8
-#define F10_NBSL_EXT_ERR_DEV		0x9
-#define F10_NBSL_EXT_ERR_LINK_DATA	0xA
 
 /* Next two are overloaded values */
 #define F10_NBSL_EXT_ERR_LINK_PROTO	0xB
@@ -348,17 +337,6 @@ enum {
 #define K8_NBSL_EXT_ERR_CHIPKILL_ECC	0x8
 #define K8_NBSL_EXT_ERR_DRAM_PARITY	0xD
 
-#define EXTRACT_ERROR_CODE(x)		((x) & 0xffff)
-#define	TEST_TLB_ERROR(x)		(((x) & 0xFFF0) == 0x0010)
-#define	TEST_MEM_ERROR(x)		(((x) & 0xFF00) == 0x0100)
-#define	TEST_BUS_ERROR(x)		(((x) & 0xF800) == 0x0800)
-#define	EXTRACT_TT_CODE(x)		(((x) >> 2) & 0x3)
-#define	EXTRACT_II_CODE(x)		(((x) >> 2) & 0x3)
-#define	EXTRACT_LL_CODE(x)		(((x) >> 0) & 0x3)
-#define	EXTRACT_RRRR_CODE(x)		(((x) >> 4) & 0xf)
-#define	EXTRACT_TO_CODE(x)		(((x) >> 8) & 0x1)
-#define	EXTRACT_PP_CODE(x)		(((x) >> 9) & 0x3)
-
 /*
  * The following are for BUS type errors AFTER values have been normalized by
  * shifting right
@@ -368,28 +346,7 @@ enum {
 #define K8_NBSL_PP_OBS			0x2
 #define K8_NBSL_PP_GENERIC		0x3
 
-
-#define K8_NBSH				0x4C
-
-#define K8_NBSH_VALID_BIT		BIT(31)
-#define K8_NBSH_OVERFLOW		BIT(30)
-#define K8_NBSH_UNCORRECTED_ERR		BIT(29)
-#define K8_NBSH_ERR_ENABLE		BIT(28)
-#define K8_NBSH_MISC_ERR_VALID		BIT(27)
-#define K8_NBSH_VALID_ERROR_ADDR	BIT(26)
-#define K8_NBSH_PCC			BIT(25)
-#define K8_NBSH_CECC			BIT(14)
-#define K8_NBSH_UECC			BIT(13)
-#define K8_NBSH_ERR_SCRUBER		BIT(8)
-#define K8_NBSH_CORE3			BIT(3)
-#define K8_NBSH_CORE2			BIT(2)
-#define K8_NBSH_CORE1			BIT(1)
-#define K8_NBSH_CORE0			BIT(0)
-
-#define EXTRACT_LDT_LINK(x)		(((x) >> 4) & 0x7)
 #define EXTRACT_ERR_CPU_MAP(x)		((x) & 0xF)
-#define EXTRACT_LOW_SYNDROME(x)		(((x) >> 15) & 0xff)
-
 
 #define K8_NBEAL			0x50
 #define K8_NBEAH			0x54
@@ -455,23 +412,6 @@ enum amd64_chipset_families {
 	F11_CPUS,
 };
 
-/*
- * Structure to hold:
- *
- * 1) dynamically read status and error address HW registers
- * 2) sysfs entered values
- * 3) MCE values
- *
- * Depends on entry into the modules
- */
-struct amd64_error_info_regs {
-	u32 nbcfg;
-	u32 nbsh;
-	u32 nbsl;
-	u32 nbeah;
-	u32 nbeal;
-};
-
 /* Error injection control structure */
 struct error_injection {
 	u32	section;
@@ -542,7 +482,7 @@ struct amd64_pvt {
 	u32 online_spare;               /* On-Line spare Reg */
 
 	/* temp storage for when input is received from sysfs */
-	struct amd64_error_info_regs ctl_error_info;
+	struct err_regs ctl_error_info;
 
 	/* place to store error injection parameters prior to issue */
 	struct error_injection injection;
@@ -601,11 +541,11 @@ struct low_ops {
 	int (*early_channel_count)(struct amd64_pvt *pvt);
 
 	u64 (*get_error_address)(struct mem_ctl_info *mci,
-			struct amd64_error_info_regs *info);
+			struct err_regs *info);
 	void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram);
 	void (*read_dram_ctl_register)(struct amd64_pvt *pvt);
 	void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci,
-					struct amd64_error_info_regs *info,
+					struct err_regs *info,
 					u64 SystemAddr);
 	int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map);
 };
@@ -637,8 +577,5 @@ static inline struct low_ops *family_ops(int index)
 #define F10_MIN_SCRUB_RATE_BITS	0x5
 #define F11_MIN_SCRUB_RATE_BITS	0x6
 
-int amd64_process_error_info(struct mem_ctl_info *mci,
-			     struct amd64_error_info_regs *info,
-			     int handle_errors);
 int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
 			     u64 *hole_offset, u64 *hole_size);
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index 0a41b248a4a..59cf2cf6e11 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
 
 		/* Process the Mapping request */
 		/* TODO: Add race prevention */
-		amd64_process_error_info(mci, &pvt->ctl_error_info, 1);
+		amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
 
 		return count;
 	}
diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c
deleted file mode 100644
index f212ff12a9d..00000000000
--- a/drivers/edac/amd64_edac_err_types.c
+++ /dev/null
@@ -1,161 +0,0 @@
-#include "amd64_edac.h"
-
-/*
- * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
- * for DDR2 DRAM mapping.
- */
-u32 revf_quad_ddr2_shift[] = {
-	0,	/* 0000b NULL DIMM (128mb) */
-	28,	/* 0001b 256mb */
-	29,	/* 0010b 512mb */
-	29,	/* 0011b 512mb */
-	29,	/* 0100b 512mb */
-	30,	/* 0101b 1gb */
-	30,	/* 0110b 1gb */
-	31,	/* 0111b 2gb */
-	31,	/* 1000b 2gb */
-	32,	/* 1001b 4gb */
-	32,	/* 1010b 4gb */
-	33,	/* 1011b 8gb */
-	0,	/* 1100b future */
-	0,	/* 1101b future */
-	0,	/* 1110b future */
-	0	/* 1111b future */
-};
-
-/*
- * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
- * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
- * or higher value'.
- *
- *FIXME: Produce a better mapping/linearisation.
- */
-
-struct scrubrate scrubrates[] = {
-	{ 0x01, 1600000000UL},
-	{ 0x02, 800000000UL},
-	{ 0x03, 400000000UL},
-	{ 0x04, 200000000UL},
-	{ 0x05, 100000000UL},
-	{ 0x06, 50000000UL},
-	{ 0x07, 25000000UL},
-	{ 0x08, 12284069UL},
-	{ 0x09, 6274509UL},
-	{ 0x0A, 3121951UL},
-	{ 0x0B, 1560975UL},
-	{ 0x0C, 781440UL},
-	{ 0x0D, 390720UL},
-	{ 0x0E, 195300UL},
-	{ 0x0F, 97650UL},
-	{ 0x10, 48854UL},
-	{ 0x11, 24427UL},
-	{ 0x12, 12213UL},
-	{ 0x13, 6101UL},
-	{ 0x14, 3051UL},
-	{ 0x15, 1523UL},
-	{ 0x16, 761UL},
-	{ 0x00, 0UL},        /* scrubbing off */
-};
-
-/*
- * string representation for the different MCA reported error types, see F3x48
- * or MSR0000_0411.
- */
-const char *tt_msgs[] = {        /* transaction type */
-	"instruction",
-	"data",
-	"generic",
-	"reserved"
-};
-
-const char *ll_msgs[] = {	/* cache level */
-	"L0",
-	"L1",
-	"L2",
-	"L3/generic"
-};
-
-const char *rrrr_msgs[] = {
-	"generic",
-	"generic read",
-	"generic write",
-	"data read",
-	"data write",
-	"inst fetch",
-	"prefetch",
-	"evict",
-	"snoop",
-	"reserved RRRR= 9",
-	"reserved RRRR= 10",
-	"reserved RRRR= 11",
-	"reserved RRRR= 12",
-	"reserved RRRR= 13",
-	"reserved RRRR= 14",
-	"reserved RRRR= 15"
-};
-
-const char *pp_msgs[] = {	/* participating processor */
-	"local node originated (SRC)",
-	"local node responded to request (RES)",
-	"local node observed as 3rd party (OBS)",
-	"generic"
-};
-
-const char *to_msgs[] = {
-	"no timeout",
-	"timed out"
-};
-
-const char *ii_msgs[] = {	/* memory or i/o */
-	"mem access",
-	"reserved",
-	"i/o access",
-	"generic"
-};
-
-/* Map the 5 bits of Extended Error code to the string table. */
-const char *ext_msgs[] = {	/* extended error */
-	"K8 ECC error/F10 reserved",	/* 0_0000b */
-	"CRC error",			/* 0_0001b */
-	"sync error",			/* 0_0010b */
-	"mst abort",			/* 0_0011b */
-	"tgt abort",			/* 0_0100b */
-	"GART error",			/* 0_0101b */
-	"RMW error",			/* 0_0110b */
-	"Wdog timer error",		/* 0_0111b */
-	"F10-ECC/K8-Chipkill error",	/* 0_1000b */
-	"DEV Error",			/* 0_1001b */
-	"Link Data error",		/* 0_1010b */
-	"Link or L3 Protocol error",	/* 0_1011b */
-	"NB Array error",		/* 0_1100b */
-	"DRAM Parity error",		/* 0_1101b */
-	"Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */
-	"Res 0x0ff error",		/* 0_1111b */
-	"Res 0x100 error",		/* 1_0000b */
-	"Res 0x101 error",		/* 1_0001b */
-	"Res 0x102 error",		/* 1_0010b */
-	"Res 0x103 error",		/* 1_0011b */
-	"Res 0x104 error",		/* 1_0100b */
-	"Res 0x105 error",		/* 1_0101b */
-	"Res 0x106 error",		/* 1_0110b */
-	"Res 0x107 error",		/* 1_0111b */
-	"Res 0x108 error",		/* 1_1000b */
-	"Res 0x109 error",		/* 1_1001b */
-	"Res 0x10A error",		/* 1_1010b */
-	"Res 0x10B error",		/* 1_1011b */
-	"L3 Cache Data error",		/* 1_1100b */
-	"L3 CacheTag error",		/* 1_1101b */
-	"L3 Cache LRU error",		/* 1_1110b */
-	"Res 0x1FF error"		/* 1_1111b */
-};
-
-const char *htlink_msgs[] = {
-	"none",
-	"1",
-	"2",
-	"1 2",
-	"3",
-	"1 3",
-	"2 3",
-	"1 2 3"
-};
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
new file mode 100644
index 00000000000..c8ca7136dac
--- /dev/null
+++ b/drivers/edac/edac_mce_amd.c
@@ -0,0 +1,422 @@
+#include <linux/module.h>
+#include "edac_mce_amd.h"
+
+static bool report_gart_errors;
+static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
+
+void amd_report_gart_errors(bool v)
+{
+	report_gart_errors = v;
+}
+EXPORT_SYMBOL_GPL(amd_report_gart_errors);
+
+void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
+{
+	nb_bus_decoder = f;
+}
+EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
+
+void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
+{
+	if (nb_bus_decoder) {
+		WARN_ON(nb_bus_decoder != f);
+
+		nb_bus_decoder = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
+
+/*
+ * string representation for the different MCA reported error types, see F3x48
+ * or MSR0000_0411.
+ */
+const char *tt_msgs[] = {        /* transaction type */
+	"instruction",
+	"data",
+	"generic",
+	"reserved"
+};
+EXPORT_SYMBOL_GPL(tt_msgs);
+
+const char *ll_msgs[] = {	/* cache level */
+	"L0",
+	"L1",
+	"L2",
+	"L3/generic"
+};
+EXPORT_SYMBOL_GPL(ll_msgs);
+
+const char *rrrr_msgs[] = {
+	"generic",
+	"generic read",
+	"generic write",
+	"data read",
+	"data write",
+	"inst fetch",
+	"prefetch",
+	"evict",
+	"snoop",
+	"reserved RRRR= 9",
+	"reserved RRRR= 10",
+	"reserved RRRR= 11",
+	"reserved RRRR= 12",
+	"reserved RRRR= 13",
+	"reserved RRRR= 14",
+	"reserved RRRR= 15"
+};
+EXPORT_SYMBOL_GPL(rrrr_msgs);
+
+const char *pp_msgs[] = {	/* participating processor */
+	"local node originated (SRC)",
+	"local node responded to request (RES)",
+	"local node observed as 3rd party (OBS)",
+	"generic"
+};
+EXPORT_SYMBOL_GPL(pp_msgs);
+
+const char *to_msgs[] = {
+	"no timeout",
+	"timed out"
+};
+EXPORT_SYMBOL_GPL(to_msgs);
+
+const char *ii_msgs[] = {	/* memory or i/o */
+	"mem access",
+	"reserved",
+	"i/o access",
+	"generic"
+};
+EXPORT_SYMBOL_GPL(ii_msgs);
+
+/*
+ * Map the 4 or 5 (family-specific) bits of Extended Error code to the
+ * string table.
+ */
+const char *ext_msgs[] = {
+	"K8 ECC error",					/* 0_0000b */
+	"CRC error on link",				/* 0_0001b */
+	"Sync error packets on link",			/* 0_0010b */
+	"Master Abort during link operation",		/* 0_0011b */
+	"Target Abort during link operation",		/* 0_0100b */
+	"Invalid GART PTE entry during table walk",	/* 0_0101b */
+	"Unsupported atomic RMW command received",	/* 0_0110b */
+	"WDT error: NB transaction timeout",		/* 0_0111b */
+	"ECC/ChipKill ECC error",			/* 0_1000b */
+	"SVM DEV Error",				/* 0_1001b */
+	"Link Data error",				/* 0_1010b */
+	"Link/L3/Probe Filter Protocol error",		/* 0_1011b */
+	"NB Internal Arrays Parity error",		/* 0_1100b */
+	"DRAM Address/Control Parity error",		/* 0_1101b */
+	"Link Transmission error",			/* 0_1110b */
+	"GART/DEV Table Walk Data error"		/* 0_1111b */
+	"Res 0x100 error",				/* 1_0000b */
+	"Res 0x101 error",				/* 1_0001b */
+	"Res 0x102 error",				/* 1_0010b */
+	"Res 0x103 error",				/* 1_0011b */
+	"Res 0x104 error",				/* 1_0100b */
+	"Res 0x105 error",				/* 1_0101b */
+	"Res 0x106 error",				/* 1_0110b */
+	"Res 0x107 error",				/* 1_0111b */
+	"Res 0x108 error",				/* 1_1000b */
+	"Res 0x109 error",				/* 1_1001b */
+	"Res 0x10A error",				/* 1_1010b */
+	"Res 0x10B error",				/* 1_1011b */
+	"ECC error in L3 Cache Data",			/* 1_1100b */
+	"L3 Cache Tag error",				/* 1_1101b */
+	"L3 Cache LRU Parity error",			/* 1_1110b */
+	"Probe Filter error"				/* 1_1111b */
+};
+EXPORT_SYMBOL_GPL(ext_msgs);
+
+static void amd_decode_dc_mce(u64 mc0_status)
+{
+	u32 ec  = mc0_status & 0xffff;
+	u32 xec = (mc0_status >> 16) & 0xf;
+
+	pr_emerg(" Data Cache Error");
+
+	if (xec == 1 && TLB_ERROR(ec))
+		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
+	else if (xec == 0) {
+		if (mc0_status & (1ULL << 40))
+			pr_cont(" during Data Scrub.\n");
+		else if (TLB_ERROR(ec))
+			pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
+		else if (MEM_ERROR(ec)) {
+			u8 ll   = ec & 0x3;
+			u8 tt   = (ec >> 2) & 0x3;
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			/* see F10h BKDG (31116), Table 92. */
+			if (ll == 0x1) {
+				if (tt != 0x1)
+					goto wrong_dc_mce;
+
+				pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
+
+			} else if (ll == 0x2 && rrrr == 0x3)
+				pr_cont(" during L1 linefill from L2.\n");
+			else
+				goto wrong_dc_mce;
+		} else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
+			pr_cont(" during system linefill.\n");
+		else
+			goto wrong_dc_mce;
+	} else
+		goto wrong_dc_mce;
+
+	return;
+
+wrong_dc_mce:
+	pr_warning("Corrupted DC MCE info?\n");
+}
+
+static void amd_decode_ic_mce(u64 mc1_status)
+{
+	u32 ec  = mc1_status & 0xffff;
+	u32 xec = (mc1_status >> 16) & 0xf;
+
+	pr_emerg(" Instruction Cache Error");
+
+	if (xec == 1 && TLB_ERROR(ec))
+		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
+	else if (xec == 0) {
+		if (TLB_ERROR(ec))
+			pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
+		else if (BUS_ERROR(ec)) {
+			if (boot_cpu_data.x86 == 0xf &&
+			    (mc1_status & (1ULL << 58)))
+				pr_cont(" during system linefill.\n");
+			else
+				pr_cont(" during attempted NB data read.\n");
+		} else if (MEM_ERROR(ec)) {
+			u8 ll   = ec & 0x3;
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			if (ll == 0x2)
+				pr_cont(" during a linefill from L2.\n");
+			else if (ll == 0x1) {
+
+				switch (rrrr) {
+				case 0x5:
+					pr_cont(": Parity error during "
+					       "data load.\n");
+					break;
+
+				case 0x7:
+					pr_cont(": Copyback Parity/Victim"
+						" error.\n");
+					break;
+
+				case 0x8:
+					pr_cont(": Tag Snoop error.\n");
+					break;
+
+				default:
+					goto wrong_ic_mce;
+					break;
+				}
+			}
+		} else
+			goto wrong_ic_mce;
+	} else
+		goto wrong_ic_mce;
+
+	return;
+
+wrong_ic_mce:
+	pr_warning("Corrupted IC MCE info?\n");
+}
+
+static void amd_decode_bu_mce(u64 mc2_status)
+{
+	u32 ec = mc2_status & 0xffff;
+	u32 xec = (mc2_status >> 16) & 0xf;
+
+	pr_emerg(" Bus Unit Error");
+
+	if (xec == 0x1)
+		pr_cont(" in the write data buffers.\n");
+	else if (xec == 0x3)
+		pr_cont(" in the victim data buffers.\n");
+	else if (xec == 0x2 && MEM_ERROR(ec))
+		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
+	else if (xec == 0x0) {
+		if (TLB_ERROR(ec))
+			pr_cont(": %s error in a Page Descriptor Cache or "
+				"Guest TLB.\n", TT_MSG(ec));
+		else if (BUS_ERROR(ec))
+			pr_cont(": %s/ECC error in data read from NB: %s.\n",
+				RRRR_MSG(ec), PP_MSG(ec));
+		else if (MEM_ERROR(ec)) {
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			if (rrrr >= 0x7)
+				pr_cont(": %s error during data copyback.\n",
+					RRRR_MSG(ec));
+			else if (rrrr <= 0x1)
+				pr_cont(": %s parity/ECC error during data "
+					"access from L2.\n", RRRR_MSG(ec));
+			else
+				goto wrong_bu_mce;
+		} else
+			goto wrong_bu_mce;
+	} else
+		goto wrong_bu_mce;
+
+	return;
+
+wrong_bu_mce:
+	pr_warning("Corrupted BU MCE info?\n");
+}
+
+static void amd_decode_ls_mce(u64 mc3_status)
+{
+	u32 ec  = mc3_status & 0xffff;
+	u32 xec = (mc3_status >> 16) & 0xf;
+
+	pr_emerg(" Load Store Error");
+
+	if (xec == 0x0) {
+		u8 rrrr = (ec >> 4) & 0xf;
+
+		if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
+			goto wrong_ls_mce;
+
+		pr_cont(" during %s.\n", RRRR_MSG(ec));
+	}
+	return;
+
+wrong_ls_mce:
+	pr_warning("Corrupted LS MCE info?\n");
+}
+
+void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
+{
+	u32 ec  = ERROR_CODE(regs->nbsl);
+	u32 xec = EXT_ERROR_CODE(regs->nbsl);
+
+	if (!handle_errors)
+		return;
+
+	pr_emerg(" Northbridge Error, node %d", node_id);
+
+	/*
+	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
+	 * value encoding has changed so interpret those differently
+	 */
+	if ((boot_cpu_data.x86 == 0x10) &&
+	    (boot_cpu_data.x86_model > 8)) {
+		if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
+			pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
+	} else {
+		pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
+	}
+
+
+	pr_emerg("%s.\n", EXT_ERR_MSG(xec));
+
+	if (BUS_ERROR(ec) && nb_bus_decoder)
+		nb_bus_decoder(node_id, regs);
+}
+EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
+
+static void amd_decode_fr_mce(u64 mc5_status)
+{
+	/* we have only one error signature so match all fields at once. */
+	if ((mc5_status & 0xffff) == 0x0f0f)
+		pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
+	else
+		pr_warning("Corrupted FR MCE info?\n");
+}
+
+static inline void amd_decode_err_code(unsigned int ec)
+{
+	if (TLB_ERROR(ec)) {
+		/*
+		 * GART errors are intended to help graphics driver developers
+		 * to detect bad GART PTEs. It is recommended by AMD to disable
+		 * GART table walk error reporting by default[1] (currently
+		 * being disabled in mce_cpu_quirks()) and according to the
+		 * comment in mce_cpu_quirks(), such GART errors can be
+		 * incorrectly triggered. We may see these errors anyway and
+		 * unless requested by the user, they won't be reported.
+		 *
+		 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
+		 *     AMD NPT family 0Fh processors
+		 */
+		if (!report_gart_errors)
+			return;
+
+		pr_emerg(" Transaction: %s, Cache Level %s\n",
+			 TT_MSG(ec), LL_MSG(ec));
+	} else if (MEM_ERROR(ec)) {
+		pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
+			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+	} else if (BUS_ERROR(ec)) {
+		pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
+			 "Participating Processor: %s\n",
+			  RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
+			  PP_MSG(ec));
+	} else
+		pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
+}
+
+void decode_mce(struct mce *m)
+{
+	struct err_regs regs;
+	int node, ecc;
+
+	pr_emerg("MC%d_STATUS: ", m->bank);
+
+	pr_cont("%sorrected error, report: %s, MiscV: %svalid, "
+		 "CPU context corrupt: %s",
+		 ((m->status & MCI_STATUS_UC) ? "Unc"  : "C"),
+		 ((m->status & MCI_STATUS_EN) ? "yes"  : "no"),
+		 ((m->status & MCI_STATUS_MISCV) ? ""  : "in"),
+		 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
+
+	/* do the two bits[14:13] together */
+	ecc = m->status & (3ULL << 45);
+	if (ecc)
+		pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
+
+	pr_cont("\n");
+
+	switch (m->bank) {
+	case 0:
+		amd_decode_dc_mce(m->status);
+		break;
+
+	case 1:
+		amd_decode_ic_mce(m->status);
+		break;
+
+	case 2:
+		amd_decode_bu_mce(m->status);
+		break;
+
+	case 3:
+		amd_decode_ls_mce(m->status);
+		break;
+
+	case 4:
+		regs.nbsl  = (u32) m->status;
+		regs.nbsh  = (u32)(m->status >> 32);
+		regs.nbeal = (u32) m->addr;
+		regs.nbeah = (u32)(m->addr >> 32);
+		node       = per_cpu(cpu_llc_id, m->extcpu);
+
+		amd_decode_nb_mce(node, &regs, 1);
+		break;
+
+	case 5:
+		amd_decode_fr_mce(m->status);
+		break;
+
+	default:
+		break;
+	}
+
+	amd_decode_err_code(m->status & 0xffff);
+}
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
new file mode 100644
index 00000000000..df23ee065f7
--- /dev/null
+++ b/drivers/edac/edac_mce_amd.h
@@ -0,0 +1,69 @@
+#ifndef _EDAC_MCE_AMD_H
+#define _EDAC_MCE_AMD_H
+
+#include <asm/mce.h>
+
+#define ERROR_CODE(x)			((x) & 0xffff)
+#define EXT_ERROR_CODE(x)		(((x) >> 16) & 0x1f)
+#define EXT_ERR_MSG(x)			ext_msgs[EXT_ERROR_CODE(x)]
+
+#define LOW_SYNDROME(x)			(((x) >> 15) & 0xff)
+#define HIGH_SYNDROME(x)		(((x) >> 24) & 0xff)
+
+#define TLB_ERROR(x)			(((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x)			(((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x)			(((x) & 0xF800) == 0x0800)
+
+#define TT(x)				(((x) >> 2) & 0x3)
+#define TT_MSG(x)			tt_msgs[TT(x)]
+#define II(x)				(((x) >> 2) & 0x3)
+#define II_MSG(x)			ii_msgs[II(x)]
+#define LL(x)				(((x) >> 0) & 0x3)
+#define LL_MSG(x)			ll_msgs[LL(x)]
+#define RRRR(x)				(((x) >> 4) & 0xf)
+#define RRRR_MSG(x)			rrrr_msgs[RRRR(x)]
+#define TO(x)				(((x) >> 8) & 0x1)
+#define TO_MSG(x)			to_msgs[TO(x)]
+#define PP(x)				(((x) >> 9) & 0x3)
+#define PP_MSG(x)			pp_msgs[PP(x)]
+
+#define K8_NBSH				0x4C
+
+#define K8_NBSH_VALID_BIT		BIT(31)
+#define K8_NBSH_OVERFLOW		BIT(30)
+#define K8_NBSH_UC_ERR			BIT(29)
+#define K8_NBSH_ERR_EN			BIT(28)
+#define K8_NBSH_MISCV			BIT(27)
+#define K8_NBSH_VALID_ERROR_ADDR	BIT(26)
+#define K8_NBSH_PCC			BIT(25)
+#define K8_NBSH_ERR_CPU_VAL		BIT(24)
+#define K8_NBSH_CECC			BIT(14)
+#define K8_NBSH_UECC			BIT(13)
+#define K8_NBSH_ERR_SCRUBER		BIT(8)
+
+extern const char *tt_msgs[];
+extern const char *ll_msgs[];
+extern const char *rrrr_msgs[];
+extern const char *pp_msgs[];
+extern const char *to_msgs[];
+extern const char *ii_msgs[];
+extern const char *ext_msgs[];
+
+/*
+ * relevant NB regs
+ */
+struct err_regs {
+	u32 nbcfg;
+	u32 nbsh;
+	u32 nbsl;
+	u32 nbeah;
+	u32 nbeal;
+};
+
+
+void amd_report_gart_errors(bool);
+void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
+void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
+void amd_decode_nb_mce(int, struct err_regs *, int);
+
+#endif /* _EDAC_MCE_AMD_H */
author	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-14 17:38:38 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-14 17:38:38 -0700
commit	f65ac45e20b03081ed64f41ce91bb982f8ac258d (patch)
tree	615e966b6c792ccd840f994f38591ff5d3d85f72
parent	4142e0d1def2c0176c27fd2e810243045a62eb6d (diff)
parent	22223c9b417be5fd0ab2cf9ad17eb7bd1e19f7b9 (diff)