[coreboot-gerrit] Patch set updated for coreboot: northbridge/amd/mct_ddr3: Add registered and x4 DIMM support to Fam15h

Timothy Pearson (tpearson@raptorengineeringinc.com) gerrit at coreboot.org
Sun Nov 15 03:12:47 CET 2015


Timothy Pearson (tpearson at raptorengineeringinc.com) just uploaded a new patch set to gerrit, which you can find at http://review.coreboot.org/12019

-gerrit

commit c542b5bf431700a2491975280fe4720f0c15f11f
Author: Timothy Pearson <tpearson at raptorengineeringinc.com>
Date:   Tue Jul 28 15:16:46 2015 -0500

    northbridge/amd/mct_ddr3: Add registered and x4 DIMM support to Fam15h
    
    The existing MCT support code did not perform any of the requisite
    configuration to support registered or x4 DIMMs.  Add the needed
    configuration per the AMD BIOS and Kernel Developer's Guide (BKDG)
    for Family 15h, Rev. 3.14.
    
    Change-Id: I9ee0bb7346aa35f564fe535cdd337ec7f6148f2b
    Signed-off-by: Timothy Pearson <tpearson at raptorengineeringinc.com>
---
 src/northbridge/amd/amdmct/mct_ddr3/mct_d.c    | 186 +++++++-----
 src/northbridge/amd/amdmct/mct_ddr3/mct_d.h    |   2 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c |   4 +
 src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c   |  17 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctrci.c   | 191 ++++++++----
 src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c   |  42 ++-
 src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c   | 253 +++++++++-------
 src/northbridge/amd/amdmct/mct_ddr3/mctwl.c    |  16 +-
 src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c  | 400 +++++++++++++++----------
 src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h   |  13 +-
 10 files changed, 698 insertions(+), 426 deletions(-)

diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
index 3338ae3..25c4042 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
@@ -162,7 +162,7 @@ static void mct_EnDllShutdownSR(struct MCTStatStruc *pMCTstat,
 static void ChangeMemClk(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat);
 void SetTargetFreq(struct MCTStatStruc *pMCTstat,
-                                        struct DCTStatStruc *pDCTstat);
+                                        struct DCTStatStruc *pDCTstatA, uint8_t Node);
 
 static u32 mct_MR1Odt_RDimm(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct, u32 MrsChipSel);
@@ -1410,6 +1410,10 @@ static void precise_memclk_delay_fam15(struct MCTStatStruc *pMCTstat, struct DCT
 
 	memclk_freq = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x94) & 0x1f;
 
+	if (fam15h_freq_tab[memclk_freq] == 0) {
+		printk(BIOS_DEBUG, "ERROR: precise_memclk_delay_fam15 for DCT %d (delay %d clocks) failed to obtain valid memory frequency!"
+			" (pDCTstat: %p pDCTstat->dev_dct: %08x memclk_freq: %02x)\n", dct, clocks, pDCTstat, pDCTstat->dev_dct, memclk_freq);
+	}
 	delay_ns = (((uint64_t)clocks * 1000) / fam15h_freq_tab[memclk_freq]);
 	precise_ndelay_fam15(pMCTstat, delay_ns);
 }
@@ -2327,7 +2331,7 @@ static void DQSTiming_D(struct MCTStatStruc *pMCTstat,
 	nv_DQSTrainCTL = !allow_config_restore;
 
 	mct_BeforeDQSTrain_D(pMCTstat, pDCTstatA);
-	phyAssistedMemFnceTraining(pMCTstat, pDCTstatA);
+	phyAssistedMemFnceTraining(pMCTstat, pDCTstatA, -1);
 
 	if (is_fam15h()) {
 		uint8_t Node;
@@ -3367,7 +3371,7 @@ static void SPD2ndTiming(struct MCTStatStruc *pMCTstat,
 }
 
 static u8 AutoCycTiming_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat, u8 dct)
+				struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	/* Initialize  DCT Timing registers as per DIMM SPD.
 	 * For primary timing (T, CL) use best case T value.
@@ -3471,7 +3475,7 @@ static void GetPresetmaxF_D(struct MCTStatStruc *pMCTstat,
 }
 
 static void SPDGetTCL_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat, u8 dct)
+				struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	/* Find the best T and CL primary timing parameter pair, per Mfg.,
 	 * for the given set of DIMMs, and store into DCTStatStruc
@@ -3750,10 +3754,15 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 		dword++;
 	}
 
-	if (Status & (1 << SB_Registered))
-		DramConfigLo |= 1 << ParEn;		/* Registered DIMMs */
-	else
-		DramConfigLo |= 1 << UnBuffDimm;	/* Unbuffered DIMMs */
+	if (Status & (1 << SB_Registered)) {
+		/* Registered DIMMs */
+		if (!is_fam15h()) {
+			DramConfigLo |= 1 << ParEn;
+		}
+	} else {
+		/* Unbuffered DIMMs */
+		DramConfigLo |= 1 << UnBuffDimm;
+	}
 
 	if (mctGet_NVbits(NV_ECC_CAP))
 		if (Status & (1 << SB_ECCDIMMs))
@@ -3771,10 +3780,11 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 	DramConfigHi |= dword - offset;	/* get MemClk encoding */
 	DramConfigHi |= 1 << MemClkFreqVal;
 
-	if (Status & (1 << SB_Registered))
-		if ((pDCTstat->Dimmx4Present != 0) && (pDCTstat->Dimmx8Present != 0))
-			/* set only if x8 Registered DIMMs in System*/
-			DramConfigHi |= 1 << RDqsEn;
+	if (!is_fam15h())
+		if (Status & (1 << SB_Registered))
+			if ((pDCTstat->Dimmx4Present != 0) && (pDCTstat->Dimmx8Present != 0))
+				/* set only if x8 Registered DIMMs in System*/
+				DramConfigHi |= 1 << RDqsEn;
 
 	if (pDCTstat->LogicalCPUID & AMD_FAM15_ALL) {
 		DramConfigLo |= 1 << 25;	/* PendRefPaybackS3En = 1 */
@@ -3786,14 +3796,16 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 			DramConfigHi |= 1 << 16;
 	}
 
-	/* Control Bank Swizzle */
-	if (0) /* call back not needed mctBankSwizzleControl_D()) */
-		DramConfigHi &= ~(1 << BankSwizzleMode);
-	else
-		DramConfigHi |= 1 << BankSwizzleMode; /* recommended setting (default) */
+	if (!is_fam15h()) {
+		/* Control Bank Swizzle */
+		if (0) /* call back not needed mctBankSwizzleControl_D()) */
+			DramConfigHi &= ~(1 << BankSwizzleMode);
+		else
+			DramConfigHi |= 1 << BankSwizzleMode; /* recommended setting (default) */
+	}
 
 	/* Check for Quadrank DIMM presence */
-	if ( pDCTstat->DimmQRPresent != 0) {
+	if (pDCTstat->DimmQRPresent != 0) {
 		byte = mctGet_NVbits(NV_4RANKType);
 		if (byte == 2)
 			DramConfigHi |= 1 << 17;	/* S4 (4-Rank SO-DIMMs) */
@@ -4598,8 +4610,9 @@ static u8 mct_setMode(struct MCTStatStruc *pMCTstat,
 			Set_NB32(pDCTstat->dev_dct, reg, val);
 		}
 		if (byte)	/* NV_Unganged */
-			pDCTstat->ErrStatus &= ~(1 << SB_DimmMismatchO); /* Clear so that there is no DIMM missmatch error */
+			pDCTstat->ErrStatus &= ~(1 << SB_DimmMismatchO); /* Clear so that there is no DIMM mismatch error */
 	}
+
 	return pDCTstat->ErrCode;
 }
 
@@ -4660,6 +4673,8 @@ void Set_NB32_index_wait(u32 dev, u32 index_reg, u32 index, u32 data)
 static u8 mct_BeforePlatformSpec(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct)
 {
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	/* mct_checkForCxDxSupport_D */
 	if (pDCTstat->LogicalCPUID & AMD_DR_GT_Bx) {
 		/* Family 10h Errata 322: Address and Command Fine Delay Values May Be Incorrect */
@@ -4674,6 +4689,9 @@ static u8 mct_BeforePlatformSpec(struct MCTStatStruc *pMCTstat,
 		else
 			Set_NB32_index_wait_DCT(pDCTstat->dev_dct, dct, 0x98, 0x0D02E001, 0x90);
 	}
+
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
+
 	return pDCTstat->ErrCode;
 }
 
@@ -4684,6 +4702,8 @@ static u8 mct_PlatformSpec(struct MCTStatStruc *pMCTstat,
 	 * and program them into DCT.
 	 */
 
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	u32 dev = pDCTstat->dev_dct;
 	u32 index_reg;
 	u8 i, i_start, i_end;
@@ -4704,6 +4724,8 @@ static u8 mct_PlatformSpec(struct MCTStatStruc *pMCTstat,
 		printk(BIOS_SPEW, "Programmed DCT %d timing/termination pattern %08x %08x\n", dct, pDCTstat->CH_ADDR_TMG[i], pDCTstat->CH_ODC_CTL[i]);
 	}
 
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
+
 	return pDCTstat->ErrCode;
 }
 
@@ -4715,7 +4737,8 @@ static void mct_SyncDCTsReady(struct DCTStatStruc *pDCTstat)
 	if (pDCTstat->NodePresent) {
 		dev = pDCTstat->dev_dct;
 
-		if ((pDCTstat->DIMMValidDCT[0] ) || (pDCTstat->DIMMValidDCT[1])) {		/* This Node has dram */
+		if ((pDCTstat->DIMMValidDCT[0]) || (pDCTstat->DIMMValidDCT[1])) {
+			/* This Node has DRAM */
 			do {
 				val = Get_NB32(dev, 0x110);
 			} while (!(val & (1 << DramEnabled)));
@@ -5663,57 +5686,56 @@ static void InitDDRPhy(struct MCTStatStruc *pMCTstat,
 	/* Fam15h BKDG v3.14 section 2.10.5.3
 	 * The remainder of the Phy Initialization algorithm picks up in phyAssistedMemFnceTraining
 	 */
-	for (dct = 0; dct < 2; dct++) {
-		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0000000b, 0x80000000);
-		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe013, 0x00000118);
-
-		/* Program desired VDDIO level */
-		if (ddr_voltage_index & 0x4) {
-			/* 1.25V */
-			amd_voltage_level_index = 0x2;
-		} else if (ddr_voltage_index & 0x2) {
-			/* 1.35V */
-			amd_voltage_level_index = 0x1;
-		} else if (ddr_voltage_index & 0x1) {
-			/* 1.50V */
-			amd_voltage_level_index = 0x0;
-		}
-
-		/* D18F2x9C_x0D0F_0[F,8:0]1F_dct[1:0][RxVioLvl] */
-		for (index = 0; index < 0x9; index++) {
-			dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f001f | (index << 8));
-			dword &= ~(0x3 << 3);
-			dword |= (amd_voltage_level_index << 3);
-			Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f001f | (index << 8), dword);
-		}
-
-		/* D18F2x9C_x0D0F_[C,8,2][2:0]1F_dct[1:0][RxVioLvl] */
-		for (index = 0; index < 0x3; index++) {
-			dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f201f | (index << 8));
-			dword &= ~(0x3 << 3);
-			dword |= (amd_voltage_level_index << 3);
-			Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f201f | (index << 8), dword);
-		}
-		for (index = 0; index < 0x2; index++) {
-			dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f801f | (index << 8));
-			dword &= ~(0x3 << 3);
-			dword |= (amd_voltage_level_index << 3);
-			Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f801f | (index << 8), dword);
-		}
-		for (index = 0; index < 0x1; index++) {
-			dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fc01f | (index << 8));
-			dword &= ~(0x3 << 3);
-			dword |= (amd_voltage_level_index << 3);
-			Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fc01f | (index << 8), dword);
-		}
+	Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0000000b, 0x80000000);
+	Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe013, 0x00000118);
 
-		/* D18F2x9C_x0D0F_4009_dct[1:0][CmpVioLvl, ComparatorAdjust] */
-		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f4009);
-		dword &= ~(0x0000c00c);
-		dword |= (amd_voltage_level_index << 14);
-		dword |= (amd_voltage_level_index << 2);
-		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f4009, dword);
-	}
+	/* Program desired VDDIO level */
+	if (ddr_voltage_index & 0x4) {
+		/* 1.25V */
+		amd_voltage_level_index = 0x2;
+	} else if (ddr_voltage_index & 0x2) {
+		/* 1.35V */
+		amd_voltage_level_index = 0x1;
+	} else if (ddr_voltage_index & 0x1) {
+		/* 1.50V */
+		amd_voltage_level_index = 0x0;
+	}
+
+	/* D18F2x9C_x0D0F_0[F,8:0]1F_dct[1:0][RxVioLvl] */
+	for (index = 0; index < 0x9; index++) {
+		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f001f | (index << 8));
+		dword &= ~(0x3 << 3);
+		dword |= (amd_voltage_level_index << 3);
+		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f001f | (index << 8), dword);
+	}
+
+	/* D18F2x9C_x0D0F_[C,8,2][2:0]1F_dct[1:0][RxVioLvl] */
+	for (index = 0; index < 0x3; index++) {
+		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f201f | (index << 8));
+		dword &= ~(0x3 << 3);
+		dword |= (amd_voltage_level_index << 3);
+		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f201f | (index << 8), dword);
+	}
+	for (index = 0; index < 0x2; index++) {
+		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f801f | (index << 8));
+		dword &= ~(0x3 << 3);
+		dword |= (amd_voltage_level_index << 3);
+		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f801f | (index << 8), dword);
+	}
+	for (index = 0; index < 0x1; index++) {
+		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fc01f | (index << 8));
+		dword &= ~(0x3 << 3);
+		dword |= (amd_voltage_level_index << 3);
+		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fc01f | (index << 8), dword);
+	}
+
+	/* D18F2x9C_x0D0F_4009_dct[1:0][CmpVioLvl, ComparatorAdjust] */
+	/* NOTE: CmpVioLvl and ComparatorAdjust only take effect when set on DCT 0 */
+	dword = Get_NB32_index_wait_DCT(dev, 0, index_reg, 0x0d0f4009);
+	dword &= ~(0x0000c00c);
+	dword |= (amd_voltage_level_index << 14);
+	dword |= (amd_voltage_level_index << 2);
+	Set_NB32_index_wait_DCT(dev, 0, index_reg, 0x0d0f4009, dword);
 
 	printk(BIOS_DEBUG, "%s: Done\n", __func__);
 }
@@ -5729,18 +5751,24 @@ static void InitPhyCompensation(struct MCTStatStruc *pMCTstat,
 	uint32_t dword;
 	const u8 *p;
 
-	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+	printk(BIOS_DEBUG, "%s: DCT %d: Start\n", __func__, dct);
 
 	if (is_fam15h()) {
 		/* Algorithm detailed in the Fam15h BKDG Rev. 3.14 section 2.10.5.3.4 */
 		uint32_t tx_pre;
 		uint32_t drive_strength;
 
-		/* Program D18F2x9C_x0D0F_E003_dct[1:0][DisAutoComp, DisablePredriverCal] */
+		/* Program D18F2x9C_x0D0F_E003_dct[1:0][DisAutoComp] */
 		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe003);
-		dword |= (0x3 << 13);
+		dword |= (0x1 << 14);
 		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe003, dword);
 
+		/* Program D18F2x9C_x0D0F_E003_dct[1:0][DisablePredriverCal] */
+		/* NOTE: DisablePredriverCal only takes effect when set on DCT 0 */
+		dword = Get_NB32_index_wait_DCT(dev, 0, index_reg, 0x0d0fe003);
+		dword |= (0x1 << 13);
+		Set_NB32_index_wait_DCT(dev, 0, index_reg, 0x0d0fe003, dword);
+
 		/* Determine TxPreP/TxPreN for data lanes (Stage 1) */
 		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x00000000);
 		drive_strength = (dword >> 20) & 0x7;	/* DqsDrvStren */
@@ -5886,12 +5914,14 @@ static void InitPhyCompensation(struct MCTStatStruc *pMCTstat,
 		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0a, dword);
 	}
 
-	printk(BIOS_DEBUG, "%s: Done\n", __func__);
+	printk(BIOS_DEBUG, "%s: DCT %d: Done\n", __func__, dct);
 }
 
 static void mct_EarlyArbEn_D(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct)
 {
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	if (!is_fam15h()) {
 		u32 reg;
 		u32 val;
@@ -5913,6 +5943,8 @@ static void mct_EarlyArbEn_D(struct MCTStatStruc *pMCTstat,
 
 		Set_NB32_DCT(dev, dct, reg, val);
 	}
+
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
 }
 
 static u8 CheckNBCOFEarlyArbEn(struct MCTStatStruc *pMCTstat,
@@ -6556,6 +6588,8 @@ void mct_SetDramConfigHi_D(struct MCTStatStruc *pMCTstat,
 
 	uint32_t dword;
 
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	if (is_fam15h()) {
 		/* Initial setup for frequency change
 		 * 9C_x0000_0004 must be configured before MemClkFreqVal is set
@@ -6588,6 +6622,8 @@ void mct_SetDramConfigHi_D(struct MCTStatStruc *pMCTstat,
 		mct_Wait(100);
 	}
 
+	printk(BIOS_DEBUG, "mct_SetDramConfigHi_D: DramConfigHi:    %08x\n", DramConfigHi);
+
 	/* Program the DRAM Configuration High register */
 	Set_NB32_DCT(dev, dct, 0x94, DramConfigHi);
 
@@ -6603,6 +6639,8 @@ void mct_SetDramConfigHi_D(struct MCTStatStruc *pMCTstat,
 		dword |= 0x0000000f;
 		Set_NB32_index_wait_DCT(pDCTstat->dev_dct, dct, index_reg, 0x0d0fe006, dword);
 	}
+
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
 }
 
 static void mct_BeforeDQSTrain_D(struct MCTStatStruc *pMCTstat,
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
index 592b1e6..eb4c74e 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
@@ -1010,7 +1010,7 @@ void InterleaveNodes_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTs
 void InterleaveChannels_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void mct_BeforeDQSTrain_Samp_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat);
 
-void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
+void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA, int16_t Node);
 u8 mct_SaveRcvEnDly_D_1Pass(struct DCTStatStruc *pDCTstat, u8 pass);
 u8 mct_InitReceiver_D(struct DCTStatStruc *pDCTstat, u8 dct);
 void mct_Wait(u32 cycles);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
index cdb93f9..3615616 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
@@ -1584,6 +1584,7 @@ static void TrainDQSReceiverEnCyc_D_Fam15(struct MCTStatStruc *pMCTstat,
 
 	for (dct = 0; dct < 2; dct++) {
 		/* Program D18F2x9C_x0D0F_E003_dct[1:0][DisAutoComp, DisablePredriverCal] */
+		/* NOTE: DisablePredriverCal only takes effect when set on DCT 0 */
 		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe003);
 		dword &= ~(0x3 << 13);
 		dword |= (0x1 << 13);
@@ -1623,6 +1624,9 @@ static void TrainDQSReceiverEnCyc_D_Fam15(struct MCTStatStruc *pMCTstat,
 				rx_en_offset = (initial_phy_phase_delay[lane] + 0x10) % 0x40;
 
 				/* 2.10.5.8.3 (4) */
+#if DQS_TRAIN_DEBUG > 0
+				printk(BIOS_DEBUG, "TrainDQSReceiverEnCyc_D_Fam15 Receiver %d lane %d initial phy delay %04x: iterating from %04x to %04x\n", Receiver, lane, initial_phy_phase_delay[lane], rx_en_offset, 0x3ff);
+#endif
 				for (current_phy_phase_delay[lane] = rx_en_offset; current_phy_phase_delay[lane] < 0x3ff; current_phy_phase_delay[lane] += ren_step) {
 					/* 2.10.5.8.3 (4 A) */
 					write_dqs_receiver_enable_control_registers(current_phy_phase_delay, dev, dct, dimm, index_reg);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c b/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
index b3572b1..a92f9e5 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
@@ -17,7 +17,7 @@
 static uint8_t AgesaHwWlPhase1(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct, u8 dimm, u8 pass);
 static uint8_t AgesaHwWlPhase2(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 dct, u8 dimm, u8 pass);
+					struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t dimm, uint8_t pass);
 static uint8_t AgesaHwWlPhase3(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct, u8 dimm, u8 pass);
 static void EnableZQcalibration(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat);
@@ -129,7 +129,7 @@ static uint8_t PhyWLPass1(struct MCTStatStruc *pMCTstat,
 }
 
 static uint8_t PhyWLPass2(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 dct)
+					struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t final)
 {
 	u8 dimm;
 	u16 DIMMValid;
@@ -183,12 +183,15 @@ static uint16_t fam15h_next_highest_memclk_freq(uint16_t memclk_freq)
  * Algorithm detailed in the Fam10h BKDG Rev. 3.62 section 2.8.9.9.1
  */
 static void WriteLevelization_HW(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, uint8_t Pass)
+					struct DCTStatStruc *pDCTstatA, uint8_t Node, uint8_t Pass)
 {
 	uint8_t status;
 	uint8_t timeout;
 	uint16_t final_target_freq;
 
+	struct DCTStatStruc *pDCTstat;
+	pDCTstat = pDCTstatA + Node;
+
 	pDCTstat->C_MCTPtr  = &(pDCTstat->s_C_MCTPtr);
 	pDCTstat->C_DCTPtr[0] = &(pDCTstat->s_C_DCTPtr[0]);
 	pDCTstat->C_DCTPtr[1] = &(pDCTstat->s_C_DCTPtr[1]);
@@ -236,13 +239,13 @@ static void WriteLevelization_HW(struct MCTStatStruc *pMCTstat,
 					pDCTstat->TargetFreq = fam15h_next_highest_memclk_freq(pDCTstat->Speed);
 				else
 					pDCTstat->TargetFreq = final_target_freq;
-				SetTargetFreq(pMCTstat, pDCTstat);
+				SetTargetFreq(pMCTstat, pDCTstatA, Node);
 				timeout = 0;
 				do {
 					status = 0;
 					timeout++;
-					status |= PhyWLPass2(pMCTstat, pDCTstat, 0);
-					status |= PhyWLPass2(pMCTstat, pDCTstat, 1);
+					status |= PhyWLPass2(pMCTstat, pDCTstat, 0, (pDCTstat->TargetFreq == final_target_freq));
+					status |= PhyWLPass2(pMCTstat, pDCTstat, 1, (pDCTstat->TargetFreq == final_target_freq));
 					if (status)
 						printk(BIOS_INFO,
 							"%s: Retrying write levelling due to invalid value(s) detected in last phase\n",
@@ -286,7 +289,7 @@ void mct_WriteLevelization_HW(struct MCTStatStruc *pMCTstat,
 		if (pDCTstat->NodePresent) {
 			mctSMBhub_Init(Node);
 			Clear_OnDimmMirror(pMCTstat, pDCTstat);
-			WriteLevelization_HW(pMCTstat, pDCTstat, Pass);
+			WriteLevelization_HW(pMCTstat, pDCTstatA, Node, Pass);
 			Restore_OnDimmMirror(pMCTstat, pDCTstat);
 		}
 	}
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctrci.c b/src/northbridge/amd/amdmct/mct_ddr3/mctrci.c
index 3f6c39d..01061a7 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctrci.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctrci.c
@@ -14,6 +14,78 @@
  * GNU General Public License for more details.
  */
 
+static uint8_t fam15h_rdimm_rc2_control_code(struct DCTStatStruc *pDCTstat, uint8_t dct)
+{
+	uint8_t MaxDimmsInstallable = mctGet_NVbits(NV_MAX_DIMMS_PER_CH);
+
+	uint8_t package_type;
+	uint8_t control_code = 0;
+
+	package_type = mctGet_NVbits(NV_PACK_TYPE);
+	uint16_t MemClkFreq = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x94) & 0x1f;
+
+	/* Obtain number of DIMMs on channel */
+	uint8_t dimm_count = pDCTstat->MAdimms[dct];
+
+	/* FIXME
+	 * Assume there is only one register on the RDIMM for now
+	 */
+	uint8_t num_registers = 1;
+
+	if (package_type == PT_GR) {
+		/* Socket G34 */
+		/* Fam15h BKDG Rev. 3.14 section 2.10.5.7.1.2.1 Table 85 */
+		if (MaxDimmsInstallable == 1) {
+			if ((MemClkFreq == 0x4) || (MemClkFreq == 0x6)) {
+				/* DDR3-667 - DDR3-800 */
+				control_code = 0x1;
+			} else if ((MemClkFreq == 0xa) || (MemClkFreq == 0xe)) {
+				/* DDR3-1066 - DDR3-1333 */
+				if (num_registers == 1) {
+					control_code = 0x0;
+				} else {
+					control_code = 0x1;
+				}
+			} else if ((MemClkFreq == 0x12) || (MemClkFreq == 0x16)) {
+				/* DDR3-1600 - DDR3-1866 */
+				control_code = 0x0;
+			}
+		} else if (MaxDimmsInstallable == 2) {
+			if (dimm_count == 1) {
+				/* 1 DIMM detected */
+				if ((MemClkFreq == 0x4) || (MemClkFreq == 0x6)) {
+					/* DDR3-667 - DDR3-800 */
+					control_code = 0x1;
+				} else if ((MemClkFreq >= 0xa) && (MemClkFreq <= 0x12)) {
+					/* DDR3-1066 - DDR3-1600 */
+					if (num_registers == 1) {
+						control_code = 0x0;
+					} else {
+						control_code = 0x1;
+					}
+				}
+			} else if (dimm_count == 2) {
+				/* 2 DIMMs detected */
+				if (num_registers == 1) {
+					control_code = 0x1;
+				} else {
+					control_code = 0x8;
+				}
+			}
+		} else if (MaxDimmsInstallable == 3) {
+			/* TODO
+			 * 3 DIMM/channel support unimplemented
+			 */
+		}
+	} else {
+		/* TODO
+		 * Other socket support unimplemented
+		 */
+	}
+
+	return control_code;
+}
+
 static uint16_t memclk_to_freq(uint16_t memclk) {
 	uint16_t fam10h_freq_tab[] = {0, 0, 0, 400, 533, 667, 800};
 	uint16_t fam15h_freq_tab[] = {0, 0, 0, 0, 333, 0, 400, 0, 0, 0, 533, 0, 0, 0, 667, 0, 0, 0, 800, 0, 0, 0, 933};
@@ -33,36 +105,46 @@ static uint16_t memclk_to_freq(uint16_t memclk) {
 	return mem_freq;
 }
 
+static uint8_t rc_word_chip_select_lower_bit(void) {
+	if (is_fam15h()) {
+		return 21;
+	} else {
+		return 20;
+	}
+}
+
+static uint32_t rc_word_address_to_ctl_bits(uint32_t address) {
+	if (is_fam15h()) {
+		return (((address >> 3) & 0x1) << 2) << 18 | (address & 0x7);
+	} else {
+		return (((address >> 3) & 0x1) << 2) << 16 | (address & 0x7);
+	}
+}
+
 static uint32_t rc_word_value_to_ctl_bits(uint32_t value) {
-	return ((value >> 2) & 3) << 16 | ((value & 3) << 3);
+	if (is_fam15h()) {
+		return ((value >> 2) & 0x3) << 18 | ((value & 0x3) << 3);
+	} else {
+		return ((value >> 2) & 0x3) << 16 | ((value & 0x3) << 3);
+	}
 }
 
 static u32 mct_ControlRC(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat, u32 MrsChipSel, u32 CtrlWordNum)
+			struct DCTStatStruc *pDCTstat, uint8_t dct, u32 MrsChipSel, u32 CtrlWordNum)
 {
 	u8 Dimms, DimmNum;
 	u32 val;
-	u32 dct = 0;
 	uint8_t ddr_voltage_index;
 	uint16_t mem_freq;
 	uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 	uint8_t MaxDimmsInstallable = mctGet_NVbits(NV_MAX_DIMMS_PER_CH);
 
-	DimmNum = (MrsChipSel >> 20) & 0xFE;
+	DimmNum = (MrsChipSel >> rc_word_chip_select_lower_bit()) & 0xfe;
 
-	/* assume dct=0; */
-	/* if (dct == 1) */
-	/* DimmNum ++; */
-	/* cl +=8; */
+	if (dct == 1)
+		DimmNum++;
 
 	mem_freq = memclk_to_freq(pDCTstat->DIMMAutoSpeed);
-
-	if (pDCTstat->CSPresent_DCT[0] > 0) {
-		dct = 0;
-	} else if (pDCTstat->CSPresent_DCT[1] > 0 ) {
-		dct = 1;
-		DimmNum++;
-	}
 	Dimms = pDCTstat->MAdimms[dct];
 
 	ddr_voltage_index = dct_ddr_voltage_index(pDCTstat, dct);
@@ -72,21 +154,25 @@ static u32 mct_ControlRC(struct MCTStatStruc *pMCTstat,
 		val = 0x2;
 	else if (CtrlWordNum == 1) {
 		if (!((pDCTstat->DimmDRPresent | pDCTstat->DimmQRPresent) & (1 << DimmNum)))
-			val = 0xC; /* if single rank, set DBA1 and DBA0 */
+			val = 0xc; /* if single rank, set DBA1 and DBA0 */
 	} else if (CtrlWordNum == 2) {
-		if (package_type == PT_GR) {
-			/* Socket G34 */
-			if (MaxDimmsInstallable == 2) {
-				if (Dimms > 1)
-					val = 0x4;
+		if (is_fam15h()) {
+			val = fam15h_rdimm_rc2_control_code(pDCTstat, dct);
+		} else {
+			if (package_type == PT_GR) {
+				/* Socket G34 */
+				if (MaxDimmsInstallable == 2) {
+					if (Dimms > 1)
+						val = 0x4;
+				}
 			}
 		}
 	} else if (CtrlWordNum == 3) {
-		val = (pDCTstat->CtrlWrd3 >> (DimmNum << 2)) & 0xFF;
+		val = (pDCTstat->CtrlWrd3 >> (DimmNum << 2)) & 0xff;
 	} else if (CtrlWordNum == 4) {
-		val = (pDCTstat->CtrlWrd4 >> (DimmNum << 2)) & 0xFF;
+		val = (pDCTstat->CtrlWrd4 >> (DimmNum << 2)) & 0xff;
 	} else if (CtrlWordNum == 5) {
-		val = (pDCTstat->CtrlWrd5 >> (DimmNum << 2)) & 0xFF;
+		val = (pDCTstat->CtrlWrd5 >> (DimmNum << 2)) & 0xff;
 	} else if (CtrlWordNum == 8) {
 		if (package_type == PT_GR) {
 			/* Socket G34 */
@@ -95,7 +181,7 @@ static u32 mct_ControlRC(struct MCTStatStruc *pMCTstat,
 			}
 		}
 	} else if (CtrlWordNum == 9) {
-		val = 0xD;	/* DBA1, DBA0, DA3 = 0 */
+		val = 0xd;	/* DBA1, DBA0, DA3 = 0 */
 	} else if (CtrlWordNum == 10) {
 		val = 0x0;	/* Lowest operating frequency */
 	} else if (CtrlWordNum == 11) {
@@ -110,43 +196,30 @@ static u32 mct_ControlRC(struct MCTStatStruc *pMCTstat,
 	}
 	val &= 0xf;
 
-	printk(BIOS_SPEW, "Preparing to send DIMM RC%d: %02x\n", CtrlWordNum, val);
+	printk(BIOS_SPEW, "Preparing to send DCT %d DIMM RC%d: %02x\n", dct, CtrlWordNum, val);
 
 	val = MrsChipSel | rc_word_value_to_ctl_bits(val);
-
-	/* transfer Control word number to address [BA2,A2,A1,A0] */
-	if (CtrlWordNum > 7) {
-		val |= 1 << 18;
-		CtrlWordNum &= 7;
-	}
-	val |= CtrlWordNum;
+	val |= rc_word_address_to_ctl_bits(CtrlWordNum);
 
 	return val;
 }
 
 static void mct_SendCtrlWrd(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat, u32 val)
+			struct DCTStatStruc *pDCTstat, uint8_t dct, uint32_t val)
 {
-	uint8_t dct = 0;
 	u32 dev = pDCTstat->dev_dct;
 
-	if (pDCTstat->CSPresent_DCT[0] > 0) {
-		dct = 0;
-	} else if (pDCTstat->CSPresent_DCT[1] > 0 ){
-		dct = 1;
-	}
-
-	val |= Get_NB32_DCT(dev, dct, 0x7C) & ~0xFFFFFF;
+	val |= Get_NB32_DCT(dev, dct, 0x7c) & ~0xffffff;
 	val |= 1 << SendControlWord;
-	Set_NB32_DCT(dev, dct, 0x7C, val);
+	Set_NB32_DCT(dev, dct, 0x7c, val);
 
 	do {
-		val = Get_NB32_DCT(dev, dct, 0x7C);
+		val = Get_NB32_DCT(dev, dct, 0x7c);
 	} while (val & (1 << SendControlWord));
 }
 
 void mct_DramControlReg_Init_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat, u8 dct)
+				struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	u8 MrsChipSel;
 	u32 dev = pDCTstat->dev_dct;
@@ -159,7 +232,7 @@ void mct_DramControlReg_Init_D(struct MCTStatStruc *pMCTstat,
 	for (MrsChipSel = 0; MrsChipSel < 8; MrsChipSel ++, MrsChipSel ++) {
 		if (pDCTstat->CSPresent & (1 << MrsChipSel)) {
 			val = Get_NB32_DCT(dev, dct, 0xa8);
-			val &= ~(0xF << 8);
+			val &= ~(0xf << 8);
 
 			switch (MrsChipSel) {
 				case 0:
@@ -180,8 +253,8 @@ void mct_DramControlReg_Init_D(struct MCTStatStruc *pMCTstat,
 			for (cw=0; cw <=15; cw ++) {
 				mct_Wait(1600);
 				if (!(cw==6 || cw==7)) {
-					val = mct_ControlRC(pMCTstat, pDCTstat, MrsChipSel << 20, cw);
-					mct_SendCtrlWrd(pMCTstat, pDCTstat, val);
+					val = mct_ControlRC(pMCTstat, pDCTstat, dct, MrsChipSel << rc_word_chip_select_lower_bit(), cw);
+					mct_SendCtrlWrd(pMCTstat, pDCTstat, dct, val);
 				}
 			}
 		}
@@ -191,7 +264,7 @@ void mct_DramControlReg_Init_D(struct MCTStatStruc *pMCTstat,
 }
 
 void FreqChgCtrlWrd(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat)
+			struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	u32 SaveSpeed = pDCTstat->DIMMAutoSpeed;
 	u32 MrsChipSel;
@@ -204,10 +277,10 @@ void FreqChgCtrlWrd(struct MCTStatStruc *pMCTstat,
 	for (MrsChipSel=0; MrsChipSel < 8; MrsChipSel++, MrsChipSel++) {
 		if (pDCTstat->CSPresent & (1 << MrsChipSel)) {
 			/* 2. Program F2x[1, 0]A8[CtrlWordCS]=bit mask for target chip selects. */
-			val = Get_NB32_DCT(dev, 0, 0xA8); /* TODO: dct 0 / 1 select */
-			val &= ~(0xFF << 8);
-			val |= (0x3 << (MrsChipSel & 0xFE)) << 8;
-			Set_NB32_DCT(dev, 0, 0xA8, val); /* TODO: dct 0 / 1 select */
+			val = Get_NB32_DCT(dev, dct, 0xa8);
+			val &= ~(0xff << 8);
+			val |= (0x3 << (MrsChipSel & 0xfe)) << 8;
+			Set_NB32_DCT(dev, dct, 0xa8, val);
 
 			/* Resend control word 10 */
 			uint8_t freq_ctl_val = 0;
@@ -231,21 +304,21 @@ void FreqChgCtrlWrd(struct MCTStatStruc *pMCTstat,
 					break;
 			}
 
-			printk(BIOS_SPEW, "Preparing to send DIMM RC%d: %02x\n", 10, freq_ctl_val);
+			printk(BIOS_SPEW, "Preparing to send DCT %d DIMM RC%d: %02x\n", dct, 10, freq_ctl_val);
 
-			mct_SendCtrlWrd(pMCTstat, pDCTstat, MrsChipSel << 20 | 0x40002 | rc_word_value_to_ctl_bits(freq_ctl_val));
+			mct_SendCtrlWrd(pMCTstat, pDCTstat, dct, MrsChipSel << rc_word_chip_select_lower_bit() | rc_word_address_to_ctl_bits(10) | rc_word_value_to_ctl_bits(freq_ctl_val));
 
 			mct_Wait(1600);
 
 			/* Resend control word 2 */
-			val = mct_ControlRC(pMCTstat, pDCTstat, MrsChipSel << 20, 2);
-			mct_SendCtrlWrd(pMCTstat, pDCTstat, val);
+			val = mct_ControlRC(pMCTstat, pDCTstat, dct, MrsChipSel << rc_word_chip_select_lower_bit(), 2);
+			mct_SendCtrlWrd(pMCTstat, pDCTstat, dct, val);
 
 			mct_Wait(1600);
 
 			/* Resend control word 8 */
-			val = mct_ControlRC(pMCTstat, pDCTstat, MrsChipSel << 20, 8);
-			mct_SendCtrlWrd(pMCTstat, pDCTstat, val);
+			val = mct_ControlRC(pMCTstat, pDCTstat, dct, MrsChipSel << rc_word_chip_select_lower_bit(), 8);
+			mct_SendCtrlWrd(pMCTstat, pDCTstat, dct, val);
 
 			mct_Wait(1600);
 		}
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
index 718a61f..c75f2ea 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
@@ -441,13 +441,13 @@ static u32 mct_MR2(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 	u32 dword, ret;
 
+	/* The formula for chip select number is: CS = dimm*2+rank */
+	uint8_t dimm = MrsChipSel / 2;
+	uint8_t rank = MrsChipSel % 2;
+
 	if (is_fam15h()) {
 		uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 
-		/* The formula for chip select number is: CS = dimm*2+rank */
-		uint8_t dimm = MrsChipSel / 2;
-		uint8_t rank = MrsChipSel % 2;
-
 		/* FIXME: These parameters should be configurable
 		 * For now, err on the side of caution and enable automatic 2x refresh
 		 * when the DDR temperature rises above the internal limits
@@ -492,7 +492,7 @@ static u32 mct_MR2(struct MCTStatStruc *pMCTstat,
 		ret |= ((dword >> 10) & 3) << 9;
 	}
 
-	printk(BIOS_SPEW, "Going to send MR2 control word %08x\n", ret);
+	printk(BIOS_SPEW, "Going to send DCT %d DIMM %d rank %d MR2 control word %08x\n", dct, dimm, rank, ret);
 
 	return ret;
 }
@@ -503,6 +503,10 @@ static u32 mct_MR3(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 	u32 dword, ret;
 
+	/* The formula for chip select number is: CS = dimm*2+rank */
+	uint8_t dimm = MrsChipSel / 2;
+	uint8_t rank = MrsChipSel % 2;
+
 	if (is_fam15h()) {
 		ret = 0xc0000;
 		ret |= (MrsChipSel << 21);
@@ -523,7 +527,7 @@ static u32 mct_MR3(struct MCTStatStruc *pMCTstat,
 		ret |= (dword >> 24) & 7;
 	}
 
-	printk(BIOS_SPEW, "Going to send MR3 control word %08x\n", ret);
+	printk(BIOS_SPEW, "Going to send DCT %d DIMM %d rank %d MR3 control word %08x\n", dct, dimm, rank, ret);
 
 	return ret;
 }
@@ -534,6 +538,10 @@ static u32 mct_MR1(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 	u32 dword, ret;
 
+	/* The formula for chip select number is: CS = dimm*2+rank */
+	uint8_t dimm = MrsChipSel / 2;
+	uint8_t rank = MrsChipSel % 2;
+
 	if (is_fam15h()) {
 		uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 
@@ -549,10 +557,6 @@ static u32 mct_MR1(struct MCTStatStruc *pMCTstat,
 		ret = 0x40000;
 		ret |= (MrsChipSel << 21);
 
-		/* The formula for chip select number is: CS = dimm*2+rank */
-		uint8_t dimm = MrsChipSel / 2;
-		uint8_t rank = MrsChipSel % 2;
-
 		/* Determine if TQDS should be set */
 		if ((pDCTstat->Dimmx8Present & (1 << dimm))
 			&& (((dimm & 0x1)?(pDCTstat->Dimmx4Present&0x55):(pDCTstat->Dimmx4Present&0xaa)) != 0x0)
@@ -619,7 +623,7 @@ static u32 mct_MR1(struct MCTStatStruc *pMCTstat,
 			ret |= 1 << 12;
 	}
 
-	printk(BIOS_SPEW, "Going to send MR1 control word %08x\n", ret);
+	printk(BIOS_SPEW, "Going to send DCT %d DIMM %d rank %d MR1 control word %08x\n", dct, dimm, rank, ret);
 
 	return ret;
 }
@@ -630,6 +634,10 @@ static u32 mct_MR0(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 	u32 dword, ret, dword2;
 
+	/* The formula for chip select number is: CS = dimm*2+rank */
+	uint8_t dimm = MrsChipSel / 2;
+	uint8_t rank = MrsChipSel % 2;
+
 	if (is_fam15h()) {
 		ret = 0x00000;
 		ret |= (MrsChipSel << 21);
@@ -740,7 +748,7 @@ static u32 mct_MR0(struct MCTStatStruc *pMCTstat,
 		ret |= 1 << 8;
 	}
 
-	printk(BIOS_SPEW, "Going to send MR0 control word %08x\n", ret);
+	printk(BIOS_SPEW, "Going to send DCT %d DIMM %d rank %d MR0 control word %08x\n", dct, dimm, rank, ret);
 
 	return ret;
 }
@@ -807,6 +815,16 @@ void mct_DramInit_Sw_D(struct MCTStatStruc *pMCTstat,
 		/* 8.wait 360ns */
 		mct_Wait(80);
 
+		/* Set up address parity */
+		if ((pDCTstat->Status & (1 << SB_Registered))
+			|| (pDCTstat->Status & (1 << SB_LoadReduced))) {
+			if (is_fam15h()) {
+				dword = Get_NB32_DCT(dev, dct, 0x90);
+				dword |= 1 << ParEn;
+				Set_NB32_DCT(dev, dct, 0x90, dword);
+			}
+		}
+
 		/* The following steps are performed with registered DIMMs only and
 		 * must be done for each chip select pair */
 		if (pDCTstat->Status & (1 << SB_Registered))
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
index c85dc27..19b1b8f 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
@@ -1142,8 +1142,10 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 	uint8_t dimm;
 	uint8_t rank;
 	uint8_t lane;
+	uint8_t nibble;
 	uint8_t mem_clk;
 	uint16_t initial_seed;
+	uint8_t train_both_nibbles;
 	uint16_t current_total_delay[MAX_BYTE_LANES];
 	uint16_t dqs_ret_pass1_total_delay[MAX_BYTE_LANES];
 	uint16_t rank0_current_total_delay[MAX_BYTE_LANES];
@@ -1159,6 +1161,11 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 	print_debug_dqs("\nTrainRcvEn: Node", pDCTstat->Node_ID, 0);
 	print_debug_dqs("TrainRcvEn: Pass", Pass, 0);
 
+	train_both_nibbles = 0;
+	if (pDCTstat->Dimmx4Present)
+		if (is_fam15h())
+			train_both_nibbles = 1;
+
 	dev = pDCTstat->dev_dct;
 	index_reg = 0x98;
 	ch_start = 0;
@@ -1241,132 +1248,148 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 			else
 				_2Ranks = 0;
 			for (rank = 0; rank < (_2Ranks + 1); rank++) {
-				/* 2.10.5.8.2 (1)
-				 * Specify the target DIMM to be trained
-				 * Set TrNibbleSel = 0
-				 *
-				 * TODO: Add support for x4 DIMMs
-				 */
-				dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
-				dword &= ~(0x3 << 4);		/* TrDimmSel */
-				dword |= ((dimm & 0x3) << 4);
-				dword &= ~(0x1 << 2);		/* TrNibbleSel */
-				Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
-
-				/* 2.10.5.8.2 (2)
-				 * Retrieve gross and fine timing fields from write DQS registers
-				 */
-				read_dqs_write_timing_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
+				for (nibble = 0; nibble < (train_both_nibbles + 1); nibble++) {
+					/* 2.10.5.8.2 (1)
+					 * Specify the target DIMM and nibble to be trained
+					 */
+					dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
+					dword &= ~(0x3 << 4);		/* TrDimmSel = dimm */
+					dword |= ((dimm & 0x3) << 4);
+					dword &= ~(0x1 << 2);		/* TrNibbleSel = nibble */
+					dword |= ((nibble & 0x1) << 2);
+					Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
+
+					/* 2.10.5.8.2 (2)
+					 * Retrieve gross and fine timing fields from write DQS registers
+					 */
+					read_dqs_write_timing_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
 
-				/* 2.10.5.8.2.1
-				 * Generate the DQS Receiver Enable Training Seed Values
-				 */
-				if (Pass == FirstPass) {
-					initial_seed = fam15_receiver_enable_training_seed(pDCTstat, Channel, dimm, rank, package_type);
+					/* 2.10.5.8.2.1
+					 * Generate the DQS Receiver Enable Training Seed Values
+					 */
+					if (Pass == FirstPass) {
+						initial_seed = fam15_receiver_enable_training_seed(pDCTstat, Channel, dimm, rank, package_type);
 
-					/* Adjust seed for the minimum platform supported frequency */
-					initial_seed = (uint16_t) (((((uint64_t) initial_seed) *
-						fam15h_freq_tab[mem_clk] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100)));
+						/* Adjust seed for the minimum platform supported frequency */
+						initial_seed = (uint16_t) (((((uint64_t) initial_seed) *
+							fam15h_freq_tab[mem_clk] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100)));
 
-					for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
-						uint16_t wl_pass1_delay;
-						wl_pass1_delay = current_total_delay[lane];
+						for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+							uint16_t wl_pass1_delay;
+							wl_pass1_delay = current_total_delay[lane];
 
-						seed[lane] = initial_seed + wl_pass1_delay;
-					}
-				} else {
-					uint8_t addr_prelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
-					uint16_t register_delay;
-					int16_t seed_prescaling;
-
-					memcpy(current_total_delay, dqs_ret_pass1_total_delay, sizeof(current_total_delay));
-					if ((pDCTstat->Status & (1 << SB_Registered))) {
-						if (addr_prelaunch)
-							register_delay = 0x30;
-						else
-							register_delay = 0x20;
-					} else if ((pDCTstat->Status & (1 << SB_LoadReduced))) {
-						/* TODO
-						* Load reduced DIMM support unimplemented
-						*/
-						register_delay = 0x0;
+							seed[lane] = initial_seed + wl_pass1_delay;
+						}
 					} else {
-						register_delay = 0x0;
+						uint8_t addr_prelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
+						uint16_t register_delay;
+						int16_t seed_prescaling;
+
+						memcpy(current_total_delay, dqs_ret_pass1_total_delay, sizeof(current_total_delay));
+						if ((pDCTstat->Status & (1 << SB_Registered))) {
+							if (addr_prelaunch)
+								register_delay = 0x30;
+							else
+								register_delay = 0x20;
+						} else if ((pDCTstat->Status & (1 << SB_LoadReduced))) {
+							/* TODO
+							 * Load reduced DIMM support unimplemented
+							 */
+							register_delay = 0x0;
+						} else {
+							register_delay = 0x0;
+						}
+
+						for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+							seed_prescaling = current_total_delay[lane] - register_delay - 0x20;
+							seed[lane] = (uint16_t) (register_delay + ((((uint64_t) seed_prescaling) * fam15h_freq_tab[mem_clk] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100)));
+						}
 					}
 
 					for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
-						seed_prescaling = current_total_delay[lane] - register_delay - 0x20;
-						seed[lane] = (uint16_t) (register_delay + ((((uint64_t) seed_prescaling) * fam15h_freq_tab[mem_clk] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100)));
-					}
-				}
+						seed_gross[lane] = (seed[lane] >> 5) & 0x1f;
+						seed_fine[lane] = seed[lane] & 0x1f;
 
-				for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
-					seed_gross[lane] = (seed[lane] >> 5) & 0x1f;
-					seed_fine[lane] = seed[lane] & 0x1f;
+						/*if (seed_gross[lane] == 0)
+							seed_pre_gross[lane] = 0;
+						else */if (seed_gross[lane] & 0x1)
+							seed_pre_gross[lane] = 1;
+						else
+							seed_pre_gross[lane] = 2;
 
-					/*if (seed_gross[lane] == 0)
-						seed_pre_gross[lane] = 0;
-					else */if (seed_gross[lane] & 0x1)
-						seed_pre_gross[lane] = 1;
-					else
-						seed_pre_gross[lane] = 2;
+						/* Calculate phase recovery delays */
+						phase_recovery_delays[lane] = ((seed_pre_gross[lane] & 0x1f) << 5) | (seed_fine[lane] & 0x1f);
 
-					/* Calculate phase recovery delays */
-					phase_recovery_delays[lane] = ((seed_pre_gross[lane] & 0x1f) << 5) | (seed_fine[lane] & 0x1f);
+						/* Set the gross delay.
+						 * NOTE: While the BKDG states to only program DqsRcvEnGrossDelay, this appears
+						 * to have been a misprint as DqsRcvEnFineDelay should be set to zero as well.
+						 */
+						current_total_delay[lane] = ((seed_gross[lane] & 0x1f) << 5);
+					}
 
-					/* Set the gross delay.
-					 * NOTE: While the BKDG states to only program DqsRcvEnGrossDelay, this appears
-					 * to have been a misprint as DqsRcvEnFineDelay should be set to zero as well.
+					/* 2.10.5.8.2 (2) / 2.10.5.8.2.1 (5 6)
+					 * Program PhRecFineDly and PhRecGrossDly
 					 */
-					current_total_delay[lane] = ((seed_gross[lane] & 0x1f) << 5);
-				}
+					write_dram_phase_recovery_control_registers(phase_recovery_delays, dev, Channel, dimm, index_reg);
 
-				/* 2.10.5.8.2 (2) / 2.10.5.8.2.1 (5 6)
-				 * Program PhRecFineDly and PhRecGrossDly
-				 */
-				write_dram_phase_recovery_control_registers(phase_recovery_delays, dev, Channel, dimm, index_reg);
+					/* 2.10.5.8.2 (2) / 2.10.5.8.2.1 (7)
+					 * Program the DQS Receiver Enable delay values for each lane
+					 */
+					write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
 
-				/* 2.10.5.8.2 (2) / 2.10.5.8.2.1 (7)
-				 * Program the DQS Receiver Enable delay values for each lane
-				 */
-				write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
+					/* 2.10.5.8.2 (3)
+					 * Program DqsRcvTrEn = 1
+					 */
+					dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
+					dword |= (0x1 << 13);
+					Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
 
-				/* 2.10.5.8.2 (3)
-				 * Program DqsRcvTrEn = 1
-				 */
-				dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
-				dword |= (0x1 << 13);
-				Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
+					/* 2.10.5.8.2 (4)
+					 * Issue 192 read requests to the target rank
+					 */
+					generate_dram_receiver_enable_training_pattern_fam15(pMCTstat, pDCTstat, Channel, Receiver + (rank & 0x1));
 
-				/* 2.10.5.8.2 (4)
-				 * Issue 192 read requests to the target rank
-				 */
-				generate_dram_receiver_enable_training_pattern_fam15(pMCTstat, pDCTstat, Channel, Receiver + (rank & 0x1));
+					/* 2.10.5.8.2 (5)
+					 * Program DqsRcvTrEn = 0
+					 */
+					dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
+					dword &= ~(0x1 << 13);
+					Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
 
-				/* 2.10.5.8.2 (5)
-				 * Program DqsRcvTrEn = 0
-				 */
-				dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
-				dword &= ~(0x1 << 13);
-				Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
+					/* 2.10.5.8.2 (6)
+					 * Read PhRecGrossDly, PhRecFineDly
+					 */
+					read_dram_phase_recovery_control_registers(phase_recovery_delays, dev, Channel, dimm, index_reg);
 
-				/* 2.10.5.8.2 (6)
-				 * Read PhRecGrossDly, PhRecFineDly
-				 */
-				read_dram_phase_recovery_control_registers(phase_recovery_delays, dev, Channel, dimm, index_reg);
+					/* 2.10.5.8.2 (7)
+					 * Calculate and program the DQS Receiver Enable delay values
+					 */
+					for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+						current_total_delay[lane] = (phase_recovery_delays[lane] & 0x1f);
+						current_total_delay[lane] |= ((seed_gross[lane] + ((phase_recovery_delays[lane] >> 5) & 0x1f) - seed_pre_gross[lane] + 1) << 5);
+						if (nibble == 0) {
+							if (lane == 8)
+								pDCTstat->CH_D_BC_RCVRDLY[Channel][dimm] = current_total_delay[lane];
+							else
+								pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane] = current_total_delay[lane];
+						} else {
+							/* 2.10.5.8.2 (1)
+							 * Average the trained values of both nibbles on x4 DIMMs
+							 */
+							if (lane == 8)
+								pDCTstat->CH_D_BC_RCVRDLY[Channel][dimm] = (pDCTstat->CH_D_BC_RCVRDLY[Channel][dimm] + current_total_delay[lane]) / 2;
+							else
+								pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane] = (pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane] + current_total_delay[lane]) / 2;
+						}
+					}
 
-				/* 2.10.5.8.2 (7)
-				 * Calculate and program the DQS Receiver Enable delay values
-				 */
-				for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
-					current_total_delay[lane] = (phase_recovery_delays[lane] & 0x1f);
-					current_total_delay[lane] |= ((seed_gross[lane] + ((phase_recovery_delays[lane] >> 5) & 0x1f) - seed_pre_gross[lane] + 1) << 5);
-					if (lane == 8)
-						pDCTstat->CH_D_BC_RCVRDLY[Channel][dimm] = current_total_delay[lane];
-					else
-						pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane] = current_total_delay[lane];
+#if DQS_TRAIN_DEBUG > 1
+					for (lane = 0; lane < 8; lane++)
+						printk(BIOS_DEBUG, "\t\tTrainRcvEn55: Channel: %d dimm: %d nibble: %d lane %d current_total_delay: %04x CH_D_B_RCVRDLY: %04x\n",
+							Channel, dimm, nibble, lane, current_total_delay[lane], pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane]);
+#endif
+					write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
 				}
-				write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
 
 				if (rank == 0) {
 					/* Back up the Rank 0 delays for later use */
@@ -1391,7 +1414,7 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 
 #if DQS_TRAIN_DEBUG > 0
 			for (lane = 0; lane < 8; lane++)
-				print_debug_dqs_pair("\t\tTrainRcvEn55: Lane ", lane, " current_total_delay ", current_total_delay[lane], 2);
+				print_debug_dqs_pair("\t\tTrainRcvEn56: Lane ", lane, " current_total_delay ", current_total_delay[lane], 2);
 #endif
 		}
 	}
@@ -1811,15 +1834,23 @@ void mctSetEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
 }
 
 void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstatA)
+			struct DCTStatStruc *pDCTstatA, int16_t single_node_number)
 {
 	u8 Node = 0;
 	struct DCTStatStruc *pDCTstat;
 
 	printk(BIOS_DEBUG, "%s: Start\n", __func__);
 
+	uint8_t start_node = 0;			/* first node to train (inclusive) */
+	uint8_t end_node = MAX_NODES_SUPPORTED;	/* one past the last node to train (exclusive) */
+
+	if (single_node_number >= 0) {		/* a negative value selects all nodes */
+		start_node = single_node_number;
+		end_node = single_node_number + 1;	/* loop bound is exclusive: cover exactly this node */
+	}
+
 	/* FIXME: skip for Ax */
-	for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) {
+	for (Node = start_node; Node < end_node; Node++) {
 		pDCTstat = pDCTstatA + Node;
 		if (!pDCTstat->NodePresent)
 			continue;
@@ -1843,6 +1874,8 @@ void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
 					if (!pDCTstat->DIMMValidDCT[dct])
 						continue;
 
+					printk(BIOS_SPEW, "%s: training node %d DCT %d\n", __func__, Node, dct);
+
 					/* Back up D18F2x9C_x0000_0004_dct[1:0] */
 					datc_backup = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x00000004);
 
@@ -1981,6 +2014,8 @@ void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
 
 					/* Restore D18F2x9C_x0000_0004_dct[1:0] */
 					Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x00000004, datc_backup);
+
+					printk(BIOS_SPEW, "%s: done training node %d DCT %d\n", __func__, Node, dct);
 				}
 			} else {
 				fenceDynTraining_D(pMCTstat, pDCTstat, 0);
@@ -1993,7 +2028,7 @@ void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
 }
 
 static uint32_t fenceDynTraining_D(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat, u8 dct)
+			struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	u16 avRecValue;
 	u32 val;
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c b/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
index e163227..4bfcc40 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
@@ -15,7 +15,7 @@
  */
 
 static void FreqChgCtrlWrd(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat);
+			struct DCTStatStruc *pDCTstat, uint8_t dct);
 
 
 static void AgesaDelay(u32 msec)
@@ -349,11 +349,14 @@ static void ExitSelfRefresh(struct MCTStatStruc *pMCTstat,
 }
 
 void SetTargetFreq(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat)
+					struct DCTStatStruc *pDCTstatA, uint8_t Node)
 {
 	uint32_t dword;
 	uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 
+	struct DCTStatStruc *pDCTstat;
+	pDCTstat = pDCTstatA + Node;
+
 	if (is_fam15h()) {
 		/* Program F2x[1, 0]90[DisDllShutDownSR]=1. */
 		if (pDCTstat->DIMMValidDCT[0]) {
@@ -387,7 +390,7 @@ void SetTargetFreq(struct MCTStatStruc *pMCTstat,
 		uint8_t dct;
 		for (dct = 0; dct < 2; dct++) {
 			if (pDCTstat->DIMMValidDCT[dct]) {
-				phyAssistedMemFnceTraining(pMCTstat, pDCTstat);
+				phyAssistedMemFnceTraining(pMCTstat, pDCTstatA, Node);
 				InitPhyCompensation(pMCTstat, pDCTstat, dct);
 			}
 		}
@@ -434,7 +437,12 @@ void SetTargetFreq(struct MCTStatStruc *pMCTstat,
 		else
 			pDCTstat->CSPresent = pDCTstat->CSPresent_DCT[1];
 
-		FreqChgCtrlWrd(pMCTstat, pDCTstat);
+		if (pDCTstat->DIMMValidDCT[0]) {
+			FreqChgCtrlWrd(pMCTstat, pDCTstat, 0);
+		}
+		if (pDCTstat->DIMMValidDCT[1]) {
+			FreqChgCtrlWrd(pMCTstat, pDCTstat, 1);
+		}
 	}
 }
 
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
index fab7bc6..5deaca5 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
@@ -31,9 +31,9 @@ u32 swapBankBits(struct DCTStatStruc *pDCTstat, uint8_t dct, uint32_t MRSValue);
 void prepareDimms(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat,
 	u8 dct, u8 dimm, BOOL wl);
 void programODT(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, u8 dimm);
-void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, u8 dimm, u8 pass);
+void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t dimm, uint8_t pass, uint8_t nibble);
 void setWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, u8 targetAddr, uint8_t pass);
-void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass);
+void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass, uint8_t nibble);
 
 static int32_t abs(int32_t val) {
 	if (val < 0)
@@ -72,6 +72,8 @@ uint8_t AgesaHwWlPhase1(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 {
 	u8 ByteLane;
 	u32 Value, Addr;
+	uint8_t nibble = 0;
+	uint8_t train_both_nibbles;
 	u16 Addl_Data_Offset, Addl_Data_Port;
 	sMCTStruct *pMCTData = pDCTstat->C_MCTPtr;
 	sDCTStruct *pDCTData = pDCTstat->C_DCTPtr[dct];
@@ -84,98 +86,108 @@ uint8_t AgesaHwWlPhase1(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 			DRAM_ADD_DCT_PHY_CONTROL_REG, TrDimmSelStart,
 			TrDimmSelEnd, (u32)dimm);
 
-	if (is_fam15h()) {
-		/* Set TrNibbleSel = 0
-		 *
-		 * TODO: Add support for x4 DIMMs
-		 */
-		set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
-				DRAM_ADD_DCT_PHY_CONTROL_REG, 2,
-				2, (u32)0);
-	}
+	train_both_nibbles = 0;
+	if (pDCTstat->Dimmx4Present)
+		if (is_fam15h())
+			train_both_nibbles = 1;
 
-	/* 2. Prepare the DIMMs for write levelization using DDR3-defined
-	 * MR commands. */
-	prepareDimms(pMCTstat, pDCTstat, dct, dimm, TRUE);
+	for (nibble = 0; nibble < (train_both_nibbles + 1); nibble++) {
+		printk(BIOS_SPEW, "AgesaHwWlPhase1: training nibble %d\n", nibble);
 
-	/* 3. After the DIMMs are configured, BIOS waits 40 MEMCLKs to
-	 *    satisfy DDR3-defined internal DRAM timing.
-	 */
-	if (is_fam15h())
-		precise_memclk_delay_fam15(pMCTstat, pDCTstat, dct, 40);
-	else
-		pMCTData->AgesaDelay(40);
+		if (is_fam15h()) {
+			/* Program F2x[1, 0]9C_x08[WrtLvTrEn]=0 */
+			set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
+					DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 0);
+
+			/* Set TrNibbleSel */
+			set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
+					DRAM_ADD_DCT_PHY_CONTROL_REG, 2,
+					2, (uint32_t)nibble);
+		}
 
-	/* 4. Configure the processor's DDR phy for write levelization training: */
-	procConfig(pMCTstat, pDCTstat, dct, dimm, pass);
+		/* 2. Prepare the DIMMs for write levelization using DDR3-defined
+		 * MR commands. */
+		prepareDimms(pMCTstat, pDCTstat, dct, dimm, TRUE);
 
-	/* 5. Begin write levelization training:
-	 *  Program F2x[1, 0]9C_x08[WrtLvTrEn]=1. */
-	if (pDCTData->LogicalCPUID & (AMD_DR_Cx | AMD_DR_Dx | AMD_FAM15_ALL))
-	{
-		set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
-				DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 1);
-	}
-	else
-	{
-		/* Broadcast write to all D3Dbyte chipset register offset 0xc
-		 * Set bit 0 (wrTrain)
-		 * Program bit 4 to nibble being trained (only matters for x4dimms)
-		 * retain value of 3:2 (Trdimmsel)
-		 * reset bit 5 (FrzPR)
+		/* 3. After the DIMMs are configured, BIOS waits 40 MEMCLKs to
+		 *    satisfy DDR3-defined internal DRAM timing.
 		 */
-		if (dct)
+		if (is_fam15h())
+			precise_memclk_delay_fam15(pMCTstat, pDCTstat, dct, 40);
+		else
+			pMCTData->AgesaDelay(40);
+
+		/* 4. Configure the processor's DDR phy for write levelization training: */
+		procConfig(pMCTstat, pDCTstat, dct, dimm, pass, nibble);
+
+		/* 5. Begin write levelization training:
+		 *  Program F2x[1, 0]9C_x08[WrtLvTrEn]=1. */
+		if (pDCTData->LogicalCPUID & (AMD_DR_Cx | AMD_DR_Dx | AMD_FAM15_ALL))
 		{
-			Addl_Data_Offset=0x198;
-			Addl_Data_Port=0x19C;
+			set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
+					DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 1);
 		}
 		else
 		{
-			Addl_Data_Offset=0x98;
-			Addl_Data_Port=0x9C;
+			/* Broadcast write to all D3Dbyte chipset register offset 0xc
+			 * Set bit 0 (wrTrain)
+			 * Program bit 4 to nibble being trained (only matters for x4dimms)
+			 * retain value of 3:2 (Trdimmsel)
+			 * reset bit 5 (FrzPR)
+			 */
+			if (dct)
+			{
+				Addl_Data_Offset=0x198;
+				Addl_Data_Port=0x19C;
+			}
+			else
+			{
+				Addl_Data_Offset=0x98;
+				Addl_Data_Port=0x9C;
+			}
+			Addr=0x0D00000C;
+			AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Offset), 31, 0, &Addr);
+			while ((get_Bits(pDCTData,FUN_DCT,pDCTData->NodeId, FUN_DCT, Addl_Data_Offset,
+					DctAccessDone, DctAccessDone)) == 0);
+			AmdMemPCIReadBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Port), 31, 0, &Value);
+			Value = bitTestSet(Value, 0);	/* enable WL training */
+			Value = bitTestReset(Value, 4); /* for x8 only */
+			Value = bitTestReset(Value, 5); /* for hardware WL training */
+			AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Port), 31, 0, &Value);
+			Addr=0x4D030F0C;
+			AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Offset), 31, 0, &Addr);
+			while ((get_Bits(pDCTData,FUN_DCT,pDCTData->NodeId, FUN_DCT, Addl_Data_Offset,
+					DctAccessDone, DctAccessDone)) == 0);
 		}
-		Addr=0x0D00000C;
-		AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Offset), 31, 0, &Addr);
-		while ((get_Bits(pDCTData,FUN_DCT,pDCTData->NodeId, FUN_DCT, Addl_Data_Offset,
-				DctAccessDone, DctAccessDone)) == 0);
-		AmdMemPCIReadBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Port), 31, 0, &Value);
-		Value = bitTestSet(Value, 0);	/* enable WL training */
-		Value = bitTestReset(Value, 4); /* for x8 only */
-		Value = bitTestReset(Value, 5); /* for hardware WL training */
-		AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Port), 31, 0, &Value);
-		Addr=0x4D030F0C;
-		AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Offset), 31, 0, &Addr);
-		while ((get_Bits(pDCTData,FUN_DCT,pDCTData->NodeId, FUN_DCT, Addl_Data_Offset,
-				DctAccessDone, DctAccessDone)) == 0);
-	}
 
-	if (is_fam15h())
-		proc_MFENCE();
+		if (is_fam15h())
+			proc_MFENCE();
 
-	/* Wait 200 MEMCLKs. If executing pass 2, wait 32 MEMCLKs. */
-	if (is_fam15h())
-		precise_memclk_delay_fam15(pMCTstat, pDCTstat, dct, 200);
-	else
-		pMCTData->AgesaDelay(140);
+		/* Wait 200 MEMCLKs. If executing pass 2, wait 32 MEMCLKs. */
+		if (is_fam15h())
+			precise_memclk_delay_fam15(pMCTstat, pDCTstat, dct, 200);
+		else
+			pMCTData->AgesaDelay(140);
 
-	/* Program F2x[1, 0]9C_x08[WrtLevelTrEn]=0. */
-	set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
-			DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 0);
+		/* Program F2x[1, 0]9C_x08[WrtLevelTrEn]=0. */
+		set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
+				DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 0);
 
-	/* Read from registers F2x[1, 0]9C_x[51:50] and F2x[1, 0]9C_x52
-	 * to get the gross and fine delay settings
-	 * for the target DIMM and save these values. */
-	for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
-		getWLByteDelay(pDCTstat, dct, ByteLane, dimm, pass);
-	}
+		/* Read from registers F2x[1, 0]9C_x[51:50] and F2x[1, 0]9C_x52
+		 * to get the gross and fine delay settings
+		 * for the target DIMM and save these values. */
+		for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+			getWLByteDelay(pDCTstat, dct, ByteLane, dimm, pass, nibble);
+		}
 
-	pDCTData->WLCriticalGrossDelayPrevPass = 0x1f;
+		pDCTData->WLCriticalGrossDelayPrevPass = 0x0;
+	}
 
 	return 0;
 }
 
 uint8_t AgesaHwWlPhase2(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat,
-		u8 dct, u8 dimm, u8 pass)
+		uint8_t dct, uint8_t dimm, uint8_t pass)
 {
 	u8 ByteLane;
 	uint8_t status = 0;
@@ -186,6 +198,12 @@ uint8_t AgesaHwWlPhase2(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 		int32_t cgd = pDCTData->WLCriticalGrossDelayPrevPass;
 		uint8_t index = (uint8_t)(MAX_BYTE_LANES * dimm);
 
+		printk(BIOS_SPEW, "\toriginal critical gross delay: %d\n", cgd);
+
+		/* FIXME
+		 * For now, disable CGD adjustment as it seems to interfere with registered DIMM training
+		 */
+
 		/* Calculate the Critical Gross Delay */
 		for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
 			/* Calculate the gross delay differential for this lane */
@@ -201,6 +219,8 @@ uint8_t AgesaHwWlPhase2(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 				cgd = gross_diff[ByteLane];
 		}
 
+		printk(BIOS_SPEW, "\tnew critical gross delay: %d\n", cgd);
+
 		pDCTData->WLCriticalGrossDelayPrevPass = cgd;
 
 		if (pDCTstat->Speed != pDCTstat->TargetFreq) {
@@ -277,7 +297,7 @@ uint8_t AgesaHwWlPhase3(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 				gross_diff[ByteLane] = pDCTData->WLSeedGrossDelay[index+ByteLane] + pDCTData->WLGrossDelay[index+ByteLane];
 				gross_diff[ByteLane] -= pDCTData->WLSeedPreGrossDelay[index+ByteLane];
 
-				/* Prevent underflow in the presence of noise / instability*/
+				/* Prevent underflow in the presence of noise / instability */
 				if (gross_diff[ByteLane] < cgd)
 					gross_diff[ByteLane] = cgd;
 
@@ -285,7 +305,8 @@ uint8_t AgesaHwWlPhase3(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 			}
 		} else {
 			dword = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0xa8);
-			dword &= ~(0x3 << 24);			/* WrDqDqsEarly = 0 */
+			dword &= ~(0x3 << 24);			/* WrDqDqsEarly = pDCTData->WrDqsGrossDlyBaseOffset */
+			dword |= ((pDCTData->WrDqsGrossDlyBaseOffset & 0x3) << 24);
 			Set_NB32_DCT(pDCTstat->dev_dct, dct, 0xa8, dword);
 		}
 	}
@@ -955,7 +976,7 @@ static uint16_t fam15h_next_lowest_memclk_freq(uint16_t memclk_freq)
 #endif
 
 /*-----------------------------------------------------------------------------
- * void procConfig(MCTStruct *MCTData,DCTStruct *DCTData, u8 Dimm, u8 Pass)
+ * void procConfig(MCTStruct *MCTData,DCTStruct *DCTData, u8 Dimm, u8 Pass, u8 Nibble)
  *
  *  Description:
  *       This function programs the ODT values for the NB
@@ -968,13 +989,14 @@ static uint16_t fam15h_next_lowest_memclk_freq(uint16_t memclk_freq)
  *       OUT
  * ----------------------------------------------------------------------------
  */
-void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, u8 dimm, u8 pass)
+void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t dimm, uint8_t pass, uint8_t nibble)
 {
 	u8 ByteLane, MemClkFreq;
 	int32_t Seed_Gross;
 	int32_t Seed_Fine;
 	uint8_t Seed_PreGross;
 	u32 Value, Addr;
+	uint32_t dword;
 	u16 Addl_Data_Offset, Addl_Data_Port;
 	sMCTStruct *pMCTData = pDCTstat->C_MCTPtr;
 	sDCTStruct *pDCTData = pDCTstat->C_DCTPtr[dct];
@@ -1044,10 +1066,17 @@ void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, ui
 			uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
 			uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 			uint16_t Seed_Total = 0;
+			pDCTData->WrDqsGrossDlyBaseOffset = 0x0;
 			if (package_type == PT_GR) {
 				/* Socket G34: Fam15h BKDG v3.14 Table 96 */
 				if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
+					/* TODO
+					 * Implement mainboard-specific seed and
+					 * WrDqsGrossDly base overrides.
+					 * 0x41 and 0x0 are the "stock" values
+					 */
 					Seed_Total = 0x41;
+					pDCTData->WrDqsGrossDlyBaseOffset = 0x2;
 				} else if (pDCTData->Status[DCT_STATUS_LOAD_REDUCED]) {
 					Seed_Total = 0x0;
 				} else {
@@ -1129,15 +1158,16 @@ void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, ui
 			printk(BIOS_SPEW, "\tLane %02x initial seed: %04x\n", ByteLane, ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f));
 		}
 	} else {
-		/* Pass 2 */
-		/* From BKDG, Write Leveling Seed Value. */
-		if (is_fam15h()) {
-			uint32_t RegisterDelay;
-			int32_t SeedTotal;
-			int32_t SeedTotalPreScaling;
-			uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
+		if (nibble == 0) {
+			/* Pass 2 */
+			/* From BKDG, Write Leveling Seed Value. */
+			if (is_fam15h()) {
+				uint32_t RegisterDelay;
+				int32_t SeedTotal[MAX_BYTE_LANES];
+				int32_t SeedTotalPreScaling[MAX_BYTE_LANES];
+				uint32_t WrDqDqsEarly;
+				uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
 
-			for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
 				if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
 					if (AddrCmdPrelaunch)
 						RegisterDelay = 0x30;
@@ -1146,84 +1176,133 @@ void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, ui
 				} else {
 					RegisterDelay = 0;
 				}
+
 				/* Retrieve WrDqDqsEarly */
-				AmdMemPCIReadBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId), FUN_DCT, 0xa8), 25, 24, &Value);
+				dword = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0xa8);
+				WrDqDqsEarly = (dword >> 24) & 0x3;
 
-				/* Calculate adjusted seed values */
-				SeedTotal = (pDCTData->WLFineDelayPrevPass[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
-					((pDCTData->WLGrossDelayPrevPass[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5);
-				SeedTotalPreScaling = (SeedTotal - RegisterDelay - (0x20 * Value));
-				SeedTotal = (int32_t) (RegisterDelay + ((((int64_t) SeedTotalPreScaling) *
-					fam15h_freq_tab[MemClkFreq] * 100) / (fam15h_freq_tab[pDCTData->WLPrevMemclkFreq] * 100)));
+				/* FIXME
+				 * Ignore WrDqDqsEarly for now to work around training issues
+				 */
+				WrDqDqsEarly = 0;
 
-				if (SeedTotal >= 0) {
-					Seed_Gross = SeedTotal / 32;
-					Seed_Fine = SeedTotal % 32;
-				} else {
-					Seed_Gross = (SeedTotal / 32) - 1;
-					Seed_Fine = (SeedTotal % 32) + 32;
+				/* Generate new seed values */
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+					/* Calculate adjusted seed values */
+					SeedTotal[ByteLane] = (pDCTData->WLFineDelayPrevPass[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
+						((pDCTData->WLGrossDelayPrevPass[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5);
+					SeedTotalPreScaling[ByteLane] = (SeedTotal[ByteLane] - RegisterDelay - (0x20 * WrDqDqsEarly));
+					SeedTotal[ByteLane] = (int32_t) (RegisterDelay + ((((int64_t) SeedTotalPreScaling[ByteLane]) *
+						fam15h_freq_tab[MemClkFreq] * 100) / (fam15h_freq_tab[pDCTData->WLPrevMemclkFreq] * 100)));
 				}
 
-				if (Seed_Gross == 0)
-					Seed_PreGross = 0;
-				else if (Seed_Gross & 0x1)
-					Seed_PreGross = 1;
-				else
-					Seed_PreGross = 2;
+				/* Generate register values from seeds */
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+					printk(BIOS_SPEW, "\tLane %02x scaled delay: %04x\n", ByteLane, SeedTotal[ByteLane]);
 
-				/* Save seed values for later use */
-				pDCTData->WLSeedGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
-				pDCTData->WLSeedFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
-				pDCTData->WLSeedPreGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_PreGross;
+					if (SeedTotal[ByteLane] >= 0) {
+						Seed_Gross = SeedTotal[ByteLane] / 32;
+						Seed_Fine = SeedTotal[ByteLane] % 32;
+					} else {
+						Seed_Gross = (SeedTotal[ByteLane] / 32) - 1;
+						Seed_Fine = (SeedTotal[ByteLane] % 32) + 32;
+					}
 
-				pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_PreGross;
-				pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+					if (Seed_Gross == 0)
+						Seed_PreGross = 0;
+					else if (Seed_Gross & 0x1)
+						Seed_PreGross = 1;
+					else
+						Seed_PreGross = 2;
 
-				printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f));
-			}
-		} else {
-			uint32_t RegisterDelay;
-			uint32_t SeedTotalPreScaling;
-			uint32_t SeedTotal;
-			uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
-			for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++)
-			{
-				if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
-					if (AddrCmdPrelaunch == 0)
-						RegisterDelay = 0x20;
+					/* The BKDG-recommended algorithm causes problems with registered DIMMs on some systems
+					 * due to the long register delays causing premature total delay wrap-around.
+					 * Attempt to work around this...
+					 */
+					Seed_PreGross = Seed_Gross;
+
+					/* Save seed values for later use */
+					pDCTData->WLSeedGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
+					pDCTData->WLSeedFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+					pDCTData->WLSeedPreGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_PreGross;
+
+					pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_PreGross;
+					pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+
+					printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f));
+				}
+			} else {
+				uint32_t RegisterDelay;
+				uint32_t SeedTotalPreScaling;
+				uint32_t SeedTotal;
+				uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++)
+				{
+					if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
+						if (AddrCmdPrelaunch == 0)
+							RegisterDelay = 0x20;
+						else
+							RegisterDelay = 0x30;
+					} else {
+						RegisterDelay = 0;
+					}
+					SeedTotalPreScaling = ((pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
+						(pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] << 5)) - RegisterDelay;
+					/* SeedTotalPreScaling = (the total delay value in F2x[1, 0]9C_x[4A:30] from pass 1 of write levelization
+					training) - RegisterDelay. */
+					SeedTotal = (uint16_t) ((((uint64_t) SeedTotalPreScaling) *
+										fam10h_freq_tab[MemClkFreq] * 100) / (fam10h_freq_tab[3] * 100));
+					Seed_Gross = SeedTotal / 32;
+					Seed_Fine = SeedTotal & 0x1f;
+					if (Seed_Gross == 0)
+						Seed_Gross = 0;
+					else if (Seed_Gross & 0x1)
+						Seed_Gross = 1;
 					else
-						RegisterDelay = 0x30;
-				} else {
-					RegisterDelay = 0;
+						Seed_Gross = 2;
+
+					/* The BKDG-recommended algorithm causes problems with registered DIMMs on some systems
+					 * due to the long register delays causing premature total delay wrap-around.
+					 * Attempt to work around this...
+					 */
+					SeedTotal = ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f);
+					SeedTotal += RegisterDelay;
+					Seed_Gross = SeedTotal / 32;
+					Seed_Fine = SeedTotal & 0x1f;
+
+					pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
+					pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+
+					printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f));
 				}
-				SeedTotalPreScaling = ((pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
-					(pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] << 5)) - RegisterDelay;
-				/* SeedTotalPreScaling = (the total delay value in F2x[1, 0]9C_x[4A:30] from pass 1 of write levelization
-				training) - RegisterDelay. */
-				SeedTotal = (uint16_t) ((((uint64_t) SeedTotalPreScaling) *
-									fam10h_freq_tab[MemClkFreq] * 100) / (fam10h_freq_tab[3] * 100));
-				Seed_Gross = SeedTotal / 32;
-				Seed_Fine = SeedTotal & 0x1f;
-				if (Seed_Gross == 0)
-					Seed_Gross = 0;
-				else if (Seed_Gross & 0x1)
-					Seed_Gross = 1;
-				else
-					Seed_Gross = 2;
+			}
 
-				/* The BKDG-recommended algorithm causes problems with registered DIMMs on some systems
-				 * due to the long register delays causing premature total delay wrap-around.
-				 * Attempt to work around this...
-				 */
-				SeedTotal = ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f);
-				SeedTotal += RegisterDelay;
-				Seed_Gross = SeedTotal / 32;
-				Seed_Fine = SeedTotal & 0x1f;
+			/* Save initial seeds for upper nibble pass */
+			for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+				pDCTData->WLSeedPreGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedPreGrossDelay[MAX_BYTE_LANES*dimm+ByteLane];
+				pDCTData->WLSeedGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane];
+				pDCTData->WLSeedFinePrevNibble[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane];
+			}
+		} else {
+			/* Restore seed values from lower nibble pass */
+			if (is_fam15h()) {
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+					pDCTData->WLSeedGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+					pDCTData->WLSeedFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedFinePrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+					pDCTData->WLSeedPreGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedPreGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
 
-				pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
-				pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+					pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedPreGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+					pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedFinePrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
 
-				printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f));
+					printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f));
+				}
+			} else {
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+					pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+					pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedFinePrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+
+					printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f));
+				}
 			}
 		}
 	}
@@ -1354,7 +1433,7 @@ void setWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8
 }
 
 /*-----------------------------------------------------------------------------
- *  void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 Dimm)
+ *  void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass, uint8_t nibble)
  *
  *  Description:
  *       This function reads the write levelization byte delay from the Phase
@@ -1372,7 +1451,7 @@ void setWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8
  *
  *-----------------------------------------------------------------------------
  */
-void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass)
+void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass, uint8_t nibble)
 {
 	sDCTStruct *pDCTData = pDCTstat->C_DCTPtr[dct];
 	u8 fineStartLoc, fineEndLoc, grossStartLoc, grossEndLoc, tempB, tempB1, index;
@@ -1423,7 +1502,16 @@ void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8
 			fine = 0;
 		}
 	}
-	pDCTData->WLFineDelay[index+ByteLane] = (u8)fine;
-	pDCTData->WLGrossDelay[index+ByteLane] = (u8)gross;
-	printk(BIOS_SPEW, "\tLane %02x final adjusted value: %04x\n", ByteLane, ((gross & 0x1f) << 5) | (fine & 0x1f));
+	if (nibble == 0) {
+		pDCTData->WLFineDelay[index+ByteLane] = (uint8_t)fine;
+		pDCTData->WLGrossDelay[index+ByteLane] = (uint8_t)gross;
+	} else {
+		uint32_t WLTotalDelay = ((pDCTData->WLGrossDelay[index+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[index+ByteLane] & 0x1f);
+		WLTotalDelay += ((gross & 0x1f) << 5) | (fine & 0x1f);
+		WLTotalDelay /= 2;
+		pDCTData->WLFineDelay[index+ByteLane] = (uint8_t)(WLTotalDelay & 0x1f);
+		pDCTData->WLGrossDelay[index+ByteLane] = (uint8_t)((WLTotalDelay >> 5) & 0x1f);
+	}
+
+	printk(BIOS_SPEW, "\tLane %02x adjusted value: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[index+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[index+ByteLane] & 0x1f));
 }
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h b/src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h
index e14c433..4de7af0 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h
@@ -115,16 +115,21 @@ typedef struct _sDCTStruct
 	u8 DctTrain;			/* Current DCT being trained */
 	u8 CurrDct;			/* Current DCT number (0 or 1) */
 	u8 DctCSPresent;		/* Current DCT CS mapping */
+	uint8_t WrDqsGrossDlyBaseOffset;
 	int32_t WLSeedGrossDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Seed Gross Delay */
 								/* per byte Lane Per Logical DIMM*/
 	int32_t WLSeedFineDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Seed Fine Delay */
 								/* per byte Lane Per Logical DIMM*/
 	int32_t WLSeedPreGrossDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Seed Pre-Gross Delay */
 								/* per byte Lane Per Logical DIMM*/
-	u8 WLGrossDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Gross Delay */
-							/* per byte Lane Per Logical DIMM*/
-	u8 WLFineDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Fine Delay */
-							/* per byte Lane Per Logical DIMM*/
+	uint8_t WLSeedPreGrossPrevNibble[MAX_BYTE_LANES*MAX_LDIMMS];
+	uint8_t WLSeedGrossPrevNibble[MAX_BYTE_LANES*MAX_LDIMMS];
+	uint8_t WLSeedFinePrevNibble[MAX_BYTE_LANES*MAX_LDIMMS];
+								/* Seeds saved from the lower nibble pass, per byte Lane Per Logical DIMM */
+	u8 WLGrossDelay[MAX_BYTE_LANES*MAX_LDIMMS];		/* Write Levelization Gross Delay */
+								/* per byte Lane Per Logical DIMM*/
+	u8 WLFineDelay[MAX_BYTE_LANES*MAX_LDIMMS];		/* Write Levelization Fine Delay */
+								/* per byte Lane Per Logical DIMM*/
 	u8 WLGrossDelayFirstPass[MAX_BYTE_LANES*MAX_LDIMMS];	/* First-Pass Write Levelization Gross Delay */
 								/* per byte Lane Per Logical DIMM*/
 	u8 WLFineDelayFirstPass[MAX_BYTE_LANES*MAX_LDIMMS];	/* First-Pass Write Levelization Fine Delay */



More information about the coreboot-gerrit mailing list