[coreboot] r818 - coreboot-v3/northbridge/amd/k8

svn at coreboot.org svn at coreboot.org
Sun Aug 24 20:19:40 CEST 2008


Author: rminnich
Date: 2008-08-24 20:19:40 +0200 (Sun, 24 Aug 2008)
New Revision: 818

Added:
   coreboot-v3/northbridge/amd/k8/dqs.c
Log:
I can't believe I forgot all these.

Signed-off-by: Ronald G. Minnich <rminnich at gmail.com>
Acked-by: Ronald G. Minnich <rminnich at gmail.com>


Added: coreboot-v3/northbridge/amd/k8/dqs.c
===================================================================
--- coreboot-v3/northbridge/amd/k8/dqs.c	                        (rev 0)
+++ coreboot-v3/northbridge/amd/k8/dqs.c	2008-08-24 18:19:40 UTC (rev 818)
@@ -0,0 +1,2030 @@
+/*
+ * K8
+ * This file is part of the coreboot project.
+ * Copyright (C) 2005-7 YingHai Lu
+ * Copyright (C) 2005 Ollie Lo
+ * Copyright (C) 2005-2007 Stefan Reinauer <stepan at openbios.org>
+ * Copyright (C) 2008 Ronald G. Minnich <rminnich at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA, 02110-1301 USA
+ */
+#include <console.h>
+#include <string.h>
+#include <mtrr.h>
+#include <macros.h>
+#include <spd.h>
+#include <cpu.h>
+#include <msr.h>
+#include <amd/k8/k8.h>
+#include <amd/k8/sysconf.h>
+#include <device/pci.h>
+#include <pci_ops.h>
+#include <mc146818rtc.h>
+#include <lib.h>
+#include <mainboard.h>
+
+#include <spd_ddr2.h>
+/*
+	yhlu 2005.10 dqs training
+*/
+//0: mean no debug info
+#define DQS_TRAIN_DEBUG 1
+
+// always undef this. We only support F2 and later. 
+#undef K8_REV_F_SUPPORT_F0_F1_WORKAROUND
+u32 pci_read_config32_index(u32 dev, u32 index_reg, u32 index);	/* declared here, defined outside this part of the file */
+void pci_write_config32_index(u32 dev, u32 index_reg, u32 index, u32 data);	/* NOTE(review): judging by the names these access a register selected through index_reg -- confirm against the definitions */
+u32 pci_read_config32_index_wait(u32 dev, u32 index_reg, u32 index);	/* the code below always calls the _wait variants with (ctrl->f2, 0x98, index) */
+void pci_write_config32_index_wait(u32 dev, u32 index_reg, u32 index, u32 data);	/* presumably waits for the indexed access to complete -- TODO confirm */
+
+static inline void print_debug_dqs(const char *str, unsigned int val, unsigned level)
+{
+	/*
+	 * Print str followed by val in hex at BIOS_DEBUG, but only when
+	 * DQS_TRAIN_DEBUG is non-zero and strictly greater than level.
+	 *
+	 * No #ifdef is needed here: DQS_TRAIN_DEBUG is a compile-time
+	 * constant, so the compiler can drop the call when it is 0.
+	 */
+	if (!DQS_TRAIN_DEBUG || DQS_TRAIN_DEBUG <= level)
+		return;
+
+	printk(BIOS_DEBUG, "%s%x\n", str, val);
+}
+
+static inline void print_debug_dqs_pair(const char *str, unsigned val, const char *str2, unsigned val2, unsigned level)
+{
+	/*
+	 * Print two label/value pairs on one line (values as 8-digit hex),
+	 * gated the same way as print_debug_dqs(): only when DQS_TRAIN_DEBUG
+	 * is non-zero and strictly greater than level.
+	 */
+	if (!DQS_TRAIN_DEBUG || DQS_TRAIN_DEBUG <= level)
+		return;
+
+	printk(BIOS_DEBUG, "%s%08x%s%08x\n", str, val, str2, val2);
+}
+
+static inline void print_debug_dqs_tsc(const char *str, unsigned i, unsigned val, unsigned val2, unsigned level)
+{
+	/*
+	 * Print "str[i]=" followed by val and val2 as two 8-digit hex words,
+	 * only when DQS_TRAIN_DEBUG is non-zero and strictly greater than
+	 * level.
+	 */
+	if (!DQS_TRAIN_DEBUG || DQS_TRAIN_DEBUG <= level)
+		return;
+
+	printk(BIOS_DEBUG, "%s[%02x]=%08x%08x\n", str, i, val, val2);
+}
+
+static inline void print_debug_dqs_tsc_x(const char *str, unsigned i, unsigned val, unsigned val2)
+{
+	/*
+	 * Ungated twin of print_debug_dqs_tsc(): same output format, but it
+	 * takes no level argument and always prints.
+	 */
+	printk(BIOS_DEBUG,
+	       "%s[%02x]=%08x%08x\n",
+	       str, i, val, val2);
+}
+
+static void fill_mem_cs_sysinfo(unsigned nodeid, const struct mem_controller *ctrl, struct sys_info *sysinfo)
+{
+	/*
+	 * Snapshot into sysinfo the config registers that the address
+	 * helpers in this file consult later:
+	 *   f1, 0x40 + nodeid*8        -> mem_base[nodeid]
+	 *   f2, 0x40 + cs*4 (cs 0..7)  -> cs_base[nodeid*8 + cs]
+	 *   f1, 0xf0                   -> hole_reg[nodeid]
+	 */
+	const unsigned first_cs = nodeid * 8;
+	unsigned cs;
+
+	sysinfo->mem_base[nodeid] = pci_conf1_read_config32(ctrl->f1, 0x40 + nodeid * 8);
+
+	for (cs = 0; cs < 8; cs++)
+		sysinfo->cs_base[first_cs + cs] = pci_conf1_read_config32(ctrl->f2, 0x40 + cs * 4);
+
+	sysinfo->hole_reg[nodeid] = pci_conf1_read_config32(ctrl->f1, 0xf0);
+}
+static unsigned Get_MCTSysAddr(const struct mem_controller *ctrl,  unsigned cs_idx, struct sys_info *sysinfo)
+{
+	/*
+	 * Compute the system address used to test chip select cs_idx of
+	 * ctrl's node, from the register copies cached in sysinfo.
+	 *
+	 * The result is the upper 32 bits of the address: callers in this
+	 * file use (result >> 24) as the high dword of FS base and
+	 * (result << 8) as the low 32-bit offset.
+	 */
+	const unsigned node = ctrl->node_id;
+	u32 sysaddr;
+
+	/* local base of the chip select, low nibble masked off */
+	sysaddr = sysinfo->cs_base[node * 8 + cs_idx] & 0xfffffff0;
+
+	/* sys addr = node base + local cs base */
+	sysaddr += sysinfo->mem_base[node] & 0xffff0000;
+
+#if HW_MEM_HOLE_SIZEK != 0
+	{
+		const u32 hole = sysinfo->hole_reg[node];
+
+		/* bit 0 set: a hole is configured, top byte holds its start */
+		if (hole & 1) {
+			const unsigned hole_startk = (hole & (0xff<<24)) >> 10;
+
+			/* addresses from the hole start up to 4GB are moved above 4GB */
+			if ((sysaddr >= (hole_startk<<2)) && (sysaddr < ((4*1024*1024)<<2)))
+				sysaddr += ((4*1024*1024 - hole_startk)<<2);
+		}
+	}
+#endif
+
+	/* add 1MB offset to avoid compat area */
+	sysaddr += (1<<(20-8));
+
+	return sysaddr;
+}
+
+static unsigned Get_RcvrSysAddr(const struct mem_controller * ctrl, unsigned channel, unsigned cs_idx, struct sys_info *sysinfo)
+{
+	/*
+	 * Test address for receiver-enable training of chip select cs_idx.
+	 * This is Get_MCTSysAddr() unchanged; the channel argument is
+	 * accepted but has no effect on the result.
+	 */
+	const unsigned sysaddr = Get_MCTSysAddr(ctrl, cs_idx, sysinfo);
+
+	return sysaddr;
+}
+
+static inline unsigned long read_cr4(void)
+{
+	/* Return the current contents of control register CR4. */
+	unsigned long val;
+
+	asm volatile (
+		"movl %%cr4, %0"
+		: "=r" (val)
+	);
+
+	return val;
+}
+
+static inline void write_cr4(unsigned long cr4)
+{
+	/* Load control register CR4 with the given value. */
+	asm volatile (
+		"movl %0, %%cr4"
+		: /* no outputs */
+		: "r" (cr4)
+	);
+}
+
+
+static inline void enable_sse2(void)
+{
+	/*
+	 * Set bit 9 of CR4 and leave every other bit as it was.
+	 * disable_sse2() clears the same bit again.
+	 */
+	write_cr4(read_cr4() | (1<<9));
+}
+
+static inline void disable_sse2(void)
+{
+	/*
+	 * Clear bit 9 of CR4 (the bit enable_sse2() sets) and leave every
+	 * other bit as it was. The bit is cleared unconditionally; its
+	 * earlier state is not remembered.
+	 */
+	const u32 now = read_cr4();
+
+	write_cr4(now & ~(1<<9));
+}
+
+
+static void set_wrap32dis(void) {
+	/*
+	 * Read-modify-write of MSR 0xc0010015: set bit 17 of the low dword,
+	 * keep everything else. clear_wrap32dis() undoes this.
+	 */
+	struct msr hwcr = rdmsr(0xc0010015);
+
+	hwcr.lo = hwcr.lo | (1<<17);
+	wrmsr(0xc0010015, hwcr);
+}
+
+static void clear_wrap32dis(void) {
+	/*
+	 * Read-modify-write of MSR 0xc0010015: clear bit 17 of the low
+	 * dword (the bit set_wrap32dis() sets), keep everything else.
+	 */
+	struct msr hwcr = rdmsr(0xc0010015);
+
+	hwcr.lo = hwcr.lo & ~(1<<17);
+	wrmsr(0xc0010015, hwcr);
+}
+
+static void set_FSBASE(u32 addr_hi)
+{
+	/*
+	 * Point the FS segment base at (addr_hi << 32): addr_hi goes into
+	 * the high dword of the FS_BASE MSR, the low dword is zero. The
+	 * test routines then reach memory with an %fs prefix and a 32-bit
+	 * offset.
+	 */
+	struct msr fs_base;
+
+	fs_base.lo = 0;
+	fs_base.hi = addr_hi;
+
+	wrmsr(0xc0000100, fs_base); //FS_BASE
+}
+
+static unsigned ChipSelPresent(const struct mem_controller *ctrl, unsigned cs_idx, struct sys_info *sysinfo)
+{
+	/*
+	 * Return 1 when bit 0 of the cached chip-select base register for
+	 * (ctrl's node, cs_idx) is set, otherwise 0.
+	 */
+	const unsigned slot = ctrl->node_id * 8 + cs_idx;
+	const unsigned cs_base = sysinfo->cs_base[slot];
+
+	return cs_base & 1;
+}
+
+static unsigned RcvrRankEnabled(const struct mem_controller *ctrl, int channel, int cs_idx, unsigned is_Width128, struct sys_info *sysinfo)
+{
+	/*
+	 * Tell whether the rank behind cs_idx should be trained on the
+	 * given channel: never on channel 1 unless the controller is in
+	 * 128-bit mode, otherwise whenever the chip select is present.
+	 */
+	/* FIXME: process 64Muxed */
+	if (!is_Width128 && channel)
+		return 0; // no channel b
+
+	return ChipSelPresent(ctrl, cs_idx, sysinfo);
+}
+
+static void WriteLNTestPattern(unsigned addr_lo, u8 *buf_a, unsigned line_num)
+{
+	/*
+	 * Copy line_num * 64 bytes from buf_a to %fs:addr_lo, 16 bytes per
+	 * step, using an aligned SSE load (movdqa) and a non-temporal store
+	 * (movntdq).
+	 *
+	 * addr_lo:  32-bit offset from the FS base; the caller must have
+	 *           set the base already (see set_FSBASE()).
+	 * buf_a:    source buffer; movdqa requires 16-byte alignment.
+	 * line_num: number of 64-byte lines to write.
+	 *
+	 * The asm advances the destination offset, the source pointer and
+	 * the loop counter, so all three are declared as read-write
+	 * operands; the previous version listed them as plain inputs, which
+	 * tells the compiler their registers are unchanged. The "memory"
+	 * clobber makes sure the pattern stored into buf_a by C code is
+	 * really in memory before it is read here.
+	 *
+	 * NOTE(review): xmm0 is overwritten and not declared as clobbered;
+	 * add an "xmm0" clobber if this file is ever built with SSE code
+	 * generation enabled.
+	 */
+	unsigned count = line_num * 4;	/* four 16-byte stores per line */
+
+	/* "loop" decrements before it tests, so 0 would run 2^32 times */
+	if (count == 0)
+		return;
+
+	__asm__ volatile (
+		"1:\n\t"
+		"movdqa (%2), %%xmm0\n\t"
+		"movntdq %%xmm0, %%fs:(%0)\n\t" /* xmm0 is 128 bit */
+		"addl $16, %0\n\t"
+		"addl $16, %2\n\t"
+		"loop 1b\n\t"
+		: "+a" (addr_lo), "+c" (count), "+r" (buf_a)
+		:
+		: "memory", "cc"
+	);
+}
+
+void Write1LTestPattern(unsigned addr, unsigned p, u8 *buf_a, u8 *buf_b)
+{
+	/*
+	 * Write one 64-byte line of test pattern to the system address
+	 * addr (upper 32 bits of the address, as returned by
+	 * Get_MCTSysAddr()). p == 1 selects buf_b as the source, any other
+	 * value selects buf_a.
+	 */
+	u8 *src = (p == 1) ? buf_b : buf_a;
+
+	set_FSBASE(addr>>24);
+	WriteLNTestPattern(addr<<8, src, 1);
+}
+
+void Read1LTestPattern(unsigned addr)
+{
+	/*
+	 * Read one dword at the system address addr (upper 32 bits of the
+	 * address) through the FS segment. The value itself is thrown
+	 * away; the access is made for its cache-fill side effect.
+	 */
+	const unsigned offset = addr<<8;
+	unsigned discard;
+
+	set_FSBASE(addr>>24);
+
+	/* 1st move causes read fill (to exclusive or shared)*/
+	__asm__ volatile (
+		"pushl %%ebx\n\t"
+		"movl %%fs:(%1), %0\n\t"
+		"popl %%ebx\n\t"
+		: "=r" (discard)
+		: "a" (offset)
+	);
+}
+
+#define DQS_PASS 0	/* result of a pattern compare: data matched */
+#define DQS_FAIL 1	/* result of a pattern compare: data did not match */
+
+#define DQS_FIRST_PASS 1	/* value of the Pass argument: first training pass */
+#define DQS_SECOND_PASS 2	/* value of the Pass argument: second training pass */
+
+#define SB_NORCVREN 11	/* ORed into Errors by TrainRcvrEn when no passing receiver-enable delay was found */
+#define RCVREN_MARGIN 6	/* only referenced from commented-out expressions in the code visible here */
+#define SB_SmallRCVR 13	/* ORed into Errors by TrainRcvrEn when the delay had to be capped at 0xae */
+#define SB_CHA2BRCVREN 12	/* ORed into Errors by TrainRcvrEn when channel A and B delays differ by more than T1000 */
+#define SB_NODQSPOS  14	/* NOTE(review): the SB_* values look like status bit numbers, but TrainRcvrEn ORs them in unshifted -- confirm intent */
+#define MIN_DQS_WNDW 3	/* not used in the part of the file reviewed here */
+#define SB_SMALLDQS 15	/* not used in the part of the file reviewed here */
+
+
+static unsigned CompareTestPatternQW0(unsigned channel, unsigned addr, unsigned pattern, const u32 *TestPattern0, const u32 *TestPattern1, const u32 *TestPattern2, unsigned Pass, unsigned is_Width128)
+{
+	/*
+	 * Compare the first quadword at system address addr with the
+	 * expected test pattern and return DQS_PASS or DQS_FAIL.
+	 *
+	 * Expected data: on the first pass TestPattern1 when pattern == 1,
+	 * else TestPattern0; on any other pass TestPattern2. In 128-bit
+	 * mode channel 1 is checked 8 bytes further on, in memory and in
+	 * the pattern alike.
+	 *
+	 * The high dword is only read when the low dword matched. On the
+	 * second pass the outcome is inverted before it is returned.
+	 */
+	u32 *expect;
+	u32 offset = addr<<8;
+	u32 got;
+	u32 want;
+	unsigned qw_matches = 0;
+
+	if (Pass != DQS_FIRST_PASS)
+		expect = (u32 *)TestPattern2;
+	else if (pattern == 1)
+		expect = (u32 *)TestPattern1;
+	else
+		expect = (u32 *)TestPattern0;
+
+	set_FSBASE(addr>>24);
+
+	if (is_Width128 && (channel == 1)) {
+		offset += 8; //second channel
+		expect += 2;
+	}
+
+	/* low dword */
+	__asm__ volatile (
+		"movl %%fs:(%1), %0\n\t"
+		: "=c" (got)
+		: "a" (offset)
+	);
+	want = expect[0];
+
+	print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : test_buf= ", (unsigned)expect, " value = ", want, 4);
+	print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : addr_lo = ", offset, " value = ", got, 4);
+
+	if (got == want) {
+		/* high dword */
+		offset += 4;
+		expect++;
+		__asm__ volatile (
+			"movl %%fs:(%1), %0\n\t"
+			: "=c" (got)
+			: "a" (offset)
+		);
+		want = expect[0];
+
+		print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : test_buf= ", (unsigned)expect, " value = ", want, 4);
+		print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : addr_lo = ", offset, " value = ", got, 4);
+
+		qw_matches = (got == want);
+	}
+
+	/* second pass need to be inverted */
+	if (Pass == DQS_SECOND_PASS)
+		qw_matches = !qw_matches;
+
+	return qw_matches ? DQS_PASS : DQS_FAIL;
+}
+
+static void SetMaxAL_RcvrDly(const struct mem_controller *ctrl, unsigned dly)
+{
+	/*
+	 * Program the MaxAsyncLat field of DRAM_CONFIG_HIGH from a receiver
+	 * enable delay. dly is in units of 50ps; it is rounded up to whole
+	 * ns, 6 is added, and the result minus DCH_MaxAsyncLat_BASE is
+	 * written into the field. All other bits are preserved.
+	 */
+	const unsigned latency = (dly + (20-1)) / 20 + 6;
+	u32 dch;
+
+	dch = pci_conf1_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
+	dch = (dch & ~(DCH_MaxAsyncLat_MASK <<DCH_MaxAsyncLat_SHIFT))
+		| ((latency - DCH_MaxAsyncLat_BASE) << DCH_MaxAsyncLat_SHIFT);
+	pci_conf1_write_config32(ctrl->f2, DRAM_CONFIG_HIGH, dch);
+}
+
+/*
+ * Set the target range to WT IO, using an IORR that overlaps the already
+ * existing WB dram type. IORR0 is used: its base MSR gets the address
+ * (addr is the upper 32 bits of the system address) and its mask MSR is
+ * programmed with a 64MB mask.
+ */
+static void SetTargetWTIO(unsigned addr)
+{
+	struct msr base;
+	struct msr mask;
+
+	base.hi = addr>>24;
+	base.lo = addr<<8;
+	wrmsr(0xc0010016, base); //IORR0 BASE
+
+	mask.hi = 0xff;
+	mask.lo = 0xfc000800;  // 64MB Mask
+	wrmsr(0xc0010017, mask); // IORR0 Mask
+}
+
+static void ResetTargetWTIO(void)
+{
+	/*
+	 * Undo SetTargetWTIO() by writing zero to the IORR0 mask MSR. The
+	 * IORR0 base MSR is left as it is.
+	 */
+	struct msr cleared;
+
+	cleared.lo = 0;
+	cleared.hi = 0;
+
+	wrmsr(0xc0010017, cleared); // IORR0 Mask
+}
+
+static void proc_CLFLUSH(unsigned addr)
+{
+	/*
+	 * Execute clflush on the line at system address addr (upper 32
+	 * bits of the address), reached through the FS segment.
+	 */
+	const unsigned offset = addr<<8;
+
+	set_FSBASE(addr>>24);
+
+	__asm__ volatile (
+		/* clflush fs:[eax] */
+		"clflush %%fs:(%0)\n\t"
+		: /* no outputs */
+		: "a" (offset)
+	);
+}
+static void proc_IOCLFLUSH(unsigned addr)
+{
+	/*
+	 * Flush the line at addr while the range around it is temporarily
+	 * covered by IORR0: set up the IORR, clflush, then clear the IORR
+	 * mask again. The three steps must stay in this order.
+	 */
+	SetTargetWTIO(addr);
+
+	proc_CLFLUSH(addr);
+
+	ResetTargetWTIO();
+}
+
+static void ResetDCTWrPtr(const struct mem_controller *ctrl)
+{
+	/*
+	 * Read index register 0x10 behind F2 offset 0x98 and write the
+	 * same value straight back. NOTE(review): going by the function
+	 * name, the write is what resets the DCT write pointer -- confirm
+	 * against the BKDG.
+	 */
+	const unsigned index = 0x10;
+
+	pci_write_config32_index_wait(ctrl->f2, 0x98, index,
+		pci_read_config32_index_wait(ctrl->f2, 0x98, index));
+}
+
+
+static u16 get_exact_T1000(unsigned i)
+{
+	/*
+	 * Return the memory clock cycle time for memory clock selector i
+	 * (column order 200, 266, 333, 400).
+	 *
+	 * The value comes from TT_a, which has one row per value of
+	 * (current FID >> 1); the rows are labelled 4..15 below. The FID is
+	 * the low 6 bits of MSR 0xc0010042. When the FID lies beyond the
+	 * table, the nominal value from T1000_a is returned instead.
+	 *
+	 * i must be 0..3; it is not range-checked here.
+	 *
+	 * The bound used to be "index > 12", which let index 12 through and
+	 * read one row past the end of the 12-row table. It is now derived
+	 * from the table size.
+	 */
+	//                                     200   266,   333,  400
+	static const u16 T1000_a[]= { 5000, 3759, 3003, 2500 };
+
+	static const u16 TT_a[] = {
+		 /*200   266   333   400 */
+	 /*4 */   6250, 6250, 6250, 6250,
+	 /*5 */   5000, 5000, 5000, 2500,
+	 /*6 */   5000, 4166, 4166, 2500,
+	 /*7 */   5000, 4285, 3571, 2500,
+
+	 /*8 */   5000, 3750, 3125, 2500,
+	 /*9 */   5000, 3888, 3333, 2500,
+	 /*10*/   5000, 4000, 3000, 2500,
+	 /*11*/   5000, 4090, 3181, 2500,
+
+	 /*12*/   5000, 3750, 3333, 2500,
+	 /*13*/   5000, 3846, 3076, 2500,
+	 /*14*/   5000, 3928, 3214, 2500,
+	 /*15*/   5000, 4000, 3000, 2500,
+	};
+
+	enum {
+		TT_COLS = 4,
+		TT_ROWS = sizeof(TT_a) / sizeof(TT_a[0]) / TT_COLS
+	};
+
+	unsigned fid_cur;
+	unsigned index;
+	struct msr msr;
+
+	msr = rdmsr(0xc0010042);
+	fid_cur = msr.lo & 0x3f;
+
+	index = fid_cur>>1;
+
+	/* no table row for this FID: fall back to the nominal figure */
+	if (index >= TT_ROWS)
+		return T1000_a[i];
+
+	return TT_a[index * TT_COLS + i];
+}
+
+static void InitDQSPos4RcvrEn(const struct mem_controller *ctrl)
+{
+	/*
+	 * Preset the DQS timing index registers (behind F2 offset 0x98)
+	 * before receiver-enable training:
+	 *   index 0x01-0x03 and 0x21-0x23 (DQS write timing) -> 0x00 in every byte
+	 *   index 0x05-0x07 and 0x25-0x27                    -> 0x2f in every byte
+	 * Each index is written together with its +0x20 counterpart.
+	 */
+	unsigned idx;
+
+	for (idx = 0x01; idx <= 0x03; idx++) {
+		pci_write_config32_index_wait(ctrl->f2, 0x98, idx, 0x00000000);
+		pci_write_config32_index_wait(ctrl->f2, 0x98, idx + 0x20, 0x00000000);
+	}
+
+	for (idx = 0x05; idx <= 0x07; idx++) {
+		pci_write_config32_index_wait(ctrl->f2, 0x98, idx, 0x2f2f2f2f);
+		pci_write_config32_index_wait(ctrl->f2, 0x98, idx + 0x20, 0x2f2f2f2f);
+	}
+}
+
+static unsigned TrainRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
+{	/*
+	 * DQS receiver enable training for one memory controller.
+	 *
+	 * For each channel (0, 1) and each receiver pair (0, 2, 4, 6) whose
+	 * rank is enabled, test patterns are written to memory and the
+	 * receiver enable delay is swept upwards (to at most 0xae) until the
+	 * patterns read back correctly for the first time. That delay is
+	 * stored in sysinfo->dqs_rcvr_dly_a and programmed into the index
+	 * register behind F2 offset 0x98.
+	 *
+	 * Pass == DQS_FIRST_PASS: the sweep starts at 0 and uses the
+	 * 0xaa.. / 0x55.. patterns. Any other Pass: the sweep starts at the
+	 * stored delay and uses TestPattern2; for DQS_SECOND_PASS the new
+	 * result is averaged with the stored one.
+	 *
+	 * While training runs, CR4 bit 9 and wrap32dis are set and DIMM ECC
+	 * is switched off; all three are undone before returning.
+	 *
+	 * Returns 1 when the largest delay found equals 0xae, otherwise 0.
+	 * Errors only ends the loops early; it is not returned.
+	 */
+
+	static const u32 TestPattern0[] = {
+			0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+			0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+			0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+			0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+		};
+        static const u32 TestPattern1[] = {
+                        0x55555555, 0x55555555, 0x55555555, 0x55555555,
+			0x55555555, 0x55555555, 0x55555555, 0x55555555,
+			0x55555555, 0x55555555, 0x55555555, 0x55555555,
+			0x55555555, 0x55555555, 0x55555555, 0x55555555,
+		};
+	static const u32 TestPattern2[] = { 
+			0x12345678, 0x87654321, 0x23456789, 0x98765432,
+			0x59385824, 0x30496724, 0x24490795, 0x99938733,
+                        0x40385642, 0x38465245, 0x29432163, 0x05067894,
+                        0x12349045, 0x98723467, 0x12387634, 0x34587623,
+		};
+
+	u8 pattern_buf_x[64 * 4 + 16]; // holds buf_a and buf_b (= buf_a + 128); the extra 16 bytes allow buf_a to be aligned to 16 bytes
+	u8 *buf_a, *buf_b; 
+	u32 ecc_bit;
+	u32 dword;
+	u8 *dqs_rcvr_dly_a = &sysinfo->dqs_rcvr_dly_a[ctrl->node_id * 2* 8] ; //8 node, channel 2, receiver 8
+
+	int i;
+
+	unsigned channel, receiver;
+
+	unsigned Errors;
+	unsigned CTLRMaxDelay;
+	unsigned T1000;
+
+	unsigned LastTest;
+	unsigned CurrTest;
+	unsigned Test0, Test1;
+
+	unsigned RcvrEnDlyRmin;
+
+	unsigned two_ranks;
+	unsigned RcvrEnDly;
+
+	unsigned PatternA;
+	unsigned PatternB;
+
+	unsigned long TestAddr0, TestAddr0B;
+	unsigned long TestAddr1 = 0;/* warning: this was not set in the original code */
+	unsigned long TestAddr1B = 0;/* warning: this was not set in the original code */
+
+	unsigned long CurrRcvrCHADelay = 0; /* warning: this was not set in the original code */
+
+	unsigned tmp;
+
+	unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
+
+	if(Pass == DQS_FIRST_PASS) {
+		InitDQSPos4RcvrEn(ctrl);
+	}
+
+	//enable SSE2
+	enable_sse2();
+
+	//wrap32dis
+	set_wrap32dis();
+
+	//disable ECC temp
+	dword = pci_conf1_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
+	ecc_bit = dword & DCL_DimmEccEn; /* remembered so it can be put back after training */
+	dword &= ~(DCL_DimmEccEn); 
+	pci_conf1_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
+
+
+	if(Pass == DQS_FIRST_PASS) {
+#ifdef K8_REV_F_SUPPORT_F0_F1_WORKAROUND
+	cpu_f0_f1 = is_cpu_pre_f2_in_bsp(ctrl->node_id);
+	if(!cpu_f0_f1) 
+#endif
+	{
+#if 1
+		/* Set the DqsRcvEnTrain bit */
+		dword = pci_conf1_read_config32(ctrl->f2, DRAM_CTRL);
+		dword |= DC_DqsRcvEnTrain;
+		pci_conf1_write_config32(ctrl->f2, DRAM_CTRL, dword);
+#endif
+	}
+	}
+
+	//get T1000 figures (cycle time (ns)) * 1K
+	dword = pci_conf1_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
+	dword &= DCH_MemClkFreq_MASK;
+
+	T1000 = get_exact_T1000(dword); 
+
+	// SetupRcvrPattern 
+	buf_a = (u8 *)(((u32)(&pattern_buf_x[0]) + 0x10) & (0xfffffff0)); /* next 16-byte boundary inside pattern_buf_x */
+        buf_b = buf_a + 128; // second pattern, 128 bytes after the first
+	if(Pass==DQS_FIRST_PASS) {
+	        for(i=0;i<16;i++) {
+        	        *((u32 *)(buf_a + i*4)) = TestPattern0[i];
+	                *((u32 *)(buf_b + i*4)) = TestPattern1[i];
+        	}
+	}
+	else {
+                for(i=0;i<16;i++) {
+                        *((u32 *)(buf_a + i*4)) = TestPattern2[i];
+			*((u32 *)(buf_b + i*4)) = TestPattern2[i];
+                }
+	}
+
+	printk(BIOS_DEBUG, "\nTrainRcvEn: 0 ctrl 0x%x %d\n", ctrl->node_id, 0);
+
+	printk(BIOS_DEBUG, "TrainRcvEn: buf_a:0x%x\n", *buf_a); 
+
+	Errors = 0;
+	/* for each channel */
+	CTLRMaxDelay = 0;
+	for(channel = 0; (channel < 2) && (!Errors); channel++) 
+	{ 
+		printk(BIOS_DEBUG, "\tTrainRcvEn51: channel  0x%x %d\n",channel, 1); 
+		
+		/* for each rank */ 
+		/* there are four receiver pairs, loosely associated with CS */ 
+		for( receiver = 0; (receiver < 8) && (!Errors); receiver+=2) 
+		{
+			/* index register holding this receiver pair's delay: 0x10, 0x13, 0x16, 0x19 */
+			unsigned index=(receiver>>1) * 3 + 0x10;
+
+	                printk(BIOS_DEBUG, "\t\tTrainRcvEn52: index  0x%x %d\n", index, 2); 
+
+			if(is_Width128) {
+				if(channel) {
+					dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
+					CurrRcvrCHADelay= dword & 0xff; /* keep channel A's delay; the sweep below overwrites this register */
+				}
+			}
+			else {
+				if(channel) { 
+					index += 0x20;
+				}
+			}	
+
+			LastTest = DQS_FAIL;
+			RcvrEnDlyRmin = 0xaf; /* 0xaf means: no passing delay found */
+				
+			if(!RcvrRankEnabled(ctrl, channel, receiver, is_Width128, sysinfo)) continue;
+
+			/* for each DQS receiver enable setting */
+	
+			TestAddr0 = Get_RcvrSysAddr(ctrl, channel, receiver, sysinfo);
+
+			TestAddr0B = TestAddr0 + (1<<(20+2-8)); // 4MB
+	
+			if(RcvrRankEnabled(ctrl, channel, receiver+1, is_Width128, sysinfo)) {
+				TestAddr1 = Get_RcvrSysAddr(ctrl, channel, receiver+1, sysinfo);
+				TestAddr1B = TestAddr1 + (1<<(20+2-8)); //4MB
+				two_ranks = 1;
+			}
+			else {
+				two_ranks = 0;
+			}
+
+	                printk(BIOS_DEBUG, "\t\tTrainRcvEn53: TestAddr0B  0x%lx %d\n", TestAddr0B, 2); 
+
+			Write1LTestPattern(TestAddr0, 0, buf_a, buf_b); // rank0 of dimm, test p0
+			Write1LTestPattern(TestAddr0B, 1, buf_a, buf_b); //rank0 of dimm, test p1
+
+			if(two_ranks == 1) {
+				Write1LTestPattern(TestAddr1, 0, buf_a, buf_b); //rank 1 of dimm
+				Write1LTestPattern(TestAddr1B, 1, buf_a, buf_b);//rank 1 of dimm
+			}
+
+			if(Pass == DQS_FIRST_PASS) {
+				RcvrEnDly = 0; 
+			} else {
+				RcvrEnDly = dqs_rcvr_dly_a[channel * 8 + receiver]; /* resume from the stored delay */
+			}
+
+			while ( RcvrEnDly < 0xaf) { // Sweep Delay value here
+		                printk(BIOS_DEBUG, "\t\t\tTrainRcvEn541: RcvrEnDly  0x%x %d\n", RcvrEnDly, 3);
+
+				if(RcvrEnDly & 1) {
+					/* Odd steps get another pattern such that even
+					   and odd steps alternate.
+					   The pointers to the patterns will be swapped
+					   at the end of the loop so that they correspond
+					*/
+					PatternA = 1;
+					PatternB = 0;
+				}
+				else {
+					/* Even step */
+					PatternA = 0;
+					PatternB = 1;
+				}
+
+				/* Program current Receiver enable delay */
+				pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
+				/* FIXME: 64bit MUX */
+	
+				if(is_Width128) {
+					/* Program current Receiver enable delay channel b */
+					pci_write_config32_index_wait(ctrl->f2, 0x98, index+ 0x20, RcvrEnDly);
+				}
+			
+                                /* Program the MaxAsyncLat field with the
+                                   current DQS receiver enable setting plus 6ns
+                                */	
+				/* Program MaxAsyncLat to correspond with current delay */
+				SetMaxAL_RcvrDly(ctrl, RcvrEnDly);
+
+				CurrTest = DQS_FAIL; /* becomes DQS_PASS only if every compare below passes */
+
+				Read1LTestPattern(TestAddr0);  //Cache Fill
+				/* ROM vs cache compare */
+				Test0 = CompareTestPatternQW0(channel, TestAddr0, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
+				proc_IOCLFLUSH(TestAddr0);
+
+				ResetDCTWrPtr(ctrl);
+
+				printk(BIOS_DEBUG, "\t\t\tTrainRcvEn542: Test0  0x%x %d\n", Test0, 3); 
+
+				if(Test0 == DQS_PASS) {
+
+					Read1LTestPattern(TestAddr0B);
+                               		Test1 = CompareTestPatternQW0(channel, TestAddr0B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
+					proc_IOCLFLUSH(TestAddr0B);
+
+					ResetDCTWrPtr(ctrl);
+
+					printk(BIOS_DEBUG, "\t\t\tTrainRcvEn543: Test1  0x%x %d\n", Test1, 3); 
+					
+					if(Test1 == DQS_PASS) {
+						if(two_ranks) {
+							Read1LTestPattern(TestAddr1);
+                                			Test0 = CompareTestPatternQW0(channel, TestAddr1, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
+                                			proc_IOCLFLUSH(TestAddr1);
+                                			ResetDCTWrPtr(ctrl);
+
+                                			if(Test0 == DQS_PASS) {
+                                        			Read1LTestPattern(TestAddr1B);
+                                        			Test1 = CompareTestPatternQW0(channel, TestAddr1B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
+                                        			proc_IOCLFLUSH(TestAddr1B);
+                                        			ResetDCTWrPtr(ctrl);
+
+                                        			if(Test1 == DQS_PASS) {
+                                                        		CurrTest = DQS_PASS;
+                                                		}
+                                        		} 
+							printk(BIOS_DEBUG, "\t\t\tTrainRcvEn544: Test0  0x%x %d\n", Test0, 3); 
+						}
+						else {
+							CurrTest = DQS_PASS;
+						}
+					}
+				}
+
+				printk(BIOS_DEBUG, "\t\t\tTrainRcvEn55: RcvrEnDly  0x%x %d\n", RcvrEnDly, 3); 
+
+				if(CurrTest == DQS_PASS) {
+					if(LastTest == DQS_FAIL) {
+						RcvrEnDlyRmin = RcvrEnDly; /* first delay that passes after a failing one */
+						break;
+					}
+				}
+				
+				LastTest = CurrTest;
+				
+				/* swap the rank 0 pointers */
+				tmp = TestAddr0;
+				TestAddr0 = TestAddr0B;
+				TestAddr0B = tmp;
+
+				/* swap the rank 1 pointers */
+                                tmp = TestAddr1;
+                                TestAddr1 = TestAddr1B;
+                                TestAddr1B = tmp;
+
+				printk(BIOS_DEBUG, "\t\t\tTrainRcvEn56: RcvrEnDly  0x%x %d\n", RcvrEnDly, 3); 
+				
+				RcvrEnDly++;
+				
+			} // while RcvrEnDly
+
+			printk(BIOS_DEBUG, "\t\tTrainRcvEn61: RcvrEnDly  0x%x %d\n", RcvrEnDly, 2); 
+
+			if(RcvrEnDlyRmin == 0xaf) {
+				//no passing window
+				Errors |= SB_NORCVREN; /* NOTE(review): SB_* look like bit numbers but are ORed in unshifted; Errors is only tested for non-zero in the loop conditions -- confirm intent */
+			}
+
+                        if(Pass == DQS_FIRST_PASS) {
+                                // We need a better value for DQSPos training
+                                RcvrEnDly = RcvrEnDlyRmin /* + RCVREN_MARGIN * T1000/64/50 */;
+                        } else {
+                                RcvrEnDly = RcvrEnDlyRmin;
+                        }
+
+                        if(RcvrEnDly > 0xae) {
+                                //passing window too narrow, too far delayed
+                                Errors |= SB_SmallRCVR;
+                                RcvrEnDly = 0xae;
+                        }
+
+                        if(Pass == DQS_SECOND_PASS) { //second pass must average values
+                                RcvrEnDly += dqs_rcvr_dly_a[channel * 8 + receiver] /* - (RCVREN_MARGIN * T1000/64/50)*/;
+                                RcvrEnDly >>= 1;
+                        }
+		
+			dqs_rcvr_dly_a[channel * 8 + receiver] = RcvrEnDly; 
+	
+			//Set final RcvrEnDly for this DIMM and Channel	
+			pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
+		
+			if(is_Width128) {
+				pci_write_config32_index_wait(ctrl->f2, 0x98, index+0x20, RcvrEnDly); // channel B
+				if(channel) { 
+					pci_write_config32_index_wait(ctrl->f2, 0x98, index, CurrRcvrCHADelay); /* put back the channel A delay saved before the sweep */
+					if(RcvrEnDly > CurrRcvrCHADelay) {
+						dword = RcvrEnDly - CurrRcvrCHADelay;	
+					}
+					else {
+						dword = CurrRcvrCHADelay - RcvrEnDly;
+					}
+					dword *= 50; /* A/B delay difference times 50, compared against T1000 below */
+					if(dword > T1000) {
+						Errors |= SB_CHA2BRCVREN;
+					}
+				}
+			}
+
+			printk(BIOS_DEBUG, "\t\tTrainRcvEn63: RcvrEnDly  0x%x %d\n", RcvrEnDly, 2); 
+
+			if(RcvrEnDly > CTLRMaxDelay) {
+				CTLRMaxDelay = RcvrEnDly;
+			}
+
+			printk(BIOS_DEBUG, "\t\tTrainRcvEn64: CTLRMaxDelay  0x%x %d\n", CTLRMaxDelay, 2); 
+			
+		} /* receiver */
+	} /* channel */
+
+	printk(BIOS_DEBUG, "\tTrainRcvEn65: CTLRMaxDelay  0x%x %d\n", CTLRMaxDelay, 1); 
+
+        /* Program the MaxAsyncLat field with the largest DQS Receiver Enable setting */
+	SetMaxAL_RcvrDly(ctrl, CTLRMaxDelay);
+	ResetDCTWrPtr(ctrl);
+
+	//Enable ECC again 
+        dword = pci_conf1_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
+        dword &= ~(DCL_DimmEccEn);
+	dword |= ecc_bit;
+        pci_conf1_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
+
+	if(Pass == DQS_FIRST_PASS) {
+#ifdef K8_REV_F_SUPPORT_F0_F1_WORKAROUND
+	if(!cpu_f0_f1) 
+#endif
+	{
+		dword = pci_conf1_read_config32(ctrl->f2, DRAM_CTRL);
+	        dword &= ~DC_DqsRcvEnTrain;
+	        pci_conf1_write_config32(ctrl->f2, DRAM_CTRL, dword);
+	}
+	}
+
+	//Clear wrap32dis 
+
+	clear_wrap32dis();
+
+	//restore SSE2 setting
+	disable_sse2();
+
+#if MEM_TRAIN_SEQ != 1  
+	/* We need tidy output for type 1 */
+	printk(BIOS_DEBUG, " CTLRMaxDelay=%02x", CTLRMaxDelay);
+#endif
+
+	return (CTLRMaxDelay==0xae)?1:0; /* 1 only when the largest delay is exactly the 0xae cap */
+
+}
+
+#define DQS_READDIR 1	/* direction argument: read; SetDQSDelayCSR()/SetDQSDelayAllCSR() then use index registers 4 higher than for write */
+#define DQS_WRITEDIR 0	/* direction argument: write */
+
+
+static void SetDQSDelayCSR(const struct mem_controller *ctrl, unsigned channel, unsigned bytelane, unsigned direction, unsigned dqs_delay)
+{ //ByteLane could be 0-8, last is for ECC
+	/*
+	 * Program the DQS delay of a single byte lane.
+	 *
+	 * The index register is 1 + bytelane/4, plus 0x20 for channel 1,
+	 * plus 4 for DQS_READDIR; inside it each lane owns one byte
+	 * (lane % 4). The register is read, the lane's field replaced,
+	 * and written back.
+	 *
+	 * NOTE(review): the old field is cleared with a 6-bit mask (0x3f)
+	 * while the new value is masked to 8 bits (0xff) -- confirm which
+	 * width is intended.
+	 */
+	const unsigned index = (bytelane / 4) + 1 + channel * 0x20 + direction * 4;
+	const unsigned shift = (bytelane % 4) * 8;
+	u32 dword;
+
+	dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
+	dword &= ~(0x3f<<shift);
+	dword |= ((dqs_delay & 0xff)<<shift);
+	pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
+}
+
+static void SetDQSDelayAllCSR(const struct mem_controller *ctrl, unsigned channel, unsigned direction, unsigned dqs_delay)
+{
+	/*
+	 * Program the same DQS delay into all four bytes of two
+	 * consecutive index registers, starting at index
+	 * 1 + channel * 0x20 + direction * 4. Only the low 8 bits of
+	 * dqs_delay are used.
+	 */
+	const unsigned first = 1 + channel * 0x20 + direction * 4;
+	/* replicate the delay byte into every byte of the dword */
+	const u32 lanes = (dqs_delay & 0xff) * 0x01010101;
+
+	pci_write_config32_index_wait(ctrl->f2, 0x98, first, lanes);
+	pci_write_config32_index_wait(ctrl->f2, 0x98, first + 1, lanes);
+}
+
+static unsigned MiddleDQS(unsigned min_d, unsigned max_d)
+{
+	/*
+	 * Return the midpoint of [min_d, max_d], rounded up when the
+	 * distance between the two is odd.
+	 */
+	const unsigned span = max_d - min_d;
+
+	return min_d + (span >> 1) + (span & 1);
+}
+
+static  inline void save_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, u8 *dqs_delay_a, u8 dqs_delay)
+{
+	/*
+	 * Record a trained delay in dqs_delay_a, which is laid out as
+	 * [channel][direction][bytelane] with 2 directions per channel and
+	 * 9 byte lanes per direction.
+	 */
+	const unsigned slot = channel * 2*9 + direction * 9 + bytelane;
+
+	dqs_delay_a[slot] = dqs_delay;
+}
+
+static void WriteDQSTestPattern(unsigned addr_lo, unsigned pattern , u8 *buf_a)
+{
+	/*
+	 * Write the DQS position test pattern from buf_a to %fs:addr_lo:
+	 * 9 lines of 64 bytes for pattern 0, 18 lines for pattern 1.
+	 */
+	const unsigned lines = (pattern+1) * 9;
+
+	WriteLNTestPattern(addr_lo, buf_a, lines);
+}
+
+static void ReadL18TestPattern(unsigned addr_lo) 
+{
+	/*
+	 * Touch 18 consecutive 64-byte lines starting at %fs:addr_lo by
+	 * reading the first dword of each. The values read are discarded.
+	 * The caller must have set the FS base already.
+	 *
+	 * The earlier version addressed part of the lines through %ebx
+	 * while handing that pointer to the compiler with an "r"
+	 * constraint, which does not guarantee %ebx, and it overwrote
+	 * %eax although %eax was declared as an input. Here every access
+	 * goes through one base operand plus a displacement, and the
+	 * destination is a proper (early-clobbered) output.
+	 */
+	unsigned scratch;
+
+	//set fs and use fs prefix to access the mem
+	__asm__ volatile (
+		"movl %%fs:0(%1), %0\n\t"	//TestAddr cache line
+		"movl %%fs:64(%1), %0\n\t"	//+1
+		"movl %%fs:128(%1), %0\n\t"	//+2
+		"movl %%fs:192(%1), %0\n\t"	//+3
+
+		"movl %%fs:256(%1), %0\n\t"	//+4
+		"movl %%fs:320(%1), %0\n\t"	//+5
+		"movl %%fs:384(%1), %0\n\t"	//+6
+		"movl %%fs:448(%1), %0\n\t"	//+7
+
+		"movl %%fs:512(%1), %0\n\t"	//+8
+		"movl %%fs:576(%1), %0\n\t"	//+9
+		"movl %%fs:640(%1), %0\n\t"	//+10
+		"movl %%fs:704(%1), %0\n\t"	//+11
+
+		"movl %%fs:768(%1), %0\n\t"	//+12
+		"movl %%fs:832(%1), %0\n\t"	//+13
+		"movl %%fs:896(%1), %0\n\t"	//+14
+		"movl %%fs:960(%1), %0\n\t"	//+15
+
+		"movl %%fs:1024(%1), %0\n\t"	//+16
+		"movl %%fs:1088(%1), %0\n\t"	//+17
+		: "=&r" (scratch)
+		: "r" (addr_lo)
+		: "memory"
+	);
+}
+
+static void ReadL9TestPattern(unsigned addr_lo) 
+{
+	/*
+	 * Touch 9 consecutive 64-byte lines starting at %fs:addr_lo by
+	 * reading the first dword of each. The values read are discarded.
+	 * The caller must have set the FS base already.
+	 *
+	 * The earlier version read the last line through %ebx while
+	 * handing that pointer to the compiler with an "r" constraint,
+	 * which does not guarantee %ebx, and it overwrote %eax although
+	 * %eax was declared as an input. Here every access goes through
+	 * one base operand plus a displacement, and the destination is a
+	 * proper (early-clobbered) output.
+	 */
+	unsigned scratch;
+
+	//set fs and use fs prefix to access the mem
+	__asm__ volatile (
+		"movl %%fs:0(%1), %0\n\t"	//TestAddr cache line
+		"movl %%fs:64(%1), %0\n\t"	//+1
+		"movl %%fs:128(%1), %0\n\t"	//+2
+		"movl %%fs:192(%1), %0\n\t"	//+3
+
+		"movl %%fs:256(%1), %0\n\t"	//+4
+		"movl %%fs:320(%1), %0\n\t"	//+5
+		"movl %%fs:384(%1), %0\n\t"	//+6
+		"movl %%fs:448(%1), %0\n\t"	//+7
+
+		"movl %%fs:512(%1), %0\n\t"	//+8
+		: "=&r" (scratch)
+		: "r" (addr_lo)
+		: "memory"
+	);
+}
+
+
+static void ReadDQSTestPattern(unsigned addr_lo, unsigned pattern)
+{
+	/*
+	 * Read back the DQS test area at %fs:addr_lo: 9 lines for
+	 * pattern 0, 18 lines for any other pattern value.
+	 */
+	if (pattern != 0) {
+		ReadL18TestPattern(addr_lo);
+		return;
+	}
+
+	ReadL9TestPattern(addr_lo);
+}
+
+static void FlushDQSTestPattern_L9(unsigned addr_lo)
+{
+	/*
+	 * clflush 9 consecutive 64-byte lines starting at %fs:addr_lo.
+	 * The caller must have set the FS base already.
+	 *
+	 * The earlier version flushed the last line through %ebx while
+	 * handing that pointer to the compiler with an "r" constraint,
+	 * which does not guarantee %ebx. Here every line is addressed
+	 * through one base operand plus a displacement.
+	 */
+	__asm__ volatile (
+		"clflush %%fs:0(%0)\n\t"
+		"clflush %%fs:64(%0)\n\t"
+		"clflush %%fs:128(%0)\n\t"
+		"clflush %%fs:192(%0)\n\t"
+
+		"clflush %%fs:256(%0)\n\t"
+		"clflush %%fs:320(%0)\n\t"
+		"clflush %%fs:384(%0)\n\t"
+		"clflush %%fs:448(%0)\n\t"
+
+		"clflush %%fs:512(%0)\n\t"
+		: /* no outputs */
+		: "r" (addr_lo)
+		: "memory"
+	);
+}
+static __attribute__((noinline)) void FlushDQSTestPattern_L18(unsigned addr_lo)
+{
+	/*
+	 * clflush 18 consecutive 64-byte lines starting at %fs:addr_lo.
+	 * The caller must have set the FS base already.
+	 *
+	 * The earlier version flushed four of the lines through %ebx
+	 * while handing that pointer to the compiler with an "r"
+	 * constraint, which does not guarantee %ebx. Here every line is
+	 * addressed through one base operand plus a displacement.
+	 */
+	__asm__ volatile (
+		"clflush %%fs:0(%0)\n\t"
+		"clflush %%fs:64(%0)\n\t"
+		"clflush %%fs:128(%0)\n\t"
+		"clflush %%fs:192(%0)\n\t"
+
+		"clflush %%fs:256(%0)\n\t"
+		"clflush %%fs:320(%0)\n\t"
+		"clflush %%fs:384(%0)\n\t"
+		"clflush %%fs:448(%0)\n\t"
+
+		"clflush %%fs:512(%0)\n\t"
+		"clflush %%fs:576(%0)\n\t"
+		"clflush %%fs:640(%0)\n\t"
+		"clflush %%fs:704(%0)\n\t"
+
+		"clflush %%fs:768(%0)\n\t"
+		"clflush %%fs:832(%0)\n\t"
+		"clflush %%fs:896(%0)\n\t"
+		"clflush %%fs:960(%0)\n\t"
+
+		"clflush %%fs:1024(%0)\n\t"
+		"clflush %%fs:1088(%0)\n\t"
+		: /* no outputs */
+		: "r" (addr_lo)
+		: "memory"
+	);
+}
+
+static void FlushDQSTestPattern(unsigned addr_lo, unsigned pattern )
+{
+	/*
+	 * Flush the DQS test area at %fs:addr_lo from the cache: 9 lines
+	 * for pattern 0, 18 lines for any other pattern value.
+	 */
+	if (pattern != 0) {
+		FlushDQSTestPattern_L18(addr_lo);
+		return;
+	}
+
+	FlushDQSTestPattern_L9(addr_lo);
+}
+
+static unsigned CompareDQSTestPattern(unsigned channel, unsigned addr_lo, unsigned pattern, u8 *buf_a)
+{
+	/*
+	 * Compare memory at %fs:addr_lo with the expected data in buf_a
+	 * and return a per-byte-lane result.
+	 *
+	 * 9*64/4 dwords are compared. Bit n of the result stays set only
+	 * if byte lane n (0..7) matched in every quadword; 0xff means
+	 * all eight lanes passed.
+	 *
+	 * With pattern == 1 (dual channel) the two channels' quadwords are
+	 * interleaved: channel 1 starts 8 bytes in, and after each
+	 * quadword the other channel's 8 bytes are skipped.
+	 *
+	 * The load used to be wrapped in pushl/popl %ebx. With an "=r"
+	 * output the compiler is free to pick %ebx for the value, in which
+	 * case the popl destroyed it before the comparison. The asm does
+	 * not touch %ebx otherwise, so the save/restore is removed.
+	 */
+	u32 *test_buf = (u32 *)buf_a;
+	unsigned bitmap = 0xff;
+	unsigned bytelane = 0;
+	u32 value;
+	u32 value_test;
+	int i;
+	int j;
+
+	if(pattern && channel) {
+		addr_lo += 8; //second channel
+		test_buf+= 2;
+	}
+
+	for(i=0;i<9*64/4;i++) {
+		__asm__ volatile (
+			"movl %%fs:(%1), %0\n\t"
+			:"=r"(value): "a" (addr_lo)
+		);
+		value_test = *test_buf;
+
+		print_debug_dqs_pair("\t\t\t\t\t\ttest_buf= ", (unsigned)test_buf, " value = ", value_test, 7); 
+		print_debug_dqs_pair("\t\t\t\t\t\ttaddr_lo = ",addr_lo, " value = ", value, 7);
+
+		/* one dword covers four byte lanes */
+		for(j=0;j<4*8;j+=8) {
+			if(((value>>j)&0xff) != ((value_test>>j)& 0xff)) {
+				bitmap &= ~(1<<bytelane);
+			}
+
+			bytelane++;
+			bytelane &= 0x7; 
+		}
+		printk(BIOS_DEBUG, "\t\t\t\t\t\tbitmap =  0x%x %d\n", bitmap, 7);  
+
+		/* bytelane wrapped to 0: a full quadword has been checked */
+		if(bytelane == 0) {
+			if(pattern == 1) { //dual channel 
+				addr_lo += 8; //skip over other channel's data
+				test_buf += 2;
+			}
+		}
+		addr_lo += 4;
+		test_buf +=1;
+
+	}
+
+	return bitmap;
+}
+
+/*
+ * Find and program the DQS delay for every byte lane of one channel.
+ *
+ * For each present chip select, every delay setting 0..47 is tried: the
+ * test pattern is written (once up front for read training, once per delay
+ * for write training), read back and compared.  The per-delay pass bitmaps
+ * are ANDed across chip selects.  Afterwards, per byte lane, the widest run
+ * of consecutive passing delays is located; its middle is programmed and
+ * stored in dqs_delay_a.
+ *
+ * ctrl:        memory controller being trained
+ * channel:     channel whose delays are programmed
+ * Direction:   DQS_READDIR or DQS_WRITEDIR
+ * Pattern:     passed through to the pattern write/read/compare/flush helpers
+ * buf_a:       reference test pattern
+ * dqs_delay_a: table receiving the chosen delays (via save_dqs_delay)
+ *
+ * Returns 0 on success, otherwise SB_NODQSPOS and/or SB_SMALLDQS.
+ */
+static unsigned TrainDQSPos(const struct mem_controller *ctrl, unsigned channel, unsigned Direction, unsigned Pattern, u8 *buf_a, u8 *dqs_delay_a, struct sys_info *sysinfo)
+{
+	unsigned ByteLane;
+	unsigned Errors;
+	unsigned BanksPresent;
+
+	unsigned MutualCSPassW[48];
+
+	unsigned ChipSel;
+	unsigned DQSDelay;
+
+	unsigned TestAddr;
+
+	unsigned LastTest;
+	unsigned RnkDlyFilterMax, RnkDlyFilterMin;
+	unsigned RnkDlySeqPassMax, RnkDlySeqPassMin = 0; /* warning: this was left unset in original code */
+
+	Errors = 0;
+	BanksPresent = 0;
+
+	printk(BIOS_DEBUG, "\t\t\tTrainDQSPos begin  0x%x %d\n", 0, 3);
+
+	for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
+		MutualCSPassW[DQSDelay] = 0xff; // Bitmapped status per delay setting, 0xff=All positions passing (1= PASS)
+	}
+
+	/* Dump the array only after it has been initialised; reading it
+	 * earlier would print an indeterminate stack value. */
+	printk(BIOS_DEBUG, "TrainDQSPos: MutualCSPassW[48] : 0x%x\n", *MutualCSPassW);
+
+	for(ChipSel = 0; ChipSel < 8; ChipSel++) { //logical register chipselects 0..7
+		printk(BIOS_DEBUG, "\t\t\t\tTrainDQSPos: 11 ChipSel 0x%x %d\n", ChipSel, 4); 
+		//FIXME: process 64MUXedMode
+		if(!ChipSelPresent(ctrl, ChipSel, sysinfo)) continue;
+		BanksPresent  = 1;
+
+		TestAddr = Get_MCTSysAddr(ctrl, ChipSel, sysinfo);
+
+		printk(BIOS_DEBUG,"\t\t\t\tTrainDQSPos: 12 TestAddr 0x%x %d\n", TestAddr, 4); 
+
+		//set fs and use fs prefix to access the mem
+		set_FSBASE(TestAddr>>24);
+
+		if(Direction == DQS_READDIR) {
+			printk(BIOS_DEBUG,"\t\t\t\tTrainDQSPos: 13 for read so write at first %d %d\n", 0, 4);
+			WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a);
+		}
+
+		for(DQSDelay = 0; DQSDelay < 48; DQSDelay++ ){
+			printk(BIOS_DEBUG, "\t\t\t\t\tTrainDQSPos: 141 DQSDelay 0x%x %d\n", DQSDelay, 5); 
+			if(MutualCSPassW[DQSDelay] == 0) continue; //skip current delay value if other chipselects have failed all 8 bytelanes
+			SetDQSDelayAllCSR(ctrl, channel, Direction, DQSDelay);
+			printk(BIOS_DEBUG, "\t\t\t\t\tTrainDQSPos: 142 MutualCSPassW  0x%x %d\n", MutualCSPassW[DQSDelay], 5); 
+			if(Direction == DQS_WRITEDIR) {
+				printk(BIOS_DEBUG, "\t\t\t\t\tTrainDQSPos: 143 for write 0x%x %d\n", 0, 5);
+				WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a); 
+			}
+			printk(BIOS_DEBUG, "\t\t\t\t\tTrainDQSPos: 144 Pattern  0x%x %d\n", Pattern, 5);
+			ReadDQSTestPattern(TestAddr<<8, Pattern); 
+			printk(BIOS_DEBUG, "\t\t\t\t\tTrainDQSPos: 145 MutualCSPassW  0x%x %d\n", MutualCSPassW[DQSDelay], 5);
+			MutualCSPassW[DQSDelay] &= CompareDQSTestPattern(channel, TestAddr<<8, Pattern, buf_a); //0: fail, 1=pass
+			printk(BIOS_DEBUG, "\t\t\t\t\tTrainDQSPos: 146 MutualCSPassW  0x%x %d\n", MutualCSPassW[DQSDelay], 5); 
+			SetTargetWTIO(TestAddr);
+			FlushDQSTestPattern(TestAddr<<8, Pattern); 
+			ResetTargetWTIO();
+		}
+	}
+
+	if(BanksPresent) {
+		for(ByteLane = 0; ByteLane < 8; ByteLane++) {
+			printk(BIOS_DEBUG, "\t\t\t\tTrainDQSPos: 31 ByteLane  0x%x %d\n",ByteLane, 4); 
+
+			/* Starting from DQS_FAIL guarantees RnkDlySeqPassMin is
+			 * assigned by the first passing delay before it is used. */
+			LastTest = DQS_FAIL;
+			RnkDlySeqPassMax = 0;
+			RnkDlyFilterMax = 0;
+			RnkDlyFilterMin = 0;
+			for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
+				if(MutualCSPassW[DQSDelay] & (1<<ByteLane)) {
+
+					printk(BIOS_DEBUG, "\t\t\t\t\tTrainDQSPos: 321 DQSDelay  0x%x %d\n", DQSDelay, 5); 
+					printk(BIOS_DEBUG, "\t\t\t\t\tTrainDQSPos: 322 MutualCSPassW  0x%x %d\n", MutualCSPassW[DQSDelay], 5); 
+
+					RnkDlySeqPassMax = DQSDelay;
+					if(LastTest == DQS_FAIL) {
+						RnkDlySeqPassMin = DQSDelay; //start sequential run
+					}
+					/* remember the widest passing run seen so far */
+					if((RnkDlySeqPassMax - RnkDlySeqPassMin)>(RnkDlyFilterMax-RnkDlyFilterMin)){
+						RnkDlyFilterMin = RnkDlySeqPassMin;
+						RnkDlyFilterMax = RnkDlySeqPassMax;
+					}
+					LastTest = DQS_PASS;
+				}
+				else {
+					LastTest = DQS_FAIL;
+				}
+			}
+			printk(BIOS_DEBUG, "\t\t\t\tTrainDQSPos: 33 RnkDlySeqPassMax  0x%x %d\n", RnkDlySeqPassMax, 4); 
+
+			if(RnkDlySeqPassMax == 0) {
+				Errors |= SB_NODQSPOS; // no passing window
+			}
+			else {
+				printk(BIOS_DEBUG, "\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMax  0x%x %d\n", RnkDlyFilterMax, 4); 
+				printk(BIOS_DEBUG, "\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMin  0x%x %d\n", RnkDlyFilterMin, 4); 
+				if((RnkDlyFilterMax - RnkDlyFilterMin)< MIN_DQS_WNDW){
+					Errors |= SB_SMALLDQS;
+				}
+				else {
+					unsigned middle_dqs;
+					middle_dqs = MiddleDQS(RnkDlyFilterMin, RnkDlyFilterMax); 
+					printk(BIOS_DEBUG, "\t\t\t\tTrainDQSPos: 35 middle_dqs  0x%x %d\n",middle_dqs, 4); 
+					SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, middle_dqs);
+					save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, middle_dqs);
+				}
+			}
+		}
+	}
+
+	printk(BIOS_DEBUG, "\t\t\tTrainDQSPos: end 0x%x %d\n", 0xff, 3);
+
+	return Errors;
+}
+
+/* Read-direction DQS training: logs a marker, then runs TrainDQSPos with DQS_READDIR. */
+static unsigned TrainReadDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, u8 *buf_a, u8 *dqs_delay_a, struct sys_info *sysinfo)
+{
+	unsigned errors;
+
+	printk(BIOS_DEBUG, "\t\tTrainReadPos 0x%x %d\n", 0, 2); 
+	errors = TrainDQSPos(ctrl, channel, DQS_READDIR, pattern, buf_a, dqs_delay_a, sysinfo);
+
+	return errors;
+}
+
+/* Write-direction DQS training: logs a marker, then runs TrainDQSPos with DQS_WRITEDIR. */
+static unsigned TrainWriteDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, u8 *buf_a, u8 *dqs_delay_a, struct sys_info *sysinfo)
+{
+	unsigned errors;
+
+	printk(BIOS_DEBUG, "\t\tTrainWritePos 0x%x %d\n", 0, 2);
+	errors = TrainDQSPos(ctrl, channel, DQS_WRITEDIR, pattern, buf_a, dqs_delay_a, sysinfo);
+
+	return errors;
+}
+
+
+
+static unsigned TrainDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
+{	/*
+	 * Train the read and write DQS positions of the controller `ctrl`.
+	 *
+	 * A reference pattern is copied into a 16-byte aligned stack buffer
+	 * (TestPatternJD1b when sysinfo reports is_Width128 for this node,
+	 * TestPatternJD1a otherwise).  With DCL_DimmEccEn temporarily cleared,
+	 * each channel is handled in turn: the write delay is stepped through
+	 * 0..47 until TrainReadDQS() returns 0, and then TrainWriteDQS() runs.
+	 * The second channel is skipped when is_Width128 is not set, and the
+	 * loop stops as soon as a channel ends with errors.
+	 *
+	 * Returns 0 on success, otherwise the error bits of the failing step.
+	 */
+        static const u32 TestPatternJD1a[] = { /* 144 dwords, used when pattern == 0 */
+					0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW0-1, ALL-EVEN
+                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW2-3, ALL-EVEN
+                                        0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW4-5, ALL-EVEN
+                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW6-7, ALL-EVEN
+                                        0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW0-1, DQ0-ODD
+                                        0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW2-3, DQ0-ODD
+                                        0x01010101,0x01010101,0xFeFeFeFe,0xFeFeFeFe, // QW4-5, DQ0-ODD
+                                        0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW6-7, DQ0-ODD
+                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW0-1, DQ1-ODD
+                                        0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2-3, DQ1-ODD
+                                        0xFdFdFdFd,0xFdFdFdFd,0x02020202,0x02020202, // QW4-5, DQ1-ODD
+                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW6-7, DQ1-ODD
+                                        0x04040404,0x04040404,0xfBfBfBfB,0xfBfBfBfB, // QW0-1, DQ2-ODD
+                                        0x04040404,0x04040404,0x04040404,0x04040404, // QW2-3, DQ2-ODD
+                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4-5, DQ2-ODD
+                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6-7, DQ2-ODD
+                                        0x08080808,0x08080808,0xF7F7F7F7,0xF7F7F7F7, // QW0-1, DQ3-ODD
+                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW2-3, DQ3-ODD
+                                        0xF7F7F7F7,0xF7F7F7F7,0x08080808,0x08080808, // QW4-5, DQ3-ODD
+                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6-7, DQ3-ODD
+                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW0-1, DQ4-ODD
+                                        0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW2-3, DQ4-ODD
+                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4-5, DQ4-ODD
+                                        0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW6-7, DQ4-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0-1, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0x20202020,0x20202020, // QW2-3, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4-5, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6-7, DQ5-ODD
+                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0-1, DQ6-ODD
+                                        0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW2-3, DQ6-ODD
+                                        0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW4-5, DQ6-ODD
+                                        0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW6-7, DQ6-ODD
+                                        0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW0-1, DQ7-ODD
+                                        0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW2-3, DQ7-ODD
+                                        0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW4-5, DQ7-ODD
+                                        0x80808080,0x80808080,0x80808080,0x80808080  // QW6-7, DQ7-ODD
+                };
+        static const u32 TestPatternJD1b[] = { /* 288 dwords, used when pattern == 1 */
+					0x00000000,0x00000000,0x00000000,0x00000000, // QW0,CHA-B, ALL-EVEN
+                                        0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW1,CHA-B, ALL-EVEN
+                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW2,CHA-B, ALL-EVEN
+                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW3,CHA-B, ALL-EVEN
+                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW4,CHA-B, ALL-EVEN
+                                        0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW5,CHA-B, ALL-EVEN
+                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW6,CHA-B, ALL-EVEN
+                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW7,CHA-B, ALL-EVEN
+                                        0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW0,CHA-B, DQ0-ODD
+                                        0x01010101,0x01010101,0x01010101,0x01010101, // QW1,CHA-B, DQ0-ODD
+                                        0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW2,CHA-B, DQ0-ODD
+                                        0x01010101,0x01010101,0x01010101,0x01010101, // QW3,CHA-B, DQ0-ODD
+                                        0x01010101,0x01010101,0x01010101,0x01010101, // QW4,CHA-B, DQ0-ODD
+                                        0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW5,CHA-B, DQ0-ODD
+                                        0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW6,CHA-B, DQ0-ODD
+                                        0x01010101,0x01010101,0x01010101,0x01010101, // QW7,CHA-B, DQ0-ODD
+                                    	0x02020202,0x02020202,0x02020202,0x02020202, // QW0,CHA-B, DQ1-ODD
+                                    	0x02020202,0x02020202,0x02020202,0x02020202, // QW1,CHA-B, DQ1-ODD
+                                        0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2,CHA-B, DQ1-ODD
+                                        0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW3,CHA-B, DQ1-ODD
+                                    	0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW4,CHA-B, DQ1-ODD
+                                    	0x02020202,0x02020202,0x02020202,0x02020202, // QW5,CHA-B, DQ1-ODD
+                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW6,CHA-B, DQ1-ODD
+                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW7,CHA-B, DQ1-ODD
+                                        0x04040404,0x04040404,0x04040404,0x04040404, // QW0,CHA-B, DQ2-ODD
+                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW1,CHA-B, DQ2-ODD
+                                        0x04040404,0x04040404,0x04040404,0x04040404, // QW2,CHA-B, DQ2-ODD
+                                        0x04040404,0x04040404,0x04040404,0x04040404, // QW3,CHA-B, DQ2-ODD
+                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4,CHA-B, DQ2-ODD
+                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW5,CHA-B, DQ2-ODD
+                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6,CHA-B, DQ2-ODD
+                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW7,CHA-B, DQ2-ODD
+                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW0,CHA-B, DQ3-ODD
+                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW1,CHA-B, DQ3-ODD
+                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW2,CHA-B, DQ3-ODD
+                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW3,CHA-B, DQ3-ODD
+                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW4,CHA-B, DQ3-ODD
+                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW5,CHA-B, DQ3-ODD
+                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6,CHA-B, DQ3-ODD
+                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW7,CHA-B, DQ3-ODD
+                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW0,CHA-B, DQ4-ODD
+                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW1,CHA-B, DQ4-ODD
+                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW2,CHA-B, DQ4-ODD
+                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW3,CHA-B, DQ4-ODD
+                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4,CHA-B, DQ4-ODD
+                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW5,CHA-B, DQ4-ODD
+                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW6,CHA-B, DQ4-ODD
+                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW7,CHA-B, DQ4-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0,CHA-B, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW1,CHA-B, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW2,CHA-B, DQ5-ODD
+                                        0x20202020,0x20202020,0x20202020,0x20202020, // QW3,CHA-B, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4,CHA-B, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW5,CHA-B, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6,CHA-B, DQ5-ODD
+                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW7,CHA-B, DQ5-ODD
+                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0,CHA-B, DQ6-ODD
+                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW1,CHA-B, DQ6-ODD
+                                        0x40404040,0x40404040,0x40404040,0x40404040, // QW2,CHA-B, DQ6-ODD
+                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW3,CHA-B, DQ6-ODD
+                                        0x40404040,0x40404040,0x40404040,0x40404040, // QW4,CHA-B, DQ6-ODD
+                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW5,CHA-B, DQ6-ODD
+                                        0x40404040,0x40404040,0x40404040,0x40404040, // QW6,CHA-B, DQ6-ODD
+                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW7,CHA-B, DQ6-ODD
+                                        0x80808080,0x80808080,0x80808080,0x80808080, // QW0,CHA-B, DQ7-ODD
+                                        0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW1,CHA-B, DQ7-ODD
+                                        0x80808080,0x80808080,0x80808080,0x80808080, // QW2,CHA-B, DQ7-ODD
+                                        0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW3,CHA-B, DQ7-ODD
+                                        0x80808080,0x80808080,0x80808080,0x80808080, // QW4,CHA-B, DQ7-ODD
+                                        0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW5,CHA-B, DQ7-ODD
+                                        0x80808080,0x80808080,0x80808080,0x80808080, // QW6,CHA-B, DQ7-ODD
+                                        0x80808080,0x80808080,0x80808080,0x80808080  // QW7,CHA-B, DQ7-ODD
+                };
+        u8 pattern_buf_x[64 * 18 + 16]; // room for 18 cache lines (64 bytes each) of pattern data plus 16 spare bytes so that buf_a can be aligned to 16 bytes
+        u8 *buf_a;
+
+	unsigned pattern;
+	u32 dword;
+	u32 ecc_bit;
+	unsigned Errors;
+	unsigned channel;
+	int i;
+	unsigned DQSWrDelay;
+	unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
+	u8 *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
+
+        //enable SSE2
+        enable_sse2();
+
+        //wrap32dis
+        set_wrap32dis();
+
+        //disable ECC temp
+        dword = pci_conf1_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
+        ecc_bit = dword & DCL_DimmEccEn; // remembered so the original setting can be restored after training
+        dword &= ~(DCL_DimmEccEn);
+        pci_conf1_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
+
+	//SetupDqsPattern
+	buf_a = (u8 *)(((u32)(&pattern_buf_x[0]) + 0x10) & (~0xf)); // next 16-byte boundary after the start of pattern_buf_x
+
+	if(is_Width128){
+		pattern = 1;
+	        for(i=0;i<16*18;i++) {
+                	*((u32 *)(buf_a + i*4)) = TestPatternJD1b[i];
+       		 }
+	}
+	else {
+		pattern = 0;
+		for(i=0; i<16*9;i++) {
+			*((u32 *)(buf_a + i*4)) = TestPatternJD1a[i];
+		}
+		
+	}
+
+	printk(BIOS_DEBUG, "\nTrainDQSRdWrPos: 0 ctrl  0x%x %d\n", ctrl->node_id, 0); 
+
+	printk(BIOS_DEBUG, "TrainDQSRdWrPos: buf_a: %02x\n", *buf_a);
+
+	Errors = 0;
+
+	channel = 0;
+	while( (channel<2) && (!Errors)) {
+		printk(BIOS_DEBUG, "\tTrainDQSRdWrPos: 1 channel  0x%x %d\n",channel, 1); 
+		for(DQSWrDelay = 0; DQSWrDelay < 48; DQSWrDelay++) { // look for the first write delay at which read training succeeds
+			unsigned err;
+			SetDQSDelayAllCSR(ctrl, channel, DQS_WRITEDIR, DQSWrDelay);
+			printk(BIOS_DEBUG, "\t\tTrainDQSRdWrPos: 21 DQSWrDelay  0x%x %d\n", DQSWrDelay, 2); 
+			err= TrainReadDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
+			printk(BIOS_DEBUG, "\t\tTrainDQSRdWrPos: 22 err  0x%x %d\n",err, 2); 
+			if(err == 0) break;
+			Errors |= err;
+		}
+
+		printk(BIOS_DEBUG, "\tTrainDQSRdWrPos: 3 DQSWrDelay  0x%x %d\n", DQSWrDelay, 1); 
+
+		if(DQSWrDelay < 48) { // read training succeeded: Errors is replaced by the write training result
+			Errors = TrainWriteDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
+			printk(BIOS_DEBUG, "\tTrainDQSRdWrPos: 4 Errors  0x%x %d\n", Errors, 1); 
+
+		}
+		channel++;
+		if(!is_Width128){
+			//FIXME: 64MuxMode??	
+			channel++; // skip channel if 64-bit mode
+		}
+	}
+
+        //Enable ECC again
+        dword = pci_conf1_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
+        dword &= ~(DCL_DimmEccEn);
+        dword |= ecc_bit;
+        pci_conf1_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
+
+        //Clear wrap32dis
+
+        clear_wrap32dis();
+
+        //restore SSE2 setting
+        disable_sse2();
+
+	printk(BIOS_DEBUG, "TrainDQSRdWrPos:  0x%x %d\n", 5, 0); 
+	
+	return Errors;
+
+}
+/*
+ * Fetch a stored DQS delay.  The table is laid out per channel as two
+ * directions of nine byte lanes each.
+ */
+static inline u8 get_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, u8 *dqs_delay_a)
+{
+	unsigned slot = (channel * 2 + direction) * 9 + bytelane;
+
+	return dqs_delay_a[slot];
+}
+
+/*
+ * Interpolate a DQS delay between the stored delays of two byte lanes.
+ *
+ * InterFactor: 0:    100% ByteLane 0
+ *              0x80: 50% between ByteLane 0 and 1
+ *              0xff: 99.6% ByteLane 1 and 0.4% like 0
+ *
+ * The weight is expressed in 1/256 units.  Returns the interpolated delay,
+ * rounded towards the smaller of the two lane delays.
+ */
+static unsigned CalcEccDQSPos(unsigned channel,unsigned ByteLane0, unsigned ByteLane1, unsigned InterFactor, unsigned Direction, u8 *dqs_delay_a)
+{
+	unsigned DQSDelay0, DQSDelay1;
+	unsigned low, span;
+
+	DQSDelay0 = get_dqs_delay(channel, ByteLane0, Direction, dqs_delay_a);
+	DQSDelay1 = get_dqs_delay(channel, ByteLane1, Direction, dqs_delay_a);
+
+	if(DQSDelay0>DQSDelay1) {
+		/*
+		 * Measuring upwards from lane 1, so the weight has to be
+		 * mirrored.  Because the product is divided by 256 below, the
+		 * mirror point is 0x100: with 0xff an InterFactor of 0 would
+		 * yield slightly less than lane 0's delay instead of exactly
+		 * lane 0's delay.
+		 */
+		low = DQSDelay1;
+		span = DQSDelay0 - DQSDelay1;
+		InterFactor = 0x100 - InterFactor;
+	}
+	else {
+		low = DQSDelay0;
+		span = DQSDelay1 - DQSDelay0;
+	}
+
+	return low + ((span * InterFactor) >> 8); // >> 8 divides by 256
+}
+
+/*
+ * Derive the delay of byte lane 8 from the trained lanes 4 and 5 (with an
+ * interpolation factor of 0), for both channels and both directions, then
+ * program it and record it in the node's dqs_delay_a slice.
+ */
+static void SetEccDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
+{
+	const unsigned dirs[] = { DQS_READDIR, DQS_WRITEDIR };
+	const unsigned ecc_lane = 8;
+	u8 *delays = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
+	unsigned chan;
+	int d;
+
+	for (chan = 0; chan < 2; chan++) {
+		for (d = 0; d < 2; d++) {
+			unsigned Direction = dirs[d];
+			unsigned dqs_delay = CalcEccDQSPos(chan, 4, 5, 0, Direction, delays);
+
+			print_debug_dqs_pair("\t\tSetEccDQSRdWrPos: channel ", chan, Direction==DQS_READDIR? " R dqs_delay":" W dqs_delay",  dqs_delay, 2); 
+			SetDQSDelayCSR(ctrl, chan, ecc_lane, Direction, dqs_delay);
+			save_dqs_delay(chan, ecc_lane, Direction, delays, dqs_delay);
+		}
+	}
+}
+
+/*
+ * Run receiver-enable training (TrainRcvrEn) for one controller.
+ * Returns 1 when TrainRcvrEn reports a failure, 0 otherwise; the "end"
+ * message is only logged on success.
+ */
+static unsigned train_DqsRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
+{
+	unsigned failed;
+
+	printk(BIOS_DEBUG, "\ntrain_DqsRcvrEn: begin ctrl  0x%x %d\n", ctrl->node_id, 0); 
+	failed = TrainRcvrEn(ctrl, Pass, sysinfo) ? 1 : 0;
+	if (!failed) {
+		printk(BIOS_DEBUG, "\ntrain_DqsRcvrEn: end ctrl  0x%x %d\n", ctrl->node_id, 0); 
+	}
+
+	return failed;
+}
+/*
+ * Run DQS read/write position training for one controller and, when it
+ * succeeds, set up the delays of byte lane 8 as well.
+ * Returns 0 on success and 1 when TrainDQSRdWrPos reported errors.
+ */
+static unsigned train_DqsPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
+{
+	printk(BIOS_DEBUG, "\ntrain_DqsPos: begin ctrl %d\n", ctrl->node_id);
+
+	if (TrainDQSRdWrPos(ctrl, sysinfo) == 0) {
+		SetEccDQSRdWrPos(ctrl, sysinfo);
+		printk(BIOS_DEBUG, "\ntrain_DqsPos: end ctrl 0x%x %d\n", ctrl->node_id, 0); 
+		return 0;
+	}
+
+	printk(BIOS_ERR, "\nDQS Training Rd Wr failed ctrl %d\n", ctrl->node_id);
+	return 1;
+}
+
+#ifdef K8_REV_F_SUPPORT_F0_F1_WORKAROUND
+/*
+ * Workaround applied between the training passes for controllers for which
+ * is_cpu_pre_f2_in_bsp() is non-zero: DC_DqsRcvEnTrain is cleared,
+ * DI_EnDramInit is pulsed, and the function then busy-waits until the time
+ * stamp counter has advanced by tsc0[i] for each such controller.
+ *
+ * controllers: number of entries in ctrl
+ * ctrl:        array of memory controllers
+ * tsc0:        per-controller delay, in TSC ticks
+ * sysinfo:     supplies ctrl_present[] and meminfo[].dimm_mask
+ *
+ * NOTE(review): the body accesses .hi/.lo members of u64 values; confirm
+ * that u64 is a structure in this tree, otherwise this code cannot compile
+ * once K8_REV_F_SUPPORT_F0_F1_WORKAROUND is defined.
+ */
+static void f0_svm_workaround(int controllers, const struct mem_controller *ctrl, u64 *tsc0, struct sys_info *sysinfo)
+{
+        u64 tsc1[8];
+	unsigned cpu_f0_f1[8];
+	int i;
+
+	/*
+	 * The former debug printk of tsc1 was dropped: it passed the (still
+	 * uninitialised) array itself to a %llx conversion.
+	 */
+
+        for(i = 0; i < controllers; i++) {
+                if (!sysinfo->ctrl_present[i])
+                        continue;
+
+                /* Skip everything if I don't have any memory on this controller */
+		if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
+
+                u32 dword;
+
+                cpu_f0_f1[i] = is_cpu_pre_f2_in_bsp(i);
+
+                if(!cpu_f0_f1[i]) continue;
+
+                dword = pci_conf1_read_config32(ctrl[i].f2, DRAM_CTRL);
+                dword &= ~DC_DqsRcvEnTrain;
+                pci_conf1_write_config32(ctrl[i].f2, DRAM_CTRL, dword);
+
+                /* set, then immediately clear, DI_EnDramInit */
+                dword = pci_conf1_read_config32(ctrl[i].f2, DRAM_INIT);
+                dword |= DI_EnDramInit;
+                pci_conf1_write_config32(ctrl[i].f2, DRAM_INIT, dword);
+                dword &= ~DI_EnDramInit;
+                pci_conf1_write_config32(ctrl[i].f2, DRAM_INIT, dword);
+
+                tsc1[i] = cycles();
+                print_debug_dqs_tsc("begin: tsc1", i, tsc1[i].hi, tsc1[i].lo, 2);
+
+                /* tsc1[i] += tsc0[i], with manual carry from lo into hi */
+                dword = tsc1[i].lo + tsc0[i].lo;
+                if((dword<tsc1[i].lo) || (dword<tsc0[i].lo)) {
+                        tsc1[i].hi++;
+                }
+                tsc1[i].lo = dword;
+                tsc1[i].hi+= tsc0[i].hi;
+
+                print_debug_dqs_tsc("end  : tsc1", i, tsc1[i].hi, tsc1[i].lo, 2);
+
+        }
+
+        for(i = 0; i < controllers; i++) {
+                if (!sysinfo->ctrl_present[i])
+                        continue;
+
+                /* Skip everything if I don't have any memory on this controller */
+		if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
+
+		/* same skip conditions as above, so cpu_f0_f1[i] has been set */
+		if(!cpu_f0_f1[i]) continue;
+
+                u64 tsc;
+
+                /* spin until the deadline computed in the first loop has passed */
+                do {
+                        tsc = cycles();
+                } while ((tsc1[i].hi>tsc.hi) || ((tsc1[i].hi==tsc.hi) && (tsc1[i].lo>tsc.lo)));
+
+                print_debug_dqs_tsc("end  : tsc ", i, tsc.hi, tsc.lo, 2);
+        }
+
+}
+
+#endif
+
+
+/*
+ * Program one variable MTRR (logic taken from the Linux kernel source).
+ *
+ * reg:          variable MTRR index; values >= 8 are ignored
+ * basek:        range base in KB
+ * sizek:        range size in KB; 0 disables the range by clearing its mask
+ * type:         memory type, ORed into the low base word
+ * address_bits: physical address width, used to build the high mask word
+ */
+static void set_var_mtrr_dqs(
+        unsigned int reg, unsigned long basek, unsigned long sizek,
+        unsigned char type, unsigned address_bits)
+{
+        struct msr base, mask;
+        unsigned address_mask_high;
+
+        if (reg >= 8)
+                return;
+
+        if (sizek == 0) {
+                struct msr zero;
+                zero.lo = zero.hi = 0;
+                /* The invalid bit is kept in the mask, so we simply clear the
+                   relevant mask register to disable a range. */
+                wrmsr (MTRRphysMask_MSR(reg), zero);
+                return;
+        }
+
+        address_mask_high = ((1u << (address_bits - 32u)) - 1u);
+
+        /* KB -> bytes is a shift by 10, split across the two 32-bit halves */
+        base.hi = basek >> 22;
+        base.lo = basek << 10;
+        base.lo |= type;
+
+        if (sizek < 4*1024*1024) {
+                mask.hi = address_mask_high;
+                mask.lo = ~((sizek << 10) -1);
+        }
+        else {
+                mask.hi = address_mask_high & (~((sizek >> 22) -1));
+                mask.lo = 0;
+        }
+        /* bit 11 of the mask marks the range as valid */
+        mask.lo |= 0x800;
+
+        wrmsr (MTRRphysBase_MSR(reg), base);
+        wrmsr (MTRRphysMask_MSR(reg), mask);
+}
+
+
+/* fms: find most significant bit set, stolen from Linux Kernel Source.
+ * Returns the bit index produced by bsrl for x.  When x is zero the jnz is
+ * not taken and the result is forced to 0, so fms(0) == fms(1) == 0.
+ * NOTE(review): the "g" constraint also admits an immediate operand, which
+ * bsrl cannot encode - confirm no caller passes a compile-time constant. */
+static inline unsigned int fms(unsigned int x)
+{
+        int r;
+
+        __asm__("bsrl %1,%0\n\t"
+                "jnz 1f\n\t"
+                "movl $0,%0\n"
+                "1:" : "=r" (r) : "g" (x));
+        return r;
+}
+
+/* fls: find least significant bit set.
+ * Returns the bit index produced by bsfl for x.  When x is zero the jnz is
+ * not taken and the result is forced to 32.
+ * Note that, despite the name, this is the lowest set bit, not the last one.
+ * NOTE(review): the "g" constraint also admits an immediate operand, which
+ * bsfl cannot encode - confirm no caller passes a compile-time constant. */
+static inline unsigned int fls(unsigned int x)
+{
+        int r;
+
+        __asm__("bsfl %1,%0\n\t"
+                "jnz 1f\n\t"
+                "movl $32,%0\n"
+                "1:" : "=r" (r) : "g" (x));
+        return r;
+}
+
+/*
+ * Cover the range [range_startk, range_startk + range_sizek) (both in KB)
+ * with variable MTRRs of the given type, starting at index reg.  Each MTRR
+ * gets the largest power-of-two size that is both aligned to the current
+ * start address and no larger than what is left of the range.
+ *
+ * next_range_startk is accepted but not used.
+ *
+ * Returns the index following the last MTRR programmed; nothing is
+ * programmed at index 8 or above.
+ */
+static unsigned int range_to_mtrr(unsigned int reg,
+        unsigned long range_startk, unsigned long range_sizek,
+        unsigned long next_range_startk, unsigned char type, unsigned address_bits)
+{
+        if (!range_sizek || (reg >= 8)) {
+                return reg;
+        }
+        while(range_sizek) {
+                unsigned long max_align, align;
+                unsigned long sizek;
+                /* Compute the maximum size I can make a range */
+                max_align = fls(range_startk);
+                align = fms(range_sizek);
+                if (align > max_align) {
+                        align = max_align;
+                }
+                /* unsigned shift: "1 << 31" would overflow a signed int */
+                sizek = 1UL << align;
+#if MEM_TRAIN_SEQ != 1
+                printk(BIOS_DEBUG, "Setting variable MTRR %d, base: %4ldMB, range: %4ldMB, type %s\n",
+                        reg, range_startk >>10, sizek >> 10,
+                        (type==MTRR_TYPE_UNCACHEABLE)?"UC":
+                            ((type==MTRR_TYPE_WRBACK)?"WB":"Other")
+                        );
+#endif
+                set_var_mtrr_dqs(reg++, range_startk, sizek, type, address_bits);
+                range_startk += sizek;
+                range_sizek -= sizek;
+                if (reg >= 8)
+                        break;
+        }
+        return reg;
+}
+
+/*
+ * Write the TOP_MEM2 and then the TOP_MEM MSR from sizes given in KB.
+ * Each KB value is converted to a byte address split into low/high words.
+ */
+void set_top_mem_ap(unsigned tom_k, unsigned tom2_k)
+{
+        struct msr top;
+
+        /* Now set top of memory */
+        top.hi = (tom2_k & 0xffc00000) >> 22;
+        top.lo = (tom2_k & 0x003fffff) << 10;
+        wrmsr(TOP_MEM2, top);
+
+        top.hi = (tom_k & 0xffc00000) >> 22;
+        top.lo = (tom_k & 0x003fffff) << 10;
+        wrmsr(TOP_MEM, top);
+}
+
+/*
+ * Set up the MTRRs needed while DQS training accesses its test addresses.
+ * SYSCFG_MSR_MtrrFixDramModEn is not touched here; the original note says it
+ * is still enabled from cache_as_ram.inc.
+ */
+static void setup_mtrr_dqs(unsigned tom_k, unsigned tom2_k){
+        struct msr val;
+
+        //[0,512k), [512k, 640k)
+        val.hi = 0x1e1e1e1e;
+        val.lo = val.hi;
+        wrmsr(0x250, val);
+        wrmsr(0x258, val);
+
+        // variable MTRRs from index 2 on: [0, TOM) as write-back
+        range_to_mtrr(2, 0, tom_k,4*1024*1024, MTRR_TYPE_WRBACK, 40);
+
+        //[4G, TOM2)
+        if(!tom2_k)
+                return;
+
+        //enable tom2 and type
+        val = rdmsr(SYSCFG_MSR);
+        val.lo |= (1<<21) | (1<<22); //MtrrTom2En and Tom2ForceMemTypeWB
+        wrmsr(SYSCFG_MSR, val);
+}
+
+/*
+ * Undo setup_mtrr_dqs(): zero the two fixed-range MSRs and MSRs
+ * 0x204..0x20f, and clear the TOM2 bits in SYSCFG when tom2_k is non-zero.
+ */
+static void clear_mtrr_dqs(unsigned tom2_k){
+        struct msr val;
+        unsigned idx;
+
+        //still enable from cache_as_ram.inc
+        val = rdmsr(SYSCFG_MSR);
+        val.lo |= SYSCFG_MSR_MtrrFixDramModEn;
+        wrmsr(SYSCFG_MSR,val);
+
+        //[0,512k), [512k, 640k)
+        val.lo = 0;
+        val.hi = 0;
+        wrmsr(0x250, val);
+        wrmsr(0x258, val);
+
+        // the same all-zero value clears the MSRs 0x204..0x20f
+        idx = 0x204;
+        while (idx < 0x210) {
+                wrmsr(idx, val);
+                idx++;
+        }
+
+        //[4G, TOM2)
+        if(!tom2_k)
+                return;
+
+        //disable tom2 and type
+        val = rdmsr(SYSCFG_MSR);
+        val.lo &= ~((1<<21) | (1<<22)); //MtrrTom2En and Tom2ForceMemTypeWB
+        wrmsr(SYSCFG_MSR, val);
+}
+
+/*
+ * Set bit `bit` of HT_INIT_CONTROL on PCI device 0:(0x18+i).0 to the lowest
+ * bit of val, leaving all other bits unchanged.
+ */
+static void set_htic_bit(unsigned i, unsigned val, unsigned bit)
+{
+        u32 htic = pci_conf1_read_config32(PCI_BDF(0, 0x18+i, 0), HT_INIT_CONTROL);
+
+        htic = (htic & ~(1<<bit)) | ((val & 1) <<bit);
+        pci_conf1_write_config32(PCI_BDF(0, 0x18+i, 0), HT_INIT_CONTROL, htic);
+}
+
+
+/*
+ * Read bit `bit` of HT_INIT_CONTROL on PCI device 0:(0x18+i).0.
+ * Returns the masked register value, i.e. 0 or (1 << bit), not 0 or 1.
+ */
+static unsigned get_htic_bit(unsigned i, unsigned bit)
+{
+        u32 htic = pci_conf1_read_config32(PCI_BDF(0, 0x18+i, 0), HT_INIT_CONTROL);
+
+        return htic & (1<<bit);
+}
+
+/* Busy-wait until bit 9 of node 0's HT_INIT_CONTROL register reads as set. */
+void wait_till_sysinfo_in_ram(void)
+{
+        while (!get_htic_bit(0, 9)) {
+                /* keep polling */
+        }
+}
+
+/* Store the lowest bit of val in bit 9 of node 0's HT_INIT_CONTROL register. */
+void set_sysinfo_in_ram(unsigned val)
+{
+        const unsigned node = 0;
+        const unsigned sysinfo_bit = 9;
+
+        set_htic_bit(node, val, sysinfo_bit);
+}
+
+
+#if MEM_TRAIN_SEQ == 0
+
+
+#ifdef K8_REV_F_SUPPORT_F0_F1_WORKAROUND
+static void dqs_timing(int controllers, const struct mem_controller *ctrl, u64 *tsc0, struct sys_info *sysinfo)
+#else
+void dqs_timing(int controllers, const struct mem_controller *ctrl, struct sys_info *sysinfo)
+#endif
+{	/*
+	 * Run the complete DQS training sequence over all controllers that
+	 * are present and have DIMMs (non-zero dimm_mask):
+	 *   1. fill_mem_cs_sysinfo() for each controller,
+	 *   2. receiver-enable training, pass 1,
+	 *   3. f0_svm_workaround() when K8_REV_F_SUPPORT_F0_F1_WORKAROUND
+	 *      is defined,
+	 *   4. DQS position training,
+	 *   5. receiver-enable training, pass 2, after which mem_trained[i]
+	 *      is set to 1.
+	 * The MTRRs are set up before and cleared after the sequence.  The
+	 * first failing step aborts everything that follows (goto out), so
+	 * mem_trained[] stays unset for controllers not yet completed.
+	 * tsc[] records cycles() between the phases; its only consumer is
+	 * the commented-out print at the end.
+	 */
+	int  i;
+
+	u64 tsc[5];
+
+        //need to enable mtrr, so dqs training could access the test address
+        setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
+
+        for(i = 0; i < controllers; i++) {
+                if (!sysinfo->ctrl_present[ i ])
+                        continue;
+
+                /* Skip everything if I don't have any memory on this controller */
+                if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
+
+		fill_mem_cs_sysinfo(i, ctrl+i, sysinfo);
+	}
+
+	tsc[0] = cycles();
+        for(i = 0; i < controllers; i++) {
+                if (!sysinfo->ctrl_present[ i ])
+                        continue;
+
+                /* Skip everything if I don't have any memory on this controller */
+		if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
+
+                printk(BIOS_DEBUG, "DQS Training:RcvrEn:Pass1: %d", i);
+                if(train_DqsRcvrEn(ctrl+i, 1, sysinfo)) goto out;
+       	        printk(BIOS_DEBUG, " done\n");
+        }
+
+	tsc[1] = cycles();
+#ifdef K8_REV_F_SUPPORT_F0_F1_WORKAROUND
+	f0_svm_workaround(controllers, ctrl, tsc0, sysinfo);
+#endif
+
+	tsc[2] = cycles();
+        for(i = 0; i < controllers; i++) {
+                if (!sysinfo->ctrl_present[i])
+                        continue;
+
+                /* Skip everything if I don't have any memory on this controller */
+		if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
+
+                printk(BIOS_DEBUG, "DQS Training:DQSPos: %d", i);
+                if(train_DqsPos(ctrl+i, sysinfo)) goto out;
+                printk(BIOS_DEBUG, " done\n");
+        }
+
+	tsc[3] = cycles();
+        for(i = 0; i < controllers; i++) {
+                if (!sysinfo->ctrl_present[i])
+                        continue;
+
+                /* Skip everything if I don't have any memory on this controller */
+		if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
+
+                printk(BIOS_DEBUG, "DQS Training:RcvrEn:Pass2: %d", i);
+                if(train_DqsRcvrEn(ctrl+i, 2, sysinfo)) goto out;
+                printk(BIOS_DEBUG, " done\n");
+		sysinfo->mem_trained[i]=1; // only reached when every step succeeded for controller i
+        }
+
+out:
+	tsc[4] = cycles();
+	clear_mtrr_dqs(sysinfo->tom2_k);
+
+	/* the loop below is empty: its only statement is commented out */
+	for(i=0;i<5;i++) {
+//		print_debug_dqs_tsc_x("DQS Training:tsc", i,  tsc[i].hi, tsc[i].lo);
+	}
+
+
+	
+}
+
+#endif
+
+
+#if MEM_TRAIN_SEQ > 0 
+
+/*
+ * Train DQS timing for a single memory controller (variant built when
+ * MEM_TRAIN_SEQ > 0).
+ *
+ * i:       index of this controller in sysinfo->mem_trained[]
+ * ctrl:    the controller to train
+ * sysinfo: shared training state
+ * v:       non-zero enables progress messages and the timestamp dump
+ *
+ * Nothing is done unless mem_trained[i] is 0x80 on entry. On return
+ * mem_trained[i] is:
+ *   1     all three steps succeeded
+ *   0x81  train_DqsRcvrEn() pass 1 failed
+ *   0x82  train_DqsPos() failed
+ *   0x83  train_DqsRcvrEn() pass 2 failed
+ *
+ * When MEM_TRAIN_SEQ == 1 the MTRRs are set up before and cleared after
+ * training by this function itself.
+ */
+static void dqs_timing(int i, const struct mem_controller *ctrl, struct sys_info *sysinfo, unsigned int v)
+{
+	int ii;
+	/*
+	 * Zero-filled: a failing step jumps to "out" before the later slots
+	 * are written, and the dump there prints all four of them.
+	 */
+	u64 tsc[4] = { 0 };
+
+	if (sysinfo->mem_trained[i] != 0x80)
+		return;
+
+#if MEM_TRAIN_SEQ == 1
+	/* MTRRs must be enabled so that training can access the test address. */
+	setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
+#endif
+
+	fill_mem_cs_sysinfo(i, ctrl, sysinfo);
+
+	if (v) {
+		tsc[0] = cycles();
+
+		printk(BIOS_DEBUG, "set DQS timing:RcvrEn:Pass1: 0x%x\n", i);
+	}
+	if (train_DqsRcvrEn(ctrl, 1, sysinfo)) {
+		sysinfo->mem_trained[i] = 0x81;
+		goto out;
+	}
+
+	if (v) {
+		printk(BIOS_DEBUG, " done\n");
+		tsc[1] = cycles();
+		printk(BIOS_DEBUG, "set DQS timing:DQSPos: ");
+		print_debug_hex8(i);
+	}
+
+	if (train_DqsPos(ctrl, sysinfo)) {
+		sysinfo->mem_trained[i] = 0x82;
+		goto out;
+	}
+
+	if (v) {
+		printk(BIOS_DEBUG, " done\n");
+		tsc[2] = cycles();
+
+		printk(BIOS_DEBUG, "set DQS timing:RcvrEn:Pass2: ");
+		print_debug_hex8(i);
+	}
+	if (train_DqsRcvrEn(ctrl, 2, sysinfo)) {
+		sysinfo->mem_trained[i] = 0x83;
+		goto out;
+	}
+
+	if (v) {
+		printk(BIOS_DEBUG, " done\n");
+
+		tsc[3] = cycles();
+	}
+
+out:
+#if MEM_TRAIN_SEQ == 1
+	clear_mtrr_dqs(sysinfo->tom2_k);
+#endif
+
+	if (v) {
+		for (ii = 0; ii < 4; ii++) {
+			/*
+			 * tsc[] holds what cycles() returns; split it into
+			 * high and low 32-bit halves by shifting.
+			 * NOTE(review): assumes u64 is a plain 64-bit
+			 * integer (no .hi/.lo members) -- confirm.
+			 */
+			print_debug_dqs_tsc_x("Total DQS Training : tsc ", ii,
+					      (u32)(tsc[ii] >> 32), (u32)tsc[ii]);
+		}
+	}
+
+	/* Still 0x80 means no step recorded a failure: mark as trained. */
+	if (sysinfo->mem_trained[i] == 0x80) {
+		sysinfo->mem_trained[i] = 1;
+	}
+}
+#endif
+
+#if MEM_TRAIN_SEQ == 1
+/*
+ * Train the memory controller of one node and publish the outcome.
+ *
+ * dqs_timing() is run quietly on sysinfo->ctrl[nodeid] using the local
+ * state in sysinfo; afterwards the resulting mem_trained[nodeid] status
+ * is copied into sysinfox.
+ */
+static void train_ram(unsigned nodeid, struct sys_info *sysinfo, struct sys_info *sysinfox)
+{
+	const unsigned verbose = 0;	/* keep the output tidy */
+
+	dqs_timing(nodeid, &sysinfo->ctrl[nodeid], sysinfo, verbose);
+
+	/*
+	 * Only the status byte is handed back; copying dqs_rcvr_dly_a and
+	 * dqs_delay_a to sysinfox is disabled in this version.
+	 */
+	sysinfox->mem_trained[nodeid] = sysinfo->mem_trained[nodeid];
+}
+static void copy_and_run_ap_code_in_car(unsigned ret_addr);
+
+/*
+ * Train the memory of one node from that node's core 0.
+ *
+ * nodeid:  node whose controller is to be trained
+ * coreid:  calling core; any core other than 0 returns immediately
+ * sysinfo: local sys_info buffer, overwritten with the shared copy
+ * retcall: passed to copy_and_run_ap_code_in_car(); unused when
+ *          CONFIG_AP_CODE_IN_CAR == 0
+ *
+ * The shared copy lives at (CONFIG_LB_MEM_TOPK<<10) -
+ * DCACHE_RAM_GLOBAL_VAR_SIZE and is only read after
+ * wait_till_sysinfo_in_ram() has returned. Training is attempted only
+ * when the shared mem_trained[nodeid] is 0x80.
+ */
+static inline void train_ram_on_node(unsigned nodeid, unsigned coreid, struct sys_info *sysinfo, unsigned retcall)
+{
+	struct sys_info *sysinfox;
+
+	if (coreid)
+		return;	// only do it on core0
+
+	/* Integer address -> pointer needs an explicit cast. */
+	sysinfox = (struct sys_info *)(unsigned long)
+			((CONFIG_LB_MEM_TOPK<<10) - DCACHE_RAM_GLOBAL_VAR_SIZE);
+	wait_till_sysinfo_in_ram(); // use pci to get it
+
+	if (sysinfox->mem_trained[nodeid] != 0x80)
+		return;
+
+	/* Take over the whole shared block rather than selected fields. */
+	memcpy(sysinfo, sysinfox, DCACHE_RAM_GLOBAL_VAR_SIZE);
+
+	set_top_mem_ap(sysinfo->tom_k, sysinfo->tom2_k); // keep the ap's tom consistent with bsp's
+#if CONFIG_AP_CODE_IN_CAR == 0
+	printk(BIOS_DEBUG, "CODE IN ROM AND RUN ON NODE:"); print_debug_hex8(nodeid); printk(BIOS_DEBUG, "\n");
+	train_ram(nodeid, sysinfo, sysinfox);
+#else
+	/* Can copy dqs_timing to ap cache and run from cache?
+	 * we need coreboot_ap_car.rom? and treat it as coreboot_ram.rom for ap ?
+	 */
+	copy_and_run_ap_code_in_car(retcall);
+	// will go back by jump
+#endif
+}
+#endif





More information about the coreboot mailing list