Angel Pons has uploaded this change for review.

View Change

nb/intel/sandybridge: Improve channel disable logic

Do not consider the failed channel's SPDs in emergency mode. Also, force
a full training if not resuming from S3 and the saved data indicates a
channel has failed. Special care needs to be taken as the selected
memory frequency could be raised after a channel has been disabled.

Tested on Asus P8Z77-V LX2, with a good DIMM on channel 0 and a bad DIMM
on channel 1. The failing channel causes errors when discovering timB,
and emergency mode raminit for the other channel is good enough to boot.
With this commit:

a. Regular boots always result in full training.
b. The board still boots, as emergency raminit trained the good DIMM.
c. S3 resume still works in all tested configurations.
d. If pretending the bad DIMM is slower:
1. The MPLL gets locked at the slow frequency of the bad DIMM.
2. The first raminit attempt still fails miserably on the bad DIMM.
3. Emergency mode uses the frequency the MPLL is locked at.
4. The board boots with the good DIMM running at the slow frequency.

With good DIMMs on both channels, everything works the same as before.

Change-Id: I3d34594561680906cb0b15a9c6a5fa7a773c0496
Signed-off-by: Angel Pons <th3fanbus@gmail.com>
---
M src/northbridge/intel/sandybridge/raminit.c
M src/northbridge/intel/sandybridge/raminit_common.h
M src/northbridge/intel/sandybridge/raminit_native.c
3 files changed, 38 insertions(+), 14 deletions(-)

git pull ssh://review.coreboot.org:29418/coreboot refs/changes/00/45500/1
diff --git a/src/northbridge/intel/sandybridge/raminit.c b/src/northbridge/intel/sandybridge/raminit.c
index 319fea3..51c57c6 100644
--- a/src/northbridge/intel/sandybridge/raminit.c
+++ b/src/northbridge/intel/sandybridge/raminit.c
@@ -43,15 +43,7 @@
/* Disable a channel in ramctr_timing */
static void disable_channel(ramctr_timing *ctrl, int channel)
{
- ctrl->rankmap[channel] = 0;
-
- memset(&ctrl->rank_mirror[channel][0], 0, sizeof(ctrl->rank_mirror[0]));
-
- ctrl->channel_size_mb[channel] = 0;
- ctrl->cmd_stretch[channel] = 0;
- ctrl->mad_dimm[channel] = 0;
- memset(&ctrl->timings[channel][0], 0, sizeof(ctrl->timings[0]));
- memset(&ctrl->info.dimm[channel][0], 0, sizeof(ctrl->info.dimm[0]));
+ ctrl->failed_channels |= BIT(channel);
}

static bool nb_supports_ecc(const uint32_t capid0_a)
@@ -163,6 +155,10 @@
FOR_ALL_CHANNELS {
ctrl->channel_size_mb[channel] = 0;

+ /* Skip decoding the SPDs of failed channels */
+ if (ctrl->failed_channels & BIT(channel))
+ continue;
+
ch_dimms = 0;
/* Count dimms on channel */
for (slot = 0; slot < NUM_SLOTS; slot++) {
@@ -345,8 +341,21 @@
system_reset();
}

+ /*
+ * Discard the saved data if it is from raminit emergency mode.
+ * Ideally, we would not advertise support for ACPI S3 in such
+ * cases, because there is evidence that the RAM has stability
+ * problems. Until then, skip this sanity check on S3 resumes.
+ */
+ if (!s3resume && ctrl_cached && ctrl_cached->failed_channels) {
+ printk(BIOS_ERR, "Stored timings indicate channels have failed.\n");
+
+ ctrl_cached = NULL;
+ }
+
/* Verify MRC cache for fast boot */
if (!s3resume && ctrl_cached) {
+
/* Load SPD unique information data. */
memset(spds, 0, sizeof(spds));
mainboard_get_spd(spds, 1);
@@ -391,6 +400,13 @@
}

if (err) {
+ /*
+ * We can't reinitialize the DRAM MPLL at a different frequency.
+ * As we want to ignore the SPDs on the failed channel, we have
+ * to use the same tCK in emergency mode to avoid some troubles.
+ */
+ const u32 old_tCK = ctrl.tCK;
+
/* Fallback: disable failing channel */
printk(BIOS_ERR, "RAM training failed, trying fallback.\n");
printram("Disable failing channel.\n");
@@ -398,12 +414,15 @@
/* Reset internal state */
reinit_ctrl(&ctrl, cpuid);

- /* Reset DDR3 frequency */
- dram_find_spds_ddr3(spds, &ctrl);
-
/* Disable failing channel */
disable_channel(&ctrl, GET_ERR_CHANNEL(err));

+ /* Restore tCK */
+ ctrl.tCK = old_tCK;
+
+ /* Recalculate DDR3 frequency */
+ dram_find_spds_ddr3(spds, &ctrl);
+
err = try_init_dram_ddr3(&ctrl, fast_boot, s3resume, me_uma_size);
}

diff --git a/src/northbridge/intel/sandybridge/raminit_common.h b/src/northbridge/intel/sandybridge/raminit_common.h
index 32f2b44..cf3d6a9 100644
--- a/src/northbridge/intel/sandybridge/raminit_common.h
+++ b/src/northbridge/intel/sandybridge/raminit_common.h
@@ -105,7 +105,7 @@
/*
* WARNING: Do not forget to increase MRC_CACHE_VERSION when the saved data is changed!
*/
-#define MRC_CACHE_VERSION 5
+#define MRC_CACHE_VERSION 6

typedef struct odtmap_st {
u16 rttwr;
@@ -185,6 +185,9 @@
/* Bits [0..11] of PM_DLL_CONFIG: Master DLL wakeup delay timer */
u16 mdll_wake_delay;

+ /* Bitmap of channels that have failed */
+ u8 failed_channels;
+
u8 rankmap[NUM_CHANNELS];
int ref_card_offset[NUM_CHANNELS];
u32 mad_dimm[NUM_CHANNELS];
diff --git a/src/northbridge/intel/sandybridge/raminit_native.c b/src/northbridge/intel/sandybridge/raminit_native.c
index c23a5ac..a469910 100644
--- a/src/northbridge/intel/sandybridge/raminit_native.c
+++ b/src/northbridge/intel/sandybridge/raminit_native.c
@@ -265,7 +265,9 @@

printk(BIOS_DEBUG, "PLL_REF100_CFG value: 0x%x\n", ref_100mhz_support);

- ctrl->tCK = get_mem_min_tck();
+ /* In emergency mode, use provided tCK if non-zero. Otherwise, calculate it. */
+ if (!ctrl->failed_channels || !ctrl->tCK)
+ ctrl->tCK = get_mem_min_tck();

/* Find CAS latency */
while (1) {

To view, visit change 45500. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: coreboot
Gerrit-Branch: master
Gerrit-Change-Id: I3d34594561680906cb0b15a9c6a5fa7a773c0496
Gerrit-Change-Number: 45500
Gerrit-PatchSet: 1
Gerrit-Owner: Angel Pons <th3fanbus@gmail.com>
Gerrit-Reviewer: Patrick Rudolph <siro@das-labor.org>
Gerrit-MessageType: newchange