/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 */
/*
 * NOTE(review): this copy of the file appears to have lost text after
 * several '<' characters: every #include below has no header name, and a
 * number of function bodies break off at a '<' and resume inside a later
 * function. The comments added here describe only what is still visible.
 * Restore the file from a pristine source before building or changing it.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* Set to 1 by the "off" option in mcheck_enable() and by mcheck_disable(). */
static int mce_disabled __initdata;
static unsigned long mce_cpus;
/*
 * Machine Check Handler For PII/PIII/K7
 */
/* Bank count: low 8 bits of MSR_IA32_MCG_CAP, set by the init functions. */
static int banks;
/* disabled_banks is edited by the "enable" option in mcheck_enable(). */
static unsigned long ignored_banks, disabled_banks;
/* Machine Check on everything dubious. This is a good setting for device driver testing. */
#define K8_DRIVER_DEBUG ((1<<13)-1)
/* Report RAM errors and Hyper Transport Problems, but ignore Device aborts and GART errors. */
#define K8_NORMAL_OP 0xff
static u32 k8_nb_flags __initdata = K8_NORMAL_OP;
/*
 * Generic machine check handler.
 *
 * Reads MSR_IA32_MCG_STATUS and prints it together with the current CPU
 * number. When regs is non-NULL and bit 1 of the low status word is set,
 * the saved CS, RIP and RSP are printed as well.
 *
 * NOTE(review): the per-bank loop is cut off right after "for(i=0;i" in
 * this copy; the text that follows it looks like the tail of find_k8_nb()
 * (it returns a struct pci_dev pointer or NULL). The rest of this handler
 * and the head of find_k8_nb() are not visible.
 */
static void generic_machine_check(struct pt_regs * regs, long error_code)
{
	int recover=1;
	u32 alow, ahigh, high, low;
	u32 mcgstl, mcgsth;
	int i;
	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
	/*
	 * NOTE(review): the Intel SDM describes bit 0 of MCG_STATUS as RIPV
	 * (restart IP valid); clearing `recover` when it is set looks
	 * inverted -- confirm against the SDM.
	 */
	if(mcgstl&(1<<0)) /* Recoverable ? */
		recover=0;
	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl);
	if (regs && (mcgstl & 2))
		printk(KERN_EMERG "RIP <%02lx>:%016lx RSP %016lx\n", regs->cs, regs->rip, regs->rsp);
	/*
	 * Damaged span starts here. The surviving condition selects a PCI
	 * device with bus number 0, function 3 and slot 24+cpu.
	 */
	for(i=0;ibus->number==0 && PCI_FUNC(dev->devfn)==3 && PCI_SLOT(dev->devfn) == (24+cpu))
			return dev;
	}
	return NULL;
}
/*
 * String tables used by check_k8_nb() to decode fields of the northbridge
 * error code. Each table is indexed by a masked bit field of the status
 * words; the masks are at the call sites in check_k8_nb().
 */
/* Indexed by (errcode >> 2) & 3. */
static char *transaction[] = { "instruction", "data", "generic", "reserved" };
/* Indexed by errcode & 3. */
static char *cachelevel[] = { "level 0", "level 1", "level 2", "level generic" };
/*
 * Indexed by (errcode >> 4) & 0xf.
 * NOTE(review): the table holds 15 strings but the index can be 15, which
 * would read one element past the end -- looks like one "?" is missing.
 */
static char *memtrans[] = { "generic error", "generic read", "generic write", "data read", "data write", "instruction fetch", "prefetch", "snoop", "?", "?", "?", "?", "?", "?", "?" 
};
/* Indexed by (errcode >> 10) & 3. */
static char *partproc[] = { "local node origin", "local node response", "local node observed", "generic" };
/* Indexed by (errcode >> 9) & 1. */
static char *timeout[] = { "request didn't time out", "request timed out" };
/* Indexed by (errcode >> 2) & 3. */
static char *memoryio[] = { "memory access", "res.", "i/o access", "generic" };
/* Indexed by (statuslow >> 16) & 0xf. */
static char *extendederr[] = { "ecc error", "crc error", "sync error", "mst abort", "tgt abort", "gart error", "rmw error", "wdog error", "chipkill ecc error", "<9>","<10>","<11>","<12>", "<13>","<14>","<15>" };
/*
 * Names for individual bits of the high northbridge status word, indexed
 * by bit number. Entries without an initializer (bits 22-15 and 6-4) are
 * NULL; check_k8_nb() tests highbits[i] before using an entry.
 *
 * NOTE(review): index 1 is labelled "err cpu0" and index 0 "err cpu1";
 * the pairing looks swapped -- confirm against the AMD BKDG.
 */
static char *highbits[32] = {
	[31] = "previous error lost",
	[30] = "error overflow",
	[29] = "error uncorrected",
	[28] = "error enable",
	[27] = "misc error valid",
	[26] = "error address valid",
	[25] = "processor context corrupt",
	[24] = "res24",
	[23] = "res23",
	/* 22-15 ecc syndrome bits */
	[14] = "corrected ecc error",
	[13] = "uncorrected ecc error",
	[12] = "res12",
	[11] = "res11",
	[10] = "res10",
	[9] = "res9",
	[8] = "dram scrub error",
	[7] = "res7",
	/* 6-4 ht link number of error */
	[3] = "res3",
	[2] = "res2",
	[1] = "err cpu0",
	[0] = "err cpu1",
};
/*
 * Decode and print the northbridge error status.
 *
 * Reads two config-space dwords (offsets 0x48 and 0x4c) from the device
 * returned by find_k8_nb(). Returns silently when no device is found or
 * when bit 31 of the high word is clear. Otherwise prints the raw status
 * followed by the decoded error code, ECC syndrome, extended error and
 * link number.
 *
 * NOTE(review): the end of this function is damaged. The final loop over
 * highbits[] breaks off at "(1<" and the text resumes inside what looks
 * like k8_machine_check() (it uses regs and error_code, which are not
 * parameters here).
 */
static void check_k8_nb(void)
{
	struct pci_dev *nb;
	nb = find_k8_nb();
	if (nb == NULL)
		return;
	u32 statuslow, statushigh;
	pci_read_config_dword(nb, 0x48, &statuslow);
	pci_read_config_dword(nb, 0x4c, &statushigh);
	/* Bit 31 of the high word gates all reporting. */
	if (!(statushigh & (1<<31)))
		return;
	printk(KERN_ERR "Northbridge status %08x%08x\n", statushigh,statuslow);
	unsigned short errcode = statuslow & 0xffff;
	/*
	 * NOTE(review): the switch selects on errcode >> 8, so inside
	 * "case 1" bits 15-9 are all zero and bit 8 is set. That makes the
	 * (1<<11) test always false and the (1<<8) test always true: the
	 * "bus error" and "unknown error code" branches look unreachable.
	 * There is also no default case, so any other upper byte prints
	 * nothing here. Confirm the intended decode against the AMD BKDG.
	 */
	switch (errcode >> 8) {
	case 0:
		printk(KERN_ERR " GART TLB error %s %s\n", transaction[(errcode >> 2) & 3], cachelevel[errcode & 3]);
		break;
	case 1:
		if (errcode & (1<<11)) {
			printk(KERN_ERR " bus error %s %s %s %s %s\n", partproc[(errcode >> 10) & 0x3], timeout[(errcode >> 9) & 1], memtrans[(errcode >> 4) & 0xf], memoryio[(errcode >> 2) & 0x3], cachelevel[(errcode & 0x3)]);
		} else if (errcode & (1<<8)) {
			printk(KERN_ERR " memory error %s %s %s\n", memtrans[(errcode >> 4) & 0xf], transaction[(errcode >> 2) & 0x3], cachelevel[(errcode & 0x3)]);
		} else {
			/* The string literal below contains a raw line break in this copy. */
			printk(KERN_ERR " unknown error code 
%x\n", errcode);
		}
		break;
	}
	/*
	 * Printed when either ECC bit (14 or 13) of the high word is set.
	 * NOTE(review): the highbits[] table notes bits 22-15 (eight bits) as
	 * the syndrome, but the mask here is 0x7f (seven bits) -- confirm.
	 */
	if (statushigh & ((1<<14)|(1<<13)))
		printk(KERN_ERR " ECC syndrome bits %x\n", (((statuslow >> 24) & 0xff) << 8) | ((statushigh >> 15) & 0x7f));
	/* errcode is reassigned here, but the printk below recomputes the same expression. */
	errcode = (statuslow >> 16) & 0xf;
	printk(KERN_ERR " extended error %s\n", extendederr[(statuslow >> 16) & 0xf]);
	/* should only print when it was a HyperTransport related error. */
	/*
	 * NOTE(review): the highbits[] table notes bits 6-4 (three bits) as
	 * the link number, but the mask here is 3 (two bits) -- confirm.
	 */
	printk(KERN_ERR " link number %x\n", (statushigh >> 4) & 3);
	int i;
	/* Damaged span: the loop body is cut at "(1<". */
	for (i = 0; i < 32; i++)
		if (highbits[i] && (statushigh & (1<rip, regs->rsp);
others:
	/* Hands the event to the generic handler. */
	generic_machine_check(regs, error_code);
}
/* Timer used to re-run the K8 check periodically. */
static struct timer_list mcheck_timer;
/* Polling period in jiffies; overridden by a numeric mce= option (seconds * HZ). */
int mcheck_interval = 30*HZ;
#ifndef CONFIG_SMP
/*
 * Timer callback (non-SMP build): runs the K8 check with no register
 * context, then re-arms the timer mcheck_interval jiffies from now.
 */
static void mcheck_timer_handler(unsigned long data)
{
	k8_machine_check(NULL,0);
	mcheck_timer.expires = jiffies + mcheck_interval;
	add_timer(&mcheck_timer);
}
#else
/* SMP needs a process context trampoline because smp_call_function cannot be called from interrupt context. */
/* Function passed to smp_call_function() by mcheck_timer_dist(). */
static void mcheck_timer_other(void *data)
{
	k8_machine_check(NULL, 0);
}
/*
 * Task routine: asks smp_call_function() to run mcheck_timer_other(),
 * runs the check locally as well, then re-arms the timer.
 */
static void mcheck_timer_dist(void *data)
{
	smp_call_function(mcheck_timer_other,0,0,0);
	k8_machine_check(NULL, 0);
	mcheck_timer.expires = jiffies + mcheck_interval;
	add_timer(&mcheck_timer);
}
/*
 * Timer callback (SMP build): does no checking itself, only queues
 * mcheck_timer_dist() through schedule_task().
 */
static void mcheck_timer_handler(unsigned long data)
{
	static struct tq_struct mcheck_task = { routine: mcheck_timer_dist };
	schedule_task(&mcheck_task);
}
#endif
/* When non-zero, mcheck_init() skips the K8-specific setup. */
static int nok8 __initdata;
/*
 * K8-specific machine check setup.
 *
 * Returns without doing anything unless the CPU reports both
 * X86_FEATURE_MCE and X86_FEATURE_MCA. Otherwise takes the bank count from
 * the low 8 bits of MSR_IA32_MCG_CAP and installs k8_machine_check as the
 * machine check vector.
 *
 * NOTE(review): the per-bank loop is cut at "(1UL<" in this copy and the
 * text resumes inside what looks like generic_mcheck_init() (it installs
 * generic_machine_check and uses l, h and done, which are not declared
 * here). The rest of this function and the head of the next are missing.
 */
static void __init k8_mcheck_init(struct cpuinfo_x86 *c)
{
	u64 cap;
	int i;
	struct pci_dev *nb;
	if (!test_bit(X86_FEATURE_MCE, &c->x86_capability) || !test_bit(X86_FEATURE_MCA, &c->x86_capability))
		return;
	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap&0xff;
	machine_check_vector = k8_machine_check;
	for (i = 0; i < banks; i++) {
		/* Damaged span starts on the next line. */
		u64 val = ((1UL<x86_capability) )
		return;
	/*
	 * Check for PPro style MCA
	 */
	if( !test_bit(X86_FEATURE_MCA, &c->x86_capability) )
		return;
	/* Ok machine check is available */
	machine_check_vector = generic_machine_check;
	wmb();
	/* The string literal below contains a raw line break in this copy. */
	if(done==0)
		printk(KERN_INFO "Intel machine check architecture 
supported.\n");
	rdmsr(MSR_IA32_MCG_CAP, l, h);
	/* When bit 8 of MCG_CAP is set, write all ones to MSR_IA32_MCG_CTL. */
	if(l&(1<<8))
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
	banks = l&0xff;
	/*
	 * NOTE(review): damaged span. The bank loop is cut after "for(i=0;i"
	 * and the text resumes inside a switch on the CPU vendor, presumably
	 * the body of mcheck_init(), whose head is not visible.
	 *
	 * Visible dispatch: an AMD CPU with c->x86 == 15 and nok8 clear gets
	 * k8_mcheck_init(); every other case, including AMD parts that fail
	 * that test, falls through to generic_mcheck_init().
	 */
	for(i=0;ix86_vendor) {
	case X86_VENDOR_AMD:
		if (c->x86 == 15 && !nok8) {
			k8_mcheck_init(c);
			break;
		}
		/* FALL THROUGH */
	default:
	case X86_VENDOR_INTEL:
		generic_mcheck_init(c);
		break;
	}
}
/*
 * Option handler that sets mce_disabled to 1. The str argument is ignored
 * and the return value is always 0. Where this handler is registered is
 * not visible in this copy.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 0;
}
/*
   mce=off      disable machine check
   mce=nok8     disable k8 specific features
   mce=disable  disable bank NUMBER
   mce=enable   enable bank number
   mce=device   Enable device driver test reporting in NB
   mce=NUMBER   mcheck timer interval number seconds.
   Can be also comma separated in a single mce=
 */
/*
 * Parses the comma-separated option string with strsep(). Visible cases:
 * a token starting with a digit sets mcheck_interval to that many seconds
 * (value * HZ); "off" sets mce_disabled; a token starting with "enable"
 * clears a bit in disabled_banks.
 *
 * NOTE(review): the function is cut off in the middle of the "enable"
 * case in this copy; the remaining options and the return value are not
 * visible.
 */
static int __init mcheck_enable(char *str)
{
	char *p;
	while ((p = strsep(&str,",")) != NULL) {
		if (isdigit(*p))
			mcheck_interval = simple_strtol(p,NULL,0) * HZ;
		else if (!strcmp(p,"off"))
			mce_disabled = 1;
		else if (!strncmp(p,"enable",6))
			disabled_banks &= ~(1<