1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Common corrected MCE threshold handler code: 4 */ 5 #include <linux/interrupt.h> 6 #include <linux/kernel.h> 7 8 #include <asm/irq_vectors.h> 9 #include <asm/traps.h> 10 #include <asm/apic.h> 11 #include <asm/mce.h> 12 #include <asm/trace/irq_vectors.h> 13 14 #include "internal.h" 15 16 static u32 mce_apei_thr_limit; 17 18 void mce_save_apei_thr_limit(u32 thr_limit) 19 { 20 mce_apei_thr_limit = thr_limit; 21 pr_info("HEST corrected error threshold limit: %u\n", thr_limit); 22 } 23 24 u32 mce_get_apei_thr_limit(void) 25 { 26 return mce_apei_thr_limit; 27 } 28 29 static void default_threshold_interrupt(void) 30 { 31 pr_err("Unexpected threshold interrupt at vector %x\n", 32 THRESHOLD_APIC_VECTOR); 33 } 34 35 void (*mce_threshold_vector)(void) = default_threshold_interrupt; 36 37 DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) 38 { 39 trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); 40 inc_irq_stat(irq_threshold_count); 41 mce_threshold_vector(); 42 trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); 43 apic_eoi(); 44 } 45 46 DEFINE_PER_CPU(struct mca_storm_desc, storm_desc); 47 48 void mce_inherit_storm(unsigned int bank) 49 { 50 struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); 51 52 /* 53 * Previous CPU owning this bank had put it into storm mode, 54 * but the precise history of that storm is unknown. Assume 55 * the worst (all recent polls of the bank found a valid error 56 * logged). This will avoid the new owner prematurely declaring 57 * the storm has ended. 58 */ 59 storm->banks[bank].history = ~0ull; 60 storm->banks[bank].timestamp = jiffies; 61 } 62 63 bool mce_get_storm_mode(void) 64 { 65 return __this_cpu_read(storm_desc.poll_mode); 66 } 67 68 void mce_set_storm_mode(bool storm) 69 { 70 __this_cpu_write(storm_desc.poll_mode, storm); 71 } 72 73 static void mce_handle_storm(unsigned int bank, bool on) 74 { 75 switch (boot_cpu_data.x86_vendor) { 76 case X86_VENDOR_INTEL: 77 mce_intel_handle_storm(bank, on); 78 break; 79 case X86_VENDOR_AMD: 80 mce_amd_handle_storm(bank, on); 81 break; 82 } 83 } 84 85 void cmci_storm_begin(unsigned int bank) 86 { 87 struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); 88 89 __set_bit(bank, this_cpu_ptr(mce_poll_banks)); 90 storm->banks[bank].in_storm_mode = true; 91 92 /* 93 * If this is the first bank on this CPU to enter storm mode 94 * start polling. 95 */ 96 if (++storm->stormy_bank_count == 1) 97 mce_timer_kick(true); 98 } 99 100 void cmci_storm_end(unsigned int bank) 101 { 102 struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); 103 104 if (!mce_flags.amd_threshold) 105 __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); 106 storm->banks[bank].history = 0; 107 storm->banks[bank].in_storm_mode = false; 108 109 /* If no banks left in storm mode, stop polling. */ 110 if (!--storm->stormy_bank_count) 111 mce_timer_kick(false); 112 } 113 114 void mce_track_storm(struct mce *mce) 115 { 116 struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); 117 unsigned long now = jiffies, delta; 118 unsigned int shift = 1; 119 u64 history = 0; 120 121 /* No tracking needed for banks that do not support CMCI */ 122 if (storm->banks[mce->bank].poll_only) 123 return; 124 125 /* 126 * When a bank is in storm mode it is polled once per second and 127 * the history mask will record about the last minute of poll results. 128 * If it is not in storm mode, then the bank is only checked when 129 * there is a CMCI interrupt. Check how long it has been since 130 * this bank was last checked, and adjust the amount of "shift" 131 * to apply to history. 132 */ 133 if (!storm->banks[mce->bank].in_storm_mode) { 134 delta = now - storm->banks[mce->bank].timestamp; 135 shift = (delta + HZ) / HZ; 136 } 137 138 /* If it has been a long time since the last poll, clear history. */ 139 if (shift < NUM_HISTORY_BITS) 140 history = storm->banks[mce->bank].history << shift; 141 142 storm->banks[mce->bank].timestamp = now; 143 144 /* History keeps track of corrected errors. VAL=1 && UC=0 */ 145 if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce)) 146 history |= 1; 147 148 storm->banks[mce->bank].history = history; 149 150 if (storm->banks[mce->bank].in_storm_mode) { 151 if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)) 152 return; 153 printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank); 154 mce_handle_storm(mce->bank, false); 155 cmci_storm_end(mce->bank); 156 } else { 157 if (hweight64(history) < STORM_BEGIN_THRESHOLD) 158 return; 159 printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank); 160 mce_handle_storm(mce->bank, true); 161 cmci_storm_begin(mce->bank); 162 } 163 } 164