1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include <linux/fault-inject.h> 7 8 #include "regs/xe_gsc_regs.h" 9 #include "regs/xe_hw_error_regs.h" 10 #include "regs/xe_irq_regs.h" 11 12 #include "xe_device.h" 13 #include "xe_hw_error.h" 14 #include "xe_mmio.h" 15 #include "xe_survivability_mode.h" 16 17 #define HEC_UNCORR_FW_ERR_BITS 4 18 extern struct fault_attr inject_csc_hw_error; 19 20 /* Error categories reported by hardware */ 21 enum hardware_error { 22 HARDWARE_ERROR_CORRECTABLE = 0, 23 HARDWARE_ERROR_NONFATAL = 1, 24 HARDWARE_ERROR_FATAL = 2, 25 HARDWARE_ERROR_MAX, 26 }; 27 28 static const char * const hec_uncorrected_fw_errors[] = { 29 "Fatal", 30 "CSE Disabled", 31 "FD Corruption", 32 "Data Corruption" 33 }; 34 35 static const char *hw_error_to_str(const enum hardware_error hw_err) 36 { 37 switch (hw_err) { 38 case HARDWARE_ERROR_CORRECTABLE: 39 return "CORRECTABLE"; 40 case HARDWARE_ERROR_NONFATAL: 41 return "NONFATAL"; 42 case HARDWARE_ERROR_FATAL: 43 return "FATAL"; 44 default: 45 return "UNKNOWN"; 46 } 47 } 48 49 static bool fault_inject_csc_hw_error(void) 50 { 51 return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1); 52 } 53 54 static void csc_hw_error_work(struct work_struct *work) 55 { 56 struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work); 57 struct xe_device *xe = tile_to_xe(tile); 58 int ret; 59 60 ret = xe_survivability_mode_runtime_enable(xe); 61 if (ret) 62 drm_err(&xe->drm, "Failed to enable runtime survivability mode\n"); 63 } 64 65 static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err) 66 { 67 const char *hw_err_str = hw_error_to_str(hw_err); 68 struct xe_device *xe = tile_to_xe(tile); 69 struct xe_mmio *mmio = &tile->mmio; 70 u32 base, err_bit, err_src; 71 unsigned long fw_err; 72 73 if (xe->info.platform != XE_BATTLEMAGE) 74 return; 75 76 base = BMG_GSC_HECI1_BASE; 77 lockdep_assert_held(&xe->irq.lock); 78 err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base)); 79 if (!err_src) { 80 drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported HEC_ERR_STATUS_%s blank\n", 81 tile->id, hw_err_str); 82 return; 83 } 84 85 if (err_src & UNCORR_FW_REPORTED_ERR) { 86 fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base)); 87 for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) { 88 drm_err_ratelimited(&xe->drm, HW_ERR 89 "%s: HEC Uncorrected FW %s error reported, bit[%d] is set\n", 90 hw_err_str, hec_uncorrected_fw_errors[err_bit], 91 err_bit); 92 93 schedule_work(&tile->csc_hw_error_work); 94 } 95 } 96 97 xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src); 98 } 99 100 static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err) 101 { 102 const char *hw_err_str = hw_error_to_str(hw_err); 103 struct xe_device *xe = tile_to_xe(tile); 104 unsigned long flags; 105 u32 err_src; 106 107 if (xe->info.platform != XE_BATTLEMAGE) 108 return; 109 110 spin_lock_irqsave(&xe->irq.lock, flags); 111 err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err)); 112 if (!err_src) { 113 drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported DEV_ERR_STAT_%s blank!\n", 114 tile->id, hw_err_str); 115 goto unlock; 116 } 117 118 if (err_src & XE_CSC_ERROR) 119 csc_hw_error_handler(tile, hw_err); 120 121 xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src); 122 123 unlock: 124 spin_unlock_irqrestore(&xe->irq.lock, flags); 125 } 126 127 /** 128 * xe_hw_error_irq_handler - irq handling for hw errors 129 * @tile: tile instance 130 * @master_ctl: value read from master interrupt register 131 * 132 * Xe platforms add three error bits to the master interrupt register to support error handling. 133 * These three bits are used to convey the class of error FATAL, NONFATAL, or CORRECTABLE. 134 * To process the interrupt, determine the source of error by reading the Device Error Source 135 * Register that corresponds to the class of error being serviced. 136 */ 137 void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl) 138 { 139 enum hardware_error hw_err; 140 141 if (fault_inject_csc_hw_error()) 142 schedule_work(&tile->csc_hw_error_work); 143 144 for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) 145 if (master_ctl & ERROR_IRQ(hw_err)) 146 hw_error_source_handler(tile, hw_err); 147 } 148 149 /* 150 * Process hardware errors during boot 151 */ 152 static void process_hw_errors(struct xe_device *xe) 153 { 154 struct xe_tile *tile; 155 u32 master_ctl; 156 u8 id; 157 158 for_each_tile(tile, xe, id) { 159 master_ctl = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ); 160 xe_hw_error_irq_handler(tile, master_ctl); 161 xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, master_ctl); 162 } 163 } 164 165 /** 166 * xe_hw_error_init - Initialize hw errors 167 * @xe: xe device instance 168 * 169 * Initialize and check for errors that occurred during boot 170 * prior to driver load 171 */ 172 void xe_hw_error_init(struct xe_device *xe) 173 { 174 struct xe_tile *tile = xe_device_get_root_tile(xe); 175 176 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) 177 return; 178 179 INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work); 180 181 process_hw_errors(xe); 182 } 183