1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include "xe_survivability_mode.h" 7 #include "xe_survivability_mode_types.h" 8 9 #include <linux/kobject.h> 10 #include <linux/pci.h> 11 #include <linux/sysfs.h> 12 13 #include "xe_configfs.h" 14 #include "xe_device.h" 15 #include "xe_gt.h" 16 #include "xe_heci_gsc.h" 17 #include "xe_mmio.h" 18 #include "xe_pcode_api.h" 19 #include "xe_vsec.h" 20 21 #define MAX_SCRATCH_MMIO 8 22 23 /** 24 * DOC: Xe Boot Survivability 25 * 26 * Boot Survivability is a software based workflow for recovering a system in a failed boot state 27 * Here system recoverability is concerned with recovering the firmware responsible for boot. 28 * 29 * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware 30 * to be flashed through mei and collect telemetry. The driver's probe flow is modified 31 * such that it enters survivability mode when pcode initialization is incomplete and boot status 32 * denotes a failure. 33 * 34 * Survivability mode can also be entered manually using the survivability mode attribute available 35 * through configfs which is beneficial in several usecases. It can be used to address scenarios 36 * where pcode does not detect failure or for validation purposes. It can also be used in 37 * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node. 38 * 39 * Use below command enable survivability mode manually:: 40 * 41 * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode 42 * 43 * Refer :ref:`xe_configfs` for more details on how to use configfs 44 * 45 * Survivability mode is indicated by the below admin-only readable sysfs which provides additional 46 * debug information:: 47 * 48 * /sys/bus/pci/devices/<device>/surivability_mode 49 * 50 * Capability Information: 51 * Provides boot status 52 * Postcode Information: 53 * Provides information about the failure 54 * Overflow Information 55 * Provides history of previous failures 56 * Auxiliary Information 57 * Certain failures may have information in addition to postcode information 58 */ 59 60 static u32 aux_history_offset(u32 reg_value) 61 { 62 return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); 63 } 64 65 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, 66 int id, char *name) 67 { 68 strscpy(info[id].name, name, sizeof(info[id].name)); 69 info[id].reg = PCODE_SCRATCH(id).raw; 70 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); 71 } 72 73 static void populate_survivability_info(struct xe_device *xe) 74 { 75 struct xe_survivability *survivability = &xe->survivability; 76 struct xe_survivability_info *info = survivability->info; 77 struct xe_mmio *mmio; 78 u32 id = 0, reg_value; 79 char name[NAME_MAX]; 80 int index; 81 82 mmio = xe_root_tile_mmio(xe); 83 set_survivability_info(mmio, info, id, "Capability Info"); 84 reg_value = info[id].value; 85 86 if (reg_value & HISTORY_TRACKING) { 87 id++; 88 set_survivability_info(mmio, info, id, "Postcode Info"); 89 90 if (reg_value & OVERFLOW_SUPPORT) { 91 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); 92 set_survivability_info(mmio, info, id, "Overflow Info"); 93 } 94 } 95 96 if (reg_value & AUXINFO_SUPPORT) { 97 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); 98 99 for (index = 0; id && reg_value; index++, reg_value = info[id].value, 100 id = aux_history_offset(reg_value)) { 101 snprintf(name, NAME_MAX, "Auxiliary Info %d", index); 102 set_survivability_info(mmio, info, id, name); 103 } 104 } 105 } 106 107 static void log_survivability_info(struct pci_dev *pdev) 108 { 109 struct xe_device *xe = pdev_to_xe_device(pdev); 110 struct xe_survivability *survivability = &xe->survivability; 111 struct xe_survivability_info *info = survivability->info; 112 int id; 113 114 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", 115 survivability->boot_status); 116 for (id = 0; id < MAX_SCRATCH_MMIO; id++) { 117 if (info[id].reg) 118 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, 119 info[id].reg, info[id].value); 120 } 121 } 122 123 static ssize_t survivability_mode_show(struct device *dev, 124 struct device_attribute *attr, char *buff) 125 { 126 struct pci_dev *pdev = to_pci_dev(dev); 127 struct xe_device *xe = pdev_to_xe_device(pdev); 128 struct xe_survivability *survivability = &xe->survivability; 129 struct xe_survivability_info *info = survivability->info; 130 int index = 0, count = 0; 131 132 for (index = 0; index < MAX_SCRATCH_MMIO; index++) { 133 if (info[index].reg) 134 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, 135 info[index].reg, info[index].value); 136 } 137 138 return count; 139 } 140 141 static DEVICE_ATTR_ADMIN_RO(survivability_mode); 142 143 static void xe_survivability_mode_fini(void *arg) 144 { 145 struct xe_device *xe = arg; 146 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 147 struct device *dev = &pdev->dev; 148 149 xe_configfs_clear_survivability_mode(pdev); 150 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); 151 } 152 153 static int enable_survivability_mode(struct pci_dev *pdev) 154 { 155 struct device *dev = &pdev->dev; 156 struct xe_device *xe = pdev_to_xe_device(pdev); 157 struct xe_survivability *survivability = &xe->survivability; 158 int ret = 0; 159 160 /* create survivability mode sysfs */ 161 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); 162 if (ret) { 163 dev_warn(dev, "Failed to create survivability sysfs files\n"); 164 return ret; 165 } 166 167 ret = devm_add_action_or_reset(xe->drm.dev, 168 xe_survivability_mode_fini, xe); 169 if (ret) 170 return ret; 171 172 /* Make sure xe_heci_gsc_init() knows about survivability mode */ 173 survivability->mode = true; 174 175 ret = xe_heci_gsc_init(xe); 176 if (ret) { 177 /* 178 * But if it fails, device can't enter survivability 179 * so move it back for correct error handling 180 */ 181 survivability->mode = false; 182 return ret; 183 } 184 185 xe_vsec_init(xe); 186 187 dev_err(dev, "In Survivability Mode\n"); 188 189 return 0; 190 } 191 192 /** 193 * xe_survivability_mode_is_enabled - check if survivability mode is enabled 194 * @xe: xe device instance 195 * 196 * Returns true if in survivability mode, false otherwise 197 */ 198 bool xe_survivability_mode_is_enabled(struct xe_device *xe) 199 { 200 return xe->survivability.mode; 201 } 202 203 /** 204 * xe_survivability_mode_is_requested - check if it's possible to enable survivability 205 * mode that was requested by firmware or userspace 206 * @xe: xe device instance 207 * 208 * This function reads configfs and boot status from Pcode. 209 * 210 * Return: true if platform support is available and boot status indicates 211 * failure or if survivability mode is requested, false otherwise. 212 */ 213 bool xe_survivability_mode_is_requested(struct xe_device *xe) 214 { 215 struct xe_survivability *survivability = &xe->survivability; 216 struct xe_mmio *mmio = xe_root_tile_mmio(xe); 217 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 218 u32 data; 219 bool survivability_mode; 220 221 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) 222 return false; 223 224 survivability_mode = xe_configfs_get_survivability_mode(pdev); 225 226 if (xe->info.platform < XE_BATTLEMAGE) { 227 if (survivability_mode) { 228 dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n"); 229 xe_configfs_clear_survivability_mode(pdev); 230 } 231 return false; 232 } 233 234 /* Enable survivability mode if set via configfs */ 235 if (survivability_mode) 236 return true; 237 238 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); 239 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); 240 241 return survivability->boot_status == NON_CRITICAL_FAILURE || 242 survivability->boot_status == CRITICAL_FAILURE; 243 } 244 245 /** 246 * xe_survivability_mode_enable - Initialize and enable the survivability mode 247 * @xe: xe device instance 248 * 249 * Initialize survivability information and enable survivability mode 250 * 251 * Return: 0 if survivability mode is enabled or not requested; negative error 252 * code otherwise. 253 */ 254 int xe_survivability_mode_enable(struct xe_device *xe) 255 { 256 struct xe_survivability *survivability = &xe->survivability; 257 struct xe_survivability_info *info; 258 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 259 260 if (!xe_survivability_mode_is_requested(xe)) 261 return 0; 262 263 survivability->size = MAX_SCRATCH_MMIO; 264 265 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), 266 GFP_KERNEL); 267 if (!info) 268 return -ENOMEM; 269 270 survivability->info = info; 271 272 populate_survivability_info(xe); 273 274 /* Only log debug information and exit if it is a critical failure */ 275 if (survivability->boot_status == CRITICAL_FAILURE) { 276 log_survivability_info(pdev); 277 return -ENXIO; 278 } 279 280 return enable_survivability_mode(pdev); 281 } 282