1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include "xe_survivability_mode.h" 7 #include "xe_survivability_mode_types.h" 8 9 #include <linux/kobject.h> 10 #include <linux/pci.h> 11 #include <linux/sysfs.h> 12 13 #include "xe_configfs.h" 14 #include "xe_device.h" 15 #include "xe_gt.h" 16 #include "xe_heci_gsc.h" 17 #include "xe_i2c.h" 18 #include "xe_mmio.h" 19 #include "xe_pcode_api.h" 20 #include "xe_vsec.h" 21 22 #define MAX_SCRATCH_MMIO 8 23 24 /** 25 * DOC: Survivability Mode 26 * 27 * Survivability Mode is a software based workflow for recovering a system in a failed boot state 28 * Here system recoverability is concerned with recovering the firmware responsible for boot. 29 * 30 * Boot Survivability 31 * =================== 32 * 33 * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow 34 * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is 35 * modified such that it enters survivability mode when pcode initialization is incomplete and boot 36 * status denotes a failure. 37 * 38 * Survivability mode can also be entered manually using the survivability mode attribute available 39 * through configfs which is beneficial in several usecases. It can be used to address scenarios 40 * where pcode does not detect failure or for validation purposes. It can also be used in 41 * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node. 42 * 43 * Use below command enable survivability mode manually:: 44 * 45 * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode 46 * 47 * Refer :ref:`xe_configfs` for more details on how to use configfs 48 * 49 * Survivability mode is indicated by the below admin-only readable sysfs which provides additional 50 * debug information:: 51 * 52 * /sys/bus/pci/devices/<device>/survivability_mode 53 * 54 * Capability Information: 55 * Provides boot status 56 * Postcode Information: 57 * Provides information about the failure 58 * Overflow Information 59 * Provides history of previous failures 60 * Auxiliary Information 61 * Certain failures may have information in addition to postcode information 62 * 63 * Runtime Survivability 64 * ===================== 65 * 66 * Certain runtime firmware errors can cause the device to enter a wedged state 67 * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation. 68 * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and 69 * is indicated by the presence of survivability mode sysfs:: 70 * 71 * /sys/bus/pci/devices/<device>/survivability_mode 72 * 73 * Survivability mode sysfs provides information about the type of survivability mode. 74 * 75 * When such errors occur, userspace is notified with the drm device wedged uevent and runtime 76 * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd 77 * to restore device to normal operation. 78 */ 79 80 static u32 aux_history_offset(u32 reg_value) 81 { 82 return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); 83 } 84 85 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, 86 int id, char *name) 87 { 88 strscpy(info[id].name, name, sizeof(info[id].name)); 89 info[id].reg = PCODE_SCRATCH(id).raw; 90 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); 91 } 92 93 static void populate_survivability_info(struct xe_device *xe) 94 { 95 struct xe_survivability *survivability = &xe->survivability; 96 struct xe_survivability_info *info = survivability->info; 97 struct xe_mmio *mmio; 98 u32 id = 0, reg_value; 99 char name[NAME_MAX]; 100 int index; 101 102 mmio = xe_root_tile_mmio(xe); 103 set_survivability_info(mmio, info, id, "Capability Info"); 104 reg_value = info[id].value; 105 106 if (reg_value & HISTORY_TRACKING) { 107 id++; 108 set_survivability_info(mmio, info, id, "Postcode Info"); 109 110 if (reg_value & OVERFLOW_SUPPORT) { 111 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); 112 set_survivability_info(mmio, info, id, "Overflow Info"); 113 } 114 } 115 116 if (reg_value & AUXINFO_SUPPORT) { 117 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); 118 119 for (index = 0; id && reg_value; index++, reg_value = info[id].value, 120 id = aux_history_offset(reg_value)) { 121 snprintf(name, NAME_MAX, "Auxiliary Info %d", index); 122 set_survivability_info(mmio, info, id, name); 123 } 124 } 125 } 126 127 static void log_survivability_info(struct pci_dev *pdev) 128 { 129 struct xe_device *xe = pdev_to_xe_device(pdev); 130 struct xe_survivability *survivability = &xe->survivability; 131 struct xe_survivability_info *info = survivability->info; 132 int id; 133 134 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", 135 survivability->boot_status); 136 for (id = 0; id < MAX_SCRATCH_MMIO; id++) { 137 if (info[id].reg) 138 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, 139 info[id].reg, info[id].value); 140 } 141 } 142 143 static int check_boot_failure(struct xe_device *xe) 144 { 145 struct xe_survivability *survivability = &xe->survivability; 146 147 return survivability->boot_status == NON_CRITICAL_FAILURE || 148 survivability->boot_status == CRITICAL_FAILURE; 149 } 150 151 static ssize_t survivability_mode_show(struct device *dev, 152 struct device_attribute *attr, char *buff) 153 { 154 struct pci_dev *pdev = to_pci_dev(dev); 155 struct xe_device *xe = pdev_to_xe_device(pdev); 156 struct xe_survivability *survivability = &xe->survivability; 157 struct xe_survivability_info *info = survivability->info; 158 int index = 0, count = 0; 159 160 count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n", 161 survivability->type ? "Runtime" : "Boot"); 162 163 if (!check_boot_failure(xe)) 164 return count; 165 166 for (index = 0; index < MAX_SCRATCH_MMIO; index++) { 167 if (info[index].reg) 168 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, 169 info[index].reg, info[index].value); 170 } 171 172 return count; 173 } 174 175 static DEVICE_ATTR_ADMIN_RO(survivability_mode); 176 177 static void xe_survivability_mode_fini(void *arg) 178 { 179 struct xe_device *xe = arg; 180 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 181 struct device *dev = &pdev->dev; 182 183 xe_configfs_clear_survivability_mode(pdev); 184 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); 185 } 186 187 static int create_survivability_sysfs(struct pci_dev *pdev) 188 { 189 struct device *dev = &pdev->dev; 190 struct xe_device *xe = pdev_to_xe_device(pdev); 191 int ret; 192 193 /* create survivability mode sysfs */ 194 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); 195 if (ret) { 196 dev_warn(dev, "Failed to create survivability sysfs files\n"); 197 return ret; 198 } 199 200 ret = devm_add_action_or_reset(xe->drm.dev, 201 xe_survivability_mode_fini, xe); 202 if (ret) 203 return ret; 204 205 return 0; 206 } 207 208 static int enable_boot_survivability_mode(struct pci_dev *pdev) 209 { 210 struct device *dev = &pdev->dev; 211 struct xe_device *xe = pdev_to_xe_device(pdev); 212 struct xe_survivability *survivability = &xe->survivability; 213 int ret = 0; 214 215 ret = create_survivability_sysfs(pdev); 216 if (ret) 217 return ret; 218 219 /* Make sure xe_heci_gsc_init() knows about survivability mode */ 220 survivability->mode = true; 221 222 ret = xe_heci_gsc_init(xe); 223 if (ret) 224 goto err; 225 226 xe_vsec_init(xe); 227 228 ret = xe_i2c_probe(xe); 229 if (ret) 230 goto err; 231 232 dev_err(dev, "In Survivability Mode\n"); 233 234 return 0; 235 236 err: 237 survivability->mode = false; 238 return ret; 239 } 240 241 static int init_survivability_mode(struct xe_device *xe) 242 { 243 struct xe_survivability *survivability = &xe->survivability; 244 struct xe_survivability_info *info; 245 246 survivability->size = MAX_SCRATCH_MMIO; 247 248 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), 249 GFP_KERNEL); 250 if (!info) 251 return -ENOMEM; 252 253 survivability->info = info; 254 255 populate_survivability_info(xe); 256 257 return 0; 258 } 259 260 /** 261 * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled 262 * @xe: xe device instance 263 * 264 * Returns true if in boot survivability mode of type, else false 265 */ 266 bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe) 267 { 268 struct xe_survivability *survivability = &xe->survivability; 269 270 return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT; 271 } 272 273 /** 274 * xe_survivability_mode_is_requested - check if it's possible to enable survivability 275 * mode that was requested by firmware or userspace 276 * @xe: xe device instance 277 * 278 * This function reads configfs and boot status from Pcode. 279 * 280 * Return: true if platform support is available and boot status indicates 281 * failure or if survivability mode is requested, false otherwise. 282 */ 283 bool xe_survivability_mode_is_requested(struct xe_device *xe) 284 { 285 struct xe_survivability *survivability = &xe->survivability; 286 struct xe_mmio *mmio = xe_root_tile_mmio(xe); 287 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 288 u32 data; 289 bool survivability_mode; 290 291 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) 292 return false; 293 294 survivability_mode = xe_configfs_get_survivability_mode(pdev); 295 296 if (xe->info.platform < XE_BATTLEMAGE) { 297 if (survivability_mode) { 298 dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n"); 299 xe_configfs_clear_survivability_mode(pdev); 300 } 301 return false; 302 } 303 304 /* Enable survivability mode if set via configfs */ 305 if (survivability_mode) 306 return true; 307 308 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); 309 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); 310 311 return check_boot_failure(xe); 312 } 313 314 /** 315 * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode 316 * @xe: xe device instance 317 * 318 * Initialize survivability information and enable runtime survivability mode. 319 * Runtime survivability mode is enabled when certain errors cause the device to be 320 * in non-recoverable state. The device is declared wedged with the appropriate 321 * recovery method and survivability mode sysfs exposed to userspace 322 * 323 * Return: 0 if runtime survivability mode is enabled, negative error code otherwise. 324 */ 325 int xe_survivability_mode_runtime_enable(struct xe_device *xe) 326 { 327 struct xe_survivability *survivability = &xe->survivability; 328 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 329 int ret; 330 331 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) { 332 dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n"); 333 return -EINVAL; 334 } 335 336 ret = init_survivability_mode(xe); 337 if (ret) 338 return ret; 339 340 ret = create_survivability_sysfs(pdev); 341 if (ret) 342 dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n"); 343 344 survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME; 345 dev_err(&pdev->dev, "Runtime Survivability mode enabled\n"); 346 347 xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR); 348 xe_device_declare_wedged(xe); 349 dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n"); 350 351 return 0; 352 } 353 354 /** 355 * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode 356 * @xe: xe device instance 357 * 358 * Initialize survivability information and enable boot survivability mode 359 * 360 * Return: 0 if boot survivability mode is enabled or not requested, negative error 361 * code otherwise. 362 */ 363 int xe_survivability_mode_boot_enable(struct xe_device *xe) 364 { 365 struct xe_survivability *survivability = &xe->survivability; 366 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 367 int ret; 368 369 if (!xe_survivability_mode_is_requested(xe)) 370 return 0; 371 372 ret = init_survivability_mode(xe); 373 if (ret) 374 return ret; 375 376 /* Log breadcrumbs but do not enter survivability mode for Critical boot errors */ 377 if (survivability->boot_status == CRITICAL_FAILURE) { 378 log_survivability_info(pdev); 379 return -ENXIO; 380 } 381 382 survivability->type = XE_SURVIVABILITY_TYPE_BOOT; 383 384 return enable_boot_survivability_mode(pdev); 385 } 386