1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include "xe_survivability_mode.h" 7 #include "xe_survivability_mode_types.h" 8 9 #include <linux/kobject.h> 10 #include <linux/pci.h> 11 #include <linux/sysfs.h> 12 13 #include "xe_configfs.h" 14 #include "xe_device.h" 15 #include "xe_heci_gsc.h" 16 #include "xe_i2c.h" 17 #include "xe_mmio.h" 18 #include "xe_nvm.h" 19 #include "xe_pcode_api.h" 20 #include "xe_vsec.h" 21 22 /** 23 * DOC: Survivability Mode 24 * 25 * Survivability Mode is a software based workflow for recovering a system in a failed boot state 26 * Here system recoverability is concerned with recovering the firmware responsible for boot. 27 * 28 * Boot Survivability 29 * =================== 30 * 31 * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow 32 * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is 33 * modified such that it enters survivability mode when pcode initialization is incomplete and boot 34 * status denotes a failure. 35 * 36 * Survivability mode can also be entered manually using the survivability mode attribute available 37 * through configfs which is beneficial in several usecases. It can be used to address scenarios 38 * where pcode does not detect failure or for validation purposes. It can also be used in 39 * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node. 40 * 41 * Use below command enable survivability mode manually:: 42 * 43 * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode 44 * 45 * It is the responsibility of the user to clear the mode once firmware flash is complete. 46 * 47 * Refer :ref:`xe_configfs` for more details on how to use configfs 48 * 49 * Survivability mode is indicated by the below admin-only readable sysfs entry. It 50 * provides information about the type of survivability mode (Boot/Runtime). 51 * 52 * .. code-block:: shell 53 * 54 * # cat /sys/bus/pci/devices/<device>/survivability_mode 55 * Boot 56 * 57 * 58 * Any additional debug information if present will be visible under the directory 59 * ``survivability_info``:: 60 * 61 * /sys/bus/pci/devices/<device>/survivability_info/ 62 * ├── aux_info0 63 * ├── aux_info1 64 * ├── aux_info2 65 * ├── aux_info3 66 * ├── aux_info4 67 * ├── capability_info 68 * ├── fdo_mode 69 * ├── postcode_trace 70 * └── postcode_trace_overflow 71 * 72 * This directory has the following attributes 73 * 74 * - ``capability_info`` : Indicates Boot status and support for additional information 75 * 76 * - ``postcode_trace``, ``postcode_trace_overflow`` : Each postcode is a 8bit value and 77 * represents a boot failure event. When a new failure event is logged by PCODE the 78 * existing postcodes are shifted left. These entries provide a history of 8 postcodes. 79 * 80 * - ``aux_info<n>`` : Some failures have additional debug information 81 * 82 * - ``fdo_mode`` : To allow recovery in scenarios where MEI itself fails, a new SPI Flash 83 * Descriptor Override (FDO) mode is added in v2 survivability breadcrumbs. This mode is enabled 84 * by PCODE and provides the ability to directly update the firmware via SPI Driver without 85 * any dependency on MEI. Xe KMD initializes the nvm aux driver if FDO mode is enabled. 86 * 87 * Runtime Survivability 88 * ===================== 89 * 90 * Certain runtime firmware errors can cause the device to enter a wedged state 91 * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation. 92 * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and 93 * is indicated by the presence of survivability mode sysfs. 94 * Survivability mode sysfs provides information about the type of survivability mode. 95 * 96 * .. code-block:: shell 97 * 98 * # cat /sys/bus/pci/devices/<device>/survivability_mode 99 * Runtime 100 * 101 * When such errors occur, userspace is notified with the drm device wedged uevent and runtime 102 * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd 103 * to restore device to normal operation. 104 */ 105 106 static const char * const reg_map[] = { 107 [CAPABILITY_INFO] = "Capability Info", 108 [POSTCODE_TRACE] = "Postcode trace", 109 [POSTCODE_TRACE_OVERFLOW] = "Postcode trace overflow", 110 [AUX_INFO0] = "Auxiliary Info 0", 111 [AUX_INFO1] = "Auxiliary Info 1", 112 [AUX_INFO2] = "Auxiliary Info 2", 113 [AUX_INFO3] = "Auxiliary Info 3", 114 [AUX_INFO4] = "Auxiliary Info 4", 115 }; 116 117 #define FDO_INFO (MAX_SCRATCH_REG + 1) 118 119 struct xe_survivability_attribute { 120 struct device_attribute attr; 121 u8 index; 122 }; 123 124 static struct 125 xe_survivability_attribute *dev_attr_to_survivability_attr(struct device_attribute *attr) 126 { 127 return container_of(attr, struct xe_survivability_attribute, attr); 128 } 129 130 static void set_survivability_info(struct xe_mmio *mmio, u32 *info, int id) 131 { 132 info[id] = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); 133 } 134 135 static void populate_survivability_info(struct xe_device *xe) 136 { 137 struct xe_survivability *survivability = &xe->survivability; 138 u32 *info = survivability->info; 139 struct xe_mmio *mmio; 140 u32 id = 0, reg_value; 141 142 mmio = xe_root_tile_mmio(xe); 143 set_survivability_info(mmio, info, CAPABILITY_INFO); 144 reg_value = info[CAPABILITY_INFO]; 145 146 survivability->version = REG_FIELD_GET(BREADCRUMB_VERSION, reg_value); 147 /* FDO mode is exposed only from version 2 */ 148 if (survivability->version >= 2) 149 survivability->fdo_mode = REG_FIELD_GET(FDO_MODE, reg_value); 150 151 if (reg_value & HISTORY_TRACKING) { 152 set_survivability_info(mmio, info, POSTCODE_TRACE); 153 154 if (reg_value & OVERFLOW_SUPPORT) 155 set_survivability_info(mmio, info, POSTCODE_TRACE_OVERFLOW); 156 } 157 158 /* Traverse the linked list of aux info registers */ 159 if (reg_value & AUXINFO_SUPPORT) { 160 for (id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); 161 id >= AUX_INFO0 && id < MAX_SCRATCH_REG; 162 id = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, info[id])) 163 set_survivability_info(mmio, info, id); 164 } 165 } 166 167 static void log_survivability_info(struct pci_dev *pdev) 168 { 169 struct xe_device *xe = pdev_to_xe_device(pdev); 170 struct xe_survivability *survivability = &xe->survivability; 171 u32 *info = survivability->info; 172 int id; 173 174 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", 175 survivability->boot_status); 176 for (id = 0; id < MAX_SCRATCH_REG; id++) { 177 if (info[id]) 178 dev_info(&pdev->dev, "%s: 0x%x\n", reg_map[id], info[id]); 179 } 180 } 181 182 static int check_boot_failure(struct xe_device *xe) 183 { 184 struct xe_survivability *survivability = &xe->survivability; 185 186 return survivability->boot_status == NON_CRITICAL_FAILURE || 187 survivability->boot_status == CRITICAL_FAILURE; 188 } 189 190 static ssize_t survivability_mode_show(struct device *dev, 191 struct device_attribute *attr, char *buff) 192 { 193 struct pci_dev *pdev = to_pci_dev(dev); 194 struct xe_device *xe = pdev_to_xe_device(pdev); 195 struct xe_survivability *survivability = &xe->survivability; 196 197 return sysfs_emit(buff, "%s\n", survivability->type ? "Runtime" : "Boot"); 198 } 199 200 static DEVICE_ATTR_ADMIN_RO(survivability_mode); 201 202 static ssize_t survivability_info_show(struct device *dev, 203 struct device_attribute *attr, char *buff) 204 { 205 struct xe_survivability_attribute *sa = dev_attr_to_survivability_attr(attr); 206 struct pci_dev *pdev = to_pci_dev(dev); 207 struct xe_device *xe = pdev_to_xe_device(pdev); 208 struct xe_survivability *survivability = &xe->survivability; 209 u32 *info = survivability->info; 210 211 if (sa->index == FDO_INFO) 212 return sysfs_emit(buff, "%s\n", str_enabled_disabled(survivability->fdo_mode)); 213 214 return sysfs_emit(buff, "0x%x\n", info[sa->index]); 215 } 216 217 #define SURVIVABILITY_ATTR_RO(name, _index) \ 218 struct xe_survivability_attribute attr_##name = { \ 219 .attr = __ATTR(name, 0400, survivability_info_show, NULL), \ 220 .index = _index, \ 221 } 222 223 static SURVIVABILITY_ATTR_RO(capability_info, CAPABILITY_INFO); 224 static SURVIVABILITY_ATTR_RO(postcode_trace, POSTCODE_TRACE); 225 static SURVIVABILITY_ATTR_RO(postcode_trace_overflow, POSTCODE_TRACE_OVERFLOW); 226 static SURVIVABILITY_ATTR_RO(aux_info0, AUX_INFO0); 227 static SURVIVABILITY_ATTR_RO(aux_info1, AUX_INFO1); 228 static SURVIVABILITY_ATTR_RO(aux_info2, AUX_INFO2); 229 static SURVIVABILITY_ATTR_RO(aux_info3, AUX_INFO3); 230 static SURVIVABILITY_ATTR_RO(aux_info4, AUX_INFO4); 231 static SURVIVABILITY_ATTR_RO(fdo_mode, FDO_INFO); 232 233 static void xe_survivability_mode_fini(void *arg) 234 { 235 struct xe_device *xe = arg; 236 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 237 struct device *dev = &pdev->dev; 238 239 device_remove_file(dev, &dev_attr_survivability_mode); 240 } 241 242 static umode_t survivability_info_attrs_visible(struct kobject *kobj, struct attribute *attr, 243 int idx) 244 { 245 struct xe_device *xe = kdev_to_xe_device(kobj_to_dev(kobj)); 246 struct xe_survivability *survivability = &xe->survivability; 247 u32 *info = survivability->info; 248 249 /* 250 * Last index in survivability_info_attrs is fdo mode and is applicable only in 251 * version 2 of survivability mode 252 */ 253 if (idx == MAX_SCRATCH_REG && survivability->version >= 2) 254 return 0400; 255 256 if (idx < MAX_SCRATCH_REG && info[idx]) 257 return 0400; 258 259 return 0; 260 } 261 262 /* Attributes are ordered according to enum scratch_reg */ 263 static struct attribute *survivability_info_attrs[] = { 264 &attr_capability_info.attr.attr, 265 &attr_postcode_trace.attr.attr, 266 &attr_postcode_trace_overflow.attr.attr, 267 &attr_aux_info0.attr.attr, 268 &attr_aux_info1.attr.attr, 269 &attr_aux_info2.attr.attr, 270 &attr_aux_info3.attr.attr, 271 &attr_aux_info4.attr.attr, 272 &attr_fdo_mode.attr.attr, 273 NULL, 274 }; 275 276 static const struct attribute_group survivability_info_group = { 277 .name = "survivability_info", 278 .attrs = survivability_info_attrs, 279 .is_visible = survivability_info_attrs_visible, 280 }; 281 282 static int create_survivability_sysfs(struct pci_dev *pdev) 283 { 284 struct device *dev = &pdev->dev; 285 struct xe_device *xe = pdev_to_xe_device(pdev); 286 int ret; 287 288 ret = device_create_file(dev, &dev_attr_survivability_mode); 289 if (ret) { 290 dev_warn(dev, "Failed to create survivability sysfs files\n"); 291 return ret; 292 } 293 294 ret = devm_add_action_or_reset(xe->drm.dev, 295 xe_survivability_mode_fini, xe); 296 if (ret) 297 return ret; 298 299 if (check_boot_failure(xe)) { 300 ret = devm_device_add_group(dev, &survivability_info_group); 301 if (ret) 302 return ret; 303 } 304 305 return 0; 306 } 307 308 static int enable_boot_survivability_mode(struct pci_dev *pdev) 309 { 310 struct device *dev = &pdev->dev; 311 struct xe_device *xe = pdev_to_xe_device(pdev); 312 struct xe_survivability *survivability = &xe->survivability; 313 int ret = 0; 314 315 ret = create_survivability_sysfs(pdev); 316 if (ret) 317 return ret; 318 319 /* Make sure xe_heci_gsc_init() and xe_i2c_probe() are aware of survivability */ 320 survivability->mode = true; 321 322 xe_heci_gsc_init(xe); 323 324 xe_vsec_init(xe); 325 326 if (survivability->fdo_mode) { 327 ret = xe_nvm_init(xe); 328 if (ret) 329 goto err; 330 } 331 332 ret = xe_i2c_probe(xe); 333 if (ret) 334 goto err; 335 336 dev_err(dev, "In Survivability Mode\n"); 337 338 return 0; 339 340 err: 341 dev_err(dev, "Failed to enable Survivability Mode\n"); 342 survivability->mode = false; 343 return ret; 344 } 345 346 /** 347 * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled 348 * @xe: xe device instance 349 * 350 * Returns true if in boot survivability mode of type, else false 351 */ 352 bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe) 353 { 354 struct xe_survivability *survivability = &xe->survivability; 355 356 return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT; 357 } 358 359 /** 360 * xe_survivability_mode_is_requested - check if it's possible to enable survivability 361 * mode that was requested by firmware or userspace 362 * @xe: xe device instance 363 * 364 * This function reads configfs and boot status from Pcode. 365 * 366 * Return: true if platform support is available and boot status indicates 367 * failure or if survivability mode is requested, false otherwise. 368 */ 369 bool xe_survivability_mode_is_requested(struct xe_device *xe) 370 { 371 struct xe_survivability *survivability = &xe->survivability; 372 struct xe_mmio *mmio = xe_root_tile_mmio(xe); 373 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 374 u32 data; 375 bool survivability_mode; 376 377 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) 378 return false; 379 380 survivability_mode = xe_configfs_get_survivability_mode(pdev); 381 /* Enable survivability mode if set via configfs */ 382 if (survivability_mode) 383 return true; 384 385 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); 386 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); 387 388 return check_boot_failure(xe); 389 } 390 391 /** 392 * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode 393 * @xe: xe device instance 394 * 395 * Initialize survivability information and enable runtime survivability mode. 396 * Runtime survivability mode is enabled when certain errors cause the device to be 397 * in non-recoverable state. The device is declared wedged with the appropriate 398 * recovery method and survivability mode sysfs exposed to userspace 399 * 400 * Return: 0 if runtime survivability mode is enabled, negative error code otherwise. 401 */ 402 int xe_survivability_mode_runtime_enable(struct xe_device *xe) 403 { 404 struct xe_survivability *survivability = &xe->survivability; 405 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 406 int ret; 407 408 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) { 409 dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n"); 410 return -EINVAL; 411 } 412 413 populate_survivability_info(xe); 414 415 ret = create_survivability_sysfs(pdev); 416 if (ret) 417 dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n"); 418 419 survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME; 420 dev_err(&pdev->dev, "Runtime Survivability mode enabled\n"); 421 422 xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR); 423 xe_device_declare_wedged(xe); 424 dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n"); 425 426 return 0; 427 } 428 429 /** 430 * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode 431 * @xe: xe device instance 432 * 433 * Initialize survivability information and enable boot survivability mode 434 * 435 * Return: 0 if boot survivability mode is enabled or not requested, negative error 436 * code otherwise. 437 */ 438 int xe_survivability_mode_boot_enable(struct xe_device *xe) 439 { 440 struct xe_survivability *survivability = &xe->survivability; 441 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 442 443 if (!xe_survivability_mode_is_requested(xe)) 444 return 0; 445 446 populate_survivability_info(xe); 447 448 /* 449 * v2 supports survivability mode for critical errors 450 */ 451 if (survivability->version < 2 && survivability->boot_status == CRITICAL_FAILURE) { 452 log_survivability_info(pdev); 453 return -ENXIO; 454 } 455 456 survivability->type = XE_SURVIVABILITY_TYPE_BOOT; 457 458 return enable_boot_survivability_mode(pdev); 459 } 460