1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include "xe_survivability_mode.h" 7 #include "xe_survivability_mode_types.h" 8 9 #include <linux/kobject.h> 10 #include <linux/pci.h> 11 #include <linux/sysfs.h> 12 13 #include "xe_configfs.h" 14 #include "xe_device.h" 15 #include "xe_gt.h" 16 #include "xe_heci_gsc.h" 17 #include "xe_i2c.h" 18 #include "xe_mmio.h" 19 #include "xe_pcode_api.h" 20 #include "xe_vsec.h" 21 22 #define MAX_SCRATCH_MMIO 8 23 24 /** 25 * DOC: Survivability Mode 26 * 27 * Survivability Mode is a software based workflow for recovering a system in a failed boot state 28 * Here system recoverability is concerned with recovering the firmware responsible for boot. 29 * 30 * Boot Survivability 31 * =================== 32 * 33 * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow 34 * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is 35 * modified such that it enters survivability mode when pcode initialization is incomplete and boot 36 * status denotes a failure. 37 * 38 * Survivability mode can also be entered manually using the survivability mode attribute available 39 * through configfs which is beneficial in several usecases. It can be used to address scenarios 40 * where pcode does not detect failure or for validation purposes. It can also be used in 41 * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node. 42 * 43 * Use below command enable survivability mode manually:: 44 * 45 * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode 46 * 47 * It is the responsibility of the user to clear the mode once firmware flash is complete. 48 * 49 * Refer :ref:`xe_configfs` for more details on how to use configfs 50 * 51 * Survivability mode is indicated by the below admin-only readable sysfs which provides additional 52 * debug information:: 53 * 54 * /sys/bus/pci/devices/<device>/survivability_mode 55 * 56 * Capability Information: 57 * Provides boot status 58 * Postcode Information: 59 * Provides information about the failure 60 * Overflow Information 61 * Provides history of previous failures 62 * Auxiliary Information 63 * Certain failures may have information in addition to postcode information 64 * 65 * Runtime Survivability 66 * ===================== 67 * 68 * Certain runtime firmware errors can cause the device to enter a wedged state 69 * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation. 70 * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and 71 * is indicated by the presence of survivability mode sysfs:: 72 * 73 * /sys/bus/pci/devices/<device>/survivability_mode 74 * 75 * Survivability mode sysfs provides information about the type of survivability mode. 76 * 77 * When such errors occur, userspace is notified with the drm device wedged uevent and runtime 78 * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd 79 * to restore device to normal operation. 80 */ 81 82 static u32 aux_history_offset(u32 reg_value) 83 { 84 return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); 85 } 86 87 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, 88 int id, char *name) 89 { 90 strscpy(info[id].name, name, sizeof(info[id].name)); 91 info[id].reg = PCODE_SCRATCH(id).raw; 92 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); 93 } 94 95 static void populate_survivability_info(struct xe_device *xe) 96 { 97 struct xe_survivability *survivability = &xe->survivability; 98 struct xe_survivability_info *info = survivability->info; 99 struct xe_mmio *mmio; 100 u32 id = 0, reg_value; 101 char name[NAME_MAX]; 102 int index; 103 104 mmio = xe_root_tile_mmio(xe); 105 set_survivability_info(mmio, info, id, "Capability Info"); 106 reg_value = info[id].value; 107 108 if (reg_value & HISTORY_TRACKING) { 109 id++; 110 set_survivability_info(mmio, info, id, "Postcode Info"); 111 112 if (reg_value & OVERFLOW_SUPPORT) { 113 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); 114 set_survivability_info(mmio, info, id, "Overflow Info"); 115 } 116 } 117 118 if (reg_value & AUXINFO_SUPPORT) { 119 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); 120 121 for (index = 0; id && reg_value; index++, reg_value = info[id].value, 122 id = aux_history_offset(reg_value)) { 123 snprintf(name, NAME_MAX, "Auxiliary Info %d", index); 124 set_survivability_info(mmio, info, id, name); 125 } 126 } 127 } 128 129 static void log_survivability_info(struct pci_dev *pdev) 130 { 131 struct xe_device *xe = pdev_to_xe_device(pdev); 132 struct xe_survivability *survivability = &xe->survivability; 133 struct xe_survivability_info *info = survivability->info; 134 int id; 135 136 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", 137 survivability->boot_status); 138 for (id = 0; id < MAX_SCRATCH_MMIO; id++) { 139 if (info[id].reg) 140 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, 141 info[id].reg, info[id].value); 142 } 143 } 144 145 static int check_boot_failure(struct xe_device *xe) 146 { 147 struct xe_survivability *survivability = &xe->survivability; 148 149 return survivability->boot_status == NON_CRITICAL_FAILURE || 150 survivability->boot_status == CRITICAL_FAILURE; 151 } 152 153 static ssize_t survivability_mode_show(struct device *dev, 154 struct device_attribute *attr, char *buff) 155 { 156 struct pci_dev *pdev = to_pci_dev(dev); 157 struct xe_device *xe = pdev_to_xe_device(pdev); 158 struct xe_survivability *survivability = &xe->survivability; 159 struct xe_survivability_info *info = survivability->info; 160 int index = 0, count = 0; 161 162 count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n", 163 survivability->type ? "Runtime" : "Boot"); 164 165 if (!check_boot_failure(xe)) 166 return count; 167 168 for (index = 0; index < MAX_SCRATCH_MMIO; index++) { 169 if (info[index].reg) 170 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, 171 info[index].reg, info[index].value); 172 } 173 174 return count; 175 } 176 177 static DEVICE_ATTR_ADMIN_RO(survivability_mode); 178 179 static void xe_survivability_mode_fini(void *arg) 180 { 181 struct xe_device *xe = arg; 182 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 183 struct device *dev = &pdev->dev; 184 185 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); 186 } 187 188 static int create_survivability_sysfs(struct pci_dev *pdev) 189 { 190 struct device *dev = &pdev->dev; 191 struct xe_device *xe = pdev_to_xe_device(pdev); 192 int ret; 193 194 /* create survivability mode sysfs */ 195 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); 196 if (ret) { 197 dev_warn(dev, "Failed to create survivability sysfs files\n"); 198 return ret; 199 } 200 201 ret = devm_add_action_or_reset(xe->drm.dev, 202 xe_survivability_mode_fini, xe); 203 if (ret) 204 return ret; 205 206 return 0; 207 } 208 209 static int enable_boot_survivability_mode(struct pci_dev *pdev) 210 { 211 struct device *dev = &pdev->dev; 212 struct xe_device *xe = pdev_to_xe_device(pdev); 213 struct xe_survivability *survivability = &xe->survivability; 214 int ret = 0; 215 216 ret = create_survivability_sysfs(pdev); 217 if (ret) 218 return ret; 219 220 /* Make sure xe_heci_gsc_init() knows about survivability mode */ 221 survivability->mode = true; 222 223 ret = xe_heci_gsc_init(xe); 224 if (ret) 225 goto err; 226 227 xe_vsec_init(xe); 228 229 ret = xe_i2c_probe(xe); 230 if (ret) 231 goto err; 232 233 dev_err(dev, "In Survivability Mode\n"); 234 235 return 0; 236 237 err: 238 survivability->mode = false; 239 return ret; 240 } 241 242 static int init_survivability_mode(struct xe_device *xe) 243 { 244 struct xe_survivability *survivability = &xe->survivability; 245 struct xe_survivability_info *info; 246 247 survivability->size = MAX_SCRATCH_MMIO; 248 249 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), 250 GFP_KERNEL); 251 if (!info) 252 return -ENOMEM; 253 254 survivability->info = info; 255 256 populate_survivability_info(xe); 257 258 return 0; 259 } 260 261 /** 262 * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled 263 * @xe: xe device instance 264 * 265 * Returns true if in boot survivability mode of type, else false 266 */ 267 bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe) 268 { 269 struct xe_survivability *survivability = &xe->survivability; 270 271 return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT; 272 } 273 274 /** 275 * xe_survivability_mode_is_requested - check if it's possible to enable survivability 276 * mode that was requested by firmware or userspace 277 * @xe: xe device instance 278 * 279 * This function reads configfs and boot status from Pcode. 280 * 281 * Return: true if platform support is available and boot status indicates 282 * failure or if survivability mode is requested, false otherwise. 283 */ 284 bool xe_survivability_mode_is_requested(struct xe_device *xe) 285 { 286 struct xe_survivability *survivability = &xe->survivability; 287 struct xe_mmio *mmio = xe_root_tile_mmio(xe); 288 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 289 u32 data; 290 bool survivability_mode; 291 292 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) 293 return false; 294 295 survivability_mode = xe_configfs_get_survivability_mode(pdev); 296 297 if (xe->info.platform < XE_BATTLEMAGE) { 298 if (survivability_mode) { 299 dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n"); 300 xe_configfs_clear_survivability_mode(pdev); 301 } 302 return false; 303 } 304 305 /* Enable survivability mode if set via configfs */ 306 if (survivability_mode) 307 return true; 308 309 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); 310 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); 311 312 return check_boot_failure(xe); 313 } 314 315 /** 316 * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode 317 * @xe: xe device instance 318 * 319 * Initialize survivability information and enable runtime survivability mode. 320 * Runtime survivability mode is enabled when certain errors cause the device to be 321 * in non-recoverable state. The device is declared wedged with the appropriate 322 * recovery method and survivability mode sysfs exposed to userspace 323 * 324 * Return: 0 if runtime survivability mode is enabled, negative error code otherwise. 325 */ 326 int xe_survivability_mode_runtime_enable(struct xe_device *xe) 327 { 328 struct xe_survivability *survivability = &xe->survivability; 329 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 330 int ret; 331 332 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) { 333 dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n"); 334 return -EINVAL; 335 } 336 337 ret = init_survivability_mode(xe); 338 if (ret) 339 return ret; 340 341 ret = create_survivability_sysfs(pdev); 342 if (ret) 343 dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n"); 344 345 survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME; 346 dev_err(&pdev->dev, "Runtime Survivability mode enabled\n"); 347 348 xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR); 349 xe_device_declare_wedged(xe); 350 dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n"); 351 352 return 0; 353 } 354 355 /** 356 * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode 357 * @xe: xe device instance 358 * 359 * Initialize survivability information and enable boot survivability mode 360 * 361 * Return: 0 if boot survivability mode is enabled or not requested, negative error 362 * code otherwise. 363 */ 364 int xe_survivability_mode_boot_enable(struct xe_device *xe) 365 { 366 struct xe_survivability *survivability = &xe->survivability; 367 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 368 int ret; 369 370 if (!xe_survivability_mode_is_requested(xe)) 371 return 0; 372 373 ret = init_survivability_mode(xe); 374 if (ret) 375 return ret; 376 377 /* Log breadcrumbs but do not enter survivability mode for Critical boot errors */ 378 if (survivability->boot_status == CRITICAL_FAILURE) { 379 log_survivability_info(pdev); 380 return -ENXIO; 381 } 382 383 survivability->type = XE_SURVIVABILITY_TYPE_BOOT; 384 385 return enable_boot_survivability_mode(pdev); 386 } 387