1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include "xe_survivability_mode.h" 7 #include "xe_survivability_mode_types.h" 8 9 #include <linux/kobject.h> 10 #include <linux/pci.h> 11 #include <linux/sysfs.h> 12 13 #include "xe_configfs.h" 14 #include "xe_device.h" 15 #include "xe_gt.h" 16 #include "xe_heci_gsc.h" 17 #include "xe_i2c.h" 18 #include "xe_mmio.h" 19 #include "xe_pcode_api.h" 20 #include "xe_vsec.h" 21 22 #define MAX_SCRATCH_MMIO 8 23 24 /** 25 * DOC: Survivability Mode 26 * 27 * Survivability Mode is a software based workflow for recovering a system in a failed boot state 28 * Here system recoverability is concerned with recovering the firmware responsible for boot. 29 * 30 * Boot Survivability 31 * =================== 32 * 33 * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow 34 * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is 35 * modified such that it enters survivability mode when pcode initialization is incomplete and boot 36 * status denotes a failure. 37 * 38 * Survivability mode can also be entered manually using the survivability mode attribute available 39 * through configfs which is beneficial in several usecases. It can be used to address scenarios 40 * where pcode does not detect failure or for validation purposes. It can also be used in 41 * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node. 42 * 43 * Use below command enable survivability mode manually:: 44 * 45 * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode 46 * 47 * It is the responsibility of the user to clear the mode once firmware flash is complete. 48 * 49 * Refer :ref:`xe_configfs` for more details on how to use configfs 50 * 51 * Survivability mode is indicated by the below admin-only readable sysfs which provides additional 52 * debug information:: 53 * 54 * /sys/bus/pci/devices/<device>/survivability_mode 55 * 56 * Capability Information: 57 * Provides boot status 58 * Postcode Information: 59 * Provides information about the failure 60 * Overflow Information 61 * Provides history of previous failures 62 * Auxiliary Information 63 * Certain failures may have information in addition to postcode information 64 * 65 * Runtime Survivability 66 * ===================== 67 * 68 * Certain runtime firmware errors can cause the device to enter a wedged state 69 * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation. 70 * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and 71 * is indicated by the presence of survivability mode sysfs:: 72 * 73 * /sys/bus/pci/devices/<device>/survivability_mode 74 * 75 * Survivability mode sysfs provides information about the type of survivability mode. 76 * 77 * When such errors occur, userspace is notified with the drm device wedged uevent and runtime 78 * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd 79 * to restore device to normal operation. 80 */ 81 82 static u32 aux_history_offset(u32 reg_value) 83 { 84 return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); 85 } 86 87 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, 88 int id, char *name) 89 { 90 strscpy(info[id].name, name, sizeof(info[id].name)); 91 info[id].reg = PCODE_SCRATCH(id).raw; 92 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); 93 } 94 95 static void populate_survivability_info(struct xe_device *xe) 96 { 97 struct xe_survivability *survivability = &xe->survivability; 98 struct xe_survivability_info *info = survivability->info; 99 struct xe_mmio *mmio; 100 u32 id = 0, reg_value; 101 char name[NAME_MAX]; 102 int index; 103 104 mmio = xe_root_tile_mmio(xe); 105 set_survivability_info(mmio, info, id, "Capability Info"); 106 reg_value = info[id].value; 107 108 if (reg_value & HISTORY_TRACKING) { 109 id++; 110 set_survivability_info(mmio, info, id, "Postcode Info"); 111 112 if (reg_value & OVERFLOW_SUPPORT) { 113 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); 114 set_survivability_info(mmio, info, id, "Overflow Info"); 115 } 116 } 117 118 if (reg_value & AUXINFO_SUPPORT) { 119 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); 120 121 for (index = 0; id && reg_value; index++, reg_value = info[id].value, 122 id = aux_history_offset(reg_value)) { 123 snprintf(name, NAME_MAX, "Auxiliary Info %d", index); 124 set_survivability_info(mmio, info, id, name); 125 } 126 } 127 } 128 129 static void log_survivability_info(struct pci_dev *pdev) 130 { 131 struct xe_device *xe = pdev_to_xe_device(pdev); 132 struct xe_survivability *survivability = &xe->survivability; 133 struct xe_survivability_info *info = survivability->info; 134 int id; 135 136 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", 137 survivability->boot_status); 138 for (id = 0; id < MAX_SCRATCH_MMIO; id++) { 139 if (info[id].reg) 140 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, 141 info[id].reg, info[id].value); 142 } 143 } 144 145 static int check_boot_failure(struct xe_device *xe) 146 { 147 struct xe_survivability *survivability = &xe->survivability; 148 149 return survivability->boot_status == NON_CRITICAL_FAILURE || 150 survivability->boot_status == CRITICAL_FAILURE; 151 } 152 153 static ssize_t survivability_mode_show(struct device *dev, 154 struct device_attribute *attr, char *buff) 155 { 156 struct pci_dev *pdev = to_pci_dev(dev); 157 struct xe_device *xe = pdev_to_xe_device(pdev); 158 struct xe_survivability *survivability = &xe->survivability; 159 struct xe_survivability_info *info = survivability->info; 160 int index = 0, count = 0; 161 162 count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n", 163 survivability->type ? "Runtime" : "Boot"); 164 165 if (!check_boot_failure(xe)) 166 return count; 167 168 for (index = 0; index < MAX_SCRATCH_MMIO; index++) { 169 if (info[index].reg) 170 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, 171 info[index].reg, info[index].value); 172 } 173 174 return count; 175 } 176 177 static DEVICE_ATTR_ADMIN_RO(survivability_mode); 178 179 static void xe_survivability_mode_fini(void *arg) 180 { 181 struct xe_device *xe = arg; 182 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 183 struct device *dev = &pdev->dev; 184 185 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); 186 } 187 188 static int create_survivability_sysfs(struct pci_dev *pdev) 189 { 190 struct device *dev = &pdev->dev; 191 struct xe_device *xe = pdev_to_xe_device(pdev); 192 int ret; 193 194 /* create survivability mode sysfs */ 195 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); 196 if (ret) { 197 dev_warn(dev, "Failed to create survivability sysfs files\n"); 198 return ret; 199 } 200 201 ret = devm_add_action_or_reset(xe->drm.dev, 202 xe_survivability_mode_fini, xe); 203 if (ret) 204 return ret; 205 206 return 0; 207 } 208 209 static int enable_boot_survivability_mode(struct pci_dev *pdev) 210 { 211 struct device *dev = &pdev->dev; 212 struct xe_device *xe = pdev_to_xe_device(pdev); 213 struct xe_survivability *survivability = &xe->survivability; 214 int ret = 0; 215 216 ret = create_survivability_sysfs(pdev); 217 if (ret) 218 return ret; 219 220 /* Make sure xe_heci_gsc_init() knows about survivability mode */ 221 survivability->mode = true; 222 223 ret = xe_heci_gsc_init(xe); 224 if (ret) 225 goto err; 226 227 xe_vsec_init(xe); 228 229 ret = xe_i2c_probe(xe); 230 if (ret) 231 goto err; 232 233 dev_err(dev, "In Survivability Mode\n"); 234 235 return 0; 236 237 err: 238 survivability->mode = false; 239 return ret; 240 } 241 242 static int init_survivability_mode(struct xe_device *xe) 243 { 244 struct xe_survivability *survivability = &xe->survivability; 245 struct xe_survivability_info *info; 246 247 survivability->size = MAX_SCRATCH_MMIO; 248 249 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), 250 GFP_KERNEL); 251 if (!info) 252 return -ENOMEM; 253 254 survivability->info = info; 255 256 populate_survivability_info(xe); 257 258 return 0; 259 } 260 261 /** 262 * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled 263 * @xe: xe device instance 264 * 265 * Returns true if in boot survivability mode of type, else false 266 */ 267 bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe) 268 { 269 struct xe_survivability *survivability = &xe->survivability; 270 271 return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT; 272 } 273 274 /** 275 * xe_survivability_mode_is_requested - check if it's possible to enable survivability 276 * mode that was requested by firmware or userspace 277 * @xe: xe device instance 278 * 279 * This function reads configfs and boot status from Pcode. 280 * 281 * Return: true if platform support is available and boot status indicates 282 * failure or if survivability mode is requested, false otherwise. 283 */ 284 bool xe_survivability_mode_is_requested(struct xe_device *xe) 285 { 286 struct xe_survivability *survivability = &xe->survivability; 287 struct xe_mmio *mmio = xe_root_tile_mmio(xe); 288 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 289 u32 data; 290 bool survivability_mode; 291 292 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) 293 return false; 294 295 survivability_mode = xe_configfs_get_survivability_mode(pdev); 296 /* Enable survivability mode if set via configfs */ 297 if (survivability_mode) 298 return true; 299 300 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); 301 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); 302 303 return check_boot_failure(xe); 304 } 305 306 /** 307 * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode 308 * @xe: xe device instance 309 * 310 * Initialize survivability information and enable runtime survivability mode. 311 * Runtime survivability mode is enabled when certain errors cause the device to be 312 * in non-recoverable state. The device is declared wedged with the appropriate 313 * recovery method and survivability mode sysfs exposed to userspace 314 * 315 * Return: 0 if runtime survivability mode is enabled, negative error code otherwise. 316 */ 317 int xe_survivability_mode_runtime_enable(struct xe_device *xe) 318 { 319 struct xe_survivability *survivability = &xe->survivability; 320 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 321 int ret; 322 323 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) { 324 dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n"); 325 return -EINVAL; 326 } 327 328 ret = init_survivability_mode(xe); 329 if (ret) 330 return ret; 331 332 ret = create_survivability_sysfs(pdev); 333 if (ret) 334 dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n"); 335 336 survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME; 337 dev_err(&pdev->dev, "Runtime Survivability mode enabled\n"); 338 339 xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR); 340 xe_device_declare_wedged(xe); 341 dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n"); 342 343 return 0; 344 } 345 346 /** 347 * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode 348 * @xe: xe device instance 349 * 350 * Initialize survivability information and enable boot survivability mode 351 * 352 * Return: 0 if boot survivability mode is enabled or not requested, negative error 353 * code otherwise. 354 */ 355 int xe_survivability_mode_boot_enable(struct xe_device *xe) 356 { 357 struct xe_survivability *survivability = &xe->survivability; 358 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 359 int ret; 360 361 if (!xe_survivability_mode_is_requested(xe)) 362 return 0; 363 364 ret = init_survivability_mode(xe); 365 if (ret) 366 return ret; 367 368 /* Log breadcrumbs but do not enter survivability mode for Critical boot errors */ 369 if (survivability->boot_status == CRITICAL_FAILURE) { 370 log_survivability_info(pdev); 371 return -ENXIO; 372 } 373 374 survivability->type = XE_SURVIVABILITY_TYPE_BOOT; 375 376 return enable_boot_survivability_mode(pdev); 377 } 378