// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include "xe_survivability_mode.h"
#include "xe_survivability_mode_types.h"

#include <linux/kobject.h>
#include <linux/pci.h>
#include <linux/sysfs.h>

#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_gt.h"
#include "xe_heci_gsc.h"
#include "xe_i2c.h"
#include "xe_mmio.h"
#include "xe_pcode_api.h"
#include "xe_vsec.h"

#define MAX_SCRATCH_MMIO 8

/**
 * DOC: Xe Boot Survivability
 *
 * Boot Survivability is a software based workflow for recovering a system in a failed boot state.
 * Here system recoverability is concerned with recovering the firmware responsible for boot.
 *
 * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
 * to be flashed through mei and collect telemetry. The driver's probe flow is modified
 * such that it enters survivability mode when pcode initialization is incomplete and boot status
 * denotes a failure.
 *
 * Survivability mode can also be entered manually using the survivability mode attribute available
 * through configfs which is beneficial in several usecases. It can be used to address scenarios
 * where pcode does not detect failure or for validation purposes. It can also be used in
 * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
 *
 * Use the below command to enable survivability mode manually::
 *
 *	# echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
 *
 * It is the responsibility of the user to clear the mode once firmware flash is complete.
45 * 46 * Refer :ref:`xe_configfs` for more details on how to use configfs 47 * 48 * Survivability mode is indicated by the below admin-only readable sysfs which provides additional 49 * debug information:: 50 * 51 * /sys/bus/pci/devices/<device>/surivability_mode 52 * 53 * Capability Information: 54 * Provides boot status 55 * Postcode Information: 56 * Provides information about the failure 57 * Overflow Information 58 * Provides history of previous failures 59 * Auxiliary Information 60 * Certain failures may have information in addition to postcode information 61 */ 62 63 static u32 aux_history_offset(u32 reg_value) 64 { 65 return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); 66 } 67 68 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, 69 int id, char *name) 70 { 71 strscpy(info[id].name, name, sizeof(info[id].name)); 72 info[id].reg = PCODE_SCRATCH(id).raw; 73 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); 74 } 75 76 static void populate_survivability_info(struct xe_device *xe) 77 { 78 struct xe_survivability *survivability = &xe->survivability; 79 struct xe_survivability_info *info = survivability->info; 80 struct xe_mmio *mmio; 81 u32 id = 0, reg_value; 82 char name[NAME_MAX]; 83 int index; 84 85 mmio = xe_root_tile_mmio(xe); 86 set_survivability_info(mmio, info, id, "Capability Info"); 87 reg_value = info[id].value; 88 89 if (reg_value & HISTORY_TRACKING) { 90 id++; 91 set_survivability_info(mmio, info, id, "Postcode Info"); 92 93 if (reg_value & OVERFLOW_SUPPORT) { 94 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); 95 set_survivability_info(mmio, info, id, "Overflow Info"); 96 } 97 } 98 99 if (reg_value & AUXINFO_SUPPORT) { 100 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); 101 102 for (index = 0; id && reg_value; index++, reg_value = info[id].value, 103 id = aux_history_offset(reg_value)) { 104 snprintf(name, NAME_MAX, "Auxiliary Info %d", index); 105 set_survivability_info(mmio, info, id, 
name); 106 } 107 } 108 } 109 110 static void log_survivability_info(struct pci_dev *pdev) 111 { 112 struct xe_device *xe = pdev_to_xe_device(pdev); 113 struct xe_survivability *survivability = &xe->survivability; 114 struct xe_survivability_info *info = survivability->info; 115 int id; 116 117 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", 118 survivability->boot_status); 119 for (id = 0; id < MAX_SCRATCH_MMIO; id++) { 120 if (info[id].reg) 121 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, 122 info[id].reg, info[id].value); 123 } 124 } 125 126 static ssize_t survivability_mode_show(struct device *dev, 127 struct device_attribute *attr, char *buff) 128 { 129 struct pci_dev *pdev = to_pci_dev(dev); 130 struct xe_device *xe = pdev_to_xe_device(pdev); 131 struct xe_survivability *survivability = &xe->survivability; 132 struct xe_survivability_info *info = survivability->info; 133 int index = 0, count = 0; 134 135 for (index = 0; index < MAX_SCRATCH_MMIO; index++) { 136 if (info[index].reg) 137 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, 138 info[index].reg, info[index].value); 139 } 140 141 return count; 142 } 143 144 static DEVICE_ATTR_ADMIN_RO(survivability_mode); 145 146 static void xe_survivability_mode_fini(void *arg) 147 { 148 struct xe_device *xe = arg; 149 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 150 struct device *dev = &pdev->dev; 151 152 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); 153 } 154 155 static int enable_survivability_mode(struct pci_dev *pdev) 156 { 157 struct device *dev = &pdev->dev; 158 struct xe_device *xe = pdev_to_xe_device(pdev); 159 struct xe_survivability *survivability = &xe->survivability; 160 int ret = 0; 161 162 /* create survivability mode sysfs */ 163 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); 164 if (ret) { 165 dev_warn(dev, "Failed to create survivability sysfs files\n"); 166 return ret; 167 } 168 169 ret 
= devm_add_action_or_reset(xe->drm.dev, 170 xe_survivability_mode_fini, xe); 171 if (ret) 172 return ret; 173 174 /* Make sure xe_heci_gsc_init() knows about survivability mode */ 175 survivability->mode = true; 176 177 ret = xe_heci_gsc_init(xe); 178 if (ret) 179 goto err; 180 181 xe_vsec_init(xe); 182 183 ret = xe_i2c_probe(xe); 184 if (ret) 185 goto err; 186 187 dev_err(dev, "In Survivability Mode\n"); 188 189 return 0; 190 191 err: 192 survivability->mode = false; 193 return ret; 194 } 195 196 /** 197 * xe_survivability_mode_is_enabled - check if survivability mode is enabled 198 * @xe: xe device instance 199 * 200 * Returns true if in survivability mode, false otherwise 201 */ 202 bool xe_survivability_mode_is_enabled(struct xe_device *xe) 203 { 204 return xe->survivability.mode; 205 } 206 207 /** 208 * xe_survivability_mode_is_requested - check if it's possible to enable survivability 209 * mode that was requested by firmware or userspace 210 * @xe: xe device instance 211 * 212 * This function reads configfs and boot status from Pcode. 213 * 214 * Return: true if platform support is available and boot status indicates 215 * failure or if survivability mode is requested, false otherwise. 
216 */ 217 bool xe_survivability_mode_is_requested(struct xe_device *xe) 218 { 219 struct xe_survivability *survivability = &xe->survivability; 220 struct xe_mmio *mmio = xe_root_tile_mmio(xe); 221 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 222 u32 data; 223 bool survivability_mode; 224 225 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) 226 return false; 227 228 survivability_mode = xe_configfs_get_survivability_mode(pdev); 229 230 if (xe->info.platform < XE_BATTLEMAGE) { 231 if (survivability_mode) { 232 dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n"); 233 xe_configfs_clear_survivability_mode(pdev); 234 } 235 return false; 236 } 237 238 /* Enable survivability mode if set via configfs */ 239 if (survivability_mode) 240 return true; 241 242 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); 243 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); 244 245 return survivability->boot_status == NON_CRITICAL_FAILURE || 246 survivability->boot_status == CRITICAL_FAILURE; 247 } 248 249 /** 250 * xe_survivability_mode_enable - Initialize and enable the survivability mode 251 * @xe: xe device instance 252 * 253 * Initialize survivability information and enable survivability mode 254 * 255 * Return: 0 if survivability mode is enabled or not requested; negative error 256 * code otherwise. 
257 */ 258 int xe_survivability_mode_enable(struct xe_device *xe) 259 { 260 struct xe_survivability *survivability = &xe->survivability; 261 struct xe_survivability_info *info; 262 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 263 264 if (!xe_survivability_mode_is_requested(xe)) 265 return 0; 266 267 survivability->size = MAX_SCRATCH_MMIO; 268 269 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), 270 GFP_KERNEL); 271 if (!info) 272 return -ENOMEM; 273 274 survivability->info = info; 275 276 populate_survivability_info(xe); 277 278 /* Only log debug information and exit if it is a critical failure */ 279 if (survivability->boot_status == CRITICAL_FAILURE) { 280 log_survivability_info(pdev); 281 return -ENXIO; 282 } 283 284 return enable_survivability_mode(pdev); 285 } 286