1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include "xe_survivability_mode.h" 7 #include "xe_survivability_mode_types.h" 8 9 #include <linux/kobject.h> 10 #include <linux/pci.h> 11 #include <linux/sysfs.h> 12 13 #include "xe_device.h" 14 #include "xe_gt.h" 15 #include "xe_heci_gsc.h" 16 #include "xe_mmio.h" 17 #include "xe_pcode_api.h" 18 #include "xe_vsec.h" 19 20 #define MAX_SCRATCH_MMIO 8 21 22 /** 23 * DOC: Xe Boot Survivability 24 * 25 * Boot Survivability is a software based workflow for recovering a system in a failed boot state 26 * Here system recoverability is concerned with recovering the firmware responsible for boot. 27 * 28 * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware 29 * to be flashed through mei and collect telemetry. The driver's probe flow is modified 30 * such that it enters survivability mode when pcode initialization is incomplete and boot status 31 * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating 32 * survivability mode and provides additional information required for debug 33 * 34 * KMD exposes below admin-only readable sysfs in survivability mode 35 * 36 * device/survivability_mode: The presence of this file indicates that the card is in survivability 37 * mode. Also, provides additional information on why the driver entered 38 * survivability mode. 39 * 40 * Capability Information - Provides boot status 41 * Postcode Information - Provides information about the failure 42 * Overflow Information - Provides history of previous failures 43 * Auxiliary Information - Certain failures may have information in 44 * addition to postcode information 45 */ 46 47 static u32 aux_history_offset(u32 reg_value) 48 { 49 return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); 50 } 51 52 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, 53 int id, char *name) 54 { 55 strscpy(info[id].name, name, sizeof(info[id].name)); 56 info[id].reg = PCODE_SCRATCH(id).raw; 57 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); 58 } 59 60 static void populate_survivability_info(struct xe_device *xe) 61 { 62 struct xe_survivability *survivability = &xe->survivability; 63 struct xe_survivability_info *info = survivability->info; 64 struct xe_mmio *mmio; 65 u32 id = 0, reg_value; 66 char name[NAME_MAX]; 67 int index; 68 69 mmio = xe_root_tile_mmio(xe); 70 set_survivability_info(mmio, info, id, "Capability Info"); 71 reg_value = info[id].value; 72 73 if (reg_value & HISTORY_TRACKING) { 74 id++; 75 set_survivability_info(mmio, info, id, "Postcode Info"); 76 77 if (reg_value & OVERFLOW_SUPPORT) { 78 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); 79 set_survivability_info(mmio, info, id, "Overflow Info"); 80 } 81 } 82 83 if (reg_value & AUXINFO_SUPPORT) { 84 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); 85 86 for (index = 0; id && reg_value; index++, reg_value = info[id].value, 87 id = aux_history_offset(reg_value)) { 88 snprintf(name, NAME_MAX, "Auxiliary Info %d", index); 89 set_survivability_info(mmio, info, id, name); 90 } 91 } 92 } 93 94 static void log_survivability_info(struct pci_dev *pdev) 95 { 96 struct xe_device *xe = pdev_to_xe_device(pdev); 97 struct xe_survivability *survivability = &xe->survivability; 98 struct xe_survivability_info *info = survivability->info; 99 int id; 100 101 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", 102 survivability->boot_status); 103 for (id = 0; id < MAX_SCRATCH_MMIO; id++) { 104 if (info[id].reg) 105 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, 106 info[id].reg, info[id].value); 107 } 108 } 109 110 static ssize_t survivability_mode_show(struct device *dev, 111 struct device_attribute *attr, char *buff) 112 { 113 struct pci_dev *pdev = to_pci_dev(dev); 114 struct xe_device *xe = pdev_to_xe_device(pdev); 115 struct xe_survivability *survivability = &xe->survivability; 116 struct xe_survivability_info *info = survivability->info; 117 int index = 0, count = 0; 118 119 for (index = 0; index < MAX_SCRATCH_MMIO; index++) { 120 if (info[index].reg) 121 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, 122 info[index].reg, info[index].value); 123 } 124 125 return count; 126 } 127 128 static DEVICE_ATTR_ADMIN_RO(survivability_mode); 129 130 static void xe_survivability_mode_fini(void *arg) 131 { 132 struct xe_device *xe = arg; 133 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 134 struct device *dev = &pdev->dev; 135 136 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); 137 } 138 139 static int enable_survivability_mode(struct pci_dev *pdev) 140 { 141 struct device *dev = &pdev->dev; 142 struct xe_device *xe = pdev_to_xe_device(pdev); 143 struct xe_survivability *survivability = &xe->survivability; 144 int ret = 0; 145 146 /* create survivability mode sysfs */ 147 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); 148 if (ret) { 149 dev_warn(dev, "Failed to create survivability sysfs files\n"); 150 return ret; 151 } 152 153 ret = devm_add_action_or_reset(xe->drm.dev, 154 xe_survivability_mode_fini, xe); 155 if (ret) 156 return ret; 157 158 /* Make sure xe_heci_gsc_init() knows about survivability mode */ 159 survivability->mode = true; 160 161 ret = xe_heci_gsc_init(xe); 162 if (ret) { 163 /* 164 * But if it fails, device can't enter survivability 165 * so move it back for correct error handling 166 */ 167 survivability->mode = false; 168 return ret; 169 } 170 171 xe_vsec_init(xe); 172 173 dev_err(dev, "In Survivability Mode\n"); 174 175 return 0; 176 } 177 178 /** 179 * xe_survivability_mode_is_enabled - check if survivability mode is enabled 180 * @xe: xe device instance 181 * 182 * Returns true if in survivability mode, false otherwise 183 */ 184 bool xe_survivability_mode_is_enabled(struct xe_device *xe) 185 { 186 return xe->survivability.mode; 187 } 188 189 /* 190 * survivability_mode_requested - check if it's possible to enable 191 * survivability mode and that was requested by firmware 192 * 193 * This function reads the boot status from Pcode. 194 * 195 * Return: true if platform support is available and boot status indicates 196 * failure, false otherwise. 197 */ 198 static bool survivability_mode_requested(struct xe_device *xe) 199 { 200 struct xe_survivability *survivability = &xe->survivability; 201 struct xe_mmio *mmio = xe_root_tile_mmio(xe); 202 u32 data; 203 204 if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe)) 205 return false; 206 207 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); 208 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); 209 210 return survivability->boot_status == NON_CRITICAL_FAILURE || 211 survivability->boot_status == CRITICAL_FAILURE; 212 } 213 214 /** 215 * xe_survivability_mode_enable - Initialize and enable the survivability mode 216 * @xe: xe device instance 217 * 218 * Initialize survivability information and enable survivability mode 219 * 220 * Return: 0 if survivability mode is enabled or not requested; negative error 221 * code otherwise. 222 */ 223 int xe_survivability_mode_enable(struct xe_device *xe) 224 { 225 struct xe_survivability *survivability = &xe->survivability; 226 struct xe_survivability_info *info; 227 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 228 229 if (!survivability_mode_requested(xe)) 230 return 0; 231 232 survivability->size = MAX_SCRATCH_MMIO; 233 234 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), 235 GFP_KERNEL); 236 if (!info) 237 return -ENOMEM; 238 239 survivability->info = info; 240 241 populate_survivability_info(xe); 242 243 /* Only log debug information and exit if it is a critical failure */ 244 if (survivability->boot_status == CRITICAL_FAILURE) { 245 log_survivability_info(pdev); 246 return -ENXIO; 247 } 248 249 return enable_survivability_mode(pdev); 250 } 251