1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2025 Intel Corporation
4 */
5
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12
13 #include "xe_configfs.h"
14 #include "xe_device.h"
15 #include "xe_gt.h"
16 #include "xe_heci_gsc.h"
17 #include "xe_i2c.h"
18 #include "xe_mmio.h"
19 #include "xe_pcode_api.h"
20 #include "xe_vsec.h"
21
22 #define MAX_SCRATCH_MMIO 8
23
24 /**
25 * DOC: Survivability Mode
26 *
 * Survivability Mode is a software based workflow for recovering a system in a failed boot state.
28 * Here system recoverability is concerned with recovering the firmware responsible for boot.
29 *
30 * Boot Survivability
31 * ===================
32 *
33 * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow
34 * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is
35 * modified such that it enters survivability mode when pcode initialization is incomplete and boot
36 * status denotes a failure.
37 *
38 * Survivability mode can also be entered manually using the survivability mode attribute available
39 * through configfs which is beneficial in several usecases. It can be used to address scenarios
40 * where pcode does not detect failure or for validation purposes. It can also be used in
41 * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
42 *
 * Use the below command to enable survivability mode manually::
44 *
45 * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
46 *
47 * It is the responsibility of the user to clear the mode once firmware flash is complete.
48 *
49 * Refer :ref:`xe_configfs` for more details on how to use configfs
50 *
51 * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
52 * debug information::
53 *
54 * /sys/bus/pci/devices/<device>/survivability_mode
55 *
56 * Capability Information:
57 * Provides boot status
58 * Postcode Information:
59 * Provides information about the failure
60 * Overflow Information
61 * Provides history of previous failures
62 * Auxiliary Information
63 * Certain failures may have information in addition to postcode information
64 *
65 * Runtime Survivability
66 * =====================
67 *
68 * Certain runtime firmware errors can cause the device to enter a wedged state
69 * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation.
70 * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and
71 * is indicated by the presence of survivability mode sysfs::
72 *
73 * /sys/bus/pci/devices/<device>/survivability_mode
74 *
75 * Survivability mode sysfs provides information about the type of survivability mode.
76 *
77 * When such errors occur, userspace is notified with the drm device wedged uevent and runtime
78 * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd
79 * to restore device to normal operation.
80 */
81
/* Extract the AUXINFO_HISTORY_OFFSET field from an auxiliary info register value. */
static u32 aux_history_offset(u32 reg_value)
{
	u32 offset = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);

	return offset;
}
86
set_survivability_info(struct xe_mmio * mmio,struct xe_survivability_info * info,int id,char * name)87 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
88 int id, char *name)
89 {
90 strscpy(info[id].name, name, sizeof(info[id].name));
91 info[id].reg = PCODE_SCRATCH(id).raw;
92 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
93 }
94
populate_survivability_info(struct xe_device * xe)95 static void populate_survivability_info(struct xe_device *xe)
96 {
97 struct xe_survivability *survivability = &xe->survivability;
98 struct xe_survivability_info *info = survivability->info;
99 struct xe_mmio *mmio;
100 u32 id = 0, reg_value;
101 char name[NAME_MAX];
102 int index;
103
104 mmio = xe_root_tile_mmio(xe);
105 set_survivability_info(mmio, info, id, "Capability Info");
106 reg_value = info[id].value;
107
108 if (reg_value & HISTORY_TRACKING) {
109 id++;
110 set_survivability_info(mmio, info, id, "Postcode Info");
111
112 if (reg_value & OVERFLOW_SUPPORT) {
113 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
114 set_survivability_info(mmio, info, id, "Overflow Info");
115 }
116 }
117
118 if (reg_value & AUXINFO_SUPPORT) {
119 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
120
121 for (index = 0; id && reg_value; index++, reg_value = info[id].value,
122 id = aux_history_offset(reg_value)) {
123 snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
124 set_survivability_info(mmio, info, id, name);
125 }
126 }
127 }
128
log_survivability_info(struct pci_dev * pdev)129 static void log_survivability_info(struct pci_dev *pdev)
130 {
131 struct xe_device *xe = pdev_to_xe_device(pdev);
132 struct xe_survivability *survivability = &xe->survivability;
133 struct xe_survivability_info *info = survivability->info;
134 int id;
135
136 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
137 survivability->boot_status);
138 for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
139 if (info[id].reg)
140 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
141 info[id].reg, info[id].value);
142 }
143 }
144
check_boot_failure(struct xe_device * xe)145 static int check_boot_failure(struct xe_device *xe)
146 {
147 struct xe_survivability *survivability = &xe->survivability;
148
149 return survivability->boot_status == NON_CRITICAL_FAILURE ||
150 survivability->boot_status == CRITICAL_FAILURE;
151 }
152
survivability_mode_show(struct device * dev,struct device_attribute * attr,char * buff)153 static ssize_t survivability_mode_show(struct device *dev,
154 struct device_attribute *attr, char *buff)
155 {
156 struct pci_dev *pdev = to_pci_dev(dev);
157 struct xe_device *xe = pdev_to_xe_device(pdev);
158 struct xe_survivability *survivability = &xe->survivability;
159 struct xe_survivability_info *info = survivability->info;
160 int index = 0, count = 0;
161
162 count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n",
163 survivability->type ? "Runtime" : "Boot");
164
165 if (!check_boot_failure(xe))
166 return count;
167
168 for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
169 if (info[index].reg)
170 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
171 info[index].reg, info[index].value);
172 }
173
174 return count;
175 }
176
177 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
178
xe_survivability_mode_fini(void * arg)179 static void xe_survivability_mode_fini(void *arg)
180 {
181 struct xe_device *xe = arg;
182 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
183 struct device *dev = &pdev->dev;
184
185 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
186 }
187
create_survivability_sysfs(struct pci_dev * pdev)188 static int create_survivability_sysfs(struct pci_dev *pdev)
189 {
190 struct device *dev = &pdev->dev;
191 struct xe_device *xe = pdev_to_xe_device(pdev);
192 int ret;
193
194 /* create survivability mode sysfs */
195 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
196 if (ret) {
197 dev_warn(dev, "Failed to create survivability sysfs files\n");
198 return ret;
199 }
200
201 ret = devm_add_action_or_reset(xe->drm.dev,
202 xe_survivability_mode_fini, xe);
203 if (ret)
204 return ret;
205
206 return 0;
207 }
208
enable_boot_survivability_mode(struct pci_dev * pdev)209 static int enable_boot_survivability_mode(struct pci_dev *pdev)
210 {
211 struct device *dev = &pdev->dev;
212 struct xe_device *xe = pdev_to_xe_device(pdev);
213 struct xe_survivability *survivability = &xe->survivability;
214 int ret = 0;
215
216 ret = create_survivability_sysfs(pdev);
217 if (ret)
218 return ret;
219
220 /* Make sure xe_heci_gsc_init() knows about survivability mode */
221 survivability->mode = true;
222
223 ret = xe_heci_gsc_init(xe);
224 if (ret)
225 goto err;
226
227 xe_vsec_init(xe);
228
229 ret = xe_i2c_probe(xe);
230 if (ret)
231 goto err;
232
233 dev_err(dev, "In Survivability Mode\n");
234
235 return 0;
236
237 err:
238 survivability->mode = false;
239 return ret;
240 }
241
init_survivability_mode(struct xe_device * xe)242 static int init_survivability_mode(struct xe_device *xe)
243 {
244 struct xe_survivability *survivability = &xe->survivability;
245 struct xe_survivability_info *info;
246
247 survivability->size = MAX_SCRATCH_MMIO;
248
249 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
250 GFP_KERNEL);
251 if (!info)
252 return -ENOMEM;
253
254 survivability->info = info;
255
256 populate_survivability_info(xe);
257
258 return 0;
259 }
260
261 /**
262 * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled
263 * @xe: xe device instance
264 *
265 * Returns true if in boot survivability mode of type, else false
266 */
xe_survivability_mode_is_boot_enabled(struct xe_device * xe)267 bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe)
268 {
269 struct xe_survivability *survivability = &xe->survivability;
270
271 return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT;
272 }
273
274 /**
275 * xe_survivability_mode_is_requested - check if it's possible to enable survivability
276 * mode that was requested by firmware or userspace
277 * @xe: xe device instance
278 *
279 * This function reads configfs and boot status from Pcode.
280 *
281 * Return: true if platform support is available and boot status indicates
282 * failure or if survivability mode is requested, false otherwise.
283 */
xe_survivability_mode_is_requested(struct xe_device * xe)284 bool xe_survivability_mode_is_requested(struct xe_device *xe)
285 {
286 struct xe_survivability *survivability = &xe->survivability;
287 struct xe_mmio *mmio = xe_root_tile_mmio(xe);
288 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
289 u32 data;
290 bool survivability_mode;
291
292 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE)
293 return false;
294
295 survivability_mode = xe_configfs_get_survivability_mode(pdev);
296 /* Enable survivability mode if set via configfs */
297 if (survivability_mode)
298 return true;
299
300 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
301 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
302
303 return check_boot_failure(xe);
304 }
305
306 /**
307 * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode
308 * @xe: xe device instance
309 *
310 * Initialize survivability information and enable runtime survivability mode.
311 * Runtime survivability mode is enabled when certain errors cause the device to be
312 * in non-recoverable state. The device is declared wedged with the appropriate
313 * recovery method and survivability mode sysfs exposed to userspace
314 *
315 * Return: 0 if runtime survivability mode is enabled, negative error code otherwise.
316 */
xe_survivability_mode_runtime_enable(struct xe_device * xe)317 int xe_survivability_mode_runtime_enable(struct xe_device *xe)
318 {
319 struct xe_survivability *survivability = &xe->survivability;
320 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
321 int ret;
322
323 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) {
324 dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n");
325 return -EINVAL;
326 }
327
328 ret = init_survivability_mode(xe);
329 if (ret)
330 return ret;
331
332 ret = create_survivability_sysfs(pdev);
333 if (ret)
334 dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n");
335
336 survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME;
337 dev_err(&pdev->dev, "Runtime Survivability mode enabled\n");
338
339 xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR);
340 xe_device_declare_wedged(xe);
341 dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n");
342
343 return 0;
344 }
345
346 /**
347 * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode
348 * @xe: xe device instance
349 *
350 * Initialize survivability information and enable boot survivability mode
351 *
352 * Return: 0 if boot survivability mode is enabled or not requested, negative error
353 * code otherwise.
354 */
xe_survivability_mode_boot_enable(struct xe_device * xe)355 int xe_survivability_mode_boot_enable(struct xe_device *xe)
356 {
357 struct xe_survivability *survivability = &xe->survivability;
358 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
359 int ret;
360
361 if (!xe_survivability_mode_is_requested(xe))
362 return 0;
363
364 ret = init_survivability_mode(xe);
365 if (ret)
366 return ret;
367
368 /* Log breadcrumbs but do not enter survivability mode for Critical boot errors */
369 if (survivability->boot_status == CRITICAL_FAILURE) {
370 log_survivability_info(pdev);
371 return -ENXIO;
372 }
373
374 survivability->type = XE_SURVIVABILITY_TYPE_BOOT;
375
376 return enable_boot_survivability_mode(pdev);
377 }
378