xref: /linux/drivers/gpu/drm/xe/xe_survivability_mode.c (revision ca220141fa8ebae09765a242076b2b77338106b0)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8 
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12 
13 #include "xe_configfs.h"
14 #include "xe_device.h"
15 #include "xe_heci_gsc.h"
16 #include "xe_i2c.h"
17 #include "xe_mmio.h"
18 #include "xe_nvm.h"
19 #include "xe_pcode_api.h"
20 #include "xe_vsec.h"
21 
22 /**
23  * DOC: Survivability Mode
24  *
25  * Survivability Mode is a software based workflow for recovering a system in a failed boot state.
26  * Here system recoverability is concerned with recovering the firmware responsible for boot.
27  *
28  * Boot Survivability
29  * ===================
30  *
31  * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow
32  * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is
33  * modified such that it enters survivability mode when pcode initialization is incomplete and boot
34  * status denotes a failure.
35  *
36  * Survivability mode can also be entered manually using the survivability mode attribute available
37  * through configfs which is beneficial in several usecases. It can be used to address scenarios
38  * where pcode does not detect failure or for validation purposes. It can also be used in
39  * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
40  *
41  * Use the below command to enable survivability mode manually::
42  *
43  *	# echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
44  *
45  * It is the responsibility of the user to clear the mode once firmware flash is complete.
46  *
47  * Refer to :ref:`xe_configfs` for more details on how to use configfs.
48  *
49  * Survivability mode is indicated by the below admin-only readable sysfs entry. It
50  * provides information about the type of survivability mode (Boot/Runtime).
51  *
52  * .. code-block:: shell
53  *
54  *	# cat /sys/bus/pci/devices/<device>/survivability_mode
55  *	  Boot
56  *
57  *
58  * Any additional debug information if present will be visible under the directory
59  * ``survivability_info``::
60  *
61  *	/sys/bus/pci/devices/<device>/survivability_info/
62  *	├── aux_info0
63  *	├── aux_info1
64  *	├── aux_info2
65  *	├── aux_info3
66  *	├── aux_info4
67  *	├── capability_info
68  *	├── fdo_mode
69  *	├── postcode_trace
70  *	└── postcode_trace_overflow
71  *
72  * This directory has the following attributes
73  *
74  * - ``capability_info`` : Indicates Boot status and support for additional information
75  *
76  * - ``postcode_trace``, ``postcode_trace_overflow`` : Each postcode is an 8-bit value and
77  *   represents a boot failure event. When a new failure event is logged by PCODE the
78  *   existing postcodes are shifted left. These entries provide a history of 8 postcodes.
79  *
80  * - ``aux_info<n>`` : Some failures have additional debug information
81  *
82  * - ``fdo_mode`` : To allow recovery in scenarios where MEI itself fails, a new SPI Flash
83  *   Descriptor Override (FDO) mode is added in v2 survivability breadcrumbs. This mode is enabled
84  *   by PCODE and provides the ability to directly update the firmware via SPI Driver without
85  *   any dependency on MEI. Xe KMD initializes the nvm aux driver if FDO mode is enabled.
86  *
87  * Runtime Survivability
88  * =====================
89  *
90  * Certain runtime firmware errors can cause the device to enter a wedged state
91  * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation.
92  * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and
93  * is indicated by the presence of survivability mode sysfs.
94  * Survivability mode sysfs provides information about the type of survivability mode.
95  *
96  * .. code-block:: shell
97  *
98  *	# cat /sys/bus/pci/devices/<device>/survivability_mode
99  *	  Runtime
100  *
101  * When such errors occur, userspace is notified with the drm device wedged uevent and runtime
102  * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd
103  * to restore device to normal operation.
104  */
105 
/*
 * Human-readable names for the survivability info entries, indexed by the same
 * ids used for survivability->info. log_survivability_info() prints these next
 * to each non-zero register value.
 */
106 static const char * const reg_map[] = {
107 	[CAPABILITY_INFO]         = "Capability Info",
108 	[POSTCODE_TRACE]          = "Postcode trace",
109 	[POSTCODE_TRACE_OVERFLOW] = "Postcode trace overflow",
110 	[AUX_INFO0]               = "Auxiliary Info 0",
111 	[AUX_INFO1]               = "Auxiliary Info 1",
112 	[AUX_INFO2]               = "Auxiliary Info 2",
113 	[AUX_INFO3]               = "Auxiliary Info 3",
114 	[AUX_INFO4]               = "Auxiliary Info 4",
115 };
116 
/*
 * Index carried by the fdo_mode attribute. survivability_info_show() compares an
 * attribute's index against it to emit the FDO mode instead of an info[] entry.
 */
117 #define FDO_INFO	(MAX_SCRATCH_REG + 1)
118 
/*
 * Device attribute wrapper used for the survivability info attributes.
 *
 * attr:  embedded device attribute; dev_attr_to_survivability_attr() recovers
 *        the wrapper from a pointer to it
 * index: selects what survivability_info_show() emits - an index into
 *        survivability->info, or FDO_INFO for the FDO mode
 */
119 struct xe_survivability_attribute {
120 	struct device_attribute attr;
121 	u8 index;
122 };
123 
124 static struct
125 xe_survivability_attribute *dev_attr_to_survivability_attr(struct device_attribute *attr)
126 {
127 	return container_of(attr, struct xe_survivability_attribute, attr);
128 }
129 
130 static void set_survivability_info(struct xe_mmio *mmio, u32  *info, int id)
131 {
132 	info[id] = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
133 }
134 
135 static void populate_survivability_info(struct xe_device *xe)
136 {
137 	struct xe_survivability *survivability = &xe->survivability;
138 	u32 *info = survivability->info;
139 	struct xe_mmio *mmio;
140 	u32 id = 0, reg_value;
141 
142 	mmio = xe_root_tile_mmio(xe);
143 	set_survivability_info(mmio, info, CAPABILITY_INFO);
144 	reg_value = info[CAPABILITY_INFO];
145 
146 	survivability->version = REG_FIELD_GET(BREADCRUMB_VERSION, reg_value);
147 	/* FDO mode is exposed only from version 2 */
148 	if (survivability->version >= 2)
149 		survivability->fdo_mode = REG_FIELD_GET(FDO_MODE, reg_value);
150 
151 	if (reg_value & HISTORY_TRACKING) {
152 		set_survivability_info(mmio, info, POSTCODE_TRACE);
153 
154 		if (reg_value & OVERFLOW_SUPPORT)
155 			set_survivability_info(mmio, info, POSTCODE_TRACE_OVERFLOW);
156 	}
157 
158 	/* Traverse the linked list of aux info registers */
159 	if (reg_value & AUXINFO_SUPPORT) {
160 		for (id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
161 		     id >= AUX_INFO0 && id < MAX_SCRATCH_REG;
162 		     id =  REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, info[id]))
163 			set_survivability_info(mmio, info, id);
164 	}
165 }
166 
167 static void log_survivability_info(struct pci_dev *pdev)
168 {
169 	struct xe_device *xe = pdev_to_xe_device(pdev);
170 	struct xe_survivability *survivability = &xe->survivability;
171 	u32 *info = survivability->info;
172 	int id;
173 
174 	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
175 		 survivability->boot_status);
176 	for (id = 0; id < MAX_SCRATCH_REG; id++) {
177 		if (info[id])
178 			dev_info(&pdev->dev, "%s: 0x%x\n", reg_map[id], info[id]);
179 	}
180 }
181 
182 static int check_boot_failure(struct xe_device *xe)
183 {
184 	struct xe_survivability *survivability = &xe->survivability;
185 
186 	return survivability->boot_status == NON_CRITICAL_FAILURE ||
187 		survivability->boot_status == CRITICAL_FAILURE;
188 }
189 
190 static ssize_t survivability_mode_show(struct device *dev,
191 				       struct device_attribute *attr, char *buff)
192 {
193 	struct pci_dev *pdev = to_pci_dev(dev);
194 	struct xe_device *xe = pdev_to_xe_device(pdev);
195 	struct xe_survivability *survivability = &xe->survivability;
196 
197 	return sysfs_emit(buff, "%s\n", survivability->type ? "Runtime" : "Boot");
198 }
199 
200 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
201 
202 static ssize_t survivability_info_show(struct device *dev,
203 				       struct device_attribute *attr, char *buff)
204 {
205 	struct xe_survivability_attribute *sa = dev_attr_to_survivability_attr(attr);
206 	struct pci_dev *pdev = to_pci_dev(dev);
207 	struct xe_device *xe = pdev_to_xe_device(pdev);
208 	struct xe_survivability *survivability = &xe->survivability;
209 	u32 *info = survivability->info;
210 
211 	if (sa->index == FDO_INFO)
212 		return sysfs_emit(buff, "%s\n", str_enabled_disabled(survivability->fdo_mode));
213 
214 	return sysfs_emit(buff, "0x%x\n", info[sa->index]);
215 }
216 
/*
 * Define a survivability attribute object attr_<name>: a device attribute named
 * @name with mode 0400, shown through survivability_info_show() and tagged with
 * @_index so the show callback knows which value to emit.
 */
217 #define SURVIVABILITY_ATTR_RO(name, _index)					\
218 	struct xe_survivability_attribute attr_##name =	{			\
219 		.attr =  __ATTR(name, 0400, survivability_info_show, NULL),	\
220 		.index = _index,						\
221 	}
222 
/* One attribute per cached survivability info register, plus fdo_mode */
223 static SURVIVABILITY_ATTR_RO(capability_info, CAPABILITY_INFO);
224 static SURVIVABILITY_ATTR_RO(postcode_trace, POSTCODE_TRACE);
225 static SURVIVABILITY_ATTR_RO(postcode_trace_overflow, POSTCODE_TRACE_OVERFLOW);
226 static SURVIVABILITY_ATTR_RO(aux_info0, AUX_INFO0);
227 static SURVIVABILITY_ATTR_RO(aux_info1, AUX_INFO1);
228 static SURVIVABILITY_ATTR_RO(aux_info2, AUX_INFO2);
229 static SURVIVABILITY_ATTR_RO(aux_info3, AUX_INFO3);
230 static SURVIVABILITY_ATTR_RO(aux_info4, AUX_INFO4);
231 static SURVIVABILITY_ATTR_RO(fdo_mode, FDO_INFO);
232 
233 static void xe_survivability_mode_fini(void *arg)
234 {
235 	struct xe_device *xe = arg;
236 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
237 	struct device *dev = &pdev->dev;
238 
239 	device_remove_file(dev, &dev_attr_survivability_mode);
240 }
241 
242 static umode_t survivability_info_attrs_visible(struct kobject *kobj, struct attribute *attr,
243 						int idx)
244 {
245 	struct xe_device *xe = kdev_to_xe_device(kobj_to_dev(kobj));
246 	struct xe_survivability *survivability = &xe->survivability;
247 	u32 *info = survivability->info;
248 
249 	/*
250 	 * Last index in survivability_info_attrs is fdo mode and is applicable only in
251 	 * version 2 of survivability mode
252 	 */
253 	if (idx == MAX_SCRATCH_REG && survivability->version >= 2)
254 		return 0400;
255 
256 	if (idx < MAX_SCRATCH_REG && info[idx])
257 		return 0400;
258 
259 	return 0;
260 }
261 
262 /* Attributes are ordered according to enum scratch_reg */
263 static struct attribute *survivability_info_attrs[] = {
264 	&attr_capability_info.attr.attr,
265 	&attr_postcode_trace.attr.attr,
266 	&attr_postcode_trace_overflow.attr.attr,
267 	&attr_aux_info0.attr.attr,
268 	&attr_aux_info1.attr.attr,
269 	&attr_aux_info2.attr.attr,
270 	&attr_aux_info3.attr.attr,
271 	&attr_aux_info4.attr.attr,
	/* Keep fdo_mode last: survivability_info_attrs_visible() treats the final index as fdo mode */
272 	&attr_fdo_mode.attr.attr,
273 	NULL,
274 };
275 
/*
 * Attribute group named "survivability_info" holding survivability_info_attrs;
 * per-attribute visibility is decided by survivability_info_attrs_visible().
 * Registered by create_survivability_sysfs() when a boot failure is recorded.
 */
276 static const struct attribute_group survivability_info_group = {
277 	.name = "survivability_info",
278 	.attrs = survivability_info_attrs,
279 	.is_visible = survivability_info_attrs_visible,
280 };
281 
282 static int create_survivability_sysfs(struct pci_dev *pdev)
283 {
284 	struct device *dev = &pdev->dev;
285 	struct xe_device *xe = pdev_to_xe_device(pdev);
286 	int ret;
287 
288 	ret = device_create_file(dev, &dev_attr_survivability_mode);
289 	if (ret) {
290 		dev_warn(dev, "Failed to create survivability sysfs files\n");
291 		return ret;
292 	}
293 
294 	ret = devm_add_action_or_reset(xe->drm.dev,
295 				       xe_survivability_mode_fini, xe);
296 	if (ret)
297 		return ret;
298 
299 	if (check_boot_failure(xe)) {
300 		ret = devm_device_add_group(dev, &survivability_info_group);
301 		if (ret)
302 			return ret;
303 	}
304 
305 	return 0;
306 }
307 
308 static int enable_boot_survivability_mode(struct pci_dev *pdev)
309 {
310 	struct device *dev = &pdev->dev;
311 	struct xe_device *xe = pdev_to_xe_device(pdev);
312 	struct xe_survivability *survivability = &xe->survivability;
313 	int ret = 0;
314 
315 	ret = create_survivability_sysfs(pdev);
316 	if (ret)
317 		return ret;
318 
319 	/* Make sure xe_heci_gsc_init() and xe_i2c_probe() are aware of survivability */
320 	survivability->mode = true;
321 
322 	xe_heci_gsc_init(xe);
323 
324 	xe_vsec_init(xe);
325 
326 	if (survivability->fdo_mode) {
327 		ret = xe_nvm_init(xe);
328 		if (ret)
329 			goto err;
330 	}
331 
332 	ret = xe_i2c_probe(xe);
333 	if (ret)
334 		goto err;
335 
336 	dev_err(dev, "In Survivability Mode\n");
337 
338 	return 0;
339 
340 err:
341 	dev_err(dev, "Failed to enable Survivability Mode\n");
342 	survivability->mode = false;
343 	return ret;
344 }
345 
346 /**
347  * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled
348  * @xe: xe device instance
349  *
350  * Returns true if in boot survivability mode of type, else false
351  */
352 bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe)
353 {
354 	struct xe_survivability *survivability = &xe->survivability;
355 
356 	return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT;
357 }
358 
359 /**
360  * xe_survivability_mode_is_requested - check if it's possible to enable survivability
361  *					mode that was requested by firmware or userspace
362  * @xe: xe device instance
363  *
364  * This function reads configfs and  boot status from Pcode.
365  *
366  * Return: true if platform support is available and boot status indicates
367  * failure or if survivability mode is requested, false otherwise.
368  */
369 bool xe_survivability_mode_is_requested(struct xe_device *xe)
370 {
371 	struct xe_survivability *survivability = &xe->survivability;
372 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
373 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
374 	u32 data;
375 	bool survivability_mode;
376 
377 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE)
378 		return false;
379 
380 	survivability_mode = xe_configfs_get_survivability_mode(pdev);
381 	/* Enable survivability mode if set via configfs */
382 	if (survivability_mode)
383 		return true;
384 
385 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
386 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
387 
388 	return check_boot_failure(xe);
389 }
390 
391 /**
392  * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode
393  * @xe: xe device instance
394  *
395  * Initialize survivability information and enable runtime survivability mode.
396  * Runtime survivability mode is enabled when certain errors cause the device to be
397  * in non-recoverable state. The device is declared wedged with the appropriate
398  * recovery method and survivability mode sysfs exposed to userspace
399  *
400  * Return: 0 if runtime survivability mode is enabled, negative error code otherwise.
401  */
402 int xe_survivability_mode_runtime_enable(struct xe_device *xe)
403 {
404 	struct xe_survivability *survivability = &xe->survivability;
405 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
406 	int ret;
407 
408 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) {
409 		dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n");
410 		return -EINVAL;
411 	}
412 
413 	populate_survivability_info(xe);
414 
415 	ret = create_survivability_sysfs(pdev);
416 	if (ret)
417 		dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n");
418 
419 	survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME;
420 	dev_err(&pdev->dev, "Runtime Survivability mode enabled\n");
421 
422 	xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR);
423 	xe_device_declare_wedged(xe);
424 	dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n");
425 
426 	return 0;
427 }
428 
429 /**
430  * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode
431  * @xe: xe device instance
432  *
433  * Initialize survivability information and enable boot survivability mode
434  *
435  * Return: 0 if boot survivability mode is enabled or not requested, negative error
436  * code otherwise.
437  */
438 int xe_survivability_mode_boot_enable(struct xe_device *xe)
439 {
440 	struct xe_survivability *survivability = &xe->survivability;
441 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
442 
443 	if (!xe_survivability_mode_is_requested(xe))
444 		return 0;
445 
446 	populate_survivability_info(xe);
447 
448 	/*
449 	 * v2 supports survivability mode for critical errors
450 	 */
451 	if (survivability->version < 2  && survivability->boot_status == CRITICAL_FAILURE) {
452 		log_survivability_info(pdev);
453 		return -ENXIO;
454 	}
455 
456 	survivability->type = XE_SURVIVABILITY_TYPE_BOOT;
457 
458 	return enable_boot_survivability_mode(pdev);
459 }
460