xref: /linux/drivers/gpu/drm/xe/xe_survivability_mode.c (revision 9fd2da71c301184d98fe37674ca8d017d1ce6600)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8 
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12 
13 #include "xe_configfs.h"
14 #include "xe_device.h"
15 #include "xe_gt.h"
16 #include "xe_heci_gsc.h"
17 #include "xe_i2c.h"
18 #include "xe_mmio.h"
19 #include "xe_pcode_api.h"
20 #include "xe_vsec.h"
21 
22 #define MAX_SCRATCH_MMIO 8
23 
24 /**
25  * DOC: Survivability Mode
26  *
27  * Survivability Mode is a software based workflow for recovering a system in a failed boot state.
28  * Here system recoverability is concerned with recovering the firmware responsible for boot.
29  *
30  * Boot Survivability
31  * ===================
32  *
33  * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow
34  * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is
35  * modified such that it enters survivability mode when pcode initialization is incomplete and boot
36  * status denotes a failure.
37  *
38  * Survivability mode can also be entered manually using the survivability mode attribute available
39  * through configfs which is beneficial in several usecases. It can be used to address scenarios
40  * where pcode does not detect failure or for validation purposes. It can also be used in
41  * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
42  *
43  * Use the command below to enable survivability mode manually::
44  *
45  *	# echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
46  *
47  * Refer to :ref:`xe_configfs` for more details on how to use configfs.
48  *
49  * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
50  * debug information::
51  *
52  *	/sys/bus/pci/devices/<device>/survivability_mode
53  *
54  * Capability Information:
55  *	Provides boot status
56  * Postcode Information:
57  *	Provides information about the failure
58  * Overflow Information:
59  *	Provides history of previous failures
60  * Auxiliary Information:
61  *	Certain failures may have information in addition to postcode information
62  *
63  * Runtime Survivability
64  * =====================
65  *
66  * Certain runtime firmware errors can cause the device to enter a wedged state
67  * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation.
68  * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and
69  * is indicated by the presence of survivability mode sysfs::
70  *
71  *	/sys/bus/pci/devices/<device>/survivability_mode
72  *
73  * Survivability mode sysfs provides information about the type of survivability mode.
74  *
75  * When such errors occur, userspace is notified with the drm device wedged uevent and runtime
76  * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd
77  * to restore device to normal operation.
78  */
79 
80 static u32 aux_history_offset(u32 reg_value)
81 {
82 	return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);
83 }
84 
85 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
86 				   int id, char *name)
87 {
88 	strscpy(info[id].name, name, sizeof(info[id].name));
89 	info[id].reg = PCODE_SCRATCH(id).raw;
90 	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
91 }
92 
93 static void populate_survivability_info(struct xe_device *xe)
94 {
95 	struct xe_survivability *survivability = &xe->survivability;
96 	struct xe_survivability_info *info = survivability->info;
97 	struct xe_mmio *mmio;
98 	u32 id = 0, reg_value;
99 	char name[NAME_MAX];
100 	int index;
101 
102 	mmio = xe_root_tile_mmio(xe);
103 	set_survivability_info(mmio, info, id, "Capability Info");
104 	reg_value = info[id].value;
105 
106 	if (reg_value & HISTORY_TRACKING) {
107 		id++;
108 		set_survivability_info(mmio, info, id, "Postcode Info");
109 
110 		if (reg_value & OVERFLOW_SUPPORT) {
111 			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
112 			set_survivability_info(mmio, info, id, "Overflow Info");
113 		}
114 	}
115 
116 	if (reg_value & AUXINFO_SUPPORT) {
117 		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
118 
119 		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
120 		     id = aux_history_offset(reg_value)) {
121 			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
122 			set_survivability_info(mmio, info, id, name);
123 		}
124 	}
125 }
126 
127 static void log_survivability_info(struct pci_dev *pdev)
128 {
129 	struct xe_device *xe = pdev_to_xe_device(pdev);
130 	struct xe_survivability *survivability = &xe->survivability;
131 	struct xe_survivability_info *info = survivability->info;
132 	int id;
133 
134 	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
135 		 survivability->boot_status);
136 	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
137 		if (info[id].reg)
138 			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
139 				 info[id].reg, info[id].value);
140 	}
141 }
142 
143 static int check_boot_failure(struct xe_device *xe)
144 {
145 	struct xe_survivability *survivability = &xe->survivability;
146 
147 	return survivability->boot_status == NON_CRITICAL_FAILURE ||
148 		survivability->boot_status == CRITICAL_FAILURE;
149 }
150 
151 static ssize_t survivability_mode_show(struct device *dev,
152 				       struct device_attribute *attr, char *buff)
153 {
154 	struct pci_dev *pdev = to_pci_dev(dev);
155 	struct xe_device *xe = pdev_to_xe_device(pdev);
156 	struct xe_survivability *survivability = &xe->survivability;
157 	struct xe_survivability_info *info = survivability->info;
158 	int index = 0, count = 0;
159 
160 	count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n",
161 			       survivability->type ? "Runtime" : "Boot");
162 
163 	if (!check_boot_failure(xe))
164 		return count;
165 
166 	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
167 		if (info[index].reg)
168 			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
169 					       info[index].reg, info[index].value);
170 	}
171 
172 	return count;
173 }
174 
175 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
176 
177 static void xe_survivability_mode_fini(void *arg)
178 {
179 	struct xe_device *xe = arg;
180 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
181 	struct device *dev = &pdev->dev;
182 
183 	xe_configfs_clear_survivability_mode(pdev);
184 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
185 }
186 
187 static int create_survivability_sysfs(struct pci_dev *pdev)
188 {
189 	struct device *dev = &pdev->dev;
190 	struct xe_device *xe = pdev_to_xe_device(pdev);
191 	int ret;
192 
193 	/* create survivability mode sysfs */
194 	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
195 	if (ret) {
196 		dev_warn(dev, "Failed to create survivability sysfs files\n");
197 		return ret;
198 	}
199 
200 	ret = devm_add_action_or_reset(xe->drm.dev,
201 				       xe_survivability_mode_fini, xe);
202 	if (ret)
203 		return ret;
204 
205 	return 0;
206 }
207 
208 static int enable_boot_survivability_mode(struct pci_dev *pdev)
209 {
210 	struct device *dev = &pdev->dev;
211 	struct xe_device *xe = pdev_to_xe_device(pdev);
212 	struct xe_survivability *survivability = &xe->survivability;
213 	int ret = 0;
214 
215 	ret = create_survivability_sysfs(pdev);
216 	if (ret)
217 		return ret;
218 
219 	/* Make sure xe_heci_gsc_init() knows about survivability mode */
220 	survivability->mode = true;
221 
222 	ret = xe_heci_gsc_init(xe);
223 	if (ret)
224 		goto err;
225 
226 	xe_vsec_init(xe);
227 
228 	ret = xe_i2c_probe(xe);
229 	if (ret)
230 		goto err;
231 
232 	dev_err(dev, "In Survivability Mode\n");
233 
234 	return 0;
235 
236 err:
237 	survivability->mode = false;
238 	return ret;
239 }
240 
241 static int init_survivability_mode(struct xe_device *xe)
242 {
243 	struct xe_survivability *survivability = &xe->survivability;
244 	struct xe_survivability_info *info;
245 
246 	survivability->size = MAX_SCRATCH_MMIO;
247 
248 	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
249 			    GFP_KERNEL);
250 	if (!info)
251 		return -ENOMEM;
252 
253 	survivability->info = info;
254 
255 	populate_survivability_info(xe);
256 
257 	return 0;
258 }
259 
260 /**
261  * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled
262  * @xe: xe device instance
263  *
264  * Returns true if in boot survivability mode of type, else false
265  */
266 bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe)
267 {
268 	struct xe_survivability *survivability = &xe->survivability;
269 
270 	return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT;
271 }
272 
273 /**
274  * xe_survivability_mode_is_requested - check if it's possible to enable survivability
275  *					mode that was requested by firmware or userspace
276  * @xe: xe device instance
277  *
278  * This function reads configfs and  boot status from Pcode.
279  *
280  * Return: true if platform support is available and boot status indicates
281  * failure or if survivability mode is requested, false otherwise.
282  */
283 bool xe_survivability_mode_is_requested(struct xe_device *xe)
284 {
285 	struct xe_survivability *survivability = &xe->survivability;
286 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
287 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
288 	u32 data;
289 	bool survivability_mode;
290 
291 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
292 		return false;
293 
294 	survivability_mode = xe_configfs_get_survivability_mode(pdev);
295 
296 	if (xe->info.platform < XE_BATTLEMAGE) {
297 		if (survivability_mode) {
298 			dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n");
299 			xe_configfs_clear_survivability_mode(pdev);
300 		}
301 		return false;
302 	}
303 
304 	/* Enable survivability mode if set via configfs */
305 	if (survivability_mode)
306 		return true;
307 
308 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
309 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
310 
311 	return check_boot_failure(xe);
312 }
313 
314 /**
315  * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode
316  * @xe: xe device instance
317  *
318  * Initialize survivability information and enable runtime survivability mode.
319  * Runtime survivability mode is enabled when certain errors cause the device to be
320  * in non-recoverable state. The device is declared wedged with the appropriate
321  * recovery method and survivability mode sysfs exposed to userspace
322  *
323  * Return: 0 if runtime survivability mode is enabled, negative error code otherwise.
324  */
325 int xe_survivability_mode_runtime_enable(struct xe_device *xe)
326 {
327 	struct xe_survivability *survivability = &xe->survivability;
328 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
329 	int ret;
330 
331 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) {
332 		dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n");
333 		return -EINVAL;
334 	}
335 
336 	ret = init_survivability_mode(xe);
337 	if (ret)
338 		return ret;
339 
340 	ret = create_survivability_sysfs(pdev);
341 	if (ret)
342 		dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n");
343 
344 	survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME;
345 	dev_err(&pdev->dev, "Runtime Survivability mode enabled\n");
346 
347 	xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR);
348 	xe_device_declare_wedged(xe);
349 	dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n");
350 
351 	return 0;
352 }
353 
354 /**
355  * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode
356  * @xe: xe device instance
357  *
358  * Initialize survivability information and enable boot survivability mode
359  *
360  * Return: 0 if boot survivability mode is enabled or not requested, negative error
361  * code otherwise.
362  */
363 int xe_survivability_mode_boot_enable(struct xe_device *xe)
364 {
365 	struct xe_survivability *survivability = &xe->survivability;
366 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
367 	int ret;
368 
369 	if (!xe_survivability_mode_is_requested(xe))
370 		return 0;
371 
372 	ret = init_survivability_mode(xe);
373 	if (ret)
374 		return ret;
375 
376 	/* Log breadcrumbs but do not enter survivability mode for Critical boot errors */
377 	if (survivability->boot_status == CRITICAL_FAILURE) {
378 		log_survivability_info(pdev);
379 		return -ENXIO;
380 	}
381 
382 	survivability->type = XE_SURVIVABILITY_TYPE_BOOT;
383 
384 	return enable_boot_survivability_mode(pdev);
385 }
386