xref: /linux/drivers/gpu/drm/xe/xe_survivability_mode.c (revision 72c181399b01bb4836d1fabaa9f5f6438c82178e)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8 
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12 
13 #include "xe_configfs.h"
14 #include "xe_device.h"
15 #include "xe_gt.h"
16 #include "xe_heci_gsc.h"
17 #include "xe_i2c.h"
18 #include "xe_mmio.h"
19 #include "xe_pcode_api.h"
20 #include "xe_vsec.h"
21 
22 #define MAX_SCRATCH_MMIO 8
23 
24 /**
25  * DOC: Survivability Mode
26  *
 * Survivability Mode is a software based workflow for recovering a system in a failed boot state.
 * Here system recoverability is concerned with recovering the firmware responsible for boot.
29  *
30  * Boot Survivability
31  * ===================
32  *
33  * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow
34  * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is
35  * modified such that it enters survivability mode when pcode initialization is incomplete and boot
36  * status denotes a failure.
37  *
38  * Survivability mode can also be entered manually using the survivability mode attribute available
39  * through configfs which is beneficial in several usecases. It can be used to address scenarios
40  * where pcode does not detect failure or for validation purposes. It can also be used in
41  * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
42  *
 * Use the command below to enable survivability mode manually::
44  *
45  *	# echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
46  *
47  * It is the responsibility of the user to clear the mode once firmware flash is complete.
48  *
49  * Refer :ref:`xe_configfs` for more details on how to use configfs
50  *
51  * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
52  * debug information::
53  *
54  *	/sys/bus/pci/devices/<device>/survivability_mode
55  *
56  * Capability Information:
57  *	Provides boot status
58  * Postcode Information:
59  *	Provides information about the failure
 * Overflow Information:
 *	Provides history of previous failures
 * Auxiliary Information:
 *	Certain failures may have information in addition to postcode information
64  *
65  * Runtime Survivability
66  * =====================
67  *
68  * Certain runtime firmware errors can cause the device to enter a wedged state
69  * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation.
70  * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and
71  * is indicated by the presence of survivability mode sysfs::
72  *
73  *	/sys/bus/pci/devices/<device>/survivability_mode
74  *
75  * Survivability mode sysfs provides information about the type of survivability mode.
76  *
77  * When such errors occur, userspace is notified with the drm device wedged uevent and runtime
78  * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd
79  * to restore device to normal operation.
80  */
81 
/* Extract the AUXINFO_HISTORY_OFFSET field from a scratch register value. */
static u32 aux_history_offset(u32 reg_value)
{
	const u32 offset = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);

	return offset;
}
86 
set_survivability_info(struct xe_mmio * mmio,struct xe_survivability_info * info,int id,char * name)87 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
88 				   int id, char *name)
89 {
90 	strscpy(info[id].name, name, sizeof(info[id].name));
91 	info[id].reg = PCODE_SCRATCH(id).raw;
92 	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
93 }
94 
populate_survivability_info(struct xe_device * xe)95 static void populate_survivability_info(struct xe_device *xe)
96 {
97 	struct xe_survivability *survivability = &xe->survivability;
98 	struct xe_survivability_info *info = survivability->info;
99 	struct xe_mmio *mmio;
100 	u32 id = 0, reg_value;
101 	char name[NAME_MAX];
102 	int index;
103 
104 	mmio = xe_root_tile_mmio(xe);
105 	set_survivability_info(mmio, info, id, "Capability Info");
106 	reg_value = info[id].value;
107 
108 	if (reg_value & HISTORY_TRACKING) {
109 		id++;
110 		set_survivability_info(mmio, info, id, "Postcode Info");
111 
112 		if (reg_value & OVERFLOW_SUPPORT) {
113 			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
114 			set_survivability_info(mmio, info, id, "Overflow Info");
115 		}
116 	}
117 
118 	if (reg_value & AUXINFO_SUPPORT) {
119 		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
120 
121 		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
122 		     id = aux_history_offset(reg_value)) {
123 			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
124 			set_survivability_info(mmio, info, id, name);
125 		}
126 	}
127 }
128 
log_survivability_info(struct pci_dev * pdev)129 static void log_survivability_info(struct pci_dev *pdev)
130 {
131 	struct xe_device *xe = pdev_to_xe_device(pdev);
132 	struct xe_survivability *survivability = &xe->survivability;
133 	struct xe_survivability_info *info = survivability->info;
134 	int id;
135 
136 	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
137 		 survivability->boot_status);
138 	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
139 		if (info[id].reg)
140 			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
141 				 info[id].reg, info[id].value);
142 	}
143 }
144 
check_boot_failure(struct xe_device * xe)145 static int check_boot_failure(struct xe_device *xe)
146 {
147 	struct xe_survivability *survivability = &xe->survivability;
148 
149 	return survivability->boot_status == NON_CRITICAL_FAILURE ||
150 		survivability->boot_status == CRITICAL_FAILURE;
151 }
152 
survivability_mode_show(struct device * dev,struct device_attribute * attr,char * buff)153 static ssize_t survivability_mode_show(struct device *dev,
154 				       struct device_attribute *attr, char *buff)
155 {
156 	struct pci_dev *pdev = to_pci_dev(dev);
157 	struct xe_device *xe = pdev_to_xe_device(pdev);
158 	struct xe_survivability *survivability = &xe->survivability;
159 	struct xe_survivability_info *info = survivability->info;
160 	int index = 0, count = 0;
161 
162 	count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n",
163 			       survivability->type ? "Runtime" : "Boot");
164 
165 	if (!check_boot_failure(xe))
166 		return count;
167 
168 	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
169 		if (info[index].reg)
170 			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
171 					       info[index].reg, info[index].value);
172 	}
173 
174 	return count;
175 }
176 
177 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
178 
xe_survivability_mode_fini(void * arg)179 static void xe_survivability_mode_fini(void *arg)
180 {
181 	struct xe_device *xe = arg;
182 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
183 	struct device *dev = &pdev->dev;
184 
185 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
186 }
187 
create_survivability_sysfs(struct pci_dev * pdev)188 static int create_survivability_sysfs(struct pci_dev *pdev)
189 {
190 	struct device *dev = &pdev->dev;
191 	struct xe_device *xe = pdev_to_xe_device(pdev);
192 	int ret;
193 
194 	/* create survivability mode sysfs */
195 	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
196 	if (ret) {
197 		dev_warn(dev, "Failed to create survivability sysfs files\n");
198 		return ret;
199 	}
200 
201 	ret = devm_add_action_or_reset(xe->drm.dev,
202 				       xe_survivability_mode_fini, xe);
203 	if (ret)
204 		return ret;
205 
206 	return 0;
207 }
208 
enable_boot_survivability_mode(struct pci_dev * pdev)209 static int enable_boot_survivability_mode(struct pci_dev *pdev)
210 {
211 	struct device *dev = &pdev->dev;
212 	struct xe_device *xe = pdev_to_xe_device(pdev);
213 	struct xe_survivability *survivability = &xe->survivability;
214 	int ret = 0;
215 
216 	ret = create_survivability_sysfs(pdev);
217 	if (ret)
218 		return ret;
219 
220 	/* Make sure xe_heci_gsc_init() knows about survivability mode */
221 	survivability->mode = true;
222 
223 	ret = xe_heci_gsc_init(xe);
224 	if (ret)
225 		goto err;
226 
227 	xe_vsec_init(xe);
228 
229 	ret = xe_i2c_probe(xe);
230 	if (ret)
231 		goto err;
232 
233 	dev_err(dev, "In Survivability Mode\n");
234 
235 	return 0;
236 
237 err:
238 	survivability->mode = false;
239 	return ret;
240 }
241 
init_survivability_mode(struct xe_device * xe)242 static int init_survivability_mode(struct xe_device *xe)
243 {
244 	struct xe_survivability *survivability = &xe->survivability;
245 	struct xe_survivability_info *info;
246 
247 	survivability->size = MAX_SCRATCH_MMIO;
248 
249 	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
250 			    GFP_KERNEL);
251 	if (!info)
252 		return -ENOMEM;
253 
254 	survivability->info = info;
255 
256 	populate_survivability_info(xe);
257 
258 	return 0;
259 }
260 
261 /**
262  * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled
263  * @xe: xe device instance
264  *
265  * Returns true if in boot survivability mode of type, else false
266  */
xe_survivability_mode_is_boot_enabled(struct xe_device * xe)267 bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe)
268 {
269 	struct xe_survivability *survivability = &xe->survivability;
270 
271 	return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT;
272 }
273 
274 /**
275  * xe_survivability_mode_is_requested - check if it's possible to enable survivability
276  *					mode that was requested by firmware or userspace
277  * @xe: xe device instance
278  *
279  * This function reads configfs and  boot status from Pcode.
280  *
281  * Return: true if platform support is available and boot status indicates
282  * failure or if survivability mode is requested, false otherwise.
283  */
xe_survivability_mode_is_requested(struct xe_device * xe)284 bool xe_survivability_mode_is_requested(struct xe_device *xe)
285 {
286 	struct xe_survivability *survivability = &xe->survivability;
287 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
288 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
289 	u32 data;
290 	bool survivability_mode;
291 
292 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE)
293 		return false;
294 
295 	survivability_mode = xe_configfs_get_survivability_mode(pdev);
296 	/* Enable survivability mode if set via configfs */
297 	if (survivability_mode)
298 		return true;
299 
300 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
301 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
302 
303 	return check_boot_failure(xe);
304 }
305 
306 /**
307  * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode
308  * @xe: xe device instance
309  *
310  * Initialize survivability information and enable runtime survivability mode.
311  * Runtime survivability mode is enabled when certain errors cause the device to be
312  * in non-recoverable state. The device is declared wedged with the appropriate
313  * recovery method and survivability mode sysfs exposed to userspace
314  *
315  * Return: 0 if runtime survivability mode is enabled, negative error code otherwise.
316  */
xe_survivability_mode_runtime_enable(struct xe_device * xe)317 int xe_survivability_mode_runtime_enable(struct xe_device *xe)
318 {
319 	struct xe_survivability *survivability = &xe->survivability;
320 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
321 	int ret;
322 
323 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) {
324 		dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n");
325 		return -EINVAL;
326 	}
327 
328 	ret = init_survivability_mode(xe);
329 	if (ret)
330 		return ret;
331 
332 	ret = create_survivability_sysfs(pdev);
333 	if (ret)
334 		dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n");
335 
336 	survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME;
337 	dev_err(&pdev->dev, "Runtime Survivability mode enabled\n");
338 
339 	xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR);
340 	xe_device_declare_wedged(xe);
341 	dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n");
342 
343 	return 0;
344 }
345 
346 /**
347  * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode
348  * @xe: xe device instance
349  *
350  * Initialize survivability information and enable boot survivability mode
351  *
352  * Return: 0 if boot survivability mode is enabled or not requested, negative error
353  * code otherwise.
354  */
xe_survivability_mode_boot_enable(struct xe_device * xe)355 int xe_survivability_mode_boot_enable(struct xe_device *xe)
356 {
357 	struct xe_survivability *survivability = &xe->survivability;
358 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
359 	int ret;
360 
361 	if (!xe_survivability_mode_is_requested(xe))
362 		return 0;
363 
364 	ret = init_survivability_mode(xe);
365 	if (ret)
366 		return ret;
367 
368 	/* Log breadcrumbs but do not enter survivability mode for Critical boot errors */
369 	if (survivability->boot_status == CRITICAL_FAILURE) {
370 		log_survivability_info(pdev);
371 		return -ENXIO;
372 	}
373 
374 	survivability->type = XE_SURVIVABILITY_TYPE_BOOT;
375 
376 	return enable_boot_survivability_mode(pdev);
377 }
378