xref: /linux/drivers/gpu/drm/xe/xe_survivability_mode.c (revision 4f9786035f9e519db41375818e1d0b5f20da2f10)
15e940312SRiana Tauro // SPDX-License-Identifier: MIT
25e940312SRiana Tauro /*
35e940312SRiana Tauro  * Copyright © 2025 Intel Corporation
45e940312SRiana Tauro  */
55e940312SRiana Tauro 
65e940312SRiana Tauro #include "xe_survivability_mode.h"
75e940312SRiana Tauro #include "xe_survivability_mode_types.h"
85e940312SRiana Tauro 
95e940312SRiana Tauro #include <linux/kobject.h>
105e940312SRiana Tauro #include <linux/pci.h>
115e940312SRiana Tauro #include <linux/sysfs.h>
125e940312SRiana Tauro 
135e940312SRiana Tauro #include "xe_device.h"
145e940312SRiana Tauro #include "xe_gt.h"
158b47c9cdSRiana Tauro #include "xe_heci_gsc.h"
165e940312SRiana Tauro #include "xe_mmio.h"
175e940312SRiana Tauro #include "xe_pcode_api.h"
188b47c9cdSRiana Tauro #include "xe_vsec.h"
195e940312SRiana Tauro 
205e940312SRiana Tauro #define MAX_SCRATCH_MMIO 8
215e940312SRiana Tauro 
225e940312SRiana Tauro /**
235e940312SRiana Tauro  * DOC: Xe Boot Survivability
245e940312SRiana Tauro  *
 * Boot Survivability is a software-based workflow for recovering a system in a failed boot state.
 * Here system recoverability is concerned with recovering the firmware responsible for boot.
275e940312SRiana Tauro  *
285e940312SRiana Tauro  * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
295e940312SRiana Tauro  * to be flashed through mei and collect telemetry. The driver's probe flow is modified
305e940312SRiana Tauro  * such that it enters survivability mode when pcode initialization is incomplete and boot status
 * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating
 * survivability mode and provides additional information required for debug.
335e940312SRiana Tauro  *
 * KMD exposes the following admin-only readable sysfs file in survivability mode:
355e940312SRiana Tauro  *
365e940312SRiana Tauro  * device/survivability_mode: The presence of this file indicates that the card is in survivability
375e940312SRiana Tauro  *			      mode. Also, provides additional information on why the driver entered
385e940312SRiana Tauro  *			      survivability mode.
395e940312SRiana Tauro  *
405e940312SRiana Tauro  *			      Capability Information - Provides boot status
415e940312SRiana Tauro  *			      Postcode Information   - Provides information about the failure
425e940312SRiana Tauro  *			      Overflow Information   - Provides history of previous failures
435e940312SRiana Tauro  *			      Auxiliary Information  - Certain failures may have information in
445e940312SRiana Tauro  *						       addition to postcode information
455e940312SRiana Tauro  */
465e940312SRiana Tauro 
475e940312SRiana Tauro static u32 aux_history_offset(u32 reg_value)
485e940312SRiana Tauro {
495e940312SRiana Tauro 	return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);
505e940312SRiana Tauro }
515e940312SRiana Tauro 
525e940312SRiana Tauro static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
535e940312SRiana Tauro 				   int id, char *name)
545e940312SRiana Tauro {
555e940312SRiana Tauro 	strscpy(info[id].name, name, sizeof(info[id].name));
565e940312SRiana Tauro 	info[id].reg = PCODE_SCRATCH(id).raw;
575e940312SRiana Tauro 	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
585e940312SRiana Tauro }
595e940312SRiana Tauro 
605e940312SRiana Tauro static void populate_survivability_info(struct xe_device *xe)
615e940312SRiana Tauro {
625e940312SRiana Tauro 	struct xe_survivability *survivability = &xe->survivability;
635e940312SRiana Tauro 	struct xe_survivability_info *info = survivability->info;
645e940312SRiana Tauro 	struct xe_mmio *mmio;
655e940312SRiana Tauro 	u32 id = 0, reg_value;
665e940312SRiana Tauro 	char name[NAME_MAX];
675e940312SRiana Tauro 	int index;
685e940312SRiana Tauro 
695e940312SRiana Tauro 	mmio = xe_root_tile_mmio(xe);
705e940312SRiana Tauro 	set_survivability_info(mmio, info, id, "Capability Info");
715e940312SRiana Tauro 	reg_value = info[id].value;
725e940312SRiana Tauro 
735e940312SRiana Tauro 	if (reg_value & HISTORY_TRACKING) {
745e940312SRiana Tauro 		id++;
755e940312SRiana Tauro 		set_survivability_info(mmio, info, id, "Postcode Info");
765e940312SRiana Tauro 
775e940312SRiana Tauro 		if (reg_value & OVERFLOW_SUPPORT) {
785e940312SRiana Tauro 			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
795e940312SRiana Tauro 			set_survivability_info(mmio, info, id, "Overflow Info");
805e940312SRiana Tauro 		}
815e940312SRiana Tauro 	}
825e940312SRiana Tauro 
835e940312SRiana Tauro 	if (reg_value & AUXINFO_SUPPORT) {
845e940312SRiana Tauro 		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
855e940312SRiana Tauro 
865e940312SRiana Tauro 		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
875e940312SRiana Tauro 		     id = aux_history_offset(reg_value)) {
885e940312SRiana Tauro 			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
895e940312SRiana Tauro 			set_survivability_info(mmio, info, id, name);
905e940312SRiana Tauro 		}
915e940312SRiana Tauro 	}
925e940312SRiana Tauro }
935e940312SRiana Tauro 
945e940312SRiana Tauro static void log_survivability_info(struct pci_dev *pdev)
955e940312SRiana Tauro {
965e940312SRiana Tauro 	struct xe_device *xe = pdev_to_xe_device(pdev);
975e940312SRiana Tauro 	struct xe_survivability *survivability = &xe->survivability;
985e940312SRiana Tauro 	struct xe_survivability_info *info = survivability->info;
995e940312SRiana Tauro 	int id;
1005e940312SRiana Tauro 
1015e940312SRiana Tauro 	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
1025e940312SRiana Tauro 		 survivability->boot_status);
1035e940312SRiana Tauro 	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
1045e940312SRiana Tauro 		if (info[id].reg)
1055e940312SRiana Tauro 			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
1065e940312SRiana Tauro 				 info[id].reg, info[id].value);
1075e940312SRiana Tauro 	}
1085e940312SRiana Tauro }
1095e940312SRiana Tauro 
1105e940312SRiana Tauro static ssize_t survivability_mode_show(struct device *dev,
1115e940312SRiana Tauro 				       struct device_attribute *attr, char *buff)
1125e940312SRiana Tauro {
1135e940312SRiana Tauro 	struct pci_dev *pdev = to_pci_dev(dev);
1145e940312SRiana Tauro 	struct xe_device *xe = pdev_to_xe_device(pdev);
1155e940312SRiana Tauro 	struct xe_survivability *survivability = &xe->survivability;
1165e940312SRiana Tauro 	struct xe_survivability_info *info = survivability->info;
1175e940312SRiana Tauro 	int index = 0, count = 0;
1185e940312SRiana Tauro 
1195e940312SRiana Tauro 	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
1205e940312SRiana Tauro 		if (info[index].reg)
1215e940312SRiana Tauro 			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
1225e940312SRiana Tauro 					       info[index].reg, info[index].value);
1235e940312SRiana Tauro 	}
1245e940312SRiana Tauro 
1255e940312SRiana Tauro 	return count;
1265e940312SRiana Tauro }
1275e940312SRiana Tauro 
1285e940312SRiana Tauro static DEVICE_ATTR_ADMIN_RO(survivability_mode);
1295e940312SRiana Tauro 
130d40f275dSLucas De Marchi static void xe_survivability_mode_fini(void *arg)
131d40f275dSLucas De Marchi {
132d40f275dSLucas De Marchi 	struct xe_device *xe = arg;
133d40f275dSLucas De Marchi 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
134d40f275dSLucas De Marchi 	struct device *dev = &pdev->dev;
135d40f275dSLucas De Marchi 
136d40f275dSLucas De Marchi 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
137d40f275dSLucas De Marchi }
138d40f275dSLucas De Marchi 
139d40f275dSLucas De Marchi static int enable_survivability_mode(struct pci_dev *pdev)
1405e940312SRiana Tauro {
1415e940312SRiana Tauro 	struct device *dev = &pdev->dev;
1425e940312SRiana Tauro 	struct xe_device *xe = pdev_to_xe_device(pdev);
1435e940312SRiana Tauro 	struct xe_survivability *survivability = &xe->survivability;
1445e940312SRiana Tauro 	int ret = 0;
1455e940312SRiana Tauro 
1465e940312SRiana Tauro 	/* create survivability mode sysfs */
1475e940312SRiana Tauro 	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
1485e940312SRiana Tauro 	if (ret) {
1495e940312SRiana Tauro 		dev_warn(dev, "Failed to create survivability sysfs files\n");
150d40f275dSLucas De Marchi 		return ret;
1515e940312SRiana Tauro 	}
1528b47c9cdSRiana Tauro 
153d40f275dSLucas De Marchi 	ret = devm_add_action_or_reset(xe->drm.dev,
154d40f275dSLucas De Marchi 				       xe_survivability_mode_fini, xe);
155d40f275dSLucas De Marchi 	if (ret)
156d40f275dSLucas De Marchi 		return ret;
157d40f275dSLucas De Marchi 
158*22d00862SLucas De Marchi 	/* Make sure xe_heci_gsc_init() knows about survivability mode */
159*22d00862SLucas De Marchi 	survivability->mode = true;
160*22d00862SLucas De Marchi 
161292b1a8aSLucas De Marchi 	ret = xe_heci_gsc_init(xe);
162*22d00862SLucas De Marchi 	if (ret) {
163*22d00862SLucas De Marchi 		/*
164*22d00862SLucas De Marchi 		 * But if it fails, device can't enter survivability
165*22d00862SLucas De Marchi 		 * so move it back for correct error handling
166*22d00862SLucas De Marchi 		 */
167*22d00862SLucas De Marchi 		survivability->mode = false;
168292b1a8aSLucas De Marchi 		return ret;
169*22d00862SLucas De Marchi 	}
1708b47c9cdSRiana Tauro 
1718b47c9cdSRiana Tauro 	xe_vsec_init(xe);
172d40f275dSLucas De Marchi 
173d40f275dSLucas De Marchi 	dev_err(dev, "In Survivability Mode\n");
174d40f275dSLucas De Marchi 
175d40f275dSLucas De Marchi 	return 0;
1765e940312SRiana Tauro }
1775e940312SRiana Tauro 
1785e940312SRiana Tauro /**
179d40f275dSLucas De Marchi  * xe_survivability_mode_is_enabled - check if survivability mode is enabled
180256daa32SRiana Tauro  * @xe: xe device instance
181256daa32SRiana Tauro  *
182256daa32SRiana Tauro  * Returns true if in survivability mode, false otherwise
183256daa32SRiana Tauro  */
184d40f275dSLucas De Marchi bool xe_survivability_mode_is_enabled(struct xe_device *xe)
185256daa32SRiana Tauro {
186d40f275dSLucas De Marchi 	return xe->survivability.mode;
187256daa32SRiana Tauro }
188256daa32SRiana Tauro 
189caf2f156SLucas De Marchi /*
190caf2f156SLucas De Marchi  * survivability_mode_requested - check if it's possible to enable
191caf2f156SLucas De Marchi  * survivability mode and that was requested by firmware
1925e940312SRiana Tauro  *
193caf2f156SLucas De Marchi  * This function reads the boot status from Pcode.
1945e940312SRiana Tauro  *
195caf2f156SLucas De Marchi  * Return: true if platform support is available and boot status indicates
196caf2f156SLucas De Marchi  * failure, false otherwise.
1975e940312SRiana Tauro  */
198caf2f156SLucas De Marchi static bool survivability_mode_requested(struct xe_device *xe)
1995e940312SRiana Tauro {
2005e940312SRiana Tauro 	struct xe_survivability *survivability = &xe->survivability;
2015e940312SRiana Tauro 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
2025e940312SRiana Tauro 	u32 data;
2035e940312SRiana Tauro 
204d9bc3044SRiana Tauro 	if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
205256daa32SRiana Tauro 		return false;
206256daa32SRiana Tauro 
2075e940312SRiana Tauro 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
2085e940312SRiana Tauro 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
2095e940312SRiana Tauro 
210d40f275dSLucas De Marchi 	return survivability->boot_status == NON_CRITICAL_FAILURE ||
211d40f275dSLucas De Marchi 		survivability->boot_status == CRITICAL_FAILURE;
2125e940312SRiana Tauro }
2135e940312SRiana Tauro 
2145e940312SRiana Tauro /**
215d40f275dSLucas De Marchi  * xe_survivability_mode_enable - Initialize and enable the survivability mode
2165e940312SRiana Tauro  * @xe: xe device instance
2175e940312SRiana Tauro  *
218d40f275dSLucas De Marchi  * Initialize survivability information and enable survivability mode
2195e940312SRiana Tauro  *
220caf2f156SLucas De Marchi  * Return: 0 if survivability mode is enabled or not requested; negative error
221caf2f156SLucas De Marchi  * code otherwise.
2225e940312SRiana Tauro  */
223d40f275dSLucas De Marchi int xe_survivability_mode_enable(struct xe_device *xe)
2245e940312SRiana Tauro {
2255e940312SRiana Tauro 	struct xe_survivability *survivability = &xe->survivability;
2265e940312SRiana Tauro 	struct xe_survivability_info *info;
2275e940312SRiana Tauro 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
2285e940312SRiana Tauro 
229caf2f156SLucas De Marchi 	if (!survivability_mode_requested(xe))
230caf2f156SLucas De Marchi 		return 0;
231caf2f156SLucas De Marchi 
2325e940312SRiana Tauro 	survivability->size = MAX_SCRATCH_MMIO;
2335e940312SRiana Tauro 
234d40f275dSLucas De Marchi 	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
235d40f275dSLucas De Marchi 			    GFP_KERNEL);
2365e940312SRiana Tauro 	if (!info)
237d40f275dSLucas De Marchi 		return -ENOMEM;
2385e940312SRiana Tauro 
2395e940312SRiana Tauro 	survivability->info = info;
2405e940312SRiana Tauro 
2415e940312SRiana Tauro 	populate_survivability_info(xe);
2425e940312SRiana Tauro 
2435e940312SRiana Tauro 	/* Only log debug information and exit if it is a critical failure */
2445e940312SRiana Tauro 	if (survivability->boot_status == CRITICAL_FAILURE) {
2455e940312SRiana Tauro 		log_survivability_info(pdev);
246d40f275dSLucas De Marchi 		return -ENXIO;
2475e940312SRiana Tauro 	}
2485e940312SRiana Tauro 
249d40f275dSLucas De Marchi 	return enable_survivability_mode(pdev);
2505e940312SRiana Tauro }
251