15e940312SRiana Tauro // SPDX-License-Identifier: MIT 25e940312SRiana Tauro /* 35e940312SRiana Tauro * Copyright © 2025 Intel Corporation 45e940312SRiana Tauro */ 55e940312SRiana Tauro 65e940312SRiana Tauro #include "xe_survivability_mode.h" 75e940312SRiana Tauro #include "xe_survivability_mode_types.h" 85e940312SRiana Tauro 95e940312SRiana Tauro #include <linux/kobject.h> 105e940312SRiana Tauro #include <linux/pci.h> 115e940312SRiana Tauro #include <linux/sysfs.h> 125e940312SRiana Tauro 135e940312SRiana Tauro #include "xe_device.h" 145e940312SRiana Tauro #include "xe_gt.h" 158b47c9cdSRiana Tauro #include "xe_heci_gsc.h" 165e940312SRiana Tauro #include "xe_mmio.h" 175e940312SRiana Tauro #include "xe_pcode_api.h" 188b47c9cdSRiana Tauro #include "xe_vsec.h" 195e940312SRiana Tauro 205e940312SRiana Tauro #define MAX_SCRATCH_MMIO 8 215e940312SRiana Tauro 225e940312SRiana Tauro /** 235e940312SRiana Tauro * DOC: Xe Boot Survivability 245e940312SRiana Tauro * 255e940312SRiana Tauro * Boot Survivability is a software based workflow for recovering a system in a failed boot state 265e940312SRiana Tauro * Here system recoverability is concerned with recovering the firmware responsible for boot. 275e940312SRiana Tauro * 285e940312SRiana Tauro * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware 295e940312SRiana Tauro * to be flashed through mei and collect telemetry. The driver's probe flow is modified 305e940312SRiana Tauro * such that it enters survivability mode when pcode initialization is incomplete and boot status 315e940312SRiana Tauro * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating 325e940312SRiana Tauro * survivability mode and provides additional information required for debug 335e940312SRiana Tauro * 345e940312SRiana Tauro * KMD exposes below admin-only readable sysfs in survivability mode 355e940312SRiana Tauro * 365e940312SRiana Tauro * device/survivability_mode: The presence of this file indicates that the card is in survivability 375e940312SRiana Tauro * mode. Also, provides additional information on why the driver entered 385e940312SRiana Tauro * survivability mode. 395e940312SRiana Tauro * 405e940312SRiana Tauro * Capability Information - Provides boot status 415e940312SRiana Tauro * Postcode Information - Provides information about the failure 425e940312SRiana Tauro * Overflow Information - Provides history of previous failures 435e940312SRiana Tauro * Auxiliary Information - Certain failures may have information in 445e940312SRiana Tauro * addition to postcode information 455e940312SRiana Tauro */ 465e940312SRiana Tauro 475e940312SRiana Tauro static u32 aux_history_offset(u32 reg_value) 485e940312SRiana Tauro { 495e940312SRiana Tauro return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); 505e940312SRiana Tauro } 515e940312SRiana Tauro 525e940312SRiana Tauro static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, 535e940312SRiana Tauro int id, char *name) 545e940312SRiana Tauro { 555e940312SRiana Tauro strscpy(info[id].name, name, sizeof(info[id].name)); 565e940312SRiana Tauro info[id].reg = PCODE_SCRATCH(id).raw; 575e940312SRiana Tauro info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); 585e940312SRiana Tauro } 595e940312SRiana Tauro 605e940312SRiana Tauro static void populate_survivability_info(struct xe_device *xe) 615e940312SRiana Tauro { 625e940312SRiana Tauro struct xe_survivability *survivability = &xe->survivability; 635e940312SRiana Tauro struct xe_survivability_info *info = survivability->info; 645e940312SRiana Tauro struct xe_mmio *mmio; 655e940312SRiana Tauro u32 id = 0, reg_value; 665e940312SRiana Tauro char name[NAME_MAX]; 675e940312SRiana Tauro int index; 685e940312SRiana Tauro 695e940312SRiana Tauro mmio = xe_root_tile_mmio(xe); 705e940312SRiana Tauro set_survivability_info(mmio, info, id, "Capability Info"); 715e940312SRiana Tauro reg_value = info[id].value; 725e940312SRiana Tauro 735e940312SRiana Tauro if (reg_value & HISTORY_TRACKING) { 745e940312SRiana Tauro id++; 755e940312SRiana Tauro set_survivability_info(mmio, info, id, "Postcode Info"); 765e940312SRiana Tauro 775e940312SRiana Tauro if (reg_value & OVERFLOW_SUPPORT) { 785e940312SRiana Tauro id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); 795e940312SRiana Tauro set_survivability_info(mmio, info, id, "Overflow Info"); 805e940312SRiana Tauro } 815e940312SRiana Tauro } 825e940312SRiana Tauro 835e940312SRiana Tauro if (reg_value & AUXINFO_SUPPORT) { 845e940312SRiana Tauro id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); 855e940312SRiana Tauro 865e940312SRiana Tauro for (index = 0; id && reg_value; index++, reg_value = info[id].value, 875e940312SRiana Tauro id = aux_history_offset(reg_value)) { 885e940312SRiana Tauro snprintf(name, NAME_MAX, "Auxiliary Info %d", index); 895e940312SRiana Tauro set_survivability_info(mmio, info, id, name); 905e940312SRiana Tauro } 915e940312SRiana Tauro } 925e940312SRiana Tauro } 935e940312SRiana Tauro 945e940312SRiana Tauro static void log_survivability_info(struct pci_dev *pdev) 955e940312SRiana Tauro { 965e940312SRiana Tauro struct xe_device *xe = pdev_to_xe_device(pdev); 975e940312SRiana Tauro struct xe_survivability *survivability = &xe->survivability; 985e940312SRiana Tauro struct xe_survivability_info *info = survivability->info; 995e940312SRiana Tauro int id; 1005e940312SRiana Tauro 1015e940312SRiana Tauro dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", 1025e940312SRiana Tauro survivability->boot_status); 1035e940312SRiana Tauro for (id = 0; id < MAX_SCRATCH_MMIO; id++) { 1045e940312SRiana Tauro if (info[id].reg) 1055e940312SRiana Tauro dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, 1065e940312SRiana Tauro info[id].reg, info[id].value); 1075e940312SRiana Tauro } 1085e940312SRiana Tauro } 1095e940312SRiana Tauro 1105e940312SRiana Tauro static ssize_t survivability_mode_show(struct device *dev, 1115e940312SRiana Tauro struct device_attribute *attr, char *buff) 1125e940312SRiana Tauro { 1135e940312SRiana Tauro struct pci_dev *pdev = to_pci_dev(dev); 1145e940312SRiana Tauro struct xe_device *xe = pdev_to_xe_device(pdev); 1155e940312SRiana Tauro struct xe_survivability *survivability = &xe->survivability; 1165e940312SRiana Tauro struct xe_survivability_info *info = survivability->info; 1175e940312SRiana Tauro int index = 0, count = 0; 1185e940312SRiana Tauro 1195e940312SRiana Tauro for (index = 0; index < MAX_SCRATCH_MMIO; index++) { 1205e940312SRiana Tauro if (info[index].reg) 1215e940312SRiana Tauro count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, 1225e940312SRiana Tauro info[index].reg, info[index].value); 1235e940312SRiana Tauro } 1245e940312SRiana Tauro 1255e940312SRiana Tauro return count; 1265e940312SRiana Tauro } 1275e940312SRiana Tauro 1285e940312SRiana Tauro static DEVICE_ATTR_ADMIN_RO(survivability_mode); 1295e940312SRiana Tauro 130d40f275dSLucas De Marchi static void xe_survivability_mode_fini(void *arg) 131d40f275dSLucas De Marchi { 132d40f275dSLucas De Marchi struct xe_device *xe = arg; 133d40f275dSLucas De Marchi struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 134d40f275dSLucas De Marchi struct device *dev = &pdev->dev; 135d40f275dSLucas De Marchi 136d40f275dSLucas De Marchi sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); 137d40f275dSLucas De Marchi } 138d40f275dSLucas De Marchi 139d40f275dSLucas De Marchi static int enable_survivability_mode(struct pci_dev *pdev) 1405e940312SRiana Tauro { 1415e940312SRiana Tauro struct device *dev = &pdev->dev; 1425e940312SRiana Tauro struct xe_device *xe = pdev_to_xe_device(pdev); 1435e940312SRiana Tauro struct xe_survivability *survivability = &xe->survivability; 1445e940312SRiana Tauro int ret = 0; 1455e940312SRiana Tauro 1465e940312SRiana Tauro /* create survivability mode sysfs */ 1475e940312SRiana Tauro ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); 1485e940312SRiana Tauro if (ret) { 1495e940312SRiana Tauro dev_warn(dev, "Failed to create survivability sysfs files\n"); 150d40f275dSLucas De Marchi return ret; 1515e940312SRiana Tauro } 1528b47c9cdSRiana Tauro 153d40f275dSLucas De Marchi ret = devm_add_action_or_reset(xe->drm.dev, 154d40f275dSLucas De Marchi xe_survivability_mode_fini, xe); 155d40f275dSLucas De Marchi if (ret) 156d40f275dSLucas De Marchi return ret; 157d40f275dSLucas De Marchi 158*22d00862SLucas De Marchi /* Make sure xe_heci_gsc_init() knows about survivability mode */ 159*22d00862SLucas De Marchi survivability->mode = true; 160*22d00862SLucas De Marchi 161292b1a8aSLucas De Marchi ret = xe_heci_gsc_init(xe); 162*22d00862SLucas De Marchi if (ret) { 163*22d00862SLucas De Marchi /* 164*22d00862SLucas De Marchi * But if it fails, device can't enter survivability 165*22d00862SLucas De Marchi * so move it back for correct error handling 166*22d00862SLucas De Marchi */ 167*22d00862SLucas De Marchi survivability->mode = false; 168292b1a8aSLucas De Marchi return ret; 169*22d00862SLucas De Marchi } 1708b47c9cdSRiana Tauro 1718b47c9cdSRiana Tauro xe_vsec_init(xe); 172d40f275dSLucas De Marchi 173d40f275dSLucas De Marchi dev_err(dev, "In Survivability Mode\n"); 174d40f275dSLucas De Marchi 175d40f275dSLucas De Marchi return 0; 1765e940312SRiana Tauro } 1775e940312SRiana Tauro 1785e940312SRiana Tauro /** 179d40f275dSLucas De Marchi * xe_survivability_mode_is_enabled - check if survivability mode is enabled 180256daa32SRiana Tauro * @xe: xe device instance 181256daa32SRiana Tauro * 182256daa32SRiana Tauro * Returns true if in survivability mode, false otherwise 183256daa32SRiana Tauro */ 184d40f275dSLucas De Marchi bool xe_survivability_mode_is_enabled(struct xe_device *xe) 185256daa32SRiana Tauro { 186d40f275dSLucas De Marchi return xe->survivability.mode; 187256daa32SRiana Tauro } 188256daa32SRiana Tauro 189caf2f156SLucas De Marchi /* 190caf2f156SLucas De Marchi * survivability_mode_requested - check if it's possible to enable 191caf2f156SLucas De Marchi * survivability mode and that was requested by firmware 1925e940312SRiana Tauro * 193caf2f156SLucas De Marchi * This function reads the boot status from Pcode. 1945e940312SRiana Tauro * 195caf2f156SLucas De Marchi * Return: true if platform support is available and boot status indicates 196caf2f156SLucas De Marchi * failure, false otherwise. 1975e940312SRiana Tauro */ 198caf2f156SLucas De Marchi static bool survivability_mode_requested(struct xe_device *xe) 1995e940312SRiana Tauro { 2005e940312SRiana Tauro struct xe_survivability *survivability = &xe->survivability; 2015e940312SRiana Tauro struct xe_mmio *mmio = xe_root_tile_mmio(xe); 2025e940312SRiana Tauro u32 data; 2035e940312SRiana Tauro 204d9bc3044SRiana Tauro if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe)) 205256daa32SRiana Tauro return false; 206256daa32SRiana Tauro 2075e940312SRiana Tauro data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); 2085e940312SRiana Tauro survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); 2095e940312SRiana Tauro 210d40f275dSLucas De Marchi return survivability->boot_status == NON_CRITICAL_FAILURE || 211d40f275dSLucas De Marchi survivability->boot_status == CRITICAL_FAILURE; 2125e940312SRiana Tauro } 2135e940312SRiana Tauro 2145e940312SRiana Tauro /** 215d40f275dSLucas De Marchi * xe_survivability_mode_enable - Initialize and enable the survivability mode 2165e940312SRiana Tauro * @xe: xe device instance 2175e940312SRiana Tauro * 218d40f275dSLucas De Marchi * Initialize survivability information and enable survivability mode 2195e940312SRiana Tauro * 220caf2f156SLucas De Marchi * Return: 0 if survivability mode is enabled or not requested; negative error 221caf2f156SLucas De Marchi * code otherwise. 2225e940312SRiana Tauro */ 223d40f275dSLucas De Marchi int xe_survivability_mode_enable(struct xe_device *xe) 2245e940312SRiana Tauro { 2255e940312SRiana Tauro struct xe_survivability *survivability = &xe->survivability; 2265e940312SRiana Tauro struct xe_survivability_info *info; 2275e940312SRiana Tauro struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 2285e940312SRiana Tauro 229caf2f156SLucas De Marchi if (!survivability_mode_requested(xe)) 230caf2f156SLucas De Marchi return 0; 231caf2f156SLucas De Marchi 2325e940312SRiana Tauro survivability->size = MAX_SCRATCH_MMIO; 2335e940312SRiana Tauro 234d40f275dSLucas De Marchi info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), 235d40f275dSLucas De Marchi GFP_KERNEL); 2365e940312SRiana Tauro if (!info) 237d40f275dSLucas De Marchi return -ENOMEM; 2385e940312SRiana Tauro 2395e940312SRiana Tauro survivability->info = info; 2405e940312SRiana Tauro 2415e940312SRiana Tauro populate_survivability_info(xe); 2425e940312SRiana Tauro 2435e940312SRiana Tauro /* Only log debug information and exit if it is a critical failure */ 2445e940312SRiana Tauro if (survivability->boot_status == CRITICAL_FAILURE) { 2455e940312SRiana Tauro log_survivability_info(pdev); 246d40f275dSLucas De Marchi return -ENXIO; 2475e940312SRiana Tauro } 2485e940312SRiana Tauro 249d40f275dSLucas De Marchi return enable_survivability_mode(pdev); 2505e940312SRiana Tauro } 251