1*b40db12bSRiana Tauro // SPDX-License-Identifier: MIT 2*b40db12bSRiana Tauro /* 3*b40db12bSRiana Tauro * Copyright © 2026 Intel Corporation 4*b40db12bSRiana Tauro */ 5*b40db12bSRiana Tauro 6*b40db12bSRiana Tauro #include <linux/bitmap.h> 7*b40db12bSRiana Tauro 8*b40db12bSRiana Tauro #include <drm/drm_managed.h> 9*b40db12bSRiana Tauro #include <drm/drm_print.h> 10*b40db12bSRiana Tauro #include <drm/drm_ras.h> 11*b40db12bSRiana Tauro 12*b40db12bSRiana Tauro #include "xe_device_types.h" 13*b40db12bSRiana Tauro #include "xe_drm_ras.h" 14*b40db12bSRiana Tauro 15*b40db12bSRiana Tauro static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES; 16*b40db12bSRiana Tauro static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; 17*b40db12bSRiana Tauro 18*b40db12bSRiana Tauro static int hw_query_error_counter(struct xe_drm_ras_counter *info, 19*b40db12bSRiana Tauro u32 error_id, const char **name, u32 *val) 20*b40db12bSRiana Tauro { 21*b40db12bSRiana Tauro if (!info || !info[error_id].name) 22*b40db12bSRiana Tauro return -ENOENT; 23*b40db12bSRiana Tauro 24*b40db12bSRiana Tauro *name = info[error_id].name; 25*b40db12bSRiana Tauro *val = atomic_read(&info[error_id].counter); 26*b40db12bSRiana Tauro 27*b40db12bSRiana Tauro return 0; 28*b40db12bSRiana Tauro } 29*b40db12bSRiana Tauro 30*b40db12bSRiana Tauro static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id, 31*b40db12bSRiana Tauro const char **name, u32 *val) 32*b40db12bSRiana Tauro { 33*b40db12bSRiana Tauro struct xe_device *xe = ep->priv; 34*b40db12bSRiana Tauro struct xe_drm_ras *ras = &xe->ras; 35*b40db12bSRiana Tauro struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]; 36*b40db12bSRiana Tauro 37*b40db12bSRiana Tauro return hw_query_error_counter(info, error_id, name, val); 38*b40db12bSRiana Tauro } 39*b40db12bSRiana Tauro 40*b40db12bSRiana Tauro static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id, 41*b40db12bSRiana Tauro const char **name, u32 *val) 42*b40db12bSRiana Tauro { 43*b40db12bSRiana Tauro struct xe_device *xe = ep->priv; 44*b40db12bSRiana Tauro struct xe_drm_ras *ras = &xe->ras; 45*b40db12bSRiana Tauro struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; 46*b40db12bSRiana Tauro 47*b40db12bSRiana Tauro return hw_query_error_counter(info, error_id, name, val); 48*b40db12bSRiana Tauro } 49*b40db12bSRiana Tauro 50*b40db12bSRiana Tauro static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe) 51*b40db12bSRiana Tauro { 52*b40db12bSRiana Tauro struct xe_drm_ras_counter *counter; 53*b40db12bSRiana Tauro int i; 54*b40db12bSRiana Tauro 55*b40db12bSRiana Tauro counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL); 56*b40db12bSRiana Tauro if (!counter) 57*b40db12bSRiana Tauro return ERR_PTR(-ENOMEM); 58*b40db12bSRiana Tauro 59*b40db12bSRiana Tauro for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) { 60*b40db12bSRiana Tauro if (!error_components[i]) 61*b40db12bSRiana Tauro continue; 62*b40db12bSRiana Tauro 63*b40db12bSRiana Tauro counter[i].name = error_components[i]; 64*b40db12bSRiana Tauro atomic_set(&counter[i].counter, 0); 65*b40db12bSRiana Tauro } 66*b40db12bSRiana Tauro 67*b40db12bSRiana Tauro return counter; 68*b40db12bSRiana Tauro } 69*b40db12bSRiana Tauro 70*b40db12bSRiana Tauro static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node, 71*b40db12bSRiana Tauro const enum drm_xe_ras_error_severity severity) 72*b40db12bSRiana Tauro { 73*b40db12bSRiana Tauro struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 74*b40db12bSRiana Tauro struct xe_drm_ras *ras = &xe->ras; 75*b40db12bSRiana Tauro const char *device_name; 76*b40db12bSRiana Tauro 77*b40db12bSRiana Tauro device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d", 78*b40db12bSRiana Tauro pci_domain_nr(pdev->bus), pdev->bus->number, 79*b40db12bSRiana Tauro PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); 80*b40db12bSRiana Tauro 81*b40db12bSRiana Tauro if (!device_name) 82*b40db12bSRiana Tauro return -ENOMEM; 83*b40db12bSRiana Tauro 84*b40db12bSRiana Tauro node->device_name = device_name; 85*b40db12bSRiana Tauro node->node_name = error_severity[severity]; 86*b40db12bSRiana Tauro node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER; 87*b40db12bSRiana Tauro node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; 88*b40db12bSRiana Tauro node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1; 89*b40db12bSRiana Tauro node->priv = xe; 90*b40db12bSRiana Tauro 91*b40db12bSRiana Tauro ras->info[severity] = allocate_and_copy_counters(xe); 92*b40db12bSRiana Tauro if (IS_ERR(ras->info[severity])) 93*b40db12bSRiana Tauro return PTR_ERR(ras->info[severity]); 94*b40db12bSRiana Tauro 95*b40db12bSRiana Tauro if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) 96*b40db12bSRiana Tauro node->query_error_counter = query_correctable_error_counter; 97*b40db12bSRiana Tauro else 98*b40db12bSRiana Tauro node->query_error_counter = query_uncorrectable_error_counter; 99*b40db12bSRiana Tauro 100*b40db12bSRiana Tauro return 0; 101*b40db12bSRiana Tauro } 102*b40db12bSRiana Tauro 103*b40db12bSRiana Tauro static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity) 104*b40db12bSRiana Tauro { 105*b40db12bSRiana Tauro struct drm_ras_node *node = &ras->node[severity]; 106*b40db12bSRiana Tauro 107*b40db12bSRiana Tauro kfree(ras->info[severity]); 108*b40db12bSRiana Tauro ras->info[severity] = NULL; 109*b40db12bSRiana Tauro 110*b40db12bSRiana Tauro kfree(node->device_name); 111*b40db12bSRiana Tauro node->device_name = NULL; 112*b40db12bSRiana Tauro } 113*b40db12bSRiana Tauro 114*b40db12bSRiana Tauro static int register_nodes(struct xe_device *xe) 115*b40db12bSRiana Tauro { 116*b40db12bSRiana Tauro struct xe_drm_ras *ras = &xe->ras; 117*b40db12bSRiana Tauro int i; 118*b40db12bSRiana Tauro 119*b40db12bSRiana Tauro for_each_error_severity(i) { 120*b40db12bSRiana Tauro struct drm_ras_node *node = &ras->node[i]; 121*b40db12bSRiana Tauro int ret; 122*b40db12bSRiana Tauro 123*b40db12bSRiana Tauro ret = assign_node_params(xe, node, i); 124*b40db12bSRiana Tauro if (ret) { 125*b40db12bSRiana Tauro cleanup_node_param(ras, i); 126*b40db12bSRiana Tauro return ret; 127*b40db12bSRiana Tauro } 128*b40db12bSRiana Tauro 129*b40db12bSRiana Tauro ret = drm_ras_node_register(node); 130*b40db12bSRiana Tauro if (ret) { 131*b40db12bSRiana Tauro cleanup_node_param(ras, i); 132*b40db12bSRiana Tauro return ret; 133*b40db12bSRiana Tauro } 134*b40db12bSRiana Tauro } 135*b40db12bSRiana Tauro 136*b40db12bSRiana Tauro return 0; 137*b40db12bSRiana Tauro } 138*b40db12bSRiana Tauro 139*b40db12bSRiana Tauro static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg) 140*b40db12bSRiana Tauro { 141*b40db12bSRiana Tauro struct xe_device *xe = arg; 142*b40db12bSRiana Tauro struct xe_drm_ras *ras = &xe->ras; 143*b40db12bSRiana Tauro int i; 144*b40db12bSRiana Tauro 145*b40db12bSRiana Tauro for_each_error_severity(i) { 146*b40db12bSRiana Tauro struct drm_ras_node *node = &ras->node[i]; 147*b40db12bSRiana Tauro 148*b40db12bSRiana Tauro drm_ras_node_unregister(node); 149*b40db12bSRiana Tauro cleanup_node_param(ras, i); 150*b40db12bSRiana Tauro } 151*b40db12bSRiana Tauro } 152*b40db12bSRiana Tauro 153*b40db12bSRiana Tauro /** 154*b40db12bSRiana Tauro * xe_drm_ras_init() - Initialize DRM RAS 155*b40db12bSRiana Tauro * @xe: xe device instance 156*b40db12bSRiana Tauro * 157*b40db12bSRiana Tauro * Allocate and register DRM RAS nodes per device 158*b40db12bSRiana Tauro * 159*b40db12bSRiana Tauro * Return: 0 on success, negative error code otherwise. 160*b40db12bSRiana Tauro */ 161*b40db12bSRiana Tauro int xe_drm_ras_init(struct xe_device *xe) 162*b40db12bSRiana Tauro { 163*b40db12bSRiana Tauro struct xe_drm_ras *ras = &xe->ras; 164*b40db12bSRiana Tauro struct drm_ras_node *node; 165*b40db12bSRiana Tauro int err; 166*b40db12bSRiana Tauro 167*b40db12bSRiana Tauro node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL); 168*b40db12bSRiana Tauro if (!node) 169*b40db12bSRiana Tauro return -ENOMEM; 170*b40db12bSRiana Tauro 171*b40db12bSRiana Tauro ras->node = node; 172*b40db12bSRiana Tauro 173*b40db12bSRiana Tauro err = register_nodes(xe); 174*b40db12bSRiana Tauro if (err) { 175*b40db12bSRiana Tauro drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err)); 176*b40db12bSRiana Tauro return err; 177*b40db12bSRiana Tauro } 178*b40db12bSRiana Tauro 179*b40db12bSRiana Tauro err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe); 180*b40db12bSRiana Tauro if (err) { 181*b40db12bSRiana Tauro drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err)); 182*b40db12bSRiana Tauro return err; 183*b40db12bSRiana Tauro } 184*b40db12bSRiana Tauro 185*b40db12bSRiana Tauro return 0; 186*b40db12bSRiana Tauro } 187