1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2026 Intel Corporation 4 */ 5 6 #include <linux/bitmap.h> 7 8 #include <drm/drm_managed.h> 9 #include <drm/drm_print.h> 10 #include <drm/drm_ras.h> 11 12 #include "xe_device_types.h" 13 #include "xe_drm_ras.h" 14 15 static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES; 16 static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; 17 18 static int hw_query_error_counter(struct xe_drm_ras_counter *info, 19 u32 error_id, const char **name, u32 *val) 20 { 21 if (!info || !info[error_id].name) 22 return -ENOENT; 23 24 *name = info[error_id].name; 25 *val = atomic_read(&info[error_id].counter); 26 27 return 0; 28 } 29 30 static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id, 31 const char **name, u32 *val) 32 { 33 struct xe_device *xe = ep->priv; 34 struct xe_drm_ras *ras = &xe->ras; 35 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]; 36 37 return hw_query_error_counter(info, error_id, name, val); 38 } 39 40 static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id, 41 const char **name, u32 *val) 42 { 43 struct xe_device *xe = ep->priv; 44 struct xe_drm_ras *ras = &xe->ras; 45 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; 46 47 return hw_query_error_counter(info, error_id, name, val); 48 } 49 50 static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe) 51 { 52 struct xe_drm_ras_counter *counter; 53 int i; 54 55 counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL); 56 if (!counter) 57 return ERR_PTR(-ENOMEM); 58 59 for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) { 60 if (!error_components[i]) 61 continue; 62 63 counter[i].name = error_components[i]; 64 atomic_set(&counter[i].counter, 0); 65 } 66 67 return counter; 68 } 69 70 static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node, 71 const enum drm_xe_ras_error_severity severity) 72 { 73 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 74 struct xe_drm_ras *ras = &xe->ras; 75 const char *device_name; 76 77 device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d", 78 pci_domain_nr(pdev->bus), pdev->bus->number, 79 PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); 80 81 if (!device_name) 82 return -ENOMEM; 83 84 node->device_name = device_name; 85 node->node_name = error_severity[severity]; 86 node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER; 87 node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; 88 node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1; 89 node->priv = xe; 90 91 ras->info[severity] = allocate_and_copy_counters(xe); 92 if (IS_ERR(ras->info[severity])) 93 return PTR_ERR(ras->info[severity]); 94 95 if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) 96 node->query_error_counter = query_correctable_error_counter; 97 else 98 node->query_error_counter = query_uncorrectable_error_counter; 99 100 return 0; 101 } 102 103 static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity) 104 { 105 struct drm_ras_node *node = &ras->node[severity]; 106 107 kfree(ras->info[severity]); 108 ras->info[severity] = NULL; 109 110 kfree(node->device_name); 111 node->device_name = NULL; 112 } 113 114 static int register_nodes(struct xe_device *xe) 115 { 116 struct xe_drm_ras *ras = &xe->ras; 117 int i; 118 119 for_each_error_severity(i) { 120 struct drm_ras_node *node = &ras->node[i]; 121 int ret; 122 123 ret = assign_node_params(xe, node, i); 124 if (ret) { 125 cleanup_node_param(ras, i); 126 return ret; 127 } 128 129 ret = drm_ras_node_register(node); 130 if (ret) { 131 cleanup_node_param(ras, i); 132 return ret; 133 } 134 } 135 136 return 0; 137 } 138 139 static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg) 140 { 141 struct xe_device *xe = arg; 142 struct xe_drm_ras *ras = &xe->ras; 143 int i; 144 145 for_each_error_severity(i) { 146 struct drm_ras_node *node = &ras->node[i]; 147 148 drm_ras_node_unregister(node); 149 cleanup_node_param(ras, i); 150 } 151 } 152 153 /** 154 * xe_drm_ras_init() - Initialize DRM RAS 155 * @xe: xe device instance 156 * 157 * Allocate and register DRM RAS nodes per device 158 * 159 * Return: 0 on success, negative error code otherwise. 160 */ 161 int xe_drm_ras_init(struct xe_device *xe) 162 { 163 struct xe_drm_ras *ras = &xe->ras; 164 struct drm_ras_node *node; 165 int err; 166 167 node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL); 168 if (!node) 169 return -ENOMEM; 170 171 ras->node = node; 172 173 err = register_nodes(xe); 174 if (err) { 175 drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err)); 176 return err; 177 } 178 179 err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe); 180 if (err) { 181 drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err)); 182 return err; 183 } 184 185 return 0; 186 } 187