1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2026 Intel Corporation 4 */ 5 6 #include <linux/bitmap.h> 7 8 #include <drm/drm_managed.h> 9 #include <drm/drm_print.h> 10 #include <drm/drm_ras.h> 11 12 #include "xe_device_types.h" 13 #include "xe_drm_ras.h" 14 15 static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES; 16 static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; 17 18 static int hw_query_error_counter(struct xe_drm_ras_counter *info, 19 u32 error_id, const char **name, u32 *val) 20 { 21 if (!info || !info[error_id].name) 22 return -ENOENT; 23 24 *name = info[error_id].name; 25 *val = atomic_read(&info[error_id].counter); 26 27 return 0; 28 } 29 30 static int hw_clear_error_counter(struct xe_drm_ras_counter *info, u32 error_id) 31 { 32 if (!info || !info[error_id].name) 33 return -ENOENT; 34 35 atomic_set(&info[error_id].counter, 0); 36 37 return 0; 38 } 39 40 static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id, 41 const char **name, u32 *val) 42 { 43 struct xe_device *xe = ep->priv; 44 struct xe_drm_ras *ras = &xe->ras; 45 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]; 46 47 return hw_query_error_counter(info, error_id, name, val); 48 } 49 50 static int clear_uncorrectable_error_counter(struct drm_ras_node *node, u32 error_id) 51 { 52 struct xe_device *xe = node->priv; 53 struct xe_drm_ras *ras = &xe->ras; 54 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]; 55 56 return hw_clear_error_counter(info, error_id); 57 } 58 59 static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id, 60 const char **name, u32 *val) 61 { 62 struct xe_device *xe = ep->priv; 63 struct xe_drm_ras *ras = &xe->ras; 64 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; 65 66 return hw_query_error_counter(info, error_id, name, val); 67 } 68 69 static int clear_correctable_error_counter(struct drm_ras_node *node, u32 error_id) 70 { 71 struct xe_device *xe = node->priv; 72 struct xe_drm_ras *ras = &xe->ras; 73 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; 74 75 return hw_clear_error_counter(info, error_id); 76 } 77 78 static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe) 79 { 80 struct xe_drm_ras_counter *counter; 81 int i; 82 83 counter = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL); 84 if (!counter) 85 return ERR_PTR(-ENOMEM); 86 87 for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) { 88 if (!error_components[i]) 89 continue; 90 91 counter[i].name = error_components[i]; 92 atomic_set(&counter[i].counter, 0); 93 } 94 95 return counter; 96 } 97 98 static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node, 99 const enum drm_xe_ras_error_severity severity) 100 { 101 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 102 struct xe_drm_ras *ras = &xe->ras; 103 const char *device_name; 104 105 device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d", 106 pci_domain_nr(pdev->bus), pdev->bus->number, 107 PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); 108 109 if (!device_name) 110 return -ENOMEM; 111 112 node->device_name = device_name; 113 node->node_name = error_severity[severity]; 114 node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER; 115 node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; 116 node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1; 117 node->priv = xe; 118 119 ras->info[severity] = allocate_and_copy_counters(xe); 120 if (IS_ERR(ras->info[severity])) 121 return PTR_ERR(ras->info[severity]); 122 123 if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) { 124 node->query_error_counter = query_correctable_error_counter; 125 node->clear_error_counter = clear_correctable_error_counter; 126 } else { 127 node->query_error_counter = query_uncorrectable_error_counter; 128 node->clear_error_counter = clear_uncorrectable_error_counter; 129 } 130 131 return 0; 132 } 133 134 static void cleanup_node_param(struct drm_ras_node *node) 135 { 136 kfree(node->device_name); 137 node->device_name = NULL; 138 } 139 140 static void cleanup_node(struct drm_device *drm, void *node) 141 { 142 drm_ras_node_unregister(node); 143 cleanup_node_param(node); 144 } 145 146 static int register_nodes(struct xe_device *xe) 147 { 148 struct xe_drm_ras *ras = &xe->ras; 149 struct drm_ras_node *node; 150 int i, ret; 151 152 for_each_error_severity(i) { 153 node = &ras->node[i]; 154 155 ret = assign_node_params(xe, node, i); 156 if (ret) 157 goto free_param; 158 159 ret = drm_ras_node_register(node); 160 if (ret) 161 goto free_param; 162 163 ret = drmm_add_action_or_reset(&xe->drm, cleanup_node, node); 164 if (ret) 165 goto null_info; 166 } 167 168 return 0; 169 170 free_param: 171 cleanup_node_param(node); 172 null_info: 173 ras->info[i] = NULL; 174 return ret; 175 } 176 177 /** 178 * xe_drm_ras_init() - Initialize DRM RAS 179 * @xe: xe device instance 180 * 181 * Allocate and register DRM RAS nodes per device 182 * 183 * Return: 0 on success, negative error code otherwise. 184 */ 185 int xe_drm_ras_init(struct xe_device *xe) 186 { 187 struct xe_drm_ras *ras = &xe->ras; 188 struct drm_ras_node *node; 189 int err; 190 191 node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL); 192 if (!node) 193 return -ENOMEM; 194 195 ras->node = node; 196 197 err = register_nodes(xe); 198 if (err) { 199 drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err)); 200 return err; 201 } 202 203 return 0; 204 } 205