1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2026 Intel Corporation 4 */ 5 6 #include <linux/bitmap.h> 7 8 #include <drm/drm_managed.h> 9 #include <drm/drm_print.h> 10 #include <drm/drm_ras.h> 11 12 #include "xe_device_types.h" 13 #include "xe_drm_ras.h" 14 15 static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES; 16 static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; 17 18 static int hw_query_error_counter(struct xe_drm_ras_counter *info, 19 u32 error_id, const char **name, u32 *val) 20 { 21 if (!info || !info[error_id].name) 22 return -ENOENT; 23 24 *name = info[error_id].name; 25 *val = atomic_read(&info[error_id].counter); 26 27 return 0; 28 } 29 30 static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id, 31 const char **name, u32 *val) 32 { 33 struct xe_device *xe = ep->priv; 34 struct xe_drm_ras *ras = &xe->ras; 35 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]; 36 37 return hw_query_error_counter(info, error_id, name, val); 38 } 39 40 static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id, 41 const char **name, u32 *val) 42 { 43 struct xe_device *xe = ep->priv; 44 struct xe_drm_ras *ras = &xe->ras; 45 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; 46 47 return hw_query_error_counter(info, error_id, name, val); 48 } 49 50 static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe) 51 { 52 struct xe_drm_ras_counter *counter; 53 int i; 54 55 counter = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL); 56 if (!counter) 57 return ERR_PTR(-ENOMEM); 58 59 for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) { 60 if (!error_components[i]) 61 continue; 62 63 counter[i].name = error_components[i]; 64 atomic_set(&counter[i].counter, 0); 65 } 66 67 return counter; 68 } 69 70 static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node, 71 const enum drm_xe_ras_error_severity severity) 72 { 73 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 74 struct xe_drm_ras *ras = &xe->ras; 75 const char *device_name; 76 77 device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d", 78 pci_domain_nr(pdev->bus), pdev->bus->number, 79 PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); 80 81 if (!device_name) 82 return -ENOMEM; 83 84 node->device_name = device_name; 85 node->node_name = error_severity[severity]; 86 node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER; 87 node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; 88 node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1; 89 node->priv = xe; 90 91 ras->info[severity] = allocate_and_copy_counters(xe); 92 if (IS_ERR(ras->info[severity])) 93 return PTR_ERR(ras->info[severity]); 94 95 if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) 96 node->query_error_counter = query_correctable_error_counter; 97 else 98 node->query_error_counter = query_uncorrectable_error_counter; 99 100 return 0; 101 } 102 103 static void cleanup_node_param(struct drm_ras_node *node) 104 { 105 kfree(node->device_name); 106 node->device_name = NULL; 107 } 108 109 static void cleanup_node(struct drm_device *drm, void *node) 110 { 111 drm_ras_node_unregister(node); 112 cleanup_node_param(node); 113 } 114 115 static int register_nodes(struct xe_device *xe) 116 { 117 struct xe_drm_ras *ras = &xe->ras; 118 struct drm_ras_node *node; 119 int i, ret; 120 121 for_each_error_severity(i) { 122 node = &ras->node[i]; 123 124 ret = assign_node_params(xe, node, i); 125 if (ret) 126 goto free_param; 127 128 ret = drm_ras_node_register(node); 129 if (ret) 130 goto free_param; 131 132 ret = drmm_add_action_or_reset(&xe->drm, cleanup_node, node); 133 if (ret) 134 goto null_info; 135 } 136 137 return 0; 138 139 free_param: 140 cleanup_node_param(node); 141 null_info: 142 ras->info[i] = NULL; 143 return ret; 144 } 145 146 /** 147 * xe_drm_ras_init() - Initialize DRM RAS 148 * @xe: xe device instance 149 * 150 * Allocate and register DRM RAS nodes per device 151 * 152 * Return: 0 on success, negative error code otherwise. 153 */ 154 int xe_drm_ras_init(struct xe_device *xe) 155 { 156 struct xe_drm_ras *ras = &xe->ras; 157 struct drm_ras_node *node; 158 int err; 159 160 node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL); 161 if (!node) 162 return -ENOMEM; 163 164 ras->node = node; 165 166 err = register_nodes(xe); 167 if (err) { 168 drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err)); 169 return err; 170 } 171 172 return 0; 173 } 174