// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2025 AMD Corporation. All rights reserved. */

/*
 * CXL RAS support: trace firmware-reported (CPER) CXL protocol errors,
 * map the CXL RAS capability registers for ports/dports, and service
 * native PCIe AER notifications for CXL memory devices.
 */

#include <linux/pci.h>
#include <linux/aer.h>
#include <cxl/event.h>
#include <cxlmem.h>
#include <cxlpci.h>
#include "trace.h"

/* Trace a firmware-reported correctable protocol error on a CXL port. */
static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
					      struct cxl_ras_capability_regs ras_cap)
{
	/* Only report error bits that are not masked off */
	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;

	trace_cxl_port_aer_correctable_error(&pdev->dev, status);
}

/* Trace a firmware-reported uncorrectable protocol error on a CXL port. */
static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
						struct cxl_ras_capability_regs ras_cap)
{
	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
	u32 fe;

	/* If multiple errors, the First Error field in the capability
	 * control register identifies which error the header log captured.
	 */
	if (hweight32(status) > 1)
		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
				   ras_cap.cap_control));
	else
		fe = status;

	trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
					       ras_cap.header_log);
}

/* Trace a firmware-reported correctable protocol error on a memdev. */
static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
					 struct cxl_ras_capability_regs ras_cap)
{
	/* Only report error bits that are not masked off */
	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;

	trace_cxl_aer_correctable_error(cxlmd, status);
}

/* Trace a firmware-reported uncorrectable protocol error on a memdev. */
static void
cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
			       struct cxl_ras_capability_regs ras_cap)
{
	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
	u32 fe;

	/* If multiple errors, the First Error field in the capability
	 * control register identifies which error the header log captured.
	 */
	if (hweight32(status) > 1)
		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
				   ras_cap.cap_control));
	else
		fe = status;

	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
					  ras_cap.header_log);
}

/*
 * bus_find_device() match callback: select the cxl_memdev whose parent is
 * the given uport device.
 */
static int match_memdev_by_parent(struct device *dev, const void *uport)
{
	if (is_cxl_memdev(dev) && dev->parent == uport)
		return 1;
	return 0;
}

/*
 * Dispatch one firmware-reported CXL protocol error record: resolve the
 * PCI device from the record's agent address, then trace it either as a
 * port error (root/upstream/downstream port) or as a memdev error.
 */
void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
{
	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
				       data->prot_err.agent_addr.function);
	/* __free(): reference dropped automatically when pdev goes out of scope */
	struct pci_dev *pdev __free(pci_dev_put) =
		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
					    data->prot_err.agent_addr.bus,
					    devfn);
	struct cxl_memdev *cxlmd;
	int port_type;

	if (!pdev)
		return;

	port_type = pci_pcie_type(pdev);
	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
	    port_type == PCI_EXP_TYPE_UPSTREAM) {
		if (data->severity == AER_CORRECTABLE)
			cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
		else
			cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);

		return;
	}

	/*
	 * Hold the device lock so a bound driver cannot detach while the
	 * associated memdev is looked up and traced against.
	 */
	guard(device)(&pdev->dev);
	if (!pdev->dev.driver)
		return;

	struct device *mem_dev __free(put_device) = bus_find_device(
		&cxl_bus_type, NULL, pdev, match_memdev_by_parent);
	if (!mem_dev)
		return;

	cxlmd = to_cxl_memdev(mem_dev);
	if (data->severity == AER_CORRECTABLE)
		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
	else
		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
}
EXPORT_SYMBOL_GPL(cxl_cper_handle_prot_err);

/* Drain all queued protocol-error records from the CPER kfifo. */
static void cxl_cper_prot_err_work_fn(struct work_struct *work)
{
	struct cxl_cper_prot_err_work_data wd;

	while (cxl_cper_prot_err_kfifo_get(&wd))
		cxl_cper_handle_prot_err(&wd);
}
static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);

/* Register the protocol-error worker with the CPER notification path. */
int cxl_ras_init(void)
{
	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
}

void cxl_ras_exit(void)
{
	/* Unregister first so no new work is queued, then flush any pending run */
	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
	cancel_work_sync(&cxl_cper_prot_err_work);
}

/* Map the dport's RAS capability registers, if the capability was found. */
static void cxl_dport_map_ras(struct cxl_dport *dport)
{
	struct cxl_register_map *map = &dport->reg_map;
	struct device *dev = dport->dport_dev;

	/* Mapping failures are non-fatal: RAS reporting is best-effort */
	if (!map->component_map.ras.valid)
		dev_dbg(dev, "RAS registers not found\n");
	else if (cxl_map_component_regs(map, &dport->regs.component,
					BIT(CXL_CM_CAP_CAP_ID_RAS)))
		dev_dbg(dev, "Failed to map RAS capability.\n");
}

/**
 * devm_cxl_dport_ras_setup - Setup CXL RAS report on this dport
 * @dport: the cxl_dport that needs to be initialized
 */
void devm_cxl_dport_ras_setup(struct cxl_dport *dport)
{
	dport->reg_map.host = dport_to_host(dport);
	cxl_dport_map_ras(dport);
}

/*
 * RCH (restricted CXL host) variant: additionally maps the RCH AER
 * registers and disables root-reported interrupts when the OS owns AER
 * natively.
 */
void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport)
{
	struct pci_host_bridge *host_bridge;

	if (!dev_is_pci(dport->dport_dev))
		return;

	devm_cxl_dport_ras_setup(dport);

	/* Firmware-first AER: leave the RCH AER registers alone */
	host_bridge = to_pci_host_bridge(dport->dport_dev);
	if (!host_bridge->native_aer)
		return;

	cxl_dport_map_rch_aer(dport);
	cxl_disable_rch_root_ints(dport);
}
EXPORT_SYMBOL_NS_GPL(devm_cxl_dport_rch_ras_setup, "CXL");

/* Map the RAS capability registers for a cxl_port, if present. */
void devm_cxl_port_ras_setup(struct cxl_port *port)
{
	struct cxl_register_map *map = &port->reg_map;

	if (!map->component_map.ras.valid) {
		dev_dbg(&port->dev, "RAS registers not found\n");
		return;
	}

	map->host = &port->dev;
	if (cxl_map_component_regs(map, &port->regs,
				   BIT(CXL_CM_CAP_CAP_ID_RAS)))
		dev_dbg(&port->dev, "Failed to map RAS capability\n");
}
EXPORT_SYMBOL_NS_GPL(devm_cxl_port_ras_setup, "CXL");

/*
 * Read, trace, and clear any pending correctable RAS errors.  @dev must be
 * a cxl_memdev device (passed through to_cxl_memdev() for tracing).
 */
void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base)
{
	void __iomem *addr;
	u32 status;

	if (!ras_base)
		return;

	addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
	status = readl(addr);
	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
		/* Status bits are write-one-to-clear */
		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
		trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status);
	}
}

/* Copy the RAS header log registers into @log. CXL spec rev3.0 8.2.4.16.1 */
static void header_log_copy(void __iomem *ras_base, u32 *log)
{
	void __iomem *addr;
	u32 *log_addr;
	int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);

	addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET;
	log_addr = log;

	/* MMIO source: copy one u32 register at a time, not memcpy */
	for (i = 0; i < log_u32_size; i++) {
		*log_addr = readl(addr);
		log_addr++;
		addr += sizeof(u32);
	}
}

/*
 * Log the state of the RAS status registers and prepare them to log the
 * next error status. Return true if reset needed.
 */
bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
{
	u32 hl[CXL_HEADERLOG_SIZE_U32];
	void __iomem *addr;
	u32 status;
	u32 fe;

	if (!ras_base)
		return false;

	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
	status = readl(addr);
	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
		return false;

	/* If multiple errors, log header points to first error from ctrl reg */
	if (hweight32(status) > 1) {
		void __iomem *rcc_addr =
			ras_base + CXL_RAS_CAP_CONTROL_OFFSET;

		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
				   readl(rcc_addr)));
	} else {
		fe = status;
	}

	/* Capture the header log before clearing status re-arms the registers */
	header_log_copy(ras_base, hl);
	trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);

	return true;
}

/* PCIe AER correctable-error callback for CXL memory device drivers. */
void cxl_cor_error_detected(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &cxlds->cxlmd->dev;

	/* Device lock held: the memdev driver cannot unbind mid-handling */
	scoped_guard(device, dev) {
		if (!dev->driver) {
			dev_warn(&pdev->dev,
				 "%s: memdev disabled, abort error handling\n",
				 dev_name(dev));
			return;
		}

		/* RCH (CXL 1.1) devices also carry root-port error state */
		if (cxlds->rcd)
			cxl_handle_rdport_errors(cxlds);

		cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlmd->endpoint->regs.ras);
	}
}
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");

/*
 * PCIe AER uncorrectable-error callback for CXL memory device drivers.
 * Traces RAS state and maps the PCI channel state to an ERS result,
 * releasing the memdev driver when a reset is unavoidable.
 */
pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
				    pci_channel_state_t state)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &cxlmd->dev;
	bool ue;

	/* Device lock held: the memdev driver cannot unbind mid-handling */
	scoped_guard(device, dev) {
		if (!dev->driver) {
			dev_warn(&pdev->dev,
				 "%s: memdev disabled, abort error handling\n",
				 dev_name(dev));
			return PCI_ERS_RESULT_DISCONNECT;
		}

		if (cxlds->rcd)
			cxl_handle_rdport_errors(cxlds);
		/*
		 * A frozen channel indicates an impending reset which is fatal to
		 * CXL.mem operation, and will likely crash the system. On the off
		 * chance the situation is recoverable dump the status of the RAS
		 * capability registers and bounce the active state of the memdev.
		 */
		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlmd->endpoint->regs.ras);
	}

	switch (state) {
	case pci_channel_io_normal:
		if (ue) {
			device_release_driver(dev);
			return PCI_ERS_RESULT_NEED_RESET;
		}
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		dev_warn(&pdev->dev,
			 "%s: frozen state error detected, disable CXL.mem\n",
			 dev_name(dev));
		device_release_driver(dev);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(&pdev->dev,
			 "failure state error detected, request disconnect\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}
EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");