1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright(c) 2025 AMD Corporation. All rights reserved. */ 3 4 #include <linux/pci.h> 5 #include <linux/aer.h> 6 #include <cxl/event.h> 7 #include <cxlmem.h> 8 #include <cxlpci.h> 9 #include "trace.h" 10 11 static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, 12 struct cxl_ras_capability_regs ras_cap) 13 { 14 u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; 15 16 trace_cxl_port_aer_correctable_error(&pdev->dev, status); 17 } 18 19 static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev, 20 struct cxl_ras_capability_regs ras_cap) 21 { 22 u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; 23 u32 fe; 24 25 if (hweight32(status) > 1) 26 fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, 27 ras_cap.cap_control)); 28 else 29 fe = status; 30 31 trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe, 32 ras_cap.header_log); 33 } 34 35 static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd, 36 struct cxl_ras_capability_regs ras_cap) 37 { 38 u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; 39 40 trace_cxl_aer_correctable_error(cxlmd, status); 41 } 42 43 static void 44 cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd, 45 struct cxl_ras_capability_regs ras_cap) 46 { 47 u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; 48 u32 fe; 49 50 if (hweight32(status) > 1) 51 fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, 52 ras_cap.cap_control)); 53 else 54 fe = status; 55 56 trace_cxl_aer_uncorrectable_error(cxlmd, status, fe, 57 ras_cap.header_log); 58 } 59 60 static int match_memdev_by_parent(struct device *dev, const void *uport) 61 { 62 if (is_cxl_memdev(dev) && dev->parent == uport) 63 return 1; 64 return 0; 65 } 66 67 static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) 68 { 69 unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device, 70 data->prot_err.agent_addr.function); 71 struct pci_dev *pdev __free(pci_dev_put) = 72 pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment, 73 data->prot_err.agent_addr.bus, 74 devfn); 75 struct cxl_memdev *cxlmd; 76 int port_type; 77 78 if (!pdev) 79 return; 80 81 port_type = pci_pcie_type(pdev); 82 if (port_type == PCI_EXP_TYPE_ROOT_PORT || 83 port_type == PCI_EXP_TYPE_DOWNSTREAM || 84 port_type == PCI_EXP_TYPE_UPSTREAM) { 85 if (data->severity == AER_CORRECTABLE) 86 cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap); 87 else 88 cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap); 89 90 return; 91 } 92 93 guard(device)(&pdev->dev); 94 if (!pdev->dev.driver) 95 return; 96 97 struct device *mem_dev __free(put_device) = bus_find_device( 98 &cxl_bus_type, NULL, pdev, match_memdev_by_parent); 99 if (!mem_dev) 100 return; 101 102 cxlmd = to_cxl_memdev(mem_dev); 103 if (data->severity == AER_CORRECTABLE) 104 cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap); 105 else 106 cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap); 107 } 108 109 static void cxl_cper_prot_err_work_fn(struct work_struct *work) 110 { 111 struct cxl_cper_prot_err_work_data wd; 112 113 while (cxl_cper_prot_err_kfifo_get(&wd)) 114 cxl_cper_handle_prot_err(&wd); 115 } 116 static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn); 117 118 int cxl_ras_init(void) 119 { 120 return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work); 121 } 122 123 void cxl_ras_exit(void) 124 { 125 cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); 126 cancel_work_sync(&cxl_cper_prot_err_work); 127 } 128 129 static void cxl_dport_map_ras(struct cxl_dport *dport) 130 { 131 struct cxl_register_map *map = &dport->reg_map; 132 struct device *dev = dport->dport_dev; 133 134 if (!map->component_map.ras.valid) 135 dev_dbg(dev, "RAS registers not found\n"); 136 else if (cxl_map_component_regs(map, &dport->regs.component, 137 BIT(CXL_CM_CAP_CAP_ID_RAS))) 138 dev_dbg(dev, "Failed to map RAS capability.\n"); 139 } 140 141 /** 142 * devm_cxl_dport_ras_setup - Setup CXL RAS report on this dport 143 * @dport: the cxl_dport that needs to be initialized 144 */ 145 void devm_cxl_dport_ras_setup(struct cxl_dport *dport) 146 { 147 dport->reg_map.host = dport_to_host(dport); 148 cxl_dport_map_ras(dport); 149 } 150 151 void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport) 152 { 153 struct pci_host_bridge *host_bridge; 154 155 if (!dev_is_pci(dport->dport_dev)) 156 return; 157 158 devm_cxl_dport_ras_setup(dport); 159 160 host_bridge = to_pci_host_bridge(dport->dport_dev); 161 if (!host_bridge->native_aer) 162 return; 163 164 cxl_dport_map_rch_aer(dport); 165 cxl_disable_rch_root_ints(dport); 166 } 167 EXPORT_SYMBOL_NS_GPL(devm_cxl_dport_rch_ras_setup, "CXL"); 168 169 void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base) 170 { 171 void __iomem *addr; 172 u32 status; 173 174 if (!ras_base) 175 return; 176 177 addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; 178 status = readl(addr); 179 if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { 180 writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); 181 trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); 182 } 183 } 184 185 /* CXL spec rev3.0 8.2.4.16.1 */ 186 static void header_log_copy(void __iomem *ras_base, u32 *log) 187 { 188 void __iomem *addr; 189 u32 *log_addr; 190 int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); 191 192 addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; 193 log_addr = log; 194 195 for (i = 0; i < log_u32_size; i++) { 196 *log_addr = readl(addr); 197 log_addr++; 198 addr += sizeof(u32); 199 } 200 } 201 202 /* 203 * Log the state of the RAS status registers and prepare them to log the 204 * next error status. Return 1 if reset needed. 205 */ 206 bool cxl_handle_ras(struct device *dev, void __iomem *ras_base) 207 { 208 u32 hl[CXL_HEADERLOG_SIZE_U32]; 209 void __iomem *addr; 210 u32 status; 211 u32 fe; 212 213 if (!ras_base) 214 return false; 215 216 addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; 217 status = readl(addr); 218 if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) 219 return false; 220 221 /* If multiple errors, log header points to first error from ctrl reg */ 222 if (hweight32(status) > 1) { 223 void __iomem *rcc_addr = 224 ras_base + CXL_RAS_CAP_CONTROL_OFFSET; 225 226 fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, 227 readl(rcc_addr))); 228 } else { 229 fe = status; 230 } 231 232 header_log_copy(ras_base, hl); 233 trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); 234 writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); 235 236 return true; 237 } 238 239 void cxl_cor_error_detected(struct pci_dev *pdev) 240 { 241 struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); 242 struct device *dev = &cxlds->cxlmd->dev; 243 244 scoped_guard(device, dev) { 245 if (!dev->driver) { 246 dev_warn(&pdev->dev, 247 "%s: memdev disabled, abort error handling\n", 248 dev_name(dev)); 249 return; 250 } 251 252 if (cxlds->rcd) 253 cxl_handle_rdport_errors(cxlds); 254 255 cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); 256 } 257 } 258 EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); 259 260 pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, 261 pci_channel_state_t state) 262 { 263 struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); 264 struct cxl_memdev *cxlmd = cxlds->cxlmd; 265 struct device *dev = &cxlmd->dev; 266 bool ue; 267 268 scoped_guard(device, dev) { 269 if (!dev->driver) { 270 dev_warn(&pdev->dev, 271 "%s: memdev disabled, abort error handling\n", 272 dev_name(dev)); 273 return PCI_ERS_RESULT_DISCONNECT; 274 } 275 276 if (cxlds->rcd) 277 cxl_handle_rdport_errors(cxlds); 278 /* 279 * A frozen channel indicates an impending reset which is fatal to 280 * CXL.mem operation, and will likely crash the system. On the off 281 * chance the situation is recoverable dump the status of the RAS 282 * capability registers and bounce the active state of the memdev. 283 */ 284 ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); 285 } 286 287 288 switch (state) { 289 case pci_channel_io_normal: 290 if (ue) { 291 device_release_driver(dev); 292 return PCI_ERS_RESULT_NEED_RESET; 293 } 294 return PCI_ERS_RESULT_CAN_RECOVER; 295 case pci_channel_io_frozen: 296 dev_warn(&pdev->dev, 297 "%s: frozen state error detected, disable CXL.mem\n", 298 dev_name(dev)); 299 device_release_driver(dev); 300 return PCI_ERS_RESULT_NEED_RESET; 301 case pci_channel_io_perm_failure: 302 dev_warn(&pdev->dev, 303 "failure state error detected, request disconnect\n"); 304 return PCI_ERS_RESULT_DISCONNECT; 305 } 306 return PCI_ERS_RESULT_NEED_RESET; 307 } 308 EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); 309