1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright(c) 2025 AMD Corporation. All rights reserved. */ 3 4 #include <linux/pci.h> 5 #include <linux/aer.h> 6 #include <cxl/event.h> 7 #include <cxlmem.h> 8 #include <cxlpci.h> 9 #include "trace.h" 10 11 static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, 12 struct cxl_ras_capability_regs ras_cap) 13 { 14 u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; 15 16 trace_cxl_port_aer_correctable_error(&pdev->dev, status); 17 } 18 19 static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev, 20 struct cxl_ras_capability_regs ras_cap) 21 { 22 u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; 23 u32 fe; 24 25 if (hweight32(status) > 1) 26 fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, 27 ras_cap.cap_control)); 28 else 29 fe = status; 30 31 trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe, 32 ras_cap.header_log); 33 } 34 35 static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd, 36 struct cxl_ras_capability_regs ras_cap) 37 { 38 u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; 39 40 trace_cxl_aer_correctable_error(cxlmd, status); 41 } 42 43 static void 44 cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd, 45 struct cxl_ras_capability_regs ras_cap) 46 { 47 u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; 48 u32 fe; 49 50 if (hweight32(status) > 1) 51 fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, 52 ras_cap.cap_control)); 53 else 54 fe = status; 55 56 trace_cxl_aer_uncorrectable_error(cxlmd, status, fe, 57 ras_cap.header_log); 58 } 59 60 static int match_memdev_by_parent(struct device *dev, const void *uport) 61 { 62 if (is_cxl_memdev(dev) && dev->parent == uport) 63 return 1; 64 return 0; 65 } 66 67 static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) 68 { 69 unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device, 70 data->prot_err.agent_addr.function); 71 struct pci_dev *pdev __free(pci_dev_put) = 72 pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment, 73 data->prot_err.agent_addr.bus, 74 devfn); 75 struct cxl_memdev *cxlmd; 76 int port_type; 77 78 if (!pdev) 79 return; 80 81 port_type = pci_pcie_type(pdev); 82 if (port_type == PCI_EXP_TYPE_ROOT_PORT || 83 port_type == PCI_EXP_TYPE_DOWNSTREAM || 84 port_type == PCI_EXP_TYPE_UPSTREAM) { 85 if (data->severity == AER_CORRECTABLE) 86 cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap); 87 else 88 cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap); 89 90 return; 91 } 92 93 guard(device)(&pdev->dev); 94 if (!pdev->dev.driver) 95 return; 96 97 struct device *mem_dev __free(put_device) = bus_find_device( 98 &cxl_bus_type, NULL, pdev, match_memdev_by_parent); 99 if (!mem_dev) 100 return; 101 102 cxlmd = to_cxl_memdev(mem_dev); 103 if (data->severity == AER_CORRECTABLE) 104 cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap); 105 else 106 cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap); 107 } 108 109 static void cxl_cper_prot_err_work_fn(struct work_struct *work) 110 { 111 struct cxl_cper_prot_err_work_data wd; 112 113 while (cxl_cper_prot_err_kfifo_get(&wd)) 114 cxl_cper_handle_prot_err(&wd); 115 } 116 static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn); 117 118 int cxl_ras_init(void) 119 { 120 return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work); 121 } 122 123 void cxl_ras_exit(void) 124 { 125 cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); 126 cancel_work_sync(&cxl_cper_prot_err_work); 127 } 128 129 static void cxl_dport_map_ras(struct cxl_dport *dport) 130 { 131 struct cxl_register_map *map = &dport->reg_map; 132 struct device *dev = dport->dport_dev; 133 134 if (!map->component_map.ras.valid) 135 dev_dbg(dev, "RAS registers not found\n"); 136 else if (cxl_map_component_regs(map, &dport->regs.component, 137 BIT(CXL_CM_CAP_CAP_ID_RAS))) 138 dev_dbg(dev, "Failed to map RAS capability.\n"); 139 } 140 141 /** 142 * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport 143 * @dport: the cxl_dport that needs to be initialized 144 * @host: host device for devm operations 145 */ 146 void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) 147 { 148 dport->reg_map.host = host; 149 cxl_dport_map_ras(dport); 150 151 if (dport->rch) { 152 struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); 153 154 if (!host_bridge->native_aer) 155 return; 156 157 cxl_dport_map_rch_aer(dport); 158 cxl_disable_rch_root_ints(dport); 159 } 160 } 161 EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); 162 163 void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) 164 { 165 void __iomem *addr; 166 u32 status; 167 168 if (!ras_base) 169 return; 170 171 addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; 172 status = readl(addr); 173 if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { 174 writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); 175 trace_cxl_aer_correctable_error(cxlds->cxlmd, status); 176 } 177 } 178 179 /* CXL spec rev3.0 8.2.4.16.1 */ 180 static void header_log_copy(void __iomem *ras_base, u32 *log) 181 { 182 void __iomem *addr; 183 u32 *log_addr; 184 int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); 185 186 addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; 187 log_addr = log; 188 189 for (i = 0; i < log_u32_size; i++) { 190 *log_addr = readl(addr); 191 log_addr++; 192 addr += sizeof(u32); 193 } 194 } 195 196 /* 197 * Log the state of the RAS status registers and prepare them to log the 198 * next error status. Return 1 if reset needed. 199 */ 200 bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) 201 { 202 u32 hl[CXL_HEADERLOG_SIZE_U32]; 203 void __iomem *addr; 204 u32 status; 205 u32 fe; 206 207 if (!ras_base) 208 return false; 209 210 addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; 211 status = readl(addr); 212 if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) 213 return false; 214 215 /* If multiple errors, log header points to first error from ctrl reg */ 216 if (hweight32(status) > 1) { 217 void __iomem *rcc_addr = 218 ras_base + CXL_RAS_CAP_CONTROL_OFFSET; 219 220 fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, 221 readl(rcc_addr))); 222 } else { 223 fe = status; 224 } 225 226 header_log_copy(ras_base, hl); 227 trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); 228 writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); 229 230 return true; 231 } 232 233 void cxl_cor_error_detected(struct pci_dev *pdev) 234 { 235 struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); 236 struct device *dev = &cxlds->cxlmd->dev; 237 238 scoped_guard(device, dev) { 239 if (!dev->driver) { 240 dev_warn(&pdev->dev, 241 "%s: memdev disabled, abort error handling\n", 242 dev_name(dev)); 243 return; 244 } 245 246 if (cxlds->rcd) 247 cxl_handle_rdport_errors(cxlds); 248 249 cxl_handle_cor_ras(cxlds, cxlds->regs.ras); 250 } 251 } 252 EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); 253 254 pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, 255 pci_channel_state_t state) 256 { 257 struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); 258 struct cxl_memdev *cxlmd = cxlds->cxlmd; 259 struct device *dev = &cxlmd->dev; 260 bool ue; 261 262 scoped_guard(device, dev) { 263 if (!dev->driver) { 264 dev_warn(&pdev->dev, 265 "%s: memdev disabled, abort error handling\n", 266 dev_name(dev)); 267 return PCI_ERS_RESULT_DISCONNECT; 268 } 269 270 if (cxlds->rcd) 271 cxl_handle_rdport_errors(cxlds); 272 /* 273 * A frozen channel indicates an impending reset which is fatal to 274 * CXL.mem operation, and will likely crash the system. On the off 275 * chance the situation is recoverable dump the status of the RAS 276 * capability registers and bounce the active state of the memdev. 277 */ 278 ue = cxl_handle_ras(cxlds, cxlds->regs.ras); 279 } 280 281 282 switch (state) { 283 case pci_channel_io_normal: 284 if (ue) { 285 device_release_driver(dev); 286 return PCI_ERS_RESULT_NEED_RESET; 287 } 288 return PCI_ERS_RESULT_CAN_RECOVER; 289 case pci_channel_io_frozen: 290 dev_warn(&pdev->dev, 291 "%s: frozen state error detected, disable CXL.mem\n", 292 dev_name(dev)); 293 device_release_driver(dev); 294 return PCI_ERS_RESULT_NEED_RESET; 295 case pci_channel_io_perm_failure: 296 dev_warn(&pdev->dev, 297 "failure state error detected, request disconnect\n"); 298 return PCI_ERS_RESULT_DISCONNECT; 299 } 300 return PCI_ERS_RESULT_NEED_RESET; 301 } 302 EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); 303