1 /* 2 * APEI Generic Hardware Error Source support 3 * 4 * Generic Hardware Error Source provides a way to report platform 5 * hardware errors (such as that from chipset). It works in so called 6 * "Firmware First" mode, that is, hardware errors are reported to 7 * firmware firstly, then reported to Linux by firmware. This way, 8 * some non-standard hardware error registers or non-standard hardware 9 * link can be checked by firmware to produce more hardware error 10 * information for Linux. 11 * 12 * For more information about Generic Hardware Error Source, please 13 * refer to ACPI Specification version 4.0, section 17.3.2.6 14 * 15 * Now, only SCI notification type and memory errors are 16 * supported. More notification type and hardware error type will be 17 * added later. 18 * 19 * Copyright 2010 Intel Corp. 20 * Author: Huang Ying <ying.huang@intel.com> 21 * 22 * This program is free software; you can redistribute it and/or 23 * modify it under the terms of the GNU General Public License version 24 * 2 as published by the Free Software Foundation; 25 * 26 * This program is distributed in the hope that it will be useful, 27 * but WITHOUT ANY WARRANTY; without even the implied warranty of 28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 * GNU General Public License for more details. 30 * 31 * You should have received a copy of the GNU General Public License 32 * along with this program; if not, write to the Free Software 33 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 34 */ 35 36 #include <linux/kernel.h> 37 #include <linux/module.h> 38 #include <linux/init.h> 39 #include <linux/acpi.h> 40 #include <linux/io.h> 41 #include <linux/interrupt.h> 42 #include <linux/cper.h> 43 #include <linux/kdebug.h> 44 #include <acpi/apei.h> 45 #include <acpi/atomicio.h> 46 #include <acpi/hed.h> 47 #include <asm/mce.h> 48 49 #include "apei-internal.h" 50 51 #define GHES_PFX "GHES: " 52 53 #define GHES_ESTATUS_MAX_SIZE 65536 54 55 /* 56 * One struct ghes is created for each generic hardware error 57 * source. 58 * 59 * It provides the context for APEI hardware error timer/IRQ/SCI/NMI 60 * handler. Handler for one generic hardware error source is only 61 * triggered after the previous one is done. So handler can uses 62 * struct ghes without locking. 63 * 64 * estatus: memory buffer for error status block, allocated during 65 * HEST parsing. 66 */ 67 #define GHES_TO_CLEAR 0x0001 68 69 struct ghes { 70 struct acpi_hest_generic *generic; 71 struct acpi_hest_generic_status *estatus; 72 struct list_head list; 73 u64 buffer_paddr; 74 unsigned long flags; 75 }; 76 77 /* 78 * Error source lists, one list for each notification method. The 79 * members in lists are struct ghes. 80 * 81 * The list members are only added in HEST parsing and deleted during 82 * module_exit, that is, single-threaded. So no lock is needed for 83 * that. 84 * 85 * But the mutual exclusion is needed between members adding/deleting 86 * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is 87 * used for that. 88 */ 89 static LIST_HEAD(ghes_sci); 90 91 static struct ghes *ghes_new(struct acpi_hest_generic *generic) 92 { 93 struct ghes *ghes; 94 unsigned int error_block_length; 95 int rc; 96 97 ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); 98 if (!ghes) 99 return ERR_PTR(-ENOMEM); 100 ghes->generic = generic; 101 INIT_LIST_HEAD(&ghes->list); 102 rc = acpi_pre_map_gar(&generic->error_status_address); 103 if (rc) 104 goto err_free; 105 error_block_length = generic->error_block_length; 106 if (error_block_length > GHES_ESTATUS_MAX_SIZE) { 107 pr_warning(FW_WARN GHES_PFX 108 "Error status block length is too long: %u for " 109 "generic hardware error source: %d.\n", 110 error_block_length, generic->header.source_id); 111 error_block_length = GHES_ESTATUS_MAX_SIZE; 112 } 113 ghes->estatus = kmalloc(error_block_length, GFP_KERNEL); 114 if (!ghes->estatus) { 115 rc = -ENOMEM; 116 goto err_unmap; 117 } 118 119 return ghes; 120 121 err_unmap: 122 acpi_post_unmap_gar(&generic->error_status_address); 123 err_free: 124 kfree(ghes); 125 return ERR_PTR(rc); 126 } 127 128 static void ghes_fini(struct ghes *ghes) 129 { 130 kfree(ghes->estatus); 131 acpi_post_unmap_gar(&ghes->generic->error_status_address); 132 } 133 134 enum { 135 GHES_SER_NO = 0x0, 136 GHES_SER_CORRECTED = 0x1, 137 GHES_SER_RECOVERABLE = 0x2, 138 GHES_SER_PANIC = 0x3, 139 }; 140 141 static inline int ghes_severity(int severity) 142 { 143 switch (severity) { 144 case CPER_SER_INFORMATIONAL: 145 return GHES_SER_NO; 146 case CPER_SER_CORRECTED: 147 return GHES_SER_CORRECTED; 148 case CPER_SER_RECOVERABLE: 149 return GHES_SER_RECOVERABLE; 150 case CPER_SER_FATAL: 151 return GHES_SER_PANIC; 152 default: 153 /* Unkown, go panic */ 154 return GHES_SER_PANIC; 155 } 156 } 157 158 /* SCI handler run in work queue, so ioremap can be used here */ 159 static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, 160 int from_phys) 161 { 162 void *vaddr; 163 164 vaddr = ioremap_cache(paddr, len); 165 if (!vaddr) 166 return -ENOMEM; 167 if (from_phys) 168 memcpy(buffer, vaddr, len); 169 else 170 memcpy(vaddr, buffer, len); 171 iounmap(vaddr); 172 173 return 0; 174 } 175 176 static int ghes_read_estatus(struct ghes *ghes, int silent) 177 { 178 struct acpi_hest_generic *g = ghes->generic; 179 u64 buf_paddr; 180 u32 len; 181 int rc; 182 183 rc = acpi_atomic_read(&buf_paddr, &g->error_status_address); 184 if (rc) { 185 if (!silent && printk_ratelimit()) 186 pr_warning(FW_WARN GHES_PFX 187 "Failed to read error status block address for hardware error source: %d.\n", 188 g->header.source_id); 189 return -EIO; 190 } 191 if (!buf_paddr) 192 return -ENOENT; 193 194 rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, 195 sizeof(*ghes->estatus), 1); 196 if (rc) 197 return rc; 198 if (!ghes->estatus->block_status) 199 return -ENOENT; 200 201 ghes->buffer_paddr = buf_paddr; 202 ghes->flags |= GHES_TO_CLEAR; 203 204 rc = -EIO; 205 len = apei_estatus_len(ghes->estatus); 206 if (len < sizeof(*ghes->estatus)) 207 goto err_read_block; 208 if (len > ghes->generic->error_block_length) 209 goto err_read_block; 210 if (apei_estatus_check_header(ghes->estatus)) 211 goto err_read_block; 212 rc = ghes_copy_tofrom_phys(ghes->estatus + 1, 213 buf_paddr + sizeof(*ghes->estatus), 214 len - sizeof(*ghes->estatus), 1); 215 if (rc) 216 return rc; 217 if (apei_estatus_check(ghes->estatus)) 218 goto err_read_block; 219 rc = 0; 220 221 err_read_block: 222 if (rc && !silent) 223 pr_warning(FW_WARN GHES_PFX 224 "Failed to read error status block!\n"); 225 return rc; 226 } 227 228 static void ghes_clear_estatus(struct ghes *ghes) 229 { 230 ghes->estatus->block_status = 0; 231 if (!(ghes->flags & GHES_TO_CLEAR)) 232 return; 233 ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr, 234 sizeof(ghes->estatus->block_status), 0); 235 ghes->flags &= ~GHES_TO_CLEAR; 236 } 237 238 static void ghes_do_proc(struct ghes *ghes) 239 { 240 int ser, processed = 0; 241 struct acpi_hest_generic_data *gdata; 242 243 ser = ghes_severity(ghes->estatus->error_severity); 244 apei_estatus_for_each_section(ghes->estatus, gdata) { 245 #ifdef CONFIG_X86_MCE 246 if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, 247 CPER_SEC_PLATFORM_MEM)) { 248 apei_mce_report_mem_error( 249 ser == GHES_SER_CORRECTED, 250 (struct cper_sec_mem_err *)(gdata+1)); 251 processed = 1; 252 } 253 #endif 254 } 255 256 if (!processed && printk_ratelimit()) 257 pr_warning(GHES_PFX 258 "Unknown error record from generic hardware error source: %d\n", 259 ghes->generic->header.source_id); 260 } 261 262 static int ghes_proc(struct ghes *ghes) 263 { 264 int rc; 265 266 rc = ghes_read_estatus(ghes, 0); 267 if (rc) 268 goto out; 269 ghes_do_proc(ghes); 270 271 out: 272 ghes_clear_estatus(ghes); 273 return 0; 274 } 275 276 static int ghes_notify_sci(struct notifier_block *this, 277 unsigned long event, void *data) 278 { 279 struct ghes *ghes; 280 int ret = NOTIFY_DONE; 281 282 rcu_read_lock(); 283 list_for_each_entry_rcu(ghes, &ghes_sci, list) { 284 if (!ghes_proc(ghes)) 285 ret = NOTIFY_OK; 286 } 287 rcu_read_unlock(); 288 289 return ret; 290 } 291 292 static struct notifier_block ghes_notifier_sci = { 293 .notifier_call = ghes_notify_sci, 294 }; 295 296 static int hest_ghes_parse(struct acpi_hest_header *hest_hdr, void *data) 297 { 298 struct acpi_hest_generic *generic; 299 struct ghes *ghes = NULL; 300 int rc = 0; 301 302 if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR) 303 return 0; 304 305 generic = (struct acpi_hest_generic *)hest_hdr; 306 if (!generic->enabled) 307 return 0; 308 309 if (generic->error_block_length < 310 sizeof(struct acpi_hest_generic_status)) { 311 pr_warning(FW_BUG GHES_PFX 312 "Invalid error block length: %u for generic hardware error source: %d\n", 313 generic->error_block_length, 314 generic->header.source_id); 315 goto err; 316 } 317 if (generic->records_to_preallocate == 0) { 318 pr_warning(FW_BUG GHES_PFX 319 "Invalid records to preallocate: %u for generic hardware error source: %d\n", 320 generic->records_to_preallocate, 321 generic->header.source_id); 322 goto err; 323 } 324 ghes = ghes_new(generic); 325 if (IS_ERR(ghes)) { 326 rc = PTR_ERR(ghes); 327 ghes = NULL; 328 goto err; 329 } 330 switch (generic->notify.type) { 331 case ACPI_HEST_NOTIFY_POLLED: 332 pr_warning(GHES_PFX 333 "Generic hardware error source: %d notified via POLL is not supported!\n", 334 generic->header.source_id); 335 break; 336 case ACPI_HEST_NOTIFY_EXTERNAL: 337 case ACPI_HEST_NOTIFY_LOCAL: 338 pr_warning(GHES_PFX 339 "Generic hardware error source: %d notified via IRQ is not supported!\n", 340 generic->header.source_id); 341 break; 342 case ACPI_HEST_NOTIFY_SCI: 343 if (list_empty(&ghes_sci)) 344 register_acpi_hed_notifier(&ghes_notifier_sci); 345 list_add_rcu(&ghes->list, &ghes_sci); 346 break; 347 case ACPI_HEST_NOTIFY_NMI: 348 pr_warning(GHES_PFX 349 "Generic hardware error source: %d notified via NMI is not supported!\n", 350 generic->header.source_id); 351 break; 352 default: 353 pr_warning(FW_WARN GHES_PFX 354 "Unknown notification type: %u for generic hardware error source: %d\n", 355 generic->notify.type, generic->header.source_id); 356 break; 357 } 358 359 return 0; 360 err: 361 if (ghes) 362 ghes_fini(ghes); 363 return rc; 364 } 365 366 static void ghes_cleanup(void) 367 { 368 struct ghes *ghes, *nghes; 369 370 if (!list_empty(&ghes_sci)) 371 unregister_acpi_hed_notifier(&ghes_notifier_sci); 372 373 synchronize_rcu(); 374 375 list_for_each_entry_safe(ghes, nghes, &ghes_sci, list) { 376 list_del(&ghes->list); 377 ghes_fini(ghes); 378 kfree(ghes); 379 } 380 } 381 382 static int __init ghes_init(void) 383 { 384 int rc; 385 386 if (acpi_disabled) 387 return -ENODEV; 388 389 if (hest_disable) { 390 pr_info(GHES_PFX "HEST is not enabled!\n"); 391 return -EINVAL; 392 } 393 394 rc = apei_hest_parse(hest_ghes_parse, NULL); 395 if (rc) { 396 pr_err(GHES_PFX 397 "Error during parsing HEST generic hardware error sources.\n"); 398 goto err_cleanup; 399 } 400 401 if (list_empty(&ghes_sci)) { 402 pr_info(GHES_PFX 403 "No functional generic hardware error sources.\n"); 404 rc = -ENODEV; 405 goto err_cleanup; 406 } 407 408 pr_info(GHES_PFX 409 "Generic Hardware Error Source support is initialized.\n"); 410 411 return 0; 412 err_cleanup: 413 ghes_cleanup(); 414 return rc; 415 } 416 417 static void __exit ghes_exit(void) 418 { 419 ghes_cleanup(); 420 } 421 422 module_init(ghes_init); 423 module_exit(ghes_exit); 424 425 MODULE_AUTHOR("Huang Ying"); 426 MODULE_DESCRIPTION("APEI Generic Hardware Error Source support"); 427 MODULE_LICENSE("GPL"); 428