xref: /linux/drivers/acpi/apei/ghes.c (revision cc4589ebfae6f8dbb5cf880a0a67eedab3416492)
1 /*
2  * APEI Generic Hardware Error Source support
3  *
4  * Generic Hardware Error Source provides a way to report platform
5  * hardware errors (such as those from the chipset). It works in so-called
6  * "Firmware First" mode, that is, hardware errors are reported to
7  * firmware first, then reported to Linux by firmware. This way,
8  * some non-standard hardware error registers or non-standard hardware
9  * link can be checked by firmware to produce more hardware error
10  * information for Linux.
11  *
12  * For more information about Generic Hardware Error Source, please
13  * refer to ACPI Specification version 4.0, section 17.3.2.6
14  *
15  * Currently, only the SCI notification type and memory errors are
16  * supported. More notification types and hardware error types will
17  * be added later.
18  *
19  * Copyright 2010 Intel Corp.
20  *   Author: Huang Ying <ying.huang@intel.com>
21  *
22  * This program is free software; you can redistribute it and/or
23  * modify it under the terms of the GNU General Public License version
24  * 2 as published by the Free Software Foundation;
25  *
26  * This program is distributed in the hope that it will be useful,
27  * but WITHOUT ANY WARRANTY; without even the implied warranty of
28  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29  * GNU General Public License for more details.
30  *
31  * You should have received a copy of the GNU General Public License
32  * along with this program; if not, write to the Free Software
33  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
34  */
35 
36 #include <linux/kernel.h>
37 #include <linux/module.h>
38 #include <linux/init.h>
39 #include <linux/acpi.h>
40 #include <linux/io.h>
41 #include <linux/interrupt.h>
42 #include <linux/cper.h>
43 #include <linux/kdebug.h>
44 #include <acpi/apei.h>
45 #include <acpi/atomicio.h>
46 #include <acpi/hed.h>
47 #include <asm/mce.h>
48 
49 #include "apei-internal.h"
50 
51 #define GHES_PFX	"GHES: "
52 
53 #define GHES_ESTATUS_MAX_SIZE		65536
54 
55 /*
56  * One struct ghes is created for each generic hardware error
57  * source.
58  *
59  * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
60  * handler. Handler for one generic hardware error source is only
61  * triggered after the previous one is done. So handler can uses
62  * struct ghes without locking.
63  *
64  * estatus: memory buffer for error status block, allocated during
65  * HEST parsing.
66  */
67 #define GHES_TO_CLEAR		0x0001
68 
69 struct ghes {
70 	struct acpi_hest_generic *generic;
71 	struct acpi_hest_generic_status *estatus;
72 	struct list_head list;
73 	u64 buffer_paddr;
74 	unsigned long flags;
75 };
76 
/*
 * Lists of error sources (struct ghes), one list per notification
 * method. Only the SCI list exists so far.
 *
 * Entries are added only while HEST is parsed and removed only in
 * module_exit; both are single-threaded, so writers need no lock
 * against each other.
 *
 * The timer/IRQ/SCI/NMI handler may walk a list while entries are
 * being added or removed, though. RCU provides the mutual exclusion
 * between those walkers and the add/delete paths.
 */
static LIST_HEAD(ghes_sci);
90 
91 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
92 {
93 	struct ghes *ghes;
94 	unsigned int error_block_length;
95 	int rc;
96 
97 	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
98 	if (!ghes)
99 		return ERR_PTR(-ENOMEM);
100 	ghes->generic = generic;
101 	INIT_LIST_HEAD(&ghes->list);
102 	rc = acpi_pre_map_gar(&generic->error_status_address);
103 	if (rc)
104 		goto err_free;
105 	error_block_length = generic->error_block_length;
106 	if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
107 		pr_warning(FW_WARN GHES_PFX
108 			   "Error status block length is too long: %u for "
109 			   "generic hardware error source: %d.\n",
110 			   error_block_length, generic->header.source_id);
111 		error_block_length = GHES_ESTATUS_MAX_SIZE;
112 	}
113 	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
114 	if (!ghes->estatus) {
115 		rc = -ENOMEM;
116 		goto err_unmap;
117 	}
118 
119 	return ghes;
120 
121 err_unmap:
122 	acpi_post_unmap_gar(&generic->error_status_address);
123 err_free:
124 	kfree(ghes);
125 	return ERR_PTR(rc);
126 }
127 
128 static void ghes_fini(struct ghes *ghes)
129 {
130 	kfree(ghes->estatus);
131 	acpi_post_unmap_gar(&ghes->generic->error_status_address);
132 }
133 
/*
 * GHES-internal severity levels, ordered from least to most severe.
 * ghes_severity() maps CPER severities onto these values.
 */
enum {
	GHES_SER_NO,		/* 0: informational */
	GHES_SER_CORRECTED,	/* 1: corrected error */
	GHES_SER_RECOVERABLE,	/* 2: recoverable error */
	GHES_SER_PANIC,		/* 3: fatal or unknown severity */
};
140 
141 static inline int ghes_severity(int severity)
142 {
143 	switch (severity) {
144 	case CPER_SER_INFORMATIONAL:
145 		return GHES_SER_NO;
146 	case CPER_SER_CORRECTED:
147 		return GHES_SER_CORRECTED;
148 	case CPER_SER_RECOVERABLE:
149 		return GHES_SER_RECOVERABLE;
150 	case CPER_SER_FATAL:
151 		return GHES_SER_PANIC;
152 	default:
153 		/* Unkown, go panic */
154 		return GHES_SER_PANIC;
155 	}
156 }
157 
158 /* SCI handler run in work queue, so ioremap can be used here */
159 static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
160 				 int from_phys)
161 {
162 	void *vaddr;
163 
164 	vaddr = ioremap_cache(paddr, len);
165 	if (!vaddr)
166 		return -ENOMEM;
167 	if (from_phys)
168 		memcpy(buffer, vaddr, len);
169 	else
170 		memcpy(vaddr, buffer, len);
171 	iounmap(vaddr);
172 
173 	return 0;
174 }
175 
176 static int ghes_read_estatus(struct ghes *ghes, int silent)
177 {
178 	struct acpi_hest_generic *g = ghes->generic;
179 	u64 buf_paddr;
180 	u32 len;
181 	int rc;
182 
183 	rc = acpi_atomic_read(&buf_paddr, &g->error_status_address);
184 	if (rc) {
185 		if (!silent && printk_ratelimit())
186 			pr_warning(FW_WARN GHES_PFX
187 "Failed to read error status block address for hardware error source: %d.\n",
188 				   g->header.source_id);
189 		return -EIO;
190 	}
191 	if (!buf_paddr)
192 		return -ENOENT;
193 
194 	rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
195 				   sizeof(*ghes->estatus), 1);
196 	if (rc)
197 		return rc;
198 	if (!ghes->estatus->block_status)
199 		return -ENOENT;
200 
201 	ghes->buffer_paddr = buf_paddr;
202 	ghes->flags |= GHES_TO_CLEAR;
203 
204 	rc = -EIO;
205 	len = apei_estatus_len(ghes->estatus);
206 	if (len < sizeof(*ghes->estatus))
207 		goto err_read_block;
208 	if (len > ghes->generic->error_block_length)
209 		goto err_read_block;
210 	if (apei_estatus_check_header(ghes->estatus))
211 		goto err_read_block;
212 	rc = ghes_copy_tofrom_phys(ghes->estatus + 1,
213 				   buf_paddr + sizeof(*ghes->estatus),
214 				   len - sizeof(*ghes->estatus), 1);
215 	if (rc)
216 		return rc;
217 	if (apei_estatus_check(ghes->estatus))
218 		goto err_read_block;
219 	rc = 0;
220 
221 err_read_block:
222 	if (rc && !silent)
223 		pr_warning(FW_WARN GHES_PFX
224 			   "Failed to read error status block!\n");
225 	return rc;
226 }
227 
228 static void ghes_clear_estatus(struct ghes *ghes)
229 {
230 	ghes->estatus->block_status = 0;
231 	if (!(ghes->flags & GHES_TO_CLEAR))
232 		return;
233 	ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr,
234 			      sizeof(ghes->estatus->block_status), 0);
235 	ghes->flags &= ~GHES_TO_CLEAR;
236 }
237 
238 static void ghes_do_proc(struct ghes *ghes)
239 {
240 	int ser, processed = 0;
241 	struct acpi_hest_generic_data *gdata;
242 
243 	ser = ghes_severity(ghes->estatus->error_severity);
244 	apei_estatus_for_each_section(ghes->estatus, gdata) {
245 #ifdef CONFIG_X86_MCE
246 		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
247 				 CPER_SEC_PLATFORM_MEM)) {
248 			apei_mce_report_mem_error(
249 				ser == GHES_SER_CORRECTED,
250 				(struct cper_sec_mem_err *)(gdata+1));
251 			processed = 1;
252 		}
253 #endif
254 	}
255 
256 	if (!processed && printk_ratelimit())
257 		pr_warning(GHES_PFX
258 		"Unknown error record from generic hardware error source: %d\n",
259 			   ghes->generic->header.source_id);
260 }
261 
/*
 * Handle one notification for @ghes: read the pending error status
 * block, process it if the read succeeded, and always acknowledge it
 * afterwards.
 *
 * Returns 0 when an error status block was read and processed,
 * otherwise the negative error from ghes_read_estatus() (for example
 * -ENOENT when this source has nothing pending). The caller uses this
 * to tell whether any source actually handled the event.
 */
static int ghes_proc(struct ghes *ghes)
{
	int rc;

	rc = ghes_read_estatus(ghes, 0);
	if (!rc)
		ghes_do_proc(ghes);

	/* Clear even after a failed read so a bad block is not left pending. */
	ghes_clear_estatus(ghes);
	return rc;
}
275 
276 static int ghes_notify_sci(struct notifier_block *this,
277 				  unsigned long event, void *data)
278 {
279 	struct ghes *ghes;
280 	int ret = NOTIFY_DONE;
281 
282 	rcu_read_lock();
283 	list_for_each_entry_rcu(ghes, &ghes_sci, list) {
284 		if (!ghes_proc(ghes))
285 			ret = NOTIFY_OK;
286 	}
287 	rcu_read_unlock();
288 
289 	return ret;
290 }
291 
292 static struct notifier_block ghes_notifier_sci = {
293 	.notifier_call = ghes_notify_sci,
294 };
295 
296 static int hest_ghes_parse(struct acpi_hest_header *hest_hdr, void *data)
297 {
298 	struct acpi_hest_generic *generic;
299 	struct ghes *ghes = NULL;
300 	int rc = 0;
301 
302 	if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
303 		return 0;
304 
305 	generic = (struct acpi_hest_generic *)hest_hdr;
306 	if (!generic->enabled)
307 		return 0;
308 
309 	if (generic->error_block_length <
310 	    sizeof(struct acpi_hest_generic_status)) {
311 		pr_warning(FW_BUG GHES_PFX
312 "Invalid error block length: %u for generic hardware error source: %d\n",
313 			   generic->error_block_length,
314 			   generic->header.source_id);
315 		goto err;
316 	}
317 	if (generic->records_to_preallocate == 0) {
318 		pr_warning(FW_BUG GHES_PFX
319 "Invalid records to preallocate: %u for generic hardware error source: %d\n",
320 			   generic->records_to_preallocate,
321 			   generic->header.source_id);
322 		goto err;
323 	}
324 	ghes = ghes_new(generic);
325 	if (IS_ERR(ghes)) {
326 		rc = PTR_ERR(ghes);
327 		ghes = NULL;
328 		goto err;
329 	}
330 	switch (generic->notify.type) {
331 	case ACPI_HEST_NOTIFY_POLLED:
332 		pr_warning(GHES_PFX
333 "Generic hardware error source: %d notified via POLL is not supported!\n",
334 			   generic->header.source_id);
335 		break;
336 	case ACPI_HEST_NOTIFY_EXTERNAL:
337 	case ACPI_HEST_NOTIFY_LOCAL:
338 		pr_warning(GHES_PFX
339 "Generic hardware error source: %d notified via IRQ is not supported!\n",
340 			   generic->header.source_id);
341 		break;
342 	case ACPI_HEST_NOTIFY_SCI:
343 		if (list_empty(&ghes_sci))
344 			register_acpi_hed_notifier(&ghes_notifier_sci);
345 		list_add_rcu(&ghes->list, &ghes_sci);
346 		break;
347 	case ACPI_HEST_NOTIFY_NMI:
348 		pr_warning(GHES_PFX
349 "Generic hardware error source: %d notified via NMI is not supported!\n",
350 			   generic->header.source_id);
351 		break;
352 	default:
353 		pr_warning(FW_WARN GHES_PFX
354 	"Unknown notification type: %u for generic hardware error source: %d\n",
355 			   generic->notify.type, generic->header.source_id);
356 		break;
357 	}
358 
359 	return 0;
360 err:
361 	if (ghes)
362 		ghes_fini(ghes);
363 	return rc;
364 }
365 
366 static void ghes_cleanup(void)
367 {
368 	struct ghes *ghes, *nghes;
369 
370 	if (!list_empty(&ghes_sci))
371 		unregister_acpi_hed_notifier(&ghes_notifier_sci);
372 
373 	synchronize_rcu();
374 
375 	list_for_each_entry_safe(ghes, nghes, &ghes_sci, list) {
376 		list_del(&ghes->list);
377 		ghes_fini(ghes);
378 		kfree(ghes);
379 	}
380 }
381 
382 static int __init ghes_init(void)
383 {
384 	int rc;
385 
386 	if (acpi_disabled)
387 		return -ENODEV;
388 
389 	if (hest_disable) {
390 		pr_info(GHES_PFX "HEST is not enabled!\n");
391 		return -EINVAL;
392 	}
393 
394 	rc = apei_hest_parse(hest_ghes_parse, NULL);
395 	if (rc) {
396 		pr_err(GHES_PFX
397 		"Error during parsing HEST generic hardware error sources.\n");
398 		goto err_cleanup;
399 	}
400 
401 	if (list_empty(&ghes_sci)) {
402 		pr_info(GHES_PFX
403 			"No functional generic hardware error sources.\n");
404 		rc = -ENODEV;
405 		goto err_cleanup;
406 	}
407 
408 	pr_info(GHES_PFX
409 		"Generic Hardware Error Source support is initialized.\n");
410 
411 	return 0;
412 err_cleanup:
413 	ghes_cleanup();
414 	return rc;
415 }
416 
417 static void __exit ghes_exit(void)
418 {
419 	ghes_cleanup();
420 }
421 
422 module_init(ghes_init);
423 module_exit(ghes_exit);
424 
425 MODULE_AUTHOR("Huang Ying");
426 MODULE_DESCRIPTION("APEI Generic Hardware Error Source support");
427 MODULE_LICENSE("GPL");
428