xref: /illumos-gate/usr/src/uts/common/io/mem.c (revision 5418b7d90f4acb3e524771dad953c2cad85e61bb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright (c) 2016 by Delphix. All rights reserved.
25  */
26 
27 /*
28  * Copyright 2017 Joyent, Inc.
29  * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
30  */
31 
32 /*
33  * Memory special file
34  */
35 
36 #include <sys/types.h>
37 #include <sys/param.h>
38 #include <sys/user.h>
39 #include <sys/buf.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/vm.h>
43 #include <sys/uio.h>
44 #include <sys/mman.h>
45 #include <sys/kmem.h>
46 #include <vm/seg.h>
47 #include <vm/page.h>
48 #include <sys/stat.h>
49 #include <sys/vmem.h>
50 #include <sys/memlist.h>
51 #include <sys/bootconf.h>
52 
53 #include <vm/seg_vn.h>
54 #include <vm/seg_dev.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/seg_kp.h>
57 #include <vm/seg_kpm.h>
58 #include <vm/hat.h>
59 
60 #include <sys/conf.h>
61 #include <sys/mem.h>
62 #include <sys/types.h>
63 #include <sys/conf.h>
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/errno.h>
67 #include <sys/modctl.h>
68 #include <sys/memlist.h>
69 #include <sys/ddi.h>
70 #include <sys/sunddi.h>
71 #include <sys/debug.h>
72 #include <sys/fm/protocol.h>
73 
74 #if defined(__sparc)
75 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
76 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
77     uint64_t *, int *, int *, int *);
78 extern size_t cpu_get_name_bufsize(void);
79 extern int cpu_get_mem_sid(char *, char *, int, int *);
80 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
81 #elif defined(__x86)
82 #include <sys/cpu_module.h>
83 #endif	/* __sparc */
84 
85 /*
86  * Turn a byte length into a pagecount.  The DDI btop takes a
87  * 32-bit size on 32-bit machines, this handles 64-bit sizes for
88  * large physical-memory 32-bit machines.
89  */
90 #define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
91 
92 static kmutex_t mm_lock;
93 static caddr_t mm_map;
94 
95 static dev_info_t *mm_dip;	/* private copy of devinfo pointer */
96 
97 static int mm_kmem_io_access;
98 
99 static int mm_kstat_update(kstat_t *ksp, int rw);
100 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
101 
102 #define	MM_KMEMLOG_NENTRIES	64
103 
104 static int mm_kmemlogent;
105 static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];
106 
107 /*
108  * On kmem/allmem writes, we log information that might be useful in the event
109  * that a write is errant (that is, due to operator error) and induces a later
110  * problem.  Note that (in particular) in the event of such operator-induced
111  * corruption, a search over the kernel address space for the corrupted
112  * address will yield the ring buffer entry that recorded the write.  And
113  * should it seem baroque or otherwise unnecessary, yes, we need this kind of
114  * auditing facility and yes, we learned that the hard way: disturbingly,
115  * there exist recommendations for "tuning" the system that involve writing to
116  * kernel memory addresses via the kernel debugger, and -- as we discovered --
117  * these can easily be applied incorrectly or unsafely, yielding an entirely
118  * undebuggable "can't happen" kind of panic.
119  */
120 static void
121 mm_logkmem(struct uio *uio)
122 {
123 	mm_logentry_t *ent;
124 	proc_t *p = curthread->t_procp;
125 
126 	mutex_enter(&mm_lock);
127 
128 	ent = &mm_kmemlog[mm_kmemlogent++];
129 
130 	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
131 		mm_kmemlogent = 0;
132 
133 	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
134 	ent->mle_len = uio->uio_resid;
135 	gethrestime(&ent->mle_hrestime);
136 	ent->mle_hrtime = gethrtime();
137 	ent->mle_pid = p->p_pidp->pid_id;
138 
139 	(void) strncpy(ent->mle_psargs,
140 	    p->p_user.u_psargs, sizeof (ent->mle_psargs));
141 
142 	mutex_exit(&mm_lock);
143 }
144 
145 /*ARGSUSED1*/
146 static int
147 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
148 {
149 	int i;
150 	struct mem_minor {
151 		char *name;
152 		minor_t minor;
153 		int privonly;
154 		const char *rdpriv;
155 		const char *wrpriv;
156 		mode_t priv_mode;
157 	} mm[] = {
158 		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
159 		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
160 		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
161 		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
162 		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
163 		{ "full",	M_FULL, PRIVONLY_DEV,	NULL,	NULL,	0666 },
164 	};
165 	kstat_t *ksp;
166 
167 	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
168 	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
169 
170 	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
171 		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
172 		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
173 		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
174 		    DDI_FAILURE) {
175 			ddi_remove_minor_node(devi, NULL);
176 			return (DDI_FAILURE);
177 		}
178 	}
179 
180 	mm_dip = devi;
181 
182 	ksp = kstat_create("mm", 0, "phys_installed", "misc",
183 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
184 	if (ksp != NULL) {
185 		ksp->ks_update = mm_kstat_update;
186 		ksp->ks_snapshot = mm_kstat_snapshot;
187 		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
188 		kstat_install(ksp);
189 	}
190 
191 	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
192 	    "kmem_io_access", 0);
193 
194 	return (DDI_SUCCESS);
195 }
196 
197 /*ARGSUSED*/
198 static int
199 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
200 {
201 	register int error;
202 
203 	switch (infocmd) {
204 	case DDI_INFO_DEVT2DEVINFO:
205 		*result = (void *)mm_dip;
206 		error = DDI_SUCCESS;
207 		break;
208 	case DDI_INFO_DEVT2INSTANCE:
209 		*result = (void *)0;
210 		error = DDI_SUCCESS;
211 		break;
212 	default:
213 		error = DDI_FAILURE;
214 	}
215 	return (error);
216 }
217 
218 /*ARGSUSED1*/
219 static int
220 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
221 {
222 	switch (getminor(*devp)) {
223 	case M_NULL:
224 	case M_ZERO:
225 	case M_FULL:
226 	case M_MEM:
227 	case M_KMEM:
228 	case M_ALLKMEM:
229 		/* standard devices */
230 		break;
231 
232 	default:
233 		/* Unsupported or unknown type */
234 		return (EINVAL);
235 	}
236 	/* must be character device */
237 	if (typ != OTYP_CHR)
238 		return (EINVAL);
239 	return (0);
240 }
241 
242 struct pollhead	mm_pollhd;
243 
244 /*ARGSUSED*/
245 static int
246 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
247     struct pollhead **phpp)
248 {
249 	switch (getminor(dev)) {
250 	case M_NULL:
251 	case M_ZERO:
252 	case M_FULL:
253 	case M_MEM:
254 	case M_KMEM:
255 	case M_ALLKMEM:
256 		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
257 		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
258 		/*
259 		 * A non NULL pollhead pointer should be returned in case
260 		 * user polls for 0 events or is doing an edge-triggerd poll.
261 		 */
262 		if ((!*reventsp && !anyyet) || (events & POLLET)) {
263 			*phpp = &mm_pollhd;
264 		}
265 		return (0);
266 	default:
267 		/* no other devices currently support polling */
268 		return (ENXIO);
269 	}
270 }
271 
272 static int
273 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
274     char *name, caddr_t valuep, int *lengthp)
275 {
276 	/*
277 	 * implement zero size to reduce overhead (avoid two failing
278 	 * property lookups per stat).
279 	 */
280 	return (ddi_prop_op_size(dev, dip, prop_op,
281 	    flags, name, valuep, lengthp, 0));
282 }
283 
284 static int
285 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
286     page_t *pp)
287 {
288 	int error = 0;
289 	int devload = 0;
290 	int is_memory = pf_is_memory(pfn);
291 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
292 	    (size_t)uio->uio_iov->iov_len);
293 	caddr_t va = NULL;
294 
295 	mutex_enter(&mm_lock);
296 
297 	if (is_memory && kpm_enable) {
298 		if (pp)
299 			va = hat_kpm_mapin(pp, NULL);
300 		else
301 			va = hat_kpm_mapin_pfn(pfn);
302 	}
303 
304 	if (va == NULL) {
305 		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
306 		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
307 		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
308 		va = mm_map;
309 		devload = 1;
310 	}
311 
312 	if (!is_memory) {
313 		if (allowio) {
314 			size_t c = uio->uio_iov->iov_len;
315 
316 			if (ddi_peekpokeio(NULL, uio, rw,
317 			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
318 			    sizeof (int32_t)) != DDI_SUCCESS)
319 				error = EFAULT;
320 		} else
321 			error = EIO;
322 	} else
323 		error = uiomove(va + pageoff, nbytes, rw, uio);
324 
325 	if (devload)
326 		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
327 	else if (pp)
328 		hat_kpm_mapout(pp, NULL, va);
329 	else
330 		hat_kpm_mapout_pfn(pfn);
331 
332 	mutex_exit(&mm_lock);
333 	return (error);
334 }
335 
#ifdef	__sparc

/*
 * Look up the segment of `as' that contains `va' (holding the address
 * space lock as reader) and return the result of asking it for
 * S_CAPABILITY_NOMINFLT.  Returns 0 when no segment covers `va'.
 */
static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *sp;
	int capable = 0;

	AS_LOCK_ENTER(as, RW_READER);
	sp = as_segat(as, va);
	if (sp != NULL)
		capable = SEGOP_CAPABLE(sp, S_CAPABILITY_NOMINFLT);
	AS_LOCK_EXIT(as);

	return (capable);
}

/* On sparc, decide per kernel address whether mmrw() should page-lock. */
#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else

/* Elsewhere mmrw() never attempts to page-lock kernel addresses. */
#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */
359 
360 /*ARGSUSED3*/
361 static int
362 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
363 {
364 	pfn_t v;
365 	struct iovec *iov;
366 	int error = 0;
367 	size_t c;
368 	ssize_t oresid = uio->uio_resid;
369 	minor_t minor = getminor(dev);
370 
371 	while (uio->uio_resid > 0 && error == 0) {
372 		iov = uio->uio_iov;
373 		if (iov->iov_len == 0) {
374 			uio->uio_iov++;
375 			uio->uio_iovcnt--;
376 			if (uio->uio_iovcnt < 0)
377 				panic("mmrw");
378 			continue;
379 		}
380 		switch (minor) {
381 
382 		case M_MEM:
383 			memlist_read_lock();
384 			if (!address_in_memlist(phys_install,
385 			    (uint64_t)uio->uio_loffset, 1)) {
386 				memlist_read_unlock();
387 				error = EFAULT;
388 				break;
389 			}
390 			memlist_read_unlock();
391 
392 			v = BTOP((u_offset_t)uio->uio_loffset);
393 			error = mmio(uio, rw, v,
394 			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
395 			break;
396 
397 		case M_KMEM:
398 		case M_ALLKMEM:
399 			{
400 			page_t **ppp = NULL;
401 			caddr_t vaddr = (caddr_t)uio->uio_offset;
402 			int try_lock = NEED_LOCK_KVADDR(vaddr);
403 			int locked = 0;
404 
405 			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
406 				break;
407 
408 			if (rw == UIO_WRITE)
409 				mm_logkmem(uio);
410 
411 			/*
412 			 * If vaddr does not map a valid page, as_pagelock()
413 			 * will return failure. Hence we can't check the
414 			 * return value and return EFAULT here as we'd like.
415 			 * seg_kp and seg_kpm do not properly support
416 			 * as_pagelock() for this context so we avoid it
417 			 * using the try_lock set check above.  Some day when
418 			 * the kernel page locking gets redesigned all this
419 			 * muck can be cleaned up.
420 			 */
421 			if (try_lock)
422 				locked = (as_pagelock(&kas, &ppp, vaddr,
423 				    PAGESIZE, S_WRITE) == 0);
424 
425 			v = hat_getpfnum(kas.a_hat,
426 			    (caddr_t)(uintptr_t)uio->uio_loffset);
427 			if (v == PFN_INVALID) {
428 				if (locked)
429 					as_pageunlock(&kas, ppp, vaddr,
430 					    PAGESIZE, S_WRITE);
431 				error = EFAULT;
432 				break;
433 			}
434 
435 			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
436 			    minor == M_ALLKMEM || mm_kmem_io_access,
437 			    (locked && ppp) ? *ppp : NULL);
438 			if (locked)
439 				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
440 				    S_WRITE);
441 			}
442 
443 			break;
444 
445 		case M_FULL:
446 			if (rw == UIO_WRITE) {
447 				error = ENOSPC;
448 				break;
449 			}
450 			/* else it's a read, fall through to zero case */
451 			/*FALLTHROUGH*/
452 
453 		case M_ZERO:
454 			if (rw == UIO_READ) {
455 				label_t ljb;
456 
457 				if (on_fault(&ljb)) {
458 					no_fault();
459 					error = EFAULT;
460 					break;
461 				}
462 				uzero(iov->iov_base, iov->iov_len);
463 				no_fault();
464 				uio->uio_resid -= iov->iov_len;
465 				uio->uio_loffset += iov->iov_len;
466 				break;
467 			}
468 			/* else it's a write, fall through to NULL case */
469 			/*FALLTHROUGH*/
470 
471 		case M_NULL:
472 			if (rw == UIO_READ)
473 				return (0);
474 			c = iov->iov_len;
475 			iov->iov_base += c;
476 			iov->iov_len -= c;
477 			uio->uio_loffset += c;
478 			uio->uio_resid -= c;
479 			break;
480 
481 		}
482 	}
483 	return (uio->uio_resid == oresid ? error : 0);
484 }
485 
486 static int
487 mmread(dev_t dev, struct uio *uio, cred_t *cred)
488 {
489 	return (mmrw(dev, uio, UIO_READ, cred));
490 }
491 
492 static int
493 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
494 {
495 	return (mmrw(dev, uio, UIO_WRITE, cred));
496 }
497 
498 /*
499  * Private ioctl for libkvm to support kvm_physaddr().
500  * Given an address space and a VA, compute the PA.
501  */
502 static int
503 mmioctl_vtop(intptr_t data)
504 {
505 #ifdef _SYSCALL32
506 	mem_vtop32_t vtop32;
507 #endif
508 	mem_vtop_t mem_vtop;
509 	proc_t *p;
510 	pfn_t pfn = (pfn_t)PFN_INVALID;
511 	pid_t pid = 0;
512 	struct as *as;
513 	struct seg *seg;
514 
515 	if (get_udatamodel() == DATAMODEL_NATIVE) {
516 		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
517 			return (EFAULT);
518 	}
519 #ifdef _SYSCALL32
520 	else {
521 		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
522 			return (EFAULT);
523 		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
524 		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;
525 
526 		if (mem_vtop.m_as != NULL)
527 			return (EINVAL);
528 	}
529 #endif
530 
531 	if (mem_vtop.m_as == &kas) {
532 		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
533 	} else {
534 		if (mem_vtop.m_as == NULL) {
535 			/*
536 			 * Assume the calling process's address space if the
537 			 * caller didn't specify one.
538 			 */
539 			p = curthread->t_procp;
540 			if (p == NULL)
541 				return (EIO);
542 			mem_vtop.m_as = p->p_as;
543 		}
544 
545 		mutex_enter(&pidlock);
546 		for (p = practive; p != NULL; p = p->p_next) {
547 			if (p->p_as == mem_vtop.m_as) {
548 				pid = p->p_pid;
549 				break;
550 			}
551 		}
552 		mutex_exit(&pidlock);
553 		if (p == NULL)
554 			return (EIO);
555 		p = sprlock(pid);
556 		if (p == NULL)
557 			return (EIO);
558 		as = p->p_as;
559 		if (as == mem_vtop.m_as) {
560 			mutex_exit(&p->p_lock);
561 			AS_LOCK_ENTER(as, RW_READER);
562 			for (seg = AS_SEGFIRST(as); seg != NULL;
563 			    seg = AS_SEGNEXT(as, seg))
564 				if ((uintptr_t)mem_vtop.m_va -
565 				    (uintptr_t)seg->s_base < seg->s_size)
566 					break;
567 			if (seg != NULL)
568 				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
569 			AS_LOCK_EXIT(as);
570 			mutex_enter(&p->p_lock);
571 		}
572 		sprunlock(p);
573 	}
574 	mem_vtop.m_pfn = pfn;
575 	if (pfn == PFN_INVALID)
576 		return (EIO);
577 
578 	if (get_udatamodel() == DATAMODEL_NATIVE) {
579 		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
580 			return (EFAULT);
581 	}
582 #ifdef _SYSCALL32
583 	else {
584 		vtop32.m_pfn = mem_vtop.m_pfn;
585 		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
586 			return (EFAULT);
587 	}
588 #endif
589 
590 	return (0);
591 }
592 
593 /*
594  * Given a PA, execute the given page retire command on it.
595  */
596 static int
597 mmioctl_page_retire(int cmd, intptr_t data)
598 {
599 	extern int page_retire_test(void);
600 	uint64_t pa;
601 
602 	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
603 		return (EFAULT);
604 	}
605 
606 	switch (cmd) {
607 	case MEM_PAGE_ISRETIRED:
608 		return (page_retire_check(pa, NULL));
609 
610 	case MEM_PAGE_UNRETIRE:
611 		return (page_unretire(pa));
612 
613 	case MEM_PAGE_RETIRE:
614 		return (page_retire(pa, PR_FMA));
615 
616 	case MEM_PAGE_RETIRE_MCE:
617 		return (page_retire(pa, PR_MCE));
618 
619 	case MEM_PAGE_RETIRE_UE:
620 		return (page_retire(pa, PR_UE));
621 
622 	case MEM_PAGE_GETERRORS:
623 		{
624 			uint64_t page_errors;
625 			int rc = page_retire_check(pa, &page_errors);
626 			if (copyout(&page_errors, (void *)data,
627 			    sizeof (uint64_t))) {
628 				return (EFAULT);
629 			}
630 			return (rc);
631 		}
632 
633 	case MEM_PAGE_RETIRE_TEST:
634 		return (page_retire_test());
635 
636 	}
637 
638 	return (EINVAL);
639 }
640 
641 #ifdef __sparc
642 /*
643  * Read a mem_name_t from user-space and store it in the mem_name_t
644  * pointed to by the mem_name argument.
645  */
646 static int
647 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
648 {
649 	if (get_udatamodel() == DATAMODEL_NATIVE) {
650 		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
651 			return (EFAULT);
652 	}
653 #ifdef	_SYSCALL32
654 	else {
655 		mem_name32_t mem_name32;
656 
657 		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
658 			return (EFAULT);
659 		mem_name->m_addr = mem_name32.m_addr;
660 		mem_name->m_synd = mem_name32.m_synd;
661 		mem_name->m_type[0] = mem_name32.m_type[0];
662 		mem_name->m_type[1] = mem_name32.m_type[1];
663 		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
664 		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
665 		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
666 		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
667 	}
668 #endif	/* _SYSCALL32 */
669 
670 	return (0);
671 }
672 
673 /*
674  * Given a syndrome, syndrome type, and address return the
675  * associated memory name in the provided data buffer.
676  */
677 static int
678 mmioctl_get_mem_name(intptr_t data)
679 {
680 	mem_name_t mem_name;
681 	void *buf;
682 	size_t bufsize;
683 	int len, err;
684 
685 	if ((bufsize = cpu_get_name_bufsize()) == 0)
686 		return (ENOTSUP);
687 
688 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
689 		return (err);
690 
691 	buf = kmem_alloc(bufsize, KM_SLEEP);
692 
693 	/*
694 	 * Call into cpu specific code to do the lookup.
695 	 */
696 	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
697 	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
698 		kmem_free(buf, bufsize);
699 		return (err);
700 	}
701 
702 	if (len >= mem_name.m_namelen) {
703 		kmem_free(buf, bufsize);
704 		return (ENOSPC);
705 	}
706 
707 	if (copyoutstr(buf, (char *)mem_name.m_name,
708 	    mem_name.m_namelen, NULL) != 0) {
709 		kmem_free(buf, bufsize);
710 		return (EFAULT);
711 	}
712 
713 	kmem_free(buf, bufsize);
714 	return (0);
715 }
716 
717 /*
718  * Given a syndrome and address return information about the associated memory.
719  */
720 static int
721 mmioctl_get_mem_info(intptr_t data)
722 {
723 	mem_info_t mem_info;
724 	int err;
725 
726 	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
727 		return (EFAULT);
728 
729 	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
730 	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
731 	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
732 		return (err);
733 
734 	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
735 		return (EFAULT);
736 
737 	return (0);
738 }
739 
740 /*
741  * Given a memory name, return its associated serial id
742  */
743 static int
744 mmioctl_get_mem_sid(intptr_t data)
745 {
746 	mem_name_t mem_name;
747 	void *buf;
748 	void *name;
749 	size_t	name_len;
750 	size_t bufsize;
751 	int len, err;
752 
753 	if ((bufsize = cpu_get_name_bufsize()) == 0)
754 		return (ENOTSUP);
755 
756 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
757 		return (err);
758 
759 	buf = kmem_alloc(bufsize, KM_SLEEP);
760 
761 	if (mem_name.m_namelen > 1024)
762 		mem_name.m_namelen = 1024; /* cap at 1024 bytes */
763 
764 	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
765 
766 	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
767 	    mem_name.m_namelen, &name_len)) != 0) {
768 		kmem_free(buf, bufsize);
769 		kmem_free(name, mem_name.m_namelen);
770 		return (err);
771 	}
772 
773 	/*
774 	 * Call into cpu specific code to do the lookup.
775 	 */
776 	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
777 		kmem_free(buf, bufsize);
778 		kmem_free(name, mem_name.m_namelen);
779 		return (err);
780 	}
781 
782 	if (len > mem_name.m_sidlen) {
783 		kmem_free(buf, bufsize);
784 		kmem_free(name, mem_name.m_namelen);
785 		return (ENAMETOOLONG);
786 	}
787 
788 	if (copyoutstr(buf, (char *)mem_name.m_sid,
789 	    mem_name.m_sidlen, NULL) != 0) {
790 		kmem_free(buf, bufsize);
791 		kmem_free(name, mem_name.m_namelen);
792 		return (EFAULT);
793 	}
794 
795 	kmem_free(buf, bufsize);
796 	kmem_free(name, mem_name.m_namelen);
797 	return (0);
798 }
799 #endif	/* __sparc */
800 
801 /*
802  * Private ioctls for
803  *	libkvm to support kvm_physaddr().
804  *	FMA support for page_retire() and memory attribute information.
805  */
806 /*ARGSUSED*/
807 static int
808 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
809 {
810 	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
811 	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
812 		return (ENXIO);
813 
814 	switch (cmd) {
815 	case MEM_VTOP:
816 		return (mmioctl_vtop(data));
817 
818 	case MEM_PAGE_RETIRE:
819 	case MEM_PAGE_ISRETIRED:
820 	case MEM_PAGE_UNRETIRE:
821 	case MEM_PAGE_RETIRE_MCE:
822 	case MEM_PAGE_RETIRE_UE:
823 	case MEM_PAGE_GETERRORS:
824 	case MEM_PAGE_RETIRE_TEST:
825 		return (mmioctl_page_retire(cmd, data));
826 
827 #ifdef __sparc
828 	case MEM_NAME:
829 		return (mmioctl_get_mem_name(data));
830 
831 	case MEM_INFO:
832 		return (mmioctl_get_mem_info(data));
833 
834 	case MEM_SID:
835 		return (mmioctl_get_mem_sid(data));
836 #else
837 	case MEM_NAME:
838 	case MEM_INFO:
839 	case MEM_SID:
840 		return (ENOTSUP);
841 #endif	/* __sparc */
842 	}
843 	return (ENXIO);
844 }
845 
846 /*ARGSUSED2*/
847 static int
848 mmmmap(dev_t dev, off_t off, int prot)
849 {
850 	pfn_t pf;
851 	struct memlist *pmem;
852 	minor_t minor = getminor(dev);
853 
854 	switch (minor) {
855 	case M_MEM:
856 		pf = btop(off);
857 		memlist_read_lock();
858 		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
859 			if (pf >= BTOP(pmem->ml_address) &&
860 			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
861 				memlist_read_unlock();
862 				return (impl_obmem_pfnum(pf));
863 			}
864 		}
865 		memlist_read_unlock();
866 		break;
867 
868 	case M_KMEM:
869 	case M_ALLKMEM:
870 		/* no longer supported with KPR */
871 		return (-1);
872 
873 	case M_FULL:
874 	case M_ZERO:
875 		/*
876 		 * We shouldn't be mmap'ing to /dev/zero here as
877 		 * mmsegmap() should have already converted
878 		 * a mapping request for this device to a mapping
879 		 * using seg_vn for anonymous memory.
880 		 */
881 		break;
882 
883 	}
884 	return (-1);
885 }
886 
887 /*
888  * This function is called when a memory device is mmap'ed.
889  * Set up the mapping to the correct device driver.
890  */
891 static int
892 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
893     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
894 {
895 	struct segvn_crargs vn_a;
896 	struct segdev_crargs dev_a;
897 	int error;
898 	minor_t minor;
899 	off_t i;
900 
901 	minor = getminor(dev);
902 
903 	as_rangelock(as);
904 	/*
905 	 * No need to worry about vac alignment on /dev/zero
906 	 * since this is a "clone" object that doesn't yet exist.
907 	 */
908 	error = choose_addr(as, addrp, len, off,
909 	    (minor == M_MEM) || (minor == M_KMEM), flags);
910 	if (error != 0) {
911 		as_rangeunlock(as);
912 		return (error);
913 	}
914 
915 	switch (minor) {
916 	case M_MEM:
917 		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
918 		if ((flags & MAP_TYPE) != MAP_SHARED) {
919 			as_rangeunlock(as);
920 			return (EINVAL);
921 		}
922 
923 		/*
924 		 * Check to ensure that the entire range is
925 		 * legal and we are not trying to map in
926 		 * more than the device will let us.
927 		 */
928 		for (i = 0; i < len; i += PAGESIZE) {
929 			if (mmmmap(dev, off + i, maxprot) == -1) {
930 				as_rangeunlock(as);
931 				return (ENXIO);
932 			}
933 		}
934 
935 		/*
936 		 * Use seg_dev segment driver for /dev/mem mapping.
937 		 */
938 		dev_a.mapfunc = mmmmap;
939 		dev_a.dev = dev;
940 		dev_a.offset = off;
941 		dev_a.type = (flags & MAP_TYPE);
942 		dev_a.prot = (uchar_t)prot;
943 		dev_a.maxprot = (uchar_t)maxprot;
944 		dev_a.hat_attr = 0;
945 
946 		/*
947 		 * Make /dev/mem mappings non-consistent since we can't
948 		 * alias pages that don't have page structs behind them,
949 		 * such as kernel stack pages. If someone mmap()s a kernel
950 		 * stack page and if we give them a tte with cv, a line from
951 		 * that page can get into both pages of the spitfire d$.
952 		 * But snoop from another processor will only invalidate
953 		 * the first page. This later caused kernel (xc_attention)
954 		 * to go into an infinite loop at pil 13 and no interrupts
955 		 * could come in. See 1203630.
956 		 *
957 		 */
958 		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
959 		dev_a.devmap_data = NULL;
960 
961 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
962 		break;
963 
964 	case M_ZERO:
965 		/*
966 		 * Use seg_vn segment driver for /dev/zero mapping.
967 		 * Passing in a NULL amp gives us the "cloning" effect.
968 		 */
969 		vn_a.vp = NULL;
970 		vn_a.offset = 0;
971 		vn_a.type = (flags & MAP_TYPE);
972 		vn_a.prot = prot;
973 		vn_a.maxprot = maxprot;
974 		vn_a.flags = flags & ~MAP_TYPE;
975 		vn_a.cred = cred;
976 		vn_a.amp = NULL;
977 		vn_a.szc = 0;
978 		vn_a.lgrp_mem_policy_flags = 0;
979 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
980 		break;
981 
982 	case M_KMEM:
983 	case M_ALLKMEM:
984 		/* No longer supported with KPR. */
985 		error = ENXIO;
986 		break;
987 
988 	case M_NULL:
989 		/*
990 		 * Use seg_dev segment driver for /dev/null mapping.
991 		 */
992 		dev_a.mapfunc = mmmmap;
993 		dev_a.dev = dev;
994 		dev_a.offset = off;
995 		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
996 		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
997 		dev_a.hat_attr = 0;
998 		dev_a.hat_flags = 0;
999 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
1000 		break;
1001 
1002 	default:
1003 		error = ENXIO;
1004 	}
1005 
1006 	as_rangeunlock(as);
1007 	return (error);
1008 }
1009 
1010 static struct cb_ops mm_cb_ops = {
1011 	mmopen,			/* open */
1012 	nulldev,		/* close */
1013 	nodev,			/* strategy */
1014 	nodev,			/* print */
1015 	nodev,			/* dump */
1016 	mmread,			/* read */
1017 	mmwrite,		/* write */
1018 	mmioctl,		/* ioctl */
1019 	nodev,			/* devmap */
1020 	mmmmap,			/* mmap */
1021 	mmsegmap,		/* segmap */
1022 	mmchpoll,		/* poll */
1023 	mmpropop,		/* prop_op */
1024 	0,			/* streamtab  */
1025 	D_NEW | D_MP | D_64BIT | D_U64BIT
1026 };
1027 
1028 static struct dev_ops mm_ops = {
1029 	DEVO_REV,		/* devo_rev, */
1030 	0,			/* refcnt  */
1031 	mm_info,		/* get_dev_info */
1032 	nulldev,		/* identify */
1033 	nulldev,		/* probe */
1034 	mm_attach,		/* attach */
1035 	nodev,			/* detach */
1036 	nodev,			/* reset */
1037 	&mm_cb_ops,		/* driver operations */
1038 	(struct bus_ops *)0,	/* bus operations */
1039 	NULL,			/* power */
1040 	ddi_quiesce_not_needed,		/* quiesce */
1041 };
1042 
1043 static struct modldrv modldrv = {
1044 	&mod_driverops, "memory driver", &mm_ops,
1045 };
1046 
1047 static struct modlinkage modlinkage = {
1048 	MODREV_1, &modldrv, NULL
1049 };
1050 
1051 int
1052 _init(void)
1053 {
1054 	return (mod_install(&modlinkage));
1055 }
1056 
1057 int
1058 _info(struct modinfo *modinfop)
1059 {
1060 	return (mod_info(&modlinkage, modinfop));
1061 }
1062 
1063 int
1064 _fini(void)
1065 {
1066 	return (mod_remove(&modlinkage));
1067 }
1068 
1069 static int
1070 mm_kstat_update(kstat_t *ksp, int rw)
1071 {
1072 	struct memlist *pmem;
1073 	uint_t count;
1074 
1075 	if (rw == KSTAT_WRITE)
1076 		return (EACCES);
1077 
1078 	count = 0;
1079 	memlist_read_lock();
1080 	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
1081 		count++;
1082 	}
1083 	memlist_read_unlock();
1084 
1085 	ksp->ks_ndata = count;
1086 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1087 
1088 	return (0);
1089 }
1090 
1091 static int
1092 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1093 {
1094 	struct memlist *pmem;
1095 	struct memunit {
1096 		uint64_t address;
1097 		uint64_t size;
1098 	} *kspmem;
1099 
1100 	if (rw == KSTAT_WRITE)
1101 		return (EACCES);
1102 
1103 	ksp->ks_snaptime = gethrtime();
1104 
1105 	kspmem = (struct memunit *)buf;
1106 	memlist_read_lock();
1107 	for (pmem = phys_install; pmem != NULL;
1108 	    pmem = pmem->ml_next, kspmem++) {
1109 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1110 			break;
1111 		kspmem->address = pmem->ml_address;
1112 		kspmem->size = pmem->ml_size;
1113 	}
1114 	memlist_read_unlock();
1115 
1116 	return (0);
1117 }
1118