/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a page count.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes on
 * 32-bit machines with large physical memory.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
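/* E.g., with 4 KB pages (_pageshift == 12), BTOP(0x100000000ULL) == 0x100000. */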

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allkmem writes, we log information that might be useful in the
 * event that a write is errant (that is, due to operator error) and induces
 * a later problem.  Note that (in particular) in the event of such
 * operator-induced corruption, a search over the kernel address space for
 * the corrupted address will yield the ring buffer entry that recorded the
 * write.  And should it seem baroque or otherwise unnecessary, yes, we need
 * this kind of auditing facility and yes, we learned that the hard way:
 * disturbingly, there exist recommendations for "tuning" the system that
 * involve writing to kernel memory addresses via the kernel debugger, and --
 * as we discovered -- these can easily be applied incorrectly or unsafely,
 * yielding an entirely undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}

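/*
 * Attach: create the privileged minor nodes (mem, kmem, allkmem, null and
 * zero), reserve one page of kernel VA (mm_map) for transient mappings in
 * mmio(), and publish the "phys_installed" kstat describing the
 * phys_install memlist.
 */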
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

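/*
 * Open is permitted only on the known minors, and only as a character
 * device; per-device privilege requirements were established by the
 * privileged minor nodes created in mm_attach().
 */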
/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead	mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * Implement a zero size to reduce overhead (avoids two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

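/*
 * Transfer at most one page between the caller and physical page 'pfn'.
 * When the pfn is ordinary memory and kpm is enabled, use a kpm mapping;
 * otherwise, temporarily hat_devload() the page at mm_map.  Non-memory
 * pfns (e.g. device physical addresses) are accessed only when 'allowio'
 * is set, via ddi_peekpokeio().
 */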
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

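/*
 * Returns nonzero if the segment backing 'va' reports the
 * S_CAPABILITY_NOMINFLT capability; on sparc this gates whether mmrw()
 * attempts as_pagelock() on a kernel address (see NEED_LOCK_KVADDR below).
 */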
static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef	__sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

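/*
 * Common read/write path for all of the memory minors.  Each trip around
 * the loop moves at most one page (or consumes one iovec for /dev/zero
 * and /dev/null).
 */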
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
			{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure.  Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context, so we avoid it
			 * via the try_lock check above.  Some day, when
			 * kernel page locking gets redesigned, all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
			}
			break;

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;
		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());
	}

	return (EINVAL);
}

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) != 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address, return information about the associated
 * memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t	name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) != 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for:
 *	libkvm, to support kvm_physaddr();
 *	FMA, for page_retire() and memory attribute information.
 */
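/*
 * Illustrative sketch (hypothetical user-level code, not part of this
 * driver) of how a kvm_physaddr()-style consumer might drive MEM_VTOP.
 * MEM_VTOP is honored only on the kmem minor, and m_as == NULL means
 * "the calling process's address space":
 *
 *	uintptr_t va = ...;
 *	mem_vtop_t mv;
 *	int fd = open("/dev/kmem", O_RDONLY);
 *	mv.m_as = NULL;
 *	mv.m_va = (void *)va;
 *	if (fd != -1 && ioctl(fd, MEM_VTOP, &mv) == 0)
 *		pa = ((uint64_t)mv.m_pfn << pageshift) | (va & (pagesize - 1));
 *
 * where 'va', 'pageshift' and 'pagesize' are the caller's (hypothetical)
 * inputs.
 */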
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

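/*
 * mmap entry point: translate a device offset into a page frame number.
 * Only /dev/mem offsets backed by pages in phys_install are mappable;
 * everything else returns -1.
 */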
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;
	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
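/*
 * For instance (an illustrative user-level sketch, not one of this file's
 * interfaces), mapping /dev/zero yields anonymous zero-fill memory:
 *
 *	int fd = open("/dev/zero", O_RDWR);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
 *
 * which lands in the M_ZERO case below: seg_vn with a NULL amp.
 */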
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and that we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use the seg_dev segment driver for the /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages.  If someone mmap()s a kernel
		 * stack page and we give them a tte with cv set, a line from
		 * that page can get into both pages of the spitfire d$,
		 * but a snoop from another processor will only invalidate
		 * the first page.  This once caused the kernel (in
		 * xc_attention) to go into an infinite loop at pil 13 with
		 * no interrupts able to come in.  See bug 1203630.
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use the seg_vn segment driver for the /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use the seg_dev segment driver for the /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

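/*
 * kstat update: size the raw "phys_installed" kstat as one
 * { address, size } pair per phys_install memlist entry.
 */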
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

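/*
 * kstat snapshot: copy the phys_install memlist into the caller's buffer,
 * bounded by the ks_data_size computed in mm_kstat_update().
 */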
static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}