/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a page count.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes for
 * 32-bit machines with large physical memory.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
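
/*
 * A worked example (illustrative only, assuming 4K pages, so
 * _pageshift == 12): a 5 GB length, which would overflow a 32-bit
 * btop() argument, still converts correctly:
 *
 *	BTOP(0x140000000ULL) == (pgcnt_t)0x140000	(1310720 pages)
 */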

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}
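
/*
 * Illustrative only: to chase an errant write with mdb -k, one might
 * search the kernel for references to the corrupted address and see
 * which hits land inside mm_kmemlog.  The dcmd names below are stock
 * mdb; the addresses are hypothetical placeholders:
 *
 *	> <corrupted-address>::kgrep
 *	> <hit-address>::whatis
 *	> <entry-address>::print mm_logentry_t
 */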

/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead	mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL) ? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef	__sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
			{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure, so we can't simply check its
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() in this context, so we avoid it via
			 * the try_lock check set up above.  Some day, when
			 * kernel page locking gets redesigned, all this muck
			 * can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
			}

			break;

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}
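
/*
 * A minimal userland sketch (illustrative, not part of this driver):
 * reading a physical page through the M_MEM arm of mmrw() above.  The
 * file offset is the physical address, which must lie in phys_install;
 * `pa' is a hypothetical page-aligned physical address, and suitable
 * privilege is assumed.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	char buf[4096];
 *	int fd = open("/dev/mem", O_RDONLY);
 *
 *	if (fd >= 0 && pread(fd, buf, sizeof (buf), (off_t)pa) < 0)
 *		perror("pread");
 */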

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}
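
/*
 * Example consumer (a sketch of the libkvm usage; illustrative only):
 * MEM_VTOP must be issued on /dev/kmem (see mmioctl() below), and a
 * NULL m_as means "the calling process's address space", per the code
 * above.  `some_vaddr' is a hypothetical user VA.
 *
 *	mem_vtop_t vtop;
 *	int fd = open("/dev/kmem", O_RDONLY);
 *
 *	vtop.m_as = NULL;
 *	vtop.m_va = some_vaddr;
 *	if (fd >= 0 && ioctl(fd, MEM_VTOP, &vtop) == 0)
 *		(void) printf("pfn = %lx\n", (ulong_t)vtop.m_pfn);
 */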

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}
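
/*
 * Example consumer (illustrative; the kind of request an FMA retire
 * agent issues): these ioctls are accepted on /dev/mem (see mmioctl()
 * below) and take a uint64_t physical address.  `pa' below is a
 * hypothetical faulty-page PA.
 *
 *	uint64_t pa = ...;
 *	int fd = open("/dev/mem", O_RDONLY);
 *
 *	if (fd >= 0 && ioctl(fd, MEM_PAGE_RETIRE, &pa) != 0)
 *		perror("MEM_PAGE_RETIRE");
 */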

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into CPU-specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address, return information about the
 * associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial ID.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t	name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into CPU-specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use seg_dev segment driver for /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages.  If someone mmap()s a kernel
		 * stack page and we give him a TTE with the CV (virtually
		 * cacheable) bit set, a line from that page can get into
		 * both pages of the spitfire d$, but a snoop from another
		 * processor will only invalidate the first page.  This once
		 * caused the kernel (xc_attention) to go into an infinite
		 * loop at PIL 13 with no interrupts able to come in.  See
		 * bug 1203630.
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use seg_vn segment driver for /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use seg_dev segment driver for /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}
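
/*
 * Illustrative userland counterpart: mapping /dev/zero is the classic
 * route to anonymous memory and exercises the M_ZERO arm above (seg_vn
 * with a NULL amp, hence the "cloning" effect).
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/zero", O_RDWR);
 *	char *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE, fd, 0);
 *
 *	if (p != MAP_FAILED)
 *		p[0] = 1;	(private, zero-filled anonymous page)
 */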

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}
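
/*
 * Example consumer (a libkstat sketch; illustrative only): walking the
 * "mm:0:phys_installed" kstat exposed above.  Per mm_kstat_update(),
 * the data is ks_ndata (address, size) pairs of uint64_ts.
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		uint64_t *mem = ksp->ks_data;
 *		uint_t i;
 *
 *		for (i = 0; i < ksp->ks_ndata; i++)
 *			(void) printf("%llx: %llx bytes\n",
 *			    (u_longlong_t)mem[2 * i],
 *			    (u_longlong_t)mem[2 * i + 1]);
 *	}
 */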

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}
1105