xref: /illumos-gate/usr/src/uts/common/io/mem.c (revision d90554eb1da54eb443177f39ed0e119805d34a46)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Memory special file
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#ifdef __sparc
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#endif	/* __sparc */

/*
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
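
/*
 * For example (an illustrative figure, not from the source): with a 4K
 * page size (_pageshift == 12), BTOP(0x200000000ULL) yields 0x200000
 * pages, whereas the same length passed through the 32-bit DDI btop on
 * a 32-bit kernel would be truncated before the shift.
 */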

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
static int mm_read_mem_page(intptr_t data, mem_page_t *mpage);
static int mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl);
static int mm_get_paddr(nvlist_t *nvl, uint64_t *paddr);

/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	return (0);
}

struct pollhead	mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * Implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
{
	int error = 0;
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);

	mutex_enter(&mm_lock);
	hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
	    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);

	if (!pf_is_memory(pfn)) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(&mm_map[pageoff], nbytes, rw, uio);

	hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	mutex_exit(&mm_lock);
	return (error);
}

#ifdef	__sparc

static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL) ? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as, &as->a_lock);

	return (i);
}

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0);
			break;

		case M_KMEM:
		case M_ALLKMEM:
			{
			page_t **ppp;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure.  Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context, so we avoid it
			 * using the try_lock check above.  Some day, when
			 * kernel page locking gets redesigned, all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
			}

			break;

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write; fall through to the NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
		return (EFAULT);
	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else if (mem_vtop.m_as == NULL) {
		return (EIO);
	} else {
		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as, &as->a_lock);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);
	if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
		return (EFAULT);

	return (0);
}
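
/*
 * Illustrative sketch only (not part of the driver): a hypothetical
 * userland caller fills in a mem_vtop_t and issues MEM_VTOP against
 * /dev/kmem, the only minor on which mmioctl() below accepts it:
 *
 *	mem_vtop_t mv;
 *	int fd = open("/dev/kmem", O_RDONLY);
 *
 *	mv.m_as = target_as;	(an as pointer, e.g. &kas for the kernel)
 *	mv.m_va = vaddr;	(virtual address to translate)
 *	if (ioctl(fd, MEM_VTOP, &mv) == 0)
 *		pa = ptob(mv.m_pfn) + ((uintptr_t)vaddr & PAGEOFFSET);
 *
 * In practice libkvm's kvm_physaddr() wraps this ioctl.
 */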

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());
	}

	return (EINVAL);
}
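
/*
 * Illustrative sketch only: these commands arrive via /dev/mem (see
 * mmioctl() below).  A hypothetical FMA consumer checking whether a
 * page has been retired might do:
 *
 *	uint64_t pa = 0x12345000;	(page-aligned physical address)
 *	int fd = open("/dev/mem", O_RDONLY);
 *	int rc = ioctl(fd, MEM_PAGE_ISRETIRED, &pa);
 *
 * where rc carries page_retire_check()'s result for that address.
 */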

/*
 * Given a mem-scheme FMRI for a page, execute the given page retire
 * command on it.
 */
static int
mmioctl_page_fmri_retire(int cmd, intptr_t data)
{
	mem_page_t mpage;
	uint64_t pa;
	nvlist_t *nvl;
	int err;

	if ((err = mm_read_mem_page(data, &mpage)) != 0)
		return (err);

	if ((err = mm_get_mem_fmri(&mpage, &nvl)) != 0)
		return (err);

	if ((err = mm_get_paddr(nvl, &pa)) != 0) {
		nvlist_free(nvl);
		return (err);
	}

	nvlist_free(nvl);

	switch (cmd) {
	case MEM_PAGE_FMRI_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_FMRI_RETIRE:
		return (page_retire(pa, PR_FMA));
	}

	return (EINVAL);
}
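
/*
 * Illustrative sketch only: the FMRI is handed in as a packed nvlist.
 * A hypothetical userland consumer would build and pack it with
 * libnvpair before issuing the ioctl:
 *
 *	char *buf = NULL;
 *	size_t sz = 0;
 *	mem_page_t mp;
 *
 *	(void) nvlist_pack(fmri, &buf, &sz, NV_ENCODE_NATIVE, 0);
 *	mp.m_fmri = buf;
 *	mp.m_fmrisz = sz;
 *	(void) ioctl(fd, MEM_PAGE_FMRI_ISRETIRED, &mp);
 *
 * mm_get_mem_fmri() below unpacks the buffer with nvlist_unpack().
 */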

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) != 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into CPU-specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address, return information about the associated
 * memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t	name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) != 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into CPU-specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

	case MEM_PAGE_FMRI_RETIRE:
	case MEM_PAGE_FMRI_ISRETIRED:
		return (mmioctl_page_fmri_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
			if (pf >= BTOP(pmem->address) &&
			    pf < BTOP(pmem->address + pmem->size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;
	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	if ((flags & MAP_FIXED) == 0) {
		/*
		 * No need to worry about vac alignment on /dev/zero
		 * since this is a "clone" object that doesn't yet exist.
		 */
		map_addr(addrp, len, (offset_t)off,
		    (minor == M_MEM) || (minor == M_KMEM), flags);

		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address -
		 * blow away any previous mappings.
		 */
		(void) as_unmap(as, *addrp, len);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and that we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use the seg_dev segment driver for the /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages. If someone mmap()s a kernel
		 * stack page and if we give him a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But snoop from another processor will only invalidate
		 * the first page. This later caused the kernel (xc_attention)
		 * to go into an infinite loop at pil 13 and no interrupts
		 * could come in. See 1203630.
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use the seg_vn segment driver for the /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use the seg_dev segment driver for the /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}
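
/*
 * For illustration: the usual consumer of the M_ZERO path above is
 * anonymous memory, e.g. a hypothetical userland caller doing
 *
 *	int fd = open("/dev/zero", O_RDWR);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE, fd, 0);
 *
 * which lands in the M_ZERO case and is backed by segvn with a NULL
 * amp, giving each mapping its own zero-filled anonymous pages.
 */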

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0	/* bus operations */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver %I%", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->address;
		kspmem->size = pmem->size;
	}
	memlist_read_unlock();

	return (0);
}
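
/*
 * For illustration (not part of the driver): the snapshot above is
 * exported as the raw kstat mm:0:phys_installed.  A hypothetical
 * userland reader using libkstat might do:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
 *	(void) kstat_read(kc, ksp, NULL);
 *
 * after which ksp->ks_data holds ks_ndata { address, size } pairs
 * laid out as struct memunit above.
 */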

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}

/*
 * Read a mem_page_t from user-space and store it in the mem_page_t
 * pointed to by the mpage argument.
 */
static int
mm_read_mem_page(intptr_t data, mem_page_t *mpage)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mpage, sizeof (mem_page_t)) != 0)
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		mem_page32_t	mpage32;

		if (copyin((void *)data, &mpage32, sizeof (mem_page32_t)) != 0)
			return (EFAULT);

		mpage->m_fmri = (caddr_t)(uintptr_t)mpage32.m_fmri;
		mpage->m_fmrisz = mpage32.m_fmrisz;
	}
#endif	/* _SYSCALL32 */

	return (0);
}

/*
 * Expand an FMRI from a mem_page_t.
 */
static int
mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl)
{
	char *buf;
	int err;

	if (mpage->m_fmri == NULL || mpage->m_fmrisz > MEM_FMRI_MAX_BUFSIZE)
		return (EINVAL);

	buf = kmem_alloc(mpage->m_fmrisz, KM_SLEEP);
	if (copyin(mpage->m_fmri, buf, mpage->m_fmrisz) != 0) {
		kmem_free(buf, mpage->m_fmrisz);
		return (EFAULT);
	}

	err = nvlist_unpack(buf, mpage->m_fmrisz, nvl, KM_SLEEP);
	kmem_free(buf, mpage->m_fmrisz);

	return (err);
}

static int
mm_get_paddr(nvlist_t *nvl, uint64_t *paddr)
{
	uint8_t version;
	uint64_t pa;
	char *scheme;
#ifdef __sparc
	uint64_t offset;
	char *unum;
	char **serids;
	uint_t nserids;
	int err;
#endif

	/* Verify FMRI scheme name and version number */
	if ((nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &scheme) != 0) ||
	    (strcmp(scheme, FM_FMRI_SCHEME_MEM) != 0) ||
	    (nvlist_lookup_uint8(nvl, FM_VERSION, &version) != 0) ||
	    version > FM_MEM_SCHEME_VERSION) {
		return (EINVAL);
	}

	/*
	 * There are two ways a physical address can be obtained from a mem
	 * scheme FMRI.  One way is to use the "offset" and "serial"
	 * members, if they are present, together with the "unum" member to
	 * calculate a physical address.  This is the preferred way since
	 * it is independent of possible changes to the programming of
	 * underlying hardware registers that may change the physical address.
	 * If the "offset" member is not present, then the address is
	 * retrieved from the "physaddr" member.
	 */
#ifdef __sparc
	if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_OFFSET, &offset) != 0) {
		if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) !=
		    0) {
			return (EINVAL);
		}
	} else if (nvlist_lookup_string(nvl, FM_FMRI_MEM_UNUM, &unum) != 0 ||
	    nvlist_lookup_string_array(nvl, FM_FMRI_MEM_SERIAL_ID, &serids,
	    &nserids) != 0) {
		return (EINVAL);
	} else {
		if ((err = cpu_get_mem_addr(unum, serids[0], offset, &pa)) != 0)
			return (err);
	}
#else /* __i386, __amd64 */
	if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) != 0)
		return (EINVAL);
#endif /* __sparc */

	*paddr = pa;
	return (0);
}
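
/*
 * Illustrative sketch only (every name shown appears above): on x86 the
 * minimal nvlist that satisfies mm_get_paddr() carries the scheme,
 * version, and "physaddr" members.  A hypothetical userland caller
 * might build it as:
 *
 *	nvlist_t *nvl;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_string(nvl, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM);
 *	(void) nvlist_add_uint8(nvl, FM_VERSION, FM_MEM_SCHEME_VERSION);
 *	(void) nvlist_add_uint64(nvl, FM_FMRI_MEM_PHYSADDR, 0x12345000ULL);
 *
 * On sparc the preferred form instead carries "unum", "serial", and
 * "offset", from which cpu_get_mem_addr() recomputes the address.
 */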
1145