xref: /titanic_51/usr/src/uts/common/io/mem.c (revision aec315a678aed1ef0b0c48919c1741990b929cfb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Memory special file
31  */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/user.h>
36 #include <sys/buf.h>
37 #include <sys/systm.h>
38 #include <sys/cred.h>
39 #include <sys/vm.h>
40 #include <sys/uio.h>
41 #include <sys/mman.h>
42 #include <sys/kmem.h>
43 #include <vm/seg.h>
44 #include <vm/page.h>
45 #include <sys/stat.h>
46 #include <sys/vmem.h>
47 #include <sys/memlist.h>
48 #include <sys/bootconf.h>
49 
50 #include <vm/seg_vn.h>
51 #include <vm/seg_dev.h>
52 #include <vm/seg_kmem.h>
53 #include <vm/seg_kp.h>
54 #include <vm/seg_kpm.h>
55 #include <vm/hat.h>
56 
57 #include <sys/conf.h>
58 #include <sys/mem.h>
59 #include <sys/types.h>
60 #include <sys/conf.h>
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/errno.h>
64 #include <sys/modctl.h>
65 #include <sys/memlist.h>
66 #include <sys/ddi.h>
67 #include <sys/sunddi.h>
68 #include <sys/debug.h>
69 #include <sys/fm/protocol.h>
70 
71 #ifdef __sparc
72 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
73 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
74     uint64_t *, int *, int *, int *);
75 extern size_t cpu_get_name_bufsize(void);
76 extern int cpu_get_mem_sid(char *, char *, int, int *);
77 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
78 #endif	/* __sparc */
79 
80 /*
81  * Turn a byte length into a pagecount.  The DDI btop takes a
82  * 32-bit size on 32-bit machines, this handles 64-bit sizes for
83  * large physical-memory 32-bit machines.
84  */
85 #define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
86 
87 static kmutex_t mm_lock;
88 static caddr_t mm_map;
89 
90 static dev_info_t *mm_dip;	/* private copy of devinfo pointer */
91 
92 static int mm_kmem_io_access;
93 
94 static int mm_kstat_update(kstat_t *ksp, int rw);
95 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
96 
97 static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
98 static int mm_read_mem_page(intptr_t data, mem_page_t *mpage);
99 static int mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl);
100 static int mm_get_paddr(nvlist_t *nvl, uint64_t *paddr);
101 
102 /*ARGSUSED1*/
103 static int
104 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
105 {
106 	int i;
107 	struct mem_minor {
108 		char *name;
109 		minor_t minor;
110 		int privonly;
111 		const char *rdpriv;
112 		const char *wrpriv;
113 		mode_t priv_mode;
114 	} mm[] = {
115 		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
116 		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
117 		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
118 		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
119 		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
120 	};
121 	kstat_t *ksp;
122 
123 	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
124 	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
125 
126 	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
127 		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
128 		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
129 		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
130 		    DDI_FAILURE) {
131 			ddi_remove_minor_node(devi, NULL);
132 			return (DDI_FAILURE);
133 		}
134 	}
135 
136 	mm_dip = devi;
137 
138 	ksp = kstat_create("mm", 0, "phys_installed", "misc",
139 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
140 	if (ksp != NULL) {
141 		ksp->ks_update = mm_kstat_update;
142 		ksp->ks_snapshot = mm_kstat_snapshot;
143 		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
144 		kstat_install(ksp);
145 	}
146 
147 	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
148 	    "kmem_io_access", 0);
149 
150 	return (DDI_SUCCESS);
151 }
152 
153 /*ARGSUSED*/
154 static int
155 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
156 {
157 	register int error;
158 
159 	switch (infocmd) {
160 	case DDI_INFO_DEVT2DEVINFO:
161 		*result = (void *)mm_dip;
162 		error = DDI_SUCCESS;
163 		break;
164 	case DDI_INFO_DEVT2INSTANCE:
165 		*result = (void *)0;
166 		error = DDI_SUCCESS;
167 		break;
168 	default:
169 		error = DDI_FAILURE;
170 	}
171 	return (error);
172 }
173 
174 /*ARGSUSED1*/
175 static int
176 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
177 {
178 	switch (getminor(*devp)) {
179 	case M_NULL:
180 	case M_ZERO:
181 	case M_MEM:
182 	case M_KMEM:
183 	case M_ALLKMEM:
184 		/* standard devices */
185 		break;
186 
187 	default:
188 		/* Unsupported or unknown type */
189 		return (EINVAL);
190 	}
191 	return (0);
192 }
193 
194 struct pollhead	mm_pollhd;
195 
196 /*ARGSUSED*/
197 static int
198 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
199     struct pollhead **phpp)
200 {
201 	switch (getminor(dev)) {
202 	case M_NULL:
203 	case M_ZERO:
204 	case M_MEM:
205 	case M_KMEM:
206 	case M_ALLKMEM:
207 		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
208 			POLLWRNORM | POLLRDBAND | POLLWRBAND);
209 		/*
210 		 * A non NULL pollhead pointer should be returned in case
211 		 * user polls for 0 events.
212 		 */
213 		*phpp = !anyyet && !*reventsp ?
214 		    &mm_pollhd : (struct pollhead *)NULL;
215 		return (0);
216 	default:
217 		/* no other devices currently support polling */
218 		return (ENXIO);
219 	}
220 }
221 
222 static int
223 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
224     char *name, caddr_t valuep, int *lengthp)
225 {
226 	/*
227 	 * implement zero size to reduce overhead (avoid two failing
228 	 * property lookups per stat).
229 	 */
230 	return (ddi_prop_op_size(dev, dip, prop_op,
231 	    flags, name, valuep, lengthp, 0));
232 }
233 
234 static int
235 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
236 {
237 	int error = 0;
238 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
239 	    (size_t)uio->uio_iov->iov_len);
240 
241 	mutex_enter(&mm_lock);
242 	hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
243 	    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
244 	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
245 
246 	if (!pf_is_memory(pfn)) {
247 		if (allowio) {
248 			size_t c = uio->uio_iov->iov_len;
249 
250 			if (ddi_peekpokeio(NULL, uio, rw,
251 			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
252 			    sizeof (int32_t)) != DDI_SUCCESS)
253 				error = EFAULT;
254 		} else
255 			error = EIO;
256 	} else
257 		error = uiomove(&mm_map[pageoff], nbytes, rw, uio);
258 
259 	hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
260 	mutex_exit(&mm_lock);
261 	return (error);
262 }
263 
#ifdef	__sparc

/*
 * Report whether the segment containing va in address space as has
 * the S_CAPABILITY_NOMINFLT capability.  Returns 0 when no segment
 * covers va.  The address space lock is held as reader for the lookup.
 */
static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int capable = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, va);
	if (seg != NULL)
		capable = SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT);
	AS_LOCK_EXIT(as, &as->a_lock);

	return (capable);
}

/* On sparc, ask the kernel address space whether kva should be locked. */
#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

/* Never attempt as_pagelock() on these platforms. */
#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */
287 
288 /*ARGSUSED3*/
289 static int
290 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
291 {
292 	pfn_t v;
293 	struct iovec *iov;
294 	int error = 0;
295 	size_t c;
296 	ssize_t oresid = uio->uio_resid;
297 	minor_t minor = getminor(dev);
298 
299 	while (uio->uio_resid > 0 && error == 0) {
300 		iov = uio->uio_iov;
301 		if (iov->iov_len == 0) {
302 			uio->uio_iov++;
303 			uio->uio_iovcnt--;
304 			if (uio->uio_iovcnt < 0)
305 				panic("mmrw");
306 			continue;
307 		}
308 		switch (minor) {
309 
310 		case M_MEM:
311 			memlist_read_lock();
312 			if (!address_in_memlist(phys_install,
313 			    (uint64_t)uio->uio_loffset, 1)) {
314 				memlist_read_unlock();
315 				error = EFAULT;
316 				break;
317 			}
318 			memlist_read_unlock();
319 
320 			v = BTOP((u_offset_t)uio->uio_loffset);
321 			error = mmio(uio, rw, v,
322 			    uio->uio_loffset & PAGEOFFSET, 0);
323 			break;
324 
325 		case M_KMEM:
326 		case M_ALLKMEM:
327 			{
328 			page_t **ppp;
329 			caddr_t vaddr = (caddr_t)uio->uio_offset;
330 			int try_lock = NEED_LOCK_KVADDR(vaddr);
331 			int locked = 0;
332 
333 			/*
334 			 * If vaddr does not map a valid page, as_pagelock()
335 			 * will return failure. Hence we can't check the
336 			 * return value and return EFAULT here as we'd like.
337 			 * seg_kp and seg_kpm do not properly support
338 			 * as_pagelock() for this context so we avoid it
339 			 * using the try_lock set check above.  Some day when
340 			 * the kernel page locking gets redesigned all this
341 			 * muck can be cleaned up.
342 			 */
343 			if (try_lock)
344 				locked = (as_pagelock(&kas, &ppp, vaddr,
345 				    PAGESIZE, S_WRITE) == 0);
346 
347 			v = hat_getpfnum(kas.a_hat,
348 			    (caddr_t)(uintptr_t)uio->uio_loffset);
349 			if (v == PFN_INVALID) {
350 				if (locked)
351 					as_pageunlock(&kas, ppp, vaddr,
352 					    PAGESIZE, S_WRITE);
353 				error = EFAULT;
354 				break;
355 			}
356 
357 			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
358 			    minor == M_ALLKMEM || mm_kmem_io_access);
359 			if (locked)
360 				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
361 				    S_WRITE);
362 			}
363 
364 			break;
365 
366 		case M_ZERO:
367 			if (rw == UIO_READ) {
368 				label_t ljb;
369 
370 				if (on_fault(&ljb)) {
371 					no_fault();
372 					error = EFAULT;
373 					break;
374 				}
375 				uzero(iov->iov_base, iov->iov_len);
376 				no_fault();
377 				uio->uio_resid -= iov->iov_len;
378 				uio->uio_loffset += iov->iov_len;
379 				break;
380 			}
381 			/* else it's a write, fall through to NULL case */
382 			/*FALLTHROUGH*/
383 
384 		case M_NULL:
385 			if (rw == UIO_READ)
386 				return (0);
387 			c = iov->iov_len;
388 			iov->iov_base += c;
389 			iov->iov_len -= c;
390 			uio->uio_loffset += c;
391 			uio->uio_resid -= c;
392 			break;
393 
394 		}
395 	}
396 	return (uio->uio_resid == oresid ? error : 0);
397 }
398 
399 static int
400 mmread(dev_t dev, struct uio *uio, cred_t *cred)
401 {
402 	return (mmrw(dev, uio, UIO_READ, cred));
403 }
404 
405 static int
406 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
407 {
408 	return (mmrw(dev, uio, UIO_WRITE, cred));
409 }
410 
411 /*
412  * Private ioctl for libkvm to support kvm_physaddr().
413  * Given an address space and a VA, compute the PA.
414  */
415 static int
416 mmioctl_vtop(intptr_t data)
417 {
418 	mem_vtop_t mem_vtop;
419 	proc_t *p;
420 	pfn_t pfn = (pfn_t)PFN_INVALID;
421 	pid_t pid = 0;
422 	struct as *as;
423 	struct seg *seg;
424 
425 	if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
426 		return (EFAULT);
427 	if (mem_vtop.m_as == &kas) {
428 		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
429 	} else if (mem_vtop.m_as == NULL) {
430 		return (EIO);
431 	} else {
432 		mutex_enter(&pidlock);
433 		for (p = practive; p != NULL; p = p->p_next) {
434 			if (p->p_as == mem_vtop.m_as) {
435 				pid = p->p_pid;
436 				break;
437 			}
438 		}
439 		mutex_exit(&pidlock);
440 		if (p == NULL)
441 			return (EIO);
442 		p = sprlock(pid);
443 		if (p == NULL)
444 			return (EIO);
445 		as = p->p_as;
446 		if (as == mem_vtop.m_as) {
447 			mutex_exit(&p->p_lock);
448 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
449 			for (seg = AS_SEGFIRST(as); seg != NULL;
450 			    seg = AS_SEGNEXT(as, seg))
451 				if ((uintptr_t)mem_vtop.m_va -
452 				    (uintptr_t)seg->s_base < seg->s_size)
453 					break;
454 			if (seg != NULL)
455 				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
456 			AS_LOCK_EXIT(as, &as->a_lock);
457 			mutex_enter(&p->p_lock);
458 		}
459 		sprunlock(p);
460 	}
461 	mem_vtop.m_pfn = pfn;
462 	if (pfn == PFN_INVALID)
463 		return (EIO);
464 	if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
465 		return (EFAULT);
466 
467 	return (0);
468 }
469 
470 /*
471  * Given a PA, execute the given page retire command on it.
472  */
473 static int
474 mmioctl_page_retire(int cmd, intptr_t data)
475 {
476 	extern int page_retire_test(void);
477 	uint64_t pa;
478 
479 	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
480 		return (EFAULT);
481 	}
482 
483 	switch (cmd) {
484 	case MEM_PAGE_ISRETIRED:
485 		return (page_retire_check(pa, NULL));
486 
487 	case MEM_PAGE_UNRETIRE:
488 		return (page_unretire(pa));
489 
490 	case MEM_PAGE_RETIRE:
491 		return (page_retire(pa, PR_FMA));
492 
493 	case MEM_PAGE_RETIRE_MCE:
494 		return (page_retire(pa, PR_MCE));
495 
496 	case MEM_PAGE_RETIRE_UE:
497 		return (page_retire(pa, PR_UE));
498 
499 	case MEM_PAGE_GETERRORS:
500 		{
501 			uint64_t page_errors;
502 			int rc = page_retire_check(pa, &page_errors);
503 			if (copyout(&page_errors, (void *)data,
504 			    sizeof (uint64_t))) {
505 				return (EFAULT);
506 			}
507 			return (rc);
508 		}
509 
510 	case MEM_PAGE_RETIRE_TEST:
511 		return (page_retire_test());
512 
513 	}
514 
515 	return (EINVAL);
516 }
517 
518 /*
519  * Given a mem-scheme FMRI for a page, execute the given page retire
520  * command on it.
521  */
522 static int
523 mmioctl_page_fmri_retire(int cmd, intptr_t data)
524 {
525 	mem_page_t mpage;
526 	uint64_t pa;
527 	nvlist_t *nvl;
528 	int err;
529 
530 	if ((err = mm_read_mem_page(data, &mpage)) < 0)
531 		return (err);
532 
533 	if ((err = mm_get_mem_fmri(&mpage, &nvl)) < 0)
534 		return (err);
535 
536 	if ((err = mm_get_paddr(nvl, &pa)) < 0) {
537 		nvlist_free(nvl);
538 		return (err);
539 	}
540 
541 	nvlist_free(nvl);
542 
543 	switch (cmd) {
544 	case MEM_PAGE_FMRI_ISRETIRED:
545 		return (page_retire_check(pa, NULL));
546 
547 	case MEM_PAGE_FMRI_RETIRE:
548 		return (page_retire(pa, PR_FMA));
549 	}
550 
551 	return (EINVAL);
552 }
553 
554 #ifdef __sparc
555 /*
556  * Given a syndrome, syndrome type, and address return the
557  * associated memory name in the provided data buffer.
558  */
559 static int
560 mmioctl_get_mem_name(intptr_t data)
561 {
562 	mem_name_t mem_name;
563 	void *buf;
564 	size_t bufsize;
565 	int len, err;
566 
567 	if ((bufsize = cpu_get_name_bufsize()) == 0)
568 		return (ENOTSUP);
569 
570 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
571 		return (err);
572 
573 	buf = kmem_alloc(bufsize, KM_SLEEP);
574 
575 	/*
576 	 * Call into cpu specific code to do the lookup.
577 	 */
578 	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
579 	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
580 		kmem_free(buf, bufsize);
581 		return (err);
582 	}
583 
584 	if (len >= mem_name.m_namelen) {
585 		kmem_free(buf, bufsize);
586 		return (ENAMETOOLONG);
587 	}
588 
589 	if (copyoutstr(buf, (char *)mem_name.m_name,
590 	    mem_name.m_namelen, NULL) != 0) {
591 		kmem_free(buf, bufsize);
592 		return (EFAULT);
593 	}
594 
595 	kmem_free(buf, bufsize);
596 	return (0);
597 }
598 
599 /*
600  * Given a syndrome and address return information about the associated memory.
601  */
602 static int
603 mmioctl_get_mem_info(intptr_t data)
604 {
605 	mem_info_t mem_info;
606 	int err;
607 
608 	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
609 		return (EFAULT);
610 
611 	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
612 	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
613 	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
614 		return (err);
615 
616 	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
617 		return (EFAULT);
618 
619 	return (0);
620 }
621 
622 /*
623  * Given a memory name, return its associated serial id
624  */
625 static int
626 mmioctl_get_mem_sid(intptr_t data)
627 {
628 	mem_name_t mem_name;
629 	void *buf;
630 	void *name;
631 	size_t	name_len;
632 	size_t bufsize;
633 	int len, err;
634 
635 	if ((bufsize = cpu_get_name_bufsize()) == 0)
636 		return (ENOTSUP);
637 
638 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
639 		return (err);
640 
641 	buf = kmem_alloc(bufsize, KM_SLEEP);
642 
643 	if (mem_name.m_namelen > 1024)
644 		mem_name.m_namelen = 1024; /* cap at 1024 bytes */
645 
646 	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
647 
648 	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
649 	    mem_name.m_namelen, &name_len)) != 0) {
650 		kmem_free(buf, bufsize);
651 		kmem_free(name, mem_name.m_namelen);
652 		return (err);
653 	}
654 
655 	/*
656 	 * Call into cpu specific code to do the lookup.
657 	 */
658 	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
659 		kmem_free(buf, bufsize);
660 		kmem_free(name, mem_name.m_namelen);
661 		return (err);
662 	}
663 
664 	if (len > mem_name.m_sidlen) {
665 		kmem_free(buf, bufsize);
666 		kmem_free(name, mem_name.m_namelen);
667 		return (ENAMETOOLONG);
668 	}
669 
670 	if (copyoutstr(buf, (char *)mem_name.m_sid,
671 	    mem_name.m_sidlen, NULL) != 0) {
672 		kmem_free(buf, bufsize);
673 		kmem_free(name, mem_name.m_namelen);
674 		return (EFAULT);
675 	}
676 
677 	kmem_free(buf, bufsize);
678 	kmem_free(name, mem_name.m_namelen);
679 	return (0);
680 }
681 #endif	/* __sparc */
682 
683 /*
684  * Private ioctls for
685  *	libkvm to support kvm_physaddr().
686  *	FMA support for page_retire() and memory attribute information.
687  */
688 /*ARGSUSED*/
689 static int
690 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
691 {
692 	if (cmd == MEM_VTOP && (getminor(dev) != M_KMEM))
693 		return (ENXIO);
694 	else if (getminor(dev) != M_MEM)
695 		return (ENXIO);
696 
697 	switch (cmd) {
698 	case MEM_VTOP:
699 		return (mmioctl_vtop(data));
700 
701 	case MEM_PAGE_RETIRE:
702 	case MEM_PAGE_ISRETIRED:
703 	case MEM_PAGE_UNRETIRE:
704 	case MEM_PAGE_RETIRE_MCE:
705 	case MEM_PAGE_RETIRE_UE:
706 	case MEM_PAGE_GETERRORS:
707 	case MEM_PAGE_RETIRE_TEST:
708 		return (mmioctl_page_retire(cmd, data));
709 
710 	case MEM_PAGE_FMRI_RETIRE:
711 	case MEM_PAGE_FMRI_ISRETIRED:
712 		return (mmioctl_page_fmri_retire(cmd, data));
713 
714 #ifdef __sparc
715 	case MEM_NAME:
716 		return (mmioctl_get_mem_name(data));
717 
718 	case MEM_INFO:
719 		return (mmioctl_get_mem_info(data));
720 
721 	case MEM_SID:
722 		return (mmioctl_get_mem_sid(data));
723 #else
724 	case MEM_NAME:
725 	case MEM_INFO:
726 	case MEM_SID:
727 		return (ENOTSUP);
728 #endif	/* __sparc */
729 	}
730 	return (ENXIO);
731 }
732 
733 /*ARGSUSED2*/
734 static int
735 mmmmap(dev_t dev, off_t off, int prot)
736 {
737 	pfn_t pf;
738 	struct memlist *pmem;
739 	minor_t minor = getminor(dev);
740 
741 	switch (minor) {
742 	case M_MEM:
743 		pf = btop(off);
744 		memlist_read_lock();
745 		for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
746 			if (pf >= BTOP(pmem->address) &&
747 			    pf < BTOP(pmem->address + pmem->size)) {
748 				memlist_read_unlock();
749 				return (impl_obmem_pfnum(pf));
750 			}
751 		}
752 		memlist_read_unlock();
753 		break;
754 
755 	case M_KMEM:
756 	case M_ALLKMEM:
757 		/* no longer supported with KPR */
758 		return (-1);
759 
760 	case M_ZERO:
761 		/*
762 		 * We shouldn't be mmap'ing to /dev/zero here as
763 		 * mmsegmap() should have already converted
764 		 * a mapping request for this device to a mapping
765 		 * using seg_vn for anonymous memory.
766 		 */
767 		break;
768 
769 	}
770 	return (-1);
771 }
772 
773 /*
774  * This function is called when a memory device is mmap'ed.
775  * Set up the mapping to the correct device driver.
776  */
777 static int
778 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
779     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
780 {
781 	struct segvn_crargs vn_a;
782 	struct segdev_crargs dev_a;
783 	int error;
784 	minor_t minor;
785 	off_t i;
786 
787 	minor = getminor(dev);
788 
789 	as_rangelock(as);
790 	if ((flags & MAP_FIXED) == 0) {
791 		/*
792 		 * No need to worry about vac alignment on /dev/zero
793 		 * since this is a "clone" object that doesn't yet exist.
794 		 */
795 		map_addr(addrp, len, (offset_t)off,
796 				(minor == M_MEM) || (minor == M_KMEM), flags);
797 
798 		if (*addrp == NULL) {
799 			as_rangeunlock(as);
800 			return (ENOMEM);
801 		}
802 	} else {
803 		/*
804 		 * User specified address -
805 		 * Blow away any previous mappings.
806 		 */
807 		(void) as_unmap(as, *addrp, len);
808 	}
809 
810 	switch (minor) {
811 	case M_MEM:
812 		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
813 		if ((flags & MAP_TYPE) != MAP_SHARED) {
814 			as_rangeunlock(as);
815 			return (EINVAL);
816 		}
817 
818 		/*
819 		 * Check to ensure that the entire range is
820 		 * legal and we are not trying to map in
821 		 * more than the device will let us.
822 		 */
823 		for (i = 0; i < len; i += PAGESIZE) {
824 			if (mmmmap(dev, off + i, maxprot) == -1) {
825 				as_rangeunlock(as);
826 				return (ENXIO);
827 			}
828 		}
829 
830 		/*
831 		 * Use seg_dev segment driver for /dev/mem mapping.
832 		 */
833 		dev_a.mapfunc = mmmmap;
834 		dev_a.dev = dev;
835 		dev_a.offset = off;
836 		dev_a.type = (flags & MAP_TYPE);
837 		dev_a.prot = (uchar_t)prot;
838 		dev_a.maxprot = (uchar_t)maxprot;
839 		dev_a.hat_attr = 0;
840 
841 		/*
842 		 * Make /dev/mem mappings non-consistent since we can't
843 		 * alias pages that don't have page structs behind them,
844 		 * such as kernel stack pages. If someone mmap()s a kernel
845 		 * stack page and if we give him a tte with cv, a line from
846 		 * that page can get into both pages of the spitfire d$.
847 		 * But snoop from another processor will only invalidate
848 		 * the first page. This later caused kernel (xc_attention)
849 		 * to go into an infinite loop at pil 13 and no interrupts
850 		 * could come in. See 1203630.
851 		 *
852 		 */
853 		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
854 		dev_a.devmap_data = NULL;
855 
856 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
857 		break;
858 
859 	case M_ZERO:
860 		/*
861 		 * Use seg_vn segment driver for /dev/zero mapping.
862 		 * Passing in a NULL amp gives us the "cloning" effect.
863 		 */
864 		vn_a.vp = NULL;
865 		vn_a.offset = 0;
866 		vn_a.type = (flags & MAP_TYPE);
867 		vn_a.prot = prot;
868 		vn_a.maxprot = maxprot;
869 		vn_a.flags = flags & ~MAP_TYPE;
870 		vn_a.cred = cred;
871 		vn_a.amp = NULL;
872 		vn_a.szc = 0;
873 		vn_a.lgrp_mem_policy_flags = 0;
874 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
875 		break;
876 
877 	case M_KMEM:
878 	case M_ALLKMEM:
879 		/* No longer supported with KPR. */
880 		error = ENXIO;
881 		break;
882 
883 	case M_NULL:
884 		/*
885 		 * Use seg_dev segment driver for /dev/null mapping.
886 		 */
887 		dev_a.mapfunc = mmmmap;
888 		dev_a.dev = dev;
889 		dev_a.offset = off;
890 		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
891 		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
892 		dev_a.hat_attr = 0;
893 		dev_a.hat_flags = 0;
894 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
895 		break;
896 
897 	default:
898 		error = ENXIO;
899 	}
900 
901 	as_rangeunlock(as);
902 	return (error);
903 }
904 
905 static struct cb_ops mm_cb_ops = {
906 	mmopen,			/* open */
907 	nulldev,		/* close */
908 	nodev,			/* strategy */
909 	nodev,			/* print */
910 	nodev,			/* dump */
911 	mmread,			/* read */
912 	mmwrite,		/* write */
913 	mmioctl,		/* ioctl */
914 	nodev,			/* devmap */
915 	mmmmap,			/* mmap */
916 	mmsegmap,		/* segmap */
917 	mmchpoll,		/* poll */
918 	mmpropop,		/* prop_op */
919 	0,			/* streamtab  */
920 	D_NEW | D_MP | D_64BIT | D_U64BIT
921 };
922 
923 static struct dev_ops mm_ops = {
924 	DEVO_REV,		/* devo_rev, */
925 	0,			/* refcnt  */
926 	mm_info,		/* get_dev_info */
927 	nulldev,		/* identify */
928 	nulldev,		/* probe */
929 	mm_attach,		/* attach */
930 	nodev,			/* detach */
931 	nodev,			/* reset */
932 	&mm_cb_ops,		/* driver operations */
933 	(struct bus_ops *)0	/* bus operations */
934 };
935 
936 static struct modldrv modldrv = {
937 	&mod_driverops, "memory driver %I%", &mm_ops,
938 };
939 
940 static struct modlinkage modlinkage = {
941 	MODREV_1, &modldrv, NULL
942 };
943 
944 int
945 _init(void)
946 {
947 	return (mod_install(&modlinkage));
948 }
949 
950 int
951 _info(struct modinfo *modinfop)
952 {
953 	return (mod_info(&modlinkage, modinfop));
954 }
955 
956 int
957 _fini(void)
958 {
959 	return (mod_remove(&modlinkage));
960 }
961 
962 static int
963 mm_kstat_update(kstat_t *ksp, int rw)
964 {
965 	struct memlist *pmem;
966 	uint_t count;
967 
968 	if (rw == KSTAT_WRITE)
969 		return (EACCES);
970 
971 	count = 0;
972 	memlist_read_lock();
973 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
974 		count++;
975 	}
976 	memlist_read_unlock();
977 
978 	ksp->ks_ndata = count;
979 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
980 
981 	return (0);
982 }
983 
984 static int
985 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
986 {
987 	struct memlist *pmem;
988 	struct memunit {
989 		uint64_t address;
990 		uint64_t size;
991 	} *kspmem;
992 
993 	if (rw == KSTAT_WRITE)
994 		return (EACCES);
995 
996 	ksp->ks_snaptime = gethrtime();
997 
998 	kspmem = (struct memunit *)buf;
999 	memlist_read_lock();
1000 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next, kspmem++) {
1001 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1002 			break;
1003 		kspmem->address = pmem->address;
1004 		kspmem->size = pmem->size;
1005 	}
1006 	memlist_read_unlock();
1007 
1008 	return (0);
1009 }
1010 
1011 /*
1012  * Read a mem_name_t from user-space and store it in the mem_name_t
1013  * pointed to by the mem_name argument.
1014  */
1015 static int
1016 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1017 {
1018 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1019 		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1020 			return (EFAULT);
1021 	}
1022 #ifdef	_SYSCALL32
1023 	else {
1024 		mem_name32_t mem_name32;
1025 
1026 		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1027 			return (EFAULT);
1028 		mem_name->m_addr = mem_name32.m_addr;
1029 		mem_name->m_synd = mem_name32.m_synd;
1030 		mem_name->m_type[0] = mem_name32.m_type[0];
1031 		mem_name->m_type[1] = mem_name32.m_type[1];
1032 		mem_name->m_name = (caddr_t)mem_name32.m_name;
1033 		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1034 		mem_name->m_sid = (caddr_t)mem_name32.m_sid;
1035 		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1036 	}
1037 #endif	/* _SYSCALL32 */
1038 
1039 	return (0);
1040 }
1041 
1042 /*
1043  * Read a mem_page_t from user-space and store it in the mem_page_t
1044  * pointed to by the mpage argument.
1045  */
1046 static int
1047 mm_read_mem_page(intptr_t data, mem_page_t *mpage)
1048 {
1049 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1050 		if (copyin((void *)data, mpage, sizeof (mem_page_t)) != 0)
1051 			return (EFAULT);
1052 	}
1053 #ifdef _SYSCALL32
1054 	else {
1055 		mem_page32_t	mpage32;
1056 
1057 		if (copyin((void *)data, &mpage32, sizeof (mem_page32_t)) != 0)
1058 			return (EFAULT);
1059 
1060 		mpage->m_fmri = (caddr_t)(uintptr_t)mpage32.m_fmri;
1061 		mpage->m_fmrisz = mpage32.m_fmrisz;
1062 	}
1063 #endif	/* _SYSCALL32 */
1064 
1065 	return (0);
1066 }
1067 
1068 /*
1069  * Expand an FMRI from a mem_page_t.
1070  */
1071 static int
1072 mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl)
1073 {
1074 	char *buf;
1075 	int err;
1076 
1077 	if (mpage->m_fmri == NULL || mpage->m_fmrisz > MEM_FMRI_MAX_BUFSIZE)
1078 		return (EINVAL);
1079 
1080 	buf = kmem_alloc(mpage->m_fmrisz, KM_SLEEP);
1081 	if (copyin(mpage->m_fmri, buf, mpage->m_fmrisz) != 0) {
1082 		kmem_free(buf, mpage->m_fmrisz);
1083 		return (EFAULT);
1084 	}
1085 
1086 	err = nvlist_unpack(buf, mpage->m_fmrisz, nvl, KM_SLEEP);
1087 	kmem_free(buf, mpage->m_fmrisz);
1088 
1089 	return (err);
1090 }
1091 
1092 static int
1093 mm_get_paddr(nvlist_t *nvl, uint64_t *paddr)
1094 {
1095 	uint8_t version;
1096 	uint64_t pa;
1097 	char *scheme;
1098 #ifdef __sparc
1099 	uint64_t offset;
1100 	char *unum;
1101 	char **serids;
1102 	uint_t nserids;
1103 	int err;
1104 #endif
1105 
1106 	/* Verify FMRI scheme name and version number */
1107 	if ((nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &scheme) != 0) ||
1108 	    (strcmp(scheme, FM_FMRI_SCHEME_MEM) != 0) ||
1109 	    (nvlist_lookup_uint8(nvl, FM_VERSION, &version) != 0) ||
1110 	    version > FM_MEM_SCHEME_VERSION) {
1111 		return (EINVAL);
1112 	}
1113 
1114 	/*
1115 	 * There are two ways a physical address can be  obtained from a mem
1116 	 * scheme FMRI.  One way is to use the "offset" and  "serial"
1117 	 * members, if they are present, together with the "unum" member to
1118 	 * calculate a physical address.  This is the preferred way since
1119 	 * it is independent of possible changes to the programming of
1120 	 * underlying hardware registers that may change the physical address.
1121 	 * If the "offset" member is not present, then the address is
1122 	 * retrieved from the "physaddr" member.
1123 	 */
1124 #ifdef __sparc
1125 	if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_OFFSET, &offset) != 0) {
1126 		if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) !=
1127 		    0) {
1128 			return (EINVAL);
1129 		}
1130 	} else if (nvlist_lookup_string(nvl, FM_FMRI_MEM_UNUM, &unum) != 0 ||
1131 	    nvlist_lookup_string_array(nvl, FM_FMRI_MEM_SERIAL_ID, &serids,
1132 	    &nserids) != 0) {
1133 		return (EINVAL);
1134 	} else {
1135 		if ((err = cpu_get_mem_addr(unum, serids[0], offset, &pa)) != 0)
1136 			return (err);
1137 	}
1138 #else /* __i386, __amd64 */
1139 	if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) != 0)
1140 		return (EINVAL);
1141 #endif /* __sparc */
1142 
1143 	*paddr = pa;
1144 	return (0);
1145 }
1146