xref: /illumos-gate/usr/src/uts/common/io/mem.c (revision f3af49816e370d667d566ab703e94b81305a536e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Memory special file
30  */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/user.h>
35 #include <sys/buf.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/vm.h>
39 #include <sys/uio.h>
40 #include <sys/mman.h>
41 #include <sys/kmem.h>
42 #include <vm/seg.h>
43 #include <vm/page.h>
44 #include <sys/stat.h>
45 #include <sys/vmem.h>
46 #include <sys/memlist.h>
47 #include <sys/bootconf.h>
48 
49 #include <vm/seg_vn.h>
50 #include <vm/seg_dev.h>
51 #include <vm/seg_kmem.h>
52 #include <vm/seg_kp.h>
53 #include <vm/seg_kpm.h>
54 #include <vm/hat.h>
55 
56 #include <sys/conf.h>
57 #include <sys/mem.h>
58 #include <sys/types.h>
59 #include <sys/conf.h>
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/errno.h>
63 #include <sys/modctl.h>
64 #include <sys/memlist.h>
65 #include <sys/ddi.h>
66 #include <sys/sunddi.h>
67 #include <sys/debug.h>
68 #include <sys/fm/protocol.h>
69 
70 #if defined(__sparc)
71 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
72 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
73     uint64_t *, int *, int *, int *);
74 extern size_t cpu_get_name_bufsize(void);
75 extern int cpu_get_mem_sid(char *, char *, int, int *);
76 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
77 #elif defined(__x86)
78 #include <sys/cpu_module.h>
79 #endif	/* __sparc */
80 
81 /*
82  * Turn a byte length into a pagecount.  The DDI btop takes a
83  * 32-bit size on 32-bit machines, this handles 64-bit sizes for
84  * large physical-memory 32-bit machines.
85  */
86 #define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
87 
88 static kmutex_t mm_lock;
89 static caddr_t mm_map;
90 
91 static dev_info_t *mm_dip;	/* private copy of devinfo pointer */
92 
93 static int mm_kmem_io_access;
94 
95 static int mm_kstat_update(kstat_t *ksp, int rw);
96 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
97 
98 static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
99 static int mm_read_mem_page(intptr_t data, mem_page_t *mpage);
100 static int mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl);
101 static int mm_get_paddr(nvlist_t *nvl, uint64_t *paddr);
102 
103 /*ARGSUSED1*/
104 static int
105 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
106 {
107 	int i;
108 	struct mem_minor {
109 		char *name;
110 		minor_t minor;
111 		int privonly;
112 		const char *rdpriv;
113 		const char *wrpriv;
114 		mode_t priv_mode;
115 	} mm[] = {
116 		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
117 		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
118 		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
119 		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
120 		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
121 	};
122 	kstat_t *ksp;
123 
124 	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
125 	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
126 
127 	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
128 		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
129 		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
130 		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
131 		    DDI_FAILURE) {
132 			ddi_remove_minor_node(devi, NULL);
133 			return (DDI_FAILURE);
134 		}
135 	}
136 
137 	mm_dip = devi;
138 
139 	ksp = kstat_create("mm", 0, "phys_installed", "misc",
140 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
141 	if (ksp != NULL) {
142 		ksp->ks_update = mm_kstat_update;
143 		ksp->ks_snapshot = mm_kstat_snapshot;
144 		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
145 		kstat_install(ksp);
146 	}
147 
148 	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
149 	    "kmem_io_access", 0);
150 
151 	return (DDI_SUCCESS);
152 }
153 
154 /*ARGSUSED*/
155 static int
156 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
157 {
158 	register int error;
159 
160 	switch (infocmd) {
161 	case DDI_INFO_DEVT2DEVINFO:
162 		*result = (void *)mm_dip;
163 		error = DDI_SUCCESS;
164 		break;
165 	case DDI_INFO_DEVT2INSTANCE:
166 		*result = (void *)0;
167 		error = DDI_SUCCESS;
168 		break;
169 	default:
170 		error = DDI_FAILURE;
171 	}
172 	return (error);
173 }
174 
175 /*ARGSUSED1*/
176 static int
177 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
178 {
179 	switch (getminor(*devp)) {
180 	case M_NULL:
181 	case M_ZERO:
182 	case M_MEM:
183 	case M_KMEM:
184 	case M_ALLKMEM:
185 		/* standard devices */
186 		break;
187 
188 	default:
189 		/* Unsupported or unknown type */
190 		return (EINVAL);
191 	}
192 	return (0);
193 }
194 
195 struct pollhead	mm_pollhd;
196 
197 /*ARGSUSED*/
198 static int
199 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
200     struct pollhead **phpp)
201 {
202 	switch (getminor(dev)) {
203 	case M_NULL:
204 	case M_ZERO:
205 	case M_MEM:
206 	case M_KMEM:
207 	case M_ALLKMEM:
208 		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
209 		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
210 		/*
211 		 * A non NULL pollhead pointer should be returned in case
212 		 * user polls for 0 events.
213 		 */
214 		*phpp = !anyyet && !*reventsp ?
215 		    &mm_pollhd : (struct pollhead *)NULL;
216 		return (0);
217 	default:
218 		/* no other devices currently support polling */
219 		return (ENXIO);
220 	}
221 }
222 
223 static int
224 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
225     char *name, caddr_t valuep, int *lengthp)
226 {
227 	/*
228 	 * implement zero size to reduce overhead (avoid two failing
229 	 * property lookups per stat).
230 	 */
231 	return (ddi_prop_op_size(dev, dip, prop_op,
232 	    flags, name, valuep, lengthp, 0));
233 }
234 
235 static int
236 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
237 {
238 	int error = 0;
239 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
240 	    (size_t)uio->uio_iov->iov_len);
241 
242 	mutex_enter(&mm_lock);
243 	hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
244 	    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
245 	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
246 
247 	if (!pf_is_memory(pfn)) {
248 		if (allowio) {
249 			size_t c = uio->uio_iov->iov_len;
250 
251 			if (ddi_peekpokeio(NULL, uio, rw,
252 			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
253 			    sizeof (int32_t)) != DDI_SUCCESS)
254 				error = EFAULT;
255 		} else
256 			error = EIO;
257 	} else
258 		error = uiomove(&mm_map[pageoff], nbytes, rw, uio);
259 
260 	hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
261 	mutex_exit(&mm_lock);
262 	return (error);
263 }
264 
265 /*
266  * Some platforms have permanently-mapped areas without PFNs, so we check
267  * specially here.
268  */
269 static int
270 mmplatio(struct uio *uio, enum uio_rw rw)
271 {
272 	uintptr_t pageaddr = (uintptr_t)uio->uio_loffset & PAGEMASK;
273 	off_t pageoff = uio->uio_loffset & PAGEOFFSET;
274 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
275 	    (size_t)uio->uio_iov->iov_len);
276 
277 	if (!plat_mem_valid_page(pageaddr, rw))
278 		return (ENOTSUP);
279 
280 	return (uiomove((void *)(pageaddr + pageoff), nbytes, rw, uio));
281 }
282 
#ifdef	__sparc

/*
 * Report whether the segment of `as' containing `va' advertises
 * S_CAPABILITY_NOMINFLT.  Returns 0 when no segment covers `va'.
 * The address space lock is held as reader for the lookup.
 */
static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *sp;
	int capable = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	sp = as_segat(as, va);
	if (sp != NULL)
		capable = SEGOP_CAPABLE(sp, S_CAPABILITY_NOMINFLT);
	AS_LOCK_EXIT(as, &as->a_lock);

	return (capable);
}

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

/* Kernel addresses are never page-locked on these platforms. */
#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */
306 
307 /*ARGSUSED3*/
308 static int
309 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
310 {
311 	pfn_t v;
312 	struct iovec *iov;
313 	int error = 0;
314 	size_t c;
315 	ssize_t oresid = uio->uio_resid;
316 	minor_t minor = getminor(dev);
317 
318 	while (uio->uio_resid > 0 && error == 0) {
319 		iov = uio->uio_iov;
320 		if (iov->iov_len == 0) {
321 			uio->uio_iov++;
322 			uio->uio_iovcnt--;
323 			if (uio->uio_iovcnt < 0)
324 				panic("mmrw");
325 			continue;
326 		}
327 		switch (minor) {
328 
329 		case M_MEM:
330 			memlist_read_lock();
331 			if (!address_in_memlist(phys_install,
332 			    (uint64_t)uio->uio_loffset, 1)) {
333 				memlist_read_unlock();
334 				error = EFAULT;
335 				break;
336 			}
337 			memlist_read_unlock();
338 
339 			v = BTOP((u_offset_t)uio->uio_loffset);
340 			error = mmio(uio, rw, v,
341 			    uio->uio_loffset & PAGEOFFSET, 0);
342 			break;
343 
344 		case M_KMEM:
345 		case M_ALLKMEM:
346 			{
347 			page_t **ppp;
348 			caddr_t vaddr = (caddr_t)uio->uio_offset;
349 			int try_lock = NEED_LOCK_KVADDR(vaddr);
350 			int locked = 0;
351 
352 			if ((error = mmplatio(uio, rw)) != ENOTSUP)
353 				break;
354 
355 			/*
356 			 * If vaddr does not map a valid page, as_pagelock()
357 			 * will return failure. Hence we can't check the
358 			 * return value and return EFAULT here as we'd like.
359 			 * seg_kp and seg_kpm do not properly support
360 			 * as_pagelock() for this context so we avoid it
361 			 * using the try_lock set check above.  Some day when
362 			 * the kernel page locking gets redesigned all this
363 			 * muck can be cleaned up.
364 			 */
365 			if (try_lock)
366 				locked = (as_pagelock(&kas, &ppp, vaddr,
367 				    PAGESIZE, S_WRITE) == 0);
368 
369 			v = hat_getpfnum(kas.a_hat,
370 			    (caddr_t)(uintptr_t)uio->uio_loffset);
371 			if (v == PFN_INVALID) {
372 				if (locked)
373 					as_pageunlock(&kas, ppp, vaddr,
374 					    PAGESIZE, S_WRITE);
375 				error = EFAULT;
376 				break;
377 			}
378 
379 			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
380 			    minor == M_ALLKMEM || mm_kmem_io_access);
381 			if (locked)
382 				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
383 				    S_WRITE);
384 			}
385 
386 			break;
387 
388 		case M_ZERO:
389 			if (rw == UIO_READ) {
390 				label_t ljb;
391 
392 				if (on_fault(&ljb)) {
393 					no_fault();
394 					error = EFAULT;
395 					break;
396 				}
397 				uzero(iov->iov_base, iov->iov_len);
398 				no_fault();
399 				uio->uio_resid -= iov->iov_len;
400 				uio->uio_loffset += iov->iov_len;
401 				break;
402 			}
403 			/* else it's a write, fall through to NULL case */
404 			/*FALLTHROUGH*/
405 
406 		case M_NULL:
407 			if (rw == UIO_READ)
408 				return (0);
409 			c = iov->iov_len;
410 			iov->iov_base += c;
411 			iov->iov_len -= c;
412 			uio->uio_loffset += c;
413 			uio->uio_resid -= c;
414 			break;
415 
416 		}
417 	}
418 	return (uio->uio_resid == oresid ? error : 0);
419 }
420 
421 static int
422 mmread(dev_t dev, struct uio *uio, cred_t *cred)
423 {
424 	return (mmrw(dev, uio, UIO_READ, cred));
425 }
426 
427 static int
428 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
429 {
430 	return (mmrw(dev, uio, UIO_WRITE, cred));
431 }
432 
433 /*
434  * Private ioctl for libkvm to support kvm_physaddr().
435  * Given an address space and a VA, compute the PA.
436  */
437 static int
438 mmioctl_vtop(intptr_t data)
439 {
440 #ifdef _SYSCALL32
441 	mem_vtop32_t vtop32;
442 #endif
443 	mem_vtop_t mem_vtop;
444 	proc_t *p;
445 	pfn_t pfn = (pfn_t)PFN_INVALID;
446 	pid_t pid = 0;
447 	struct as *as;
448 	struct seg *seg;
449 
450 	if (get_udatamodel() == DATAMODEL_NATIVE) {
451 		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
452 			return (EFAULT);
453 	}
454 #ifdef _SYSCALL32
455 	else {
456 		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
457 			return (EFAULT);
458 		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
459 		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;
460 
461 		if (mem_vtop.m_as != NULL)
462 			return (EINVAL);
463 	}
464 #endif
465 
466 	if (mem_vtop.m_as == &kas) {
467 		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
468 	} else {
469 		if (mem_vtop.m_as == NULL) {
470 			/*
471 			 * Assume the calling process's address space if the
472 			 * caller didn't specify one.
473 			 */
474 			p = curthread->t_procp;
475 			if (p == NULL)
476 				return (EIO);
477 			mem_vtop.m_as = p->p_as;
478 		}
479 
480 		mutex_enter(&pidlock);
481 		for (p = practive; p != NULL; p = p->p_next) {
482 			if (p->p_as == mem_vtop.m_as) {
483 				pid = p->p_pid;
484 				break;
485 			}
486 		}
487 		mutex_exit(&pidlock);
488 		if (p == NULL)
489 			return (EIO);
490 		p = sprlock(pid);
491 		if (p == NULL)
492 			return (EIO);
493 		as = p->p_as;
494 		if (as == mem_vtop.m_as) {
495 			mutex_exit(&p->p_lock);
496 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
497 			for (seg = AS_SEGFIRST(as); seg != NULL;
498 			    seg = AS_SEGNEXT(as, seg))
499 				if ((uintptr_t)mem_vtop.m_va -
500 				    (uintptr_t)seg->s_base < seg->s_size)
501 					break;
502 			if (seg != NULL)
503 				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
504 			AS_LOCK_EXIT(as, &as->a_lock);
505 			mutex_enter(&p->p_lock);
506 		}
507 		sprunlock(p);
508 	}
509 	mem_vtop.m_pfn = pfn;
510 	if (pfn == PFN_INVALID)
511 		return (EIO);
512 
513 	if (get_udatamodel() == DATAMODEL_NATIVE) {
514 		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
515 			return (EFAULT);
516 	}
517 #ifdef _SYSCALL32
518 	else {
519 		vtop32.m_pfn = mem_vtop.m_pfn;
520 		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
521 			return (EFAULT);
522 	}
523 #endif
524 
525 	return (0);
526 }
527 
528 /*
529  * Given a PA, execute the given page retire command on it.
530  */
531 static int
532 mmioctl_page_retire(int cmd, intptr_t data)
533 {
534 	extern int page_retire_test(void);
535 	uint64_t pa;
536 
537 	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
538 		return (EFAULT);
539 	}
540 
541 	switch (cmd) {
542 	case MEM_PAGE_ISRETIRED:
543 		return (page_retire_check(pa, NULL));
544 
545 	case MEM_PAGE_UNRETIRE:
546 		return (page_unretire(pa));
547 
548 	case MEM_PAGE_RETIRE:
549 		return (page_retire(pa, PR_FMA));
550 
551 	case MEM_PAGE_RETIRE_MCE:
552 		return (page_retire(pa, PR_MCE));
553 
554 	case MEM_PAGE_RETIRE_UE:
555 		return (page_retire(pa, PR_UE));
556 
557 	case MEM_PAGE_GETERRORS:
558 		{
559 			uint64_t page_errors;
560 			int rc = page_retire_check(pa, &page_errors);
561 			if (copyout(&page_errors, (void *)data,
562 			    sizeof (uint64_t))) {
563 				return (EFAULT);
564 			}
565 			return (rc);
566 		}
567 
568 	case MEM_PAGE_RETIRE_TEST:
569 		return (page_retire_test());
570 
571 	}
572 
573 	return (EINVAL);
574 }
575 
576 /*
577  * Given a mem-scheme FMRI for a page, execute the given page retire
578  * command on it.
579  */
580 static int
581 mmioctl_page_fmri_retire(int cmd, intptr_t data)
582 {
583 	mem_page_t mpage;
584 	uint64_t pa;
585 	nvlist_t *nvl;
586 	int err;
587 
588 	if ((err = mm_read_mem_page(data, &mpage)) < 0)
589 		return (err);
590 
591 	if ((err = mm_get_mem_fmri(&mpage, &nvl)) != 0)
592 		return (err);
593 
594 	if ((err = mm_get_paddr(nvl, &pa)) != 0) {
595 		nvlist_free(nvl);
596 		return (err);
597 	}
598 
599 	nvlist_free(nvl);
600 
601 	switch (cmd) {
602 	case MEM_PAGE_FMRI_ISRETIRED:
603 		return (page_retire_check(pa, NULL));
604 
605 	case MEM_PAGE_FMRI_RETIRE:
606 		return (page_retire(pa, PR_FMA));
607 	}
608 
609 	return (EINVAL);
610 }
611 
612 #ifdef __sparc
613 /*
614  * Given a syndrome, syndrome type, and address return the
615  * associated memory name in the provided data buffer.
616  */
617 static int
618 mmioctl_get_mem_name(intptr_t data)
619 {
620 	mem_name_t mem_name;
621 	void *buf;
622 	size_t bufsize;
623 	int len, err;
624 
625 	if ((bufsize = cpu_get_name_bufsize()) == 0)
626 		return (ENOTSUP);
627 
628 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
629 		return (err);
630 
631 	buf = kmem_alloc(bufsize, KM_SLEEP);
632 
633 	/*
634 	 * Call into cpu specific code to do the lookup.
635 	 */
636 	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
637 	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
638 		kmem_free(buf, bufsize);
639 		return (err);
640 	}
641 
642 	if (len >= mem_name.m_namelen) {
643 		kmem_free(buf, bufsize);
644 		return (ENAMETOOLONG);
645 	}
646 
647 	if (copyoutstr(buf, (char *)mem_name.m_name,
648 	    mem_name.m_namelen, NULL) != 0) {
649 		kmem_free(buf, bufsize);
650 		return (EFAULT);
651 	}
652 
653 	kmem_free(buf, bufsize);
654 	return (0);
655 }
656 
657 /*
658  * Given a syndrome and address return information about the associated memory.
659  */
660 static int
661 mmioctl_get_mem_info(intptr_t data)
662 {
663 	mem_info_t mem_info;
664 	int err;
665 
666 	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
667 		return (EFAULT);
668 
669 	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
670 	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
671 	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
672 		return (err);
673 
674 	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
675 		return (EFAULT);
676 
677 	return (0);
678 }
679 
680 /*
681  * Given a memory name, return its associated serial id
682  */
683 static int
684 mmioctl_get_mem_sid(intptr_t data)
685 {
686 	mem_name_t mem_name;
687 	void *buf;
688 	void *name;
689 	size_t	name_len;
690 	size_t bufsize;
691 	int len, err;
692 
693 	if ((bufsize = cpu_get_name_bufsize()) == 0)
694 		return (ENOTSUP);
695 
696 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
697 		return (err);
698 
699 	buf = kmem_alloc(bufsize, KM_SLEEP);
700 
701 	if (mem_name.m_namelen > 1024)
702 		mem_name.m_namelen = 1024; /* cap at 1024 bytes */
703 
704 	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
705 
706 	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
707 	    mem_name.m_namelen, &name_len)) != 0) {
708 		kmem_free(buf, bufsize);
709 		kmem_free(name, mem_name.m_namelen);
710 		return (err);
711 	}
712 
713 	/*
714 	 * Call into cpu specific code to do the lookup.
715 	 */
716 	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
717 		kmem_free(buf, bufsize);
718 		kmem_free(name, mem_name.m_namelen);
719 		return (err);
720 	}
721 
722 	if (len > mem_name.m_sidlen) {
723 		kmem_free(buf, bufsize);
724 		kmem_free(name, mem_name.m_namelen);
725 		return (ENAMETOOLONG);
726 	}
727 
728 	if (copyoutstr(buf, (char *)mem_name.m_sid,
729 	    mem_name.m_sidlen, NULL) != 0) {
730 		kmem_free(buf, bufsize);
731 		kmem_free(name, mem_name.m_namelen);
732 		return (EFAULT);
733 	}
734 
735 	kmem_free(buf, bufsize);
736 	kmem_free(name, mem_name.m_namelen);
737 	return (0);
738 }
739 #endif	/* __sparc */
740 
741 /*
742  * Private ioctls for
743  *	libkvm to support kvm_physaddr().
744  *	FMA support for page_retire() and memory attribute information.
745  */
746 /*ARGSUSED*/
747 static int
748 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
749 {
750 	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
751 	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
752 		return (ENXIO);
753 
754 	switch (cmd) {
755 	case MEM_VTOP:
756 		return (mmioctl_vtop(data));
757 
758 	case MEM_PAGE_RETIRE:
759 	case MEM_PAGE_ISRETIRED:
760 	case MEM_PAGE_UNRETIRE:
761 	case MEM_PAGE_RETIRE_MCE:
762 	case MEM_PAGE_RETIRE_UE:
763 	case MEM_PAGE_GETERRORS:
764 	case MEM_PAGE_RETIRE_TEST:
765 		return (mmioctl_page_retire(cmd, data));
766 
767 	case MEM_PAGE_FMRI_RETIRE:
768 	case MEM_PAGE_FMRI_ISRETIRED:
769 		return (mmioctl_page_fmri_retire(cmd, data));
770 
771 #ifdef __sparc
772 	case MEM_NAME:
773 		return (mmioctl_get_mem_name(data));
774 
775 	case MEM_INFO:
776 		return (mmioctl_get_mem_info(data));
777 
778 	case MEM_SID:
779 		return (mmioctl_get_mem_sid(data));
780 #else
781 	case MEM_NAME:
782 	case MEM_INFO:
783 	case MEM_SID:
784 		return (ENOTSUP);
785 #endif	/* __sparc */
786 	}
787 	return (ENXIO);
788 }
789 
790 /*ARGSUSED2*/
791 static int
792 mmmmap(dev_t dev, off_t off, int prot)
793 {
794 	pfn_t pf;
795 	struct memlist *pmem;
796 	minor_t minor = getminor(dev);
797 
798 	switch (minor) {
799 	case M_MEM:
800 		pf = btop(off);
801 		memlist_read_lock();
802 		for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
803 			if (pf >= BTOP(pmem->address) &&
804 			    pf < BTOP(pmem->address + pmem->size)) {
805 				memlist_read_unlock();
806 				return (impl_obmem_pfnum(pf));
807 			}
808 		}
809 		memlist_read_unlock();
810 		break;
811 
812 	case M_KMEM:
813 	case M_ALLKMEM:
814 		/* no longer supported with KPR */
815 		return (-1);
816 
817 	case M_ZERO:
818 		/*
819 		 * We shouldn't be mmap'ing to /dev/zero here as
820 		 * mmsegmap() should have already converted
821 		 * a mapping request for this device to a mapping
822 		 * using seg_vn for anonymous memory.
823 		 */
824 		break;
825 
826 	}
827 	return (-1);
828 }
829 
830 /*
831  * This function is called when a memory device is mmap'ed.
832  * Set up the mapping to the correct device driver.
833  */
834 static int
835 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
836     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
837 {
838 	struct segvn_crargs vn_a;
839 	struct segdev_crargs dev_a;
840 	int error;
841 	minor_t minor;
842 	off_t i;
843 
844 	minor = getminor(dev);
845 
846 	as_rangelock(as);
847 	if ((flags & MAP_FIXED) == 0) {
848 		/*
849 		 * No need to worry about vac alignment on /dev/zero
850 		 * since this is a "clone" object that doesn't yet exist.
851 		 */
852 		map_addr(addrp, len, (offset_t)off,
853 		    (minor == M_MEM) || (minor == M_KMEM), flags);
854 
855 		if (*addrp == NULL) {
856 			as_rangeunlock(as);
857 			return (ENOMEM);
858 		}
859 	} else {
860 		/*
861 		 * User specified address -
862 		 * Blow away any previous mappings.
863 		 */
864 		(void) as_unmap(as, *addrp, len);
865 	}
866 
867 	switch (minor) {
868 	case M_MEM:
869 		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
870 		if ((flags & MAP_TYPE) != MAP_SHARED) {
871 			as_rangeunlock(as);
872 			return (EINVAL);
873 		}
874 
875 		/*
876 		 * Check to ensure that the entire range is
877 		 * legal and we are not trying to map in
878 		 * more than the device will let us.
879 		 */
880 		for (i = 0; i < len; i += PAGESIZE) {
881 			if (mmmmap(dev, off + i, maxprot) == -1) {
882 				as_rangeunlock(as);
883 				return (ENXIO);
884 			}
885 		}
886 
887 		/*
888 		 * Use seg_dev segment driver for /dev/mem mapping.
889 		 */
890 		dev_a.mapfunc = mmmmap;
891 		dev_a.dev = dev;
892 		dev_a.offset = off;
893 		dev_a.type = (flags & MAP_TYPE);
894 		dev_a.prot = (uchar_t)prot;
895 		dev_a.maxprot = (uchar_t)maxprot;
896 		dev_a.hat_attr = 0;
897 
898 		/*
899 		 * Make /dev/mem mappings non-consistent since we can't
900 		 * alias pages that don't have page structs behind them,
901 		 * such as kernel stack pages. If someone mmap()s a kernel
902 		 * stack page and if we give him a tte with cv, a line from
903 		 * that page can get into both pages of the spitfire d$.
904 		 * But snoop from another processor will only invalidate
905 		 * the first page. This later caused kernel (xc_attention)
906 		 * to go into an infinite loop at pil 13 and no interrupts
907 		 * could come in. See 1203630.
908 		 *
909 		 */
910 		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
911 		dev_a.devmap_data = NULL;
912 
913 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
914 		break;
915 
916 	case M_ZERO:
917 		/*
918 		 * Use seg_vn segment driver for /dev/zero mapping.
919 		 * Passing in a NULL amp gives us the "cloning" effect.
920 		 */
921 		vn_a.vp = NULL;
922 		vn_a.offset = 0;
923 		vn_a.type = (flags & MAP_TYPE);
924 		vn_a.prot = prot;
925 		vn_a.maxprot = maxprot;
926 		vn_a.flags = flags & ~MAP_TYPE;
927 		vn_a.cred = cred;
928 		vn_a.amp = NULL;
929 		vn_a.szc = 0;
930 		vn_a.lgrp_mem_policy_flags = 0;
931 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
932 		break;
933 
934 	case M_KMEM:
935 	case M_ALLKMEM:
936 		/* No longer supported with KPR. */
937 		error = ENXIO;
938 		break;
939 
940 	case M_NULL:
941 		/*
942 		 * Use seg_dev segment driver for /dev/null mapping.
943 		 */
944 		dev_a.mapfunc = mmmmap;
945 		dev_a.dev = dev;
946 		dev_a.offset = off;
947 		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
948 		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
949 		dev_a.hat_attr = 0;
950 		dev_a.hat_flags = 0;
951 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
952 		break;
953 
954 	default:
955 		error = ENXIO;
956 	}
957 
958 	as_rangeunlock(as);
959 	return (error);
960 }
961 
962 static struct cb_ops mm_cb_ops = {
963 	mmopen,			/* open */
964 	nulldev,		/* close */
965 	nodev,			/* strategy */
966 	nodev,			/* print */
967 	nodev,			/* dump */
968 	mmread,			/* read */
969 	mmwrite,		/* write */
970 	mmioctl,		/* ioctl */
971 	nodev,			/* devmap */
972 	mmmmap,			/* mmap */
973 	mmsegmap,		/* segmap */
974 	mmchpoll,		/* poll */
975 	mmpropop,		/* prop_op */
976 	0,			/* streamtab  */
977 	D_NEW | D_MP | D_64BIT | D_U64BIT
978 };
979 
980 static struct dev_ops mm_ops = {
981 	DEVO_REV,		/* devo_rev, */
982 	0,			/* refcnt  */
983 	mm_info,		/* get_dev_info */
984 	nulldev,		/* identify */
985 	nulldev,		/* probe */
986 	mm_attach,		/* attach */
987 	nodev,			/* detach */
988 	nodev,			/* reset */
989 	&mm_cb_ops,		/* driver operations */
990 	(struct bus_ops *)0	/* bus operations */
991 };
992 
993 static struct modldrv modldrv = {
994 	&mod_driverops, "memory driver %I%", &mm_ops,
995 };
996 
997 static struct modlinkage modlinkage = {
998 	MODREV_1, &modldrv, NULL
999 };
1000 
1001 int
1002 _init(void)
1003 {
1004 	return (mod_install(&modlinkage));
1005 }
1006 
1007 int
1008 _info(struct modinfo *modinfop)
1009 {
1010 	return (mod_info(&modlinkage, modinfop));
1011 }
1012 
1013 int
1014 _fini(void)
1015 {
1016 	return (mod_remove(&modlinkage));
1017 }
1018 
1019 static int
1020 mm_kstat_update(kstat_t *ksp, int rw)
1021 {
1022 	struct memlist *pmem;
1023 	uint_t count;
1024 
1025 	if (rw == KSTAT_WRITE)
1026 		return (EACCES);
1027 
1028 	count = 0;
1029 	memlist_read_lock();
1030 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
1031 		count++;
1032 	}
1033 	memlist_read_unlock();
1034 
1035 	ksp->ks_ndata = count;
1036 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1037 
1038 	return (0);
1039 }
1040 
1041 static int
1042 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1043 {
1044 	struct memlist *pmem;
1045 	struct memunit {
1046 		uint64_t address;
1047 		uint64_t size;
1048 	} *kspmem;
1049 
1050 	if (rw == KSTAT_WRITE)
1051 		return (EACCES);
1052 
1053 	ksp->ks_snaptime = gethrtime();
1054 
1055 	kspmem = (struct memunit *)buf;
1056 	memlist_read_lock();
1057 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next, kspmem++) {
1058 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1059 			break;
1060 		kspmem->address = pmem->address;
1061 		kspmem->size = pmem->size;
1062 	}
1063 	memlist_read_unlock();
1064 
1065 	return (0);
1066 }
1067 
1068 /*
1069  * Read a mem_name_t from user-space and store it in the mem_name_t
1070  * pointed to by the mem_name argument.
1071  */
1072 static int
1073 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1074 {
1075 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1076 		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1077 			return (EFAULT);
1078 	}
1079 #ifdef	_SYSCALL32
1080 	else {
1081 		mem_name32_t mem_name32;
1082 
1083 		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1084 			return (EFAULT);
1085 		mem_name->m_addr = mem_name32.m_addr;
1086 		mem_name->m_synd = mem_name32.m_synd;
1087 		mem_name->m_type[0] = mem_name32.m_type[0];
1088 		mem_name->m_type[1] = mem_name32.m_type[1];
1089 		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
1090 		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1091 		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
1092 		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1093 	}
1094 #endif	/* _SYSCALL32 */
1095 
1096 	return (0);
1097 }
1098 
1099 /*
1100  * Read a mem_page_t from user-space and store it in the mem_page_t
1101  * pointed to by the mpage argument.
1102  */
1103 static int
1104 mm_read_mem_page(intptr_t data, mem_page_t *mpage)
1105 {
1106 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1107 		if (copyin((void *)data, mpage, sizeof (mem_page_t)) != 0)
1108 			return (EFAULT);
1109 	}
1110 #ifdef _SYSCALL32
1111 	else {
1112 		mem_page32_t	mpage32;
1113 
1114 		if (copyin((void *)data, &mpage32, sizeof (mem_page32_t)) != 0)
1115 			return (EFAULT);
1116 
1117 		mpage->m_fmri = (caddr_t)(uintptr_t)mpage32.m_fmri;
1118 		mpage->m_fmrisz = mpage32.m_fmrisz;
1119 	}
1120 #endif	/* _SYSCALL32 */
1121 
1122 	return (0);
1123 }
1124 
1125 /*
1126  * Expand an FMRI from a mem_page_t.
1127  */
1128 static int
1129 mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl)
1130 {
1131 	char *buf;
1132 	int err;
1133 
1134 	if (mpage->m_fmri == NULL || mpage->m_fmrisz > MEM_FMRI_MAX_BUFSIZE)
1135 		return (EINVAL);
1136 
1137 	buf = kmem_alloc(mpage->m_fmrisz, KM_SLEEP);
1138 	if (copyin(mpage->m_fmri, buf, mpage->m_fmrisz) != 0) {
1139 		kmem_free(buf, mpage->m_fmrisz);
1140 		return (EFAULT);
1141 	}
1142 
1143 	err = nvlist_unpack(buf, mpage->m_fmrisz, nvl, KM_SLEEP);
1144 	kmem_free(buf, mpage->m_fmrisz);
1145 
1146 	return (err);
1147 }
1148 
1149 static int
1150 mm_get_paddr(nvlist_t *nvl, uint64_t *paddr)
1151 {
1152 	uint8_t version;
1153 	uint64_t pa;
1154 	char *scheme;
1155 #ifdef __sparc
1156 	uint64_t offset;
1157 	char *unum;
1158 	char **serids;
1159 	uint_t nserids;
1160 	int err;
1161 #endif
1162 
1163 	/* Verify FMRI scheme name and version number */
1164 	if ((nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &scheme) != 0) ||
1165 	    (strcmp(scheme, FM_FMRI_SCHEME_MEM) != 0) ||
1166 	    (nvlist_lookup_uint8(nvl, FM_VERSION, &version) != 0) ||
1167 	    version > FM_MEM_SCHEME_VERSION) {
1168 		return (EINVAL);
1169 	}
1170 
1171 	/*
1172 	 * There are two ways a physical address can be  obtained from a mem
1173 	 * scheme FMRI.  One way is to use the "offset" and  "serial"
1174 	 * members, if they are present, together with the "unum" member to
1175 	 * calculate a physical address.  This is the preferred way since
1176 	 * it is independent of possible changes to the programming of
1177 	 * underlying hardware registers that may change the physical address.
1178 	 * If the "offset" member is not present, then the address is
1179 	 * retrieved from the "physaddr" member.
1180 	 */
1181 #if defined(__sparc)
1182 	if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_OFFSET, &offset) != 0) {
1183 		if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) !=
1184 		    0) {
1185 			return (EINVAL);
1186 		}
1187 	} else if (nvlist_lookup_string(nvl, FM_FMRI_MEM_UNUM, &unum) != 0 ||
1188 	    nvlist_lookup_string_array(nvl, FM_FMRI_MEM_SERIAL_ID, &serids,
1189 	    &nserids) != 0) {
1190 		return (EINVAL);
1191 	} else {
1192 		err = cpu_get_mem_addr(unum, serids[0], offset, &pa);
1193 		if (err != 0) {
1194 			if (err == ENOTSUP) {
1195 				/* Fall back to physaddr */
1196 				if (nvlist_lookup_uint64(nvl,
1197 				    FM_FMRI_MEM_PHYSADDR, &pa) != 0)
1198 					return (EINVAL);
1199 			} else
1200 				return (err);
1201 		}
1202 	}
1203 #elif defined(__x86)
1204 	if (cmi_mc_unumtopa(NULL, nvl, &pa) == 0)
1205 		return (EINVAL);
1206 #else
1207 #error "port me"
1208 #endif /* __sparc */
1209 
1210 	*paddr = pa;
1211 	return (0);
1212 }
1213