xref: /titanic_50/usr/src/uts/common/io/mem.c (revision 2d84dfe88bfb9c12d1b4d2216c32b5a8b1fb56ae)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Memory special file
31  */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/user.h>
36 #include <sys/buf.h>
37 #include <sys/systm.h>
38 #include <sys/cred.h>
39 #include <sys/vm.h>
40 #include <sys/uio.h>
41 #include <sys/mman.h>
42 #include <sys/kmem.h>
43 #include <vm/seg.h>
44 #include <vm/page.h>
45 #include <sys/stat.h>
46 #include <sys/vmem.h>
47 #include <sys/memlist.h>
48 #include <sys/bootconf.h>
49 
50 #include <vm/seg_vn.h>
51 #include <vm/seg_dev.h>
52 #include <vm/seg_kmem.h>
53 #include <vm/seg_kp.h>
54 #include <vm/seg_kpm.h>
55 #include <vm/hat.h>
56 
57 #include <sys/conf.h>
58 #include <sys/mem.h>
59 #include <sys/types.h>
60 #include <sys/conf.h>
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/errno.h>
64 #include <sys/modctl.h>
65 #include <sys/memlist.h>
66 #include <sys/ddi.h>
67 #include <sys/sunddi.h>
68 #include <sys/debug.h>
69 #include <sys/fm/protocol.h>
70 
71 #if defined(__sparc)
72 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
73 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
74     uint64_t *, int *, int *, int *);
75 extern size_t cpu_get_name_bufsize(void);
76 extern int cpu_get_mem_sid(char *, char *, int, int *);
77 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
78 #elif defined(__i386) || defined(__amd64)
79 #include <sys/cpu_module.h>
80 #endif	/* __sparc */
81 
/*
 * Turn a byte length into a page count.  The DDI btop() takes a
 * 32-bit size on 32-bit machines; this macro shifts the value at its
 * own width before casting to pgcnt_t, so it also handles the 64-bit
 * sizes found on large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
88 
/*
 * mm_lock is initialized in mm_attach() and held by mmio() for the whole
 * time the target page is loaded at mm_map; it is also installed as the
 * ks_lock of the "phys_installed" kstat.
 */
static kmutex_t mm_lock;
/*
 * One page of kernel virtual address space, allocated from heap_arena in
 * mm_attach(), at which mmio() temporarily maps the page being accessed.
 */
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

/*
 * Value of the "kmem_io_access" property read in mm_attach() (0 when it
 * is absent).  When nonzero, mmrw() lets the M_KMEM minor access pages
 * that are not memory, as M_ALLKMEM always may.
 */
static int mm_kmem_io_access;
96 static int mm_kstat_update(kstat_t *ksp, int rw);
97 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
98 
99 static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
100 static int mm_read_mem_page(intptr_t data, mem_page_t *mpage);
101 static int mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl);
102 static int mm_get_paddr(nvlist_t *nvl, uint64_t *paddr);
103 
104 /*ARGSUSED1*/
105 static int
106 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
107 {
108 	int i;
109 	struct mem_minor {
110 		char *name;
111 		minor_t minor;
112 		int privonly;
113 		const char *rdpriv;
114 		const char *wrpriv;
115 		mode_t priv_mode;
116 	} mm[] = {
117 		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
118 		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
119 		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
120 		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
121 		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
122 	};
123 	kstat_t *ksp;
124 
125 	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
126 	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
127 
128 	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
129 		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
130 		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
131 		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
132 		    DDI_FAILURE) {
133 			ddi_remove_minor_node(devi, NULL);
134 			return (DDI_FAILURE);
135 		}
136 	}
137 
138 	mm_dip = devi;
139 
140 	ksp = kstat_create("mm", 0, "phys_installed", "misc",
141 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
142 	if (ksp != NULL) {
143 		ksp->ks_update = mm_kstat_update;
144 		ksp->ks_snapshot = mm_kstat_snapshot;
145 		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
146 		kstat_install(ksp);
147 	}
148 
149 	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
150 	    "kmem_io_access", 0);
151 
152 	return (DDI_SUCCESS);
153 }
154 
155 /*ARGSUSED*/
156 static int
157 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
158 {
159 	register int error;
160 
161 	switch (infocmd) {
162 	case DDI_INFO_DEVT2DEVINFO:
163 		*result = (void *)mm_dip;
164 		error = DDI_SUCCESS;
165 		break;
166 	case DDI_INFO_DEVT2INSTANCE:
167 		*result = (void *)0;
168 		error = DDI_SUCCESS;
169 		break;
170 	default:
171 		error = DDI_FAILURE;
172 	}
173 	return (error);
174 }
175 
176 /*ARGSUSED1*/
177 static int
178 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
179 {
180 	switch (getminor(*devp)) {
181 	case M_NULL:
182 	case M_ZERO:
183 	case M_MEM:
184 	case M_KMEM:
185 	case M_ALLKMEM:
186 		/* standard devices */
187 		break;
188 
189 	default:
190 		/* Unsupported or unknown type */
191 		return (EINVAL);
192 	}
193 	return (0);
194 }
195 
196 struct pollhead	mm_pollhd;
197 
198 /*ARGSUSED*/
199 static int
200 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
201     struct pollhead **phpp)
202 {
203 	switch (getminor(dev)) {
204 	case M_NULL:
205 	case M_ZERO:
206 	case M_MEM:
207 	case M_KMEM:
208 	case M_ALLKMEM:
209 		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
210 			POLLWRNORM | POLLRDBAND | POLLWRBAND);
211 		/*
212 		 * A non NULL pollhead pointer should be returned in case
213 		 * user polls for 0 events.
214 		 */
215 		*phpp = !anyyet && !*reventsp ?
216 		    &mm_pollhd : (struct pollhead *)NULL;
217 		return (0);
218 	default:
219 		/* no other devices currently support polling */
220 		return (ENXIO);
221 	}
222 }
223 
224 static int
225 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
226     char *name, caddr_t valuep, int *lengthp)
227 {
228 	/*
229 	 * implement zero size to reduce overhead (avoid two failing
230 	 * property lookups per stat).
231 	 */
232 	return (ddi_prop_op_size(dev, dip, prop_op,
233 	    flags, name, valuep, lengthp, 0));
234 }
235 
236 static int
237 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
238 {
239 	int error = 0;
240 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
241 	    (size_t)uio->uio_iov->iov_len);
242 
243 	mutex_enter(&mm_lock);
244 	hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
245 	    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
246 	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
247 
248 	if (!pf_is_memory(pfn)) {
249 		if (allowio) {
250 			size_t c = uio->uio_iov->iov_len;
251 
252 			if (ddi_peekpokeio(NULL, uio, rw,
253 			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
254 			    sizeof (int32_t)) != DDI_SUCCESS)
255 				error = EFAULT;
256 		} else
257 			error = EIO;
258 	} else
259 		error = uiomove(&mm_map[pageoff], nbytes, rw, uio);
260 
261 	hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
262 	mutex_exit(&mm_lock);
263 	return (error);
264 }
265 
#ifdef	__sparc

/*
 * Look up the segment of as that contains va, under the address space
 * lock held as reader, and return what SEGOP_CAPABLE() reports for
 * S_CAPABILITY_NOMINFLT on it.  Returns 0 when no segment contains va.
 */
static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int capable = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, va);
	if (seg != NULL)
		capable = SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT);
	AS_LOCK_EXIT(as, &as->a_lock);

	return (capable);
}

/* Should mmrw() attempt as_pagelock() on this kernel address? */
#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

/* Kernel addresses are never page-locked by mmrw() on these platforms. */
#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */
289 
290 /*ARGSUSED3*/
291 static int
292 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
293 {
294 	pfn_t v;
295 	struct iovec *iov;
296 	int error = 0;
297 	size_t c;
298 	ssize_t oresid = uio->uio_resid;
299 	minor_t minor = getminor(dev);
300 
301 	while (uio->uio_resid > 0 && error == 0) {
302 		iov = uio->uio_iov;
303 		if (iov->iov_len == 0) {
304 			uio->uio_iov++;
305 			uio->uio_iovcnt--;
306 			if (uio->uio_iovcnt < 0)
307 				panic("mmrw");
308 			continue;
309 		}
310 		switch (minor) {
311 
312 		case M_MEM:
313 			memlist_read_lock();
314 			if (!address_in_memlist(phys_install,
315 			    (uint64_t)uio->uio_loffset, 1)) {
316 				memlist_read_unlock();
317 				error = EFAULT;
318 				break;
319 			}
320 			memlist_read_unlock();
321 
322 			v = BTOP((u_offset_t)uio->uio_loffset);
323 			error = mmio(uio, rw, v,
324 			    uio->uio_loffset & PAGEOFFSET, 0);
325 			break;
326 
327 		case M_KMEM:
328 		case M_ALLKMEM:
329 			{
330 			page_t **ppp;
331 			caddr_t vaddr = (caddr_t)uio->uio_offset;
332 			int try_lock = NEED_LOCK_KVADDR(vaddr);
333 			int locked = 0;
334 
335 			/*
336 			 * If vaddr does not map a valid page, as_pagelock()
337 			 * will return failure. Hence we can't check the
338 			 * return value and return EFAULT here as we'd like.
339 			 * seg_kp and seg_kpm do not properly support
340 			 * as_pagelock() for this context so we avoid it
341 			 * using the try_lock set check above.  Some day when
342 			 * the kernel page locking gets redesigned all this
343 			 * muck can be cleaned up.
344 			 */
345 			if (try_lock)
346 				locked = (as_pagelock(&kas, &ppp, vaddr,
347 				    PAGESIZE, S_WRITE) == 0);
348 
349 			v = hat_getpfnum(kas.a_hat,
350 			    (caddr_t)(uintptr_t)uio->uio_loffset);
351 			if (v == PFN_INVALID) {
352 				if (locked)
353 					as_pageunlock(&kas, ppp, vaddr,
354 					    PAGESIZE, S_WRITE);
355 				error = EFAULT;
356 				break;
357 			}
358 
359 			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
360 			    minor == M_ALLKMEM || mm_kmem_io_access);
361 			if (locked)
362 				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
363 				    S_WRITE);
364 			}
365 
366 			break;
367 
368 		case M_ZERO:
369 			if (rw == UIO_READ) {
370 				label_t ljb;
371 
372 				if (on_fault(&ljb)) {
373 					no_fault();
374 					error = EFAULT;
375 					break;
376 				}
377 				uzero(iov->iov_base, iov->iov_len);
378 				no_fault();
379 				uio->uio_resid -= iov->iov_len;
380 				uio->uio_loffset += iov->iov_len;
381 				break;
382 			}
383 			/* else it's a write, fall through to NULL case */
384 			/*FALLTHROUGH*/
385 
386 		case M_NULL:
387 			if (rw == UIO_READ)
388 				return (0);
389 			c = iov->iov_len;
390 			iov->iov_base += c;
391 			iov->iov_len -= c;
392 			uio->uio_loffset += c;
393 			uio->uio_resid -= c;
394 			break;
395 
396 		}
397 	}
398 	return (uio->uio_resid == oresid ? error : 0);
399 }
400 
401 static int
402 mmread(dev_t dev, struct uio *uio, cred_t *cred)
403 {
404 	return (mmrw(dev, uio, UIO_READ, cred));
405 }
406 
407 static int
408 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
409 {
410 	return (mmrw(dev, uio, UIO_WRITE, cred));
411 }
412 
413 /*
414  * Private ioctl for libkvm to support kvm_physaddr().
415  * Given an address space and a VA, compute the PA.
416  */
417 static int
418 mmioctl_vtop(intptr_t data)
419 {
420 #ifdef _SYSCALL32
421 	mem_vtop32_t vtop32;
422 #endif
423 	mem_vtop_t mem_vtop;
424 	proc_t *p;
425 	pfn_t pfn = (pfn_t)PFN_INVALID;
426 	pid_t pid = 0;
427 	struct as *as;
428 	struct seg *seg;
429 
430 	if (get_udatamodel() == DATAMODEL_NATIVE) {
431 		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
432 			return (EFAULT);
433 	}
434 #ifdef _SYSCALL32
435 	else {
436 		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
437 			return (EFAULT);
438 		mem_vtop.m_as = (struct as *)vtop32.m_as;
439 		mem_vtop.m_va = (void *)vtop32.m_va;
440 
441 		if (mem_vtop.m_as != NULL)
442 			return (EINVAL);
443 	}
444 #endif
445 
446 	if (mem_vtop.m_as == &kas) {
447 		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
448 	} else {
449 		if (mem_vtop.m_as == NULL) {
450 			/*
451 			 * Assume the calling process's address space if the
452 			 * caller didn't specify one.
453 			 */
454 			p = curthread->t_procp;
455 			if (p == NULL)
456 				return (EIO);
457 			mem_vtop.m_as = p->p_as;
458 		}
459 
460 		mutex_enter(&pidlock);
461 		for (p = practive; p != NULL; p = p->p_next) {
462 			if (p->p_as == mem_vtop.m_as) {
463 				pid = p->p_pid;
464 				break;
465 			}
466 		}
467 		mutex_exit(&pidlock);
468 		if (p == NULL)
469 			return (EIO);
470 		p = sprlock(pid);
471 		if (p == NULL)
472 			return (EIO);
473 		as = p->p_as;
474 		if (as == mem_vtop.m_as) {
475 			mutex_exit(&p->p_lock);
476 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
477 			for (seg = AS_SEGFIRST(as); seg != NULL;
478 			    seg = AS_SEGNEXT(as, seg))
479 				if ((uintptr_t)mem_vtop.m_va -
480 				    (uintptr_t)seg->s_base < seg->s_size)
481 					break;
482 			if (seg != NULL)
483 				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
484 			AS_LOCK_EXIT(as, &as->a_lock);
485 			mutex_enter(&p->p_lock);
486 		}
487 		sprunlock(p);
488 	}
489 	mem_vtop.m_pfn = pfn;
490 	if (pfn == PFN_INVALID)
491 		return (EIO);
492 
493 	if (get_udatamodel() == DATAMODEL_NATIVE) {
494 		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
495 			return (EFAULT);
496 	}
497 #ifdef _SYSCALL32
498 	else {
499 		vtop32.m_pfn = mem_vtop.m_pfn;
500 		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
501 			return (EFAULT);
502 	}
503 #endif
504 
505 	return (0);
506 }
507 
508 /*
509  * Given a PA, execute the given page retire command on it.
510  */
511 static int
512 mmioctl_page_retire(int cmd, intptr_t data)
513 {
514 	extern int page_retire_test(void);
515 	uint64_t pa;
516 
517 	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
518 		return (EFAULT);
519 	}
520 
521 	switch (cmd) {
522 	case MEM_PAGE_ISRETIRED:
523 		return (page_retire_check(pa, NULL));
524 
525 	case MEM_PAGE_UNRETIRE:
526 		return (page_unretire(pa));
527 
528 	case MEM_PAGE_RETIRE:
529 		return (page_retire(pa, PR_FMA));
530 
531 	case MEM_PAGE_RETIRE_MCE:
532 		return (page_retire(pa, PR_MCE));
533 
534 	case MEM_PAGE_RETIRE_UE:
535 		return (page_retire(pa, PR_UE));
536 
537 	case MEM_PAGE_GETERRORS:
538 		{
539 			uint64_t page_errors;
540 			int rc = page_retire_check(pa, &page_errors);
541 			if (copyout(&page_errors, (void *)data,
542 			    sizeof (uint64_t))) {
543 				return (EFAULT);
544 			}
545 			return (rc);
546 		}
547 
548 	case MEM_PAGE_RETIRE_TEST:
549 		return (page_retire_test());
550 
551 	}
552 
553 	return (EINVAL);
554 }
555 
556 /*
557  * Given a mem-scheme FMRI for a page, execute the given page retire
558  * command on it.
559  */
560 static int
561 mmioctl_page_fmri_retire(int cmd, intptr_t data)
562 {
563 	mem_page_t mpage;
564 	uint64_t pa;
565 	nvlist_t *nvl;
566 	int err;
567 
568 	if ((err = mm_read_mem_page(data, &mpage)) < 0)
569 		return (err);
570 
571 	if ((err = mm_get_mem_fmri(&mpage, &nvl)) < 0)
572 		return (err);
573 
574 	if ((err = mm_get_paddr(nvl, &pa)) != 0) {
575 		nvlist_free(nvl);
576 		return (err);
577 	}
578 
579 	nvlist_free(nvl);
580 
581 	switch (cmd) {
582 	case MEM_PAGE_FMRI_ISRETIRED:
583 		return (page_retire_check(pa, NULL));
584 
585 	case MEM_PAGE_FMRI_RETIRE:
586 		return (page_retire(pa, PR_FMA));
587 	}
588 
589 	return (EINVAL);
590 }
591 
592 #ifdef __sparc
593 /*
594  * Given a syndrome, syndrome type, and address return the
595  * associated memory name in the provided data buffer.
596  */
597 static int
598 mmioctl_get_mem_name(intptr_t data)
599 {
600 	mem_name_t mem_name;
601 	void *buf;
602 	size_t bufsize;
603 	int len, err;
604 
605 	if ((bufsize = cpu_get_name_bufsize()) == 0)
606 		return (ENOTSUP);
607 
608 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
609 		return (err);
610 
611 	buf = kmem_alloc(bufsize, KM_SLEEP);
612 
613 	/*
614 	 * Call into cpu specific code to do the lookup.
615 	 */
616 	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
617 	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
618 		kmem_free(buf, bufsize);
619 		return (err);
620 	}
621 
622 	if (len >= mem_name.m_namelen) {
623 		kmem_free(buf, bufsize);
624 		return (ENAMETOOLONG);
625 	}
626 
627 	if (copyoutstr(buf, (char *)mem_name.m_name,
628 	    mem_name.m_namelen, NULL) != 0) {
629 		kmem_free(buf, bufsize);
630 		return (EFAULT);
631 	}
632 
633 	kmem_free(buf, bufsize);
634 	return (0);
635 }
636 
637 /*
638  * Given a syndrome and address return information about the associated memory.
639  */
640 static int
641 mmioctl_get_mem_info(intptr_t data)
642 {
643 	mem_info_t mem_info;
644 	int err;
645 
646 	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
647 		return (EFAULT);
648 
649 	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
650 	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
651 	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
652 		return (err);
653 
654 	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
655 		return (EFAULT);
656 
657 	return (0);
658 }
659 
660 /*
661  * Given a memory name, return its associated serial id
662  */
663 static int
664 mmioctl_get_mem_sid(intptr_t data)
665 {
666 	mem_name_t mem_name;
667 	void *buf;
668 	void *name;
669 	size_t	name_len;
670 	size_t bufsize;
671 	int len, err;
672 
673 	if ((bufsize = cpu_get_name_bufsize()) == 0)
674 		return (ENOTSUP);
675 
676 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
677 		return (err);
678 
679 	buf = kmem_alloc(bufsize, KM_SLEEP);
680 
681 	if (mem_name.m_namelen > 1024)
682 		mem_name.m_namelen = 1024; /* cap at 1024 bytes */
683 
684 	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
685 
686 	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
687 	    mem_name.m_namelen, &name_len)) != 0) {
688 		kmem_free(buf, bufsize);
689 		kmem_free(name, mem_name.m_namelen);
690 		return (err);
691 	}
692 
693 	/*
694 	 * Call into cpu specific code to do the lookup.
695 	 */
696 	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
697 		kmem_free(buf, bufsize);
698 		kmem_free(name, mem_name.m_namelen);
699 		return (err);
700 	}
701 
702 	if (len > mem_name.m_sidlen) {
703 		kmem_free(buf, bufsize);
704 		kmem_free(name, mem_name.m_namelen);
705 		return (ENAMETOOLONG);
706 	}
707 
708 	if (copyoutstr(buf, (char *)mem_name.m_sid,
709 	    mem_name.m_sidlen, NULL) != 0) {
710 		kmem_free(buf, bufsize);
711 		kmem_free(name, mem_name.m_namelen);
712 		return (EFAULT);
713 	}
714 
715 	kmem_free(buf, bufsize);
716 	kmem_free(name, mem_name.m_namelen);
717 	return (0);
718 }
719 #endif	/* __sparc */
720 
721 /*
722  * Private ioctls for
723  *	libkvm to support kvm_physaddr().
724  *	FMA support for page_retire() and memory attribute information.
725  */
726 /*ARGSUSED*/
727 static int
728 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
729 {
730 	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
731 	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
732 		return (ENXIO);
733 
734 	switch (cmd) {
735 	case MEM_VTOP:
736 		return (mmioctl_vtop(data));
737 
738 	case MEM_PAGE_RETIRE:
739 	case MEM_PAGE_ISRETIRED:
740 	case MEM_PAGE_UNRETIRE:
741 	case MEM_PAGE_RETIRE_MCE:
742 	case MEM_PAGE_RETIRE_UE:
743 	case MEM_PAGE_GETERRORS:
744 	case MEM_PAGE_RETIRE_TEST:
745 		return (mmioctl_page_retire(cmd, data));
746 
747 	case MEM_PAGE_FMRI_RETIRE:
748 	case MEM_PAGE_FMRI_ISRETIRED:
749 		return (mmioctl_page_fmri_retire(cmd, data));
750 
751 #ifdef __sparc
752 	case MEM_NAME:
753 		return (mmioctl_get_mem_name(data));
754 
755 	case MEM_INFO:
756 		return (mmioctl_get_mem_info(data));
757 
758 	case MEM_SID:
759 		return (mmioctl_get_mem_sid(data));
760 #else
761 	case MEM_NAME:
762 	case MEM_INFO:
763 	case MEM_SID:
764 		return (ENOTSUP);
765 #endif	/* __sparc */
766 	}
767 	return (ENXIO);
768 }
769 
770 /*ARGSUSED2*/
771 static int
772 mmmmap(dev_t dev, off_t off, int prot)
773 {
774 	pfn_t pf;
775 	struct memlist *pmem;
776 	minor_t minor = getminor(dev);
777 
778 	switch (minor) {
779 	case M_MEM:
780 		pf = btop(off);
781 		memlist_read_lock();
782 		for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
783 			if (pf >= BTOP(pmem->address) &&
784 			    pf < BTOP(pmem->address + pmem->size)) {
785 				memlist_read_unlock();
786 				return (impl_obmem_pfnum(pf));
787 			}
788 		}
789 		memlist_read_unlock();
790 		break;
791 
792 	case M_KMEM:
793 	case M_ALLKMEM:
794 		/* no longer supported with KPR */
795 		return (-1);
796 
797 	case M_ZERO:
798 		/*
799 		 * We shouldn't be mmap'ing to /dev/zero here as
800 		 * mmsegmap() should have already converted
801 		 * a mapping request for this device to a mapping
802 		 * using seg_vn for anonymous memory.
803 		 */
804 		break;
805 
806 	}
807 	return (-1);
808 }
809 
810 /*
811  * This function is called when a memory device is mmap'ed.
812  * Set up the mapping to the correct device driver.
813  */
814 static int
815 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
816     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
817 {
818 	struct segvn_crargs vn_a;
819 	struct segdev_crargs dev_a;
820 	int error;
821 	minor_t minor;
822 	off_t i;
823 
824 	minor = getminor(dev);
825 
826 	as_rangelock(as);
827 	if ((flags & MAP_FIXED) == 0) {
828 		/*
829 		 * No need to worry about vac alignment on /dev/zero
830 		 * since this is a "clone" object that doesn't yet exist.
831 		 */
832 		map_addr(addrp, len, (offset_t)off,
833 				(minor == M_MEM) || (minor == M_KMEM), flags);
834 
835 		if (*addrp == NULL) {
836 			as_rangeunlock(as);
837 			return (ENOMEM);
838 		}
839 	} else {
840 		/*
841 		 * User specified address -
842 		 * Blow away any previous mappings.
843 		 */
844 		(void) as_unmap(as, *addrp, len);
845 	}
846 
847 	switch (minor) {
848 	case M_MEM:
849 		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
850 		if ((flags & MAP_TYPE) != MAP_SHARED) {
851 			as_rangeunlock(as);
852 			return (EINVAL);
853 		}
854 
855 		/*
856 		 * Check to ensure that the entire range is
857 		 * legal and we are not trying to map in
858 		 * more than the device will let us.
859 		 */
860 		for (i = 0; i < len; i += PAGESIZE) {
861 			if (mmmmap(dev, off + i, maxprot) == -1) {
862 				as_rangeunlock(as);
863 				return (ENXIO);
864 			}
865 		}
866 
867 		/*
868 		 * Use seg_dev segment driver for /dev/mem mapping.
869 		 */
870 		dev_a.mapfunc = mmmmap;
871 		dev_a.dev = dev;
872 		dev_a.offset = off;
873 		dev_a.type = (flags & MAP_TYPE);
874 		dev_a.prot = (uchar_t)prot;
875 		dev_a.maxprot = (uchar_t)maxprot;
876 		dev_a.hat_attr = 0;
877 
878 		/*
879 		 * Make /dev/mem mappings non-consistent since we can't
880 		 * alias pages that don't have page structs behind them,
881 		 * such as kernel stack pages. If someone mmap()s a kernel
882 		 * stack page and if we give him a tte with cv, a line from
883 		 * that page can get into both pages of the spitfire d$.
884 		 * But snoop from another processor will only invalidate
885 		 * the first page. This later caused kernel (xc_attention)
886 		 * to go into an infinite loop at pil 13 and no interrupts
887 		 * could come in. See 1203630.
888 		 *
889 		 */
890 		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
891 		dev_a.devmap_data = NULL;
892 
893 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
894 		break;
895 
896 	case M_ZERO:
897 		/*
898 		 * Use seg_vn segment driver for /dev/zero mapping.
899 		 * Passing in a NULL amp gives us the "cloning" effect.
900 		 */
901 		vn_a.vp = NULL;
902 		vn_a.offset = 0;
903 		vn_a.type = (flags & MAP_TYPE);
904 		vn_a.prot = prot;
905 		vn_a.maxprot = maxprot;
906 		vn_a.flags = flags & ~MAP_TYPE;
907 		vn_a.cred = cred;
908 		vn_a.amp = NULL;
909 		vn_a.szc = 0;
910 		vn_a.lgrp_mem_policy_flags = 0;
911 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
912 		break;
913 
914 	case M_KMEM:
915 	case M_ALLKMEM:
916 		/* No longer supported with KPR. */
917 		error = ENXIO;
918 		break;
919 
920 	case M_NULL:
921 		/*
922 		 * Use seg_dev segment driver for /dev/null mapping.
923 		 */
924 		dev_a.mapfunc = mmmmap;
925 		dev_a.dev = dev;
926 		dev_a.offset = off;
927 		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
928 		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
929 		dev_a.hat_attr = 0;
930 		dev_a.hat_flags = 0;
931 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
932 		break;
933 
934 	default:
935 		error = ENXIO;
936 	}
937 
938 	as_rangeunlock(as);
939 	return (error);
940 }
941 
/*
 * Character device entry points for the memory driver.  Operations the
 * driver does not implement are filled with nodev (or nulldev for close).
 */
static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP | D_64BIT | D_U64BIT	/* driver flags */
};
959 
/*
 * Device operations for the memory driver.  Attach and getinfo are
 * implemented; detach and reset are nodev.
 */
static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0	/* bus operations */
};
972 
/* Module linkage information: a device driver module using mm_ops. */
static struct modldrv modldrv = {
	&mod_driverops, "memory driver %I%", &mm_ops,
};
976 
/* Linkage passed to mod_install(), mod_info() and mod_remove(). */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
980 
981 int
982 _init(void)
983 {
984 	return (mod_install(&modlinkage));
985 }
986 
987 int
988 _info(struct modinfo *modinfop)
989 {
990 	return (mod_info(&modlinkage, modinfop));
991 }
992 
993 int
994 _fini(void)
995 {
996 	return (mod_remove(&modlinkage));
997 }
998 
999 static int
1000 mm_kstat_update(kstat_t *ksp, int rw)
1001 {
1002 	struct memlist *pmem;
1003 	uint_t count;
1004 
1005 	if (rw == KSTAT_WRITE)
1006 		return (EACCES);
1007 
1008 	count = 0;
1009 	memlist_read_lock();
1010 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
1011 		count++;
1012 	}
1013 	memlist_read_unlock();
1014 
1015 	ksp->ks_ndata = count;
1016 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1017 
1018 	return (0);
1019 }
1020 
1021 static int
1022 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1023 {
1024 	struct memlist *pmem;
1025 	struct memunit {
1026 		uint64_t address;
1027 		uint64_t size;
1028 	} *kspmem;
1029 
1030 	if (rw == KSTAT_WRITE)
1031 		return (EACCES);
1032 
1033 	ksp->ks_snaptime = gethrtime();
1034 
1035 	kspmem = (struct memunit *)buf;
1036 	memlist_read_lock();
1037 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next, kspmem++) {
1038 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1039 			break;
1040 		kspmem->address = pmem->address;
1041 		kspmem->size = pmem->size;
1042 	}
1043 	memlist_read_unlock();
1044 
1045 	return (0);
1046 }
1047 
1048 /*
1049  * Read a mem_name_t from user-space and store it in the mem_name_t
1050  * pointed to by the mem_name argument.
1051  */
1052 static int
1053 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1054 {
1055 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1056 		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1057 			return (EFAULT);
1058 	}
1059 #ifdef	_SYSCALL32
1060 	else {
1061 		mem_name32_t mem_name32;
1062 
1063 		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1064 			return (EFAULT);
1065 		mem_name->m_addr = mem_name32.m_addr;
1066 		mem_name->m_synd = mem_name32.m_synd;
1067 		mem_name->m_type[0] = mem_name32.m_type[0];
1068 		mem_name->m_type[1] = mem_name32.m_type[1];
1069 		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
1070 		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1071 		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
1072 		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1073 	}
1074 #endif	/* _SYSCALL32 */
1075 
1076 	return (0);
1077 }
1078 
1079 /*
1080  * Read a mem_page_t from user-space and store it in the mem_page_t
1081  * pointed to by the mpage argument.
1082  */
1083 static int
1084 mm_read_mem_page(intptr_t data, mem_page_t *mpage)
1085 {
1086 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1087 		if (copyin((void *)data, mpage, sizeof (mem_page_t)) != 0)
1088 			return (EFAULT);
1089 	}
1090 #ifdef _SYSCALL32
1091 	else {
1092 		mem_page32_t	mpage32;
1093 
1094 		if (copyin((void *)data, &mpage32, sizeof (mem_page32_t)) != 0)
1095 			return (EFAULT);
1096 
1097 		mpage->m_fmri = (caddr_t)(uintptr_t)mpage32.m_fmri;
1098 		mpage->m_fmrisz = mpage32.m_fmrisz;
1099 	}
1100 #endif	/* _SYSCALL32 */
1101 
1102 	return (0);
1103 }
1104 
1105 /*
1106  * Expand an FMRI from a mem_page_t.
1107  */
1108 static int
1109 mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl)
1110 {
1111 	char *buf;
1112 	int err;
1113 
1114 	if (mpage->m_fmri == NULL || mpage->m_fmrisz > MEM_FMRI_MAX_BUFSIZE)
1115 		return (EINVAL);
1116 
1117 	buf = kmem_alloc(mpage->m_fmrisz, KM_SLEEP);
1118 	if (copyin(mpage->m_fmri, buf, mpage->m_fmrisz) != 0) {
1119 		kmem_free(buf, mpage->m_fmrisz);
1120 		return (EFAULT);
1121 	}
1122 
1123 	err = nvlist_unpack(buf, mpage->m_fmrisz, nvl, KM_SLEEP);
1124 	kmem_free(buf, mpage->m_fmrisz);
1125 
1126 	return (err);
1127 }
1128 
1129 static int
1130 mm_get_paddr(nvlist_t *nvl, uint64_t *paddr)
1131 {
1132 	uint8_t version;
1133 	uint64_t pa;
1134 	char *scheme;
1135 #ifdef __sparc
1136 	uint64_t offset;
1137 	char *unum;
1138 	char **serids;
1139 	uint_t nserids;
1140 	int err;
1141 #endif
1142 
1143 	/* Verify FMRI scheme name and version number */
1144 	if ((nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &scheme) != 0) ||
1145 	    (strcmp(scheme, FM_FMRI_SCHEME_MEM) != 0) ||
1146 	    (nvlist_lookup_uint8(nvl, FM_VERSION, &version) != 0) ||
1147 	    version > FM_MEM_SCHEME_VERSION) {
1148 		return (EINVAL);
1149 	}
1150 
1151 	/*
1152 	 * There are two ways a physical address can be  obtained from a mem
1153 	 * scheme FMRI.  One way is to use the "offset" and  "serial"
1154 	 * members, if they are present, together with the "unum" member to
1155 	 * calculate a physical address.  This is the preferred way since
1156 	 * it is independent of possible changes to the programming of
1157 	 * underlying hardware registers that may change the physical address.
1158 	 * If the "offset" member is not present, then the address is
1159 	 * retrieved from the "physaddr" member.
1160 	 */
1161 #if defined(__sparc)
1162 	if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_OFFSET, &offset) != 0) {
1163 		if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) !=
1164 		    0) {
1165 			return (EINVAL);
1166 		}
1167 	} else if (nvlist_lookup_string(nvl, FM_FMRI_MEM_UNUM, &unum) != 0 ||
1168 	    nvlist_lookup_string_array(nvl, FM_FMRI_MEM_SERIAL_ID, &serids,
1169 	    &nserids) != 0) {
1170 		return (EINVAL);
1171 	} else {
1172 		if ((err = cpu_get_mem_addr(unum, serids[0], offset, &pa)) != 0)
1173 			return (err);
1174 	}
1175 #elif defined(__i386) || defined(__amd64)
1176 	if (cmi_mc_unumtopa(NULL, nvl, &pa) == 0)
1177 		return (EINVAL);
1178 #else
1179 #error "port me"
1180 #endif /* __sparc */
1181 
1182 	*paddr = pa;
1183 	return (0);
1184 }
1185