xref: /illumos-gate/usr/src/uts/common/io/mem.c (revision 628e3cbed6489fa1db545d8524a06cd6535af456)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Memory special file
28  */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/user.h>
33 #include <sys/buf.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/vm.h>
37 #include <sys/uio.h>
38 #include <sys/mman.h>
39 #include <sys/kmem.h>
40 #include <vm/seg.h>
41 #include <vm/page.h>
42 #include <sys/stat.h>
43 #include <sys/vmem.h>
44 #include <sys/memlist.h>
45 #include <sys/bootconf.h>
46 
47 #include <vm/seg_vn.h>
48 #include <vm/seg_dev.h>
49 #include <vm/seg_kmem.h>
50 #include <vm/seg_kp.h>
51 #include <vm/seg_kpm.h>
52 #include <vm/hat.h>
53 
54 #include <sys/conf.h>
55 #include <sys/mem.h>
56 #include <sys/types.h>
57 #include <sys/conf.h>
58 #include <sys/param.h>
59 #include <sys/systm.h>
60 #include <sys/errno.h>
61 #include <sys/modctl.h>
62 #include <sys/memlist.h>
63 #include <sys/ddi.h>
64 #include <sys/sunddi.h>
65 #include <sys/debug.h>
66 #include <sys/fm/protocol.h>
67 
68 #if defined(__sparc)
69 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
70 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
71     uint64_t *, int *, int *, int *);
72 extern size_t cpu_get_name_bufsize(void);
73 extern int cpu_get_mem_sid(char *, char *, int, int *);
74 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
75 #elif defined(__x86)
76 #include <sys/cpu_module.h>
77 #endif	/* __sparc */
78 
79 /*
80  * Turn a byte length into a pagecount.  The DDI btop takes a
81  * 32-bit size on 32-bit machines, this handles 64-bit sizes for
82  * large physical-memory 32-bit machines.
83  */
84 #define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
85 
86 static kmutex_t mm_lock;
87 static caddr_t mm_map;
88 
89 static dev_info_t *mm_dip;	/* private copy of devinfo pointer */
90 
91 static int mm_kmem_io_access;
92 
93 static int mm_kstat_update(kstat_t *ksp, int rw);
94 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
95 
96 static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
97 
98 /*ARGSUSED1*/
99 static int
100 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
101 {
102 	int i;
103 	struct mem_minor {
104 		char *name;
105 		minor_t minor;
106 		int privonly;
107 		const char *rdpriv;
108 		const char *wrpriv;
109 		mode_t priv_mode;
110 	} mm[] = {
111 		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
112 		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
113 		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
114 		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
115 		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
116 	};
117 	kstat_t *ksp;
118 
119 	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
120 	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
121 
122 	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
123 		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
124 		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
125 		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
126 		    DDI_FAILURE) {
127 			ddi_remove_minor_node(devi, NULL);
128 			return (DDI_FAILURE);
129 		}
130 	}
131 
132 	mm_dip = devi;
133 
134 	ksp = kstat_create("mm", 0, "phys_installed", "misc",
135 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
136 	if (ksp != NULL) {
137 		ksp->ks_update = mm_kstat_update;
138 		ksp->ks_snapshot = mm_kstat_snapshot;
139 		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
140 		kstat_install(ksp);
141 	}
142 
143 	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
144 	    "kmem_io_access", 0);
145 
146 	return (DDI_SUCCESS);
147 }
148 
149 /*ARGSUSED*/
150 static int
151 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
152 {
153 	register int error;
154 
155 	switch (infocmd) {
156 	case DDI_INFO_DEVT2DEVINFO:
157 		*result = (void *)mm_dip;
158 		error = DDI_SUCCESS;
159 		break;
160 	case DDI_INFO_DEVT2INSTANCE:
161 		*result = (void *)0;
162 		error = DDI_SUCCESS;
163 		break;
164 	default:
165 		error = DDI_FAILURE;
166 	}
167 	return (error);
168 }
169 
170 /*ARGSUSED1*/
171 static int
172 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
173 {
174 	switch (getminor(*devp)) {
175 	case M_NULL:
176 	case M_ZERO:
177 	case M_MEM:
178 	case M_KMEM:
179 	case M_ALLKMEM:
180 		/* standard devices */
181 		break;
182 
183 	default:
184 		/* Unsupported or unknown type */
185 		return (EINVAL);
186 	}
187 	/* must be character device */
188 	if (typ != OTYP_CHR)
189 		return (EINVAL);
190 	return (0);
191 }
192 
193 struct pollhead	mm_pollhd;
194 
195 /*ARGSUSED*/
196 static int
197 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
198     struct pollhead **phpp)
199 {
200 	switch (getminor(dev)) {
201 	case M_NULL:
202 	case M_ZERO:
203 	case M_MEM:
204 	case M_KMEM:
205 	case M_ALLKMEM:
206 		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
207 		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
208 		/*
209 		 * A non NULL pollhead pointer should be returned in case
210 		 * user polls for 0 events.
211 		 */
212 		*phpp = !anyyet && !*reventsp ?
213 		    &mm_pollhd : (struct pollhead *)NULL;
214 		return (0);
215 	default:
216 		/* no other devices currently support polling */
217 		return (ENXIO);
218 	}
219 }
220 
221 static int
222 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
223     char *name, caddr_t valuep, int *lengthp)
224 {
225 	/*
226 	 * implement zero size to reduce overhead (avoid two failing
227 	 * property lookups per stat).
228 	 */
229 	return (ddi_prop_op_size(dev, dip, prop_op,
230 	    flags, name, valuep, lengthp, 0));
231 }
232 
233 extern void mach_sync_icache_pa(caddr_t, size_t);
234 #pragma weak mach_sync_icache_pa
235 
236 static int
237 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
238 {
239 	int error = 0;
240 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
241 	    (size_t)uio->uio_iov->iov_len);
242 
243 	mutex_enter(&mm_lock);
244 	hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
245 	    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
246 	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
247 
248 	if (!pf_is_memory(pfn)) {
249 		if (allowio) {
250 			size_t c = uio->uio_iov->iov_len;
251 
252 			if (ddi_peekpokeio(NULL, uio, rw,
253 			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
254 			    sizeof (int32_t)) != DDI_SUCCESS)
255 				error = EFAULT;
256 		} else
257 			error = EIO;
258 	} else {
259 		error = uiomove(&mm_map[pageoff], nbytes, rw, uio);
260 
261 		/*
262 		 * In case this has changed executable code,
263 		 * non-coherent I-caches must be flushed.
264 		 */
265 		if (rw != UIO_READ && &mach_sync_icache_pa != NULL) {
266 			mach_sync_icache_pa((caddr_t)ptob(pfn), PAGESIZE);
267 		}
268 	}
269 
270 	hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
271 	mutex_exit(&mm_lock);
272 	return (error);
273 }
274 
275 static int
276 mmpagelock(struct as *as, caddr_t va)
277 {
278 	struct seg *seg;
279 	int i;
280 
281 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
282 	seg = as_segat(as, va);
283 	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
284 	AS_LOCK_EXIT(as, &as->a_lock);
285 
286 	return (i);
287 }
288 
289 #ifdef	__sparc
290 
291 #define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)
292 
293 #else	/* __i386, __amd64 */
294 
295 #define	NEED_LOCK_KVADDR(va)	0
296 
297 #endif	/* __sparc */
298 
299 /*ARGSUSED3*/
300 static int
301 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
302 {
303 	pfn_t v;
304 	struct iovec *iov;
305 	int error = 0;
306 	size_t c;
307 	ssize_t oresid = uio->uio_resid;
308 	minor_t minor = getminor(dev);
309 
310 	while (uio->uio_resid > 0 && error == 0) {
311 		iov = uio->uio_iov;
312 		if (iov->iov_len == 0) {
313 			uio->uio_iov++;
314 			uio->uio_iovcnt--;
315 			if (uio->uio_iovcnt < 0)
316 				panic("mmrw");
317 			continue;
318 		}
319 		switch (minor) {
320 
321 		case M_MEM:
322 			memlist_read_lock();
323 			if (!address_in_memlist(phys_install,
324 			    (uint64_t)uio->uio_loffset, 1)) {
325 				memlist_read_unlock();
326 				error = EFAULT;
327 				break;
328 			}
329 			memlist_read_unlock();
330 
331 			v = BTOP((u_offset_t)uio->uio_loffset);
332 			error = mmio(uio, rw, v,
333 			    uio->uio_loffset & PAGEOFFSET, 0);
334 			break;
335 
336 		case M_KMEM:
337 		case M_ALLKMEM:
338 			{
339 			page_t **ppp;
340 			caddr_t vaddr = (caddr_t)uio->uio_offset;
341 			int try_lock = NEED_LOCK_KVADDR(vaddr);
342 			int locked = 0;
343 
344 			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
345 				break;
346 
347 			/*
348 			 * If vaddr does not map a valid page, as_pagelock()
349 			 * will return failure. Hence we can't check the
350 			 * return value and return EFAULT here as we'd like.
351 			 * seg_kp and seg_kpm do not properly support
352 			 * as_pagelock() for this context so we avoid it
353 			 * using the try_lock set check above.  Some day when
354 			 * the kernel page locking gets redesigned all this
355 			 * muck can be cleaned up.
356 			 */
357 			if (try_lock)
358 				locked = (as_pagelock(&kas, &ppp, vaddr,
359 				    PAGESIZE, S_WRITE) == 0);
360 
361 			v = hat_getpfnum(kas.a_hat,
362 			    (caddr_t)(uintptr_t)uio->uio_loffset);
363 			if (v == PFN_INVALID) {
364 				if (locked)
365 					as_pageunlock(&kas, ppp, vaddr,
366 					    PAGESIZE, S_WRITE);
367 				error = EFAULT;
368 				break;
369 			}
370 
371 			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
372 			    minor == M_ALLKMEM || mm_kmem_io_access);
373 			if (locked)
374 				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
375 				    S_WRITE);
376 			}
377 
378 			break;
379 
380 		case M_ZERO:
381 			if (rw == UIO_READ) {
382 				label_t ljb;
383 
384 				if (on_fault(&ljb)) {
385 					no_fault();
386 					error = EFAULT;
387 					break;
388 				}
389 				uzero(iov->iov_base, iov->iov_len);
390 				no_fault();
391 				uio->uio_resid -= iov->iov_len;
392 				uio->uio_loffset += iov->iov_len;
393 				break;
394 			}
395 			/* else it's a write, fall through to NULL case */
396 			/*FALLTHROUGH*/
397 
398 		case M_NULL:
399 			if (rw == UIO_READ)
400 				return (0);
401 			c = iov->iov_len;
402 			iov->iov_base += c;
403 			iov->iov_len -= c;
404 			uio->uio_loffset += c;
405 			uio->uio_resid -= c;
406 			break;
407 
408 		}
409 	}
410 	return (uio->uio_resid == oresid ? error : 0);
411 }
412 
413 static int
414 mmread(dev_t dev, struct uio *uio, cred_t *cred)
415 {
416 	return (mmrw(dev, uio, UIO_READ, cred));
417 }
418 
419 static int
420 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
421 {
422 	return (mmrw(dev, uio, UIO_WRITE, cred));
423 }
424 
425 /*
426  * Private ioctl for libkvm to support kvm_physaddr().
427  * Given an address space and a VA, compute the PA.
428  */
429 static int
430 mmioctl_vtop(intptr_t data)
431 {
432 #ifdef _SYSCALL32
433 	mem_vtop32_t vtop32;
434 #endif
435 	mem_vtop_t mem_vtop;
436 	proc_t *p;
437 	pfn_t pfn = (pfn_t)PFN_INVALID;
438 	pid_t pid = 0;
439 	struct as *as;
440 	struct seg *seg;
441 
442 	if (get_udatamodel() == DATAMODEL_NATIVE) {
443 		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
444 			return (EFAULT);
445 	}
446 #ifdef _SYSCALL32
447 	else {
448 		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
449 			return (EFAULT);
450 		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
451 		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;
452 
453 		if (mem_vtop.m_as != NULL)
454 			return (EINVAL);
455 	}
456 #endif
457 
458 	if (mem_vtop.m_as == &kas) {
459 		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
460 	} else {
461 		if (mem_vtop.m_as == NULL) {
462 			/*
463 			 * Assume the calling process's address space if the
464 			 * caller didn't specify one.
465 			 */
466 			p = curthread->t_procp;
467 			if (p == NULL)
468 				return (EIO);
469 			mem_vtop.m_as = p->p_as;
470 		}
471 
472 		mutex_enter(&pidlock);
473 		for (p = practive; p != NULL; p = p->p_next) {
474 			if (p->p_as == mem_vtop.m_as) {
475 				pid = p->p_pid;
476 				break;
477 			}
478 		}
479 		mutex_exit(&pidlock);
480 		if (p == NULL)
481 			return (EIO);
482 		p = sprlock(pid);
483 		if (p == NULL)
484 			return (EIO);
485 		as = p->p_as;
486 		if (as == mem_vtop.m_as) {
487 			mutex_exit(&p->p_lock);
488 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
489 			for (seg = AS_SEGFIRST(as); seg != NULL;
490 			    seg = AS_SEGNEXT(as, seg))
491 				if ((uintptr_t)mem_vtop.m_va -
492 				    (uintptr_t)seg->s_base < seg->s_size)
493 					break;
494 			if (seg != NULL)
495 				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
496 			AS_LOCK_EXIT(as, &as->a_lock);
497 			mutex_enter(&p->p_lock);
498 		}
499 		sprunlock(p);
500 	}
501 	mem_vtop.m_pfn = pfn;
502 	if (pfn == PFN_INVALID)
503 		return (EIO);
504 
505 	if (get_udatamodel() == DATAMODEL_NATIVE) {
506 		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
507 			return (EFAULT);
508 	}
509 #ifdef _SYSCALL32
510 	else {
511 		vtop32.m_pfn = mem_vtop.m_pfn;
512 		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
513 			return (EFAULT);
514 	}
515 #endif
516 
517 	return (0);
518 }
519 
520 /*
521  * Given a PA, execute the given page retire command on it.
522  */
523 static int
524 mmioctl_page_retire(int cmd, intptr_t data)
525 {
526 	extern int page_retire_test(void);
527 	uint64_t pa;
528 
529 	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
530 		return (EFAULT);
531 	}
532 
533 	switch (cmd) {
534 	case MEM_PAGE_ISRETIRED:
535 		return (page_retire_check(pa, NULL));
536 
537 	case MEM_PAGE_UNRETIRE:
538 		return (page_unretire(pa));
539 
540 	case MEM_PAGE_RETIRE:
541 		return (page_retire(pa, PR_FMA));
542 
543 	case MEM_PAGE_RETIRE_MCE:
544 		return (page_retire(pa, PR_MCE));
545 
546 	case MEM_PAGE_RETIRE_UE:
547 		return (page_retire(pa, PR_UE));
548 
549 	case MEM_PAGE_GETERRORS:
550 		{
551 			uint64_t page_errors;
552 			int rc = page_retire_check(pa, &page_errors);
553 			if (copyout(&page_errors, (void *)data,
554 			    sizeof (uint64_t))) {
555 				return (EFAULT);
556 			}
557 			return (rc);
558 		}
559 
560 	case MEM_PAGE_RETIRE_TEST:
561 		return (page_retire_test());
562 
563 	}
564 
565 	return (EINVAL);
566 }
567 
568 #ifdef __sparc
569 /*
570  * Given a syndrome, syndrome type, and address return the
571  * associated memory name in the provided data buffer.
572  */
573 static int
574 mmioctl_get_mem_name(intptr_t data)
575 {
576 	mem_name_t mem_name;
577 	void *buf;
578 	size_t bufsize;
579 	int len, err;
580 
581 	if ((bufsize = cpu_get_name_bufsize()) == 0)
582 		return (ENOTSUP);
583 
584 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
585 		return (err);
586 
587 	buf = kmem_alloc(bufsize, KM_SLEEP);
588 
589 	/*
590 	 * Call into cpu specific code to do the lookup.
591 	 */
592 	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
593 	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
594 		kmem_free(buf, bufsize);
595 		return (err);
596 	}
597 
598 	if (len >= mem_name.m_namelen) {
599 		kmem_free(buf, bufsize);
600 		return (ENOSPC);
601 	}
602 
603 	if (copyoutstr(buf, (char *)mem_name.m_name,
604 	    mem_name.m_namelen, NULL) != 0) {
605 		kmem_free(buf, bufsize);
606 		return (EFAULT);
607 	}
608 
609 	kmem_free(buf, bufsize);
610 	return (0);
611 }
612 
613 /*
614  * Given a syndrome and address return information about the associated memory.
615  */
616 static int
617 mmioctl_get_mem_info(intptr_t data)
618 {
619 	mem_info_t mem_info;
620 	int err;
621 
622 	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
623 		return (EFAULT);
624 
625 	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
626 	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
627 	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
628 		return (err);
629 
630 	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
631 		return (EFAULT);
632 
633 	return (0);
634 }
635 
636 /*
637  * Given a memory name, return its associated serial id
638  */
639 static int
640 mmioctl_get_mem_sid(intptr_t data)
641 {
642 	mem_name_t mem_name;
643 	void *buf;
644 	void *name;
645 	size_t	name_len;
646 	size_t bufsize;
647 	int len, err;
648 
649 	if ((bufsize = cpu_get_name_bufsize()) == 0)
650 		return (ENOTSUP);
651 
652 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
653 		return (err);
654 
655 	buf = kmem_alloc(bufsize, KM_SLEEP);
656 
657 	if (mem_name.m_namelen > 1024)
658 		mem_name.m_namelen = 1024; /* cap at 1024 bytes */
659 
660 	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
661 
662 	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
663 	    mem_name.m_namelen, &name_len)) != 0) {
664 		kmem_free(buf, bufsize);
665 		kmem_free(name, mem_name.m_namelen);
666 		return (err);
667 	}
668 
669 	/*
670 	 * Call into cpu specific code to do the lookup.
671 	 */
672 	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
673 		kmem_free(buf, bufsize);
674 		kmem_free(name, mem_name.m_namelen);
675 		return (err);
676 	}
677 
678 	if (len > mem_name.m_sidlen) {
679 		kmem_free(buf, bufsize);
680 		kmem_free(name, mem_name.m_namelen);
681 		return (ENAMETOOLONG);
682 	}
683 
684 	if (copyoutstr(buf, (char *)mem_name.m_sid,
685 	    mem_name.m_sidlen, NULL) != 0) {
686 		kmem_free(buf, bufsize);
687 		kmem_free(name, mem_name.m_namelen);
688 		return (EFAULT);
689 	}
690 
691 	kmem_free(buf, bufsize);
692 	kmem_free(name, mem_name.m_namelen);
693 	return (0);
694 }
695 #endif	/* __sparc */
696 
697 /*
698  * Private ioctls for
699  *	libkvm to support kvm_physaddr().
700  *	FMA support for page_retire() and memory attribute information.
701  */
702 /*ARGSUSED*/
703 static int
704 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
705 {
706 	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
707 	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
708 		return (ENXIO);
709 
710 	switch (cmd) {
711 	case MEM_VTOP:
712 		return (mmioctl_vtop(data));
713 
714 	case MEM_PAGE_RETIRE:
715 	case MEM_PAGE_ISRETIRED:
716 	case MEM_PAGE_UNRETIRE:
717 	case MEM_PAGE_RETIRE_MCE:
718 	case MEM_PAGE_RETIRE_UE:
719 	case MEM_PAGE_GETERRORS:
720 	case MEM_PAGE_RETIRE_TEST:
721 		return (mmioctl_page_retire(cmd, data));
722 
723 #ifdef __sparc
724 	case MEM_NAME:
725 		return (mmioctl_get_mem_name(data));
726 
727 	case MEM_INFO:
728 		return (mmioctl_get_mem_info(data));
729 
730 	case MEM_SID:
731 		return (mmioctl_get_mem_sid(data));
732 #else
733 	case MEM_NAME:
734 	case MEM_INFO:
735 	case MEM_SID:
736 		return (ENOTSUP);
737 #endif	/* __sparc */
738 	}
739 	return (ENXIO);
740 }
741 
742 /*ARGSUSED2*/
743 static int
744 mmmmap(dev_t dev, off_t off, int prot)
745 {
746 	pfn_t pf;
747 	struct memlist *pmem;
748 	minor_t minor = getminor(dev);
749 
750 	switch (minor) {
751 	case M_MEM:
752 		pf = btop(off);
753 		memlist_read_lock();
754 		for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
755 			if (pf >= BTOP(pmem->address) &&
756 			    pf < BTOP(pmem->address + pmem->size)) {
757 				memlist_read_unlock();
758 				return (impl_obmem_pfnum(pf));
759 			}
760 		}
761 		memlist_read_unlock();
762 		break;
763 
764 	case M_KMEM:
765 	case M_ALLKMEM:
766 		/* no longer supported with KPR */
767 		return (-1);
768 
769 	case M_ZERO:
770 		/*
771 		 * We shouldn't be mmap'ing to /dev/zero here as
772 		 * mmsegmap() should have already converted
773 		 * a mapping request for this device to a mapping
774 		 * using seg_vn for anonymous memory.
775 		 */
776 		break;
777 
778 	}
779 	return (-1);
780 }
781 
782 /*
783  * This function is called when a memory device is mmap'ed.
784  * Set up the mapping to the correct device driver.
785  */
786 static int
787 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
788     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
789 {
790 	struct segvn_crargs vn_a;
791 	struct segdev_crargs dev_a;
792 	int error;
793 	minor_t minor;
794 	off_t i;
795 
796 	minor = getminor(dev);
797 
798 	as_rangelock(as);
799 	/*
800 	 * No need to worry about vac alignment on /dev/zero
801 	 * since this is a "clone" object that doesn't yet exist.
802 	 */
803 	error = choose_addr(as, addrp, len, off,
804 	    (minor == M_MEM) || (minor == M_KMEM), flags);
805 	if (error != 0) {
806 		as_rangeunlock(as);
807 		return (error);
808 	}
809 
810 	switch (minor) {
811 	case M_MEM:
812 		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
813 		if ((flags & MAP_TYPE) != MAP_SHARED) {
814 			as_rangeunlock(as);
815 			return (EINVAL);
816 		}
817 
818 		/*
819 		 * Check to ensure that the entire range is
820 		 * legal and we are not trying to map in
821 		 * more than the device will let us.
822 		 */
823 		for (i = 0; i < len; i += PAGESIZE) {
824 			if (mmmmap(dev, off + i, maxprot) == -1) {
825 				as_rangeunlock(as);
826 				return (ENXIO);
827 			}
828 		}
829 
830 		/*
831 		 * Use seg_dev segment driver for /dev/mem mapping.
832 		 */
833 		dev_a.mapfunc = mmmmap;
834 		dev_a.dev = dev;
835 		dev_a.offset = off;
836 		dev_a.type = (flags & MAP_TYPE);
837 		dev_a.prot = (uchar_t)prot;
838 		dev_a.maxprot = (uchar_t)maxprot;
839 		dev_a.hat_attr = 0;
840 
841 		/*
842 		 * Make /dev/mem mappings non-consistent since we can't
843 		 * alias pages that don't have page structs behind them,
844 		 * such as kernel stack pages. If someone mmap()s a kernel
845 		 * stack page and if we give him a tte with cv, a line from
846 		 * that page can get into both pages of the spitfire d$.
847 		 * But snoop from another processor will only invalidate
848 		 * the first page. This later caused kernel (xc_attention)
849 		 * to go into an infinite loop at pil 13 and no interrupts
850 		 * could come in. See 1203630.
851 		 *
852 		 */
853 		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
854 		dev_a.devmap_data = NULL;
855 
856 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
857 		break;
858 
859 	case M_ZERO:
860 		/*
861 		 * Use seg_vn segment driver for /dev/zero mapping.
862 		 * Passing in a NULL amp gives us the "cloning" effect.
863 		 */
864 		vn_a.vp = NULL;
865 		vn_a.offset = 0;
866 		vn_a.type = (flags & MAP_TYPE);
867 		vn_a.prot = prot;
868 		vn_a.maxprot = maxprot;
869 		vn_a.flags = flags & ~MAP_TYPE;
870 		vn_a.cred = cred;
871 		vn_a.amp = NULL;
872 		vn_a.szc = 0;
873 		vn_a.lgrp_mem_policy_flags = 0;
874 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
875 		break;
876 
877 	case M_KMEM:
878 	case M_ALLKMEM:
879 		/* No longer supported with KPR. */
880 		error = ENXIO;
881 		break;
882 
883 	case M_NULL:
884 		/*
885 		 * Use seg_dev segment driver for /dev/null mapping.
886 		 */
887 		dev_a.mapfunc = mmmmap;
888 		dev_a.dev = dev;
889 		dev_a.offset = off;
890 		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
891 		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
892 		dev_a.hat_attr = 0;
893 		dev_a.hat_flags = 0;
894 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
895 		break;
896 
897 	default:
898 		error = ENXIO;
899 	}
900 
901 	as_rangeunlock(as);
902 	return (error);
903 }
904 
905 static struct cb_ops mm_cb_ops = {
906 	mmopen,			/* open */
907 	nulldev,		/* close */
908 	nodev,			/* strategy */
909 	nodev,			/* print */
910 	nodev,			/* dump */
911 	mmread,			/* read */
912 	mmwrite,		/* write */
913 	mmioctl,		/* ioctl */
914 	nodev,			/* devmap */
915 	mmmmap,			/* mmap */
916 	mmsegmap,		/* segmap */
917 	mmchpoll,		/* poll */
918 	mmpropop,		/* prop_op */
919 	0,			/* streamtab  */
920 	D_NEW | D_MP | D_64BIT | D_U64BIT
921 };
922 
923 static struct dev_ops mm_ops = {
924 	DEVO_REV,		/* devo_rev, */
925 	0,			/* refcnt  */
926 	mm_info,		/* get_dev_info */
927 	nulldev,		/* identify */
928 	nulldev,		/* probe */
929 	mm_attach,		/* attach */
930 	nodev,			/* detach */
931 	nodev,			/* reset */
932 	&mm_cb_ops,		/* driver operations */
933 	(struct bus_ops *)0,	/* bus operations */
934 	NULL,			/* power */
935 	ddi_quiesce_not_needed,		/* quiesce */
936 };
937 
938 static struct modldrv modldrv = {
939 	&mod_driverops, "memory driver", &mm_ops,
940 };
941 
942 static struct modlinkage modlinkage = {
943 	MODREV_1, &modldrv, NULL
944 };
945 
946 int
947 _init(void)
948 {
949 	return (mod_install(&modlinkage));
950 }
951 
952 int
953 _info(struct modinfo *modinfop)
954 {
955 	return (mod_info(&modlinkage, modinfop));
956 }
957 
958 int
959 _fini(void)
960 {
961 	return (mod_remove(&modlinkage));
962 }
963 
964 static int
965 mm_kstat_update(kstat_t *ksp, int rw)
966 {
967 	struct memlist *pmem;
968 	uint_t count;
969 
970 	if (rw == KSTAT_WRITE)
971 		return (EACCES);
972 
973 	count = 0;
974 	memlist_read_lock();
975 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
976 		count++;
977 	}
978 	memlist_read_unlock();
979 
980 	ksp->ks_ndata = count;
981 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
982 
983 	return (0);
984 }
985 
986 static int
987 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
988 {
989 	struct memlist *pmem;
990 	struct memunit {
991 		uint64_t address;
992 		uint64_t size;
993 	} *kspmem;
994 
995 	if (rw == KSTAT_WRITE)
996 		return (EACCES);
997 
998 	ksp->ks_snaptime = gethrtime();
999 
1000 	kspmem = (struct memunit *)buf;
1001 	memlist_read_lock();
1002 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next, kspmem++) {
1003 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1004 			break;
1005 		kspmem->address = pmem->address;
1006 		kspmem->size = pmem->size;
1007 	}
1008 	memlist_read_unlock();
1009 
1010 	return (0);
1011 }
1012 
1013 /*
1014  * Read a mem_name_t from user-space and store it in the mem_name_t
1015  * pointed to by the mem_name argument.
1016  */
1017 static int
1018 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1019 {
1020 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1021 		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1022 			return (EFAULT);
1023 	}
1024 #ifdef	_SYSCALL32
1025 	else {
1026 		mem_name32_t mem_name32;
1027 
1028 		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1029 			return (EFAULT);
1030 		mem_name->m_addr = mem_name32.m_addr;
1031 		mem_name->m_synd = mem_name32.m_synd;
1032 		mem_name->m_type[0] = mem_name32.m_type[0];
1033 		mem_name->m_type[1] = mem_name32.m_type[1];
1034 		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
1035 		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1036 		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
1037 		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1038 	}
1039 #endif	/* _SYSCALL32 */
1040 
1041 	return (0);
1042 }
1043