xref: /illumos-gate/usr/src/uts/intel/io/vmm/seg_vmm.c (revision cb1bb6c32d034ea24e8549ef763c9c2b79413eb8)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2018 Joyent, Inc.
14  * Copyright 2021 Oxide Computer Company
15  */
16 
/*
 * segvmm - Virtual-Machine-Memory segment
 *
 * The vmm segment driver was designed for mapping regions of kernel memory
 * allocated to an HVM instance into userspace for manipulation there.  It
 * draws direct lineage from the umap segment driver, but is meant for larger
 * mappings with fewer restrictions.
 *
 * seg*k*vmm, in contrast, has mappings for every VMM into kas.  Those
 * mappings are not consulted by the code in this file: the PFNs loaded at
 * fault time are obtained from the backing vm_object or vm_client (see
 * segvmm_fault_obj() and segvmm_fault_space()).
 */
28 
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/errno.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/lgrp.h>
36 #include <sys/mman.h>
37 
38 #include <vm/hat.h>
39 #include <vm/hat_pte.h>
40 #include <vm/htable.h>
41 #include <vm/as.h>
42 #include <vm/seg.h>
43 #include <vm/seg_kmem.h>
44 
45 #include <sys/seg_vmm.h>
46 
/*
 * Per-segment private data, hung off of seg->s_data.  Exactly one of svmd_vmo
 * and svmd_vmc is non-NULL for a given segment (see segvmm_create()).
 */
typedef struct segvmm_data {
	/* Held as writer while faulting, as reader when reporting prot */
	krwlock_t	svmd_lock;
	/* Backing object for direct vm_object mappings (reference held) */
	vm_object_t	*svmd_vmo;
	/* Client handle for vmspace mappings */
	vm_client_t	*svmd_vmc;
	/* Page-aligned offset in the backing store corresponding to s_base */
	uintptr_t	svmd_off;
	/* Protection for the mapping, stored without PROT_USER */
	uchar_t		svmd_prot;
	/* Pages accounted by F_SOFTLOCK and not yet undone by F_SOFTUNLOCK */
	size_t		svmd_softlockcnt;
} segvmm_data_t;
55 
56 
/*
 * Forward declarations of the seg_ops entry points implemented in this file.
 */
static int segvmm_dup(struct seg *, struct seg *);
static int segvmm_unmap(struct seg *, caddr_t, size_t);
static void segvmm_free(struct seg *);
static faultcode_t segvmm_fault(struct hat *, struct seg *, caddr_t, size_t,
    enum fault_type, enum seg_rw);
static faultcode_t segvmm_faulta(struct seg *, caddr_t);
static int segvmm_setprot(struct seg *, caddr_t, size_t, uint_t);
static int segvmm_checkprot(struct seg *, caddr_t, size_t, uint_t);
static int segvmm_sync(struct seg *, caddr_t, size_t, int, uint_t);
static size_t segvmm_incore(struct seg *, caddr_t, size_t, char *);
static int segvmm_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *,
    size_t);
static int segvmm_getprot(struct seg *, caddr_t, size_t, uint_t *);
static u_offset_t segvmm_getoffset(struct seg *, caddr_t);
static int segvmm_gettype(struct seg *, caddr_t);
static int segvmm_getvp(struct seg *, caddr_t, struct vnode **);
static int segvmm_advise(struct seg *, caddr_t, size_t, uint_t);
static void segvmm_dump(struct seg *);
static int segvmm_pagelock(struct seg *, caddr_t, size_t, struct page ***,
    enum lock_type, enum seg_rw);
static int segvmm_setpagesize(struct seg *, caddr_t, size_t, uint_t);
static int segvmm_getmemid(struct seg *, caddr_t, memid_t *);
static int segvmm_capable(struct seg *, segcapability_t);
80 
/*
 * Segment operations vector for segvmm.  It is installed on new segments by
 * segvmm_create() and carried over to duplicates by segvmm_dup().  The
 * kluster, swapout and getpolicy operations are left unset (NULL).
 */
static struct seg_ops segvmm_ops = {
	.dup		= segvmm_dup,
	.unmap		= segvmm_unmap,
	.free		= segvmm_free,
	.fault		= segvmm_fault,
	.faulta		= segvmm_faulta,
	.setprot	= segvmm_setprot,
	.checkprot	= segvmm_checkprot,
	.kluster	= NULL,
	.swapout	= NULL,
	.sync		= segvmm_sync,
	.incore		= segvmm_incore,
	.lockop		= segvmm_lockop,
	.getprot	= segvmm_getprot,
	.getoffset	= segvmm_getoffset,
	.gettype	= segvmm_gettype,
	.getvp		= segvmm_getvp,
	.advise		= segvmm_advise,
	.dump		= segvmm_dump,
	.pagelock	= segvmm_pagelock,
	.setpagesize	= segvmm_setpagesize,
	.getmemid	= segvmm_getmemid,
	.getpolicy	= NULL,
	.capable	= segvmm_capable,
	.inherit	= seg_inherit_notsup
};
107 
108 /*
109  * Unload a region from the HAT for A/D tracking.
110  */
111 static void
112 segvmm_invalidate(void *arg, uintptr_t gpa, size_t sz)
113 {
114 	struct seg *seg = arg;
115 	segvmm_data_t *svmd = seg->s_data;
116 
117 	/*
118 	 * Invalidations are only necessary (and configured) for vmspace
119 	 * mappings.  Direct vm_object mappings are not involved.
120 	 */
121 	ASSERT3P(svmd->svmd_vmo, ==, NULL);
122 
123 	/*
124 	 * The region being invalidated may overlap with all, some, or none of
125 	 * this segment.  We are only concerned about that overlap.
126 	 */
127 	const uintptr_t start = MAX(gpa, svmd->svmd_off);
128 	const uintptr_t end = MIN(gpa + sz, svmd->svmd_off + seg->s_size);
129 	if (start >= end) {
130 		return;
131 	}
132 	ASSERT(start >= svmd->svmd_off && end <= svmd->svmd_off + seg->s_size);
133 	ASSERT(start >= gpa && end <= gpa + sz);
134 	const caddr_t unload_va = seg->s_base + (start - svmd->svmd_off);
135 	const size_t unload_sz = (end - start);
136 	ASSERT3U(unload_sz, <=, seg->s_size);
137 
138 	hat_unload(seg->s_as->a_hat, unload_va, unload_sz, HAT_UNLOAD);
139 }
140 
141 /*
142  * Create a VMM-memory-backed segment.
143  */
144 int
145 segvmm_create(struct seg **segpp, void *argsp)
146 {
147 	struct seg *seg = *segpp;
148 	segvmm_crargs_t *cra = argsp;
149 	segvmm_data_t *data;
150 
151 	VERIFY((cra->vmo == NULL && cra->vmc != NULL) ||
152 	    (cra->vmo != NULL && cra->vmc == NULL));
153 	VERIFY(cra->prot & PROT_USER);
154 	VERIFY0(cra->offset & PAGEOFFSET);
155 
156 	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
157 	rw_init(&data->svmd_lock, NULL, RW_DEFAULT, NULL);
158 	data->svmd_off = cra->offset;
159 	data->svmd_prot = cra->prot & ~PROT_USER;
160 
161 	seg->s_ops = &segvmm_ops;
162 	seg->s_data = data;
163 
164 	if (cra->vmo != NULL) {
165 		data->svmd_vmo = cra->vmo;
166 		/* Grab a hold on the VM object for the lifetime of segment */
167 		vm_object_reference(data->svmd_vmo);
168 	} else {
169 		int err;
170 
171 		data->svmd_vmc = cra->vmc;
172 		err = vmc_set_inval_cb(data->svmd_vmc, segvmm_invalidate, seg);
173 		if (err != 0) {
174 			seg->s_ops = NULL;
175 			seg->s_data = NULL;
176 			kmem_free(data, sizeof (*data));
177 			return (err);
178 		}
179 	}
180 	return (0);
181 }
182 
183 static int
184 segvmm_dup(struct seg *seg, struct seg *newseg)
185 {
186 	segvmm_data_t *svmd = seg->s_data;
187 	segvmm_data_t *newsvmd;
188 
189 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
190 
191 	newsvmd = kmem_zalloc(sizeof (segvmm_data_t), KM_SLEEP);
192 	rw_init(&newsvmd->svmd_lock, NULL, RW_DEFAULT, NULL);
193 	newsvmd->svmd_off = svmd->svmd_off;
194 	newsvmd->svmd_prot = svmd->svmd_prot;
195 
196 	newseg->s_ops = seg->s_ops;
197 	newseg->s_data = newsvmd;
198 
199 	if (svmd->svmd_vmo != NULL) {
200 		/* Grab another hold for the duplicate segment */
201 		vm_object_reference(svmd->svmd_vmo);
202 		newsvmd->svmd_vmo = svmd->svmd_vmo;
203 	} else {
204 		int err;
205 
206 		newsvmd->svmd_vmc = vmc_clone(svmd->svmd_vmc);
207 		/*
208 		 * The cloned client does not inherit the invalidation
209 		 * configuration, so attempt to set it here for the new segment.
210 		 */
211 		err = vmc_set_inval_cb(newsvmd->svmd_vmc, segvmm_invalidate,
212 		    newseg);
213 		if (err != 0) {
214 			newseg->s_ops = NULL;
215 			newseg->s_data = NULL;
216 			kmem_free(newsvmd, sizeof (*newsvmd));
217 			return (err);
218 		}
219 	}
220 
221 	return (0);
222 }
223 
224 static int
225 segvmm_unmap(struct seg *seg, caddr_t addr, size_t len)
226 {
227 	segvmm_data_t *svmd = seg->s_data;
228 
229 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
230 
231 	/* Only allow unmap of entire segment */
232 	if (addr != seg->s_base || len != seg->s_size) {
233 		return (EINVAL);
234 	}
235 	if (svmd->svmd_softlockcnt != 0) {
236 		return (EAGAIN);
237 	}
238 
239 	/* Unconditionally unload the entire segment range.  */
240 	hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
241 
242 	seg_free(seg);
243 	return (0);
244 }
245 
246 static void
247 segvmm_free(struct seg *seg)
248 {
249 	segvmm_data_t *svmd = seg->s_data;
250 
251 	ASSERT(svmd != NULL);
252 
253 	if (svmd->svmd_vmo != NULL) {
254 		/* Release the VM object hold this segment possessed */
255 		vm_object_release(svmd->svmd_vmo);
256 		svmd->svmd_vmo = NULL;
257 	} else {
258 		vmc_destroy(svmd->svmd_vmc);
259 		svmd->svmd_vmc = NULL;
260 	}
261 	rw_destroy(&svmd->svmd_lock);
262 	VERIFY(svmd->svmd_softlockcnt == 0);
263 	kmem_free(svmd, sizeof (*svmd));
264 	seg->s_data = NULL;
265 }
266 
267 static int
268 segvmm_fault_obj(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
269 {
270 	segvmm_data_t *svmd = seg->s_data;
271 	const uintptr_t end = va + len;
272 	const int prot = svmd->svmd_prot;
273 	const int uprot = prot | PROT_USER;
274 	vm_object_t *vmo = svmd->svmd_vmo;
275 
276 	ASSERT(vmo != NULL);
277 
278 	va &= PAGEMASK;
279 	uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off;
280 	do {
281 		pfn_t pfn;
282 
283 		pfn = vm_object_pfn(vmo, off);
284 		if (pfn == PFN_INVALID) {
285 			return (FC_NOMAP);
286 		}
287 
288 		/* Ignore any large-page possibilities for now */
289 		hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD);
290 		va += PAGESIZE;
291 		off += PAGESIZE;
292 	} while (va < end);
293 
294 	return (0);
295 }
296 
297 static int
298 segvmm_fault_space(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
299 {
300 	segvmm_data_t *svmd = seg->s_data;
301 	const uintptr_t end = va + len;
302 	const int prot = svmd->svmd_prot;
303 	const int uprot = prot | PROT_USER;
304 	vm_client_t *vmc = svmd->svmd_vmc;
305 
306 	ASSERT(vmc != NULL);
307 
308 	va &= PAGEMASK;
309 	uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off;
310 
311 	do {
312 		vm_page_t *vmp;
313 		pfn_t pfn;
314 
315 		vmp = vmc_hold(vmc, off, prot);
316 		if (vmp == NULL) {
317 			return (FC_NOMAP);
318 		}
319 
320 		pfn = vmp_get_pfn(vmp);
321 		ASSERT3U(pfn, !=, PFN_INVALID);
322 
323 		/* Ignore any large-page possibilities for now */
324 		hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD);
325 
326 		if (vmp_release(vmp)) {
327 			/*
328 			 * Region was unmapped from vmspace while we were
329 			 * loading it into this AS.  Communicate it as if it
330 			 * were a fault.
331 			 */
332 			hat_unload(hat, (caddr_t)va, PAGESIZE, HAT_UNLOAD);
333 			return (FC_NOMAP);
334 		}
335 
336 		va += PAGESIZE;
337 		off += PAGESIZE;
338 	} while (va < end);
339 
340 	return (0);
341 }
342 
343 /* ARGSUSED */
344 static faultcode_t
345 segvmm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
346     enum fault_type type, enum seg_rw rw)
347 {
348 	segvmm_data_t *svmd = seg->s_data;
349 	int err = 0;
350 
351 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
352 
353 	if (type == F_PROT) {
354 		/*
355 		 * Since protection on the segment is fixed, there is nothing
356 		 * to do but report an error for protection faults.
357 		 */
358 		return (FC_PROT);
359 	} else if (type == F_SOFTUNLOCK) {
360 		size_t plen = btop(len);
361 
362 		rw_enter(&svmd->svmd_lock, RW_WRITER);
363 		VERIFY(svmd->svmd_softlockcnt >= plen);
364 		svmd->svmd_softlockcnt -= plen;
365 		rw_exit(&svmd->svmd_lock);
366 		return (0);
367 	}
368 
369 	VERIFY(type == F_INVAL || type == F_SOFTLOCK);
370 	rw_enter(&svmd->svmd_lock, RW_WRITER);
371 
372 	if (svmd->svmd_vmo != NULL) {
373 		err = segvmm_fault_obj(hat, seg, (uintptr_t)addr, len);
374 	} else {
375 		err = segvmm_fault_space(hat, seg, (uintptr_t)addr, len);
376 	}
377 	if (type == F_SOFTLOCK && err == 0) {
378 		size_t nval = svmd->svmd_softlockcnt + btop(len);
379 
380 		if (svmd->svmd_softlockcnt >= nval) {
381 			rw_exit(&svmd->svmd_lock);
382 			return (FC_MAKE_ERR(EOVERFLOW));
383 		}
384 		svmd->svmd_softlockcnt = nval;
385 	}
386 
387 	rw_exit(&svmd->svmd_lock);
388 	return (err);
389 }
390 
391 /* ARGSUSED */
392 static faultcode_t
393 segvmm_faulta(struct seg *seg, caddr_t addr)
394 {
395 	/* Do nothing since asynch pagefault should not load translation. */
396 	return (0);
397 }
398 
399 /* ARGSUSED */
400 static int
401 segvmm_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
402 {
403 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
404 
405 	/* The seg_vmm driver does not yet allow protection to be changed. */
406 	return (EACCES);
407 }
408 
409 /* ARGSUSED */
410 static int
411 segvmm_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
412 {
413 	segvmm_data_t *svmd = seg->s_data;
414 	int error = 0;
415 
416 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
417 
418 	rw_enter(&svmd->svmd_lock, RW_READER);
419 	if ((svmd->svmd_prot & prot) != prot) {
420 		error = EACCES;
421 	}
422 	rw_exit(&svmd->svmd_lock);
423 	return (error);
424 }
425 
426 /* ARGSUSED */
427 static int
428 segvmm_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
429 {
430 	/* Always succeed since there are no backing store to sync */
431 	return (0);
432 }
433 
434 /* ARGSUSED */
435 static size_t
436 segvmm_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
437 {
438 	size_t sz = 0;
439 
440 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
441 
442 	len = (len + PAGEOFFSET) & PAGEMASK;
443 	while (len > 0) {
444 		*vec = 1;
445 		sz += PAGESIZE;
446 		vec++;
447 		len -= PAGESIZE;
448 	}
449 	return (sz);
450 }
451 
452 /* ARGSUSED */
453 static int
454 segvmm_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op,
455     ulong_t *lockmap, size_t pos)
456 {
457 	/* Report success since kernel pages are always in memory. */
458 	return (0);
459 }
460 
461 static int
462 segvmm_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
463 {
464 	segvmm_data_t *svmd = seg->s_data;
465 	size_t pgno;
466 	uint_t prot;
467 
468 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
469 
470 	rw_enter(&svmd->svmd_lock, RW_READER);
471 	prot = svmd->svmd_prot;
472 	rw_exit(&svmd->svmd_lock);
473 
474 	/*
475 	 * Reporting protection is simple since it is not tracked per-page.
476 	 */
477 	pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
478 	while (pgno > 0) {
479 		protv[--pgno] = prot;
480 	}
481 	return (0);
482 }
483 
484 /* ARGSUSED */
485 static u_offset_t
486 segvmm_getoffset(struct seg *seg, caddr_t addr)
487 {
488 	/*
489 	 * To avoid leaking information about the layout of the kernel address
490 	 * space, always report '0' as the offset.
491 	 */
492 	return (0);
493 }
494 
495 /* ARGSUSED */
496 static int
497 segvmm_gettype(struct seg *seg, caddr_t addr)
498 {
499 	/*
500 	 * Since already-existing vmm reservoir pages are being mapped into
501 	 * userspace, always report the segment type as shared.
502 	 */
503 	return (MAP_SHARED);
504 }
505 
506 /* ARGSUSED */
507 static int
508 segvmm_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
509 {
510 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
511 
512 	*vpp = NULL;
513 	return (0);
514 }
515 
516 /* ARGSUSED */
517 static int
518 segvmm_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
519 {
520 	if (behav == MADV_PURGE) {
521 		/* Purge does not make sense for this mapping */
522 		return (EINVAL);
523 	}
524 	/* Indicate success for everything else. */
525 	return (0);
526 }
527 
/*
 * Dump entry point for the segment; intentionally empty.
 */
/* ARGSUSED */
static void
segvmm_dump(struct seg *seg)
{
	/*
	 * This mapping exists to share kernel data with userspace, so there
	 * is nothing additional which should be dumped.
	 */
}
537 
538 /* ARGSUSED */
539 static int
540 segvmm_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
541     enum lock_type type, enum seg_rw rw)
542 {
543 	return (ENOTSUP);
544 }
545 
546 /* ARGSUSED */
547 static int
548 segvmm_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
549 {
550 	return (ENOTSUP);
551 }
552 
553 static int
554 segvmm_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
555 {
556 	segvmm_data_t *svmd = seg->s_data;
557 
558 	memidp->val[0] = (uintptr_t)svmd->svmd_vmo;
559 	memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_off;
560 	return (0);
561 }
562 
563 /* ARGSUSED */
564 static int
565 segvmm_capable(struct seg *seg, segcapability_t capability)
566 {
567 	/* no special capablities */
568 	return (0);
569 }
570