/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * PMEM - Direct mapping physical memory pages to userland process
 *
 * Provide functions used for directly (w/o occupying kernel virtual address
 * space) allocating and exporting physical memory pages to userland.
 */

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/sunddi.h>
#include <sys/ddidevmap.h>
#include <sys/vnode.h>
#include <sys/sysmacros.h>
#include <vm/seg_dev.h>
#include <sys/pmem.h>
#include <vm/hat_i86.h>
#include <sys/task.h>
#include <sys/sdt.h>

/*
 * The routines in this file allocate memory which will be accessed through
 * the AGP GART hardware.  The GART is programmed with the PFNs for this
 * memory, and the only mechanism for removing these entries is by an
 * explicit process operation (ioctl/close of the driver, or process exit).
 * As such, the pages need to remain locked to ensure that they won't be
 * relocated or paged out.
 *
 * To prevent these locked pages from getting in the way of page
 * coalescing, we try to allocate large pages from the system, and carve
 * them up to satisfy pmem allocation requests.  This will keep the locked
 * pages within a constrained area of physical memory, limiting the number
 * of large pages that would be pinned by our locked pages.  This is, of
 * course, another take on the infamous kernel cage, and it has many of the
 * downsides of the original cage.  It also interferes with system-wide
 * resource management decisions, as it maintains its own pool of unused
 * pages which can't be easily reclaimed and used during low-memory
 * situations.
 *
 * The right solution is for pmem to register a callback that the VM system
 * could call, which would temporarily remove any GART entries for pages
 * that were being relocated.  This would let us leave the pages unlocked,
 * which would remove the need for using large pages, which would simplify
 * this code a great deal.  Unfortunately, the support for these callbacks
 * only exists on some SPARC platforms right now.
 *
 * Note that this is the *only* reason that large pages are used here.  The
 * GART can't perform large-page translations, and the code appropriately
 * falls back to using small pages if page_create_va_large() fails.
 */
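
/*
 * Illustrative sketch of the intended call sequence from a driver's point
 * of view (the xx_ prefixed helpers are hypothetical; only the
 * devmap_pmem_*() calls below are part of this interface):
 *
 *	devmap_pmem_cookie_t cookie;
 *	pfn_t pfns[16];
 *
 *	(1) Allocate 16 locked pages for exporting to userland:
 *		if (devmap_pmem_alloc(ptob(16), PMEM_SLEEP, &cookie) !=
 *		    DDI_SUCCESS)
 *			return (ENOMEM);
 *	(2) Fetch their PFNs and program them into the GART:
 *		(void) devmap_pmem_getpfns(cookie, 0, 16, pfns);
 *		xx_gart_program(pfns, 16);
 *	(3) Export the pages from the driver's devmap(9E) entry point with
 *	    devmap_pmem_setup(), so segdev faults map them into the process.
 *	(4) On the final ioctl/close (or process exit), undo the GART
 *	    entries and release the pages:
 *		xx_gart_unprogram(16);
 *		devmap_pmem_free(cookie);
 */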

#define	HOLD_DHP_LOCK(dhp)	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
			{ mutex_enter(&dhp->dh_lock); }

#define	RELE_DHP_LOCK(dhp)	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
			{ mutex_exit(&dhp->dh_lock); }

#define	FROM_LPG(pp) (pp->p_szc != 0)
#define	PFIND(pp) (page_pptonum(pp) & (pmem_pgcnt - 1))
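
/*
 * Worked example of the two macros above (assuming 4K base pages and 2M
 * large pages, so pmem_pgcnt == 512): a small page carved out of a large
 * page keeps a non-zero p_szc, so FROM_LPG() distinguishes it from pages
 * allocated standalone by page_create_va().  PFIND() masks the page's pfn
 * with (pmem_pgcnt - 1); e.g. pfn 0x12345 & 0x1ff == 0x145 is the page's
 * bit index within its large page's allocation bitmap (pl_bitmap below).
 */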

/*
 * Structs and static variables used for pmem only.
 */
typedef struct pmem_lpg {
	page_t *pl_pp;		/* start pp */
	ulong_t *pl_bitmap;	/* allocation status for each page */
	ushort_t pl_pfree;	/* this large page might be fully freed */
	struct pmem_lpg *pl_next;
	struct pmem_lpg *pl_prev;
} pmem_lpg_t;

static size_t pmem_lpgsize;	/* the size of one large page */
static pgcnt_t pmem_pgcnt;	/* the number of small pages in a large page */
static uint_t pmem_lszc;	/* page size code of the large page */
/* The segment to be associated with all the allocated pages. */
static struct seg pmem_seg;
/* Fully occupied large pages allocated for pmem. */
static pmem_lpg_t *pmem_occ_lpgs;
/* Memory pool to store residual small pages from large pages. */
static page_t *pmem_mpool = NULL;
/* Number of small pages currently residing in pmem_mpool. */
static pgcnt_t pmem_nmpages = 0;
/* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
kmutex_t pmem_mutex;

static int lpg_isfree(pmem_lpg_t *);
static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_alloc(uint_t);
static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
static void lpg_free(page_t *spp);
static pgcnt_t mpool_break(page_t **, pgcnt_t);
static void mpool_append(page_t **, pgcnt_t);
static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
    vnode_t *, u_offset_t *, uint_t);
static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
static void tlist_out(page_t *, pgcnt_t);
static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
static int pmem_lock(pgcnt_t, proc_t *p);

/*
 * Called by a driver devmap routine to pass physical memory mapping info to
 * the seg_dev framework; used only for physical memory allocated from
 * devmap_pmem_alloc().
 */
/* ARGSUSED */
int
devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
    struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
    offset_t off, size_t len, uint_t maxprot, uint_t flags,
    ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);

	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
		return (DDI_FAILURE);

	/*
	 * First, check whether this function has already been called for
	 * this dhp.
	 */
	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
		return (DDI_FAILURE);

	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	/*
	 * Check if the cache attributes are supported.  Note that only
	 * uncacheable or write-combining attributes are permitted for pmem.
	 */
	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
		return (DDI_FAILURE);

	if (flags & DEVMAP_MAPPING_INVALID) {
		/*
		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
		 * remap permission.
		 */
		if (!(flags & DEVMAP_ALLOW_REMAP))
			return (DDI_FAILURE);
	} else {
		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
		/* dh_roff is the offset inside the dh_pcookie. */
		dhp->dh_roff = ptob(btop(off));
		/* Set the cache attributes correctly */
		i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
	}

	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
	dhp->dh_len = ptob(btopr(len));

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);

	if (callbackops != NULL) {
		bcopy(callbackops, &dhp->dh_callbackops,
		    sizeof (struct devmap_callback_ctl));
	}

	/*
	 * Initialize dh_lock if we want to do remap.
	 */
	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
		dhp->dh_flags |= DEVMAP_LOCK_INITED;
	}

	dhp->dh_flags |= DEVMAP_SETUP_DONE;

	return (DDI_SUCCESS);
}
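
/*
 * Illustrative sketch of calling the above from a devmap(9E) entry point;
 * xx_devmap(), xx_get_softstate() and the softstate fields are hypothetical,
 * only the devmap_pmem_setup() call and its arguments are real:
 *
 *	static int
 *	xx_devmap(dev_t dev, devmap_cookie_t dhc, offset_t off, size_t len,
 *	    size_t *maplen, uint_t model)
 *	{
 *		xx_state_t *sp = xx_get_softstate(dev);
 *		int error;
 *
 *		error = devmap_pmem_setup(dhc, sp->xx_dip, NULL,
 *		    sp->xx_pmem_cookie, off, len, PROT_ALL,
 *		    DEVMAP_DEFAULTS | IOMEM_DATA_UC_WR_COMBINE, NULL);
 *		if (error == DDI_SUCCESS)
 *			*maplen = len;
 *		return (error);
 *	}
 */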

/*
 * Replace the existing mapping using a new cookie; this is mainly called
 * when doing fork().  Should be called from the associated devmap_dup(9E).
 */
/* ARGSUSED */
int
devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
    devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
    uint_t flags, ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);

	/*
	 * Return failure if setup has not been done or no remap permission
	 * was granted during the setup.
	 */
	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
		return (DDI_FAILURE);

	/* No flags supported for remap yet. */
	if (flags != 0)
		return (DDI_FAILURE);

	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
		return (DDI_FAILURE);

	/*
	 * Check if the cache attributes are supported.  Note that only
	 * uncacheable or write-combining attributes are permitted for pmem.
	 */
	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
		return (DDI_FAILURE);

	HOLD_DHP_LOCK(dhp);
	/*
	 * Unload the old mapping of pages associated with this dhp, so the
	 * next fault will set up the new mappings.  It is segdev_faultpage()
	 * that calls hat_devload() to establish the mapping.  Do this while
	 * holding the dhp lock so other faults don't reestablish the
	 * mappings.
	 */
	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);

	/* Set the cache attributes correctly */
	i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);

	dhp->dh_pcookie = cookie;
	dhp->dh_roff = ptob(btop(off));
	dhp->dh_len = ptob(btopr(len));

	/* Clear the large page size flag. */
	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
	RELE_DHP_LOCK(dhp);
	return (DDI_SUCCESS);
}

/*
 * Directly (i.e., without occupying kernel virtual address space) allocate
 * 'npages' physical memory pages for exporting to user land.  The allocated
 * page_t pointers will be recorded in the cookie.
 */
int
devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
{
	u_offset_t pmem_off = 0;
	page_t *pp = NULL;
	page_t *lpp = NULL;
	page_t *tlist = NULL;
	pgcnt_t i = 0;
	pgcnt_t rpages = 0;
	pgcnt_t lpages = 0;
	pgcnt_t tpages = 0;
	pgcnt_t npages = btopr(size);
	pmem_lpg_t *plp = NULL;
	struct devmap_pmem_cookie *pcp;
	uint_t reserved = 0;
	uint_t locked = 0;
	uint_t pflags, kflags;

	*cookiep = NULL;

	/*
	 * A request larger than this would cause page_create_va() to loop
	 * infinitely.
	 */
	if (npages == 0 || npages >= total_pages / 2)
		return (DDI_FAILURE);
	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
		return (DDI_FAILURE);
	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;

	/* Allocate pmem cookie. */
	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
		return (DDI_FAILURE);
	pcp->dp_npages = npages;

	/*
	 * See if the requested memory can be locked.
	 */
	pcp->dp_proc = curproc;
	if (pmem_lock(npages, curproc) == DDI_FAILURE)
		goto alloc_fail;
	locked = 1;
	/*
	 * First, grab as many pages as possible from pmem_mpool.  If the
	 * pages in pmem_mpool are enough for this request, we are done.
	 */
	mutex_enter(&pmem_mutex);
	tpages = mpool_break(&tlist, npages);
	/* IOlock and hashin them into the new offset. */
	if (tpages)
		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
	mutex_exit(&pmem_mutex);

	if (tpages == npages)
		goto done;

	rpages = npages - tpages;
	/* Quit now if memory cannot be reserved. */
	if (!page_resv(rpages, kflags))
		goto alloc_fail;
	reserved = 1;

	/* If we have large pages */
	if (pmem_lpgsize > PAGESIZE) {
		/* Try to alloc large pages first to decrease fragmentation. */
		i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
		if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
		    kflags) == DDI_FAILURE)
			goto alloc_fail;
		ASSERT(lpages == 0 ? lpp == NULL : 1);
	}

	/*
	 * If the large pages hold more pages than the request, put the
	 * residual pages into pmem_mpool.
	 */
	if (lpages >= rpages) {
		lpp_break(&lpp, lpages, lpages - rpages, plp);
		goto done;
	}

	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
	i = rpages - lpages;
	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
	    pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
		goto alloc_fail;

done:
	page_list_concat(&tlist, &lpp);
	page_list_concat(&tlist, &pp);
	/* Set those small pages from large pages as allocated. */
	mutex_enter(&pmem_mutex);
	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
	mutex_exit(&pmem_mutex);

	/*
	 * Now tlist holds all the pages for this cookie.  Record these pages
	 * in the pmem cookie.
	 */
	for (pp = tlist, i = 0; i < npages; i++) {
		pcp->dp_pparray[i] = pp;
		page_io_unlock(pp);
		pp = pp->p_next;
		page_sub(&tlist, pp->p_prev);
	}
	ASSERT(tlist == NULL);
	*cookiep = (devmap_pmem_cookie_t)pcp;

	return (DDI_SUCCESS);

alloc_fail:
	DTRACE_PROBE(pmem__alloc__fail);
	/* Free large pages and the associated allocation records. */
	if (lpp)
		lpp_free(lpp, lpages / pmem_pgcnt, &plp);
	if (reserved == 1)
		page_unresv(rpages);
	/* Put those pages in tlist back into pmem_mpool. */
	if (tpages != 0) {
		mutex_enter(&pmem_mutex);
		/* IOunlock, hashout and update the allocation records. */
		tlist_out(tlist, tpages);
		mpool_append(&tlist, tpages);
		mutex_exit(&pmem_mutex);
	}
	if (locked == 1)
		i_ddi_decr_locked_memory(pcp->dp_proc, ptob(pcp->dp_npages));
	/* Free the pmem cookie. */
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
	kmem_free(pcp->dp_pparray, npages * sizeof (page_t *));
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
	return (DDI_FAILURE);
}
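
/*
 * Worked example of the composition above (assuming 2M large pages, so
 * pmem_pgcnt == 512): for a request of npages == 3000 with 1000 pages
 * sitting in pmem_mpool, mpool_break() supplies tpages == 1000, leaving
 * rpages == 2000.  lpp_create() is asked for (2000 + 511) / 512 == 4 large
 * pages; if all four succeed, lpages == 2048 >= rpages, so lpp_break()
 * returns the 48 surplus small pages to pmem_mpool.  If only, say, two
 * large pages could be created (lpages == 1024), page_create_va() makes up
 * the remaining 976 small pages.
 */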

/*
 * Free all small pages inside the cookie, and return pages from large pages
 * into mpool; if all the pages from one large page are in mpool, free it as
 * a whole.
 */
void
devmap_pmem_free(devmap_pmem_cookie_t cookie)
{
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	pgcnt_t i;
	pgcnt_t tpages = 0;
	page_t *pp;
	pmem_lpg_t *pl1, *plp;
	pmem_lpg_t *pf_lpgs = NULL;
	uint_t npls = 0;
	pmem_lpg_t *last_pl = NULL;
	pmem_lpg_t *plast_pl = NULL;

	ASSERT(pcp);
	mutex_enter(&pmem_mutex);
	/* Free small pages and return them to memory pool. */
	for (i = pcp->dp_npages; i > 0; i--) {
		pp = pcp->dp_pparray[i - 1];
		page_hashout(pp, NULL);
		/*
		 * Remove the mapping of this single page, this mapping is
		 * created using hat_devload() in segdev_faultpage().
		 */
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
		if (!FROM_LPG(pp)) {
			/* Normal small page. */
			page_free(pp, 1);
			page_unresv(1);
		} else {
			/* Small page from large pages. */
			plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
			if (plp && !(plp->pl_pfree)) {
				/*
				 * Move this record to the pf_lpgs list; this
				 * large page may be able to be freed as a
				 * whole.
				 */
				pmem_lpg_sub(&pmem_occ_lpgs, plp);
				pmem_lpg_concat(&pf_lpgs, &plp);
				plp->pl_pfree = 1;
				npls++;
				last_pl = NULL;
			} else {
				/* Search in pf_lpgs list. */
				plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl);
			}
			ASSERT(plp);
			/* Mark this page as free. */
			BT_SET(plp->pl_bitmap, PFIND(pp));
			/* Record this page in pmem_mpool. */
			mpool_append(&pp, 1);
		}
	}

	/*
	 * Find the large pages whose pages have all been freed, remove them
	 * from the plp list, and free them and the associated pmem_lpg
	 * structs.
	 */
	for (plp = pf_lpgs; npls != 0; npls--) {
		pl1 = plp;
		plp = plp->pl_next;
		if (lpg_isfree(pl1)) {
			/*
			 * Get one free large page.  Find all pages in this
			 * large page and remove them from pmem_mpool.
			 */
			lpg_free(pl1->pl_pp);
			/* Remove associated allocation records. */
			pmem_lpg_sub(&pf_lpgs, pl1);
			pmem_lpg_free(&pf_lpgs, pl1);
			tpages -= pmem_pgcnt;
		} else
			pl1->pl_pfree = 0;
	}
	/* Update allocation records accordingly. */
	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
	mutex_exit(&pmem_mutex);

	if (curproc == pcp->dp_proc)
		i_ddi_decr_locked_memory(curproc, ptob(pcp->dp_npages));
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
}
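
/*
 * Worked example of the free path above (again assuming 512 small pages
 * per large page): when a cookie's pages all came from one large page,
 * each loop iteration sets one more bit in that page's pl_bitmap and
 * appends the small page to pmem_mpool; after the loop, lpg_isfree() sees
 * all 512 bits set, so lpg_free() pulls those 512 pages back off
 * pmem_mpool and frees the original large page whole.  If some of its
 * pages are still allocated to another cookie, the record simply moves
 * back to pmem_occ_lpgs with pl_pfree cleared.
 */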

/*
 * Extract the page frame numbers from the specified range in a cookie.
 */
int
devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
    pfn_t *pfnarray)
{
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	pgcnt_t i;

	if (pcp == NULL || start + npages > pcp->dp_npages)
		return (DDI_FAILURE);

	for (i = start; i < start + npages; i++)
		pfnarray[i - start] = pfn_to_mfn(pcp->dp_pparray[i]->p_pagenum);

	return (DDI_SUCCESS);
}
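
/*
 * Illustrative sketch of consuming the PFN array above; the GART table
 * layout (xx_gart[], XX_GART_VALID, slot) is hypothetical, only the
 * devmap_pmem_getpfns() call is real:
 *
 *	pfn_t *pfns = kmem_alloc(npages * sizeof (pfn_t), KM_SLEEP);
 *
 *	if (devmap_pmem_getpfns(cookie, 0, npages, pfns) == DDI_SUCCESS) {
 *		for (i = 0; i < npages; i++)
 *			xx_gart[slot + i] = XX_GART_VALID | pfns[i];
 *	}
 *	kmem_free(pfns, npages * sizeof (pfn_t));
 */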

void
pmem_init()
{
	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
	pmem_lpgsize = page_get_pagesize(pmem_lszc);
	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
	bzero(&pmem_seg, sizeof (struct seg));
	pmem_seg.s_as = &kas;
}
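
/*
 * Worked example of the values pmem_init() computes (assuming 4K base pages
 * and a 2M page size at page size code 1, as on typical x86 systems):
 * pmem_lszc = MIN(1, page_num_pagesizes() - 1) = 1, pmem_lpgsize = 2M, and
 * pmem_pgcnt = 2M >> 12 = 512 small pages per large page.  On a
 * hypothetical system with only one page size, pmem_lszc would be 0,
 * pmem_lpgsize would equal PAGESIZE, and devmap_pmem_alloc() would skip the
 * large-page path entirely since pmem_lpgsize > PAGESIZE would be false.
 */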

/* Allocate kernel memory for one pmem cookie with n pages. */
static int
pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
{
	struct devmap_pmem_cookie *pcp;

	if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
	    kflags)) == NULL)
		return (DDI_FAILURE);
	pcp = *pcpp;
	if ((pcp->dp_vnp =
	    kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
		return (DDI_FAILURE);
	}
	if ((pcp->dp_pparray =
	    kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
		kmem_free(pcp->dp_vnp, sizeof (vnode_t));
		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

/* Try to lock down n pages of memory resource for the given process. */
static int
pmem_lock(pgcnt_t n, proc_t *p)
{
	if (i_ddi_incr_locked_memory(p, ptob(n)) != 0) {
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

/* Check whether all the pages in a large page have been freed. */
static int
lpg_isfree(pmem_lpg_t *plp)
{
	uint_t i;

	for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
		if (plp->pl_bitmap[i] != BT_ULMAXMASK)
			return (0);
	/* All 1 means all pages are freed. */
	return (1);
}
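
/*
 * For example, with pmem_pgcnt == 512 (assuming 2M large pages) and 64-bit
 * ulong_t words, pl_bitmap spans BT_BITOUL(512) == 8 words; lpg_isfree()
 * succeeds only when all 8 words equal BT_ULMAXMASK, i.e. all 512 bits are
 * set and every constituent small page has been returned to pmem_mpool.
 */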

/*
 * Use pp to find the associated large-page allocation record, searching the
 * splp linked list with *last as a heuristic pointer.  Return NULL if it is
 * not found.
 */
static pmem_lpg_t *
pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
{
	pmem_lpg_t *plp;
	pgcnt_t root_pfn;

	ASSERT(pp);
	if (splp == NULL)
		return (NULL);
	root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);

	/* Try last winner first. */
	if (*last && root_pfn == page_pptonum((*last)->pl_pp))
		goto pl_found;

	/* Else search the whole pmem_lpg list. */
	for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
		plp = plp->pl_next;
		if (plp == splp) {
			plp = NULL;
			break;
		}
		ASSERT(plp->pl_pp);
	}

	*last = plp;

pl_found:
	return (*last);
}
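
/*
 * Worked example of the root_pfn computation above (assuming 2M large
 * pages, pmem_pgcnt == 512): for a small page with pfn 0x12345,
 * 0x12345 & ~0x1ff == 0x12200, which is the pfn of the first page of the
 * containing large page and therefore matches page_pptonum(plp->pl_pp) of
 * exactly one allocation record.
 */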

/*
 * Remove one pmem_lpg plp from the oplpp list.
 */
static void
pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
{
	if (*oplpp == plp)
		*oplpp = plp->pl_next;		/* go to next pmem_lpg */

	if (*oplpp == plp)
		*oplpp = NULL;			/* pmem_lpg list is gone */
	else {
		plp->pl_prev->pl_next = plp->pl_next;
		plp->pl_next->pl_prev = plp->pl_prev;
	}
	plp->pl_prev = plp->pl_next = plp;	/* make plp a list of one */
}

/*
 * Concatenate page list nplpp onto the end of list plpp.
 */
static void
pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
{
	pmem_lpg_t *s1p, *s2p, *e1p, *e2p;

	if (*nplpp == NULL) {
		return;
	}
	if (*plpp == NULL) {
		*plpp = *nplpp;
		return;
	}
	s1p = *plpp;
	e1p = s1p->pl_prev;
	s2p = *nplpp;
	e2p = s2p->pl_prev;
	s1p->pl_prev = e2p;
	e2p->pl_next = s1p;
	e1p->pl_next = s2p;
	s2p->pl_prev = e1p;
}
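
/*
 * Both lists are circular and doubly linked, so the concatenation above is
 * O(1).  For example, joining A<->B (where A->pl_prev == B) with C<->D
 * (where C->pl_prev == D) relinks only the four end pointers to produce
 * A<->B<->C<->D, with D->pl_next == A and A->pl_prev == D.
 */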

/*
 * Allocate and initialize the allocation record of one large page; the
 * initial state of every constituent page is 'allocated'.
 */
static pmem_lpg_t *
pmem_lpg_alloc(uint_t kflags)
{
	pmem_lpg_t *plp;

	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
	plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
	if (plp == NULL)
		return (NULL);
	plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
	if (plp->pl_bitmap == NULL) {
		kmem_free(plp, sizeof (*plp));
		return (NULL);
	}
	plp->pl_next = plp->pl_prev = plp;
	return (plp);
}

/* Free one allocation record pointed to by plp. */
static void
pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
{
	if (*headp == plp)
		*headp = plp->pl_next;		/* go to next pmem_lpg_t */

	if (*headp == plp)
		*headp = NULL;			/* this list is gone */
	else {
		plp->pl_prev->pl_next = plp->pl_next;
		plp->pl_next->pl_prev = plp->pl_prev;
	}
	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
	kmem_free(plp, sizeof (*plp));
}

/* Free one large page headed by spp from pmem_mpool. */
static void
lpg_free(page_t *spp)
{
	page_t *pp1 = spp;
	uint_t i;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (i = 0; i < pmem_pgcnt; i++) {
		/* Break pp1 from pmem_mpool. */
		page_sub(&pmem_mpool, pp1);
		pp1++;
	}
	/* Free pages in this large page. */
	page_free_pages(spp);
	page_unresv(pmem_pgcnt);
	pmem_nmpages -= pmem_pgcnt;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/* Put n pages in *ppp list back into pmem_mpool. */
static void
mpool_append(page_t **ppp, pgcnt_t n)
{
	ASSERT(MUTEX_HELD(&pmem_mutex));
	/* Put back pages. */
	page_list_concat(&pmem_mpool, ppp);
	pmem_nmpages += n;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/*
 * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into the
 * *ppp list, and return the number of grabbed pages.
 */
static pgcnt_t
mpool_break(page_t **ppp, pgcnt_t n)
{
	pgcnt_t i;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	/* Grab the pages. */
	i = MIN(pmem_nmpages, n);
	*ppp = pmem_mpool;
	page_list_break(ppp, &pmem_mpool, i);
	pmem_nmpages -= i;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
	return (i);
}

/*
 * Create n large pages; on return, *lpages holds the number of small pages
 * created and *plpp holds the list of allocation records.
 */
static int
lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
    vnode_t *vnp, u_offset_t *offp, uint_t kflags)
{
	pgcnt_t i;
	pmem_lpg_t *plp;
	page_t *pp;

	for (i = 0, *lpages = 0; i < n; i++) {
		/* Allocate one large page at a time. */
		pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
		    PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
		if (pp == NULL)
			break;
		*offp += pmem_lpgsize;
		page_list_concat(lppp, &pp);
		*lpages += pmem_pgcnt;
		/* Add one allocation record for this large page. */
		if ((plp = pmem_lpg_alloc(kflags)) == NULL)
			return (DDI_FAILURE);
		plp->pl_pp = pp;
		pmem_lpg_concat(plpp, &plp);
	}
	return (DDI_SUCCESS);
}

/*
 * Break the last r small pages from the large page list *lppp (which holds
 * n small pages in total) and put them into pmem_mpool.
 */
static void
lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
{
	page_t *pp, *pp1;
	pgcnt_t i;
	pmem_lpg_t *plp;

	if (r == 0)
		return;
	ASSERT(*lppp != NULL && r < pmem_pgcnt);
	page_list_break(lppp, &pp, n - r);

	/* The residual should reside in the last large page. */
	plp = oplp->pl_prev;
	/* IOunlock and hashout the residual pages. */
	for (pp1 = pp, i = 0; i < r; i++) {
		page_io_unlock(pp1);
		page_hashout(pp1, NULL);
		/* Mark this page as free. */
		BT_SET(plp->pl_bitmap, PFIND(pp1));
		pp1 = pp1->p_next;
	}
	ASSERT(pp1 == pp);
	/* Put these residual pages into memory pool. */
	mutex_enter(&pmem_mutex);
	mpool_append(&pp, r);
	mutex_exit(&pmem_mutex);
}

/* Free the large pages in lpp and the associated allocation records in plpp. */
static void
lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
{
	pgcnt_t i, j;
	page_t *pp = lpp, *pp1;
	pmem_lpg_t *plp1, *plp2;

	for (i = 0; i < lpgs; i++) {
		for (j = 0; j < pmem_pgcnt; j++) {
			/* IO unlock and hashout this small page. */
			page_io_unlock(pp);
			page_hashout(pp, NULL);
			pp1 = pp->p_next;
			pp->p_prev = pp->p_next = pp;
			pp = pp1;
		}
		/* Free one large page at one time. */
		page_free_pages(lpp);
		lpp = pp;
	}
	/* Free the associated pmem large-page allocation records. */
	for (plp1 = *plpp; *plpp; plp1 = plp2) {
		plp2 = plp1->pl_next;
		pmem_lpg_free(plpp, plp1);
	}
}

/*
 * IOlock and hashin all pages in tlist, associating them with vnode *pvnp
 * and with offsets starting at *poffp.  Update the allocation records
 * accordingly at the same time.
 */
static void
tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
{
	page_t *pp;
	pgcnt_t i = 0;
	pmem_lpg_t *plp, *last_pl = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (pp = tlist; i < tpages; i++) {
		ASSERT(FROM_LPG(pp));
		page_io_lock(pp);
		(void) page_hashin(pp, pvnp, *poffp, NULL);
		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
		/* Mark this page as allocated. */
		BT_CLEAR(plp->pl_bitmap, PFIND(pp));
		*poffp += PAGESIZE;
		pp = pp->p_next;
	}
	ASSERT(pp == tlist);
}

/*
 * IOunlock and hashout all pages in tlist, updating the allocation records
 * accordingly at the same time.
 */
static void
tlist_out(page_t *tlist, pgcnt_t tpages)
{
	page_t *pp;
	pgcnt_t i = 0;
	pmem_lpg_t *plp, *last_pl = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (pp = tlist; i < tpages; i++) {
		ASSERT(FROM_LPG(pp));
		page_io_unlock(pp);
		page_hashout(pp, NULL);
		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
		/* Mark this page as free. */
		BT_SET(plp->pl_bitmap, PFIND(pp));
		pp = pp->p_next;
	}
	ASSERT(pp == tlist);
}