1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * PMEM - Direct mapping physical memory pages to userland process 30 * 31 * Provide functions used for directly (w/o occupying kernel virtual address 32 * space) allocating and exporting physical memory pages to userland. 33 */ 34 35 #include <sys/types.h> 36 #include <sys/mutex.h> 37 #include <sys/sunddi.h> 38 #include <sys/ddidevmap.h> 39 #include <sys/vnode.h> 40 #include <sys/sysmacros.h> 41 #include <sys/project.h> 42 #include <vm/seg_dev.h> 43 #include <sys/pmem.h> 44 #include <vm/hat_i86.h> 45 #include <sys/task.h> 46 #include <sys/sdt.h> 47 48 /* 49 * The routines in this file allocate memory which will be accessed through 50 * the AGP GART hardware. The GART is programmed with the PFNs for this 51 * memory, and the only mechanism for removing these entries is by an 52 * explicit process operation (ioctl/close of the driver, or process exit). 53 * As such, the pages need to remain locked to ensure that they won't be 54 * relocated or paged out. 55 * 56 * To prevent these locked pages from getting in the way of page 57 * coalescing, we try to allocate large pages from the system, and carve 58 * them up to satisfy pmem allocation requests. This will keep the locked 59 * pages within a constrained area of physical memory, limiting the number 60 * of large pages that would be pinned by our locked pages. This is, of 61 * course, another take on the infamous kernel cage, and it has many of the 62 * downsides of the original cage. It also interferes with system-wide 63 * resource management decisions, as it maintains its own pool of unused 64 * pages which can't be easily reclaimed and used during low-memory 65 * situations. 66 * 67 * The right solution is for pmem to register a callback that the VM system 68 * could call, which would temporarily remove any GART entries for pages 69 * that were being relocated. This would let us leave the pages unlocked, 70 * which would remove the need for using large pages, which would simplify 71 * this code a great deal. Unfortunately, the support for these callbacks 72 * only exists on some SPARC platforms right now. 73 * 74 * Note that this is the *only* reason that large pages are used here. The 75 * GART can't perform large-page translations, and the code appropriately 76 * falls back to using small pages if page_create_va_large() fails. 77 */ 78 79 #define HOLD_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \ 80 { mutex_enter(&dhp->dh_lock); } 81 82 #define RELE_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \ 83 { mutex_exit(&dhp->dh_lock); } 84 85 #define FROM_LPG(pp) (pp->p_szc != 0) 86 #define PFIND(pp) (page_pptonum(pp) & (pmem_pgcnt - 1)) 87 88 /* 89 * Structs and static variables used for pmem only. 90 */ 91 typedef struct pmem_lpg { 92 page_t *pl_pp; /* start pp */ 93 ulong_t *pl_bitmap; /* allocation status for each page */ 94 ushort_t pl_pfree; /* this large page might be fully freed */ 95 struct pmem_lpg *pl_next; 96 struct pmem_lpg *pl_prev; 97 } pmem_lpg_t; 98 99 static size_t pmem_lpgsize; /* the size of one large page */ 100 static pgcnt_t pmem_pgcnt; /* the number of small pages in a large page */ 101 static uint_t pmem_lszc; /* page size code of the large page */ 102 /* The segment to be associated with all the allocated pages. */ 103 static struct seg pmem_seg; 104 /* Fully occupied large pages allocated for pmem. */ 105 static pmem_lpg_t *pmem_occ_lpgs; 106 /* Memory pool to store residual small pages from large pages. */ 107 static page_t *pmem_mpool = NULL; 108 /* Number of small pages reside in pmem_mpool currently. */ 109 static pgcnt_t pmem_nmpages = 0; 110 /* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */ 111 kmutex_t pmem_mutex; 112 113 static int lpg_isfree(pmem_lpg_t *); 114 static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *); 115 static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **); 116 static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **); 117 static pmem_lpg_t *pmem_lpg_alloc(uint_t); 118 static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *); 119 static void lpg_free(page_t *spp); 120 static pgcnt_t mpool_break(page_t **, pgcnt_t); 121 static void mpool_append(page_t **, pgcnt_t); 122 static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *); 123 static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **); 124 static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **, 125 vnode_t *, u_offset_t *, uint_t); 126 static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *); 127 static void tlist_out(page_t *, pgcnt_t); 128 static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t); 129 static int pmem_lock(pgcnt_t, kproject_t **); 130 131 /* 132 * Called by driver devmap routine to pass physical memory mapping info to 133 * seg_dev framework, used only for physical memory allocated from 134 * devmap_pmem_alloc(). 135 */ 136 /* ARGSUSED */ 137 int 138 devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip, 139 struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie, 140 offset_t off, size_t len, uint_t maxprot, uint_t flags, 141 ddi_device_acc_attr_t *accattrp) 142 { 143 devmap_handle_t *dhp = (devmap_handle_t *)dhc; 144 struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie; 145 uint_t cache_attr = IOMEM_CACHE_ATTR(flags); 146 147 if (pcp == NULL || (off + len) > ptob(pcp->dp_npages)) 148 return (DDI_FAILURE); 149 150 /* 151 * First to check if this function has been called for this dhp. 152 */ 153 if (dhp->dh_flags & DEVMAP_SETUP_DONE) 154 return (DDI_FAILURE); 155 156 if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) 157 return (DDI_FAILURE); 158 159 /* 160 * Check if the cache attributes are supported. Need to pay 161 * attention that only uncachable or write-combining is 162 * permitted for pmem. 163 */ 164 if (i_ddi_check_cache_attr(flags) == B_FALSE || 165 (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0) 166 return (DDI_FAILURE); 167 168 if (flags & DEVMAP_MAPPING_INVALID) { 169 /* 170 * If DEVMAP_MAPPING_INVALID is specified, we have to grant 171 * remap permission. 172 */ 173 if (!(flags & DEVMAP_ALLOW_REMAP)) 174 return (DDI_FAILURE); 175 } else { 176 dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp; 177 /* dh_roff is the offset inside the dh_pcookie. */ 178 dhp->dh_roff = ptob(btop(off)); 179 /* Set the cache attributes correctly */ 180 i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr); 181 } 182 183 dhp->dh_cookie = DEVMAP_PMEM_COOKIE; 184 dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS); 185 dhp->dh_len = ptob(btopr(len)); 186 187 dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; 188 ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); 189 190 if (callbackops != NULL) { 191 bcopy(callbackops, &dhp->dh_callbackops, 192 sizeof (struct devmap_callback_ctl)); 193 } 194 195 /* 196 * Initialize dh_lock if we want to do remap. 197 */ 198 if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) { 199 mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL); 200 dhp->dh_flags |= DEVMAP_LOCK_INITED; 201 } 202 203 dhp->dh_flags |= DEVMAP_SETUP_DONE; 204 205 return (DDI_SUCCESS); 206 } 207 208 /* 209 * Replace existing mapping using a new cookie, mainly gets called when doing 210 * fork(). Should be called in associated devmap_dup(9E). 211 */ 212 /* ARGSUSED */ 213 int 214 devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip, 215 devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot, 216 uint_t flags, ddi_device_acc_attr_t *accattrp) 217 { 218 devmap_handle_t *dhp = (devmap_handle_t *)dhc; 219 struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie; 220 uint_t cache_attr = IOMEM_CACHE_ATTR(flags); 221 222 /* 223 * Reture failure if setup has not been done or no remap permission 224 * has been granted during the setup. 225 */ 226 if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 || 227 (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0) 228 return (DDI_FAILURE); 229 230 /* No flags supported for remap yet. */ 231 if (flags != 0) 232 return (DDI_FAILURE); 233 234 if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) 235 return (DDI_FAILURE); 236 237 if (pcp == NULL || (off + len) > ptob(pcp->dp_npages)) 238 return (DDI_FAILURE); 239 240 /* 241 * Check if the cache attributes are supported. Need to pay 242 * attention that only uncachable or write-combining is 243 * permitted for pmem. 244 */ 245 if (i_ddi_check_cache_attr(flags) == B_FALSE || 246 (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0) 247 return (DDI_FAILURE); 248 249 HOLD_DHP_LOCK(dhp); 250 /* 251 * Unload the old mapping of pages reloated with this dhp, so next 252 * fault will setup the new mappings. It is in segdev_faultpage that 253 * calls hat_devload to establish the mapping. Do this while holding 254 * the dhp lock so other faults dont reestablish the mappings. 255 */ 256 hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr, 257 dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER); 258 259 /* Set the cache attributes correctly */ 260 i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr); 261 262 dhp->dh_pcookie = cookie; 263 dhp->dh_roff = ptob(btop(off)); 264 dhp->dh_len = ptob(btopr(len)); 265 266 /* Clear the large page size flag. */ 267 dhp->dh_flags &= ~DEVMAP_FLAG_LARGE; 268 269 dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; 270 ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); 271 RELE_DHP_LOCK(dhp); 272 return (DDI_SUCCESS); 273 } 274 275 /* 276 * Directly (i.e., without occupying kernel virtual address space) allocate 277 * 'npages' physical memory pages for exporting to user land. The allocated 278 * page_t pointer will be recorded in cookie. 279 */ 280 int 281 devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep) 282 { 283 u_offset_t pmem_off = 0; 284 page_t *pp = NULL; 285 page_t *lpp = NULL; 286 page_t *tlist = NULL; 287 pgcnt_t i = 0; 288 pgcnt_t rpages = 0; 289 pgcnt_t lpages = 0; 290 pgcnt_t tpages = 0; 291 pgcnt_t npages = btopr(size); 292 pmem_lpg_t *plp = NULL; 293 struct devmap_pmem_cookie *pcp; 294 uint_t reserved = 0; 295 uint_t locked = 0; 296 uint_t pflags, kflags; 297 298 *cookiep = NULL; 299 300 /* 301 * Number larger than this will cause page_create_va() to loop 302 * infinitely. 303 */ 304 if (npages == 0 || npages >= total_pages / 2) 305 return (DDI_FAILURE); 306 if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0) 307 return (DDI_FAILURE); 308 pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT; 309 kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP; 310 311 /* Allocate pmem cookie. */ 312 if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE) 313 return (DDI_FAILURE); 314 pcp->dp_npages = npages; 315 316 /* 317 * See if the requested memory can be locked. Currently we do resource 318 * controls on the project levlel only. 319 */ 320 if (pmem_lock(npages, &(pcp->dp_projp)) == DDI_FAILURE) 321 goto alloc_fail; 322 locked = 1; 323 324 /* 325 * First, grab as many as possible from pmem_mpool. If pages in 326 * pmem_mpool are enough for this request, we are done. 327 */ 328 mutex_enter(&pmem_mutex); 329 tpages = mpool_break(&tlist, npages); 330 /* IOlock and hashin them into the new offset. */ 331 if (tpages) 332 tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off); 333 mutex_exit(&pmem_mutex); 334 335 if (tpages == npages) 336 goto done; 337 338 rpages = npages - tpages; 339 /* Quit now if memory cannot be reserved. */ 340 if (!page_resv(rpages, kflags)) 341 goto alloc_fail; 342 reserved = 1; 343 344 /* Try to allocate large pages first to decrease fragmentation. */ 345 i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt; 346 if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off, 347 kflags) == DDI_FAILURE) 348 goto alloc_fail; 349 ASSERT(lpages == 0 ? lpp == NULL : 1); 350 351 /* 352 * Pages in large pages is more than the request, put the residual 353 * pages into pmem_mpool. 354 */ 355 if (lpages >= rpages) { 356 lpp_break(&lpp, lpages, lpages - rpages, plp); 357 goto done; 358 } 359 360 /* Allocate small pages if lpp+tlist cannot satisfy the request. */ 361 i = rpages - lpages; 362 if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i), 363 pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL) 364 goto alloc_fail; 365 366 done: 367 page_list_concat(&tlist, &lpp); 368 page_list_concat(&tlist, &pp); 369 /* Set those small pages from large pages as allocated. */ 370 mutex_enter(&pmem_mutex); 371 pmem_lpg_concat(&pmem_occ_lpgs, &plp); 372 mutex_exit(&pmem_mutex); 373 374 /* 375 * Now tlist holds all the pages for this cookie. Record these pages in 376 * pmem cookie. 377 */ 378 for (pp = tlist, i = 0; i < npages; i++) { 379 pcp->dp_pparray[i] = pp; 380 page_io_unlock(pp); 381 pp = pp->p_next; 382 page_sub(&tlist, pp->p_prev); 383 } 384 ASSERT(tlist == NULL); 385 *cookiep = (devmap_pmem_cookie_t)pcp; 386 387 return (DDI_SUCCESS); 388 389 alloc_fail: 390 DTRACE_PROBE(pmem__alloc__fail); 391 /* Free large pages and the associated allocation records. */ 392 if (lpp) 393 lpp_free(lpp, lpages / pmem_pgcnt, &plp); 394 if (reserved == 1) 395 page_unresv(rpages); 396 /* Put those pages in tlist back into pmem_mpool. */ 397 if (tpages != 0) { 398 mutex_enter(&pmem_mutex); 399 /* IOunlock, hashout and update the allocation records. */ 400 tlist_out(tlist, tpages); 401 mpool_append(&tlist, tpages); 402 mutex_exit(&pmem_mutex); 403 } 404 if (locked == 1) 405 i_ddi_decr_locked_memory(NULL, NULL, pcp->dp_projp, NULL, 406 ptob(pcp->dp_npages)); 407 /* Freeing pmem_cookie. */ 408 kmem_free(pcp->dp_vnp, sizeof (vnode_t)); 409 kmem_free(pcp->dp_pparray, npages * sizeof (page_t *)); 410 kmem_free(pcp, sizeof (struct devmap_pmem_cookie)); 411 return (DDI_FAILURE); 412 } 413 414 /* 415 * Free all small pages inside cookie, and return pages from large pages into 416 * mpool, if all the pages from one large page is in mpool, free it as a whole. 417 */ 418 void 419 devmap_pmem_free(devmap_pmem_cookie_t cookie) 420 { 421 struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie; 422 pgcnt_t i; 423 pgcnt_t tpages = 0; 424 page_t *pp; 425 pmem_lpg_t *pl1, *plp; 426 pmem_lpg_t *pf_lpgs = NULL; 427 uint_t npls = 0; 428 pmem_lpg_t *last_pl = NULL; 429 pmem_lpg_t *plast_pl = NULL; 430 431 ASSERT(pcp); 432 mutex_enter(&pmem_mutex); 433 /* Free small pages and return them to memory pool. */ 434 for (i = pcp->dp_npages; i > 0; i--) { 435 pp = pcp->dp_pparray[i - 1]; 436 page_hashout(pp, NULL); 437 /* 438 * Remove the mapping of this single page, this mapping is 439 * created using hat_devload() in segdev_faultpage(). 440 */ 441 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 442 if (!FROM_LPG(pp)) { 443 /* Normal small page. */ 444 page_free(pp, 1); 445 page_unresv(1); 446 } else { 447 /* Small page from large pages. */ 448 plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl); 449 if (plp && !(plp->pl_pfree)) { 450 /* 451 * Move this record to pf_lpgs list, this large 452 * page may be able to be freed as a whole. 453 */ 454 pmem_lpg_sub(&pmem_occ_lpgs, plp); 455 pmem_lpg_concat(&pf_lpgs, &plp); 456 plp->pl_pfree = 1; 457 npls++; 458 last_pl = NULL; 459 } else { 460 /* Search in pf_lpgs list. */ 461 plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl); 462 } 463 ASSERT(plp); 464 /* Mark this page as free. */ 465 BT_SET(plp->pl_bitmap, PFIND(pp)); 466 /* Record this page in pmem_mpool. */ 467 mpool_append(&pp, 1); 468 } 469 } 470 471 /* 472 * Find out the large pages whose pages have been freed, remove them 473 * from plp list, free them and the associated pmem_lpg struct. 474 */ 475 for (plp = pf_lpgs; npls != 0; npls--) { 476 pl1 = plp; 477 plp = plp->pl_next; 478 if (lpg_isfree(pl1)) { 479 /* 480 * Get one free large page. Find all pages in this 481 * large page and remove them from pmem_mpool. 482 */ 483 lpg_free(pl1->pl_pp); 484 /* Remove associated allocation records. */ 485 pmem_lpg_sub(&pf_lpgs, pl1); 486 pmem_lpg_free(&pf_lpgs, pl1); 487 tpages -= pmem_pgcnt; 488 } else 489 pl1->pl_pfree = 0; 490 } 491 /* Update allocation records accordingly. */ 492 pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs); 493 mutex_exit(&pmem_mutex); 494 495 i_ddi_decr_locked_memory(NULL, NULL, (kproject_t *)pcp->dp_projp, NULL, 496 ptob(pcp->dp_npages)); 497 kmem_free(pcp->dp_vnp, sizeof (vnode_t)); 498 kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *)); 499 kmem_free(pcp, sizeof (struct devmap_pmem_cookie)); 500 } 501 502 /* 503 * To extract page frame number from specified range in a cookie. 504 */ 505 int 506 devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages, 507 pfn_t *pfnarray) 508 { 509 struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie; 510 pgcnt_t i; 511 512 if (pcp == NULL || start + npages > pcp->dp_npages) 513 return (DDI_FAILURE); 514 515 for (i = start; i < start + npages; i++) 516 pfnarray[i - start] = pcp->dp_pparray[i]->p_pagenum; 517 return (DDI_SUCCESS); 518 } 519 520 void 521 pmem_init() 522 { 523 mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL); 524 pmem_lszc = MIN(1, page_num_pagesizes() - 1); 525 pmem_lpgsize = page_get_pagesize(pmem_lszc); 526 pmem_pgcnt = pmem_lpgsize >> PAGESHIFT; 527 bzero(&pmem_seg, sizeof (struct seg)); 528 pmem_seg.s_as = &kas; 529 } 530 531 /* Allocate kernel memory for one pmem cookie with n pages. */ 532 static int 533 pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags) 534 { 535 struct devmap_pmem_cookie *pcp; 536 537 if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie), 538 kflags)) == NULL) 539 return (DDI_FAILURE); 540 pcp = *pcpp; 541 if ((pcp->dp_vnp = 542 kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) { 543 kmem_free(pcp, sizeof (struct devmap_pmem_cookie)); 544 return (DDI_FAILURE); 545 } 546 if ((pcp->dp_pparray = 547 kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) { 548 kmem_free(pcp->dp_vnp, sizeof (vnode_t)); 549 kmem_free(pcp, sizeof (struct devmap_pmem_cookie)); 550 return (DDI_FAILURE); 551 } 552 return (DDI_SUCCESS); 553 } 554 555 /* Try to lock down n pages resource for current project. */ 556 static int 557 pmem_lock(pgcnt_t n, kproject_t **prjpp) 558 { 559 mutex_enter(&curproc->p_lock); 560 if (i_ddi_incr_locked_memory(curproc, NULL, NULL, NULL, 561 ptob(n)) != 0) { 562 mutex_exit(&curproc->p_lock); 563 return (DDI_FAILURE); 564 } 565 /* Store this project in cookie for later lock/unlock. */ 566 *prjpp = curproc->p_task->tk_proj; 567 mutex_exit(&curproc->p_lock); 568 return (DDI_SUCCESS); 569 } 570 571 /* To check if all the pages in a large page are freed. */ 572 static int 573 lpg_isfree(pmem_lpg_t *plp) 574 { 575 uint_t i; 576 577 for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++) 578 if (plp->pl_bitmap[i] != BT_ULMAXMASK) 579 return (0); 580 /* All 1 means all pages are freed. */ 581 return (1); 582 } 583 584 /* 585 * Using pp to get the associated large page allocation record, searching in 586 * the splp linked list with *last as the heuristic pointer. Return NULL if 587 * not found. 588 */ 589 static pmem_lpg_t * 590 pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last) 591 { 592 pmem_lpg_t *plp; 593 pgcnt_t root_pfn; 594 595 ASSERT(pp); 596 if (splp == NULL) 597 return (NULL); 598 root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1); 599 600 /* Try last winner first. */ 601 if (*last && root_pfn == page_pptonum((*last)->pl_pp)) 602 goto pl_found; 603 604 /* Else search the whole pmem_lpg list. */ 605 for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) { 606 plp = plp->pl_next; 607 if (plp == splp) { 608 plp = NULL; 609 break; 610 } 611 ASSERT(plp->pl_pp); 612 } 613 614 *last = plp; 615 616 pl_found: 617 return (*last); 618 } 619 620 /* 621 * Remove one pmem_lpg plp from the oplpp list. 622 */ 623 static void 624 pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp) 625 { 626 if (*oplpp == plp) 627 *oplpp = plp->pl_next; /* go to next pmem_lpg */ 628 629 if (*oplpp == plp) 630 *oplpp = NULL; /* pmem_lpg list is gone */ 631 else { 632 plp->pl_prev->pl_next = plp->pl_next; 633 plp->pl_next->pl_prev = plp->pl_prev; 634 } 635 plp->pl_prev = plp->pl_next = plp; /* make plp a list of one */ 636 } 637 638 /* 639 * Concatenate page list nplpp onto the end of list plpp. 640 */ 641 static void 642 pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp) 643 { 644 pmem_lpg_t *s1p, *s2p, *e1p, *e2p; 645 646 if (*nplpp == NULL) { 647 return; 648 } 649 if (*plpp == NULL) { 650 *plpp = *nplpp; 651 return; 652 } 653 s1p = *plpp; 654 e1p = s1p->pl_prev; 655 s2p = *nplpp; 656 e2p = s2p->pl_prev; 657 s1p->pl_prev = e2p; 658 e2p->pl_next = s1p; 659 e1p->pl_next = s2p; 660 s2p->pl_prev = e1p; 661 } 662 663 /* 664 * Allocate and initialize the allocation record of one large page, the init 665 * value is 'allocated'. 666 */ 667 static pmem_lpg_t * 668 pmem_lpg_alloc(uint_t kflags) 669 { 670 pmem_lpg_t *plp; 671 672 ASSERT(pmem_pgcnt % BT_NBIPUL == 0); 673 plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags); 674 if (plp == NULL) 675 return (NULL); 676 plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags); 677 if (plp->pl_bitmap == NULL) { 678 kmem_free(plp, sizeof (*plp)); 679 return (NULL); 680 } 681 plp->pl_next = plp->pl_prev = plp; 682 return (plp); 683 } 684 685 /* Free one allocation record pointed by oplp. */ 686 static void 687 pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp) 688 { 689 if (*headp == plp) 690 *headp = plp->pl_next; /* go to next pmem_lpg_t */ 691 692 if (*headp == plp) 693 *headp = NULL; /* this list is gone */ 694 else { 695 plp->pl_prev->pl_next = plp->pl_next; 696 plp->pl_next->pl_prev = plp->pl_prev; 697 } 698 kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt)); 699 kmem_free(plp, sizeof (*plp)); 700 } 701 702 /* Free one large page headed by spp from pmem_mpool. */ 703 static void 704 lpg_free(page_t *spp) 705 { 706 page_t *pp1 = spp; 707 uint_t i; 708 709 ASSERT(MUTEX_HELD(&pmem_mutex)); 710 for (i = 0; i < pmem_pgcnt; i++) { 711 /* Break pp1 from pmem_mpool. */ 712 page_sub(&pmem_mpool, pp1); 713 pp1++; 714 } 715 /* Free pages in this large page. */ 716 page_free_pages(spp); 717 page_unresv(pmem_pgcnt); 718 pmem_nmpages -= pmem_pgcnt; 719 ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool)); 720 } 721 722 /* Put n pages in *ppp list back into pmem_mpool. */ 723 static void 724 mpool_append(page_t **ppp, pgcnt_t n) 725 { 726 ASSERT(MUTEX_HELD(&pmem_mutex)); 727 /* Put back pages. */ 728 page_list_concat(&pmem_mpool, ppp); 729 pmem_nmpages += n; 730 ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool)); 731 } 732 733 /* 734 * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into *ppp 735 * list, and return the number of grabbed pages. 736 */ 737 static pgcnt_t 738 mpool_break(page_t **ppp, pgcnt_t n) 739 { 740 pgcnt_t i; 741 742 ASSERT(MUTEX_HELD(&pmem_mutex)); 743 /* Grab the pages. */ 744 i = MIN(pmem_nmpages, n); 745 *ppp = pmem_mpool; 746 page_list_break(ppp, &pmem_mpool, i); 747 pmem_nmpages -= i; 748 ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool)); 749 return (i); 750 } 751 752 /* 753 * Create n large pages, lpages and plpp contains the number of small pages and 754 * allocation records list respectively. 755 */ 756 static int 757 lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp, 758 vnode_t *vnp, u_offset_t *offp, uint_t kflags) 759 { 760 pgcnt_t i; 761 pmem_lpg_t *plp; 762 page_t *pp; 763 764 for (i = 0, *lpages = 0; i < n; i++) { 765 /* Allocte one large page each time. */ 766 pp = page_create_va_large(vnp, *offp, pmem_lpgsize, 767 PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL); 768 if (pp == NULL) 769 break; 770 *offp += pmem_lpgsize; 771 page_list_concat(lppp, &pp); 772 *lpages += pmem_pgcnt; 773 /* Add one allocation record for this large page. */ 774 if ((plp = pmem_lpg_alloc(kflags)) == NULL) 775 return (DDI_FAILURE); 776 plp->pl_pp = pp; 777 pmem_lpg_concat(plpp, &plp); 778 } 779 return (DDI_SUCCESS); 780 } 781 782 /* 783 * Break the last r small pages from the large page list *lppp (with totally n 784 * small pages) and put them into pmem_mpool. 785 */ 786 static void 787 lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp) 788 { 789 page_t *pp, *pp1; 790 pgcnt_t i; 791 pmem_lpg_t *plp; 792 793 if (r == 0) 794 return; 795 ASSERT(*lppp != NULL && r < pmem_pgcnt); 796 page_list_break(lppp, &pp, n - r); 797 798 /* The residual should reside in the last large page. */ 799 plp = oplp->pl_prev; 800 /* IOunlock and hashout the residual pages. */ 801 for (pp1 = pp, i = 0; i < r; i++) { 802 page_io_unlock(pp1); 803 page_hashout(pp1, NULL); 804 /* Mark this page as free. */ 805 BT_SET(plp->pl_bitmap, PFIND(pp1)); 806 pp1 = pp1->p_next; 807 } 808 ASSERT(pp1 == pp); 809 /* Put these residual pages into memory pool. */ 810 mutex_enter(&pmem_mutex); 811 mpool_append(&pp, r); 812 mutex_exit(&pmem_mutex); 813 } 814 815 /* Freeing large pages in lpp and the associated allocation records in plp. */ 816 static void 817 lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp) 818 { 819 pgcnt_t i, j; 820 page_t *pp = lpp, *pp1; 821 pmem_lpg_t *plp1, *plp2; 822 823 for (i = 0; i < lpgs; i++) { 824 for (j = 0; j < pmem_pgcnt; j++) { 825 /* IO unlock and hashout this small page. */ 826 page_io_unlock(pp); 827 page_hashout(pp, NULL); 828 pp1 = pp->p_next; 829 pp->p_prev = pp->p_next = pp; 830 pp = pp1; 831 } 832 /* Free one large page at one time. */ 833 page_free_pages(lpp); 834 lpp = pp; 835 } 836 /* Free associate pmem large page allocation records. */ 837 for (plp1 = *plpp; *plpp; plp1 = plp2) { 838 plp2 = plp1->pl_next; 839 pmem_lpg_free(plpp, plp1); 840 } 841 } 842 843 /* 844 * IOlock and hashin all pages in tlist, associate them with vnode *pvnp 845 * and offset starting with *poffp. Update allocation records accordingly at 846 * the same time. 847 */ 848 static void 849 tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp) 850 { 851 page_t *pp; 852 pgcnt_t i = 0; 853 pmem_lpg_t *plp, *last_pl = NULL; 854 855 ASSERT(MUTEX_HELD(&pmem_mutex)); 856 for (pp = tlist; i < tpages; i++) { 857 ASSERT(FROM_LPG(pp)); 858 page_io_lock(pp); 859 (void) page_hashin(pp, pvnp, *poffp, NULL); 860 plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl); 861 /* Mark this page as allocated. */ 862 BT_CLEAR(plp->pl_bitmap, PFIND(pp)); 863 *poffp += PAGESIZE; 864 pp = pp->p_next; 865 } 866 ASSERT(pp == tlist); 867 } 868 869 /* 870 * IOunlock and hashout all pages in tlist, update allocation records 871 * accordingly at the same time. 872 */ 873 static void 874 tlist_out(page_t *tlist, pgcnt_t tpages) 875 { 876 page_t *pp; 877 pgcnt_t i = 0; 878 pmem_lpg_t *plp, *last_pl = NULL; 879 880 ASSERT(MUTEX_HELD(&pmem_mutex)); 881 for (pp = tlist; i < tpages; i++) { 882 ASSERT(FROM_LPG(pp)); 883 page_io_unlock(pp); 884 page_hashout(pp, NULL); 885 plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl); 886 /* Mark this page as free. */ 887 BT_SET(plp->pl_bitmap, PFIND(pp)); 888 pp = pp->p_next; 889 } 890 ASSERT(pp == tlist); 891 } 892