xref: /titanic_50/usr/src/uts/i86pc/vm/hment.c (revision 90f050286227cf4c4f8aa425555d04723d331d48)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/sysmacros.h>
31 #include <sys/kmem.h>
32 #include <sys/atomic.h>
33 #include <sys/bitmap.h>
34 #include <sys/systm.h>
35 #include <vm/seg_kmem.h>
36 #include <vm/hat.h>
37 #include <vm/vm_dep.h>
38 #include <vm/hat_i86.h>
39 #include <sys/cmn_err.h>
40 
41 
42 /*
43  * When pages are shared by more than one mapping, a list of these
44  * structs hangs off of the page_t connected by the hm_next and hm_prev
45  * fields.  Every hment is also indexed by a system-wide hash table, using
46  * hm_hashnext to connect it to the chain of hments in a single hash
47  * bucket.
48  */
49 struct hment {
50 	struct hment	*hm_hashnext;	/* next mapping on hash chain */
51 	struct hment	*hm_next;	/* next mapping of same page */
52 	struct hment	*hm_prev;	/* previous mapping of same page */
53 	htable_t	*hm_htable;	/* corresponding htable_t */
54 	uint16_t	hm_entry;	/* index of pte in htable */
55 	uint16_t	hm_pad;		/* explicitly expose compiler padding */
56 #ifdef __amd64
57 	uint32_t	hm_pad2;	/* explicitly expose compiler padding */
58 #endif
59 };
60 
61 /*
62  * Value returned by hment_walk() when dealing with a single mapping
63  * embedded in the page_t.
64  */
65 #define	HMENT_EMBEDDED ((hment_t *)(uintptr_t)1)
66 
67 kmem_cache_t *hment_cache;
68 
69 /*
70  * The hment reserve is similar to the htable reserve, with the following
71  * exception. Hment's are never needed for HAT kmem allocs.
72  *
73  * The hment_reserve_amount variable is used, so that you can change it's
74  * value to zero via a kernel debugger to force stealing to get tested.
75  */
76 #define	HMENT_RESERVE_AMOUNT	(200)	/* currently a guess at right value. */
77 uint_t hment_reserve_amount = HMENT_RESERVE_AMOUNT;
78 kmutex_t hment_reserve_mutex;
79 uint_t	hment_reserve_count;
80 hment_t	*hment_reserve_pool;
81 extern  kthread_t *hat_reserves_thread;
82 
83 /*
84  * Possible performance RFE: we might need to make this dynamic, perhaps
85  * based on the number of pages in the system.
86  */
87 #define	HMENT_HASH_SIZE (64 * 1024)
88 static uint_t hment_hash_entries = HMENT_HASH_SIZE;
89 static hment_t **hment_hash;
90 
/*
 * Compute the hash table bucket index for a mapping, given the pfn of the
 * page table that holds it and the index of its entry in that table.
 *
 * Lots of highly shared pages will have the same value for "entry" (consider
 * the starting address of "xterm" or "sh"). So we'll distinguish them by
 * adding the pfn of the page table into both the high and the low bits.
 * The shift by 9 corresponds to the range of values for entry (0..511).
 *
 * Every argument use and the whole expansion are parenthesized so that
 * expression arguments (e.g. "a | b") hash as the caller intended.  Note
 * that "pfn" is evaluated twice, so it must not have side effects.  The
 * result is masked with (hment_hash_entries - 1), which requires the table
 * size to be a power of two.
 */
#define	HMENT_HASH(pfn, entry)						\
	((uint32_t)((((pfn) << 9) + (entry) + (pfn)) &			\
	(hment_hash_entries - 1)))
99 
100 /*
101  * "mlist_lock" is a hashed mutex lock for protecting per-page mapping
102  * lists and "hash_lock" is a similar lock protecting the hment hash
103  * table.  The hashed approach is taken to avoid the spatial overhead of
104  * maintaining a separate lock for each page, while still achieving better
105  * scalability than a single lock would allow.
106  */
107 #define	MLIST_NUM_LOCK	256		/* must be power of two */
108 static kmutex_t mlist_lock[MLIST_NUM_LOCK];
109 
110 /*
111  * the shift by 9 is so that all large pages don't use the same hash bucket
112  */
113 #define	MLIST_MUTEX(pp) \
114 	&mlist_lock[((pp)->p_pagenum + ((pp)->p_pagenum >> 9)) & \
115 	(MLIST_NUM_LOCK - 1)]
116 
117 #define	HASH_NUM_LOCK	256		/* must be power of two */
118 static kmutex_t hash_lock[HASH_NUM_LOCK];
119 
120 #define	HASH_MUTEX(idx) &hash_lock[(idx) & (HASH_NUM_LOCK-1)]
121 
122 static hment_t *hment_steal(void);
123 
124 /*
125  * put one hment onto the reserves list
126  */
127 static void
128 hment_put_reserve(hment_t *hm)
129 {
130 	HATSTAT_INC(hs_hm_put_reserve);
131 	mutex_enter(&hment_reserve_mutex);
132 	hm->hm_next = hment_reserve_pool;
133 	hment_reserve_pool = hm;
134 	++hment_reserve_count;
135 	mutex_exit(&hment_reserve_mutex);
136 }
137 
138 /*
139  * Take one hment from the reserve.
140  */
141 static hment_t *
142 hment_get_reserve(void)
143 {
144 	hment_t *hm = NULL;
145 
146 	/*
147 	 * We rely on a "donation system" to refill the hment reserve
148 	 * list, which only takes place when we are allocating hments for
149 	 * user mappings.  It is theoretically possible that an incredibly
150 	 * long string of kernel hment_alloc()s with no intervening user
151 	 * hment_alloc()s could exhaust that pool.
152 	 */
153 	HATSTAT_INC(hs_hm_get_reserve);
154 	mutex_enter(&hment_reserve_mutex);
155 	if (hment_reserve_count != 0) {
156 		hm = hment_reserve_pool;
157 		hment_reserve_pool = hm->hm_next;
158 		--hment_reserve_count;
159 	}
160 	mutex_exit(&hment_reserve_mutex);
161 	return (hm);
162 }
163 
164 /*
165  * Allocate an hment
166  */
167 static hment_t *
168 hment_alloc()
169 {
170 	int km_flag = can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP;
171 	hment_t	*hm = NULL;
172 	int use_reserves = (use_boot_reserve ||
173 	    curthread == hat_reserves_thread || panicstr != NULL);
174 
175 	/*
176 	 * If we aren't using the reserves, try using kmem to get an hment.
177 	 * Donate any successful allocations to reserves if low.
178 	 *
179 	 * If we're in panic, resort to using the reserves.
180 	 */
181 	HATSTAT_INC(hs_hm_alloc);
182 	if (!use_reserves) {
183 		for (;;) {
184 			hm = kmem_cache_alloc(hment_cache, km_flag);
185 			if (hment_reserve_count >= hment_reserve_amount ||
186 			    hm == NULL || panicstr != NULL ||
187 			    curthread == hat_reserves_thread)
188 				break;
189 			hment_put_reserve(hm);
190 		}
191 	}
192 
193 	/*
194 	 * If allocation failed, we need to tap the reserves or steal
195 	 */
196 	if (hm == NULL) {
197 		if (use_reserves)
198 			hm = hment_get_reserve();
199 
200 		/*
201 		 * If we still haven't gotten an hment, attempt to steal one by
202 		 * victimizing a mapping in a user htable.
203 		 */
204 		if (hm == NULL && can_steal_post_boot)
205 			hm = hment_steal();
206 
207 		/*
208 		 * we're in dire straights, try the reserve
209 		 */
210 		if (hm == NULL)
211 			hm = hment_get_reserve();
212 
213 		/*
214 		 * still no hment is a serious problem.
215 		 */
216 		if (hm == NULL)
217 			panic("hment_alloc(): no reserve, couldn't steal");
218 	}
219 
220 
221 	hm->hm_entry = 0;
222 	hm->hm_htable = NULL;
223 	hm->hm_hashnext = NULL;
224 	hm->hm_next = NULL;
225 	hm->hm_prev = NULL;
226 	return (hm);
227 }
228 
229 /*
230  * Free an hment, possibly to the reserves list when called from the
231  * thread using the reserves. For example, when freeing an hment during an
232  * htable_steal(), we can't recurse into the kmem allocator, so we just
233  * push the hment onto the reserve list.
234  */
235 void
236 hment_free(hment_t *hm)
237 {
238 #ifdef DEBUG
239 	/*
240 	 * zero out all fields to try and force any race conditions to segfault
241 	 */
242 	bzero(hm, sizeof (*hm));
243 #endif
244 	HATSTAT_INC(hs_hm_free);
245 	if (curthread == hat_reserves_thread ||
246 	    hment_reserve_count < hment_reserve_amount)
247 		hment_put_reserve(hm);
248 	else
249 		kmem_cache_free(hment_cache, hm);
250 }
251 
252 int
253 x86_hm_held(page_t *pp)
254 {
255 	ASSERT(pp != NULL);
256 	return (MUTEX_HELD(MLIST_MUTEX(pp)));
257 }
258 
259 void
260 x86_hm_enter(page_t *pp)
261 {
262 	ASSERT(pp != NULL);
263 	mutex_enter(MLIST_MUTEX(pp));
264 }
265 
266 void
267 x86_hm_exit(page_t *pp)
268 {
269 	ASSERT(pp != NULL);
270 	mutex_exit(MLIST_MUTEX(pp));
271 }
272 
273 /*
274  * Internal routine to add a full hment to a page_t mapping list
275  */
276 static void
277 hment_insert(hment_t *hm, page_t *pp)
278 {
279 	uint_t		idx;
280 
281 	ASSERT(x86_hm_held(pp));
282 	ASSERT(!pp->p_embed);
283 
284 	/*
285 	 * Add the hment to the page's mapping list.
286 	 */
287 	++pp->p_share;
288 	hm->hm_next = pp->p_mapping;
289 	if (pp->p_mapping != NULL)
290 		((hment_t *)pp->p_mapping)->hm_prev = hm;
291 	pp->p_mapping = hm;
292 
293 	/*
294 	 * Add the hment to the system-wide hash table.
295 	 */
296 	idx = HMENT_HASH(hm->hm_htable->ht_pfn, hm->hm_entry);
297 
298 	mutex_enter(HASH_MUTEX(idx));
299 	hm->hm_hashnext = hment_hash[idx];
300 	hment_hash[idx] = hm;
301 	mutex_exit(HASH_MUTEX(idx));
302 }
303 
304 /*
305  * Prepare a mapping list entry to the given page.
306  *
307  * There are 4 different situations to deal with:
308  *
309  * - Adding the first mapping to a page_t as an embedded hment
310  * - Refaulting on an existing embedded mapping
311  * - Upgrading an embedded mapping when adding a 2nd mapping
312  * - Adding another mapping to a page_t that already has multiple mappings
313  *	 note we don't optimized for the refaulting case here.
314  *
315  * Due to competition with other threads that may be mapping/unmapping the
316  * same page and the need to drop all locks while allocating hments, any or
317  * all of the 3 situations can occur (and in almost any order) in any given
318  * call. Isn't this fun!
319  */
320 hment_t *
321 hment_prepare(htable_t *htable, uint_t entry, page_t *pp)
322 {
323 	hment_t		*hm = NULL;
324 
325 	ASSERT(x86_hm_held(pp));
326 
327 	for (;;) {
328 
329 		/*
330 		 * The most common case is establishing the first mapping to a
331 		 * page, so check that first. This doesn't need any allocated
332 		 * hment.
333 		 */
334 		if (pp->p_mapping == NULL) {
335 			ASSERT(!pp->p_embed);
336 			ASSERT(pp->p_share == 0);
337 			if (hm == NULL)
338 				break;
339 
340 			/*
341 			 * we had an hment already, so free it and retry
342 			 */
343 			goto free_and_continue;
344 		}
345 
346 		/*
347 		 * If there is an embedded mapping, we may need to
348 		 * convert it to an hment.
349 		 */
350 		if (pp->p_embed) {
351 
352 			/* should point to htable */
353 			ASSERT(pp->p_mapping != NULL);
354 
355 			/*
356 			 * If we are faulting on a pre-existing mapping
357 			 * there is no need to promote/allocate a new hment.
358 			 * This happens a lot due to segmap.
359 			 */
360 			if (pp->p_mapping == htable && pp->p_mlentry == entry) {
361 				if (hm == NULL)
362 					break;
363 				goto free_and_continue;
364 			}
365 
366 			/*
367 			 * If we have an hment allocated, use it to promote the
368 			 * existing embedded mapping.
369 			 */
370 			if (hm != NULL) {
371 				hm->hm_htable = pp->p_mapping;
372 				hm->hm_entry = pp->p_mlentry;
373 				pp->p_mapping = NULL;
374 				pp->p_share = 0;
375 				pp->p_embed = 0;
376 				hment_insert(hm, pp);
377 			}
378 
379 			/*
380 			 * We either didn't have an hment allocated or we just
381 			 * used it for the embedded mapping. In either case,
382 			 * allocate another hment and restart.
383 			 */
384 			goto allocate_and_continue;
385 		}
386 
387 		/*
388 		 * Last possibility is that we're adding an hment to a list
389 		 * of hments.
390 		 */
391 		if (hm != NULL)
392 			break;
393 allocate_and_continue:
394 		x86_hm_exit(pp);
395 		hm = hment_alloc();
396 		x86_hm_enter(pp);
397 		continue;
398 
399 free_and_continue:
400 		/*
401 		 * we allocated an hment already, free it and retry
402 		 */
403 		x86_hm_exit(pp);
404 		hment_free(hm);
405 		hm = NULL;
406 		x86_hm_enter(pp);
407 	}
408 	ASSERT(x86_hm_held(pp));
409 	return (hm);
410 }
411 
412 /*
413  * Record a mapping list entry for the htable/entry to the given page.
414  *
415  * hment_prepare() should have properly set up the situation.
416  */
417 void
418 hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
419 {
420 	ASSERT(x86_hm_held(pp));
421 
422 	/*
423 	 * The most common case is establishing the first mapping to a
424 	 * page, so check that first. This doesn't need any allocated
425 	 * hment.
426 	 */
427 	if (pp->p_mapping == NULL) {
428 		ASSERT(hm == NULL);
429 		ASSERT(!pp->p_embed);
430 		ASSERT(pp->p_share == 0);
431 		pp->p_embed = 1;
432 		pp->p_mapping = htable;
433 		pp->p_mlentry = entry;
434 		return;
435 	}
436 
437 	/*
438 	 * We should never get here with a pre-existing embedded maping
439 	 */
440 	ASSERT(!pp->p_embed);
441 
442 	/*
443 	 * add the new hment to the mapping list
444 	 */
445 	ASSERT(hm != NULL);
446 	hm->hm_htable = htable;
447 	hm->hm_entry = entry;
448 	hment_insert(hm, pp);
449 }
450 
451 /*
452  * Walk through the mappings for a page.
453  *
454  * must already have done an x86_hm_enter()
455  */
456 hment_t *
457 hment_walk(page_t *pp, htable_t **ht, uint_t *entry, hment_t *prev)
458 {
459 	hment_t		*hm;
460 
461 	ASSERT(x86_hm_held(pp));
462 
463 	if (pp->p_embed) {
464 		if (prev == NULL) {
465 			*ht = (htable_t *)pp->p_mapping;
466 			*entry = pp->p_mlentry;
467 			hm = HMENT_EMBEDDED;
468 		} else {
469 			ASSERT(prev == HMENT_EMBEDDED);
470 			hm = NULL;
471 		}
472 	} else {
473 		if (prev == NULL) {
474 			ASSERT(prev != HMENT_EMBEDDED);
475 			hm = (hment_t *)pp->p_mapping;
476 		} else {
477 			hm = prev->hm_next;
478 		}
479 
480 		if (hm != NULL) {
481 			*ht = hm->hm_htable;
482 			*entry = hm->hm_entry;
483 		}
484 	}
485 	return (hm);
486 }
487 
488 /*
489  * Remove a mapping to a page from its mapping list. Must have
490  * the corresponding mapping list locked.
491  * Finds the mapping list entry with the given pte_t and
492  * unlinks it from the mapping list.
493  */
494 hment_t *
495 hment_remove(page_t *pp, htable_t *ht, uint_t entry)
496 {
497 	hment_t		*prev = NULL;
498 	hment_t		*hm;
499 	uint_t		idx;
500 
501 	ASSERT(x86_hm_held(pp));
502 
503 	/*
504 	 * Check if we have only one mapping embedded in the page_t.
505 	 */
506 	if (pp->p_embed) {
507 		ASSERT(ht == (htable_t *)pp->p_mapping);
508 		ASSERT(entry == pp->p_mlentry);
509 		ASSERT(pp->p_share == 0);
510 		pp->p_mapping = NULL;
511 		pp->p_mlentry = 0;
512 		pp->p_embed = 0;
513 		return (NULL);
514 	}
515 
516 	/*
517 	 * Otherwise it must be in the list of hments.
518 	 * Find the hment in the system-wide hash table and remove it.
519 	 */
520 	ASSERT(pp->p_share != 0);
521 	idx = HMENT_HASH(ht->ht_pfn, entry);
522 	mutex_enter(HASH_MUTEX(idx));
523 	hm = hment_hash[idx];
524 	while (hm && (hm->hm_htable != ht || hm->hm_entry != entry)) {
525 		prev = hm;
526 		hm = hm->hm_hashnext;
527 	}
528 	if (hm == NULL) {
529 		panic("hment_remove() missing in hash table pp=%lx, ht=%lx,"
530 		    "entry=0x%x hash index=0x%x", (uintptr_t)pp, (uintptr_t)ht,
531 		    entry, idx);
532 	}
533 
534 	if (prev)
535 		prev->hm_hashnext = hm->hm_hashnext;
536 	else
537 		hment_hash[idx] = hm->hm_hashnext;
538 	mutex_exit(HASH_MUTEX(idx));
539 
540 	/*
541 	 * Remove the hment from the page's mapping list
542 	 */
543 	if (hm->hm_next)
544 		hm->hm_next->hm_prev = hm->hm_prev;
545 	if (hm->hm_prev)
546 		hm->hm_prev->hm_next = hm->hm_next;
547 	else
548 		pp->p_mapping = hm->hm_next;
549 
550 	--pp->p_share;
551 	hm->hm_hashnext = NULL;
552 	hm->hm_next = NULL;
553 	hm->hm_prev = NULL;
554 
555 	return (hm);
556 }
557 
558 /*
559  * Put initial hment's in the reserve pool.
560  */
561 void
562 hment_reserve(uint_t count)
563 {
564 	hment_t	*hm;
565 
566 	count += hment_reserve_amount;
567 
568 	while (hment_reserve_count < count) {
569 		hm = kmem_cache_alloc(hment_cache, KM_NOSLEEP);
570 		if (hm == NULL)
571 			return;
572 		hment_put_reserve(hm);
573 	}
574 }
575 
576 /*
577  * Readjust the hment reserves after they may have been used.
578  */
579 void
580 hment_adjust_reserve()
581 {
582 	hment_t	*hm;
583 
584 	/*
585 	 * Free up any excess reserves
586 	 */
587 	while (hment_reserve_count > hment_reserve_amount) {
588 		ASSERT(curthread != hat_reserves_thread);
589 		hm = hment_get_reserve();
590 		if (hm == NULL)
591 			return;
592 		hment_free(hm);
593 	}
594 }
595 
596 /*
597  * initialize hment data structures
598  */
599 void
600 hment_init(void)
601 {
602 	int i;
603 	int flags = KMC_NOHASH | KMC_NODEBUG;
604 
605 	/*
606 	 * Initialize kmem caches. On 32 bit kernel's we shut off
607 	 * debug information to save on precious kernel VA usage.
608 	 */
609 	hment_cache = kmem_cache_create("hment_t",
610 	    sizeof (hment_t), 0, NULL, NULL, NULL,
611 	    NULL, hat_memload_arena, flags);
612 
613 	hment_hash = kmem_zalloc(hment_hash_entries * sizeof (hment_t *),
614 	    KM_SLEEP);
615 
616 	for (i = 0; i < MLIST_NUM_LOCK; i++)
617 		mutex_init(&mlist_lock[i], NULL, MUTEX_DEFAULT, NULL);
618 
619 	for (i = 0; i < HASH_NUM_LOCK; i++)
620 		mutex_init(&hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
621 
622 
623 }
624 
625 /*
626  * return the number of mappings to a page
627  *
628  * Note there is no ASSERT() that the MUTEX is held for this.
629  * Hence the return value might be inaccurate if this is called without
630  * doing an x86_hm_enter().
631  */
632 uint_t
633 hment_mapcnt(page_t *pp)
634 {
635 	uint_t cnt;
636 	uint_t szc;
637 	page_t *larger;
638 	hment_t	*hm;
639 
640 	x86_hm_enter(pp);
641 	if (pp->p_mapping == NULL)
642 		cnt = 0;
643 	else if (pp->p_embed)
644 		cnt = 1;
645 	else
646 		cnt = pp->p_share;
647 	x86_hm_exit(pp);
648 
649 	/*
650 	 * walk through all larger mapping sizes counting mappings
651 	 */
652 	for (szc = 1; szc <= pp->p_szc; ++szc) {
653 		larger = PP_GROUPLEADER(pp, szc);
654 		if (larger == pp)	/* don't double count large mappings */
655 			continue;
656 
657 		x86_hm_enter(larger);
658 		if (larger->p_mapping != NULL) {
659 			if (larger->p_embed &&
660 			    ((htable_t *)larger->p_mapping)->ht_level == szc) {
661 				++cnt;
662 			} else if (!larger->p_embed) {
663 				for (hm = larger->p_mapping; hm;
664 				    hm = hm->hm_next) {
665 					if (hm->hm_htable->ht_level == szc)
666 						++cnt;
667 				}
668 			}
669 		}
670 		x86_hm_exit(larger);
671 	}
672 	return (cnt);
673 }
674 
675 /*
676  * We need to steal an hment. Walk through all the page_t's until we
677  * find one that has multiple mappings. Unload one of the mappings
678  * and reclaim that hment. Note that we'll save/restart the starting
679  * page to try and spread the pain.
680  */
681 static page_t *last_page = NULL;
682 
683 static hment_t *
684 hment_steal(void)
685 {
686 	page_t *last = last_page;
687 	page_t *pp = last;
688 	hment_t *hm = NULL;
689 	hment_t *hm2;
690 	htable_t *ht;
691 	uint_t found_one = 0;
692 
693 	HATSTAT_INC(hs_hm_steals);
694 	if (pp == NULL)
695 		last = pp = page_first();
696 
697 	while (!found_one) {
698 		HATSTAT_INC(hs_hm_steal_exam);
699 		pp = page_next(pp);
700 		if (pp == NULL)
701 			pp = page_first();
702 
703 		/*
704 		 * The loop and function exit here if nothing found to steal.
705 		 */
706 		if (pp == last)
707 			return (NULL);
708 
709 		/*
710 		 * Only lock the page_t if it has hments.
711 		 */
712 		if (pp->p_mapping == NULL || pp->p_embed)
713 			continue;
714 
715 		/*
716 		 * Search the mapping list for a usable mapping.
717 		 */
718 		x86_hm_enter(pp);
719 		if (!pp->p_embed) {
720 			for (hm = pp->p_mapping; hm; hm = hm->hm_next) {
721 				ht = hm->hm_htable;
722 				if (ht->ht_hat != kas.a_hat &&
723 				    ht->ht_busy == 0 &&
724 				    ht->ht_lock_cnt == 0) {
725 					found_one = 1;
726 					break;
727 				}
728 			}
729 		}
730 		if (!found_one)
731 			x86_hm_exit(pp);
732 	}
733 
734 	/*
735 	 * Steal the mapping we found.  Note that hati_page_unmap() will
736 	 * do the x86_hm_exit().
737 	 */
738 	hm2 = hati_page_unmap(pp, ht, hm->hm_entry);
739 	ASSERT(hm2 == hm);
740 	last_page = pp;
741 	return (hm);
742 }
743