/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>
/*
 * This file implements a balloon thread, which controls a domain's memory
 * reservation, or the amount of memory a domain is currently allocated.
 * The hypervisor provides the current memory reservation through xenbus,
 * so we register a watch on this. We will then be signalled when the
 * reservation changes. If it goes up, we map the new mfn's to our pfn's
 * (allocating page_t's if necessary), and release them into the system.
 * If the reservation goes down, we grab pages and release them back to
 * the hypervisor, saving the page_t's for later use.
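 *
 * A target change normally arrives when the toolstack writes the new size
 * (in kB) to this domain's "memory/target" xenstore node -- e.g. something
 * like "xenstore-write /local/domain/<id>/memory/target 524288" to request
 * 512MB. balloon_handler() below converts that value to pages and wakes
 * the worker thread.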
 */

/*
 * Various structures needed by the balloon thread
 */
static bln_stats_t bln_stats;
static kthread_t *bln_thread;
static kmutex_t bln_mutex;
static kcondvar_t bln_cv;
static struct xenbus_watch bln_watch;
static mfn_t new_high_mfn;

/*
 * For holding spare page_t structures - keep a singly-linked list.
 * The list may hold both valid (pagenum < mfn_count) and invalid
 * (pagenum >= mfn_count) page_t's. Valid page_t's should be inserted
 * at the front, and invalid page_t's at the back. Removal should
 * always be from the front. This is a singly-linked list using
 * p_next, so p_prev is always NULL.
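 *
 * The resulting layout (removal is always from the front):
 *
 *	front -> valid -> ... -> valid -> invalid -> ... -> invalid <- back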
 */
static page_t *bln_spare_list_front, *bln_spare_list_back;

int balloon_zero_memory = 1;
size_t balloon_minkmem = (8 * 1024 * 1024);

/*
 * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
 * slowdown when called many times in succession. If we're reassigning
 * fewer pages than the quota defined here, we just accept the slowdown.
 * If the count is greater than the quota, we tell the contig alloc code
 * to stop its accounting until we're done. Setting the quota to less
 * than 2 is not supported.
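 * For example, a 512-page reassignment takes the contig pfnlist lock once
 * rather than paying for 512 separate update_contig_pfnlist() calls.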
 *
 * Note that we define our own wrapper around the external
 * clear_and_lock_contig_pfnlist(), but we just use the version of
 * unlock_contig_pfnlist() in vm_machdep.c.
 */
uint_t bln_contig_list_quota = 50;

extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);

/*
 * Lock the pfnlist if necessary (see above), and return whether we locked it.
 */
static int
balloon_lock_contig_pfnlist(int count)
{
        if (count > bln_contig_list_quota) {
                clear_and_lock_contig_pfnlist();
                return (1);
        } else {
                return (0);
        }
}

/*
 * The page represented by pp is being given back to the hypervisor.
 * Add the page_t structure to our spare list.
 */
static void
balloon_page_add(page_t *pp)
{
        /*
         * We need to keep the page exclusively locked
         * to prevent swrand from grabbing it.
         */
        ASSERT(PAGE_EXCL(pp));
        ASSERT(MUTEX_HELD(&bln_mutex));

        pp->p_prev = NULL;
        if (bln_spare_list_front == NULL) {
                bln_spare_list_front = bln_spare_list_back = pp;
                pp->p_next = NULL;
        } else if (pp->p_pagenum >= mfn_count) {
                /*
                 * The pfn is invalid, so add at the end of list. Since these
                 * adds should *only* be done by balloon_init_new_pages(), and
                 * that does adds in order, the following ASSERT should
                 * never trigger.
                 */
                ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
                bln_spare_list_back->p_next = pp;
                pp->p_next = NULL;
                bln_spare_list_back = pp;
        } else {
                /* Add at beginning of list */
                pp->p_next = bln_spare_list_front;
                bln_spare_list_front = pp;
        }
}

/*
 * Return a page_t structure from our spare list, or NULL if none are
 * available.
 */
static page_t *
balloon_page_sub(void)
{
        page_t *pp;

        ASSERT(MUTEX_HELD(&bln_mutex));
        if (bln_spare_list_front == NULL) {
                return (NULL);
        }

        pp = bln_spare_list_front;
        ASSERT(PAGE_EXCL(pp));
        ASSERT(pp->p_pagenum <= mfn_count);
        if (pp->p_pagenum == mfn_count) {
                return (NULL);
        }

        bln_spare_list_front = pp->p_next;
        if (bln_spare_list_front == NULL)
                bln_spare_list_back = NULL;
        pp->p_next = NULL;
        return (pp);
}

/*
 * NOTE: We currently do not support growing beyond the boot memory size,
 * so the following function will not be called. It is left in here with
 * the hope that someday this restriction can be lifted, and this code can
 * be used.
 */

/*
 * This structure is placed at the start of every block of new pages
 */
typedef struct {
        struct memseg memseg;
        struct memlist memlist;
        page_t pages[1];
} mem_structs_t;

/*
 * To make the math below slightly less confusing, we calculate the first
 * two parts here. page_t's are handled separately, so they are not included.
 */
#define MEM_STRUCT_SIZE (sizeof (struct memseg) + sizeof (struct memlist))

/*
 * We want to add memory, but have no spare page_t structures. Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 */
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
        pgcnt_t metapgs, totalpgs, num_pages;
        paddr_t metasz;
        pfn_t meta_start;
        page_t *page_array;
        caddr_t va;
        int i, rv, locked;
        mem_structs_t *mem;
        struct memseg *segp;

        /* Calculate the number of pages we're going to add */
        totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

        /*
         * The following calculates the number of "meta" pages -- the pages
         * that will be required to hold page_t structures for all new pages.
         * Proof of this calculation is left up to the reader.
         */
        metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
            (PAGESIZE + sizeof (page_t)));
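        /*
         * To see why: for the page_t's of the remaining (totalpgs - metapgs)
         * pages to fit in metapgs pages, we need
         *
         *	metapgs * PAGESIZE >= (totalpgs - metapgs) * sizeof (page_t)
         *
         * and solving for metapgs gives
         *
         *	metapgs >= totalpgs - totalpgs * PAGESIZE /
         *	    (PAGESIZE + sizeof (page_t))
         *
         * Since the truncating division only shrinks the subtracted term,
         * the computed metapgs is effectively rounded up, so it always
         * satisfies the inequality.
         */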

        /*
         * Given the number of page_t structures we need, is there also
         * room in our meta pages for a memseg and memlist struct?
         * If not, we'll need one more meta page.
         */
        if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
            MEM_STRUCT_SIZE))
                metapgs++;

        /*
         * metapgs is calculated from totalpgs, which may be much larger than
         * count. If we don't have enough pages, all of the pages in this
         * batch will be made meta pages, and a future trip through
         * balloon_inc_reservation() will add the rest of the meta pages.
         */
        if (metapgs > count)
                metapgs = count;

        /*
         * Figure out the number of page_t structures that can fit in metapgs
         *
         * This will cause us to initialize more page_t structures than we
         * need - these may be used in future memory increases.
         */
        metasz = pfn_to_pa(metapgs);
        num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

        DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
            num_pages, pgcnt_t, metapgs);

        /*
         * We only increment mfn_count by count, not num_pages, to keep the
         * space of all valid pfns contiguous. This means we create page_t
         * structures with invalid pagenums -- we deal with this situation
         * in balloon_page_sub.
         */
        mfn_count += count;

        /*
         * Get a VA for the pages that will hold page_t and other structures.
         * The memseg and memlist structures will go at the beginning, with
         * the page_t structures following.
         */
        va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
        /* LINTED: improper alignment */
        mem = (mem_structs_t *)va;
        page_array = mem->pages;

        meta_start = bln_stats.bln_max_pages;

        /*
         * Set the mfn to pfn mapping for the meta pages.
         */
        locked = balloon_lock_contig_pfnlist(metapgs);
        for (i = 0; i < metapgs; i++) {
                reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
        }
        if (locked)
                unlock_contig_pfnlist();

        /*
         * For our meta pages, map them in and zero the page.
         * This will be the first time touching the new pages.
         */
        hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
            PROT_READ | PROT_WRITE,
            HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
        bzero(va, metasz);

        /*
         * Initialize the page array for the new pages.
         */
        for (i = 0; i < metapgs; i++) {
                page_array[i].p_pagenum = bln_stats.bln_max_pages++;
                page_array[i].p_offset = (u_offset_t)-1;
                page_iolock_init(&page_array[i]);
                rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
                ASSERT(rv == 1);
        }

        /*
         * For the rest of the pages, initialize the page_t struct and
         * add them to the free list
         */
        for (i = metapgs; i < num_pages; i++) {
                page_array[i].p_pagenum = bln_stats.bln_max_pages++;
                page_array[i].p_offset = (u_offset_t)-1;
                page_iolock_init(&page_array[i]);
                rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
                ASSERT(rv == 1);
                balloon_page_add(&page_array[i]);
        }

        /*
         * Remember where I said that we don't call this function? The missing
         * code right here is why. We need to set up kpm mappings for any new
         * pages coming in. However, if someone starts up a domain with small
         * memory, then greatly increases it, we could get in some horrible
         * deadlock situations as we steal page tables for kpm use, and
         * userland applications take them right back before we can use them
         * to set up our new memory. Once a way around that is found, and a
         * few other changes are made, we'll be able to enable this code.
         */

        /*
         * Update kernel structures, part 1: memsegs list
         */
        mem->memseg.pages_base = meta_start;
        mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
        mem->memseg.pages = &page_array[0];
        mem->memseg.epages = &page_array[num_pages - 1];
        mem->memseg.next = NULL;
        memsegs_lock(1);
        for (segp = memsegs; segp->next != NULL; segp = segp->next)
                ;
        segp->next = &mem->memseg;
        memsegs_unlock(1);

        /*
         * Update kernel structures, part 2: mem_node array
         */
        mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

        /*
         * Update kernel structures, part 3: phys_install array
         * (*sigh* how many of these things do we need?)
         */
        memlist_write_lock();
        memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
            &phys_install);
        memlist_write_unlock();

        build_pfn_hash();

        return (metapgs);
}

/* How many ulong_t's can we fit on a page? */
#define FRAME_ARRAY_SIZE (PAGESIZE / sizeof (ulong_t))
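/*
 * With 4K pages and 8-byte ulong_t's, FRAME_ARRAY_SIZE works out to 512
 * entries, so each pass through the reservation-adjusting routines below
 * handles at most 2MB worth of pages.
 */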

/*
 * These are too large to declare on the stack, so we make them static instead
 */
static ulong_t mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t pfn_frames[FRAME_ARRAY_SIZE];

/*
 * This function is called when our reservation is increasing. Make a
 * hypervisor call to get our new pages, then integrate them into the system.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
        int i, cnt, locked;
        int meta_pg_start, meta_pg_end;
        long rv;
        page_t *pp;
        page_t *new_list_front, *new_list_back;

        /* Make sure we're single-threaded. */
        ASSERT(MUTEX_HELD(&bln_mutex));

        rv = 0;
        new_list_front = new_list_back = NULL;
        meta_pg_start = meta_pg_end = 0;
        bzero(mfn_frames, PAGESIZE);

        if (credit > FRAME_ARRAY_SIZE)
                credit = FRAME_ARRAY_SIZE;

        xen_block_migrate();
        rv = balloon_alloc_pages(credit, mfn_frames);

        if (rv < 0) {
                xen_allow_migrate();
                return (0);
        }
        for (i = 0; i < rv; i++) {
                if (mfn_frames[i] > new_high_mfn)
                        new_high_mfn = mfn_frames[i];

                pp = balloon_page_sub();
                if (pp == NULL) {
                        /*
                         * We pass the index into the current mfn array,
                         * then move the counter past the mfns we used
                         */
                        meta_pg_start = i;
                        cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
                        i += cnt;
                        meta_pg_end = i;
                        if (i < rv) {
                                pp = balloon_page_sub();
                        } else {
                                ASSERT(i == rv);
                        }
                }
                if (pp == NULL) {
                        break;
                }

                if (new_list_back == NULL) {
                        new_list_front = new_list_back = pp;
                } else {
                        new_list_back->p_next = pp;
                        new_list_back = pp;
                }
                pp->p_next = NULL;
        }
        cnt = i;
        locked = balloon_lock_contig_pfnlist(cnt);
        for (i = 0, pp = new_list_front; i < meta_pg_start;
            i++, pp = pp->p_next) {
                reassign_pfn(pp->p_pagenum, mfn_frames[i]);
        }
        for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
                reassign_pfn(pp->p_pagenum, mfn_frames[i]);
        }
        if (locked)
                unlock_contig_pfnlist();

        /*
         * Make sure we don't allow pages without pfn->mfn mappings
         * into the system.
         */
        ASSERT(pp == NULL);

        while (new_list_front != NULL) {
                pp = new_list_front;
                new_list_front = pp->p_next;
                page_free(pp, 1);
        }

        /*
         * Variable review: at this point, rv contains the number of pages
         * the hypervisor gave us. cnt contains the number of pages for which
         * we had page_t structures. i contains the number of pages
         * where we set up pfn <-> mfn mappings. If this ASSERT trips, that
         * means we somehow lost page_t's from our local list.
         */
        ASSERT(cnt == i);
        if (cnt < rv) {
                /*
                 * We couldn't get page structures.
                 *
                 * This shouldn't happen, but causes no real harm if it does.
                 * On debug kernels, we'll flag it. On all kernels, we'll
                 * give back the pages we couldn't assign.
                 *
                 * Since these pages are new to the system and haven't been
                 * used, we don't bother zeroing them.
                 */
#ifdef DEBUG
                cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
#endif /* DEBUG */

                (void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL,
                    NULL);

                rv = cnt;
        }

        xen_allow_migrate();
        page_unresv(rv - (meta_pg_end - meta_pg_start));
        return (rv);
}

/*
 * This function is called when we want to decrease the memory reservation
 * of our domain. Allocate the memory and make a hypervisor call to give
 * it back.
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
        int i, locked;
        long rv;
        ulong_t request;
        page_t *pp;

        bzero(mfn_frames, sizeof (mfn_frames));
        bzero(pfn_frames, sizeof (pfn_frames));

        if (debit > FRAME_ARRAY_SIZE) {
                debit = FRAME_ARRAY_SIZE;
        }
        request = debit;

        /*
         * Don't bother if there isn't a safe amount of kmem left.
         */
        if (kmem_avail() < balloon_minkmem) {
                kmem_reap();
                if (kmem_avail() < balloon_minkmem)
                        return (0);
        }

        if (page_resv(request, KM_NOSLEEP) == 0) {
                return (0);
        }
        xen_block_migrate();
        for (i = 0; i < debit; i++) {
                pp = page_get_high_mfn(new_high_mfn);
                new_high_mfn = 0;
                if (pp == NULL) {
                        /*
                         * Call kmem_reap(), then try once more,
                         * but only if there is a safe amount of
                         * kmem left.
                         */
                        kmem_reap();
                        if (kmem_avail() < balloon_minkmem ||
                            (pp = page_get_high_mfn(0)) == NULL) {
                                debit = i;
                                break;
                        }
                }
                ASSERT(PAGE_EXCL(pp));
                ASSERT(!hat_page_is_mapped(pp));

                balloon_page_add(pp);
                pfn_frames[i] = pp->p_pagenum;
                mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
        }
        if (debit == 0) {
                xen_allow_migrate();
                page_unresv(request);
                return (0);
        }

        /*
         * We zero all the pages before we start reassigning them in order to
         * minimize the time spent holding the lock on the contig pfn list.
         */
        if (balloon_zero_memory) {
                for (i = 0; i < debit; i++) {
                        pfnzero(pfn_frames[i], 0, PAGESIZE);
                }
        }

        /*
         * Remove all mappings for the pfns from the system
         */
        locked = balloon_lock_contig_pfnlist(debit);
        for (i = 0; i < debit; i++) {
                reassign_pfn(pfn_frames[i], MFN_INVALID);
        }
        if (locked)
                unlock_contig_pfnlist();

        rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);

        if (rv < 0) {
                cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
                    "failed - up to %lu pages lost (error = %ld)", debit, rv);
                rv = 0;
        } else if (rv != debit) {
                panic("Unexpected return value (%ld) from decrease "
                    "reservation hypervisor call", rv);
        }

        xen_allow_migrate();
        if (debit != request)
                page_unresv(request - debit);
        return (rv);
}

/*
 * This function is the callback which is called when the memory/target
 * node is changed. When it is fired, we will read a new reservation
 * target for our domain and signal the worker thread to make the change.
 *
 * If the reservation is larger than we can handle, we issue a warning. dom0
 * does this automatically every boot, so we skip the first warning on dom0.
 */
/*ARGSUSED*/
static void
balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
{
        ulong_t new_target_kb;
        pgcnt_t new_target_pages;
        int rv;
        static uchar_t warning_cnt = 0;

        rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb);
        if (rv != 0) {
                return;
        }

        /* new_target is in kB - change this to pages */
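        /* (e.g., with 4K pages, a 524288kB target becomes 131072 pages) */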
        new_target_pages = kbtop(new_target_kb);

        DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);

        /*
         * Unfortunately, dom0 may give us a target that is larger than
         * our max limit. Re-check the limit, and, if the new target is
         * too large, adjust it downwards.
         */
        mutex_enter(&bln_mutex);
        if (new_target_pages > bln_stats.bln_max_pages) {
                DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
                    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
                if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
                        cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
                            "larger than original memory size (0x%lx pages). "
                            "Ballooning beyond original memory size is not "
                            "allowed.",
                            new_target_pages, bln_stats.bln_max_pages);
                }
                warning_cnt = 1;
                bln_stats.bln_new_target = bln_stats.bln_max_pages;
        } else {
                bln_stats.bln_new_target = new_target_pages;
        }

        mutex_exit(&bln_mutex);
        cv_signal(&bln_cv);
}

/*
 * bln_wait_sec can be used to throttle the hv calls, but by default it's
 * turned off. If a balloon attempt fails, the wait time is forced on, and
 * then is exponentially increased as further attempts fail.
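 * For example, with the default bln_wait_shift of 1, a run of failed
 * attempts is retried after 1, 2, 4, 8, ... seconds.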
 */
uint_t bln_wait_sec = 0;
uint_t bln_wait_shift = 1;

/*
 * This is the main balloon thread. Wait on the cv. When woken, if our
 * reservation has changed, call the appropriate function to adjust the
 * reservation.
 */
static void
balloon_worker_thread(void)
{
        uint_t bln_wait;
        callb_cpr_t cprinfo;
        spgcnt_t rv;

        bln_wait = bln_wait_sec;

        CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
        for (;;) {
                rv = 0;

                mutex_enter(&bln_mutex);
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
                        /*
                         * We weren't able to fully complete the request
                         * last time through, so try again.
                         */
                        (void) cv_reltimedwait(&bln_cv, &bln_mutex,
                            (bln_wait * hz), TR_CLOCK_TICK);
                } else {
                        cv_wait(&bln_cv, &bln_mutex);
                }
                CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);

                if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
                        if (bln_stats.bln_new_target <
                            bln_stats.bln_current_pages) {
                                /* reservation shrunk */
                                rv = -balloon_dec_reservation(
                                    bln_stats.bln_current_pages -
                                    bln_stats.bln_new_target);
                        } else if (bln_stats.bln_new_target >
                            bln_stats.bln_current_pages) {
                                /* reservation grew */
                                rv = balloon_inc_reservation(
                                    bln_stats.bln_new_target -
                                    bln_stats.bln_current_pages);
                        }
                }
                if (rv == 0) {
                        if (bln_wait == 0) {
                                bln_wait = 1;
                        } else {
                                bln_wait <<= bln_wait_shift;
                        }
                } else {
                        bln_stats.bln_current_pages += rv;
                        bln_wait = bln_wait_sec;
                }
                if (bln_stats.bln_current_pages < bln_stats.bln_low)
                        bln_stats.bln_low = bln_stats.bln_current_pages;
                else if (bln_stats.bln_current_pages > bln_stats.bln_high)
                        bln_stats.bln_high = bln_stats.bln_current_pages;
                mutex_exit(&bln_mutex);
        }
}

/*
 * Called after balloon_init(), which is below. The xenbus thread is up
 * and running, so we can register our watch and create the balloon thread.
 */
static void
balloon_config_watch(int state)
{
        if (state != XENSTORE_UP)
                return;

        bln_watch.node = "memory/target";
        bln_watch.callback = balloon_handler;
        if (register_xenbus_watch(&bln_watch)) {
                cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
                    "thread will be disabled");
                return;
        }

        if (bln_thread == NULL)
                bln_thread = thread_create(NULL, 0, balloon_worker_thread,
                    NULL, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Basic initialization of the balloon thread. Set all of our variables,
 * and register a callback for later when we can register a xenbus watch.
 */
void
balloon_init(pgcnt_t nr_pages)
{
        domid_t domid = DOMID_SELF;

        bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
        bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
        bln_stats.bln_max_pages = nr_pages;
        cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);

        bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
            XENMEM_maximum_reservation, &domid);

        (void) xs_register_xenbus_callback(balloon_config_watch);
}

/*
 * These functions are called from the network drivers when they gain a page
 * or give one away. We simply update our count. Note that the counter
 * tracks the number of pages we give away, so we need to subtract any
 * amount passed to balloon_drv_added.
 */
void
balloon_drv_added(int64_t delta)
{
        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
}

void
balloon_drv_subtracted(int64_t delta)
{
        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}

/*
 * balloon_alloc_pages()
 *	Allocate page_cnt mfns. mfns storage provided by the caller. Returns
 *	the number of pages allocated, which could be less than page_cnt, or
 *	a negative number if an error occurred.
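 *
 *	A sketch of typical use (balloon_inc_reservation() above is the
 *	in-kernel caller; error handling omitted):
 *
 *		mfn_t mfns[16];
 *		long got = balloon_alloc_pages(16, mfns);
 *
 *	On success, the caller is expected to map each of the got new mfns
 *	to a pfn with reassign_pfn() before releasing pages to the system.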
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
        xen_memory_reservation_t memres;
        long rv;

        bzero(&memres, sizeof (memres));
        /*LINTED: constant in conditional context*/
        set_xen_guest_handle(memres.extent_start, mfns);
        memres.domid = DOMID_SELF;
        memres.nr_extents = page_cnt;

        rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
        if (rv > 0)
                atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
        return (rv);
}

/*
 * balloon_free_pages()
 *	free page_cnt pages, using any combination of mfns, pfns, and kva as
 *	long as they refer to the same mapping. If an array of mfns is passed
 *	in, we assume they were already cleared. Otherwise, we need to zero
 *	the pages before giving them back to the hypervisor. kva space is not
 *	free'd up in case the caller wants to re-use it.
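 *
 *	For example, balloon_dec_reservation() above passes an array of
 *	already-zeroed mfns (kva and pfns NULL), returning the whole batch
 *	in one hypercall, while balloon_replace_pages() below frees single
 *	pages by pfn.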
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
        xen_memory_reservation_t memdec;
        mfn_t mfn;
        pfn_t pfn;
        uint_t i;
        long e;

#ifdef DEBUG
        /* make sure kva is page aligned and maps to first pfn */
        if (kva != NULL) {
                ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
                if (pfns != NULL) {
                        ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
                }
        }
#endif /* DEBUG */

        /* if we have a kva, we can clean all pages with just one bzero */
        if ((kva != NULL) && balloon_zero_memory) {
                bzero(kva, (page_cnt * PAGESIZE));
        }

        /* if we were given a kva and/or a pfn */
        if ((kva != NULL) || (pfns != NULL)) {

                /*
                 * All the current callers only pass 1 page when using kva or
                 * pfns, and use mfns when passing multiple pages. If that
                 * assumption is changed, the following code will need some
                 * work. The following ASSERT() guarantees we're respecting
                 * the io locking quota.
                 */
                ASSERT(page_cnt < bln_contig_list_quota);

                /* go through all the pages */
                for (i = 0; i < page_cnt; i++) {

                        /* get the next pfn */
                        if (pfns == NULL) {
                                pfn = hat_getpfnum(kas.a_hat,
                                    (kva + (PAGESIZE * i)));
                        } else {
                                pfn = pfns[i];
                        }

                        /*
                         * if we didn't already zero this page, do it now. we
                         * need to do this *before* we give back the MFN
                         */
                        if ((kva == NULL) && (balloon_zero_memory)) {
                                pfnzero(pfn, 0, PAGESIZE);
                        }

                        /*
                         * unmap the pfn. We don't free up the kva vmem space
                         * so the caller can re-use it. The page must be
                         * unmapped before it is given back to the hypervisor.
                         */
                        if (kva != NULL) {
                                hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
                                    PAGESIZE, HAT_UNLOAD_UNMAP);
                        }

                        /* grab the mfn before the pfn is marked as invalid */
                        mfn = pfn_to_mfn(pfn);

                        /* mark the pfn as invalid */
                        reassign_pfn(pfn, MFN_INVALID);

                        /*
                         * if we weren't given an array of MFNs, we need to
                         * free them up one at a time. Otherwise, we'll wait
                         * until later and do it in one hypercall
                         */
                        if (mfns == NULL) {
                                bzero(&memdec, sizeof (memdec));
                                /*LINTED: constant in conditional context*/
                                set_xen_guest_handle(memdec.extent_start,
                                    &mfn);
                                memdec.domid = DOMID_SELF;
                                memdec.nr_extents = 1;
                                e = HYPERVISOR_memory_op(
                                    XENMEM_decrease_reservation, &memdec);
                                if (e != 1) {
                                        cmn_err(CE_PANIC, "balloon: unable to "
                                            "give a page back to the "
                                            "hypervisor.\n");
                                }
                        }
                }
        }

        /*
         * if we were passed in MFNs, we haven't free'd them up yet. We can
         * do it with one call.
         */
        if (mfns != NULL) {
                bzero(&memdec, sizeof (memdec));
                /*LINTED: constant in conditional context*/
                set_xen_guest_handle(memdec.extent_start, mfns);
                memdec.domid = DOMID_SELF;
                memdec.nr_extents = page_cnt;
                e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
                if (e != page_cnt) {
                        cmn_err(CE_PANIC, "balloon: unable to give pages back "
                            "to the hypervisor.\n");
                }
        }

        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
        return (page_cnt);
}

/*
 * balloon_replace_pages()
 *	Try to replace nextents blocks of 2^order pages. addr_bits specifies
 *	how many bits of address the pages must be within (i.e. 16 would mean
 *	that the pages cannot have an address > 64k). The constraints are on
 *	what the hypervisor gives us -- we are free to give any pages in
 *	exchange. The array pp is the pages we are giving away. The caller
 *	provides storage space for mfns, which hold the new physical pages.
 */
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
    uint_t order, mfn_t *mfns)
{
        xen_memory_reservation_t memres;
        long fallback_cnt;
        long cnt;
        uint_t i, j, page_cnt, extlen;
        long e;
        int locked;

        /*
         * we shouldn't be allocating constrained pages on a guest. It doesn't
         * make any sense. They won't be constrained after a migration.
         */
        ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

        extlen = 1 << order;
        page_cnt = nextents * extlen;
        /* Give back the current pages to the hypervisor */
        for (i = 0; i < page_cnt; i++) {
                cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
                if (cnt != 1) {
                        cmn_err(CE_PANIC, "balloon: unable to give a page back"
                            " to the hypervisor.\n");
                }
        }

        /*
         * try to allocate the new pages using addr_bits and order. If we
         * can't get all of the pages, try to get the remaining pages with no
         * constraints and, if that was successful, return the number of
         * constrained pages we did allocate.
         */
        bzero(&memres, sizeof (memres));
        /*LINTED: constant in conditional context*/
        set_xen_guest_handle(memres.extent_start, mfns);
        memres.domid = DOMID_SELF;
        memres.nr_extents = nextents;
        memres.mem_flags = XENMEMF_address_bits(addr_bits);
        memres.extent_order = order;
        cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
        /* assign the new MFNs to the current PFNs */
        locked = balloon_lock_contig_pfnlist(cnt * extlen);
        for (i = 0; i < cnt; i++) {
                for (j = 0; j < extlen; j++) {
                        reassign_pfn(pp[i * extlen + j]->p_pagenum,
                            mfns[i] + j);
                }
        }
        if (locked)
                unlock_contig_pfnlist();
        if (cnt != nextents) {
                if (cnt < 0) {
                        cnt = 0;
                }

                /*
                 * We couldn't get enough memory to satisfy our requirements.
                 * The above loop will assign the parts of the request that
                 * were successful (this part may be 0). We need to fill
                 * in the rest. The bzero below clears out extent_order and
                 * address_bits, so we'll take anything from the hypervisor
                 * to replace the pages we gave away.
                 */
                fallback_cnt = page_cnt - cnt * extlen;
                bzero(&memres, sizeof (memres));
                /*LINTED: constant in conditional context*/
                set_xen_guest_handle(memres.extent_start, mfns);
                memres.domid = DOMID_SELF;
                memres.nr_extents = fallback_cnt;
                e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
                if (e != fallback_cnt) {
                        cmn_err(CE_PANIC, "balloon: unable to recover from "
                            "failed increase_reservation.\n");
                }
                locked = balloon_lock_contig_pfnlist(fallback_cnt);
                for (i = 0; i < fallback_cnt; i++) {
                        uint_t offset = page_cnt - fallback_cnt;

                        /*
                         * We already used pp[0...(cnt * extlen)] before,
                         * so start at the next entry in the pp array.
                         */
                        reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
                }
                if (locked)
                        unlock_contig_pfnlist();
        }

        /*
         * balloon_free_pages increments our counter. Decrement it here.
         */
        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);

        /*
         * return the number of extents we were able to replace. If we got
         * this far, we know all the pp's are valid.
         */
        return (cnt);
}

/*
 * Called from the driver - return the requested stat.
 */
size_t
balloon_values(int cmd)
{
        switch (cmd) {
        case BLN_IOCTL_CURRENT:
                return (ptokb(bln_stats.bln_current_pages));
        case BLN_IOCTL_TARGET:
                return (ptokb(bln_stats.bln_new_target));
        case BLN_IOCTL_LOW:
                return (ptokb(bln_stats.bln_low));
        case BLN_IOCTL_HIGH:
                return (ptokb(bln_stats.bln_high));
        case BLN_IOCTL_LIMIT:
                return (ptokb(bln_stats.bln_hard_limit));
        default:
                panic("Unexpected cmd %d in balloon_values()\n", cmd);
        }
        /*NOTREACHED*/
}