/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright 2016 PALO, Richard.
 */

#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>

/*
 * This file implements a balloon thread, which controls a domain's memory
 * reservation, or the amount of memory a domain is currently allocated.
 * The hypervisor provides the current memory reservation through xenbus,
 * so we register a watch on this. We will then be signalled when the
 * reservation changes. If it goes up, we map the new mfn's to our pfn's
 * (allocating page_t's if necessary), and release them into the system.
 * If the reservation goes down, we grab pages and release them back to
 * the hypervisor, saving the page_t's for later use.
 */

/*
 * Various structures needed by the balloon thread
 */
static bln_stats_t bln_stats;
static kthread_t *bln_thread;
static kmutex_t bln_mutex;
static kcondvar_t bln_cv;
static struct xenbus_watch bln_watch;
static mfn_t new_high_mfn;

/*
 * For holding spare page_t structures - keep a singly-linked list.
 * The list may hold both valid (pagenum < mfn_count) and invalid
 * (pagenum >= mfn_count) page_t's. Valid page_t's should be inserted
 * at the front, and invalid page_t's at the back. Removal should
 * always be from the front. This is a singly-linked list using
 * p_next, so p_prev is always NULL.
 */
static page_t *bln_spare_list_front, *bln_spare_list_back;

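/*
 * Tunables: balloon_zero_memory controls whether pages are zeroed before
 * they are handed back to the hypervisor, and balloon_minkmem is the
 * minimum amount of available kmem we require before agreeing to shrink
 * the reservation (see balloon_dec_reservation() below).
 */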
int balloon_zero_memory = 1;
size_t balloon_minkmem = (8 * 1024 * 1024);

/*
 * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
 * slowdown when called multiple times. If we're reassigning fewer than the
 * quota defined here, we just accept the slowdown. If the count is greater
 * than the quota, we tell the contig alloc code to stop its accounting until
 * we're done. Setting the quota to less than 2 is not supported.
 *
 * Note that we define our own wrapper around the external
 * clear_and_lock_contig_pfnlist(), but we just use the version of
 * unlock_contig_pfnlist() in vm_machdep.c.
 */
uint_t bln_contig_list_quota = 50;

extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);

/*
 * Lock the pfnlist if necessary (see above), and return whether we locked it.
 */
static int
balloon_lock_contig_pfnlist(int count)
{
	if (count > bln_contig_list_quota) {
		clear_and_lock_contig_pfnlist();
		return (1);
	} else {
		return (0);
	}
}

/*
 * The page represented by pp is being given back to the hypervisor.
 * Add the page_t structure to our spare list.
 */
static void
balloon_page_add(page_t *pp)
{
	/*
	 * We need to keep the page exclusively locked
	 * to prevent swrand from grabbing it.
	 */
	ASSERT(PAGE_EXCL(pp));
	ASSERT(MUTEX_HELD(&bln_mutex));

	pp->p_prev = NULL;
	if (bln_spare_list_front == NULL) {
		bln_spare_list_front = bln_spare_list_back = pp;
		pp->p_next = NULL;
	} else if (pp->p_pagenum >= mfn_count) {
		/*
		 * The pfn is invalid, so add at the end of list. Since these
		 * adds should *only* be done by balloon_init_new_pages(), and
		 * that does adds in order, the following ASSERT should
		 * never trigger.
		 */
		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
		bln_spare_list_back->p_next = pp;
		pp->p_next = NULL;
		bln_spare_list_back = pp;
	} else {
		/* Add at beginning of list */
		pp->p_next = bln_spare_list_front;
		bln_spare_list_front = pp;
	}
}

/*
 * Return a page_t structure from our spare list, or NULL if none are available.
 */
static page_t *
balloon_page_sub(void)
{
	page_t *pp;

	ASSERT(MUTEX_HELD(&bln_mutex));
	if (bln_spare_list_front == NULL) {
		return (NULL);
	}

	pp = bln_spare_list_front;
	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_pagenum <= mfn_count);
	if (pp->p_pagenum == mfn_count) {
		return (NULL);
	}

	bln_spare_list_front = pp->p_next;
	if (bln_spare_list_front == NULL)
		bln_spare_list_back = NULL;
	pp->p_next = NULL;
	return (pp);
}

/*
 * NOTE: We currently do not support growing beyond the boot memory size,
 * so the following function will not be called. It is left in here with
 * the hope that someday this restriction can be lifted, and this code can
 * be used.
 */

/*
 * This structure is placed at the start of every block of new pages
 */
typedef struct {
	struct memseg memseg;
	struct memlist memlist;
	page_t pages[1];
} mem_structs_t;

/*
 * To make the math below slightly less confusing, we calculate the first
 * two parts here. page_t's are handled separately, so they are not included.
 */
#define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))

/*
 * We want to add memory, but have no spare page_t structures. Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 */
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
	pgcnt_t metapgs, totalpgs, num_pages;
	paddr_t metasz;
	pfn_t meta_start;
	page_t *page_array;
	caddr_t va;
	int i, rv, locked;
	mem_structs_t *mem;
	struct memseg *segp;

	/* Calculate the number of pages we're going to add */
	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

	/*
	 * The following calculates the number of "meta" pages -- the pages
	 * that will be required to hold page_t structures for all new pages.
	 * Proof of this calculation is left up to the reader.
	 */
	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));
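
	/*
	 * For the skeptical reader: each data page consumes PAGESIZE bytes
	 * of the new memory for itself plus sizeof (page_t) bytes of it for
	 * metadata, so the number of data pages that totalpgs new pages can
	 * support is (totalpgs << PAGESHIFT) / (PAGESIZE + sizeof (page_t));
	 * metapgs is whatever is left over.
	 */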

	/*
	 * Given the number of page_t structures we need, is there also
	 * room in our meta pages for a memseg and memlist struct?
	 * If not, we'll need one more meta page.
	 */
	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
	    MEM_STRUCT_SIZE))
		metapgs++;

	/*
	 * metapgs is calculated from totalpgs, which may be much larger than
	 * count. If we don't have enough pages, all of the pages in this
	 * batch will be made meta pages, and a future trip through
	 * balloon_inc_reservation() will add the rest of the meta pages.
	 */
	if (metapgs > count)
		metapgs = count;

	/*
	 * Figure out the number of page_t structures that can fit in metapgs
	 *
	 * This will cause us to initialize more page_t structures than we
	 * need - these may be used in future memory increases.
	 */
	metasz = pfn_to_pa(metapgs);
	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
	    num_pages, pgcnt_t, metapgs);

	/*
	 * We only increment mfn_count by count, not num_pages, to keep the
	 * space of all valid pfns contiguous. This means we create page_t
	 * structures with invalid pagenums -- we deal with this situation
	 * in balloon_page_sub.
	 */
	mfn_count += count;

	/*
	 * Get a VA for the pages that will hold page_t and other structures.
	 * The memseg and memlist structures will go at the beginning, with
	 * the page_t structures following.
	 */
	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
	/* LINTED: improper alignment */
	mem = (mem_structs_t *)va;
	page_array = mem->pages;

	meta_start = bln_stats.bln_max_pages;

	/*
	 * Set the mfn to pfn mapping for the meta pages.
	 */
	locked = balloon_lock_contig_pfnlist(metapgs);
	for (i = 0; i < metapgs; i++) {
		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * For our meta pages, map them in and zero the page.
	 * This will be the first time touching the new pages.
	 */
	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
	    PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
	bzero(va, metasz);

	/*
	 * Initialize the page array for the new pages.
	 */
	for (i = 0; i < metapgs; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
	}

	/*
	 * For the rest of the pages, initialize the page_t struct and
	 * add them to the free list
	 */
	for (i = metapgs; i < num_pages; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
		balloon_page_add(&page_array[i]);
	}

	/*
	 * Remember where I said that we don't call this function? The missing
	 * code right here is why. We need to set up kpm mappings for any new
	 * pages coming in. However, if someone starts up a domain with small
	 * memory, then greatly increases it, we could get in some horrible
	 * deadlock situations as we steal page tables for kpm use, and
	 * userland applications take them right back before we can use them
	 * to set up our new memory. Once a way around that is found, and a
	 * few other changes are made, we'll be able to enable this code.
	 */

	/*
	 * Update kernel structures, part 1: memsegs list
	 */
	mem->memseg.pages_base = meta_start;
	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
	mem->memseg.pages = &page_array[0];
	mem->memseg.epages = &page_array[num_pages - 1];
	mem->memseg.next = NULL;
	memsegs_lock(1);
	for (segp = memsegs; segp->next != NULL; segp = segp->next)
		;
	segp->next = &mem->memseg;
	memsegs_unlock(1);

	/*
	 * Update kernel structures, part 2: mem_node array
	 */
	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

	/*
	 * Update kernel structures, part 3: phys_install array
	 * (*sigh* how many of these things do we need?)
	 */
	memlist_write_lock();
	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
	    &phys_install);
	memlist_write_unlock();

	build_pfn_hash();

	return (metapgs);
}

/* How many ulong_t's can we fit on a page? */
#define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))
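/* e.g. with 4K pages and an 8-byte ulong_t (64-bit kernel), 4096 / 8 = 512 */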

/*
 * These are too large to declare on the stack, so we make them static instead
 */
static ulong_t mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t pfn_frames[FRAME_ARRAY_SIZE];

/*
 * This function is called when our reservation is increasing. Make a
 * hypervisor call to get our new pages, then integrate them into the system.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
	int i, cnt, locked;
	int meta_pg_start, meta_pg_end;
	long rv;
	page_t *pp;
	page_t *new_list_front, *new_list_back;

	/* Make sure we're single-threaded. */
	ASSERT(MUTEX_HELD(&bln_mutex));

	rv = 0;
	new_list_front = new_list_back = NULL;
	meta_pg_start = meta_pg_end = 0;
	bzero(mfn_frames, PAGESIZE);

	if (credit > FRAME_ARRAY_SIZE)
		credit = FRAME_ARRAY_SIZE;

	xen_block_migrate();
	rv = balloon_alloc_pages(credit, mfn_frames);

	if (rv < 0) {
		xen_allow_migrate();
		return (0);
	}
	for (i = 0; i < rv; i++) {
		if (mfn_frames[i] > new_high_mfn)
			new_high_mfn = mfn_frames[i];

		pp = balloon_page_sub();
		if (pp == NULL) {
			/*
			 * We pass the index into the current mfn array,
			 * then move the counter past the mfns we used
			 */
			meta_pg_start = i;
			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
			i += cnt;
			meta_pg_end = i;
			if (i < rv) {
				pp = balloon_page_sub();
			} else {
				ASSERT(i == rv);
			}
		}
		if (pp == NULL) {
			break;
		}

		if (new_list_back == NULL) {
			new_list_front = new_list_back = pp;
		} else {
			new_list_back->p_next = pp;
			new_list_back = pp;
		}
		pp->p_next = NULL;
	}
	cnt = i;
	locked = balloon_lock_contig_pfnlist(cnt);
	for (i = 0, pp = new_list_front; i < meta_pg_start;
	    i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * Make sure we don't allow pages without pfn->mfn mappings
	 * into the system.
	 */
	ASSERT(pp == NULL);

	while (new_list_front != NULL) {
		pp = new_list_front;
		new_list_front = pp->p_next;
		page_free(pp, 1);
	}

	/*
	 * Variable review: at this point, rv contains the number of pages
	 * the hypervisor gave us. cnt contains the number of pages for which
	 * we had page_t structures. i contains the number of pages
	 * where we set up pfn <-> mfn mappings. If this ASSERT trips, that
	 * means we somehow lost page_t's from our local list.
	 */
	ASSERT(cnt == i);
	if (cnt < rv) {
		/*
		 * We couldn't get page structures.
		 *
		 * This shouldn't happen, but causes no real harm if it does.
		 * On debug kernels, we'll flag it. On all kernels, we'll
		 * give back the pages we couldn't assign.
		 *
		 * Since these pages are new to the system and haven't been
		 * used, we don't bother zeroing them.
		 */
#ifdef DEBUG
		cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
#endif	/* DEBUG */

		(void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL);

		rv = cnt;
	}

	xen_allow_migrate();
	page_unresv(rv - (meta_pg_end - meta_pg_start));
	return (rv);
}

/*
 * This function is called when we want to decrease the memory reservation
 * of our domain. Allocate the memory and make a hypervisor call to give
 * it back.
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
	int i, locked;
	long rv;
	ulong_t request;
	page_t *pp;

	bzero(mfn_frames, sizeof (mfn_frames));
	bzero(pfn_frames, sizeof (pfn_frames));

	if (debit > FRAME_ARRAY_SIZE) {
		debit = FRAME_ARRAY_SIZE;
	}
	request = debit;

	/*
	 * Don't bother if there isn't a safe amount of kmem left.
	 */
	if (kmem_avail() < balloon_minkmem) {
		kmem_reap();
		if (kmem_avail() < balloon_minkmem)
			return (0);
	}

	if (page_resv(request, KM_NOSLEEP) == 0) {
		return (0);
	}
	xen_block_migrate();
	for (i = 0; i < debit; i++) {
		pp = page_get_high_mfn(new_high_mfn);
		new_high_mfn = 0;
		if (pp == NULL) {
			/*
			 * Call kmem_reap(), then try once more,
			 * but only if there is a safe amount of
			 * kmem left.
			 */
			kmem_reap();
			if (kmem_avail() < balloon_minkmem ||
			    (pp = page_get_high_mfn(0)) == NULL) {
				debit = i;
				break;
			}
		}
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!hat_page_is_mapped(pp));

		balloon_page_add(pp);
		pfn_frames[i] = pp->p_pagenum;
		mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
	}
	if (debit == 0) {
		xen_allow_migrate();
		page_unresv(request);
		return (0);
	}

	/*
	 * We zero all the pages before we start reassigning them in order to
	 * minimize the time spent holding the lock on the contig pfn list.
	 */
	if (balloon_zero_memory) {
		for (i = 0; i < debit; i++) {
			pfnzero(pfn_frames[i], 0, PAGESIZE);
		}
	}

	/*
	 * Remove all mappings for the pfns from the system
	 */
	locked = balloon_lock_contig_pfnlist(debit);
	for (i = 0; i < debit; i++) {
		reassign_pfn(pfn_frames[i], MFN_INVALID);
	}
	if (locked)
		unlock_contig_pfnlist();

	rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);

	if (rv < 0) {
		cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
		    "failed - up to %lu pages lost (error = %ld)", debit, rv);
		rv = 0;
	} else if (rv != debit) {
		panic("Unexpected return value (%ld) from decrease reservation "
		    "hypervisor call", rv);
	}

	xen_allow_migrate();
	if (debit != request)
		page_unresv(request - debit);
	return (rv);
}

/*
 * This function is the callback which is called when the memory/target
 * node is changed. When it is fired, we will read a new reservation
 * target for our domain and signal the worker thread to make the change.
 *
 * If the reservation is larger than we can handle, we issue a warning. dom0
 * does this automatically every boot, so we skip the first warning on dom0.
 */
/*ARGSUSED*/
static void
balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	ulong_t new_target_kb;
	pgcnt_t new_target_pages;
	int rv;
	static uchar_t warning_cnt = 0;

	rv = xenbus_scanf(0, "memory", "target", "%lu", &new_target_kb);
	if (rv != 0) {
		return;
	}

	/* new_target is in kB - change this to pages */
	new_target_pages = kbtop(new_target_kb);

	DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);

	/*
	 * Unfortunately, dom0 may give us a target that is larger than
	 * our max limit. Re-check the limit, and, if the new target is
	 * too large, adjust it downwards.
	 */
	mutex_enter(&bln_mutex);
	if (new_target_pages > bln_stats.bln_max_pages) {
		DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
		    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
		if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
			cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
			    "larger than original memory size (0x%lx pages). "
			    "Ballooning beyond original memory size is not "
			    "allowed.",
			    new_target_pages, bln_stats.bln_max_pages);
		}
		warning_cnt = 1;
		bln_stats.bln_new_target = bln_stats.bln_max_pages;
	} else {
		bln_stats.bln_new_target = new_target_pages;
	}

	mutex_exit(&bln_mutex);
	cv_signal(&bln_cv);
}

/*
 * bln_wait_sec can be used to throttle the hv calls, but by default it's
 * turned off. If a balloon attempt fails, the wait time is forced on, and
 * then is exponentially increased as further attempts fail.
 */
uint_t bln_wait_sec = 0;
uint_t bln_wait_shift = 1;
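/*
 * For example, with bln_wait_shift set to 1, a string of failed attempts
 * waits 1, 2, 4, 8, ... seconds between retries (see balloon_worker_thread()
 * below); a successful attempt resets the wait back to bln_wait_sec.
 */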

/*
 * This is the main balloon thread. Wait on the cv. When woken, if our
 * reservation has changed, call the appropriate function to adjust the
 * reservation.
 */
static void
balloon_worker_thread(void)
{
	uint_t bln_wait;
	callb_cpr_t cprinfo;
	spgcnt_t rv;

	bln_wait = bln_wait_sec;

	CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
	for (;;) {
		rv = 0;

		mutex_enter(&bln_mutex);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			/*
			 * We weren't able to fully complete the request
			 * last time through, so try again.
			 */
			(void) cv_reltimedwait(&bln_cv, &bln_mutex,
			    (bln_wait * hz), TR_CLOCK_TICK);
		} else {
			cv_wait(&bln_cv, &bln_mutex);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);

		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			if (bln_stats.bln_new_target <
			    bln_stats.bln_current_pages) {
				/* reservation shrunk */
				rv = -balloon_dec_reservation(
				    bln_stats.bln_current_pages -
				    bln_stats.bln_new_target);
			} else if (bln_stats.bln_new_target >
			    bln_stats.bln_current_pages) {
				/* reservation grew */
				rv = balloon_inc_reservation(
				    bln_stats.bln_new_target -
				    bln_stats.bln_current_pages);
			}
		}
		if (rv == 0) {
			if (bln_wait == 0) {
				bln_wait = 1;
			} else {
				bln_wait <<= bln_wait_shift;
			}
		} else {
			bln_stats.bln_current_pages += rv;
			bln_wait = bln_wait_sec;
		}
		if (bln_stats.bln_current_pages < bln_stats.bln_low)
			bln_stats.bln_low = bln_stats.bln_current_pages;
		else if (bln_stats.bln_current_pages > bln_stats.bln_high)
			bln_stats.bln_high = bln_stats.bln_current_pages;
		mutex_exit(&bln_mutex);
	}
}

/*
 * Called after balloon_init(), which is below. The xenbus thread is up
 * and running, so we can register our watch and create the balloon thread.
 */
static void
balloon_config_watch(int state)
{
	if (state != XENSTORE_UP)
		return;

	bln_watch.node = "memory/target";
	bln_watch.callback = balloon_handler;
	if (register_xenbus_watch(&bln_watch)) {
		cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
		    "thread will be disabled");
		return;
	}

	if (bln_thread == NULL)
		bln_thread = thread_create(NULL, 0, balloon_worker_thread,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Basic initialization of the balloon thread. Set all of our variables,
 * and register a callback for later when we can register a xenbus watch.
 */
void
balloon_init(pgcnt_t nr_pages)
{
	domid_t domid = DOMID_SELF;

	bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
	bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
	bln_stats.bln_max_pages = nr_pages;
	cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);

	bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
	    XENMEM_maximum_reservation, &domid);

	(void) xs_register_xenbus_callback(balloon_config_watch);
}

/*
 * These functions are called from the network drivers when they gain a page
 * or give one away. We simply update our count. Note that the counter
 * tracks the number of pages we give away, so we need to subtract any
 * amount passed to balloon_drv_added.
 */
void
balloon_drv_added(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
}

void
balloon_drv_subtracted(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}

/*
 * balloon_alloc_pages()
 * Allocate page_cnt mfns. mfns storage provided by the caller. Returns
 * the number of pages allocated, which could be less than page_cnt, or
 * a negative number if an error occurred.
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long rv;

	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = page_cnt;

	rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	if (rv > 0)
		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
	return (rv);
}

/*
 * balloon_free_pages()
 * free page_cnt pages, using any combination of mfns, pfns, and kva as long
 * as they refer to the same mapping. If an array of mfns is passed in, we
 * assume they were already cleared. Otherwise, we need to zero the pages
 * before giving them back to the hypervisor. kva space is not free'd up in
 * case the caller wants to re-use it.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;


#if DEBUG
	/* make sure kva is page aligned and maps to first pfn */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* if we have a kva, we can clean all pages with just one bzero */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	/* if we were given a kva and/or a pfn */
	if ((kva != NULL) || (pfns != NULL)) {

		/*
		 * All the current callers only pass 1 page when using kva or
		 * pfns, and use mfns when passing multiple pages. If that
		 * assumption is changed, the following code will need some
		 * work. The following ASSERT() guarantees we're respecting
		 * the io locking quota.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);

		/* go through all the pages */
		for (i = 0; i < page_cnt; i++) {

			/* get the next pfn */
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}

			/*
			 * if we didn't already zero this page, do it now. we
			 * need to do this *before* we give back the MFN
			 */
			if ((kva == NULL) && (balloon_zero_memory)) {
				pfnzero(pfn, 0, PAGESIZE);
			}

			/*
			 * unmap the pfn. We don't free up the kva vmem space
			 * so the caller can re-use it. The page must be
			 * unmapped before it is given back to the hypervisor.
			 */
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}

			/* grab the mfn before the pfn is marked as invalid */
			mfn = pfn_to_mfn(pfn);

			/* mark the pfn as invalid */
			reassign_pfn(pfn, MFN_INVALID);

			/*
			 * if we weren't given an array of MFNs, we need to
			 * free them up one at a time. Otherwise, we'll wait
			 * until later and do it in one hypercall
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				/*LINTED: constant in conditional context*/
				set_xen_guest_handle(memdec.extent_start, &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}
	}

	/*
	 * if we were passed in MFNs, we haven't free'd them up yet. We can
	 * do it with one call.
	 */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}


/*
 * balloon_replace_pages()
 * Try to replace nextents blocks of 2^order pages. addr_bits specifies
 * how many bits of address the pages must be within (i.e. 16 would mean
 * that the pages cannot have an address > 64k). The constraints are on
 * what the hypervisor gives us -- we are free to give any pages in
 * exchange. The array pp is the pages we are giving away. The caller
 * provides storage space for mfns, which hold the new physical pages.
 */
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
    uint_t order, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long fallback_cnt;
	long cnt;
	uint_t i, j, page_cnt, extlen;
	long e;
	int locked;


	/*
	 * we shouldn't be allocating constrained pages on a guest. It doesn't
	 * make any sense. They won't be constrained after a migration.
	 */
	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

	extlen = 1 << order;
	page_cnt = nextents * extlen;
	/* Give back the current pages to the hypervisor */
	for (i = 0; i < page_cnt; i++) {
		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
		if (cnt != 1) {
			cmn_err(CE_PANIC, "balloon: unable to give a page back "
			    "to the hypervisor.\n");
		}
	}

	/*
	 * try to allocate the new pages using addr_bits and order. If we can't
	 * get all of the pages, try to get the remaining pages with no
	 * constraints and, if that was successful, return the number of
	 * constrained pages we did allocate.
	 */
	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = nextents;
	memres.mem_flags = XENMEMF_address_bits(addr_bits);
	memres.extent_order = order;
	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	/* assign the new MFNs to the current PFNs */
	locked = balloon_lock_contig_pfnlist(cnt * extlen);
	for (i = 0; i < cnt; i++) {
		for (j = 0; j < extlen; j++) {
			reassign_pfn(pp[i * extlen + j]->p_pagenum,
			    mfns[i] + j);
		}
	}
	if (locked)
		unlock_contig_pfnlist();
	if (cnt != nextents) {
		if (cnt < 0) {
			cnt = 0;
		}

		/*
		 * We couldn't get enough memory to satisfy our requirements.
		 * The above loop will assign the parts of the request that
		 * were successful (this part may be 0). We need to fill
		 * in the rest. The bzero below clears out extent_order and
		 * address_bits, so we'll take anything from the hypervisor
		 * to replace the pages we gave away.
		 */
		fallback_cnt = page_cnt - cnt * extlen;
		bzero(&memres, sizeof (memres));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memres.extent_start, mfns);
		memres.domid = DOMID_SELF;
		memres.nr_extents = fallback_cnt;
		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
		if (e != fallback_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to recover from "
			    "failed increase_reservation.\n");
		}
		locked = balloon_lock_contig_pfnlist(fallback_cnt);
		for (i = 0; i < fallback_cnt; i++) {
			uint_t offset = page_cnt - fallback_cnt;

			/*
			 * We already used pp[0...(cnt * extlen)] before,
			 * so start at the next entry in the pp array.
			 */
			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
		}
		if (locked)
			unlock_contig_pfnlist();
	}

	/*
	 * balloon_free_pages increments our counter. Decrement it here.
	 */
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);

	/*
	 * return the number of extents we were able to replace. If we got
	 * this far, we know all the pp's are valid.
	 */
	return (cnt);
}


/*
 * Called from the driver - return the requested stat.
 */
size_t
balloon_values(int cmd)
{
	switch (cmd) {
	case BLN_IOCTL_CURRENT:
		return (ptokb(bln_stats.bln_current_pages));
	case BLN_IOCTL_TARGET:
		return (ptokb(bln_stats.bln_new_target));
	case BLN_IOCTL_LOW:
		return (ptokb(bln_stats.bln_low));
	case BLN_IOCTL_HIGH:
		return (ptokb(bln_stats.bln_high));
	case BLN_IOCTL_LIMIT:
		return (ptokb(bln_stats.bln_hard_limit));
	default:
		panic("Unexpected cmd %d in balloon_values()\n", cmd);
	}
	/*NOTREACHED*/
}