1 /*- 2 * Copyright (c) 2004-2009 University of Zagreb 3 * Copyright (c) 2006-2009 FreeBSD Foundation 4 * All rights reserved. 5 * 6 * This software was developed by the University of Zagreb and the 7 * FreeBSD Foundation under sponsorship by the Stichting NLnet and the 8 * FreeBSD Foundation. 9 * 10 * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org> 11 * Copyright (c) 2009 Robert N. M. Watson 12 * All rights reserved. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/linker_set.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>

/*-
 * This file implements core functions for virtual network stacks:
 *
 * - Virtual network stack management functions.
 *
 * - Virtual network stack memory allocator, which virtualizes global
 *   variables in the network stack
 *
 * - Virtualized SYSINIT's/SYSUNINIT's, which allow network stack subsystems
 *   to register startup/shutdown events to be run for each virtual network
 *   stack instance.
 */

MALLOC_DEFINE(M_VNET, "vnet", "network stack control block");

/*
 * The virtual network stack list has two read-write locks, one sleepable and
 * the other not, so that the list can be stabilized and walked in a variety
 * of network stack contexts.  Both must be acquired exclusively to modify
 * the list, but a read lock of either lock is sufficient to walk the list.
 */
struct rwlock		vnet_rwlock;
struct sx		vnet_sxlock;

/*
 * Write-lock the vnet list: vnet_sxlock is taken exclusively first, then
 * vnet_rwlock.
 */
#define	VNET_LIST_WLOCK() do {						\
	sx_xlock(&vnet_sxlock);						\
	rw_wlock(&vnet_rwlock);						\
} while (0)

/*
 * Drop the vnet list write locks in the reverse order of VNET_LIST_WLOCK().
 */
#define	VNET_LIST_WUNLOCK() do {					\
	rw_wunlock(&vnet_rwlock);					\
	sx_xunlock(&vnet_sxlock);					\
} while (0)

/*
 * The list of allocated vnets (vnet_alloc() inserts, vnet_destroy()
 * removes), and the vnet created at boot by vnet0_init().
 */
struct vnet_list_head vnet_head;
struct vnet *vnet0;

/*
 * The virtual network stack allocator provides storage for virtualized
 * global variables.  These variables are defined/declared using the
 * VNET_DEFINE()/VNET_DECLARE() macros, which place them in the 'set_vnet'
 * linker set.
 * The details of the implementation are somewhat subtle, but
 * allow the majority of most network subsystems to remain
 * virtualization-agnostic.
 *
 * The virtual network stack allocator handles variables in the base kernel
 * vs. modules in similar but different ways.  In both cases, virtualized
 * global variables are marked as such by being declared to be part of the
 * vnet linker set.  These "master" copies of global variables serve two
 * functions:
 *
 * (1) They contain static initialization or "default" values for global
 *     variables which will be propagated to each virtual network stack
 *     instance when created.  As with normal global variables, they default
 *     to zero-filled.
 *
 * (2) They act as unique global names by which the variable can be referred
 *     to, regardless of network stack instance.  The single global symbol
 *     will be used to calculate the location of a per-virtual instance
 *     variable at run-time.
 *
 * Each virtual network stack instance has a complete copy of each
 * virtualized global variable, stored in a malloc'd block of memory
 * referred to by vnet->vnet_data_mem.  Critical to the design is that each
 * per-instance memory block is laid out identically to the master block so
 * that the offset of each global variable is the same across all blocks.  To
 * optimize run-time access, a precalculated 'base' address,
 * vnet->vnet_data_base, is stored in each vnet, and is the amount that can
 * be added to the address of a 'master' instance of a variable to get to the
 * per-vnet instance.
 *
 * Virtualized global variables in modules are handled in a similar manner,
 * but as each module has its own 'set_vnet' linker set, and we want to keep
 * all virtualized globals together, we reserve space in the kernel's linker
 * set for potential module variables using a per-vnet character array,
 * 'modspace'.
The virtual network stack allocator maintains a free list to 138 * track what space in the array is free (all, initially) and as modules are 139 * linked, allocates portions of the space to specific globals. The kernel 140 * module linker queries the virtual network stack allocator and will 141 * bind references of the global to the location during linking. It also 142 * calls into the virtual network stack allocator, once the memory is 143 * initialized, in order to propagate the new static initializations to all 144 * existing virtual network stack instances so that the soon-to-be executing 145 * module will find every network stack instance with proper default values. 146 */ 147 148 /* 149 * Location of the kernel's 'set_vnet' linker set. 150 */ 151 extern uintptr_t *__start_set_vnet; 152 extern uintptr_t *__stop_set_vnet; 153 154 #define VNET_START (uintptr_t)&__start_set_vnet 155 #define VNET_STOP (uintptr_t)&__stop_set_vnet 156 157 /* 158 * Number of bytes of data in the 'set_vnet' linker set, and hence the total 159 * size of all kernel virtualized global variables, and the malloc(9) type 160 * that will be used to allocate it. 161 */ 162 #define VNET_BYTES (VNET_STOP - VNET_START) 163 164 MALLOC_DEFINE(M_VNET_DATA, "vnet_data", "VNET data"); 165 166 /* 167 * VNET_MODMIN is the minimum number of bytes we will reserve for the sum of 168 * global variables across all loaded modules. As this actually sizes an 169 * array declared as a virtualized global variable in the kernel itself, and 170 * we want the virtualized global variable space to be page-sized, we may 171 * have more space than that in practice. 172 */ 173 #define VNET_MODMIN 8192 174 #define VNET_SIZE roundup2(VNET_BYTES, PAGE_SIZE) 175 #define VNET_MODSIZE (VNET_SIZE - (VNET_BYTES - VNET_MODMIN)) 176 177 /* 178 * Space to store virtualized global variables from loadable kernel modules, 179 * and the free list to manage it. 
 */
static VNET_DEFINE(char, modspace[VNET_MODMIN]);

/*
 * Global lists of subsystem constructors and destructors for vnets.  They
 * are registered via VNET_SYSINIT() and VNET_SYSUNINIT().  The lists are
 * protected by the vnet_sxlock global lock.
 */
static TAILQ_HEAD(vnet_sysinit_head, vnet_sysinit) vnet_constructors =
	TAILQ_HEAD_INITIALIZER(vnet_constructors);
static TAILQ_HEAD(vnet_sysuninit_head, vnet_sysinit) vnet_destructors =
	TAILQ_HEAD_INITIALIZER(vnet_destructors);

/*
 * One free region of module space, covering the addresses
 * [vnd_start, vnd_start + vnd_len).  Regions are linked on
 * vnet_data_free_head.
 */
struct vnet_data_free {
	uintptr_t	vnd_start;	/* Address of the first free byte. */
	int		vnd_len;	/* Length of the region in bytes. */
	TAILQ_ENTRY(vnet_data_free) vnd_link;
};

MALLOC_DEFINE(M_VNET_DATA_FREE, "vnet_data_free", "VNET resource accounting");
/* Free list for module space and the sx lock that protects it. */
static TAILQ_HEAD(, vnet_data_free) vnet_data_free_head =
	TAILQ_HEAD_INITIALIZER(vnet_data_free_head);
static struct sx vnet_data_free_lock;

/*
 * Allocate a virtual network stack.
 *
 * Creates the vnet control block and its private copy of the virtualized
 * globals, runs every registered vnet constructor with the new vnet as the
 * current vnet, and finally links the vnet onto the global list.  Both
 * allocations use M_WAITOK.  Returns the new vnet.
 */
struct vnet *
vnet_alloc(void)
{
	struct vnet *vnet;

	vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO);
	vnet->vnet_magic_n = VNET_MAGIC_N;

	/*
	 * Allocate storage for virtualized global variables and copy in
	 * initial values from our 'master' copy.  Only VNET_BYTES of the
	 * VNET_SIZE allocation are initialized here.
	 */
	vnet->vnet_data_mem = malloc(VNET_SIZE, M_VNET_DATA, M_WAITOK);
	memcpy(vnet->vnet_data_mem, (void *)VNET_START, VNET_BYTES);

	/*
	 * All use of vnet-specific data will immediately subtract VNET_START
	 * from the base memory pointer, so pre-calculate that now to avoid
	 * it on each use.
	 */
	vnet->vnet_data_base = (uintptr_t)vnet->vnet_data_mem - VNET_START;

	/* Initialize / attach vnet module instances. */
	CURVNET_SET_QUIET(vnet);

	/* vnet_sysinit() asserts that vnet_sxlock is held. */
	sx_xlock(&vnet_sxlock);
	vnet_sysinit();
	CURVNET_RESTORE();

	/*
	 * vnet_sxlock is still held from above; taking vnet_rwlock as well
	 * completes the pair of write locks that VNET_LIST_WUNLOCK() drops.
	 */
	rw_wlock(&vnet_rwlock);
	LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le);
	VNET_LIST_WUNLOCK();

	return (vnet);
}

/*
 * Destroy a virtual network stack.
245 */ 246 void 247 vnet_destroy(struct vnet *vnet) 248 { 249 struct ifnet *ifp, *nifp; 250 251 KASSERT(vnet->vnet_sockcnt == 0, 252 ("%s: vnet still has sockets", __func__)); 253 254 VNET_LIST_WLOCK(); 255 LIST_REMOVE(vnet, vnet_le); 256 rw_wunlock(&vnet_rwlock); 257 258 CURVNET_SET_QUIET(vnet); 259 260 /* Return all inherited interfaces to their parent vnets. */ 261 TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { 262 if (ifp->if_home_vnet != ifp->if_vnet) 263 if_vmove(ifp, ifp->if_home_vnet); 264 } 265 266 vnet_sysuninit(); 267 sx_xunlock(&vnet_sxlock); 268 269 CURVNET_RESTORE(); 270 271 /* 272 * Release storage for the virtual network stack instance. 273 */ 274 free(vnet->vnet_data_mem, M_VNET_DATA); 275 vnet->vnet_data_mem = NULL; 276 vnet->vnet_data_base = 0; 277 vnet->vnet_magic_n = 0xdeadbeef; 278 free(vnet, M_VNET); 279 } 280 281 /* 282 * Boot time initialization and allocation of virtual network stacks. 283 */ 284 static void 285 vnet_init_prelink(void *arg) 286 { 287 288 rw_init(&vnet_rwlock, "vnet_rwlock"); 289 sx_init(&vnet_sxlock, "vnet_sxlock"); 290 LIST_INIT(&vnet_head); 291 } 292 SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, 293 vnet_init_prelink, NULL); 294 295 static void 296 vnet0_init(void *arg) 297 { 298 299 /* Warn people before take off - in case we crash early. */ 300 printf("WARNING: VIMAGE (virtualized network stack) is a highly " 301 "experimental feature.\n"); 302 303 /* 304 * We MUST clear curvnet in vi_init_done() before going SMP, 305 * otherwise CURVNET_SET() macros would scream about unnecessary 306 * curvnet recursions. 
307 */ 308 curvnet = prison0.pr_vnet = vnet0 = vnet_alloc(); 309 } 310 SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL); 311 312 static void 313 vnet_init_done(void *unused) 314 { 315 316 curvnet = NULL; 317 } 318 319 SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_FIRST, vnet_init_done, 320 NULL); 321 322 /* 323 * Once on boot, initialize the modspace freelist to entirely cover modspace. 324 */ 325 static void 326 vnet_data_startup(void *dummy __unused) 327 { 328 struct vnet_data_free *df; 329 330 df = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); 331 df->vnd_start = (uintptr_t)&VNET_NAME(modspace); 332 df->vnd_len = VNET_MODSIZE; 333 TAILQ_INSERT_HEAD(&vnet_data_free_head, df, vnd_link); 334 sx_init(&vnet_data_free_lock, "vnet_data alloc lock"); 335 } 336 SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, 0); 337 338 /* 339 * When a module is loaded and requires storage for a virtualized global 340 * variable, allocate space from the modspace free list. This interface 341 * should be used only by the kernel linker. 342 */ 343 void * 344 vnet_data_alloc(int size) 345 { 346 struct vnet_data_free *df; 347 void *s; 348 349 s = NULL; 350 size = roundup2(size, sizeof(void *)); 351 sx_xlock(&vnet_data_free_lock); 352 TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { 353 if (df->vnd_len < size) 354 continue; 355 if (df->vnd_len == size) { 356 s = (void *)df->vnd_start; 357 TAILQ_REMOVE(&vnet_data_free_head, df, vnd_link); 358 free(df, M_VNET_DATA_FREE); 359 break; 360 } 361 s = (void *)df->vnd_start; 362 df->vnd_len -= size; 363 df->vnd_start = df->vnd_start + size; 364 break; 365 } 366 sx_xunlock(&vnet_data_free_lock); 367 368 return (s); 369 } 370 371 /* 372 * Free space for a virtualized global variable on module unload. 
373 */ 374 void 375 vnet_data_free(void *start_arg, int size) 376 { 377 struct vnet_data_free *df; 378 struct vnet_data_free *dn; 379 uintptr_t start; 380 uintptr_t end; 381 382 size = roundup2(size, sizeof(void *)); 383 start = (uintptr_t)start_arg; 384 end = start + size; 385 /* 386 * Free a region of space and merge it with as many neighbors as 387 * possible. Keeping the list sorted simplifies this operation. 388 */ 389 sx_xlock(&vnet_data_free_lock); 390 TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { 391 if (df->vnd_start > end) 392 break; 393 /* 394 * If we expand at the end of an entry we may have to merge 395 * it with the one following it as well. 396 */ 397 if (df->vnd_start + df->vnd_len == start) { 398 df->vnd_len += size; 399 dn = TAILQ_NEXT(df, vnd_link); 400 if (df->vnd_start + df->vnd_len == dn->vnd_start) { 401 df->vnd_len += dn->vnd_len; 402 TAILQ_REMOVE(&vnet_data_free_head, dn, 403 vnd_link); 404 free(dn, M_VNET_DATA_FREE); 405 } 406 sx_xunlock(&vnet_data_free_lock); 407 return; 408 } 409 if (df->vnd_start == end) { 410 df->vnd_start = start; 411 df->vnd_len += size; 412 sx_xunlock(&vnet_data_free_lock); 413 return; 414 } 415 } 416 dn = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); 417 dn->vnd_start = start; 418 dn->vnd_len = size; 419 if (df) 420 TAILQ_INSERT_BEFORE(df, dn, vnd_link); 421 else 422 TAILQ_INSERT_TAIL(&vnet_data_free_head, dn, vnd_link); 423 sx_xunlock(&vnet_data_free_lock); 424 } 425 426 /* 427 * When a new virtualized global variable has been allocated, propagate its 428 * initial value to each already-allocated virtual network stack instance. 
429 */ 430 void 431 vnet_data_copy(void *start, int size) 432 { 433 struct vnet *vnet; 434 435 VNET_LIST_RLOCK(); 436 LIST_FOREACH(vnet, &vnet_head, vnet_le) 437 memcpy((void *)((uintptr_t)vnet->vnet_data_base + 438 (uintptr_t)start), start, size); 439 VNET_LIST_RUNLOCK(); 440 } 441 442 /* 443 * Variants on sysctl_handle_foo that know how to handle virtualized global 444 * variables: if 'arg1' is a pointer, then we transform it to the local vnet 445 * offset. 446 */ 447 int 448 vnet_sysctl_handle_int(SYSCTL_HANDLER_ARGS) 449 { 450 451 if (arg1 != NULL) 452 arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); 453 return (sysctl_handle_int(oidp, arg1, arg2, req)); 454 } 455 456 int 457 vnet_sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) 458 { 459 460 if (arg1 != NULL) 461 arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); 462 return (sysctl_handle_opaque(oidp, arg1, arg2, req)); 463 } 464 465 int 466 vnet_sysctl_handle_string(SYSCTL_HANDLER_ARGS) 467 { 468 469 if (arg1 != NULL) 470 arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); 471 return (sysctl_handle_string(oidp, arg1, arg2, req)); 472 } 473 474 int 475 vnet_sysctl_handle_uint(SYSCTL_HANDLER_ARGS) 476 { 477 478 if (arg1 != NULL) 479 arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); 480 return (sysctl_handle_int(oidp, arg1, arg2, req)); 481 } 482 483 /* 484 * Support for special SYSINIT handlers registered via VNET_SYSINIT() 485 * and VNET_SYSUNINIT(). 486 */ 487 void 488 vnet_register_sysinit(void *arg) 489 { 490 struct vnet_sysinit *vs, *vs2; 491 struct vnet *vnet; 492 493 vs = arg; 494 KASSERT(vs->subsystem > SI_SUB_VNET, ("vnet sysinit too early")); 495 496 /* Add the constructor to the global list of vnet constructors. 
*/ 497 sx_xlock(&vnet_sxlock); 498 TAILQ_FOREACH(vs2, &vnet_constructors, link) { 499 if (vs2->subsystem > vs->subsystem) 500 break; 501 if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) 502 break; 503 } 504 if (vs2 != NULL) 505 TAILQ_INSERT_BEFORE(vs2, vs, link); 506 else 507 TAILQ_INSERT_TAIL(&vnet_constructors, vs, link); 508 509 /* 510 * Invoke the constructor on all the existing vnets when it is 511 * registered. 512 */ 513 VNET_FOREACH(vnet) { 514 CURVNET_SET_QUIET(vnet); 515 vs->func(vs->arg); 516 CURVNET_RESTORE(); 517 } 518 sx_xunlock(&vnet_sxlock); 519 } 520 521 void 522 vnet_deregister_sysinit(void *arg) 523 { 524 struct vnet_sysinit *vs; 525 526 vs = arg; 527 528 /* Remove the constructor from the global list of vnet constructors. */ 529 sx_xlock(&vnet_sxlock); 530 TAILQ_REMOVE(&vnet_constructors, vs, link); 531 sx_xunlock(&vnet_sxlock); 532 } 533 534 void 535 vnet_register_sysuninit(void *arg) 536 { 537 struct vnet_sysinit *vs, *vs2; 538 539 vs = arg; 540 541 /* Add the destructor to the global list of vnet destructors. */ 542 sx_xlock(&vnet_sxlock); 543 TAILQ_FOREACH(vs2, &vnet_destructors, link) { 544 if (vs2->subsystem > vs->subsystem) 545 break; 546 if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) 547 break; 548 } 549 if (vs2 != NULL) 550 TAILQ_INSERT_BEFORE(vs2, vs, link); 551 else 552 TAILQ_INSERT_TAIL(&vnet_destructors, vs, link); 553 sx_xunlock(&vnet_sxlock); 554 } 555 556 void 557 vnet_deregister_sysuninit(void *arg) 558 { 559 struct vnet_sysinit *vs; 560 struct vnet *vnet; 561 562 vs = arg; 563 564 /* 565 * Invoke the destructor on all the existing vnets when it is 566 * deregistered. 567 */ 568 sx_xlock(&vnet_sxlock); 569 VNET_FOREACH(vnet) { 570 CURVNET_SET_QUIET(vnet); 571 vs->func(vs->arg); 572 CURVNET_RESTORE(); 573 } 574 575 /* Remove the destructor from the global list of vnet destructors. 
*/ 576 TAILQ_REMOVE(&vnet_destructors, vs, link); 577 sx_xunlock(&vnet_sxlock); 578 } 579 580 /* 581 * Invoke all registered vnet constructors on the current vnet. Used during 582 * vnet construction. The caller is responsible for ensuring the new vnet is 583 * the current vnet and that the vnet_sxlock lock is locked. 584 */ 585 void 586 vnet_sysinit(void) 587 { 588 struct vnet_sysinit *vs; 589 590 sx_assert(&vnet_sxlock, SA_LOCKED); 591 TAILQ_FOREACH(vs, &vnet_constructors, link) { 592 vs->func(vs->arg); 593 } 594 } 595 596 /* 597 * Invoke all registered vnet destructors on the current vnet. Used during 598 * vnet destruction. The caller is responsible for ensuring the dying vnet 599 * is the current vnet and that the vnet_sxlock lock is locked. 600 */ 601 void 602 vnet_sysuninit(void) 603 { 604 struct vnet_sysinit *vs; 605 606 sx_assert(&vnet_sxlock, SA_LOCKED); 607 TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, 608 link) { 609 vs->func(vs->arg); 610 } 611 } 612 613 #ifdef DDB 614 DB_SHOW_COMMAND(vnets, db_show_vnets) 615 { 616 VNET_ITERATOR_DECL(vnet_iter); 617 618 VNET_FOREACH(vnet_iter) { 619 db_printf("vnet = %p\n", vnet_iter); 620 db_printf(" vnet_magic_n = 0x%x (%s, orig 0x%x)\n", 621 vnet_iter->vnet_magic_n, 622 (vnet_iter->vnet_magic_n == VNET_MAGIC_N) ? 623 "ok" : "mismatch", VNET_MAGIC_N); 624 db_printf(" vnet_ifcnt = %u\n", vnet_iter->vnet_ifcnt); 625 db_printf(" vnet_sockcnt = %u\n", vnet_iter->vnet_sockcnt); 626 db_printf(" vnet_data_mem = %p\n", vnet_iter->vnet_data_mem); 627 db_printf(" vnet_data_base = 0x%jx\n", 628 (uintmax_t)vnet_iter->vnet_data_base); 629 db_printf("\n"); 630 if (db_pager_quit) 631 break; 632 } 633 } 634 #endif 635