1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * VMware vSockets Driver 4 * 5 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. 6 */ 7 8 /* Implementation notes: 9 * 10 * - There are two kinds of sockets: those created by user action (such as 11 * calling socket(2)) and those created by incoming connection request packets. 12 * 13 * - There are two "global" tables, one for bound sockets (sockets that have 14 * specified an address that they are responsible for) and one for connected 15 * sockets (sockets that have established a connection with another socket). 16 * These tables are "global" in that all sockets on the system are placed 17 * within them. - Note, though, that the bound table contains an extra entry 18 * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in 19 * that list. The bound table is used solely for lookup of sockets when packets 20 * are received and that's not necessary for SOCK_DGRAM sockets since we create 21 * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM 22 * sockets out of the bound hash buckets will reduce the chance of collisions 23 * when looking for SOCK_STREAM sockets and prevents us from having to check the 24 * socket type in the hash table lookups. 25 * 26 * - Sockets created by user action will either be "client" sockets that 27 * initiate a connection or "server" sockets that listen for connections; we do 28 * not support simultaneous connects (two "client" sockets connecting). 29 * 30 * - "Server" sockets are referred to as listener sockets throughout this 31 * implementation because they are in the TCP_LISTEN state. When a 32 * connection request is received (the second kind of socket mentioned above), 33 * we create a new socket and refer to it as a pending socket. These pending 34 * sockets are placed on the pending connection list of the listener socket. 
35 * When future packets are received for the address the listener socket is 36 * bound to, we check if the source of the packet is from one that has an 37 * existing pending connection. If it does, we process the packet for the 38 * pending socket. When that socket reaches the connected state, it is removed 39 * from the listener socket's pending list and enqueued in the listener 40 * socket's accept queue. Callers of accept(2) will accept connected sockets 41 * from the listener socket's accept queue. If the socket cannot be accepted 42 * for some reason then it is marked rejected. Once the connection is 43 * accepted, it is owned by the user process and the responsibility for cleanup 44 * falls with that user process. 45 * 46 * - It is possible that these pending sockets will never reach the connected 47 * state; in fact, we may never receive another packet after the connection 48 * request. Because of this, we must schedule a cleanup function to run in the 49 * future, after some amount of time passes where a connection should have been 50 * established. This function ensures that the socket is off all lists so it 51 * cannot be retrieved, then drops all references to the socket so it is cleaned 52 * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this 53 * function will also cleanup rejected sockets, those that reach the connected 54 * state but leave it before they have been accepted. 55 * 56 * - Lock ordering for pending or accept queue sockets is: 57 * 58 * lock_sock(listener); 59 * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); 60 * 61 * Using explicit nested locking keeps lockdep happy since normally only one 62 * lock of a given class may be taken at a time. 63 * 64 * - Sockets created by user action will be cleaned up when the user process 65 * calls close(2), causing our release implementation to be called. 
Our release 66 * implementation will perform some cleanup then drop the last reference so our 67 * sk_destruct implementation is invoked. Our sk_destruct implementation will 68 * perform additional cleanup that's common for both types of sockets. 69 * 70 * - A socket's reference count is what ensures that the structure won't be 71 * freed. Each entry in a list (such as the "global" bound and connected tables 72 * and the listener socket's pending list and connected queue) ensures a 73 * reference. When we defer work until process context and pass a socket as our 74 * argument, we must ensure the reference count is increased to ensure the 75 * socket isn't freed before the function is run; the deferred function will 76 * then drop the reference. 77 * 78 * - sk->sk_state uses the TCP state constants because they are widely used by 79 * other address families and exposed to userspace tools like ss(8): 80 * 81 * TCP_CLOSE - unconnected 82 * TCP_SYN_SENT - connecting 83 * TCP_ESTABLISHED - connected 84 * TCP_CLOSING - disconnecting 85 * TCP_LISTEN - listening 86 * 87 * - Namespaces in vsock support two different modes: "local" and "global". 88 * Each mode defines how the namespace interacts with CIDs. 89 * Each namespace exposes two sysctl files: 90 * 91 * - /proc/sys/net/vsock/ns_mode (read-only) reports the current namespace's 92 * mode, which is set at namespace creation and immutable thereafter. 93 * - /proc/sys/net/vsock/child_ns_mode (writable) controls what mode future 94 * child namespaces will inherit when created. The initial value matches 95 * the namespace's own ns_mode. 96 * 97 * Changing child_ns_mode only affects newly created namespaces, not the 98 * current namespace or existing children. A "local" namespace cannot set 99 * child_ns_mode to "global". At namespace creation, ns_mode is inherited 100 * from the parent's child_ns_mode. 101 * 102 * The init_net mode is "global" and cannot be modified. 
103 * 104 * The modes affect the allocation and accessibility of CIDs as follows: 105 * 106 * - global - access and allocation are all system-wide 107 * - all CID allocation from global namespaces draw from the same 108 * system-wide pool. 109 * - if one global namespace has already allocated some CID, another 110 * global namespace will not be able to allocate the same CID. 111 * - global mode AF_VSOCK sockets can reach any VM or socket in any global 112 * namespace, they are not contained to only their own namespace. 113 * - AF_VSOCK sockets in a global mode namespace cannot reach VMs or 114 * sockets in any local mode namespace. 115 * - local - access and allocation are contained within the namespace 116 * - CID allocation draws only from a private pool local only to the 117 * namespace, and does not affect the CIDs available for allocation in any 118 * other namespace (global or local). 119 * - VMs in a local namespace do not collide with CIDs in any other local 120 * namespace or any global namespace. For example, if a VM in a local mode 121 * namespace is given CID 10, then CID 10 is still available for 122 * allocation in any other namespace, but not in the same namespace. 123 * - AF_VSOCK sockets in a local mode namespace can connect only to VMs or 124 * other sockets within their own namespace. 125 * - sockets bound to VMADDR_CID_ANY in local namespaces will never resolve 126 * to any transport that is not compatible with local mode. There is no 127 * error that propagates to the user (as there is for connection attempts) 128 * because it is possible for some packet to reach this socket from 129 * a different transport that *does* support local mode. For 130 * example, virtio-vsock may not support local mode, but the socket 131 * may still accept a connection from vhost-vsock which does. 
*/

#include <linux/compat.h>
#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/cred.h>
#include <linux/errqueue.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/net.h>
#include <linux/proc_fs.h>
#include <linux/poll.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/smp.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/unistd.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/af_vsock.h>
#include <net/netns/vsock.h>
#include <uapi/linux/vm_sockets.h>
#include <uapi/asm-generic/ioctls.h>

/* Textual names of the two namespace modes described in the notes above. */
#define VSOCK_NET_MODE_STR_GLOBAL "global"
#define VSOCK_NET_MODE_STR_LOCAL "local"

/* 6 chars for "global", 1 for null-terminator, and 1 more for '\n'.
 * The newline is added by proc_dostring() for read operations.
 */
#define VSOCK_NET_MODE_STR_MAX 8

/* Forward declarations of static helpers that are referenced before their
 * definitions in this file.
 */
static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
static void vsock_close(struct sock *sk, long timeout);

/* Protocol family.
 *
 * Passed to sk_alloc() by __vsock_create(); .obj_size makes every socket
 * allocation large enough to hold a struct vsock_sock.
 */
struct proto vsock_proto = {
	.name = "AF_VSOCK",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct vsock_sock),
	.close = vsock_close,
#ifdef CONFIG_BPF_SYSCALL
	/* Only wired up when BPF syscall support is built in. */
	.psock_update_sk_prot = vsock_bpf_update_proto,
#endif
};

/* The default peer timeout indicates how long we will wait for a peer response
 * to a control message.
*/
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)

/* Default, maximum and minimum buffer sizes given to sockets created without
 * a parent (see __vsock_create()); sizes are in bytes, 1024 * 256 == 256 KiB.
 */
#define VSOCK_DEFAULT_BUFFER_SIZE     (1024 * 256)
#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256)
#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128

/* Transport used for host->guest communication */
static const struct vsock_transport *transport_h2g;
/* Transport used for guest->host communication */
static const struct vsock_transport *transport_g2h;
/* Transport used for DGRAM communication */
static const struct vsock_transport *transport_dgram;
/* Transport used for local communication */
static const struct vsock_transport *transport_local;
/* Held by vsock_assign_transport() and vsock_registered_transport_cid() while
 * they dereference the transport_* pointers above.
 */
static DEFINE_MUTEX(vsock_register_mutex);

/**** UTILS ****/

/* Each bound VSocket is stored in the bind hash table and each connected
 * VSocket is stored in the connected hash table.
 *
 * Unbound sockets are all put on the same list attached to the end of the hash
 * table (vsock_unbound_sockets).  Bound sockets are added to the hash table in
 * the bucket that their local address hashes to (vsock_bound_sockets(addr)
 * represents the list that addr hashes to).
 *
 * Specifically, we initialize the vsock_bind_table array to a size of
 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets.  The hash function
 * mods with VSOCK_HASH_SIZE to ensure this.
 */
/* Number of candidate ports __vsock_bind_connectible() tries when the caller
 * asked for VMADDR_PORT_ANY.
 */
#define MAX_PORT_RETRIES        24

/* Bucket index of a bound address: only the port takes part in the hash. */
#define VSOCK_HASH(addr)        ((addr)->svm_port % VSOCK_HASH_SIZE)
#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
#define vsock_unbound_sockets     (&vsock_bind_table[VSOCK_HASH_SIZE])

/* XXX This can probably be implemented in a better way.
*/ 232 #define VSOCK_CONN_HASH(src, dst) \ 233 (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE) 234 #define vsock_connected_sockets(src, dst) \ 235 (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) 236 #define vsock_connected_sockets_vsk(vsk) \ 237 vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) 238 239 struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; 240 EXPORT_SYMBOL_GPL(vsock_bind_table); 241 struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; 242 EXPORT_SYMBOL_GPL(vsock_connected_table); 243 DEFINE_SPINLOCK(vsock_table_lock); 244 EXPORT_SYMBOL_GPL(vsock_table_lock); 245 246 /* Autobind this socket to the local address if necessary. */ 247 static int vsock_auto_bind(struct vsock_sock *vsk) 248 { 249 struct sock *sk = sk_vsock(vsk); 250 struct sockaddr_vm local_addr; 251 252 if (vsock_addr_bound(&vsk->local_addr)) 253 return 0; 254 vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 255 return __vsock_bind(sk, &local_addr); 256 } 257 258 static void vsock_init_tables(void) 259 { 260 int i; 261 262 for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) 263 INIT_LIST_HEAD(&vsock_bind_table[i]); 264 265 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) 266 INIT_LIST_HEAD(&vsock_connected_table[i]); 267 } 268 269 static void __vsock_insert_bound(struct list_head *list, 270 struct vsock_sock *vsk) 271 { 272 sock_hold(&vsk->sk); 273 list_add(&vsk->bound_table, list); 274 } 275 276 static void __vsock_insert_connected(struct list_head *list, 277 struct vsock_sock *vsk) 278 { 279 sock_hold(&vsk->sk); 280 list_add(&vsk->connected_table, list); 281 } 282 283 static void __vsock_remove_bound(struct vsock_sock *vsk) 284 { 285 list_del_init(&vsk->bound_table); 286 sock_put(&vsk->sk); 287 } 288 289 static void __vsock_remove_connected(struct vsock_sock *vsk) 290 { 291 list_del_init(&vsk->connected_table); 292 sock_put(&vsk->sk); 293 } 294 295 static struct sock *__vsock_find_bound_socket_net(struct sockaddr_vm *addr, 296 struct 
net *net) 297 { 298 struct vsock_sock *vsk; 299 300 list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { 301 struct sock *sk = sk_vsock(vsk); 302 303 if (vsock_addr_equals_addr(addr, &vsk->local_addr) && 304 vsock_net_check_mode(sock_net(sk), net)) 305 return sk; 306 307 if (addr->svm_port == vsk->local_addr.svm_port && 308 (vsk->local_addr.svm_cid == VMADDR_CID_ANY || 309 addr->svm_cid == VMADDR_CID_ANY) && 310 vsock_net_check_mode(sock_net(sk), net)) 311 return sk; 312 } 313 314 return NULL; 315 } 316 317 static struct sock * 318 __vsock_find_connected_socket_net(struct sockaddr_vm *src, 319 struct sockaddr_vm *dst, struct net *net) 320 { 321 struct vsock_sock *vsk; 322 323 list_for_each_entry(vsk, vsock_connected_sockets(src, dst), 324 connected_table) { 325 struct sock *sk = sk_vsock(vsk); 326 327 if (vsock_addr_equals_addr(src, &vsk->remote_addr) && 328 dst->svm_port == vsk->local_addr.svm_port && 329 vsock_net_check_mode(sock_net(sk), net)) { 330 return sk; 331 } 332 } 333 334 return NULL; 335 } 336 337 static void vsock_insert_unbound(struct vsock_sock *vsk) 338 { 339 spin_lock_bh(&vsock_table_lock); 340 __vsock_insert_bound(vsock_unbound_sockets, vsk); 341 spin_unlock_bh(&vsock_table_lock); 342 } 343 344 void vsock_insert_connected(struct vsock_sock *vsk) 345 { 346 struct list_head *list = vsock_connected_sockets( 347 &vsk->remote_addr, &vsk->local_addr); 348 349 spin_lock_bh(&vsock_table_lock); 350 __vsock_insert_connected(list, vsk); 351 spin_unlock_bh(&vsock_table_lock); 352 } 353 EXPORT_SYMBOL_GPL(vsock_insert_connected); 354 355 void vsock_remove_bound(struct vsock_sock *vsk) 356 { 357 spin_lock_bh(&vsock_table_lock); 358 if (__vsock_in_bound_table(vsk)) 359 __vsock_remove_bound(vsk); 360 spin_unlock_bh(&vsock_table_lock); 361 } 362 EXPORT_SYMBOL_GPL(vsock_remove_bound); 363 364 void vsock_remove_connected(struct vsock_sock *vsk) 365 { 366 spin_lock_bh(&vsock_table_lock); 367 if (__vsock_in_connected_table(vsk)) 368 
__vsock_remove_connected(vsk); 369 spin_unlock_bh(&vsock_table_lock); 370 } 371 EXPORT_SYMBOL_GPL(vsock_remove_connected); 372 373 /* Find a bound socket, filtering by namespace and namespace mode. 374 * 375 * Use this in transports that are namespace-aware and can provide the 376 * network namespace context. 377 */ 378 struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr, 379 struct net *net) 380 { 381 struct sock *sk; 382 383 spin_lock_bh(&vsock_table_lock); 384 sk = __vsock_find_bound_socket_net(addr, net); 385 if (sk) 386 sock_hold(sk); 387 388 spin_unlock_bh(&vsock_table_lock); 389 390 return sk; 391 } 392 EXPORT_SYMBOL_GPL(vsock_find_bound_socket_net); 393 394 /* Find a bound socket without namespace filtering. 395 * 396 * Use this in transports that lack namespace context. All sockets are 397 * treated as if in global mode. 398 */ 399 struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) 400 { 401 return vsock_find_bound_socket_net(addr, NULL); 402 } 403 EXPORT_SYMBOL_GPL(vsock_find_bound_socket); 404 405 /* Find a connected socket, filtering by namespace and namespace mode. 406 * 407 * Use this in transports that are namespace-aware and can provide the 408 * network namespace context. 409 */ 410 struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src, 411 struct sockaddr_vm *dst, 412 struct net *net) 413 { 414 struct sock *sk; 415 416 spin_lock_bh(&vsock_table_lock); 417 sk = __vsock_find_connected_socket_net(src, dst, net); 418 if (sk) 419 sock_hold(sk); 420 421 spin_unlock_bh(&vsock_table_lock); 422 423 return sk; 424 } 425 EXPORT_SYMBOL_GPL(vsock_find_connected_socket_net); 426 427 /* Find a connected socket without namespace filtering. 428 * 429 * Use this in transports that lack namespace context. All sockets are 430 * treated as if in global mode. 
*/
struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
					 struct sockaddr_vm *dst)
{
	return vsock_find_connected_socket_net(src, dst, NULL);
}
EXPORT_SYMBOL_GPL(vsock_find_connected_socket);

/* Remove @vsk from the connected table and, only when the socket is flagged
 * SOCK_DEAD, from the bound table as well.
 */
void vsock_remove_sock(struct vsock_sock *vsk)
{
	/* Transport reassignment must not remove the binding. */
	if (sock_flag(sk_vsock(vsk), SOCK_DEAD))
		vsock_remove_bound(vsk);

	vsock_remove_connected(vsk);
}
EXPORT_SYMBOL_GPL(vsock_remove_sock);

/* Call @fn on every socket in the connected table whose transport is
 * @transport.
 *
 * @fn runs with vsock_table_lock held and bottom halves disabled.
 */
void vsock_for_each_connected_socket(struct vsock_transport *transport,
				     void (*fn)(struct sock *sk))
{
	int i;

	spin_lock_bh(&vsock_table_lock);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
		struct vsock_sock *vsk;
		list_for_each_entry(vsk, &vsock_connected_table[i],
				    connected_table) {
			/* Skip sockets that belong to another transport. */
			if (vsk->transport != transport)
				continue;

			fn(sk_vsock(vsk));
		}
	}

	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);

/* Append @pending to @listener's pending list.
 *
 * Takes one reference on each socket; vsock_remove_pending() drops both.
 * No lock is taken here.
 */
void vsock_add_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vpending;

	vlistener = vsock_sk(listener);
	vpending = vsock_sk(pending);

	sock_hold(pending);
	sock_hold(listener);
	list_add_tail(&vpending->pending_links, &vlistener->pending_links);
}
EXPORT_SYMBOL_GPL(vsock_add_pending);

/* Unlink @pending from the pending list it is on and drop the two references
 * taken by vsock_add_pending().
 */
void vsock_remove_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vpending = vsock_sk(pending);

	list_del_init(&vpending->pending_links);
	sock_put(listener);
	sock_put(pending);
}
EXPORT_SYMBOL_GPL(vsock_remove_pending);

/* Append @connected to @listener's accept queue.
 *
 * Takes one reference on each socket. vsock_dequeue_accept() drops the
 * listener reference and hands the connected-socket reference to its caller.
 */
void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);
	vconnected = vsock_sk(connected);

	sock_hold(connected);
	sock_hold(listener);
	list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
}
EXPORT_SYMBOL_GPL(vsock_enqueue_accept);

/* Decide whether @remote_cid should be served by the local (loopback)
 * transport. The caller must hold vsock_register_mutex.
 *
 * Returns false when no local transport is registered. Otherwise returns true
 * for VMADDR_CID_LOCAL, for the g2h transport's own local CID, or - when no
 * g2h transport is registered - for VMADDR_CID_HOST.
 */
static bool vsock_use_local_transport(unsigned int remote_cid)
{
	lockdep_assert_held(&vsock_register_mutex);

	if (!transport_local)
		return false;

	if (remote_cid == VMADDR_CID_LOCAL)
		return true;

	if (transport_g2h) {
		return remote_cid == transport_g2h->get_local_cid();
	} else {
		return remote_cid == VMADDR_CID_HOST;
	}
}

/* Detach @vsk from its current transport, if any: call the transport's
 * destruct() callback, drop the module reference taken by
 * vsock_assign_transport() and clear vsk->transport.
 */
static void vsock_deassign_transport(struct vsock_sock *vsk)
{
	if (!vsk->transport)
		return;

	vsk->transport->destruct(vsk);
	module_put(vsk->transport->module);
	vsk->transport = NULL;
}

/* Assign a transport to a socket and call the .init transport callback.
 *
 * Note: for connection oriented socket this must be called when vsk->remote_addr
 * is set (e.g. during the connect() or when a connection request on a listener
 * socket is received).
 * The vsk->remote_addr is used to decide which transport to use:
 *  - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if
 *    g2h is not loaded, will use local transport;
 *  - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field
 *    includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport;
 *  - remote CID > VMADDR_CID_HOST will use host->guest transport;
 *
 * Returns 0 on success (including when the socket already uses the selected
 * transport), -ESOCKTNOSUPPORT for an unsupported socket type or a
 * SOCK_SEQPACKET socket the transport does not allow, -ENODEV when no suitable
 * transport is registered or its module reference cannot be taken, or the
 * error returned by the transport's init() callback.
 */
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
{
	const struct vsock_transport *new_transport;
	struct sock *sk = sk_vsock(vsk);
	unsigned int remote_cid = vsk->remote_addr.svm_cid;
	__u8 remote_flags;
	int ret;

	/* If the packet is coming with the source and destination CIDs higher
	 * than VMADDR_CID_HOST, then a vsock channel where all the packets are
	 * forwarded to the host should be established. Then the host will
	 * need to forward the packets to the guest.
	 *
	 * The flag is set on the (listen) receive path (psk is not NULL). On
	 * the connect path the flag can be set by the user space application.
	 */
	if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST &&
	    vsk->remote_addr.svm_cid > VMADDR_CID_HOST)
		vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST;

	remote_flags = vsk->remote_addr.svm_flags;

	/* The transport_* pointers are only read under the register mutex. */
	mutex_lock(&vsock_register_mutex);

	switch (sk->sk_type) {
	case SOCK_DGRAM:
		new_transport = transport_dgram;
		break;
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		if (vsock_use_local_transport(remote_cid))
			new_transport = transport_local;
		else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g ||
			 (remote_flags & VMADDR_FLAG_TO_HOST))
			new_transport = transport_g2h;
		else
			new_transport = transport_h2g;
		break;
	default:
		ret = -ESOCKTNOSUPPORT;
		goto err;
	}

	/* Nothing to do when the socket already uses the selected transport.
	 * Despite its name, the "err" label is also this success exit: it only
	 * drops the mutex and returns ret.
	 */
	if (vsk->transport && vsk->transport == new_transport) {
		ret = 0;
		goto err;
	}

	/* We increase the module refcnt to prevent the transport unloading
	 * while there are open sockets assigned to it.
	 */
	if (!new_transport || !try_module_get(new_transport->module)) {
		ret = -ENODEV;
		goto err;
	}

	/* It's safe to release the mutex after a successful try_module_get().
	 * Whichever transport `new_transport` points at, it won't go away until
	 * the last module_put() below or in vsock_deassign_transport().
	 */
	mutex_unlock(&vsock_register_mutex);

	if (vsk->transport) {
		/* transport->release() must be called with sock lock acquired.
		 * This path can only be taken during vsock_connect(), where we
		 * have already held the sock lock. In the other cases, this
		 * function is called on a new socket which is not assigned to
		 * any transport.
		 */
		vsk->transport->release(vsk);
		vsock_deassign_transport(vsk);

		/* transport's release() and destruct() can touch some socket
		 * state, since we are reassigning the socket to a new transport
		 * during vsock_connect(), let's reset these fields to have a
		 * clean state.
		 */
		sock_reset_flag(sk, SOCK_DONE);
		sk->sk_state = TCP_CLOSE;
		vsk->peer_shutdown = 0;
	}

	/* From here on the mutex is no longer held, so failures undo the
	 * module reference and return directly instead of jumping to "err".
	 */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (!new_transport->seqpacket_allow ||
		    !new_transport->seqpacket_allow(vsk, remote_cid)) {
			module_put(new_transport->module);
			return -ESOCKTNOSUPPORT;
		}
	}

	ret = new_transport->init(vsk, psk);
	if (ret) {
		module_put(new_transport->module);
		return ret;
	}

	vsk->transport = new_transport;

	return 0;
err:
	mutex_unlock(&vsock_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(vsock_assign_transport);

/*
 * Provide safe access to static transport_{h2g,g2h,dgram,local} callbacks.
 * Otherwise we may race with module removal. Do not use on `vsk->transport`.
*/
static u32 vsock_registered_transport_cid(const struct vsock_transport **transport)
{
	/* VMADDR_CID_ANY is returned when no transport is registered. */
	u32 cid = VMADDR_CID_ANY;

	mutex_lock(&vsock_register_mutex);
	if (*transport)
		cid = (*transport)->get_local_cid();
	mutex_unlock(&vsock_register_mutex);

	return cid;
}

/* Return true when @cid is a CID this system answers to: the g2h transport's
 * local CID, VMADDR_CID_HOST when an h2g transport is registered, or
 * VMADDR_CID_LOCAL when a local transport is registered.
 *
 * NOTE(review): transport_h2g and transport_local are only tested for NULL
 * here, without vsock_register_mutex; they are not dereferenced.
 */
bool vsock_find_cid(unsigned int cid)
{
	if (cid == vsock_registered_transport_cid(&transport_g2h))
		return true;

	if (transport_h2g && cid == VMADDR_CID_HOST)
		return true;

	if (transport_local && cid == VMADDR_CID_LOCAL)
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(vsock_find_cid);

/* Pop the first socket from @listener's accept queue, or return NULL when the
 * queue is empty.
 *
 * Drops the listener reference taken by vsock_enqueue_accept(); the reference
 * on the returned socket is handed over to the caller.
 */
static struct sock *vsock_dequeue_accept(struct sock *listener)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);

	if (list_empty(&vlistener->accept_queue))
		return NULL;

	vconnected = list_entry(vlistener->accept_queue.next,
				struct vsock_sock, accept_queue);

	list_del_init(&vconnected->accept_queue);
	sock_put(listener);
	/* The caller will need a reference on the connected socket so we let
	 * it call sock_put().
	 */

	return sk_vsock(vconnected);
}

/* Return true when @sk has no sockets waiting on its accept queue. */
static bool vsock_is_accept_queue_empty(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return list_empty(&vsk->accept_queue);
}

/* Return true when @sk is linked on a pending list. */
static bool vsock_is_pending(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return !list_empty(&vsk->pending_links);
}

/* Forward a shutdown request to the socket's transport.
 *
 * Returns -ENODEV when no transport is assigned, otherwise the result of the
 * transport's shutdown() callback.
 */
static int vsock_send_shutdown(struct sock *sk, int mode)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (!vsk->transport)
		return -ENODEV;

	return vsk->transport->shutdown(vsk, mode);
}

/* Delayed-work handler (vsk->pending_work) that cleans up a socket created
 * for an incoming connection request if it is still pending or was rejected.
 *
 * Locks are taken in the documented order: listener first, then the pending
 * socket with SINGLE_DEPTH_NESTING.
 */
static void vsock_pending_work(struct work_struct *work)
{
	struct sock *sk;
	struct sock *listener;
	struct vsock_sock *vsk;
	bool cleanup;

	vsk = container_of(work, struct vsock_sock, pending_work.work);
	sk = sk_vsock(vsk);
	listener = vsk->listener;
	cleanup = true;

	lock_sock(listener);
	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);

	if (vsock_is_pending(sk)) {
		vsock_remove_pending(listener, sk);

		sk_acceptq_removed(listener);
	} else if (!vsk->rejected) {
		/* We are not on the pending list and accept() did not reject
		 * us, so we must have been accepted by our user process.  We
		 * just need to drop our references to the sockets and be on
		 * our way.
		 */
		cleanup = false;
		goto out;
	}

	/* We need to remove ourself from the global connected sockets list so
	 * incoming packets can't find this socket, and to reduce the reference
	 * count.
	 */
	vsock_remove_connected(vsk);

	sk->sk_state = TCP_CLOSE;

out:
	release_sock(sk);
	release_sock(listener);
	/* Extra put only on the cleanup path.
	 * NOTE(review): presumably this is the reference held since the
	 * pending socket was created - confirm against the code that
	 * schedules this work.
	 */
	if (cleanup)
		sock_put(sk);

	/* NOTE(review): presumably the references taken on both sockets when
	 * this work was scheduled (see the implementation notes on deferred
	 * work) - the taker is not visible here.
	 */
	sock_put(sk);
	sock_put(listener);
}

/**** SOCKET OPERATIONS ****/

/* Bind a connection-oriented socket to @addr and move it from the unbound
 * list into the bound hash table.
 *
 * Both the port search and the table update use the unlocked __vsock_*
 * helpers; __vsock_bind() calls this with vsock_table_lock held.
 *
 * Returns 0 on success, -EADDRNOTAVAIL when no free port was found within
 * MAX_PORT_RETRIES attempts, -EACCES for a reserved port without
 * CAP_NET_BIND_SERVICE, or -EADDRINUSE when the requested address is taken.
 */
static int __vsock_bind_connectible(struct vsock_sock *vsk,
				    struct sockaddr_vm *addr)
{
	struct net *net = sock_net(sk_vsock(vsk));
	struct sockaddr_vm new_addr;

	/* Lazily seed this namespace's next-port counter with a random port
	 * above the reserved range.
	 */
	if (!net->vsock.port)
		net->vsock.port = get_random_u32_above(LAST_RESERVED_PORT);

	vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);

	if (addr->svm_port == VMADDR_PORT_ANY) {
		bool found = false;
		unsigned int i;

		for (i = 0; i < MAX_PORT_RETRIES; i++) {
			/* Keep the counter out of the reserved range and off
			 * the VMADDR_PORT_ANY value.
			 */
			if (net->vsock.port == VMADDR_PORT_ANY ||
			    net->vsock.port <= LAST_RESERVED_PORT)
				net->vsock.port = LAST_RESERVED_PORT + 1;

			new_addr.svm_port = net->vsock.port++;

			if (!__vsock_find_bound_socket_net(&new_addr, net)) {
				found = true;
				break;
			}
		}

		if (!found)
			return -EADDRNOTAVAIL;
	} else {
		/* If port is in reserved range, ensure caller
		 * has necessary privileges.
		 */
		if (addr->svm_port <= LAST_RESERVED_PORT &&
		    !capable(CAP_NET_BIND_SERVICE)) {
			return -EACCES;
		}

		if (__vsock_find_bound_socket_net(&new_addr, net))
			return -EADDRINUSE;
	}

	vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);

	/* Remove connection oriented sockets from the unbound list and add them
	 * to the hash table for easy lookup by its address.  The unbound list
	 * is simply an extra entry at the end of the hash table, a trick used
	 * by AF_UNIX.
	 */
	__vsock_remove_bound(vsk);
	__vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);

	return 0;
}

/* Bind a datagram socket by delegating to the transport's dgram_bind()
 * callback. vsk->transport is dereferenced without a NULL check.
 */
static int __vsock_bind_dgram(struct vsock_sock *vsk,
			      struct sockaddr_vm *addr)
{
	return vsk->transport->dgram_bind(vsk, addr);
}

/* Bind @sk to @addr, dispatching on the socket type.
 *
 * Returns -EINVAL when the socket is already bound or has an unsupported
 * type, -EADDRNOTAVAIL when @addr names a CID that vsock_find_cid() does not
 * recognise, otherwise the result of the type-specific bind helper.
 */
static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	int retval;

	/* First ensure this socket isn't already bound. */
	if (vsock_addr_bound(&vsk->local_addr))
		return -EINVAL;

	/* Now bind to the provided address or select appropriate values if
	 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY).  Note that
	 * like AF_INET prevents binding to a non-local IP address (in most
	 * cases), we only allow binding to a local CID.
	 */
	if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid))
		return -EADDRNOTAVAIL;

	switch (sk->sk_socket->type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		/* The bound table is searched and updated under the lock. */
		spin_lock_bh(&vsock_table_lock);
		retval = __vsock_bind_connectible(vsk, addr);
		spin_unlock_bh(&vsock_table_lock);
		break;

	case SOCK_DGRAM:
		retval = __vsock_bind_dgram(vsk, addr);
		break;

	default:
		retval = -EINVAL;
		break;
	}

	return retval;
}

static void vsock_connect_timeout(struct work_struct *work);

/* Allocate and initialise a vsock socket.
 *
 * @sock may be NULL, in which case @type is stored as the socket type. When
 * @parent is non-NULL the new socket copies its trust flag, credentials,
 * connect timeout and buffer sizes from it; otherwise they come from the
 * current task and the VSOCK_DEFAULT_* values.
 *
 * Returns the new socket, or NULL when sk_alloc() fails.
 */
static struct sock *__vsock_create(struct net *net,
				   struct socket *sock,
				   struct sock *parent,
				   gfp_t priority,
				   unsigned short type,
				   int kern)
{
	struct sock *sk;
	struct vsock_sock *psk;
	struct vsock_sock *vsk;

	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk);

	/* sk->sk_type is normally set in sock_init_data, but only if sock is
	 * non-NULL. We make sure that our sockets always have a type by
	 * setting it here if needed.
	 */
	if (!sock)
		sk->sk_type = type;

	vsk = vsock_sk(sk);
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	sk->sk_destruct = vsock_sk_destruct;
	sk->sk_backlog_rcv = vsock_queue_rcv_skb;
	sock_reset_flag(sk, SOCK_DONE);

	/* Start off every table and queue, with no listener and idle work. */
	INIT_LIST_HEAD(&vsk->bound_table);
	INIT_LIST_HEAD(&vsk->connected_table);
	vsk->listener = NULL;
	INIT_LIST_HEAD(&vsk->pending_links);
	INIT_LIST_HEAD(&vsk->accept_queue);
	vsk->rejected = false;
	vsk->sent_request = false;
	vsk->ignore_connecting_rst = false;
	vsk->peer_shutdown = 0;
	INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
	INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);

	psk = parent ? vsock_sk(parent) : NULL;
	if (parent) {
		/* Child of a listener: inherit the parent's settings. */
		vsk->trusted = psk->trusted;
		vsk->owner = get_cred(psk->owner);
		vsk->connect_timeout = psk->connect_timeout;
		vsk->buffer_size = psk->buffer_size;
		vsk->buffer_min_size = psk->buffer_min_size;
		vsk->buffer_max_size = psk->buffer_max_size;
		security_sk_clone(parent, sk);
	} else {
		vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN);
		vsk->owner = get_current_cred();
		vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
		vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
		vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE;
		vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE;
	}

	return sk;
}

/* Return true for the connection-oriented socket types (stream, seqpacket). */
static bool sock_type_connectible(u16 type)
{
	return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET);
}

/* Release @sk and, recursively, every socket still waiting on its accept
 * queue.
 *
 * @level is the lockdep nesting level for lock_sock_nested(): 0 for the
 * outermost call, SINGLE_DEPTH_NESTING for the recursive calls below.
 * Drops one reference on @sk before returning.
 */
static void __vsock_release(struct sock *sk, int level)
{
	struct vsock_sock *vsk;
	struct sock *pending;

	vsk = vsock_sk(sk);
	pending = NULL;	/* Compiler warning. */

	/* When "level" is SINGLE_DEPTH_NESTING, use the nested
	 * version to avoid the warning "possible recursive locking
	 * detected". When "level" is 0, lock_sock_nested(sk, level)
	 * is the same as lock_sock(sk).
	 */
	lock_sock_nested(sk, level);

	/* Indicate to vsock_remove_sock() that the socket is being released and
	 * can be removed from the bound_table. Unlike transport reassignment
	 * case, where the socket must remain bound despite vsock_remove_sock()
	 * being called from the transport release() callback.
	 */
	sock_set_flag(sk, SOCK_DEAD);

	/* Without a transport, connectible sockets are taken off the global
	 * tables here.
	 */
	if (vsk->transport)
		vsk->transport->release(vsk);
	else if (sock_type_connectible(sk->sk_type))
		vsock_remove_sock(vsk);

	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	skb_queue_purge(&sk->sk_receive_queue);

	/* Clean up any sockets that never were accepted. */
	while ((pending = vsock_dequeue_accept(sk)) != NULL) {
		__vsock_release(pending, SINGLE_DEPTH_NESTING);
		/* Drop the reference handed over by vsock_dequeue_accept(). */
		sock_put(pending);
	}

	release_sock(sk);
	sock_put(sk);
}

/* sk_destruct callback installed by __vsock_create(): purge the error queue,
 * detach the transport, reset both addresses and drop the owner credentials.
 */
static void vsock_sk_destruct(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* Flush MSG_ZEROCOPY leftovers. */
	__skb_queue_purge(&sk->sk_error_queue);

	vsock_deassign_transport(vsk);

	/* When clearing these addresses, there's no need to set the family and
	 * possibly register the address family with the kernel.
	 */
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	put_cred(vsk->owner);
}

/* sk_backlog_rcv callback installed by __vsock_create(): queue @skb on @sk
 * and free it when queueing fails. Returns sock_queue_rcv_skb()'s result.
 */
static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sock_queue_rcv_skb(sk, skb);
	if (err)
		kfree_skb(skb);

	return err;
}

/* Create a socket of @parent's type in @parent's network namespace that
 * inherits its settings from @parent. Returns NULL on allocation failure.
 */
struct sock *vsock_create_connected(struct sock *parent)
{
	return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL,
			      parent->sk_type, 0);
}
EXPORT_SYMBOL_GPL(vsock_create_connected);

/* Return the transport's stream_has_data() result for @vsk, or 0 (with a
 * warning) when no transport is assigned.
 */
s64 vsock_stream_has_data(struct vsock_sock *vsk)
{
	if (WARN_ON(!vsk->transport))
		return 0;

	return vsk->transport->stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);

/* Like vsock_stream_has_data(), but asks the transport's seqpacket_has_data()
 * callback for SOCK_SEQPACKET sockets.
 */
s64 vsock_connectible_has_data(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);

	if (WARN_ON(!vsk->transport))
		return 0;

	if (sk->sk_type == SOCK_SEQPACKET)
		return vsk->transport->seqpacket_has_data(vsk);
	else
		return vsock_stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_connectible_has_data);

/* Return the transport's stream_has_space() result for @vsk, or 0 (with a
 * warning) when no transport is assigned.
 */
s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
	if (WARN_ON(!vsk->transport))
		return 0;

	return vsk->transport->stream_has_space(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_space);

/* Invoke sk->sk_data_ready() once the available stream data reaches
 * sk->sk_rcvlowat, or unconditionally when SOCK_DONE is set.
 */
void vsock_data_ready(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat ||
	    sock_flag(sk, SOCK_DONE))
		sk->sk_data_ready(sk);
}
EXPORT_SYMBOL_GPL(vsock_data_ready);

/* Dummy callback required by sockmap.
 * See unconditional call of saved_close() in sock_map_close().
 */
static void vsock_close(struct sock *sk, long timeout)
{
}

/* release handler for the vsock proto_ops tables.  Does nothing when no
 * sock is attached.  Otherwise calls the protocol's close hook, performs the
 * real teardown in __vsock_release() at lock nesting level 0, then detaches
 * the sock from @sock and marks it SS_FREE.  Always returns 0.
 */
static int vsock_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	__vsock_release(sk, 0);
	sock->sk = NULL;
	sock->state = SS_FREE;

	return 0;
}

/* bind handler.  Validates and converts @addr with vsock_addr_cast()
 * (-EINVAL on failure), then runs __vsock_bind() under the socket lock and
 * returns its result.
 */
static int
vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
{
	int err;
	struct sock *sk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;

	if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
		return -EINVAL;

	lock_sock(sk);
	err = __vsock_bind(sk, vm_addr);
	release_sock(sk);

	return err;
}

/* getname handler.  Copies the remote address (@peer non-zero) or the local
 * address (@peer zero) into @addr under the socket lock.
 *
 * Returns sizeof(struct sockaddr_vm) on success, or -ENOTCONN when the peer
 * address is requested and the socket is not in SS_CONNECTED.
 */
static int vsock_getname(struct socket *sock,
			 struct sockaddr *addr, int peer)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	if (peer) {
		if (sock->state != SS_CONNECTED) {
			err = -ENOTCONN;
			goto out;
		}
		vm_addr = &vsk->remote_addr;
	} else {
		vm_addr = &vsk->local_addr;
	}

	/* Compile-time guarantee that a sockaddr_vm fits in
	 * sockaddr_storage.
	 */
	BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage));
	memcpy(addr, vm_addr, sizeof(*vm_addr));
	err = sizeof(*vm_addr);

out:
	release_sock(sk);
	return err;
}

/* Wait, for at most sk->sk_lingertime, until the transport reports no unsent
 * bytes.  Returns immediately if SOCK_LINGER is not set, the linger time is
 * zero, or the transport has no unsent_bytes callback.  The wait also ends
 * when a signal is pending.
 *
 * vsk->transport is dereferenced without a NULL check.
 * NOTE(review): presumably called with the socket lock held, from a
 * transport release path - confirm against callers.
 */
void vsock_linger(struct sock *sk)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	ssize_t (*unsent)(struct vsock_sock *vsk);
	struct vsock_sock *vsk = vsock_sk(sk);
	long timeout;

	if (!sock_flag(sk, SOCK_LINGER))
		return;

	timeout = sk->sk_lingertime;
	if (!timeout)
		return;

	/* Transports must implement `unsent_bytes` if they want to support
	 * SOCK_LINGER through `vsock_linger()` since we use it to check when
	 * the socket can be closed.
	 */
	unsent = vsk->transport->unsent_bytes;
	if (!unsent)
		return;

	add_wait_queue(sk_sleep(sk), &wait);

	do {
		if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait))
			break;
	} while (!signal_pending(current) && timeout);

	remove_wait_queue(sk_sleep(sk), &wait);
}
EXPORT_SYMBOL_GPL(vsock_linger);

/* shutdown handler.
 *
 * @mode is the user-level SHUT_* value.  Returns -EINVAL for a value outside
 * SHUT_RD/SHUT_WR/SHUT_RDWR.  For an unconnected socket the result is
 * -ENOTCONN: connectible sockets return straight away, any other type still
 * has its shutdown bits set and its waiters woken first.  A socket in any
 * other state moves to SS_DISCONNECTING, returns 0, and, if connectible,
 * has the shutdown sent to the peer via vsock_send_shutdown().
 */
static int vsock_shutdown(struct socket *sock, int mode)
{
	int err;
	struct sock *sk;

	/* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
	 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
	 * here like the other address families do.  Note also that the
	 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
	 * which is what we want.
	 */
	mode++;

	if ((mode & ~SHUTDOWN_MASK) || !mode)
		return -EINVAL;

	/* If this is a connection oriented socket and it is not connected then
	 * bail out immediately.  If it is a DGRAM socket then we must first
	 * kick the socket so that it wakes up from any sleeping calls, for
	 * example recv(), and then afterwards return the error.
	 */

	sk = sock->sk;

	lock_sock(sk);
	if (sock->state == SS_UNCONNECTED) {
		err = -ENOTCONN;
		if (sock_type_connectible(sk->sk_type))
			goto out;
	} else {
		sock->state = SS_DISCONNECTING;
		err = 0;
	}

	/* Receive and send shutdowns are treated alike. */
	mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
	if (mode) {
		sk->sk_shutdown |= mode;
		sk->sk_state_change(sk);

		if (sock_type_connectible(sk->sk_type)) {
			sock_reset_flag(sk, SOCK_DONE);
			vsock_send_shutdown(sk, mode);
		}
	}

out:
	release_sock(sk);
	return err;
}

/* poll handler.  Registers on the socket's wait queue and builds the event
 * mask from the error state, the local and peer shutdown flags and, per
 * socket type, the readable/writable conditions.
 *
 * The first part and the datagram branch run without the socket lock; the
 * connectible branch takes it around the transport notify_poll_in() and
 * notify_poll_out() callbacks.
 */
static __poll_t vsock_poll(struct file *file, struct socket *sock,
			       poll_table *wait)
{
	struct sock *sk;
	__poll_t mask;
	struct vsock_sock *vsk;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		/* Signify that there has been an error on this socket. */
		mask |= EPOLLERR;

	/* INET sockets treat local write shutdown and peer write shutdown as a
	 * case of EPOLLHUP set.
	 */
	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
	    ((sk->sk_shutdown & SEND_SHUTDOWN) &&
	     (vsk->peer_shutdown & SEND_SHUTDOWN))) {
		mask |= EPOLLHUP;
	}

	if (sk->sk_shutdown & RCV_SHUTDOWN ||
	    vsk->peer_shutdown & SEND_SHUTDOWN) {
		mask |= EPOLLRDHUP;
	}

	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	if (sock->type == SOCK_DGRAM) {
		/* For datagram sockets we can read if there is something in
		 * the queue and write as long as the socket isn't shutdown for
		 * sending.
		 */
		if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
		    (sk->sk_shutdown & RCV_SHUTDOWN)) {
			mask |= EPOLLIN | EPOLLRDNORM;
		}

		if (!(sk->sk_shutdown & SEND_SHUTDOWN))
			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	} else if (sock_type_connectible(sk->sk_type)) {
		const struct vsock_transport *transport;

		lock_sock(sk);

		/* Read the transport pointer only once the lock is held. */
		transport = vsk->transport;

		/* Listening sockets that have connections in their accept
		 * queue can be read.
		 */
		if (sk->sk_state == TCP_LISTEN
		    && !vsock_is_accept_queue_empty(sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		/* If there is something in the queue then we can read. */
		if (transport && transport->stream_is_active(vsk) &&
		    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			bool data_ready_now = false;
			int target = sock_rcvlowat(sk, 0, INT_MAX);
			int ret = transport->notify_poll_in(
					vsk, target, &data_ready_now);
			if (ret < 0) {
				mask |= EPOLLERR;
			} else {
				if (data_ready_now)
					mask |= EPOLLIN | EPOLLRDNORM;

			}
		}

		/* Sockets whose connections have been closed, reset, or
		 * terminated should also be considered readable, and we check
		 * the shutdown flag for that.
		 */
		if (sk->sk_shutdown & RCV_SHUTDOWN ||
		    vsk->peer_shutdown & SEND_SHUTDOWN) {
			mask |= EPOLLIN | EPOLLRDNORM;
		}

		/* Connected sockets that can produce data can be written. */
		if (transport && sk->sk_state == TCP_ESTABLISHED) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				bool space_avail_now = false;
				int ret = transport->notify_poll_out(
						vsk, 1, &space_avail_now);
				if (ret < 0) {
					mask |= EPOLLERR;
				} else {
					if (space_avail_now)
						/* Remove EPOLLWRBAND since INET
						 * sockets are not setting it.
						 */
						mask |= EPOLLOUT | EPOLLWRNORM;

				}
			}
		}

		/* Simulate INET socket poll behaviors, which sets
		 * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read,
		 * but local send is not shutdown.
		 */
		if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN))
				mask |= EPOLLOUT | EPOLLWRNORM;

		}

		release_sock(sk);
	}

	return mask;
}

/* read_skb handler: forwards to the transport's read_skb callback.  Returns
 * -ENODEV, warning once, if no transport is assigned.
 */
static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (WARN_ON_ONCE(!vsk->transport))
		return -ENODEV;

	return vsk->transport->read_skb(vsk, read_actor);
}

/* Datagram sendmsg handler.
 *
 * Auto-binds the socket if needed, picks the destination (msg_name when it
 * is present and passes vsock_addr_cast(), otherwise the connected remote
 * address), asks the transport's dgram_allow() whether that destination is
 * permitted and then hands the message to dgram_enqueue().
 *
 * Returns the result of dgram_enqueue() on that path, -EOPNOTSUPP for
 * MSG_OOB, the vsock_auto_bind() error, or -EINVAL when there is no usable
 * destination or the transport refuses it.
 *
 * The transport pointer is dereferenced without a NULL check.
 */
static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;
	const struct vsock_transport *transport;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/* For now, MSG_DONTWAIT is always assumed... */
	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	transport = vsk->transport;

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;


	/* If the provided message contains an address, use that.  Otherwise
	 * fall back on the socket's remote handle (if it has been connected).
	 */
	if (msg->msg_name &&
	    vsock_addr_cast(msg->msg_name, msg->msg_namelen,
			    &remote_addr) == 0) {
		/* Ensure this address is of the right type and is a valid
		 * destination.
		 */

		/* A wildcard CID is replaced by the transport's local CID. */
		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		if (!vsock_addr_bound(remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else if (sock->state == SS_CONNECTED) {
		remote_addr = &vsk->remote_addr;

		/* remote_addr aliases vsk->remote_addr here, so this store
		 * updates the socket's stored remote address.
		 */
		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		/* XXX Should connect() or this function ensure remote_addr is
		 * bound?
		 */
		if (!vsock_addr_bound(&vsk->remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else {
		err = -EINVAL;
		goto out;
	}

	if (!transport->dgram_allow(vsk, remote_addr->svm_cid,
				    remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	err = transport->dgram_enqueue(vsk, remote_addr, msg, len);

out:
	release_sock(sk);
	return err;
}

/* Datagram connect handler.
 *
 * An AF_UNSPEC address (vsock_addr_cast() returning -EAFNOSUPPORT) dissolves
 * the association: the remote address is reset and the socket returns to
 * SS_UNCONNECTED.  Any other cast failure yields -EINVAL.  Otherwise the
 * socket is auto-bound, the destination is checked with the transport's
 * dgram_allow() (-EINVAL if refused) and recorded as the remote address.
 *
 * NOTE(review): the AF_UNSPEC test reads remote_addr after a failed
 * vsock_addr_cast(); this relies on the cast storing the pointer before it
 * returns -EAFNOSUPPORT - confirm against vsock_addr_cast().
 */
static int vsock_dgram_connect(struct socket *sock,
			       struct sockaddr_unsized *addr, int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	err = vsock_addr_cast(addr, addr_len, &remote_addr);
	if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
		lock_sock(sk);
		vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
				VMADDR_PORT_ANY);
		sock->state = SS_UNCONNECTED;
		release_sock(sk);
		return 0;
	} else if (err != 0)
		return -EINVAL;

	lock_sock(sk);

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;

	if (!vsk->transport->dgram_allow(vsk, remote_addr->svm_cid,
					 remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
	sock->state = SS_CONNECTED;

	/* sock map disallows redirection of non-TCP sockets with sk_state !=
	 * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set
	 * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams.
	 *
	 * This doesn't seem to be abnormal state for datagram sockets, as the
	 * same approach can be seen in other datagram socket types as well
	 * (such as unix sockets).
	 */
	sk->sk_state = TCP_ESTABLISHED;

out:
	release_sock(sk);
	return err;
}

/* Receive a datagram directly through the transport's dgram_dequeue()
 * callback and return its result.  The transport pointer is dereferenced
 * without a NULL check.
 */
int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			  size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct vsock_sock *vsk = vsock_sk(sk);

	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
}

/* Datagram recvmsg handler.  With CONFIG_BPF_SYSCALL, a socket whose sk_prot
 * is no longer vsock_proto has its receive delegated to that proto's
 * recvmsg; otherwise the datagram is read via __vsock_dgram_recvmsg().
 */
int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			size_t len, int flags)
{
#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot;

	prot = READ_ONCE(sk->sk_prot);
	if (prot != &vsock_proto)
		return prot->recvmsg(sk, msg, len, flags, NULL);
#endif

	return __vsock_dgram_recvmsg(sock, msg, len, flags);
}
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);

/* ioctl worker, called by vsock_ioctl() with the socket lock held.
 *
 * SIOCINQ stores the byte count from vsock_stream_has_data() in @arg and
 * SIOCOUTQ the count from the transport's unsent_bytes().  Both return
 * -EOPNOTSUPP when the transport (or, for SIOCOUTQ, its unsent_bytes
 * callback) is missing, -EINVAL on a listening connectible socket, a
 * negative count as the error itself, and otherwise the put_user() result.
 * Any other command returns -ENOIOCTLCMD.
 */
static int vsock_do_ioctl(struct socket *sock, unsigned int cmd,
			  int __user *arg)
{
	struct sock *sk = sock->sk;
	struct vsock_sock *vsk;
	int ret;

	vsk = vsock_sk(sk);

	switch (cmd) {
	case SIOCINQ: {
		ssize_t n_bytes;

		if (!vsk->transport) {
			ret = -EOPNOTSUPP;
			break;
		}

		if (sock_type_connectible(sk->sk_type) &&
		    sk->sk_state == TCP_LISTEN) {
			ret = -EINVAL;
			break;
		}

		n_bytes = vsock_stream_has_data(vsk);
		if (n_bytes < 0) {
			ret = n_bytes;
			break;
		}
		ret = put_user(n_bytes, arg);
		break;
	}
	case SIOCOUTQ: {
		ssize_t n_bytes;

		if (!vsk->transport || !vsk->transport->unsent_bytes) {
			ret = -EOPNOTSUPP;
			break;
		}

		if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) {
			ret = -EINVAL;
			break;
		}

		n_bytes = vsk->transport->unsent_bytes(vsk);
		if (n_bytes < 0) {
			ret = n_bytes;
			break;
		}

		ret = put_user(n_bytes, arg);
		break;
	}
	default:
		ret = -ENOIOCTLCMD;
	}

	return ret;
}

/* ioctl handler: runs vsock_do_ioctl() under the socket lock, treating @arg
 * as a user pointer to int.
 */
static int vsock_ioctl(struct socket *sock, unsigned int cmd,
		       unsigned long arg)
{
	int ret;

	lock_sock(sock->sk);
	ret = vsock_do_ioctl(sock, cmd, (int __user *)arg);
	release_sock(sock->sk);

	return ret;
}

/* Socket operations table built around the datagram connect, sendmsg and
 * recvmsg handlers.  socketpair, accept, listen and mmap use the generic
 * sock_no_*() stubs.
 */
static const struct proto_ops vsock_dgram_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_dgram_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = vsock_ioctl,
	.listen = sock_no_listen,
	.shutdown = vsock_shutdown,
	.sendmsg = vsock_dgram_sendmsg,
	.recvmsg = vsock_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.read_skb = vsock_read_skb,
};

/* Ask the transport to cancel a pending packet for @vsk.  Returns
 * -EOPNOTSUPP if there is no transport or it lacks a cancel_pkt callback,
 * otherwise the callback's result.
 */
static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
{
	const struct vsock_transport *transport = vsk->transport;

	if (!transport || !transport->cancel_pkt)
		return -EOPNOTSUPP;

	return transport->cancel_pkt(vsk);
}

/* Delayed-work handler for vsk->connect_work, scheduled by a non-blocking
 * vsock_connect().  If the socket is still in TCP_SYN_SENT and not fully
 * shut down, the attempt is failed with ETIMEDOUT, the socket is returned to
 * the closed/unconnected state and the transport is asked to cancel the
 * outstanding packet.  Finally drops the reference vsock_connect() took when
 * scheduling the work.
 */
static void vsock_connect_timeout(struct work_struct *work)
{
	struct sock *sk;
	struct vsock_sock *vsk;

	vsk = container_of(work, struct vsock_sock, connect_work.work);
	sk = sk_vsock(vsk);

	lock_sock(sk);
	if (sk->sk_state == TCP_SYN_SENT &&
	    (sk->sk_shutdown != SHUTDOWN_MASK)) {
		sk->sk_state = TCP_CLOSE;
		sk->sk_socket->state = SS_UNCONNECTED;
		sk->sk_err = ETIMEDOUT;
		sk_error_report(sk);
		vsock_transport_cancel_pkt(vsk);
	}
	release_sock(sk);

	sock_put(sk);
}

/* Connect handler for the connectible socket types.
 *
 * From the unconnected state this records the remote address, assigns a
 * transport, auto-binds, moves to TCP_SYN_SENT and calls the transport's
 * connect().  It then waits for the socket to become established, to start
 * closing, or to report an error.
 *
 * Return values:
 *   0             connection established
 *   -EISCONN      already connected
 *   -EINVAL       socket is disconnecting or listening, or @addr is invalid
 *   -EALREADY     non-blocking call while a connect is already in progress
 *   -ENETUNREACH  no transport, or its stream_allow() refused the peer
 *   -EOPNOTSUPP   SO_ZEROCOPY was set but the transport does not support it
 *   -EINPROGRESS  non-blocking connect started; the timeout work is armed
 *   -ETIMEDOUT    blocking wait ran out of time
 *   other         errors from vsock_assign_transport(), vsock_auto_bind(),
 *                 the transport's connect(), sock_intr_errno() on a signal,
 *                 or the negated sk_err
 */
static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr,
			 int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	struct sockaddr_vm *remote_addr;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	/* XXX AF_UNSPEC should make us disconnect like AF_INET. */
	switch (sock->state) {
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	case SS_DISCONNECTING:
		err = -EINVAL;
		goto out;
	case SS_CONNECTING:
		/* This continues on so we can move sock into the SS_CONNECTED
		 * state once the connection has completed (at which point err
		 * will be set to zero also).  Otherwise, we will either wait
		 * for the connection or return -EALREADY should this be a
		 * non-blocking call.
		 */
		err = -EALREADY;
		if (flags & O_NONBLOCK)
			goto out;
		break;
	default:
		if ((sk->sk_state == TCP_LISTEN) ||
		    vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
			err = -EINVAL;
			goto out;
		}

		/* Set the remote address that we are connecting to. */
		memcpy(&vsk->remote_addr, remote_addr,
		       sizeof(vsk->remote_addr));

		err = vsock_assign_transport(vsk, NULL);
		if (err)
			goto out;

		transport = vsk->transport;

		/* The hypervisor and well-known contexts do not have socket
		 * endpoints.
		 */
		if (!transport ||
		    !transport->stream_allow(vsk, remote_addr->svm_cid,
					     remote_addr->svm_port)) {
			err = -ENETUNREACH;
			goto out;
		}

		if (vsock_msgzerocopy_allow(transport)) {
			set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			/* If this option was set before 'connect()',
			 * when transport was unknown, check that this
			 * feature is supported here.
			 */
			err = -EOPNOTSUPP;
			goto out;
		}

		err = vsock_auto_bind(vsk);
		if (err)
			goto out;

		sk->sk_state = TCP_SYN_SENT;

		err = transport->connect(vsk);
		if (err < 0)
			goto out;

		/* sk_err might have been set as a result of an earlier
		 * (failed) connect attempt.
		 */
		sk->sk_err = 0;

		/* Mark sock as connecting and set the error code to in
		 * progress in case this is a non-blocking connect.
		 */
		sock->state = SS_CONNECTING;
		err = -EINPROGRESS;
	}

	/* The receive path will handle all communication until we are able to
	 * enter the connected state.  Here we wait for the connection to be
	 * completed or a notification of an error.
	 */
	timeout = vsk->connect_timeout;
	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* If the socket is already closing or it is in an error state, there
	 * is no point in waiting.
	 */
	while (sk->sk_state != TCP_ESTABLISHED &&
	       sk->sk_state != TCP_CLOSING && sk->sk_err == 0) {
		if (flags & O_NONBLOCK) {
			/* If we're not going to block, we schedule a timeout
			 * function to generate a timeout on the connection
			 * attempt, in case the peer doesn't respond in a
			 * timely manner. We hold on to the socket until the
			 * timeout fires.
			 */
			sock_hold(sk);

			/* If the timeout function is already scheduled,
			 * reschedule it, then ungrab the socket refcount to
			 * keep it balanced.
			 */
			if (mod_delayed_work(system_percpu_wq, &vsk->connect_work,
					     timeout))
				sock_put(sk);

			/* Skip ahead to preserve error code set above. */
			goto out_wait;
		}

		/* Sleep with the socket lock dropped, then retake it. */
		release_sock(sk);
		timeout = schedule_timeout(timeout);
		lock_sock(sk);

		/* Connection established. Whatever happens to socket once we
		 * release it, that's not connect()'s concern. No need to go
		 * into signal and timeout handling. Call it a day.
		 *
		 * Note that allowing to "reset" an already established socket
		 * here is racy and insecure.
		 */
		if (sk->sk_state == TCP_ESTABLISHED)
			break;

		/* If connection was _not_ established and a signal/timeout came
		 * to be, we want the socket's state reset. User space may want
		 * to retry.
		 *
		 * sk_state != TCP_ESTABLISHED implies that socket is not on
		 * vsock_connected_table. We keep the binding and the transport
		 * assigned.
		 */
		if (signal_pending(current) || timeout == 0) {
			err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout);

			/* Listener might have already responded with
			 * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our
			 * sk_state == TCP_SYN_SENT, which hereby we break.
			 * In such case VIRTIO_VSOCK_OP_RST will follow.
			 */
			sk->sk_state = TCP_CLOSE;
			sock->state = SS_UNCONNECTED;

			/* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by
			 * transport->connect().
			 */
			vsock_transport_cancel_pkt(vsk);

			goto out_wait;
		}

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	}

	if (sk->sk_err) {
		err = -sk->sk_err;
		sk->sk_state = TCP_CLOSE;
		sock->state = SS_UNCONNECTED;
	} else {
		err = 0;
	}

out_wait:
	finish_wait(sk_sleep(sk), &wait);
out:
	release_sock(sk);
	return err;
}

/* Accept handler.
 *
 * Waits, bounded by the listener's receive timeout (zero when
 * arg->flags has O_NONBLOCK), for a socket to appear on the accept queue.
 *
 * Returns 0 after grafting the dequeued socket onto @newsock, -EOPNOTSUPP
 * for a non-connectible socket, -EINVAL if the socket is not listening,
 * -EAGAIN on timeout, the sock_intr_errno() value on a signal, or the
 * negated listener sk_err.  In the sk_err case a dequeued socket is marked
 * rejected instead of being grafted.
 */
static int vsock_accept(struct socket *sock, struct socket *newsock,
			struct proto_accept_arg *arg)
{
	struct sock *listener;
	int err;
	struct sock *connected;
	struct vsock_sock *vconnected;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	listener = sock->sk;

	lock_sock(listener);

	if (!sock_type_connectible(sock->type)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (listener->sk_state != TCP_LISTEN) {
		err = -EINVAL;
		goto out;
	}

	/* Wait for children sockets to appear; these are the new sockets
	 * created upon connection establishment.
	 */
	timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK);
	prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);

	while ((connected = vsock_dequeue_accept(listener)) == NULL &&
	       listener->sk_err == 0) {
		release_sock(listener);
		timeout = schedule_timeout(timeout);
		finish_wait(sk_sleep(listener), &wait);
		lock_sock(listener);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			goto out;
		} else if (timeout == 0) {
			err = -EAGAIN;
			goto out;
		}

		prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
	}
	finish_wait(sk_sleep(listener), &wait);

	if (listener->sk_err)
		err = -listener->sk_err;

	if (connected) {
		sk_acceptq_removed(listener);

		/* Listener first, then child, with explicit nesting. */
		lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
		vconnected = vsock_sk(connected);

		/* If the listener socket has received an error, then we should
		 * reject this socket and return.  Note that we simply mark the
		 * socket rejected, drop our reference, and let the cleanup
		 * function handle the cleanup; the fact that we found it in
		 * the listener's accept queue guarantees that the cleanup
		 * function hasn't run yet.
		 */
		if (err) {
			vconnected->rejected = true;
		} else {
			newsock->state = SS_CONNECTED;
			sock_graft(connected, newsock);

			set_bit(SOCK_CUSTOM_SOCKOPT,
				&connected->sk_socket->flags);

			if (vsock_msgzerocopy_allow(vconnected->transport))
				set_bit(SOCK_SUPPORT_ZC,
					&connected->sk_socket->flags);
		}

		release_sock(connected);
		sock_put(connected);
	}

out:
	release_sock(listener);
	return err;
}

/* Listen handler.  Returns -EOPNOTSUPP for a non-connectible socket and
 * -EINVAL if the socket is not in SS_UNCONNECTED or has no bound local
 * address.  Otherwise records @backlog as sk_max_ack_backlog, moves the
 * socket to TCP_LISTEN and returns 0.
 */
static int vsock_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;

	sk = sock->sk;

	lock_sock(sk);

	if (!sock_type_connectible(sk->sk_type)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (sock->state != SS_UNCONNECTED) {
		err = -EINVAL;
		goto out;
	}

	vsk = vsock_sk(sk);

	if (!vsock_addr_bound(&vsk->local_addr)) {
		err = -EINVAL;
		goto out;
	}

	sk->sk_max_ack_backlog = backlog;
	sk->sk_state = TCP_LISTEN;

	err = 0;

out:
	release_sock(sk);
	return err;
}

/* Clamp @val to the socket's buffer limits and store it as buffer_size.
 *
 * The maximum is applied before the minimum, so the minimum wins if the two
 * limits are inconsistent.  When the clamped value differs from the current
 * size and the transport has a notify_buffer_size callback, the callback is
 * given a pointer to the value before it is stored.
 * NOTE(review): presumably the callback may lower the value through that
 * pointer - confirm against the transports.
 */
static void vsock_update_buffer_size(struct vsock_sock *vsk,
				     const struct vsock_transport *transport,
				     u64 val)
{
	if (val > vsk->buffer_max_size)
		val = vsk->buffer_max_size;

	if (val < vsk->buffer_min_size)
		val = vsk->buffer_min_size;

	if (val != vsk->buffer_size &&
	    transport && transport->notify_buffer_size)
		transport->notify_buffer_size(vsk, &val);

	vsk->buffer_size = val;
}

/* setsockopt handler for connectible sockets.
 *
 * Accepts levels AF_VSOCK and SOL_SOCKET; anything else is -ENOPROTOOPT.
 * At SOL_SOCKET only SO_ZEROCOPY is handled here, every other option being
 * forwarded to sock_setsockopt() after the lock is dropped.  At AF_VSOCK the
 * buffer size, buffer min/max and connect timeout options are handled.
 *
 * Returns 0 on success, -EINVAL for a short or out-of-range value, -EFAULT
 * if the copy fails, -EOPNOTSUPP if zerocopy is not supported by the
 * assigned transport, -ERANGE for an unrepresentable timeout and
 * -ENOPROTOOPT for an unknown option.
 */
static int vsock_connectible_setsockopt(struct socket *sock,
					int level,
					int optname,
					sockptr_t optval,
					unsigned int optlen)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	u64 val;

	if (level != AF_VSOCK && level != SOL_SOCKET)
		return -ENOPROTOOPT;

/* Copy sizeof(_v) bytes of the option value into _v.  On failure this sets
 * err and jumps to the exit label, which releases the socket lock, so it
 * may only be used once the lock is held.
 */
#define COPY_IN(_v)                                       \
	do {						  \
		if (optlen < sizeof(_v)) {		  \
			err = -EINVAL;			  \
			goto exit;			  \
		}					  \
		if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) {	\
			err = -EFAULT;					\
			goto exit;					\
		}							\
	} while (0)

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	transport = vsk->transport;

	if (level == SOL_SOCKET) {
		int zerocopy;

		if (optname != SO_ZEROCOPY) {
			release_sock(sk);
			return sock_setsockopt(sock, level, optname, optval, optlen);
		}

		/* Use 'int' type here, because variable to
		 * set this option usually has this type.
		 */
		COPY_IN(zerocopy);

		if (zerocopy < 0 || zerocopy > 1) {
			err = -EINVAL;
			goto exit;
		}

		/* With no transport assigned yet the flag is accepted as-is;
		 * vsock_connect() re-checks it once the transport is known.
		 */
		if (transport && !vsock_msgzerocopy_allow(transport)) {
			err = -EOPNOTSUPP;
			goto exit;
		}

		sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy);
		goto exit;
	}

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		COPY_IN(val);
		vsock_update_buffer_size(vsk, transport, val);
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		COPY_IN(val);
		vsk->buffer_max_size = val;
		/* Re-clamp the current size against the new limit. */
		vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		COPY_IN(val);
		vsk->buffer_min_size = val;
		/* Re-clamp the current size against the new limit. */
		vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW:
	case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: {
		struct __kernel_sock_timeval tv;

		err = sock_copy_user_timeval(&tv, optval, optlen,
					     optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
		if (err)
			break;
		if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
		    tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
			/* Convert to jiffies, rounding the microseconds up. */
			vsk->connect_timeout = tv.tv_sec * HZ +
				DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ));
			/* A zero timeout selects the default. */
			if (vsk->connect_timeout == 0)
				vsk->connect_timeout =
				    VSOCK_DEFAULT_CONNECT_TIMEOUT;

		} else {
			err = -ERANGE;
		}
		break;
	}

	default:
		err = -ENOPROTOOPT;
		break;
	}

#undef COPY_IN

exit:
	release_sock(sk);
	return err;
}

/* getsockopt handler for connectible sockets (level AF_VSOCK only; any
 * other level is -ENOPROTOOPT).
 *
 * Buffer options are returned as a u64; the connect timeout is formatted by
 * sock_get_timeout(), which also supplies the result length.  The user
 * buffer must be at least that long (-EINVAL otherwise) and the length
 * written back is trimmed to it.  Returns -EFAULT if a user copy fails and
 * -ENOPROTOOPT for an unknown option.
 *
 * The socket fields are read without taking the socket lock.
 */
static int vsock_connectible_getsockopt(struct socket *sock,
					int level, int optname,
					char __user *optval,
					int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct vsock_sock *vsk = vsock_sk(sk);

	union {
		u64 val64;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct  __kernel_sock_timeval stm;
	} v;

	int lv = sizeof(v.val64);
	int len;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	/* Zero the whole union before it is copied out to user space. */
	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		v.val64 = vsk->buffer_size;
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		v.val64 = vsk->buffer_max_size;
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		v.val64 = vsk->buffer_min_size;
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW:
	case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD:
		lv = sock_get_timeout(vsk->connect_timeout, &v,
				      optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len < lv)
		return -EINVAL;
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;

	if (put_user(len, optlen))
		return -EFAULT;

	return 0;
}

/* sendmsg handler for connectible sockets.
 *
 * After validating the socket state, this loops until @len bytes have been
 * handed to the transport, sleeping (bounded by the send timeout) whenever
 * vsock_stream_has_space() reports no room.  Each stage is bracketed by the
 * transport's notify_send_*() callbacks.
 *
 * Returns the number of bytes written when at least one byte was written
 * and the socket is SOCK_STREAM, or when the whole buffer was written.
 * Otherwise returns a negative error: -EOPNOTSUPP, -EISCONN, -EPIPE,
 * -ENOTCONN, -EDESTADDRREQ, -EAGAIN, the sock_intr_errno() value, the
 * negated sk_err, or an error from the transport.  For SOCK_STREAM the
 * result is finally passed through sk_stream_error().
 */
static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg,
				     size_t len)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	ssize_t total_written;
	long timeout;
	int err;
	struct vsock_transport_send_notify_data send_data;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	sk = sock->sk;
	vsk = vsock_sk(sk);
	total_written = 0;
	err = 0;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	lock_sock(sk);

	transport = vsk->transport;

	/* Callers should not provide a destination with connection oriented
	 * sockets.
	 */
	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out;
	}

	/* Send data only if both sides are not shutdown in the direction. */
	if (sk->sk_shutdown & SEND_SHUTDOWN ||
	    vsk->peer_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		goto out;
	}

	if (!transport || sk->sk_state != TCP_ESTABLISHED ||
	    !vsock_addr_bound(&vsk->local_addr)) {
		err = -ENOTCONN;
		goto out;
	}

	if (!vsock_addr_bound(&vsk->remote_addr)) {
		err = -EDESTADDRREQ;
		goto out;
	}

	if (msg->msg_flags & MSG_ZEROCOPY &&
	    !vsock_msgzerocopy_allow(transport)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/* Wait for room in the produce queue to enqueue our user's data. */
	timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	err = transport->notify_send_init(vsk, &send_data);
	if (err < 0)
		goto out;

	while (total_written < len) {
		ssize_t written;

		add_wait_queue(sk_sleep(sk), &wait);
		while (vsock_stream_has_space(vsk) == 0 &&
		       sk->sk_err == 0 &&
		       !(sk->sk_shutdown & SEND_SHUTDOWN) &&
		       !(vsk->peer_shutdown & RCV_SHUTDOWN)) {

			/* Don't wait for non-blocking sockets. */
			if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			err = transport->notify_send_pre_block(vsk, &send_data);
			if (err < 0) {
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			/* Sleep with the socket lock dropped. */
			release_sock(sk);
			timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
			lock_sock(sk);
			if (signal_pending(current)) {
				err = sock_intr_errno(timeout);
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			} else if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}
		}
		remove_wait_queue(sk_sleep(sk), &wait);

		/* These checks occur both as part of and after the loop
		 * conditional since we need to check before and after
		 * sleeping.
		 */
		if (sk->sk_err) {
			err = -sk->sk_err;
			goto out_err;
		} else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
			   (vsk->peer_shutdown & RCV_SHUTDOWN)) {
			err = -EPIPE;
			goto out_err;
		}

		err = transport->notify_send_pre_enqueue(vsk, &send_data);
		if (err < 0)
			goto out_err;

		/* Note that enqueue will only write as many bytes as are free
		 * in the produce queue, so we don't need to ensure len is
		 * smaller than the queue size.  It is the caller's
		 * responsibility to check how many bytes we were able to send.
		 */

		if (sk->sk_type == SOCK_SEQPACKET) {
			written = transport->seqpacket_enqueue(vsk,
						msg, len - total_written);
		} else {
			written = transport->stream_enqueue(vsk,
					msg, len - total_written);
		}

		if (written < 0) {
			err = written;
			goto out_err;
		}

		total_written += written;

		err = transport->notify_send_post_enqueue(
				vsk, written, &send_data);
		if (err < 0)
			goto out_err;

	}

out_err:
	if (total_written > 0) {
		/* Return number of written bytes only if:
		 * 1) SOCK_STREAM socket.
		 * 2) SOCK_SEQPACKET socket when whole buffer is sent.
		 */
		if (sk->sk_type == SOCK_STREAM || total_written == len)
			err = total_written;
	}
out:
	if (sk->sk_type == SOCK_STREAM)
		err = sk_stream_error(sk, msg->msg_flags, err);

	release_sock(sk);
	return err;
}

static int vsock_connectible_wait_data(struct sock *sk,
				       struct wait_queue_entry *wait,
				       long timeout,
				       struct vsock_transport_recv_notify_data *recv_data,
				       size_t target)
{
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;
	s64 data;
	int err;

	vsk = vsock_sk(sk);
	err = 0;
	transport = vsk->transport;

	while (1) {
		prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE);
		data = vsock_connectible_has_data(vsk);
		if (data != 0)
			break;

		if (sk->sk_err != 0 ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
			break;
		}

		/* Don't wait for non-blocking sockets.
*/ 2329 if (timeout == 0) { 2330 err = -EAGAIN; 2331 break; 2332 } 2333 2334 if (recv_data) { 2335 err = transport->notify_recv_pre_block(vsk, target, recv_data); 2336 if (err < 0) 2337 break; 2338 } 2339 2340 release_sock(sk); 2341 timeout = schedule_timeout(timeout); 2342 lock_sock(sk); 2343 2344 if (signal_pending(current)) { 2345 err = sock_intr_errno(timeout); 2346 break; 2347 } else if (timeout == 0) { 2348 err = -EAGAIN; 2349 break; 2350 } 2351 } 2352 2353 finish_wait(sk_sleep(sk), wait); 2354 2355 if (err) 2356 return err; 2357 2358 /* Internal transport error when checking for available 2359 * data. XXX This should be changed to a connection 2360 * reset in a later change. 2361 */ 2362 if (data < 0) 2363 return -ENOMEM; 2364 2365 return data; 2366 } 2367 2368 static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2369 size_t len, int flags) 2370 { 2371 struct vsock_transport_recv_notify_data recv_data; 2372 const struct vsock_transport *transport; 2373 struct vsock_sock *vsk; 2374 ssize_t copied; 2375 size_t target; 2376 long timeout; 2377 int err; 2378 2379 DEFINE_WAIT(wait); 2380 2381 vsk = vsock_sk(sk); 2382 transport = vsk->transport; 2383 2384 /* We must not copy less than target bytes into the user's buffer 2385 * before returning successfully, so we wait for the consume queue to 2386 * have that much data to consume before dequeueing. Note that this 2387 * makes it impossible to handle cases where target is greater than the 2388 * queue size. 
2389 */ 2390 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 2391 if (target >= transport->stream_rcvhiwat(vsk)) { 2392 err = -ENOMEM; 2393 goto out; 2394 } 2395 timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2396 copied = 0; 2397 2398 err = transport->notify_recv_init(vsk, target, &recv_data); 2399 if (err < 0) 2400 goto out; 2401 2402 2403 while (1) { 2404 ssize_t read; 2405 2406 err = vsock_connectible_wait_data(sk, &wait, timeout, 2407 &recv_data, target); 2408 if (err <= 0) 2409 break; 2410 2411 err = transport->notify_recv_pre_dequeue(vsk, target, 2412 &recv_data); 2413 if (err < 0) 2414 break; 2415 2416 read = transport->stream_dequeue(vsk, msg, len - copied, flags); 2417 if (read < 0) { 2418 err = read; 2419 break; 2420 } 2421 2422 copied += read; 2423 2424 err = transport->notify_recv_post_dequeue(vsk, target, read, 2425 !(flags & MSG_PEEK), &recv_data); 2426 if (err < 0) 2427 goto out; 2428 2429 if (read >= target || flags & MSG_PEEK) 2430 break; 2431 2432 target -= read; 2433 } 2434 2435 if (sk->sk_err) 2436 err = -sk->sk_err; 2437 else if (sk->sk_shutdown & RCV_SHUTDOWN) 2438 err = 0; 2439 2440 if (copied > 0) 2441 err = copied; 2442 2443 out: 2444 return err; 2445 } 2446 2447 static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, 2448 size_t len, int flags) 2449 { 2450 const struct vsock_transport *transport; 2451 struct vsock_sock *vsk; 2452 ssize_t msg_len; 2453 long timeout; 2454 int err = 0; 2455 DEFINE_WAIT(wait); 2456 2457 vsk = vsock_sk(sk); 2458 transport = vsk->transport; 2459 2460 timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2461 2462 err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0); 2463 if (err <= 0) 2464 goto out; 2465 2466 msg_len = transport->seqpacket_dequeue(vsk, msg, flags); 2467 2468 if (msg_len < 0) { 2469 err = msg_len; 2470 goto out; 2471 } 2472 2473 if (sk->sk_err) { 2474 err = -sk->sk_err; 2475 } else if (sk->sk_shutdown & RCV_SHUTDOWN) { 2476 err = 0; 2477 } else { 2478 /* User 
sets MSG_TRUNC, so return real length of 2479 * packet. 2480 */ 2481 if (flags & MSG_TRUNC) 2482 err = msg_len; 2483 else 2484 err = len - msg_data_left(msg); 2485 2486 /* Always set MSG_TRUNC if real length of packet is 2487 * bigger than user's buffer. 2488 */ 2489 if (msg_len > len) 2490 msg->msg_flags |= MSG_TRUNC; 2491 } 2492 2493 out: 2494 return err; 2495 } 2496 2497 int 2498 __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, 2499 int flags) 2500 { 2501 struct sock *sk; 2502 struct vsock_sock *vsk; 2503 const struct vsock_transport *transport; 2504 int err; 2505 2506 sk = sock->sk; 2507 2508 if (unlikely(flags & MSG_ERRQUEUE)) 2509 return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR); 2510 2511 vsk = vsock_sk(sk); 2512 err = 0; 2513 2514 lock_sock(sk); 2515 2516 transport = vsk->transport; 2517 2518 if (!transport || sk->sk_state != TCP_ESTABLISHED) { 2519 /* Recvmsg is supposed to return 0 if a peer performs an 2520 * orderly shutdown. Differentiate between that case and when a 2521 * peer has not connected or a local shutdown occurred with the 2522 * SOCK_DONE flag. 2523 */ 2524 if (sock_flag(sk, SOCK_DONE)) 2525 err = 0; 2526 else 2527 err = -ENOTCONN; 2528 2529 goto out; 2530 } 2531 2532 if (flags & MSG_OOB) { 2533 err = -EOPNOTSUPP; 2534 goto out; 2535 } 2536 2537 /* We don't check peer_shutdown flag here since peer may actually shut 2538 * down, but there can be data in the queue that a local socket can 2539 * receive. 2540 */ 2541 if (sk->sk_shutdown & RCV_SHUTDOWN) { 2542 err = 0; 2543 goto out; 2544 } 2545 2546 /* It is valid on Linux to pass in a zero-length receive buffer. This 2547 * is not an error. We may as well bail out now. 
2548 */ 2549 if (!len) { 2550 err = 0; 2551 goto out; 2552 } 2553 2554 if (sk->sk_type == SOCK_STREAM) 2555 err = __vsock_stream_recvmsg(sk, msg, len, flags); 2556 else 2557 err = __vsock_seqpacket_recvmsg(sk, msg, len, flags); 2558 2559 out: 2560 release_sock(sk); 2561 return err; 2562 } 2563 2564 int 2565 vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, 2566 int flags) 2567 { 2568 #ifdef CONFIG_BPF_SYSCALL 2569 struct sock *sk = sock->sk; 2570 const struct proto *prot; 2571 2572 prot = READ_ONCE(sk->sk_prot); 2573 if (prot != &vsock_proto) 2574 return prot->recvmsg(sk, msg, len, flags, NULL); 2575 #endif 2576 2577 return __vsock_connectible_recvmsg(sock, msg, len, flags); 2578 } 2579 EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg); 2580 2581 static int vsock_set_rcvlowat(struct sock *sk, int val) 2582 { 2583 const struct vsock_transport *transport; 2584 struct vsock_sock *vsk; 2585 2586 vsk = vsock_sk(sk); 2587 2588 if (val > vsk->buffer_size) 2589 return -EINVAL; 2590 2591 transport = vsk->transport; 2592 2593 if (transport && transport->notify_set_rcvlowat) { 2594 int err; 2595 2596 err = transport->notify_set_rcvlowat(vsk, val); 2597 if (err) 2598 return err; 2599 } 2600 2601 WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); 2602 return 0; 2603 } 2604 2605 static const struct proto_ops vsock_stream_ops = { 2606 .family = PF_VSOCK, 2607 .owner = THIS_MODULE, 2608 .release = vsock_release, 2609 .bind = vsock_bind, 2610 .connect = vsock_connect, 2611 .socketpair = sock_no_socketpair, 2612 .accept = vsock_accept, 2613 .getname = vsock_getname, 2614 .poll = vsock_poll, 2615 .ioctl = vsock_ioctl, 2616 .listen = vsock_listen, 2617 .shutdown = vsock_shutdown, 2618 .setsockopt = vsock_connectible_setsockopt, 2619 .getsockopt = vsock_connectible_getsockopt, 2620 .sendmsg = vsock_connectible_sendmsg, 2621 .recvmsg = vsock_connectible_recvmsg, 2622 .mmap = sock_no_mmap, 2623 .set_rcvlowat = vsock_set_rcvlowat, 2624 .read_skb = vsock_read_skb, 2625 }; 2626 2627 static const struct proto_ops vsock_seqpacket_ops = { 2628 .family = PF_VSOCK, 2629 .owner = THIS_MODULE, 2630 .release = vsock_release, 2631 .bind = vsock_bind, 2632 .connect = vsock_connect, 2633 .socketpair = sock_no_socketpair, 2634 .accept = vsock_accept, 2635 .getname = vsock_getname, 2636 .poll = vsock_poll, 2637 .ioctl = vsock_ioctl, 2638 .listen = vsock_listen, 2639 .shutdown = vsock_shutdown, 2640 .setsockopt = vsock_connectible_setsockopt, 2641 .getsockopt = vsock_connectible_getsockopt, 2642 .sendmsg = vsock_connectible_sendmsg, 2643 .recvmsg = vsock_connectible_recvmsg, 2644 .mmap = sock_no_mmap, 2645 .read_skb = vsock_read_skb, 2646 }; 2647 2648 static int vsock_create(struct net *net, struct socket *sock, 2649 int protocol, int kern) 2650 { 2651 struct vsock_sock *vsk; 2652 struct sock *sk; 2653 int ret; 2654 2655 if (!sock) 2656 return -EINVAL; 2657 2658 if (protocol && protocol != PF_VSOCK) 2659 return -EPROTONOSUPPORT; 2660 2661 switch (sock->type) { 2662 case SOCK_DGRAM: 2663 sock->ops = &vsock_dgram_ops; 2664 break; 2665 case SOCK_STREAM: 2666 sock->ops = &vsock_stream_ops; 2667 break; 2668 case SOCK_SEQPACKET: 2669 sock->ops = &vsock_seqpacket_ops; 2670 break; 2671 default: 2672 return -ESOCKTNOSUPPORT; 2673 } 2674 
2675 sock->state = SS_UNCONNECTED; 2676 2677 sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); 2678 if (!sk) 2679 return -ENOMEM; 2680 2681 vsk = vsock_sk(sk); 2682 2683 if (sock->type == SOCK_DGRAM) { 2684 ret = vsock_assign_transport(vsk, NULL); 2685 if (ret < 0) { 2686 sock->sk = NULL; 2687 sock_put(sk); 2688 return ret; 2689 } 2690 } 2691 2692 /* SOCK_DGRAM doesn't have 'setsockopt' callback set in its 2693 * proto_ops, so there is no handler for custom logic. 2694 */ 2695 if (sock_type_connectible(sock->type)) 2696 set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); 2697 2698 vsock_insert_unbound(vsk); 2699 2700 return 0; 2701 } 2702 2703 static const struct net_proto_family vsock_family_ops = { 2704 .family = AF_VSOCK, 2705 .create = vsock_create, 2706 .owner = THIS_MODULE, 2707 }; 2708 2709 static long vsock_dev_do_ioctl(struct file *filp, 2710 unsigned int cmd, void __user *ptr) 2711 { 2712 u32 __user *p = ptr; 2713 int retval = 0; 2714 u32 cid; 2715 2716 switch (cmd) { 2717 case IOCTL_VM_SOCKETS_GET_LOCAL_CID: 2718 /* To be compatible with the VMCI behavior, we prioritize the 2719 * guest CID instead of well-know host CID (VMADDR_CID_HOST). 
2720 */ 2721 cid = vsock_registered_transport_cid(&transport_g2h); 2722 if (cid == VMADDR_CID_ANY) 2723 cid = vsock_registered_transport_cid(&transport_h2g); 2724 if (cid == VMADDR_CID_ANY) 2725 cid = vsock_registered_transport_cid(&transport_local); 2726 2727 if (put_user(cid, p) != 0) 2728 retval = -EFAULT; 2729 break; 2730 2731 default: 2732 retval = -ENOIOCTLCMD; 2733 } 2734 2735 return retval; 2736 } 2737 2738 static long vsock_dev_ioctl(struct file *filp, 2739 unsigned int cmd, unsigned long arg) 2740 { 2741 return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); 2742 } 2743 2744 #ifdef CONFIG_COMPAT 2745 static long vsock_dev_compat_ioctl(struct file *filp, 2746 unsigned int cmd, unsigned long arg) 2747 { 2748 return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); 2749 } 2750 #endif 2751 2752 static const struct file_operations vsock_device_ops = { 2753 .owner = THIS_MODULE, 2754 .unlocked_ioctl = vsock_dev_ioctl, 2755 #ifdef CONFIG_COMPAT 2756 .compat_ioctl = vsock_dev_compat_ioctl, 2757 #endif 2758 .open = nonseekable_open, 2759 }; 2760 2761 static struct miscdevice vsock_device = { 2762 .name = "vsock", 2763 .fops = &vsock_device_ops, 2764 }; 2765 2766 static int __vsock_net_mode_string(const struct ctl_table *table, int write, 2767 void *buffer, size_t *lenp, loff_t *ppos, 2768 enum vsock_net_mode mode, 2769 enum vsock_net_mode *new_mode) 2770 { 2771 char data[VSOCK_NET_MODE_STR_MAX] = {0}; 2772 struct ctl_table tmp; 2773 int ret; 2774 2775 if (!table->data || !table->maxlen || !*lenp) { 2776 *lenp = 0; 2777 return 0; 2778 } 2779 2780 tmp = *table; 2781 tmp.data = data; 2782 2783 if (!write) { 2784 const char *p; 2785 2786 switch (mode) { 2787 case VSOCK_NET_MODE_GLOBAL: 2788 p = VSOCK_NET_MODE_STR_GLOBAL; 2789 break; 2790 case VSOCK_NET_MODE_LOCAL: 2791 p = VSOCK_NET_MODE_STR_LOCAL; 2792 break; 2793 default: 2794 WARN_ONCE(true, "netns has invalid vsock mode"); 2795 *lenp = 0; 2796 return 0; 2797 } 2798 2799 strscpy(data, p, sizeof(data)); 2800 
tmp.maxlen = strlen(p); 2801 } 2802 2803 ret = proc_dostring(&tmp, write, buffer, lenp, ppos); 2804 if (ret || !write) 2805 return ret; 2806 2807 if (*lenp >= sizeof(data)) 2808 return -EINVAL; 2809 2810 if (!strncmp(data, VSOCK_NET_MODE_STR_GLOBAL, sizeof(data))) 2811 *new_mode = VSOCK_NET_MODE_GLOBAL; 2812 else if (!strncmp(data, VSOCK_NET_MODE_STR_LOCAL, sizeof(data))) 2813 *new_mode = VSOCK_NET_MODE_LOCAL; 2814 else 2815 return -EINVAL; 2816 2817 return 0; 2818 } 2819 2820 static int vsock_net_mode_string(const struct ctl_table *table, int write, 2821 void *buffer, size_t *lenp, loff_t *ppos) 2822 { 2823 struct net *net; 2824 2825 if (write) 2826 return -EPERM; 2827 2828 net = current->nsproxy->net_ns; 2829 2830 return __vsock_net_mode_string(table, write, buffer, lenp, ppos, 2831 vsock_net_mode(net), NULL); 2832 } 2833 2834 static int vsock_net_child_mode_string(const struct ctl_table *table, int write, 2835 void *buffer, size_t *lenp, loff_t *ppos) 2836 { 2837 enum vsock_net_mode new_mode; 2838 struct net *net; 2839 int ret; 2840 2841 net = current->nsproxy->net_ns; 2842 2843 ret = __vsock_net_mode_string(table, write, buffer, lenp, ppos, 2844 vsock_net_child_mode(net), &new_mode); 2845 if (ret) 2846 return ret; 2847 2848 if (write) { 2849 /* Prevent a "local" namespace from escalating to "global", 2850 * which would give nested namespaces access to global CIDs. 
2851 */ 2852 if (vsock_net_mode(net) == VSOCK_NET_MODE_LOCAL && 2853 new_mode == VSOCK_NET_MODE_GLOBAL) 2854 return -EPERM; 2855 2856 vsock_net_set_child_mode(net, new_mode); 2857 } 2858 2859 return 0; 2860 } 2861 2862 static struct ctl_table vsock_table[] = { 2863 { 2864 .procname = "ns_mode", 2865 .data = &init_net.vsock.mode, 2866 .maxlen = VSOCK_NET_MODE_STR_MAX, 2867 .mode = 0444, 2868 .proc_handler = vsock_net_mode_string 2869 }, 2870 { 2871 .procname = "child_ns_mode", 2872 .data = &init_net.vsock.child_ns_mode, 2873 .maxlen = VSOCK_NET_MODE_STR_MAX, 2874 .mode = 0644, 2875 .proc_handler = vsock_net_child_mode_string 2876 }, 2877 }; 2878 2879 static int __net_init vsock_sysctl_register(struct net *net) 2880 { 2881 struct ctl_table *table; 2882 2883 if (net_eq(net, &init_net)) { 2884 table = vsock_table; 2885 } else { 2886 table = kmemdup(vsock_table, sizeof(vsock_table), GFP_KERNEL); 2887 if (!table) 2888 goto err_alloc; 2889 2890 table[0].data = &net->vsock.mode; 2891 table[1].data = &net->vsock.child_ns_mode; 2892 } 2893 2894 net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table, 2895 ARRAY_SIZE(vsock_table)); 2896 if (!net->vsock.sysctl_hdr) 2897 goto err_reg; 2898 2899 return 0; 2900 2901 err_reg: 2902 if (!net_eq(net, &init_net)) 2903 kfree(table); 2904 err_alloc: 2905 return -ENOMEM; 2906 } 2907 2908 static void vsock_sysctl_unregister(struct net *net) 2909 { 2910 const struct ctl_table *table; 2911 2912 table = net->vsock.sysctl_hdr->ctl_table_arg; 2913 unregister_net_sysctl_table(net->vsock.sysctl_hdr); 2914 if (!net_eq(net, &init_net)) 2915 kfree(table); 2916 } 2917 2918 static void vsock_net_init(struct net *net) 2919 { 2920 if (net_eq(net, &init_net)) 2921 net->vsock.mode = VSOCK_NET_MODE_GLOBAL; 2922 else 2923 net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns); 2924 2925 net->vsock.child_ns_mode = net->vsock.mode; 2926 } 2927 2928 static __net_init int vsock_sysctl_init_net(struct net *net) 2929 { 2930 
vsock_net_init(net); 2931 2932 if (vsock_sysctl_register(net)) 2933 return -ENOMEM; 2934 2935 return 0; 2936 } 2937 2938 static __net_exit void vsock_sysctl_exit_net(struct net *net) 2939 { 2940 vsock_sysctl_unregister(net); 2941 } 2942 2943 static struct pernet_operations vsock_sysctl_ops = { 2944 .init = vsock_sysctl_init_net, 2945 .exit = vsock_sysctl_exit_net, 2946 }; 2947 2948 static int __init vsock_init(void) 2949 { 2950 int err = 0; 2951 2952 vsock_init_tables(); 2953 2954 vsock_proto.owner = THIS_MODULE; 2955 vsock_device.minor = MISC_DYNAMIC_MINOR; 2956 err = misc_register(&vsock_device); 2957 if (err) { 2958 pr_err("Failed to register misc device\n"); 2959 goto err_reset_transport; 2960 } 2961 2962 err = proto_register(&vsock_proto, 1); /* we want our slab */ 2963 if (err) { 2964 pr_err("Cannot register vsock protocol\n"); 2965 goto err_deregister_misc; 2966 } 2967 2968 err = sock_register(&vsock_family_ops); 2969 if (err) { 2970 pr_err("could not register af_vsock (%d) address family: %d\n", 2971 AF_VSOCK, err); 2972 goto err_unregister_proto; 2973 } 2974 2975 if (register_pernet_subsys(&vsock_sysctl_ops)) { 2976 err = -ENOMEM; 2977 goto err_unregister_sock; 2978 } 2979 2980 vsock_bpf_build_proto(); 2981 2982 return 0; 2983 2984 err_unregister_sock: 2985 sock_unregister(AF_VSOCK); 2986 err_unregister_proto: 2987 proto_unregister(&vsock_proto); 2988 err_deregister_misc: 2989 misc_deregister(&vsock_device); 2990 err_reset_transport: 2991 return err; 2992 } 2993 2994 static void __exit vsock_exit(void) 2995 { 2996 misc_deregister(&vsock_device); 2997 sock_unregister(AF_VSOCK); 2998 proto_unregister(&vsock_proto); 2999 unregister_pernet_subsys(&vsock_sysctl_ops); 3000 } 3001 3002 const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) 3003 { 3004 return vsk->transport; 3005 } 3006 EXPORT_SYMBOL_GPL(vsock_core_get_transport); 3007 3008 int vsock_core_register(const struct vsock_transport *t, int features) 3009 { 3010 const struct 
vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local; 3011 int err = mutex_lock_interruptible(&vsock_register_mutex); 3012 3013 if (err) 3014 return err; 3015 3016 t_h2g = transport_h2g; 3017 t_g2h = transport_g2h; 3018 t_dgram = transport_dgram; 3019 t_local = transport_local; 3020 3021 if (features & VSOCK_TRANSPORT_F_H2G) { 3022 if (t_h2g) { 3023 err = -EBUSY; 3024 goto err_busy; 3025 } 3026 t_h2g = t; 3027 } 3028 3029 if (features & VSOCK_TRANSPORT_F_G2H) { 3030 if (t_g2h) { 3031 err = -EBUSY; 3032 goto err_busy; 3033 } 3034 t_g2h = t; 3035 } 3036 3037 if (features & VSOCK_TRANSPORT_F_DGRAM) { 3038 if (t_dgram) { 3039 err = -EBUSY; 3040 goto err_busy; 3041 } 3042 t_dgram = t; 3043 } 3044 3045 if (features & VSOCK_TRANSPORT_F_LOCAL) { 3046 if (t_local) { 3047 err = -EBUSY; 3048 goto err_busy; 3049 } 3050 t_local = t; 3051 } 3052 3053 transport_h2g = t_h2g; 3054 transport_g2h = t_g2h; 3055 transport_dgram = t_dgram; 3056 transport_local = t_local; 3057 3058 err_busy: 3059 mutex_unlock(&vsock_register_mutex); 3060 return err; 3061 } 3062 EXPORT_SYMBOL_GPL(vsock_core_register); 3063 3064 void vsock_core_unregister(const struct vsock_transport *t) 3065 { 3066 mutex_lock(&vsock_register_mutex); 3067 3068 if (transport_h2g == t) 3069 transport_h2g = NULL; 3070 3071 if (transport_g2h == t) 3072 transport_g2h = NULL; 3073 3074 if (transport_dgram == t) 3075 transport_dgram = NULL; 3076 3077 if (transport_local == t) 3078 transport_local = NULL; 3079 3080 mutex_unlock(&vsock_register_mutex); 3081 } 3082 EXPORT_SYMBOL_GPL(vsock_core_unregister); 3083 3084 module_init(vsock_init); 3085 module_exit(vsock_exit); 3086 3087 MODULE_AUTHOR("VMware, Inc."); 3088 MODULE_DESCRIPTION("VMware Virtual Socket Family"); 3089 MODULE_VERSION("1.0.2.0-k"); 3090 MODULE_LICENSE("GPL v2"); 3091