1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel Connection Multiplexor 4 * 5 * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> 6 */ 7 8 #include <linux/bpf.h> 9 #include <linux/errno.h> 10 #include <linux/errqueue.h> 11 #include <linux/file.h> 12 #include <linux/filter.h> 13 #include <linux/in.h> 14 #include <linux/kernel.h> 15 #include <linux/module.h> 16 #include <linux/net.h> 17 #include <linux/netdevice.h> 18 #include <linux/poll.h> 19 #include <linux/rculist.h> 20 #include <linux/skbuff.h> 21 #include <linux/socket.h> 22 #include <linux/splice.h> 23 #include <linux/uaccess.h> 24 #include <linux/workqueue.h> 25 #include <linux/syscalls.h> 26 #include <linux/sched/signal.h> 27 #include <linux/uio.h> 28 29 #include <net/kcm.h> 30 #include <net/netns/generic.h> 31 #include <net/sock.h> 32 #include <uapi/linux/kcm.h> 33 #include <trace/events/sock.h> 34 35 unsigned int kcm_net_id; 36 37 static struct kmem_cache *kcm_psockp __read_mostly; 38 static struct kmem_cache *kcm_muxp __read_mostly; 39 static struct workqueue_struct *kcm_wq; 40 41 static inline struct kcm_sock *kcm_sk(const struct sock *sk) 42 { 43 return (struct kcm_sock *)sk; 44 } 45 46 static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) 47 { 48 return (struct kcm_tx_msg *)skb->cb; 49 } 50 51 static void report_csk_error(struct sock *csk, int err) 52 { 53 csk->sk_err = EPIPE; 54 sk_error_report(csk); 55 } 56 57 static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, 58 bool wakeup_kcm) 59 { 60 struct sock *csk = psock->sk; 61 struct kcm_mux *mux = psock->mux; 62 63 /* Unrecoverable error in transmit */ 64 65 spin_lock_bh(&mux->lock); 66 67 if (psock->tx_stopped) { 68 spin_unlock_bh(&mux->lock); 69 return; 70 } 71 72 psock->tx_stopped = 1; 73 KCM_STATS_INCR(psock->stats.tx_aborts); 74 75 if (!psock->tx_kcm) { 76 /* Take off psocks_avail list */ 77 list_del(&psock->psock_avail_list); 78 } else if (wakeup_kcm) { 79 /* In this case psock is being aborted while outside of 
80 * write_msgs and psock is reserved. Schedule tx_work 81 * to handle the failure there. Need to commit tx_stopped 82 * before queuing work. 83 */ 84 smp_mb(); 85 86 queue_work(kcm_wq, &psock->tx_kcm->tx_work); 87 } 88 89 spin_unlock_bh(&mux->lock); 90 91 /* Report error on lower socket */ 92 report_csk_error(csk, err); 93 } 94 95 /* RX mux lock held. */ 96 static void kcm_update_rx_mux_stats(struct kcm_mux *mux, 97 struct kcm_psock *psock) 98 { 99 STRP_STATS_ADD(mux->stats.rx_bytes, 100 psock->strp.stats.bytes - 101 psock->saved_rx_bytes); 102 mux->stats.rx_msgs += 103 psock->strp.stats.msgs - psock->saved_rx_msgs; 104 psock->saved_rx_msgs = psock->strp.stats.msgs; 105 psock->saved_rx_bytes = psock->strp.stats.bytes; 106 } 107 108 static void kcm_update_tx_mux_stats(struct kcm_mux *mux, 109 struct kcm_psock *psock) 110 { 111 KCM_STATS_ADD(mux->stats.tx_bytes, 112 psock->stats.tx_bytes - psock->saved_tx_bytes); 113 mux->stats.tx_msgs += 114 psock->stats.tx_msgs - psock->saved_tx_msgs; 115 psock->saved_tx_msgs = psock->stats.tx_msgs; 116 psock->saved_tx_bytes = psock->stats.tx_bytes; 117 } 118 119 static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 120 121 /* KCM is ready to receive messages on its queue-- either the KCM is new or 122 * has become unblocked after being blocked on full socket buffer. Queue any 123 * pending ready messages on a psock. RX mux lock held. 
*/
static void kcm_rcv_ready(struct kcm_sock *kcm)
{
	struct kcm_mux *mux = kcm->mux;
	struct kcm_psock *psock;
	struct sk_buff *skb;

	/* Nothing to do if the socket is already on the waiters list, has a
	 * psock reserved for it, or has receive disabled.
	 */
	if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
		return;

	/* First drain messages parked on the mux hold queue */
	while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
			/* Assuming buffer limit has been reached */
			skb_queue_head(&mux->rx_hold_queue, skb);
			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
			return;
		}
	}

	/* Then take the held message from each psock that is waiting for a
	 * receiver.
	 */
	while (!list_empty(&mux->psocks_ready)) {
		psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
					 psock_ready_list);

		if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
			/* Assuming buffer limit has been reached */
			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
			return;
		}

		/* Consumed the ready message on the psock. Schedule rx_work to
		 * get more messages.
		 */
		list_del(&psock->psock_ready_list);
		psock->ready_rx_msg = NULL;
		/* Commit clearing of ready_rx_msg for queuing work */
		smp_mb();

		strp_unpause(&psock->strp);
		strp_check_rcv(&psock->strp);
	}

	/* Buffer limit is okay now, add to ready list */
	list_add_tail(&kcm->wait_rx_list,
		      &kcm->mux->kcm_rx_waiters);
	/* paired with lockless reads in kcm_rfree() */
	WRITE_ONCE(kcm->rx_wait, true);
}

/* skb destructor installed by kcm_queue_rcv_skb(). Returns the skb's
 * truesize to the owning socket's receive accounting and, when the socket
 * is neither waiting nor reserved and its receive allocation has dropped
 * below sk_rcvlowat, makes the socket ready to receive again.
 */
static void kcm_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct kcm_sock *kcm = kcm_sk(sk);
	struct kcm_mux *mux = kcm->mux;
	unsigned int len = skb->truesize;

	sk_mem_uncharge(sk, len);
	atomic_sub(len, &sk->sk_rmem_alloc);

	/* For reading rx_wait and rx_psock without holding lock */
	smp_mb__after_atomic();

	if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) &&
	    sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
		spin_lock_bh(&mux->rx_lock);
		kcm_rcv_ready(kcm);
		spin_unlock_bh(&mux->rx_lock);
	}
}

/* Queue @skb on @sk's receive queue and charge it to the socket.
 *
 * Returns 0 on success, -ENOMEM when sk_rmem_alloc has already reached
 * sk_rcvbuf, or -ENOBUFS when sk_rmem_schedule() refuses the charge. Both
 * failure returns happen before @skb is modified, so the caller still owns
 * it.
 */
static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		return -ENOMEM;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb->dev = NULL;

	/* Drop any previous owner, then make @sk the owner with kcm_rfree()
	 * as destructor so that freeing the skb uncharges the socket.
	 */
	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = kcm_rfree;
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);

	skb_queue_tail(list, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);

	return 0;
}

/* Requeue received messages for a kcm socket to other kcm sockets. This is
 * called when a kcm socket is receive disabled.
 * RX mux lock held.
 */
static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
{
	struct sk_buff *skb;
	struct kcm_sock *kcm;

	while ((skb = skb_dequeue(head))) {
		/* Reset destructor to avoid calling kcm_rcv_ready */
		skb->destructor = sock_rfree;
		skb_orphan(skb);
try_again:
		/* No socket is waiting: park the message on the mux */
		if (list_empty(&mux->kcm_rx_waiters)) {
			skb_queue_tail(&mux->rx_hold_queue, skb);
			continue;
		}

		kcm = list_first_entry(&mux->kcm_rx_waiters,
				       struct kcm_sock, wait_rx_list);

		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
			/* Should mean socket buffer full */
			list_del(&kcm->wait_rx_list);
			/* paired with lockless reads in kcm_rfree() */
			WRITE_ONCE(kcm->rx_wait, false);

			/* Commit rx_wait to read in kcm_rfree */
			smp_wmb();

			goto try_again;
		}
	}
}

/* Lower sock lock held */
/* Choose the KCM socket that will receive the message @head parsed on
 * @psock.
 *
 * Returns the KCM already reserved by @psock if there is one, otherwise
 * the first socket taken off mux->kcm_rx_waiters. When no socket is
 * waiting, @head is held in psock->ready_rx_msg, the strparser is paused,
 * the psock is put on mux->psocks_ready and NULL is returned.
 */
static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
				       struct sk_buff *head)
{
	struct kcm_mux *mux = psock->mux;
	struct kcm_sock *kcm;

	WARN_ON(psock->ready_rx_msg);

	/* Unlocked fast path; re-checked under rx_lock below */
	if (psock->rx_kcm)
		return psock->rx_kcm;

	spin_lock_bh(&mux->rx_lock);

	if (psock->rx_kcm) {
		spin_unlock_bh(&mux->rx_lock);
		return psock->rx_kcm;
	}

	kcm_update_rx_mux_stats(mux, psock);

	if (list_empty(&mux->kcm_rx_waiters)) {
		psock->ready_rx_msg = head;
		strp_pause(&psock->strp);
		list_add_tail(&psock->psock_ready_list,
			      &mux->psocks_ready);
		spin_unlock_bh(&mux->rx_lock);
		return NULL;
	}

	kcm = list_first_entry(&mux->kcm_rx_waiters,
			       struct kcm_sock, wait_rx_list);
	list_del(&kcm->wait_rx_list);
	/* paired with lockless reads in kcm_rfree() */
	WRITE_ONCE(kcm->rx_wait, false);

	psock->rx_kcm = kcm;
	/* paired with lockless reads in kcm_rfree() */
	WRITE_ONCE(kcm->rx_psock, psock);

	spin_unlock_bh(&mux->rx_lock);

	return kcm;
}

static void kcm_done(struct kcm_sock *kcm);

/* Work handler that runs kcm_done() for the socket embedding @w. */
static void kcm_done_work(struct work_struct *w)
{
	kcm_done(container_of(w, struct kcm_sock, done_work));
}

/* Lower sock held */
/* Release the KCM socket reserved for receive by @psock, if any.
 *
 * If the socket is marked done, kcm_done() is deferred to a work item. If
 * receive is disabled, its queued messages are redistributed. Otherwise,
 * when @rcv_ready is set or nothing is charged to the socket, it is made
 * ready to receive again.
 */
static void unreserve_rx_kcm(struct kcm_psock *psock,
			     bool rcv_ready)
{
	struct kcm_sock *kcm = psock->rx_kcm;
	struct kcm_mux *mux = psock->mux;

	if (!kcm)
		return;

	spin_lock_bh(&mux->rx_lock);

	psock->rx_kcm = NULL;
	/* paired with lockless reads in kcm_rfree() */
	WRITE_ONCE(kcm->rx_psock, NULL);

	/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
	 * kcm_rfree
	 */
	smp_mb();

	if (unlikely(kcm->done)) {
		spin_unlock_bh(&mux->rx_lock);

		/* Need to run kcm_done in a task since we need to acquire
		 * callback locks which may already be held here.
		 */
		INIT_WORK(&kcm->done_work, kcm_done_work);
		schedule_work(&kcm->done_work);
		return;
	}

	if (unlikely(kcm->rx_disabled)) {
		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
	} else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
		/* Check for degenerative race with rx_wait that all
		 * data was dequeued (accounted for in kcm_rfree).
		 */
		kcm_rcv_ready(kcm);
	}
	spin_unlock_bh(&mux->rx_lock);
}

/* Lower sock lock held */
/* sk_data_ready callback installed on an attached lower socket by
 * kcm_attach(). Hands the event to the psock's strparser; sk_user_data is
 * read under sk_callback_lock and may be NULL.
 */
static void psock_data_ready(struct sock *sk)
{
	struct kcm_psock *psock;

	trace_sk_data_ready(sk);

	read_lock_bh(&sk->sk_callback_lock);

	psock = (struct kcm_psock *)sk->sk_user_data;
	if (likely(psock))
		strp_data_ready(&psock->strp);

	read_unlock_bh(&sk->sk_callback_lock);
}

/* Called with lower sock held */
/* strparser rcv_msg callback: deliver a complete message to a KCM socket.
 * If the reserved socket cannot take it, that socket is unreserved and
 * another one is tried; when none is available the message stays held on
 * the psock by reserve_rx_kcm().
 */
static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb)
{
	struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
	struct kcm_sock *kcm;

try_queue:
	kcm = reserve_rx_kcm(psock, skb);
	if (!kcm) {
		/* Unable to reserve a KCM, message is held in psock and strp
		 * is paused.
		 */
		return;
	}

	if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
		/* Should mean socket buffer full */
		unreserve_rx_kcm(psock, false);
		goto try_queue;
	}
}

/* strparser parse_msg callback: run the psock's BPF program on @skb and
 * return its result unchanged.
 * NOTE(review): the meaning of the return value is defined by strparser,
 * presumably the message length or an error -- confirm against the
 * strparser documentation.
 */
static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
{
	struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
	struct bpf_prog *prog = psock->bpf_prog;
	int res;

	res = bpf_prog_run_pin_on_cpu(prog, skb);
	return res;
}

/* strparser read_sock_done callback: release the KCM reserved for receive
 * (allowing it to become ready again) and pass @err through.
 */
static int kcm_read_sock_done(struct strparser *strp, int err)
{
	struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);

	unreserve_rx_kcm(psock, true);

	return err;
}

/* sk_state_change callback installed on an attached lower socket. */
static void psock_state_change(struct sock *sk)
{
	/* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here
	 * since application will normally not poll with EPOLLIN
	 * on the TCP sockets.
	 */

	report_csk_error(sk, EPIPE);
}

/* sk_write_space callback installed on an attached lower socket. If the
 * psock is currently reserved for transmit, queue the owning KCM's tx_work
 * so that it can continue sending.
 */
static void psock_write_space(struct sock *sk)
{
	struct kcm_psock *psock;
	struct kcm_mux *mux;
	struct kcm_sock *kcm;

	read_lock_bh(&sk->sk_callback_lock);

	psock = (struct kcm_psock *)sk->sk_user_data;
	if (unlikely(!psock))
		goto out;
	mux = psock->mux;

	spin_lock_bh(&mux->lock);

	/* Check if the socket is reserved so someone is waiting for sending. */
	kcm = psock->tx_kcm;
	if (kcm)
		queue_work(kcm_wq, &kcm->tx_work);

	spin_unlock_bh(&mux->lock);
out:
	read_unlock_bh(&sk->sk_callback_lock);
}

static void unreserve_psock(struct kcm_sock *kcm);

/* kcm sock is locked. */
/* Get a psock for @kcm to transmit on.
 *
 * Returns the psock already reserved by @kcm, or the first one from
 * mux->psocks_avail. When none is available, @kcm is placed on
 * mux->kcm_tx_waiters (once) and NULL is returned.
 */
static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
{
	struct kcm_mux *mux = kcm->mux;
	struct kcm_psock *psock;

	psock = kcm->tx_psock;

	smp_rmb(); /* Must read tx_psock before tx_wait */

	if (psock) {
		WARN_ON(kcm->tx_wait);
		/* A stopped psock is released here and a new one is looked
		 * for below.
		 */
		if (unlikely(psock->tx_stopped))
			unreserve_psock(kcm);
		else
			return kcm->tx_psock;
	}

	spin_lock_bh(&mux->lock);

	/* Check again under lock to see if a psock was reserved for this
	 * kcm in the meantime (psock_now_avail() assigns tx_psock to a
	 * waiting kcm).
	 */
	psock = kcm->tx_psock;
	if (unlikely(psock)) {
		WARN_ON(kcm->tx_wait);
		spin_unlock_bh(&mux->lock);
		return kcm->tx_psock;
	}

	if (!list_empty(&mux->psocks_avail)) {
		psock = list_first_entry(&mux->psocks_avail,
					 struct kcm_psock,
					 psock_avail_list);
		list_del(&psock->psock_avail_list);
		if (kcm->tx_wait) {
			list_del(&kcm->wait_psock_list);
			kcm->tx_wait = false;
		}
		kcm->tx_psock = psock;
		psock->tx_kcm = kcm;
		KCM_STATS_INCR(psock->stats.reserved);
	} else if (!kcm->tx_wait) {
		list_add_tail(&kcm->wait_psock_list,
			      &mux->kcm_tx_waiters);
		kcm->tx_wait = true;
	}

	spin_unlock_bh(&mux->lock);

	/* NULL here when no psock was available */
	return psock;
}

/* mux lock held */
/* Make @psock available for transmit: hand it to the first KCM waiting for
 * a psock and queue that KCM's tx_work, or put it on mux->psocks_avail if
 * nobody is waiting.
 */
static void psock_now_avail(struct kcm_psock *psock)
{
	struct kcm_mux *mux = psock->mux;
	struct kcm_sock *kcm;

	if (list_empty(&mux->kcm_tx_waiters)) {
		list_add_tail(&psock->psock_avail_list,
			      &mux->psocks_avail);
	} else {
		kcm = list_first_entry(&mux->kcm_tx_waiters,
				       struct kcm_sock,
				       wait_psock_list);
		list_del(&kcm->wait_psock_list);
		kcm->tx_wait = false;
		psock->tx_kcm = kcm;

		/* Commit before changing tx_psock since that is read in
		 * reserve_psock before queuing work.
		 */
		smp_mb();

		kcm->tx_psock = psock;
		KCM_STATS_INCR(psock->stats.reserved);
		queue_work(kcm_wq, &kcm->tx_work);
	}
}

/* kcm sock is locked.
*/
static void unreserve_psock(struct kcm_sock *kcm)
{
	struct kcm_psock *psock;
	struct kcm_mux *mux = kcm->mux;

	spin_lock_bh(&mux->lock);

	psock = kcm->tx_psock;

	if (WARN_ON(!psock)) {
		spin_unlock_bh(&mux->lock);
		return;
	}

	smp_rmb(); /* Read tx_psock before tx_wait */

	/* Fold the psock's TX counters into the mux before letting it go */
	kcm_update_tx_mux_stats(mux, psock);

	WARN_ON(kcm->tx_wait);

	kcm->tx_psock = NULL;
	psock->tx_kcm = NULL;
	KCM_STATS_INCR(psock->stats.unreserved);

	if (unlikely(psock->tx_stopped)) {
		if (psock->done) {
			/* Deferred free: the psock was marked done while it
			 * was still reserved, so the final teardown happens
			 * here.
			 */
			list_del(&psock->psock_list);
			mux->psocks_cnt--;
			sock_put(psock->sk);
			fput(psock->sk->sk_socket->file);
			kmem_cache_free(kcm_psockp, psock);
		}

		/* Don't put back on available list */

		spin_unlock_bh(&mux->lock);

		return;
	}

	psock_now_avail(psock);

	spin_unlock_bh(&mux->lock);
}

/* Count one transmit retry in the mux statistics (takes the mux lock). */
static void kcm_report_tx_retry(struct kcm_sock *kcm)
{
	struct kcm_mux *mux = kcm->mux;

	spin_lock_bh(&mux->lock);
	KCM_STATS_INCR(mux->stats.tx_retries);
	spin_unlock_bh(&mux->lock);
}

/* Write any messages ready on the kcm socket. Called with kcm sock lock
 * held. Return bytes actually sent or error.
 */
static int kcm_write_msgs(struct kcm_sock *kcm)
{
	unsigned int total_sent = 0;
	struct sock *sk = &kcm->sk;
	struct kcm_psock *psock;
	struct sk_buff *head;
	int ret = 0;

	kcm->tx_wait_more = false;
	psock = kcm->tx_psock;
	if (unlikely(psock && psock->tx_stopped)) {
		/* A reserved psock was aborted asynchronously. Unreserve
		 * it and we'll retry the message.
		 */
		unreserve_psock(kcm);
		kcm_report_tx_retry(kcm);
		if (skb_queue_empty(&sk->sk_write_queue))
			return 0;

		/* Restart the message at the head of the queue from scratch */
		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false;
	}

retry:
	while ((head = skb_peek(&sk->sk_write_queue))) {
		struct msghdr msg = {
			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
		};
		struct kcm_tx_msg *txm = kcm_tx_msg(head);
		struct sk_buff *skb;
		unsigned int msize;
		int i;

		if (!txm->started_tx) {
			/* New message: needs a psock, starts at the head skb */
			psock = reserve_psock(kcm);
			if (!psock)
				goto out;
			skb = head;
			txm->frag_offset = 0;
			txm->sent = 0;
			txm->started_tx = true;
		} else {
			/* Resume a partially sent message where it stopped */
			if (WARN_ON(!psock)) {
				ret = -EINVAL;
				goto out;
			}
			skb = txm->frag_skb;
		}

		/* Every skb of a queued message must carry page frags */
		if (WARN_ON_ONCE(!skb_shinfo(skb)->nr_frags) ||
		    WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
			ret = -EINVAL;
			goto out;
		}

		/* Total payload of this skb = sum of its frag sizes */
		msize = 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			msize += skb_frag_size(&skb_shinfo(skb)->frags[i]);

		/* Build a bvec iterator over the frag array and skip what
		 * has already been sent from this skb.
		 */
		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
			      (const struct bio_vec *)skb_shinfo(skb)->frags,
			      skb_shinfo(skb)->nr_frags, msize);
		iov_iter_advance(&msg.msg_iter, txm->frag_offset);

		do {
			ret = sock_sendmsg(psock->sk->sk_socket, &msg);
			if (ret <= 0) {
				if (ret == -EAGAIN) {
					/* Save state to try again when there's
					 * write space on the socket
					 */
					txm->frag_skb = skb;
					ret = 0;
					goto out;
				}

				/* Hard failure in sending message, abort this
				 * psock since it has lost framing
				 * synchronization and retry sending the
				 * message from the beginning.
				 */
				kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
						   true);
				unreserve_psock(kcm);
				psock = NULL;

				txm->started_tx = false;
				kcm_report_tx_retry(kcm);
				ret = 0;
				goto retry;
			}

			txm->sent += ret;
			txm->frag_offset += ret;
			KCM_STATS_ADD(psock->stats.tx_bytes, ret);
		} while (msg.msg_iter.count > 0);

		/* This skb is fully sent; move on to the next skb of the
		 * message: the head's frag_list first, then the ->next chain.
		 */
		if (skb == head) {
			if (skb_has_frag_list(skb)) {
				txm->frag_skb = skb_shinfo(skb)->frag_list;
				txm->frag_offset = 0;
				continue;
			}
		} else if (skb->next) {
			txm->frag_skb = skb->next;
			txm->frag_offset = 0;
			continue;
		}

		/* Successfully sent the whole packet, account for it. */
		sk->sk_wmem_queued -= txm->sent;
		total_sent += txm->sent;
		skb_dequeue(&sk->sk_write_queue);
		kfree_skb(head);
		KCM_STATS_INCR(psock->stats.tx_msgs);
	}
out:
	if (!head) {
		/* Done with all queued messages. */
		WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
		if (psock)
			unreserve_psock(kcm);
	}

	/* Check if write space is available */
	sk->sk_write_space(sk);

	/* Bytes sent take precedence over a later error */
	return total_sent ? : ret;
}

/* Work handler for kcm->tx_work: push queued messages under the socket
 * lock, report a hard failure on the KCM socket, and otherwise signal
 * write space if SOCK_NOSPACE was set.
 */
static void kcm_tx_work(struct work_struct *w)
{
	struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
	struct sock *sk = &kcm->sk;
	int err;

	lock_sock(sk);

	/* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
	 * aborts
	 */
	err = kcm_write_msgs(kcm);
	if (err < 0) {
		/* Hard failure in write, report error on KCM socket */
		pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
		report_csk_error(&kcm->sk, -err);
		goto out;
	}

	/* Primarily for SOCK_SEQPACKET sockets */
	if (likely(sk->sk_socket) &&
	    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		sk->sk_write_space(sk);
	}

out:
	release_sock(sk);
}

/* Flush messages that were held back by an earlier MSG_BATCH send
 * (tx_wait_more set). The result of kcm_write_msgs() is ignored.
 */
static void kcm_push(struct kcm_sock *kcm)
{
	if (kcm->tx_wait_more)
		kcm_write_msgs(kcm);
}

/* sendmsg handler for KCM sockets.
 *
 * Copies (or splices, with MSG_SPLICE_PAGES) the data of @msg into page
 * frags of a head skb, chaining further skbs on the head's frag_list once
 * MAX_SKB_FRAGS is reached. A message is complete when "eor" is true: for
 * SOCK_DGRAM that is the absence of MSG_MORE, otherwise the presence of
 * MSG_EOR. A complete message is queued on sk_write_queue and transmitted
 * unless MSG_BATCH asks to hold it; an incomplete one is kept in
 * kcm->seq_skb for the next call.
 *
 * Returns the number of bytes accepted, or a negative error.
 */
static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct kcm_sock *kcm = kcm_sk(sk);
	struct sk_buff *skb = NULL, *head = NULL, *frag_prev = NULL;
	size_t copy, copied = 0;
	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
	int eor = (sock->type == SOCK_DGRAM) ?
		  !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
	int err = -EPIPE;

	mutex_lock(&kcm->tx_mutex);
	lock_sock(sk);

	/* Per tcp_sendmsg this should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	if (sk->sk_err)
		goto out_error;

	if (kcm->seq_skb) {
		/* Previously opened message */
		head = kcm->seq_skb;
		skb = kcm_tx_msg(head)->last_skb;
		goto start;
	}

	/* Call the sk_stream functions to manage the sndbuf mem. */
	if (!sk_stream_memory_free(sk)) {
		kcm_push(kcm);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = sk_stream_wait_memory(sk, &timeo);
		if (err)
			goto out_error;
	}

	if (msg_data_left(msg)) {
		/* New message, alloc head skb */
		head = alloc_skb(0, sk->sk_allocation);
		while (!head) {
			kcm_push(kcm);
			err = sk_stream_wait_memory(sk, &timeo);
			if (err)
				goto out_error;

			head = alloc_skb(0, sk->sk_allocation);
		}

		skb = head;

		/* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
		 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
		 */
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

start:
	while (msg_data_left(msg)) {
		bool merge = true;
		int i = skb_shinfo(skb)->nr_frags;
		struct page_frag *pfrag = sk_page_frag(sk);

		if (!sk_page_frag_refill(sk, pfrag))
			goto wait_for_memory;

		if (!skb_can_coalesce(skb, i, pfrag->page,
				      pfrag->offset)) {
			if (i == MAX_SKB_FRAGS) {
				/* Current skb is full: chain a fresh one,
				 * off the head's frag_list or off the
				 * previous skb's ->next.
				 */
				struct sk_buff *tskb;

				tskb = alloc_skb(0, sk->sk_allocation);
				if (!tskb)
					goto wait_for_memory;

				if (head == skb)
					skb_shinfo(head)->frag_list = tskb;
				else
					skb->next = tskb;

				/* Remember the predecessor so that an empty
				 * tskb can be unlinked in out_error.
				 */
				frag_prev = skb;
				skb = tskb;
				skb->ip_summed = CHECKSUM_UNNECESSARY;
				continue;
			}
			merge = false;
		}

		if (msg->msg_flags & MSG_SPLICE_PAGES) {
			copy = msg_data_left(msg);
			if (!sk_wmem_schedule(sk, copy))
				goto wait_for_memory;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0) {
				if (err == -EMSGSIZE)
					goto wait_for_memory;
				goto out_error;
			}

			/* A non-negative return is the byte count spliced */
			copy = err;
			skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
			sk_wmem_queued_add(sk, copy);
			sk_mem_charge(sk, copy);

			if (head != skb)
				head->truesize += copy;
		} else {
			copy = min_t(int, msg_data_left(msg),
				     pfrag->size - pfrag->offset);
			if (!sk_wmem_schedule(sk, copy))
				goto wait_for_memory;

			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
						       pfrag->page,
						       pfrag->offset,
						       copy);
			if (err)
				goto out_error;

			/* Update the skb. */
			if (merge) {
				skb_frag_size_add(
					&skb_shinfo(skb)->frags[i - 1], copy);
			} else {
				skb_fill_page_desc(skb, i, pfrag->page,
						   pfrag->offset, copy);
				get_page(pfrag->page);
			}

			pfrag->offset += copy;
		}

		copied += copy;
		/* Keep the head's totals in sync with data added to a
		 * chained skb.
		 */
		if (head != skb) {
			head->len += copy;
			head->data_len += copy;
		}

		continue;

wait_for_memory:
		kcm_push(kcm);
		err = sk_stream_wait_memory(sk, &timeo);
		if (err)
			goto out_error;
	}

	if (eor) {
		bool not_busy = skb_queue_empty(&sk->sk_write_queue);

		if (head) {
			/* Message complete, queue it on send buffer */
			__skb_queue_tail(&sk->sk_write_queue, head);
			kcm->seq_skb = NULL;
			KCM_STATS_INCR(kcm->stats.tx_msgs);
		}

		if (msg->msg_flags & MSG_BATCH) {
			kcm->tx_wait_more = true;
		} else if (kcm->tx_wait_more || not_busy) {
			err = kcm_write_msgs(kcm);
			if (err < 0) {
				/* We got a hard error in write_msgs but have
				 * already queued this message. Report an error
				 * in the socket, but don't affect return value
				 * from sendmsg
				 */
				pr_warn("KCM: Hard failure on kcm_write_msgs\n");
				report_csk_error(&kcm->sk, -err);
			}
		}
	} else {
		/* Message not complete, save state */
partial_message:
		if (head) {
			kcm->seq_skb = head;
			kcm_tx_msg(head)->last_skb = skb;
		}
	}

	KCM_STATS_ADD(kcm->stats.tx_bytes, copied);

	release_sock(sk);
	mutex_unlock(&kcm->tx_mutex);
	return copied;

out_error:
	kcm_push(kcm);

	/* When MAX_SKB_FRAGS was reached, a new skb was allocated and
	 * linked into the frag_list before data copy. If the copy
	 * subsequently failed, this skb has zero frags. Remove it from
	 * the frag_list to prevent kcm_write_msgs from later hitting
	 * WARN_ON(!skb_shinfo(skb)->nr_frags).
	 */
	if (frag_prev && !skb_shinfo(skb)->nr_frags) {
		if (head == frag_prev)
			skb_shinfo(head)->frag_list = NULL;
		else
			frag_prev->next = NULL;
		kfree_skb(skb);
		/* Update skb as it may be saved in partial_message via goto */
		skb = frag_prev;
	}

	if (sock->type == SOCK_SEQPACKET) {
		/* Wrote some bytes before encountering an
		 * error, return partial success.
		 */
		if (copied)
			goto partial_message;
		/* Nothing copied: free only a head allocated by this call,
		 * not a message saved by an earlier one.
		 */
		if (head != kcm->seq_skb)
			kfree_skb(head);
	} else {
		kfree_skb(head);
		kcm->seq_skb = NULL;
	}

	err = sk_stream_error(sk, msg->msg_flags, err);

	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
		sk->sk_write_space(sk);

	release_sock(sk);
	mutex_unlock(&kcm->tx_mutex);
	return err;
}

/* splice_eof handler: if anything is queued for transmit, push it out
 * under the socket lock. The emptiness check is done without the lock.
 */
static void kcm_splice_eof(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct kcm_sock *kcm = kcm_sk(sk);

	if (skb_queue_empty_lockless(&sk->sk_write_queue))
		return;

	lock_sock(sk);
	kcm_write_msgs(kcm);
	release_sock(sk);
}

/* recvmsg handler for KCM sockets.
 *
 * Copies up to @len bytes of the next message (bounded by the message's
 * full_len) into @msg. Unless MSG_PEEK is set, a short read on a
 * SOCK_DGRAM socket sets MSG_TRUNC and finishes the message, a short read
 * on other socket types advances the message offset, and a full read sets
 * MSG_EOR.
 *
 * Returns the number of bytes copied, or the error when nothing was
 * copied.
 */
static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
		       size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct kcm_sock *kcm = kcm_sk(sk);
	int err = 0;
	struct strp_msg *stm;
	int copied = 0;
	struct sk_buff *skb;

	skb = skb_recv_datagram(sk, flags, &err);
	if (!skb)
		goto out;

	/* Okay, have a message on the receive queue */

	stm = strp_msg(skb);

	if (len > stm->full_len)
		len = stm->full_len;

	err = skb_copy_datagram_msg(skb, stm->offset, msg, len);
	if (err < 0)
		goto out;

	copied = len;
	if (likely(!(flags & MSG_PEEK))) {
		KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
		if (copied < stm->full_len) {
			if (sock->type == SOCK_DGRAM) {
				/* Truncated message */
				msg->msg_flags |= MSG_TRUNC;
				goto msg_finished;
			}
			stm->offset += copied;
			stm->full_len -= copied;
		} else {
msg_finished:
			/* Finished with message */
			msg->msg_flags |= MSG_EOR;
			KCM_STATS_INCR(kcm->stats.rx_msgs);
		}
	}

out:
	/* Reached with skb == NULL when no message was received */
	skb_free_datagram(sk, skb);
	return copied ? : err;
}

/* splice_read handler for KCM sockets: splice up to @len bytes of the next
 * message into @pipe. Returns the number of bytes spliced or a negative
 * error.
 */
static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct kcm_sock *kcm = kcm_sk(sk);
	struct strp_msg *stm;
	int err = 0;
	ssize_t copied;
	struct sk_buff *skb;

	/* Translate file/splice non-blocking flags into a recv flag */
	if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK)
		flags = MSG_DONTWAIT;
	else
		flags = 0;

	/* Only support splice for SOCK_SEQPACKET */

	skb = skb_recv_datagram(sk, flags, &err);
	if (!skb)
		goto err_out;

	/* Okay, have a message on the receive queue */

	stm = strp_msg(skb);

	if (len > stm->full_len)
		len = stm->full_len;

	copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags);
	if (copied < 0) {
		err = copied;
		goto err_out;
	}

	KCM_STATS_ADD(kcm->stats.rx_bytes, copied);

	stm->offset += copied;
	stm->full_len -= copied;

	/* We have no way to return MSG_EOR. If all the bytes have been
	 * read we still leave the message in the receive socket buffer.
	 * A subsequent recvmsg needs to be done to return MSG_EOR and
	 * finish reading the message.
	 *
	 * NOTE(review): skb_recv_datagram() is called above without
	 * MSG_PEEK and the skb is released below; confirm that the message
	 * really stays on the receive queue as described here.
	 */

	skb_free_datagram(sk, skb);
	return copied;

err_out:
	skb_free_datagram(sk, skb);
	return err;
}

/* kcm sock lock held */
/* Disable receive on @kcm. Unless a psock currently has this socket
 * reserved, it is removed from the RX waiters list and its queued messages
 * are handed to other sockets.
 */
static void kcm_recv_disable(struct kcm_sock *kcm)
{
	struct kcm_mux *mux = kcm->mux;

	if (kcm->rx_disabled)
		return;

	spin_lock_bh(&mux->rx_lock);

	kcm->rx_disabled = 1;

	/* If a psock is reserved we'll do cleanup in unreserve */
	if (!kcm->rx_psock) {
		if (kcm->rx_wait) {
			list_del(&kcm->wait_rx_list);
			/* paired with lockless reads in kcm_rfree() */
			WRITE_ONCE(kcm->rx_wait, false);
		}

		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
	}

	spin_unlock_bh(&mux->rx_lock);
}

/* kcm sock lock held */
/* Re-enable receive on @kcm and make it ready to receive again. */
static void kcm_recv_enable(struct kcm_sock *kcm)
{
	struct kcm_mux *mux = kcm->mux;

	if (!kcm->rx_disabled)
		return;

	spin_lock_bh(&mux->rx_lock);

	kcm->rx_disabled = 0;
	kcm_rcv_ready(kcm);

	spin_unlock_bh(&mux->rx_lock);
}

/* setsockopt handler. Only level SOL_KCM is accepted (-ENOPROTOOPT
 * otherwise) and the only option handled is KCM_RECV_DISABLE, whose int
 * value is treated as a boolean. Returns -EINVAL for an option shorter
 * than an int and -EFAULT if it cannot be copied in.
 */
static int kcm_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct kcm_sock *kcm = kcm_sk(sock->sk);
	int val, valbool;
	int err = 0;

	if (level != SOL_KCM)
		return -ENOPROTOOPT;

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(int)))
		return -EFAULT;

	valbool = val ? 1 : 0;

	switch (optname) {
	case KCM_RECV_DISABLE:
		lock_sock(&kcm->sk);
		if (valbool)
			kcm_recv_disable(kcm);
		else
			kcm_recv_enable(kcm);
		release_sock(&kcm->sk);
		break;
	default:
		err = -ENOPROTOOPT;
	}

	return err;
}

/* getsockopt handler. Only level SOL_KCM and option KCM_RECV_DISABLE are
 * supported (-ENOPROTOOPT otherwise). At most sizeof(int) bytes are
 * written and opt->optlen is updated to the length actually used.
 */
static int kcm_getsockopt(struct socket *sock, int level, int optname,
			  sockopt_t *opt)
{
	struct kcm_sock *kcm = kcm_sk(sock->sk);
	int val, len;

	if (level != SOL_KCM)
		return -ENOPROTOOPT;

	len = opt->optlen;
	if (len < 0)
		return -EINVAL;

	len = min_t(unsigned int, len, sizeof(int));

	switch (optname) {
	case KCM_RECV_DISABLE:
		val = kcm->rx_disabled;
		break;
	default:
		return -ENOPROTOOPT;
	}

	opt->optlen = len;
	if (copy_to_iter(&val, len, &opt->iter_out) != len)
		return -EFAULT;
	return 0;
}

/* Bind a new KCM socket to @mux: insert it into the mux's socket list at
 * the first unused index, set up its TX work and mutex, and make it ready
 * to receive.
 */
static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
{
	struct kcm_sock *tkcm;
	struct list_head *head;
	int index = 0;

	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
	 * we set sk_state, otherwise epoll_wait always returns right away with
	 * EPOLLHUP
	 */
	kcm->sk.sk_state = TCP_ESTABLISHED;

	/* Add to mux's kcm sockets list */
	kcm->mux = mux;
	spin_lock_bh(&mux->lock);

	/* Walk the list while indices are consecutive from 0; the first gap
	 * (or the end of the list) gives the insertion point and the index.
	 */
	head = &mux->kcm_socks;
	list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
		if (tkcm->index != index)
			break;
		head = &tkcm->kcm_sock_list;
		index++;
	}

	list_add(&kcm->kcm_sock_list, head);
	kcm->index = index;

	mux->kcm_socks_cnt++;
	spin_unlock_bh(&mux->lock);

	INIT_WORK(&kcm->tx_work, kcm_tx_work);
	mutex_init(&kcm->tx_mutex);

	spin_lock_bh(&mux->rx_lock);
	kcm_rcv_ready(kcm);
	spin_unlock_bh(&mux->rx_lock);
}

/* Attach the lower socket @csock to the mux of KCM socket @sock, using
 * @prog as the message parser.
 *
 * Only AF_INET/AF_INET6 TCP sockets that are neither listening nor closed
 * are accepted (-EOPNOTSUPP otherwise). Returns -EINVAL if @csock has no
 * sock, -ENOMEM on allocation failure, -EALREADY if sk_user_data is
 * already in use, or the error from strp_init(). On success the lower
 * socket's callbacks are replaced, a reference on it is taken and the new
 * psock is made available for transmit. The reference on @prog is not
 * dropped here on failure.
 */
static int kcm_attach(struct socket *sock, struct socket *csock,
		      struct bpf_prog *prog)
{
	struct kcm_sock *kcm = kcm_sk(sock->sk);
	struct kcm_mux *mux = kcm->mux;
	struct sock *csk;
	struct kcm_psock *psock = NULL, *tpsock;
	struct list_head *head;
	int index = 0;
	static const struct strp_callbacks cb = {
		.rcv_msg = kcm_rcv_strparser,
		.parse_msg = kcm_parse_func_strparser,
		.read_sock_done = kcm_read_sock_done,
	};
	int err = 0;

	csk = csock->sk;
	if (!csk)
		return -EINVAL;

	lock_sock(csk);

	/* Only allow TCP sockets to be attached for now */
	if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
	    csk->sk_protocol != IPPROTO_TCP) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/* Don't allow listeners or closed sockets */
	if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) {
		err = -EOPNOTSUPP;
		goto out;
	}

	psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
	if (!psock) {
		err = -ENOMEM;
		goto out;
	}

	psock->mux = mux;
	psock->sk = csk;
	psock->bpf_prog = prog;

	write_lock_bh(&csk->sk_callback_lock);

	/* Check if sk_user_data is already in use by KCM or someone else.
	 * Must be done under lock to prevent race conditions.
	 */
	if (csk->sk_user_data) {
		write_unlock_bh(&csk->sk_callback_lock);
		kmem_cache_free(kcm_psockp, psock);
		err = -EALREADY;
		goto out;
	}

	err = strp_init(&psock->strp, csk, &cb);
	if (err) {
		write_unlock_bh(&csk->sk_callback_lock);
		kmem_cache_free(kcm_psockp, psock);
		goto out;
	}

	/* Save the original callbacks so that they can be restored on
	 * unattach, then install the psock ones.
	 */
	psock->save_data_ready = csk->sk_data_ready;
	psock->save_write_space = csk->sk_write_space;
	psock->save_state_change = csk->sk_state_change;
	csk->sk_user_data = psock;
	WRITE_ONCE(csk->sk_data_ready, psock_data_ready);
	WRITE_ONCE(csk->sk_write_space, psock_write_space);
	csk->sk_state_change = psock_state_change;

	write_unlock_bh(&csk->sk_callback_lock);

	sock_hold(csk);

	/* Finished initialization, now add the psock to the MUX. */
	spin_lock_bh(&mux->lock);
	/* Same first-unused-index search as for kcm sockets */
	head = &mux->psocks;
	list_for_each_entry(tpsock, &mux->psocks, psock_list) {
		if (tpsock->index != index)
			break;
		head = &tpsock->psock_list;
		index++;
	}

	list_add(&psock->psock_list, head);
	psock->index = index;

	KCM_STATS_INCR(mux->stats.psock_attach);
	mux->psocks_cnt++;
	psock_now_avail(psock);
	spin_unlock_bh(&mux->lock);

	/* Schedule RX work in case there are already bytes queued */
	strp_check_rcv(&psock->strp);

out:
	release_sock(csk);

	return err;
}

/* Attach request: resolve the socket fd and BPF program fd in @info and
 * attach them to @sock.
 *
 * Returns 0 on success. A failed socket lookup always yields -ENOENT (the
 * error stored by sockfd_lookup() is not used). On any later failure the
 * references taken here are dropped again.
 */
static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
{
	struct socket *csock;
	struct bpf_prog *prog;
	int err;

	csock = sockfd_lookup(info->fd, &err);
	if (!csock)
		return -ENOENT;

	prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(prog)) {
		err = PTR_ERR(prog);
		goto out;
	}

	err = kcm_attach(sock, csock, prog);
	if (err) {
		bpf_prog_put(prog);
		goto out;
	}

	/* Keep reference on file also: the reference obtained by
	 * sockfd_lookup() is intentionally not dropped on success.
	 */

	return 0;
out:
	sockfd_put(csock);
	return err;
}

static void kcm_unattach(struct kcm_psock *psock)
{
	struct sock *csk = psock->sk;
	struct kcm_mux *mux = psock->mux;

	lock_sock(csk);

	/* Stop getting callbacks from TCP socket. After this there should
	 * be no way to reserve a kcm for this psock.
	 */
	write_lock_bh(&csk->sk_callback_lock);
	csk->sk_user_data = NULL;
	WRITE_ONCE(csk->sk_data_ready, psock->save_data_ready);
	WRITE_ONCE(csk->sk_write_space, psock->save_write_space);
	csk->sk_state_change = psock->save_state_change;
	strp_stop(&psock->strp);

	if (WARN_ON(psock->rx_kcm)) {
		write_unlock_bh(&csk->sk_callback_lock);
		release_sock(csk);
		return;
	}

	spin_lock_bh(&mux->rx_lock);

	/* Stop receiver activities. After this point psock should not be
	 * able to get onto ready list either through callbacks or work.
	 */
	if (psock->ready_rx_msg) {
		list_del(&psock->psock_ready_list);
		kfree_skb(psock->ready_rx_msg);
		psock->ready_rx_msg = NULL;
		KCM_STATS_INCR(mux->stats.rx_ready_drops);
	}

	spin_unlock_bh(&mux->rx_lock);

	write_unlock_bh(&csk->sk_callback_lock);

	/* Call strp_done without sock lock */
	release_sock(csk);
	strp_done(&psock->strp);
	lock_sock(csk);

	bpf_prog_put(psock->bpf_prog);

	spin_lock_bh(&mux->lock);

	aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
	save_strp_stats(&psock->strp, &mux->aggregate_strp_stats);

	KCM_STATS_INCR(mux->stats.psock_unattach);

	if (psock->tx_kcm) {
		/* psock was reserved. Just mark it finished and we will clean
		 * up in the kcm paths, we need kcm lock which can not be
		 * acquired here.
1427 */ 1428 KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); 1429 spin_unlock_bh(&mux->lock); 1430 1431 /* We are unattaching a socket that is reserved. Abort the 1432 * socket since we may be out of sync in sending on it. We need 1433 * to do this without the mux lock. 1434 */ 1435 kcm_abort_tx_psock(psock, EPIPE, false); 1436 1437 spin_lock_bh(&mux->lock); 1438 if (!psock->tx_kcm) { 1439 /* psock now unreserved in window mux was unlocked */ 1440 goto no_reserved; 1441 } 1442 psock->done = 1; 1443 1444 /* Commit done before queuing work to process it */ 1445 smp_mb(); 1446 1447 /* Queue tx work to make sure psock->done is handled */ 1448 queue_work(kcm_wq, &psock->tx_kcm->tx_work); 1449 spin_unlock_bh(&mux->lock); 1450 } else { 1451 no_reserved: 1452 if (!psock->tx_stopped) 1453 list_del(&psock->psock_avail_list); 1454 list_del(&psock->psock_list); 1455 mux->psocks_cnt--; 1456 spin_unlock_bh(&mux->lock); 1457 1458 sock_put(csk); 1459 fput(csk->sk_socket->file); 1460 kmem_cache_free(kcm_psockp, psock); 1461 } 1462 1463 release_sock(csk); 1464 } 1465 1466 static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) 1467 { 1468 struct kcm_sock *kcm = kcm_sk(sock->sk); 1469 struct kcm_mux *mux = kcm->mux; 1470 struct kcm_psock *psock; 1471 struct socket *csock; 1472 struct sock *csk; 1473 int err; 1474 1475 csock = sockfd_lookup(info->fd, &err); 1476 if (!csock) 1477 return -ENOENT; 1478 1479 csk = csock->sk; 1480 if (!csk) { 1481 err = -EINVAL; 1482 goto out; 1483 } 1484 1485 err = -ENOENT; 1486 1487 spin_lock_bh(&mux->lock); 1488 1489 list_for_each_entry(psock, &mux->psocks, psock_list) { 1490 if (psock->sk != csk) 1491 continue; 1492 1493 /* Found the matching psock */ 1494 1495 if (psock->unattaching || WARN_ON(psock->done)) { 1496 err = -EALREADY; 1497 break; 1498 } 1499 1500 psock->unattaching = 1; 1501 1502 spin_unlock_bh(&mux->lock); 1503 1504 /* Lower socket lock should already be held */ 1505 kcm_unattach(psock); 1506 1507 err = 0; 1508 goto 
out; 1509 } 1510 1511 spin_unlock_bh(&mux->lock); 1512 1513 out: 1514 sockfd_put(csock); 1515 return err; 1516 } 1517 1518 static struct proto kcm_proto = { 1519 .name = "KCM", 1520 .owner = THIS_MODULE, 1521 .obj_size = sizeof(struct kcm_sock), 1522 }; 1523 1524 /* Clone a kcm socket. */ 1525 static struct file *kcm_clone(struct socket *osock) 1526 { 1527 struct socket *newsock; 1528 struct sock *newsk; 1529 1530 newsock = sock_alloc(); 1531 if (!newsock) 1532 return ERR_PTR(-ENFILE); 1533 1534 newsock->type = osock->type; 1535 newsock->ops = osock->ops; 1536 1537 __module_get(newsock->ops->owner); 1538 1539 newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, 1540 &kcm_proto, false); 1541 if (!newsk) { 1542 sock_release(newsock); 1543 return ERR_PTR(-ENOMEM); 1544 } 1545 sock_init_data(newsock, newsk); 1546 init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); 1547 1548 return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); 1549 } 1550 1551 static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 1552 { 1553 int err; 1554 1555 switch (cmd) { 1556 case SIOCKCMATTACH: { 1557 struct kcm_attach info; 1558 1559 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 1560 return -EFAULT; 1561 1562 err = kcm_attach_ioctl(sock, &info); 1563 1564 break; 1565 } 1566 case SIOCKCMUNATTACH: { 1567 struct kcm_unattach info; 1568 1569 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 1570 return -EFAULT; 1571 1572 err = kcm_unattach_ioctl(sock, &info); 1573 1574 break; 1575 } 1576 case SIOCKCMCLONE: { 1577 struct kcm_clone info; 1578 1579 FD_PREPARE(fdf, 0, kcm_clone(sock)); 1580 if (fdf.err) 1581 return fdf.err; 1582 1583 info.fd = fd_prepare_fd(fdf); 1584 if (copy_to_user((void __user *)arg, &info, sizeof(info))) 1585 return -EFAULT; 1586 1587 fd_publish(fdf); 1588 err = 0; 1589 break; 1590 } 1591 default: 1592 err = -ENOIOCTLCMD; 1593 break; 1594 } 1595 1596 return err; 1597 } 1598 1599 static void release_mux(struct 
kcm_mux *mux) 1600 { 1601 struct kcm_net *knet = mux->knet; 1602 struct kcm_psock *psock, *tmp_psock; 1603 1604 /* Release psocks */ 1605 list_for_each_entry_safe(psock, tmp_psock, 1606 &mux->psocks, psock_list) { 1607 if (!WARN_ON(psock->unattaching)) 1608 kcm_unattach(psock); 1609 } 1610 1611 if (WARN_ON(mux->psocks_cnt)) 1612 return; 1613 1614 __skb_queue_purge(&mux->rx_hold_queue); 1615 1616 mutex_lock(&knet->mutex); 1617 aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); 1618 aggregate_psock_stats(&mux->aggregate_psock_stats, 1619 &knet->aggregate_psock_stats); 1620 aggregate_strp_stats(&mux->aggregate_strp_stats, 1621 &knet->aggregate_strp_stats); 1622 list_del_rcu(&mux->kcm_mux_list); 1623 knet->count--; 1624 mutex_unlock(&knet->mutex); 1625 1626 kfree_rcu(mux, rcu); 1627 } 1628 1629 static void kcm_done(struct kcm_sock *kcm) 1630 { 1631 struct kcm_mux *mux = kcm->mux; 1632 struct sock *sk = &kcm->sk; 1633 int socks_cnt; 1634 1635 spin_lock_bh(&mux->rx_lock); 1636 if (kcm->rx_psock) { 1637 /* Cleanup in unreserve_rx_kcm */ 1638 WARN_ON(kcm->done); 1639 kcm->rx_disabled = 1; 1640 kcm->done = 1; 1641 spin_unlock_bh(&mux->rx_lock); 1642 return; 1643 } 1644 1645 if (kcm->rx_wait) { 1646 list_del(&kcm->wait_rx_list); 1647 /* paired with lockless reads in kcm_rfree() */ 1648 WRITE_ONCE(kcm->rx_wait, false); 1649 } 1650 /* Move any pending receive messages to other kcm sockets */ 1651 requeue_rx_msgs(mux, &sk->sk_receive_queue); 1652 1653 spin_unlock_bh(&mux->rx_lock); 1654 1655 if (WARN_ON(sk_rmem_alloc_get(sk))) 1656 return; 1657 1658 /* Detach from MUX */ 1659 spin_lock_bh(&mux->lock); 1660 1661 list_del(&kcm->kcm_sock_list); 1662 mux->kcm_socks_cnt--; 1663 socks_cnt = mux->kcm_socks_cnt; 1664 1665 spin_unlock_bh(&mux->lock); 1666 1667 if (!socks_cnt) { 1668 /* We are done with the mux now. */ 1669 release_mux(mux); 1670 } 1671 1672 WARN_ON(kcm->rx_wait); 1673 1674 sock_put(&kcm->sk); 1675 } 1676 1677 /* Called by kcm_release to close a KCM socket. 
1678 * If this is the last KCM socket on the MUX, destroy the MUX. 1679 */ 1680 static int kcm_release(struct socket *sock) 1681 { 1682 struct sock *sk = sock->sk; 1683 struct kcm_sock *kcm; 1684 struct kcm_mux *mux; 1685 struct kcm_psock *psock; 1686 1687 if (!sk) 1688 return 0; 1689 1690 kcm = kcm_sk(sk); 1691 mux = kcm->mux; 1692 1693 lock_sock(sk); 1694 sock_orphan(sk); 1695 kfree_skb(kcm->seq_skb); 1696 1697 /* Purge queue under lock to avoid race condition with tx_work trying 1698 * to act when queue is nonempty. If tx_work runs after this point 1699 * it will just return. 1700 */ 1701 __skb_queue_purge(&sk->sk_write_queue); 1702 1703 release_sock(sk); 1704 1705 spin_lock_bh(&mux->lock); 1706 if (kcm->tx_wait) { 1707 /* Take of tx_wait list, after this point there should be no way 1708 * that a psock will be assigned to this kcm. 1709 */ 1710 list_del(&kcm->wait_psock_list); 1711 kcm->tx_wait = false; 1712 } 1713 spin_unlock_bh(&mux->lock); 1714 1715 /* Cancel work. After this point there should be no outside references 1716 * to the kcm socket. 1717 */ 1718 disable_work_sync(&kcm->tx_work); 1719 1720 lock_sock(sk); 1721 psock = kcm->tx_psock; 1722 if (psock) { 1723 /* A psock was reserved, so we need to kill it since it 1724 * may already have some bytes queued from a message. We 1725 * need to do this after removing kcm from tx_wait list. 
1726 */ 1727 kcm_abort_tx_psock(psock, EPIPE, false); 1728 unreserve_psock(kcm); 1729 } 1730 release_sock(sk); 1731 1732 WARN_ON(kcm->tx_wait); 1733 WARN_ON(kcm->tx_psock); 1734 1735 sock->sk = NULL; 1736 1737 kcm_done(kcm); 1738 1739 return 0; 1740 } 1741 1742 static const struct proto_ops kcm_dgram_ops = { 1743 .family = PF_KCM, 1744 .owner = THIS_MODULE, 1745 .release = kcm_release, 1746 .bind = sock_no_bind, 1747 .connect = sock_no_connect, 1748 .socketpair = sock_no_socketpair, 1749 .accept = sock_no_accept, 1750 .getname = sock_no_getname, 1751 .poll = datagram_poll, 1752 .ioctl = kcm_ioctl, 1753 .listen = sock_no_listen, 1754 .shutdown = sock_no_shutdown, 1755 .setsockopt = kcm_setsockopt, 1756 .getsockopt_iter = kcm_getsockopt, 1757 .sendmsg = kcm_sendmsg, 1758 .recvmsg = kcm_recvmsg, 1759 .mmap = sock_no_mmap, 1760 .splice_eof = kcm_splice_eof, 1761 }; 1762 1763 static const struct proto_ops kcm_seqpacket_ops = { 1764 .family = PF_KCM, 1765 .owner = THIS_MODULE, 1766 .release = kcm_release, 1767 .bind = sock_no_bind, 1768 .connect = sock_no_connect, 1769 .socketpair = sock_no_socketpair, 1770 .accept = sock_no_accept, 1771 .getname = sock_no_getname, 1772 .poll = datagram_poll, 1773 .ioctl = kcm_ioctl, 1774 .listen = sock_no_listen, 1775 .shutdown = sock_no_shutdown, 1776 .setsockopt = kcm_setsockopt, 1777 .getsockopt_iter = kcm_getsockopt, 1778 .sendmsg = kcm_sendmsg, 1779 .recvmsg = kcm_recvmsg, 1780 .mmap = sock_no_mmap, 1781 .splice_eof = kcm_splice_eof, 1782 .splice_read = kcm_splice_read, 1783 }; 1784 1785 /* Create proto operation for kcm sockets */ 1786 static int kcm_create(struct net *net, struct socket *sock, 1787 int protocol, int kern) 1788 { 1789 struct kcm_net *knet = net_generic(net, kcm_net_id); 1790 struct sock *sk; 1791 struct kcm_mux *mux; 1792 1793 switch (sock->type) { 1794 case SOCK_DGRAM: 1795 sock->ops = &kcm_dgram_ops; 1796 break; 1797 case SOCK_SEQPACKET: 1798 sock->ops = &kcm_seqpacket_ops; 1799 break; 1800 default: 1801 return 
-ESOCKTNOSUPPORT; 1802 } 1803 1804 if (protocol != KCMPROTO_CONNECTED) 1805 return -EPROTONOSUPPORT; 1806 1807 sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); 1808 if (!sk) 1809 return -ENOMEM; 1810 1811 /* Allocate a kcm mux, shared between KCM sockets */ 1812 mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); 1813 if (!mux) { 1814 sk_free(sk); 1815 return -ENOMEM; 1816 } 1817 1818 spin_lock_init(&mux->lock); 1819 spin_lock_init(&mux->rx_lock); 1820 INIT_LIST_HEAD(&mux->kcm_socks); 1821 INIT_LIST_HEAD(&mux->kcm_rx_waiters); 1822 INIT_LIST_HEAD(&mux->kcm_tx_waiters); 1823 1824 INIT_LIST_HEAD(&mux->psocks); 1825 INIT_LIST_HEAD(&mux->psocks_ready); 1826 INIT_LIST_HEAD(&mux->psocks_avail); 1827 1828 mux->knet = knet; 1829 1830 /* Add new MUX to list */ 1831 mutex_lock(&knet->mutex); 1832 list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); 1833 knet->count++; 1834 mutex_unlock(&knet->mutex); 1835 1836 skb_queue_head_init(&mux->rx_hold_queue); 1837 1838 /* Init KCM socket */ 1839 sock_init_data(sock, sk); 1840 init_kcm_sock(kcm_sk(sk), mux); 1841 1842 return 0; 1843 } 1844 1845 static const struct net_proto_family kcm_family_ops = { 1846 .family = PF_KCM, 1847 .create = kcm_create, 1848 .owner = THIS_MODULE, 1849 }; 1850 1851 static __net_init int kcm_init_net(struct net *net) 1852 { 1853 struct kcm_net *knet = net_generic(net, kcm_net_id); 1854 1855 INIT_LIST_HEAD_RCU(&knet->mux_list); 1856 mutex_init(&knet->mutex); 1857 1858 return 0; 1859 } 1860 1861 static __net_exit void kcm_exit_net(struct net *net) 1862 { 1863 struct kcm_net *knet = net_generic(net, kcm_net_id); 1864 1865 /* All KCM sockets should be closed at this point, which should mean 1866 * that all multiplexors and psocks have been destroyed. 
1867 */ 1868 WARN_ON(!list_empty(&knet->mux_list)); 1869 1870 mutex_destroy(&knet->mutex); 1871 } 1872 1873 static struct pernet_operations kcm_net_ops = { 1874 .init = kcm_init_net, 1875 .exit = kcm_exit_net, 1876 .id = &kcm_net_id, 1877 .size = sizeof(struct kcm_net), 1878 }; 1879 1880 static int __init kcm_init(void) 1881 { 1882 int err = -ENOMEM; 1883 1884 kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN); 1885 if (!kcm_muxp) 1886 goto fail; 1887 1888 kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN); 1889 if (!kcm_psockp) 1890 goto fail; 1891 1892 kcm_wq = create_singlethread_workqueue("kkcmd"); 1893 if (!kcm_wq) 1894 goto fail; 1895 1896 err = proto_register(&kcm_proto, 1); 1897 if (err) 1898 goto fail; 1899 1900 err = register_pernet_device(&kcm_net_ops); 1901 if (err) 1902 goto net_ops_fail; 1903 1904 err = sock_register(&kcm_family_ops); 1905 if (err) 1906 goto sock_register_fail; 1907 1908 err = kcm_proc_init(); 1909 if (err) 1910 goto proc_init_fail; 1911 1912 return 0; 1913 1914 proc_init_fail: 1915 sock_unregister(PF_KCM); 1916 1917 sock_register_fail: 1918 unregister_pernet_device(&kcm_net_ops); 1919 1920 net_ops_fail: 1921 proto_unregister(&kcm_proto); 1922 1923 fail: 1924 kmem_cache_destroy(kcm_muxp); 1925 kmem_cache_destroy(kcm_psockp); 1926 1927 if (kcm_wq) 1928 destroy_workqueue(kcm_wq); 1929 1930 return err; 1931 } 1932 1933 static void __exit kcm_exit(void) 1934 { 1935 kcm_proc_exit(); 1936 sock_unregister(PF_KCM); 1937 unregister_pernet_device(&kcm_net_ops); 1938 proto_unregister(&kcm_proto); 1939 destroy_workqueue(kcm_wq); 1940 1941 kmem_cache_destroy(kcm_muxp); 1942 kmem_cache_destroy(kcm_psockp); 1943 } 1944 1945 module_init(kcm_init); 1946 module_exit(kcm_exit); 1947 1948 MODULE_LICENSE("GPL"); 1949 MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets"); 1950 MODULE_ALIAS_NETPROTO(PF_KCM); 1951