1 #include <linux/bpf.h> 2 #include <linux/errno.h> 3 #include <linux/errqueue.h> 4 #include <linux/file.h> 5 #include <linux/in.h> 6 #include <linux/kernel.h> 7 #include <linux/module.h> 8 #include <linux/net.h> 9 #include <linux/netdevice.h> 10 #include <linux/poll.h> 11 #include <linux/rculist.h> 12 #include <linux/skbuff.h> 13 #include <linux/socket.h> 14 #include <linux/uaccess.h> 15 #include <linux/workqueue.h> 16 #include <linux/syscalls.h> 17 #include <net/kcm.h> 18 #include <net/netns/generic.h> 19 #include <net/sock.h> 20 #include <net/tcp.h> 21 #include <uapi/linux/kcm.h> 22 23 unsigned int kcm_net_id; 24 25 static struct kmem_cache *kcm_psockp __read_mostly; 26 static struct kmem_cache *kcm_muxp __read_mostly; 27 static struct workqueue_struct *kcm_wq; 28 29 static inline struct kcm_sock *kcm_sk(const struct sock *sk) 30 { 31 return (struct kcm_sock *)sk; 32 } 33 34 static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) 35 { 36 return (struct kcm_tx_msg *)skb->cb; 37 } 38 39 static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) 40 { 41 return (struct kcm_rx_msg *)((void *)skb->cb + 42 offsetof(struct qdisc_skb_cb, data)); 43 } 44 45 static void report_csk_error(struct sock *csk, int err) 46 { 47 csk->sk_err = EPIPE; 48 csk->sk_error_report(csk); 49 } 50 51 /* Callback lock held */ 52 static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, 53 struct sk_buff *skb) 54 { 55 struct sock *csk = psock->sk; 56 57 /* Unrecoverable error in receive */ 58 59 del_timer(&psock->rx_msg_timer); 60 61 if (psock->rx_stopped) 62 return; 63 64 psock->rx_stopped = 1; 65 KCM_STATS_INCR(psock->stats.rx_aborts); 66 67 /* Report an error on the lower socket */ 68 report_csk_error(csk, err); 69 } 70 71 static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, 72 bool wakeup_kcm) 73 { 74 struct sock *csk = psock->sk; 75 struct kcm_mux *mux = psock->mux; 76 77 /* Unrecoverable error in transmit */ 78 79 spin_lock_bh(&mux->lock); 80 81 if (psock->tx_stopped) { 82 spin_unlock_bh(&mux->lock); 83 return; 84 } 85 86 psock->tx_stopped = 1; 87 KCM_STATS_INCR(psock->stats.tx_aborts); 88 89 if (!psock->tx_kcm) { 90 /* Take off psocks_avail list */ 91 list_del(&psock->psock_avail_list); 92 } else if (wakeup_kcm) { 93 /* In this case psock is being aborted while outside of 94 * write_msgs and psock is reserved. Schedule tx_work 95 * to handle the failure there. Need to commit tx_stopped 96 * before queuing work. 97 */ 98 smp_mb(); 99 100 queue_work(kcm_wq, &psock->tx_kcm->tx_work); 101 } 102 103 spin_unlock_bh(&mux->lock); 104 105 /* Report error on lower socket */ 106 report_csk_error(csk, err); 107 } 108 109 /* RX mux lock held. 
 */
static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
				    struct kcm_psock *psock)
{
	KCM_STATS_ADD(mux->stats.rx_bytes,
		      psock->stats.rx_bytes - psock->saved_rx_bytes);
	mux->stats.rx_msgs +=
		psock->stats.rx_msgs - psock->saved_rx_msgs;
	psock->saved_rx_msgs = psock->stats.rx_msgs;
	psock->saved_rx_bytes = psock->stats.rx_bytes;
}

static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
				    struct kcm_psock *psock)
{
	KCM_STATS_ADD(mux->stats.tx_bytes,
		      psock->stats.tx_bytes - psock->saved_tx_bytes);
	mux->stats.tx_msgs +=
		psock->stats.tx_msgs - psock->saved_tx_msgs;
	psock->saved_tx_msgs = psock->stats.tx_msgs;
	psock->saved_tx_bytes = psock->stats.tx_bytes;
}

static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

/* KCM is ready to receive messages on its queue-- either the KCM is new or
 * has become unblocked after being blocked on full socket buffer. Queue any
 * pending ready messages on a psock. RX mux lock held.
 */
static void kcm_rcv_ready(struct kcm_sock *kcm)
{
	struct kcm_mux *mux = kcm->mux;
	struct kcm_psock *psock;
	struct sk_buff *skb;

	if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
		return;

	while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
			/* Assuming buffer limit has been reached */
			skb_queue_head(&mux->rx_hold_queue, skb);
			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
			return;
		}
	}

	while (!list_empty(&mux->psocks_ready)) {
		psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
					 psock_ready_list);

		if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
			/* Assuming buffer limit has been reached */
			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
			return;
		}

		/* Consumed the ready message on the psock. Schedule rx_work to
		 * get more messages.
		 */
		list_del(&psock->psock_ready_list);
		psock->ready_rx_msg = NULL;

		/* Commit clearing of ready_rx_msg for queuing work */
		smp_mb();

		queue_work(kcm_wq, &psock->rx_work);
	}

	/* Buffer limit is okay now, add to ready list */
	list_add_tail(&kcm->wait_rx_list,
		      &kcm->mux->kcm_rx_waiters);
	kcm->rx_wait = true;
}

static void kcm_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct kcm_sock *kcm = kcm_sk(sk);
	struct kcm_mux *mux = kcm->mux;
	unsigned int len = skb->truesize;

	sk_mem_uncharge(sk, len);
	atomic_sub(len, &sk->sk_rmem_alloc);

	/* For reading rx_wait and rx_psock without holding lock */
	smp_mb__after_atomic();

	if (!kcm->rx_wait && !kcm->rx_psock &&
	    sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
		spin_lock_bh(&mux->rx_lock);
		kcm_rcv_ready(kcm);
		spin_unlock_bh(&mux->rx_lock);
	}
}

static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		return -ENOMEM;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb->dev = NULL;

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = kcm_rfree;
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);

	skb_queue_tail(list, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);

	return 0;
}

/* Requeue received messages for a kcm socket to other kcm sockets. This is
 * called when a kcm socket is receive disabled.
 * RX mux lock held.
 */
static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
{
	struct sk_buff *skb;
	struct kcm_sock *kcm;

	while ((skb = __skb_dequeue(head))) {
		/* Reset destructor to avoid calling kcm_rcv_ready */
		skb->destructor = sock_rfree;
		skb_orphan(skb);
try_again:
		if (list_empty(&mux->kcm_rx_waiters)) {
			skb_queue_tail(&mux->rx_hold_queue, skb);
			continue;
		}

		kcm = list_first_entry(&mux->kcm_rx_waiters,
				       struct kcm_sock, wait_rx_list);

		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
			/* Should mean socket buffer full */
			list_del(&kcm->wait_rx_list);
			kcm->rx_wait = false;

			/* Commit rx_wait to read in kcm_free */
			smp_wmb();

			goto try_again;
		}
	}
}

/* Lower sock lock held */
static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
				       struct sk_buff *head)
{
	struct kcm_mux *mux = psock->mux;
	struct kcm_sock *kcm;

	WARN_ON(psock->ready_rx_msg);

	if (psock->rx_kcm)
		return psock->rx_kcm;

	spin_lock_bh(&mux->rx_lock);

	if (psock->rx_kcm) {
		spin_unlock_bh(&mux->rx_lock);
		return psock->rx_kcm;
	}

	kcm_update_rx_mux_stats(mux, psock);

	if (list_empty(&mux->kcm_rx_waiters)) {
		psock->ready_rx_msg = head;
		list_add_tail(&psock->psock_ready_list,
			      &mux->psocks_ready);
		spin_unlock_bh(&mux->rx_lock);
		return NULL;
	}

	kcm = list_first_entry(&mux->kcm_rx_waiters,
			       struct kcm_sock, wait_rx_list);
	list_del(&kcm->wait_rx_list);
	kcm->rx_wait = false;

	psock->rx_kcm = kcm;
	kcm->rx_psock = psock;

	spin_unlock_bh(&mux->rx_lock);

	return kcm;
}

static void kcm_done(struct kcm_sock *kcm);
static void kcm_done_work(struct work_struct *w)
{
	kcm_done(container_of(w, struct kcm_sock, done_work));
}

/* Lower sock held */
static void unreserve_rx_kcm(struct kcm_psock *psock,
			     bool rcv_ready)
{
	struct kcm_sock *kcm = psock->rx_kcm;
	struct kcm_mux *mux = psock->mux;

	if (!kcm)
		return;

	spin_lock_bh(&mux->rx_lock);

	psock->rx_kcm = NULL;
	kcm->rx_psock = NULL;

	/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
	 * kcm_rfree
	 */
	smp_mb();

	if (unlikely(kcm->done)) {
		spin_unlock_bh(&mux->rx_lock);

		/* Need to run kcm_done in a task since we need to acquire
		 * callback locks which may already be held here.
		 */
		INIT_WORK(&kcm->done_work, kcm_done_work);
		schedule_work(&kcm->done_work);
		return;
	}

	if (unlikely(kcm->rx_disabled)) {
		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
	} else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
		/* Check for degenerative race with rx_wait that all
		 * data was dequeued (accounted for in kcm_rfree).
		 */
		kcm_rcv_ready(kcm);
	}
	spin_unlock_bh(&mux->rx_lock);
}

static void kcm_start_rx_timer(struct kcm_psock *psock)
{
	if (psock->sk->sk_rcvtimeo)
		mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo);
}

/* Macro to invoke filter function. */
#define KCM_RUN_FILTER(prog, ctx) \
	(*prog->bpf_func)(ctx, prog->insnsi)
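/* Illustrative sketch, not part of this module: the bpf_prog invoked through
 * KCM_RUN_FILTER is expected to return the full length of the message that
 * starts at the beginning of the skb, or 0 if more header bytes are needed.
 * For messages framed by a 2-byte big-endian length header (header included
 * in the returned length), a minimal BPF_PROG_TYPE_SOCKET_FILTER program
 * built with the usual libbpf helpers (bpf_helpers.h, bpf_endian.h) might
 * look roughly like the following; section and function names are arbitrary.
 *
 *	SEC("socket_kcm")
 *	int kcm_parse_len(struct __sk_buff *skb)
 *	{
 *		__u16 len;
 *
 *		if (bpf_skb_load_bytes(skb, 0, &len, sizeof(len)))
 *			return 0;	// need more header bytes
 *		return bpf_ntohs(len) + 2;	// header + payload
 *	}
 */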
/* Lower socket lock held */
static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
			unsigned int orig_offset, size_t orig_len)
{
	struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
	struct kcm_rx_msg *rxm;
	struct kcm_sock *kcm;
	struct sk_buff *head, *skb;
	size_t eaten = 0, cand_len;
	ssize_t extra;
	int err;
	bool cloned_orig = false;

	if (psock->ready_rx_msg)
		return 0;

	head = psock->rx_skb_head;
	if (head) {
		/* Message already in progress */

		rxm = kcm_rx_msg(head);
		if (unlikely(rxm->early_eaten)) {
			/* Already some number of bytes on the receive sock
			 * data saved in rx_skb_head, just indicate they
			 * are consumed.
			 */
			eaten = orig_len <= rxm->early_eaten ?
				orig_len : rxm->early_eaten;
			rxm->early_eaten -= eaten;

			return eaten;
		}

		if (unlikely(orig_offset)) {
			/* Getting data with a non-zero offset when a message
			 * is in progress is not expected. If it does happen,
			 * we need to clone and pull since we can't deal with
			 * offsets in the skbs for a message except in the
			 * head.
			 */
			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
			if (!orig_skb) {
				KCM_STATS_INCR(psock->stats.rx_mem_fail);
				desc->error = -ENOMEM;
				return 0;
			}
			if (!pskb_pull(orig_skb, orig_offset)) {
				KCM_STATS_INCR(psock->stats.rx_mem_fail);
				kfree_skb(orig_skb);
				desc->error = -ENOMEM;
				return 0;
			}
			cloned_orig = true;
			orig_offset = 0;
		}

		if (!psock->rx_skb_nextp) {
			/* We are going to append to the frags_list of head.
			 * Need to unshare the frag_list.
			 */
			err = skb_unclone(head, GFP_ATOMIC);
			if (err) {
				KCM_STATS_INCR(psock->stats.rx_mem_fail);
				desc->error = err;
				return 0;
			}

			if (unlikely(skb_shinfo(head)->frag_list)) {
				/* We can't append to an sk_buff that already
				 * has a frag_list. We create a new head, point
				 * the frag_list of that to the old head, and
				 * then are able to use the old head->next for
				 * appending to the message.
				 */
				if (WARN_ON(head->next)) {
					desc->error = -EINVAL;
					return 0;
				}

				skb = alloc_skb(0, GFP_ATOMIC);
				if (!skb) {
					KCM_STATS_INCR(psock->stats.rx_mem_fail);
					desc->error = -ENOMEM;
					return 0;
				}
				skb->len = head->len;
				skb->data_len = head->len;
				skb->truesize = head->truesize;
				*kcm_rx_msg(skb) = *kcm_rx_msg(head);
				psock->rx_skb_nextp = &head->next;
				skb_shinfo(skb)->frag_list = head;
				psock->rx_skb_head = skb;
				head = skb;
			} else {
				psock->rx_skb_nextp =
				    &skb_shinfo(head)->frag_list;
			}
		}
	}

	while (eaten < orig_len) {
		/* Always clone since we will consume something */
		skb = skb_clone(orig_skb, GFP_ATOMIC);
		if (!skb) {
			KCM_STATS_INCR(psock->stats.rx_mem_fail);
			desc->error = -ENOMEM;
			break;
		}

		cand_len = orig_len - eaten;

		head = psock->rx_skb_head;
		if (!head) {
			head = skb;
			psock->rx_skb_head = head;
			/* Will set rx_skb_nextp on next packet if needed */
			psock->rx_skb_nextp = NULL;
			rxm = kcm_rx_msg(head);
			memset(rxm, 0, sizeof(*rxm));
			rxm->offset = orig_offset + eaten;
		} else {
			/* Unclone since we may be appending to an skb that we
			 * already share a frag_list with.
			 */
			err = skb_unclone(skb, GFP_ATOMIC);
			if (err) {
				KCM_STATS_INCR(psock->stats.rx_mem_fail);
				desc->error = err;
				break;
			}

			rxm = kcm_rx_msg(head);
			*psock->rx_skb_nextp = skb;
			psock->rx_skb_nextp = &skb->next;
			head->data_len += skb->len;
			head->len += skb->len;
			head->truesize += skb->truesize;
		}

		if (!rxm->full_len) {
			ssize_t len;

			len = KCM_RUN_FILTER(psock->bpf_prog, head);

			if (!len) {
				/* Need more header to determine length */
				if (!rxm->accum_len) {
					/* Start RX timer for new message */
					kcm_start_rx_timer(psock);
				}
				rxm->accum_len += cand_len;
				eaten += cand_len;
				KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
				WARN_ON(eaten != orig_len);
				break;
			} else if (len > psock->sk->sk_rcvbuf) {
				/* Message length exceeds maximum allowed */
				KCM_STATS_INCR(psock->stats.rx_msg_too_big);
				desc->error = -EMSGSIZE;
				psock->rx_skb_head = NULL;
				kcm_abort_rx_psock(psock, EMSGSIZE, head);
				break;
			} else if (len <= (ssize_t)head->len -
					  skb->len - rxm->offset) {
				/* Length must be into new skb (and also
				 * greater than zero)
				 */
				KCM_STATS_INCR(psock->stats.rx_bad_hdr_len);
				desc->error = -EPROTO;
				psock->rx_skb_head = NULL;
				kcm_abort_rx_psock(psock, EPROTO, head);
				break;
			}

			rxm->full_len = len;
		}

		extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;

		if (extra < 0) {
			/* Message not complete yet. */
			if (rxm->full_len - rxm->accum_len >
			    tcp_inq(psock->sk)) {
				/* Don't have the whole message in the socket
				 * buffer. Set psock->rx_need_bytes to wait for
				 * the rest of the message. Also, set "early
				 * eaten" since we've already buffered the skb
				 * but don't consume yet per tcp_read_sock.
				 */

				if (!rxm->accum_len) {
					/* Start RX timer for new message */
					kcm_start_rx_timer(psock);
				}

				psock->rx_need_bytes = rxm->full_len -
						       rxm->accum_len;
				rxm->accum_len += cand_len;
				rxm->early_eaten = cand_len;
				KCM_STATS_ADD(psock->stats.rx_bytes, cand_len);
				desc->count = 0; /* Stop reading socket */
				break;
			}
			rxm->accum_len += cand_len;
			eaten += cand_len;
			WARN_ON(eaten != orig_len);
			break;
		}

		/* Positive extra indicates more bytes than needed for the
		 * message
		 */

		WARN_ON(extra > cand_len);

		eaten += (cand_len - extra);

		/* Hurray, we have a new message! */
		del_timer(&psock->rx_msg_timer);
		psock->rx_skb_head = NULL;
		KCM_STATS_INCR(psock->stats.rx_msgs);

try_queue:
		kcm = reserve_rx_kcm(psock, head);
		if (!kcm) {
			/* Unable to reserve a KCM, message is held in psock. */
			break;
		}

		if (kcm_queue_rcv_skb(&kcm->sk, head)) {
			/* Should mean socket buffer full */
			unreserve_rx_kcm(psock, false);
			goto try_queue;
		}
	}

	if (cloned_orig)
		kfree_skb(orig_skb);

	KCM_STATS_ADD(psock->stats.rx_bytes, eaten);

	return eaten;
}
/* Called with lock held on lower socket */
static int psock_tcp_read_sock(struct kcm_psock *psock)
{
	read_descriptor_t desc;

	desc.arg.data = psock;
	desc.error = 0;
	desc.count = 1; /* give more than one skb per call */

	/* sk should be locked here, so okay to do tcp_read_sock */
	tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);

	unreserve_rx_kcm(psock, true);

	return desc.error;
}

/* Lower sock lock held */
static void psock_tcp_data_ready(struct sock *sk)
{
	struct kcm_psock *psock;

	read_lock_bh(&sk->sk_callback_lock);

	psock = (struct kcm_psock *)sk->sk_user_data;
	if (unlikely(!psock || psock->rx_stopped))
		goto out;

	if (psock->ready_rx_msg)
		goto out;

	if (psock->rx_need_bytes) {
		if (tcp_inq(sk) >= psock->rx_need_bytes)
			psock->rx_need_bytes = 0;
		else
			goto out;
	}

	if (psock_tcp_read_sock(psock) == -ENOMEM)
		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);

out:
	read_unlock_bh(&sk->sk_callback_lock);
}

static void do_psock_rx_work(struct kcm_psock *psock)
{
	read_descriptor_t rd_desc;
	struct sock *csk = psock->sk;

	/* We need the read lock to synchronize with psock_tcp_data_ready. We
	 * need the socket lock for calling tcp_read_sock.
	 */
	lock_sock(csk);
	read_lock_bh(&csk->sk_callback_lock);

	if (unlikely(csk->sk_user_data != psock))
		goto out;

	if (unlikely(psock->rx_stopped))
		goto out;

	if (psock->ready_rx_msg)
		goto out;

	rd_desc.arg.data = psock;

	if (psock_tcp_read_sock(psock) == -ENOMEM)
		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);

out:
	read_unlock_bh(&csk->sk_callback_lock);
	release_sock(csk);
}

static void psock_rx_work(struct work_struct *w)
{
	do_psock_rx_work(container_of(w, struct kcm_psock, rx_work));
}

static void psock_rx_delayed_work(struct work_struct *w)
{
	do_psock_rx_work(container_of(w, struct kcm_psock,
				      rx_delayed_work.work));
}

static void psock_tcp_state_change(struct sock *sk)
{
	/* TCP only does a POLLIN for a half close.
Do a POLLHUP here 699 * since application will normally not poll with POLLIN 700 * on the TCP sockets. 701 */ 702 703 report_csk_error(sk, EPIPE); 704 } 705 706 static void psock_tcp_write_space(struct sock *sk) 707 { 708 struct kcm_psock *psock; 709 struct kcm_mux *mux; 710 struct kcm_sock *kcm; 711 712 read_lock_bh(&sk->sk_callback_lock); 713 714 psock = (struct kcm_psock *)sk->sk_user_data; 715 if (unlikely(!psock)) 716 goto out; 717 718 mux = psock->mux; 719 720 spin_lock_bh(&mux->lock); 721 722 /* Check if the socket is reserved so someone is waiting for sending. */ 723 kcm = psock->tx_kcm; 724 if (kcm) 725 queue_work(kcm_wq, &kcm->tx_work); 726 727 spin_unlock_bh(&mux->lock); 728 out: 729 read_unlock_bh(&sk->sk_callback_lock); 730 } 731 732 static void unreserve_psock(struct kcm_sock *kcm); 733 734 /* kcm sock is locked. */ 735 static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) 736 { 737 struct kcm_mux *mux = kcm->mux; 738 struct kcm_psock *psock; 739 740 psock = kcm->tx_psock; 741 742 smp_rmb(); /* Must read tx_psock before tx_wait */ 743 744 if (psock) { 745 WARN_ON(kcm->tx_wait); 746 if (unlikely(psock->tx_stopped)) 747 unreserve_psock(kcm); 748 else 749 return kcm->tx_psock; 750 } 751 752 spin_lock_bh(&mux->lock); 753 754 /* Check again under lock to see if psock was reserved for this 755 * psock via psock_unreserve. 756 */ 757 psock = kcm->tx_psock; 758 if (unlikely(psock)) { 759 WARN_ON(kcm->tx_wait); 760 spin_unlock_bh(&mux->lock); 761 return kcm->tx_psock; 762 } 763 764 if (!list_empty(&mux->psocks_avail)) { 765 psock = list_first_entry(&mux->psocks_avail, 766 struct kcm_psock, 767 psock_avail_list); 768 list_del(&psock->psock_avail_list); 769 if (kcm->tx_wait) { 770 list_del(&kcm->wait_psock_list); 771 kcm->tx_wait = false; 772 } 773 kcm->tx_psock = psock; 774 psock->tx_kcm = kcm; 775 KCM_STATS_INCR(psock->stats.reserved); 776 } else if (!kcm->tx_wait) { 777 list_add_tail(&kcm->wait_psock_list, 778 &mux->kcm_tx_waiters); 779 kcm->tx_wait = true; 780 } 781 782 spin_unlock_bh(&mux->lock); 783 784 return psock; 785 } 786 787 /* mux lock held */ 788 static void psock_now_avail(struct kcm_psock *psock) 789 { 790 struct kcm_mux *mux = psock->mux; 791 struct kcm_sock *kcm; 792 793 if (list_empty(&mux->kcm_tx_waiters)) { 794 list_add_tail(&psock->psock_avail_list, 795 &mux->psocks_avail); 796 } else { 797 kcm = list_first_entry(&mux->kcm_tx_waiters, 798 struct kcm_sock, 799 wait_psock_list); 800 list_del(&kcm->wait_psock_list); 801 kcm->tx_wait = false; 802 psock->tx_kcm = kcm; 803 804 /* Commit before changing tx_psock since that is read in 805 * reserve_psock before queuing work. 806 */ 807 smp_mb(); 808 809 kcm->tx_psock = psock; 810 KCM_STATS_INCR(psock->stats.reserved); 811 queue_work(kcm_wq, &kcm->tx_work); 812 } 813 } 814 815 /* kcm sock is locked. 
*/ 816 static void unreserve_psock(struct kcm_sock *kcm) 817 { 818 struct kcm_psock *psock; 819 struct kcm_mux *mux = kcm->mux; 820 821 spin_lock_bh(&mux->lock); 822 823 psock = kcm->tx_psock; 824 825 if (WARN_ON(!psock)) { 826 spin_unlock_bh(&mux->lock); 827 return; 828 } 829 830 smp_rmb(); /* Read tx_psock before tx_wait */ 831 832 kcm_update_tx_mux_stats(mux, psock); 833 834 WARN_ON(kcm->tx_wait); 835 836 kcm->tx_psock = NULL; 837 psock->tx_kcm = NULL; 838 KCM_STATS_INCR(psock->stats.unreserved); 839 840 if (unlikely(psock->tx_stopped)) { 841 if (psock->done) { 842 /* Deferred free */ 843 list_del(&psock->psock_list); 844 mux->psocks_cnt--; 845 sock_put(psock->sk); 846 fput(psock->sk->sk_socket->file); 847 kmem_cache_free(kcm_psockp, psock); 848 } 849 850 /* Don't put back on available list */ 851 852 spin_unlock_bh(&mux->lock); 853 854 return; 855 } 856 857 psock_now_avail(psock); 858 859 spin_unlock_bh(&mux->lock); 860 } 861 862 static void kcm_report_tx_retry(struct kcm_sock *kcm) 863 { 864 struct kcm_mux *mux = kcm->mux; 865 866 spin_lock_bh(&mux->lock); 867 KCM_STATS_INCR(mux->stats.tx_retries); 868 spin_unlock_bh(&mux->lock); 869 } 870 871 /* Write any messages ready on the kcm socket. Called with kcm sock lock 872 * held. Return bytes actually sent or error. 873 */ 874 static int kcm_write_msgs(struct kcm_sock *kcm) 875 { 876 struct sock *sk = &kcm->sk; 877 struct kcm_psock *psock; 878 struct sk_buff *skb, *head; 879 struct kcm_tx_msg *txm; 880 unsigned short fragidx, frag_offset; 881 unsigned int sent, total_sent = 0; 882 int ret = 0; 883 884 kcm->tx_wait_more = false; 885 psock = kcm->tx_psock; 886 if (unlikely(psock && psock->tx_stopped)) { 887 /* A reserved psock was aborted asynchronously. Unreserve 888 * it and we'll retry the message. 889 */ 890 unreserve_psock(kcm); 891 kcm_report_tx_retry(kcm); 892 if (skb_queue_empty(&sk->sk_write_queue)) 893 return 0; 894 895 kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0; 896 897 } else if (skb_queue_empty(&sk->sk_write_queue)) { 898 return 0; 899 } 900 901 head = skb_peek(&sk->sk_write_queue); 902 txm = kcm_tx_msg(head); 903 904 if (txm->sent) { 905 /* Send of first skbuff in queue already in progress */ 906 if (WARN_ON(!psock)) { 907 ret = -EINVAL; 908 goto out; 909 } 910 sent = txm->sent; 911 frag_offset = txm->frag_offset; 912 fragidx = txm->fragidx; 913 skb = txm->frag_skb; 914 915 goto do_frag; 916 } 917 918 try_again: 919 psock = reserve_psock(kcm); 920 if (!psock) 921 goto out; 922 923 do { 924 skb = head; 925 txm = kcm_tx_msg(head); 926 sent = 0; 927 928 do_frag_list: 929 if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { 930 ret = -EINVAL; 931 goto out; 932 } 933 934 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; 935 fragidx++) { 936 skb_frag_t *frag; 937 938 frag_offset = 0; 939 do_frag: 940 frag = &skb_shinfo(skb)->frags[fragidx]; 941 if (WARN_ON(!frag->size)) { 942 ret = -EINVAL; 943 goto out; 944 } 945 946 ret = kernel_sendpage(psock->sk->sk_socket, 947 frag->page.p, 948 frag->page_offset + frag_offset, 949 frag->size - frag_offset, 950 MSG_DONTWAIT); 951 if (ret <= 0) { 952 if (ret == -EAGAIN) { 953 /* Save state to try again when there's 954 * write space on the socket 955 */ 956 txm->sent = sent; 957 txm->frag_offset = frag_offset; 958 txm->fragidx = fragidx; 959 txm->frag_skb = skb; 960 961 ret = 0; 962 goto out; 963 } 964 965 /* Hard failure in sending message, abort this 966 * psock since it has lost framing 967 * synchonization and retry sending the 968 * message from the beginning. 
969 */ 970 kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, 971 true); 972 unreserve_psock(kcm); 973 974 txm->sent = 0; 975 kcm_report_tx_retry(kcm); 976 ret = 0; 977 978 goto try_again; 979 } 980 981 sent += ret; 982 frag_offset += ret; 983 KCM_STATS_ADD(psock->stats.tx_bytes, ret); 984 if (frag_offset < frag->size) { 985 /* Not finished with this frag */ 986 goto do_frag; 987 } 988 } 989 990 if (skb == head) { 991 if (skb_has_frag_list(skb)) { 992 skb = skb_shinfo(skb)->frag_list; 993 goto do_frag_list; 994 } 995 } else if (skb->next) { 996 skb = skb->next; 997 goto do_frag_list; 998 } 999 1000 /* Successfully sent the whole packet, account for it. */ 1001 skb_dequeue(&sk->sk_write_queue); 1002 kfree_skb(head); 1003 sk->sk_wmem_queued -= sent; 1004 total_sent += sent; 1005 KCM_STATS_INCR(psock->stats.tx_msgs); 1006 } while ((head = skb_peek(&sk->sk_write_queue))); 1007 out: 1008 if (!head) { 1009 /* Done with all queued messages. */ 1010 WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); 1011 unreserve_psock(kcm); 1012 } 1013 1014 /* Check if write space is available */ 1015 sk->sk_write_space(sk); 1016 1017 return total_sent ? : ret; 1018 } 1019 1020 static void kcm_tx_work(struct work_struct *w) 1021 { 1022 struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); 1023 struct sock *sk = &kcm->sk; 1024 int err; 1025 1026 lock_sock(sk); 1027 1028 /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx 1029 * aborts 1030 */ 1031 err = kcm_write_msgs(kcm); 1032 if (err < 0) { 1033 /* Hard failure in write, report error on KCM socket */ 1034 pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); 1035 report_csk_error(&kcm->sk, -err); 1036 goto out; 1037 } 1038 1039 /* Primarily for SOCK_SEQPACKET sockets */ 1040 if (likely(sk->sk_socket) && 1041 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { 1042 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1043 sk->sk_write_space(sk); 1044 } 1045 1046 out: 1047 release_sock(sk); 1048 } 1049 1050 static void kcm_push(struct kcm_sock *kcm) 1051 { 1052 if (kcm->tx_wait_more) 1053 kcm_write_msgs(kcm); 1054 } 1055 1056 static ssize_t kcm_sendpage(struct socket *sock, struct page *page, 1057 int offset, size_t size, int flags) 1058 1059 { 1060 struct sock *sk = sock->sk; 1061 struct kcm_sock *kcm = kcm_sk(sk); 1062 struct sk_buff *skb = NULL, *head = NULL; 1063 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1064 bool eor; 1065 int err = 0; 1066 int i; 1067 1068 if (flags & MSG_SENDPAGE_NOTLAST) 1069 flags |= MSG_MORE; 1070 1071 /* No MSG_EOR from splice, only look at MSG_MORE */ 1072 eor = !(flags & MSG_MORE); 1073 1074 lock_sock(sk); 1075 1076 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 1077 1078 err = -EPIPE; 1079 if (sk->sk_err) 1080 goto out_error; 1081 1082 if (kcm->seq_skb) { 1083 /* Previously opened message */ 1084 head = kcm->seq_skb; 1085 skb = kcm_tx_msg(head)->last_skb; 1086 i = skb_shinfo(skb)->nr_frags; 1087 1088 if (skb_can_coalesce(skb, i, page, offset)) { 1089 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 1090 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; 1091 goto coalesced; 1092 } 1093 1094 if (i >= MAX_SKB_FRAGS) { 1095 struct sk_buff *tskb; 1096 1097 tskb = alloc_skb(0, sk->sk_allocation); 1098 while (!tskb) { 1099 kcm_push(kcm); 1100 err = sk_stream_wait_memory(sk, &timeo); 1101 if (err) 1102 goto out_error; 1103 } 1104 1105 if (head == skb) 1106 skb_shinfo(head)->frag_list = tskb; 1107 else 1108 skb->next = tskb; 1109 1110 skb = tskb; 1111 skb->ip_summed = CHECKSUM_UNNECESSARY; 1112 i = 0; 1113 } 1114 } 
else { 1115 /* Call the sk_stream functions to manage the sndbuf mem. */ 1116 if (!sk_stream_memory_free(sk)) { 1117 kcm_push(kcm); 1118 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1119 err = sk_stream_wait_memory(sk, &timeo); 1120 if (err) 1121 goto out_error; 1122 } 1123 1124 head = alloc_skb(0, sk->sk_allocation); 1125 while (!head) { 1126 kcm_push(kcm); 1127 err = sk_stream_wait_memory(sk, &timeo); 1128 if (err) 1129 goto out_error; 1130 } 1131 1132 skb = head; 1133 i = 0; 1134 } 1135 1136 get_page(page); 1137 skb_fill_page_desc(skb, i, page, offset, size); 1138 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; 1139 1140 coalesced: 1141 skb->len += size; 1142 skb->data_len += size; 1143 skb->truesize += size; 1144 sk->sk_wmem_queued += size; 1145 sk_mem_charge(sk, size); 1146 1147 if (head != skb) { 1148 head->len += size; 1149 head->data_len += size; 1150 head->truesize += size; 1151 } 1152 1153 if (eor) { 1154 bool not_busy = skb_queue_empty(&sk->sk_write_queue); 1155 1156 /* Message complete, queue it on send buffer */ 1157 __skb_queue_tail(&sk->sk_write_queue, head); 1158 kcm->seq_skb = NULL; 1159 KCM_STATS_INCR(kcm->stats.tx_msgs); 1160 1161 if (flags & MSG_BATCH) { 1162 kcm->tx_wait_more = true; 1163 } else if (kcm->tx_wait_more || not_busy) { 1164 err = kcm_write_msgs(kcm); 1165 if (err < 0) { 1166 /* We got a hard error in write_msgs but have 1167 * already queued this message. Report an error 1168 * in the socket, but don't affect return value 1169 * from sendmsg 1170 */ 1171 pr_warn("KCM: Hard failure on kcm_write_msgs\n"); 1172 report_csk_error(&kcm->sk, -err); 1173 } 1174 } 1175 } else { 1176 /* Message not complete, save state */ 1177 kcm->seq_skb = head; 1178 kcm_tx_msg(head)->last_skb = skb; 1179 } 1180 1181 KCM_STATS_ADD(kcm->stats.tx_bytes, size); 1182 1183 release_sock(sk); 1184 return size; 1185 1186 out_error: 1187 kcm_push(kcm); 1188 1189 err = sk_stream_error(sk, flags, err); 1190 1191 /* make sure we wake any epoll edge trigger waiter */ 1192 if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) 1193 sk->sk_write_space(sk); 1194 1195 release_sock(sk); 1196 return err; 1197 } 1198 1199 static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) 1200 { 1201 struct sock *sk = sock->sk; 1202 struct kcm_sock *kcm = kcm_sk(sk); 1203 struct sk_buff *skb = NULL, *head = NULL; 1204 size_t copy, copied = 0; 1205 long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1206 int eor = (sock->type == SOCK_DGRAM) ? 1207 !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); 1208 int err = -EPIPE; 1209 1210 lock_sock(sk); 1211 1212 /* Per tcp_sendmsg this should be in poll */ 1213 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 1214 1215 if (sk->sk_err) 1216 goto out_error; 1217 1218 if (kcm->seq_skb) { 1219 /* Previously opened message */ 1220 head = kcm->seq_skb; 1221 skb = kcm_tx_msg(head)->last_skb; 1222 goto start; 1223 } 1224 1225 /* Call the sk_stream functions to manage the sndbuf mem. 
*/ 1226 if (!sk_stream_memory_free(sk)) { 1227 kcm_push(kcm); 1228 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1229 err = sk_stream_wait_memory(sk, &timeo); 1230 if (err) 1231 goto out_error; 1232 } 1233 1234 /* New message, alloc head skb */ 1235 head = alloc_skb(0, sk->sk_allocation); 1236 while (!head) { 1237 kcm_push(kcm); 1238 err = sk_stream_wait_memory(sk, &timeo); 1239 if (err) 1240 goto out_error; 1241 1242 head = alloc_skb(0, sk->sk_allocation); 1243 } 1244 1245 skb = head; 1246 1247 /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling 1248 * csum_and_copy_from_iter from skb_do_copy_data_nocache. 1249 */ 1250 skb->ip_summed = CHECKSUM_UNNECESSARY; 1251 1252 start: 1253 while (msg_data_left(msg)) { 1254 bool merge = true; 1255 int i = skb_shinfo(skb)->nr_frags; 1256 struct page_frag *pfrag = sk_page_frag(sk); 1257 1258 if (!sk_page_frag_refill(sk, pfrag)) 1259 goto wait_for_memory; 1260 1261 if (!skb_can_coalesce(skb, i, pfrag->page, 1262 pfrag->offset)) { 1263 if (i == MAX_SKB_FRAGS) { 1264 struct sk_buff *tskb; 1265 1266 tskb = alloc_skb(0, sk->sk_allocation); 1267 if (!tskb) 1268 goto wait_for_memory; 1269 1270 if (head == skb) 1271 skb_shinfo(head)->frag_list = tskb; 1272 else 1273 skb->next = tskb; 1274 1275 skb = tskb; 1276 skb->ip_summed = CHECKSUM_UNNECESSARY; 1277 continue; 1278 } 1279 merge = false; 1280 } 1281 1282 copy = min_t(int, msg_data_left(msg), 1283 pfrag->size - pfrag->offset); 1284 1285 if (!sk_wmem_schedule(sk, copy)) 1286 goto wait_for_memory; 1287 1288 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, 1289 pfrag->page, 1290 pfrag->offset, 1291 copy); 1292 if (err) 1293 goto out_error; 1294 1295 /* Update the skb. */ 1296 if (merge) { 1297 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1298 } else { 1299 skb_fill_page_desc(skb, i, pfrag->page, 1300 pfrag->offset, copy); 1301 get_page(pfrag->page); 1302 } 1303 1304 pfrag->offset += copy; 1305 copied += copy; 1306 if (head != skb) { 1307 head->len += copy; 1308 head->data_len += copy; 1309 } 1310 1311 continue; 1312 1313 wait_for_memory: 1314 kcm_push(kcm); 1315 err = sk_stream_wait_memory(sk, &timeo); 1316 if (err) 1317 goto out_error; 1318 } 1319 1320 if (eor) { 1321 bool not_busy = skb_queue_empty(&sk->sk_write_queue); 1322 1323 /* Message complete, queue it on send buffer */ 1324 __skb_queue_tail(&sk->sk_write_queue, head); 1325 kcm->seq_skb = NULL; 1326 KCM_STATS_INCR(kcm->stats.tx_msgs); 1327 1328 if (msg->msg_flags & MSG_BATCH) { 1329 kcm->tx_wait_more = true; 1330 } else if (kcm->tx_wait_more || not_busy) { 1331 err = kcm_write_msgs(kcm); 1332 if (err < 0) { 1333 /* We got a hard error in write_msgs but have 1334 * already queued this message. Report an error 1335 * in the socket, but don't affect return value 1336 * from sendmsg 1337 */ 1338 pr_warn("KCM: Hard failure on kcm_write_msgs\n"); 1339 report_csk_error(&kcm->sk, -err); 1340 } 1341 } 1342 } else { 1343 /* Message not complete, save state */ 1344 partial_message: 1345 kcm->seq_skb = head; 1346 kcm_tx_msg(head)->last_skb = skb; 1347 } 1348 1349 KCM_STATS_ADD(kcm->stats.tx_bytes, copied); 1350 1351 release_sock(sk); 1352 return copied; 1353 1354 out_error: 1355 kcm_push(kcm); 1356 1357 if (copied && sock->type == SOCK_SEQPACKET) { 1358 /* Wrote some bytes before encountering an 1359 * error, return partial success. 
1360 */ 1361 goto partial_message; 1362 } 1363 1364 if (head != kcm->seq_skb) 1365 kfree_skb(head); 1366 1367 err = sk_stream_error(sk, msg->msg_flags, err); 1368 1369 /* make sure we wake any epoll edge trigger waiter */ 1370 if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) 1371 sk->sk_write_space(sk); 1372 1373 release_sock(sk); 1374 return err; 1375 } 1376 1377 static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, 1378 long timeo, int *err) 1379 { 1380 struct sk_buff *skb; 1381 1382 while (!(skb = skb_peek(&sk->sk_receive_queue))) { 1383 if (sk->sk_err) { 1384 *err = sock_error(sk); 1385 return NULL; 1386 } 1387 1388 if (sock_flag(sk, SOCK_DONE)) 1389 return NULL; 1390 1391 if ((flags & MSG_DONTWAIT) || !timeo) { 1392 *err = -EAGAIN; 1393 return NULL; 1394 } 1395 1396 sk_wait_data(sk, &timeo, NULL); 1397 1398 /* Handle signals */ 1399 if (signal_pending(current)) { 1400 *err = sock_intr_errno(timeo); 1401 return NULL; 1402 } 1403 } 1404 1405 return skb; 1406 } 1407 1408 static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, 1409 size_t len, int flags) 1410 { 1411 struct sock *sk = sock->sk; 1412 struct kcm_sock *kcm = kcm_sk(sk); 1413 int err = 0; 1414 long timeo; 1415 struct kcm_rx_msg *rxm; 1416 int copied = 0; 1417 struct sk_buff *skb; 1418 1419 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 1420 1421 lock_sock(sk); 1422 1423 skb = kcm_wait_data(sk, flags, timeo, &err); 1424 if (!skb) 1425 goto out; 1426 1427 /* Okay, have a message on the receive queue */ 1428 1429 rxm = kcm_rx_msg(skb); 1430 1431 if (len > rxm->full_len) 1432 len = rxm->full_len; 1433 1434 err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); 1435 if (err < 0) 1436 goto out; 1437 1438 copied = len; 1439 if (likely(!(flags & MSG_PEEK))) { 1440 KCM_STATS_ADD(kcm->stats.rx_bytes, copied); 1441 if (copied < rxm->full_len) { 1442 if (sock->type == SOCK_DGRAM) { 1443 /* Truncated message */ 1444 msg->msg_flags |= MSG_TRUNC; 1445 goto msg_finished; 1446 } 1447 rxm->offset += copied; 1448 rxm->full_len -= copied; 1449 } else { 1450 msg_finished: 1451 /* Finished with message */ 1452 msg->msg_flags |= MSG_EOR; 1453 KCM_STATS_INCR(kcm->stats.rx_msgs); 1454 skb_unlink(skb, &sk->sk_receive_queue); 1455 kfree_skb(skb); 1456 } 1457 } 1458 1459 out: 1460 release_sock(sk); 1461 1462 return copied ? 
: err; 1463 } 1464 1465 static ssize_t kcm_sock_splice(struct sock *sk, 1466 struct pipe_inode_info *pipe, 1467 struct splice_pipe_desc *spd) 1468 { 1469 int ret; 1470 1471 release_sock(sk); 1472 ret = splice_to_pipe(pipe, spd); 1473 lock_sock(sk); 1474 1475 return ret; 1476 } 1477 1478 static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, 1479 struct pipe_inode_info *pipe, size_t len, 1480 unsigned int flags) 1481 { 1482 struct sock *sk = sock->sk; 1483 struct kcm_sock *kcm = kcm_sk(sk); 1484 long timeo; 1485 struct kcm_rx_msg *rxm; 1486 int err = 0; 1487 ssize_t copied; 1488 struct sk_buff *skb; 1489 1490 /* Only support splice for SOCKSEQPACKET */ 1491 1492 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 1493 1494 lock_sock(sk); 1495 1496 skb = kcm_wait_data(sk, flags, timeo, &err); 1497 if (!skb) 1498 goto err_out; 1499 1500 /* Okay, have a message on the receive queue */ 1501 1502 rxm = kcm_rx_msg(skb); 1503 1504 if (len > rxm->full_len) 1505 len = rxm->full_len; 1506 1507 copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags, 1508 kcm_sock_splice); 1509 if (copied < 0) { 1510 err = copied; 1511 goto err_out; 1512 } 1513 1514 KCM_STATS_ADD(kcm->stats.rx_bytes, copied); 1515 1516 rxm->offset += copied; 1517 rxm->full_len -= copied; 1518 1519 /* We have no way to return MSG_EOR. If all the bytes have been 1520 * read we still leave the message in the receive socket buffer. 1521 * A subsequent recvmsg needs to be done to return MSG_EOR and 1522 * finish reading the message. 1523 */ 1524 1525 release_sock(sk); 1526 1527 return copied; 1528 1529 err_out: 1530 release_sock(sk); 1531 1532 return err; 1533 } 1534 1535 /* kcm sock lock held */ 1536 static void kcm_recv_disable(struct kcm_sock *kcm) 1537 { 1538 struct kcm_mux *mux = kcm->mux; 1539 1540 if (kcm->rx_disabled) 1541 return; 1542 1543 spin_lock_bh(&mux->rx_lock); 1544 1545 kcm->rx_disabled = 1; 1546 1547 /* If a psock is reserved we'll do cleanup in unreserve */ 1548 if (!kcm->rx_psock) { 1549 if (kcm->rx_wait) { 1550 list_del(&kcm->wait_rx_list); 1551 kcm->rx_wait = false; 1552 } 1553 1554 requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); 1555 } 1556 1557 spin_unlock_bh(&mux->rx_lock); 1558 } 1559 1560 /* kcm sock lock held */ 1561 static void kcm_recv_enable(struct kcm_sock *kcm) 1562 { 1563 struct kcm_mux *mux = kcm->mux; 1564 1565 if (!kcm->rx_disabled) 1566 return; 1567 1568 spin_lock_bh(&mux->rx_lock); 1569 1570 kcm->rx_disabled = 0; 1571 kcm_rcv_ready(kcm); 1572 1573 spin_unlock_bh(&mux->rx_lock); 1574 } 1575 1576 static int kcm_setsockopt(struct socket *sock, int level, int optname, 1577 char __user *optval, unsigned int optlen) 1578 { 1579 struct kcm_sock *kcm = kcm_sk(sock->sk); 1580 int val, valbool; 1581 int err = 0; 1582 1583 if (level != SOL_KCM) 1584 return -ENOPROTOOPT; 1585 1586 if (optlen < sizeof(int)) 1587 return -EINVAL; 1588 1589 if (get_user(val, (int __user *)optval)) 1590 return -EINVAL; 1591 1592 valbool = val ? 
1 : 0; 1593 1594 switch (optname) { 1595 case KCM_RECV_DISABLE: 1596 lock_sock(&kcm->sk); 1597 if (valbool) 1598 kcm_recv_disable(kcm); 1599 else 1600 kcm_recv_enable(kcm); 1601 release_sock(&kcm->sk); 1602 break; 1603 default: 1604 err = -ENOPROTOOPT; 1605 } 1606 1607 return err; 1608 } 1609 1610 static int kcm_getsockopt(struct socket *sock, int level, int optname, 1611 char __user *optval, int __user *optlen) 1612 { 1613 struct kcm_sock *kcm = kcm_sk(sock->sk); 1614 int val, len; 1615 1616 if (level != SOL_KCM) 1617 return -ENOPROTOOPT; 1618 1619 if (get_user(len, optlen)) 1620 return -EFAULT; 1621 1622 len = min_t(unsigned int, len, sizeof(int)); 1623 if (len < 0) 1624 return -EINVAL; 1625 1626 switch (optname) { 1627 case KCM_RECV_DISABLE: 1628 val = kcm->rx_disabled; 1629 break; 1630 default: 1631 return -ENOPROTOOPT; 1632 } 1633 1634 if (put_user(len, optlen)) 1635 return -EFAULT; 1636 if (copy_to_user(optval, &val, len)) 1637 return -EFAULT; 1638 return 0; 1639 } 1640 1641 static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) 1642 { 1643 struct kcm_sock *tkcm; 1644 struct list_head *head; 1645 int index = 0; 1646 1647 /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so 1648 * we set sk_state, otherwise epoll_wait always returns right away with 1649 * POLLHUP 1650 */ 1651 kcm->sk.sk_state = TCP_ESTABLISHED; 1652 1653 /* Add to mux's kcm sockets list */ 1654 kcm->mux = mux; 1655 spin_lock_bh(&mux->lock); 1656 1657 head = &mux->kcm_socks; 1658 list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { 1659 if (tkcm->index != index) 1660 break; 1661 head = &tkcm->kcm_sock_list; 1662 index++; 1663 } 1664 1665 list_add(&kcm->kcm_sock_list, head); 1666 kcm->index = index; 1667 1668 mux->kcm_socks_cnt++; 1669 spin_unlock_bh(&mux->lock); 1670 1671 INIT_WORK(&kcm->tx_work, kcm_tx_work); 1672 1673 spin_lock_bh(&mux->rx_lock); 1674 kcm_rcv_ready(kcm); 1675 spin_unlock_bh(&mux->rx_lock); 1676 } 1677 1678 static void kcm_rx_msg_timeout(unsigned long arg) 1679 { 1680 struct kcm_psock *psock = (struct kcm_psock *)arg; 1681 1682 /* Message assembly timed out */ 1683 KCM_STATS_INCR(psock->stats.rx_msg_timeouts); 1684 kcm_abort_rx_psock(psock, ETIMEDOUT, NULL); 1685 } 1686 1687 static int kcm_attach(struct socket *sock, struct socket *csock, 1688 struct bpf_prog *prog) 1689 { 1690 struct kcm_sock *kcm = kcm_sk(sock->sk); 1691 struct kcm_mux *mux = kcm->mux; 1692 struct sock *csk; 1693 struct kcm_psock *psock = NULL, *tpsock; 1694 struct list_head *head; 1695 int index = 0; 1696 1697 if (csock->ops->family != PF_INET && 1698 csock->ops->family != PF_INET6) 1699 return -EINVAL; 1700 1701 csk = csock->sk; 1702 if (!csk) 1703 return -EINVAL; 1704 1705 /* Only support TCP for now */ 1706 if (csk->sk_protocol != IPPROTO_TCP) 1707 return -EINVAL; 1708 1709 psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); 1710 if (!psock) 1711 return -ENOMEM; 1712 1713 psock->mux = mux; 1714 psock->sk = csk; 1715 psock->bpf_prog = prog; 1716 1717 setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout, 1718 (unsigned long)psock); 1719 1720 INIT_WORK(&psock->rx_work, psock_rx_work); 1721 INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); 1722 1723 sock_hold(csk); 1724 1725 write_lock_bh(&csk->sk_callback_lock); 1726 psock->save_data_ready = csk->sk_data_ready; 1727 psock->save_write_space = csk->sk_write_space; 1728 psock->save_state_change = csk->sk_state_change; 1729 csk->sk_user_data = psock; 1730 csk->sk_data_ready = psock_tcp_data_ready; 1731 csk->sk_write_space = 
psock_tcp_write_space; 1732 csk->sk_state_change = psock_tcp_state_change; 1733 write_unlock_bh(&csk->sk_callback_lock); 1734 1735 /* Finished initialization, now add the psock to the MUX. */ 1736 spin_lock_bh(&mux->lock); 1737 head = &mux->psocks; 1738 list_for_each_entry(tpsock, &mux->psocks, psock_list) { 1739 if (tpsock->index != index) 1740 break; 1741 head = &tpsock->psock_list; 1742 index++; 1743 } 1744 1745 list_add(&psock->psock_list, head); 1746 psock->index = index; 1747 1748 KCM_STATS_INCR(mux->stats.psock_attach); 1749 mux->psocks_cnt++; 1750 psock_now_avail(psock); 1751 spin_unlock_bh(&mux->lock); 1752 1753 /* Schedule RX work in case there are already bytes queued */ 1754 queue_work(kcm_wq, &psock->rx_work); 1755 1756 return 0; 1757 } 1758 1759 static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) 1760 { 1761 struct socket *csock; 1762 struct bpf_prog *prog; 1763 int err; 1764 1765 csock = sockfd_lookup(info->fd, &err); 1766 if (!csock) 1767 return -ENOENT; 1768 1769 prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER); 1770 if (IS_ERR(prog)) { 1771 err = PTR_ERR(prog); 1772 goto out; 1773 } 1774 1775 err = kcm_attach(sock, csock, prog); 1776 if (err) { 1777 bpf_prog_put(prog); 1778 goto out; 1779 } 1780 1781 /* Keep reference on file also */ 1782 1783 return 0; 1784 out: 1785 fput(csock->file); 1786 return err; 1787 } 1788 1789 static void kcm_unattach(struct kcm_psock *psock) 1790 { 1791 struct sock *csk = psock->sk; 1792 struct kcm_mux *mux = psock->mux; 1793 1794 /* Stop getting callbacks from TCP socket. After this there should 1795 * be no way to reserve a kcm for this psock. 1796 */ 1797 write_lock_bh(&csk->sk_callback_lock); 1798 csk->sk_user_data = NULL; 1799 csk->sk_data_ready = psock->save_data_ready; 1800 csk->sk_write_space = psock->save_write_space; 1801 csk->sk_state_change = psock->save_state_change; 1802 psock->rx_stopped = 1; 1803 1804 if (WARN_ON(psock->rx_kcm)) { 1805 write_unlock_bh(&csk->sk_callback_lock); 1806 return; 1807 } 1808 1809 spin_lock_bh(&mux->rx_lock); 1810 1811 /* Stop receiver activities. After this point psock should not be 1812 * able to get onto ready list either through callbacks or work. 1813 */ 1814 if (psock->ready_rx_msg) { 1815 list_del(&psock->psock_ready_list); 1816 kfree_skb(psock->ready_rx_msg); 1817 psock->ready_rx_msg = NULL; 1818 KCM_STATS_INCR(mux->stats.rx_ready_drops); 1819 } 1820 1821 spin_unlock_bh(&mux->rx_lock); 1822 1823 write_unlock_bh(&csk->sk_callback_lock); 1824 1825 del_timer_sync(&psock->rx_msg_timer); 1826 cancel_work_sync(&psock->rx_work); 1827 cancel_delayed_work_sync(&psock->rx_delayed_work); 1828 1829 bpf_prog_put(psock->bpf_prog); 1830 1831 kfree_skb(psock->rx_skb_head); 1832 psock->rx_skb_head = NULL; 1833 1834 spin_lock_bh(&mux->lock); 1835 1836 aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); 1837 1838 KCM_STATS_INCR(mux->stats.psock_unattach); 1839 1840 if (psock->tx_kcm) { 1841 /* psock was reserved. Just mark it finished and we will clean 1842 * up in the kcm paths, we need kcm lock which can not be 1843 * acquired here. 1844 */ 1845 KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); 1846 spin_unlock_bh(&mux->lock); 1847 1848 /* We are unattaching a socket that is reserved. Abort the 1849 * socket since we may be out of sync in sending on it. We need 1850 * to do this without the mux lock. 
1851 */ 1852 kcm_abort_tx_psock(psock, EPIPE, false); 1853 1854 spin_lock_bh(&mux->lock); 1855 if (!psock->tx_kcm) { 1856 /* psock now unreserved in window mux was unlocked */ 1857 goto no_reserved; 1858 } 1859 psock->done = 1; 1860 1861 /* Commit done before queuing work to process it */ 1862 smp_mb(); 1863 1864 /* Queue tx work to make sure psock->done is handled */ 1865 queue_work(kcm_wq, &psock->tx_kcm->tx_work); 1866 spin_unlock_bh(&mux->lock); 1867 } else { 1868 no_reserved: 1869 if (!psock->tx_stopped) 1870 list_del(&psock->psock_avail_list); 1871 list_del(&psock->psock_list); 1872 mux->psocks_cnt--; 1873 spin_unlock_bh(&mux->lock); 1874 1875 sock_put(csk); 1876 fput(csk->sk_socket->file); 1877 kmem_cache_free(kcm_psockp, psock); 1878 } 1879 } 1880 1881 static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) 1882 { 1883 struct kcm_sock *kcm = kcm_sk(sock->sk); 1884 struct kcm_mux *mux = kcm->mux; 1885 struct kcm_psock *psock; 1886 struct socket *csock; 1887 struct sock *csk; 1888 int err; 1889 1890 csock = sockfd_lookup(info->fd, &err); 1891 if (!csock) 1892 return -ENOENT; 1893 1894 csk = csock->sk; 1895 if (!csk) { 1896 err = -EINVAL; 1897 goto out; 1898 } 1899 1900 err = -ENOENT; 1901 1902 spin_lock_bh(&mux->lock); 1903 1904 list_for_each_entry(psock, &mux->psocks, psock_list) { 1905 if (psock->sk != csk) 1906 continue; 1907 1908 /* Found the matching psock */ 1909 1910 if (psock->unattaching || WARN_ON(psock->done)) { 1911 err = -EALREADY; 1912 break; 1913 } 1914 1915 psock->unattaching = 1; 1916 1917 spin_unlock_bh(&mux->lock); 1918 1919 kcm_unattach(psock); 1920 1921 err = 0; 1922 goto out; 1923 } 1924 1925 spin_unlock_bh(&mux->lock); 1926 1927 out: 1928 fput(csock->file); 1929 return err; 1930 } 1931 1932 static struct proto kcm_proto = { 1933 .name = "KCM", 1934 .owner = THIS_MODULE, 1935 .obj_size = sizeof(struct kcm_sock), 1936 }; 1937 1938 /* Clone a kcm socket. 
*/ 1939 static int kcm_clone(struct socket *osock, struct kcm_clone *info, 1940 struct socket **newsockp) 1941 { 1942 struct socket *newsock; 1943 struct sock *newsk; 1944 struct file *newfile; 1945 int err, newfd; 1946 1947 err = -ENFILE; 1948 newsock = sock_alloc(); 1949 if (!newsock) 1950 goto out; 1951 1952 newsock->type = osock->type; 1953 newsock->ops = osock->ops; 1954 1955 __module_get(newsock->ops->owner); 1956 1957 newfd = get_unused_fd_flags(0); 1958 if (unlikely(newfd < 0)) { 1959 err = newfd; 1960 goto out_fd_fail; 1961 } 1962 1963 newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); 1964 if (unlikely(IS_ERR(newfile))) { 1965 err = PTR_ERR(newfile); 1966 goto out_sock_alloc_fail; 1967 } 1968 1969 newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, 1970 &kcm_proto, true); 1971 if (!newsk) { 1972 err = -ENOMEM; 1973 goto out_sk_alloc_fail; 1974 } 1975 1976 sock_init_data(newsock, newsk); 1977 init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); 1978 1979 fd_install(newfd, newfile); 1980 *newsockp = newsock; 1981 info->fd = newfd; 1982 1983 return 0; 1984 1985 out_sk_alloc_fail: 1986 fput(newfile); 1987 out_sock_alloc_fail: 1988 put_unused_fd(newfd); 1989 out_fd_fail: 1990 sock_release(newsock); 1991 out: 1992 return err; 1993 } 1994 1995 static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 1996 { 1997 int err; 1998 1999 switch (cmd) { 2000 case SIOCKCMATTACH: { 2001 struct kcm_attach info; 2002 2003 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 2004 err = -EFAULT; 2005 2006 err = kcm_attach_ioctl(sock, &info); 2007 2008 break; 2009 } 2010 case SIOCKCMUNATTACH: { 2011 struct kcm_unattach info; 2012 2013 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 2014 err = -EFAULT; 2015 2016 err = kcm_unattach_ioctl(sock, &info); 2017 2018 break; 2019 } 2020 case SIOCKCMCLONE: { 2021 struct kcm_clone info; 2022 struct socket *newsock = NULL; 2023 2024 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 2025 err = -EFAULT; 2026 2027 err = kcm_clone(sock, &info, &newsock); 2028 2029 if (!err) { 2030 if (copy_to_user((void __user *)arg, &info, 2031 sizeof(info))) { 2032 err = -EFAULT; 2033 sys_close(info.fd); 2034 } 2035 } 2036 2037 break; 2038 } 2039 default: 2040 err = -ENOIOCTLCMD; 2041 break; 2042 } 2043 2044 return err; 2045 } 2046 2047 static void free_mux(struct rcu_head *rcu) 2048 { 2049 struct kcm_mux *mux = container_of(rcu, 2050 struct kcm_mux, rcu); 2051 2052 kmem_cache_free(kcm_muxp, mux); 2053 } 2054 2055 static void release_mux(struct kcm_mux *mux) 2056 { 2057 struct kcm_net *knet = mux->knet; 2058 struct kcm_psock *psock, *tmp_psock; 2059 2060 /* Release psocks */ 2061 list_for_each_entry_safe(psock, tmp_psock, 2062 &mux->psocks, psock_list) { 2063 if (!WARN_ON(psock->unattaching)) 2064 kcm_unattach(psock); 2065 } 2066 2067 if (WARN_ON(mux->psocks_cnt)) 2068 return; 2069 2070 __skb_queue_purge(&mux->rx_hold_queue); 2071 2072 mutex_lock(&knet->mutex); 2073 aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); 2074 aggregate_psock_stats(&mux->aggregate_psock_stats, 2075 &knet->aggregate_psock_stats); 2076 list_del_rcu(&mux->kcm_mux_list); 2077 knet->count--; 2078 mutex_unlock(&knet->mutex); 2079 2080 call_rcu(&mux->rcu, free_mux); 2081 } 2082 2083 static void kcm_done(struct kcm_sock *kcm) 2084 { 2085 struct kcm_mux *mux = kcm->mux; 2086 struct sock *sk = &kcm->sk; 2087 int socks_cnt; 2088 2089 spin_lock_bh(&mux->rx_lock); 2090 if (kcm->rx_psock) { 2091 /* Cleanup in unreserve_rx_kcm 
 */
		WARN_ON(kcm->done);
		kcm->rx_disabled = 1;
		kcm->done = 1;
		spin_unlock_bh(&mux->rx_lock);
		return;
	}

	if (kcm->rx_wait) {
		list_del(&kcm->wait_rx_list);
		kcm->rx_wait = false;
	}
	/* Move any pending receive messages to other kcm sockets */
	requeue_rx_msgs(mux, &sk->sk_receive_queue);

	spin_unlock_bh(&mux->rx_lock);

	if (WARN_ON(sk_rmem_alloc_get(sk)))
		return;

	/* Detach from MUX */
	spin_lock_bh(&mux->lock);

	list_del(&kcm->kcm_sock_list);
	mux->kcm_socks_cnt--;
	socks_cnt = mux->kcm_socks_cnt;

	spin_unlock_bh(&mux->lock);

	if (!socks_cnt) {
		/* We are done with the mux now. */
		release_mux(mux);
	}

	WARN_ON(kcm->rx_wait);

	sock_put(&kcm->sk);
}

/* Called by kcm_release to close a KCM socket.
 * If this is the last KCM socket on the MUX, destroy the MUX.
 */
static int kcm_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct kcm_sock *kcm;
	struct kcm_mux *mux;
	struct kcm_psock *psock;

	if (!sk)
		return 0;

	kcm = kcm_sk(sk);
	mux = kcm->mux;

	sock_orphan(sk);
	kfree_skb(kcm->seq_skb);

	lock_sock(sk);
	/* Purge queue under lock to avoid race condition with tx_work trying
	 * to act when queue is nonempty. If tx_work runs after this point
	 * it will just return.
	 */
	__skb_queue_purge(&sk->sk_write_queue);
	release_sock(sk);

	spin_lock_bh(&mux->lock);
	if (kcm->tx_wait) {
		/* Take off the tx_wait list. After this point there should
		 * be no way that a psock will be assigned to this kcm.
		 */
		list_del(&kcm->wait_psock_list);
		kcm->tx_wait = false;
	}
	spin_unlock_bh(&mux->lock);

	/* Cancel work. After this point there should be no outside references
	 * to the kcm socket.
	 */
	cancel_work_sync(&kcm->tx_work);

	lock_sock(sk);
	psock = kcm->tx_psock;
	if (psock) {
		/* A psock was reserved, so we need to kill it since it
		 * may already have some bytes queued from a message. We
		 * need to do this after removing kcm from the tx_wait list.
2178 */ 2179 kcm_abort_tx_psock(psock, EPIPE, false); 2180 unreserve_psock(kcm); 2181 } 2182 release_sock(sk); 2183 2184 WARN_ON(kcm->tx_wait); 2185 WARN_ON(kcm->tx_psock); 2186 2187 sock->sk = NULL; 2188 2189 kcm_done(kcm); 2190 2191 return 0; 2192 } 2193 2194 static const struct proto_ops kcm_dgram_ops = { 2195 .family = PF_KCM, 2196 .owner = THIS_MODULE, 2197 .release = kcm_release, 2198 .bind = sock_no_bind, 2199 .connect = sock_no_connect, 2200 .socketpair = sock_no_socketpair, 2201 .accept = sock_no_accept, 2202 .getname = sock_no_getname, 2203 .poll = datagram_poll, 2204 .ioctl = kcm_ioctl, 2205 .listen = sock_no_listen, 2206 .shutdown = sock_no_shutdown, 2207 .setsockopt = kcm_setsockopt, 2208 .getsockopt = kcm_getsockopt, 2209 .sendmsg = kcm_sendmsg, 2210 .recvmsg = kcm_recvmsg, 2211 .mmap = sock_no_mmap, 2212 .sendpage = kcm_sendpage, 2213 }; 2214 2215 static const struct proto_ops kcm_seqpacket_ops = { 2216 .family = PF_KCM, 2217 .owner = THIS_MODULE, 2218 .release = kcm_release, 2219 .bind = sock_no_bind, 2220 .connect = sock_no_connect, 2221 .socketpair = sock_no_socketpair, 2222 .accept = sock_no_accept, 2223 .getname = sock_no_getname, 2224 .poll = datagram_poll, 2225 .ioctl = kcm_ioctl, 2226 .listen = sock_no_listen, 2227 .shutdown = sock_no_shutdown, 2228 .setsockopt = kcm_setsockopt, 2229 .getsockopt = kcm_getsockopt, 2230 .sendmsg = kcm_sendmsg, 2231 .recvmsg = kcm_recvmsg, 2232 .mmap = sock_no_mmap, 2233 .sendpage = kcm_sendpage, 2234 .splice_read = kcm_splice_read, 2235 }; 2236 2237 /* Create proto operation for kcm sockets */ 2238 static int kcm_create(struct net *net, struct socket *sock, 2239 int protocol, int kern) 2240 { 2241 struct kcm_net *knet = net_generic(net, kcm_net_id); 2242 struct sock *sk; 2243 struct kcm_mux *mux; 2244 2245 switch (sock->type) { 2246 case SOCK_DGRAM: 2247 sock->ops = &kcm_dgram_ops; 2248 break; 2249 case SOCK_SEQPACKET: 2250 sock->ops = &kcm_seqpacket_ops; 2251 break; 2252 default: 2253 return -ESOCKTNOSUPPORT; 2254 } 2255 2256 if (protocol != KCMPROTO_CONNECTED) 2257 return -EPROTONOSUPPORT; 2258 2259 sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); 2260 if (!sk) 2261 return -ENOMEM; 2262 2263 /* Allocate a kcm mux, shared between KCM sockets */ 2264 mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); 2265 if (!mux) { 2266 sk_free(sk); 2267 return -ENOMEM; 2268 } 2269 2270 spin_lock_init(&mux->lock); 2271 spin_lock_init(&mux->rx_lock); 2272 INIT_LIST_HEAD(&mux->kcm_socks); 2273 INIT_LIST_HEAD(&mux->kcm_rx_waiters); 2274 INIT_LIST_HEAD(&mux->kcm_tx_waiters); 2275 2276 INIT_LIST_HEAD(&mux->psocks); 2277 INIT_LIST_HEAD(&mux->psocks_ready); 2278 INIT_LIST_HEAD(&mux->psocks_avail); 2279 2280 mux->knet = knet; 2281 2282 /* Add new MUX to list */ 2283 mutex_lock(&knet->mutex); 2284 list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); 2285 knet->count++; 2286 mutex_unlock(&knet->mutex); 2287 2288 skb_queue_head_init(&mux->rx_hold_queue); 2289 2290 /* Init KCM socket */ 2291 sock_init_data(sock, sk); 2292 init_kcm_sock(kcm_sk(sk), mux); 2293 2294 return 0; 2295 } 2296 2297 static struct net_proto_family kcm_family_ops = { 2298 .family = PF_KCM, 2299 .create = kcm_create, 2300 .owner = THIS_MODULE, 2301 }; 2302 2303 static __net_init int kcm_init_net(struct net *net) 2304 { 2305 struct kcm_net *knet = net_generic(net, kcm_net_id); 2306 2307 INIT_LIST_HEAD_RCU(&knet->mux_list); 2308 mutex_init(&knet->mutex); 2309 2310 return 0; 2311 } 2312 2313 static __net_exit void kcm_exit_net(struct net *net) 2314 { 2315 struct kcm_net *knet = 
net_generic(net, kcm_net_id); 2316 2317 /* All KCM sockets should be closed at this point, which should mean 2318 * that all multiplexors and psocks have been destroyed. 2319 */ 2320 WARN_ON(!list_empty(&knet->mux_list)); 2321 } 2322 2323 static struct pernet_operations kcm_net_ops = { 2324 .init = kcm_init_net, 2325 .exit = kcm_exit_net, 2326 .id = &kcm_net_id, 2327 .size = sizeof(struct kcm_net), 2328 }; 2329 2330 static int __init kcm_init(void) 2331 { 2332 int err = -ENOMEM; 2333 2334 kcm_muxp = kmem_cache_create("kcm_mux_cache", 2335 sizeof(struct kcm_mux), 0, 2336 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 2337 if (!kcm_muxp) 2338 goto fail; 2339 2340 kcm_psockp = kmem_cache_create("kcm_psock_cache", 2341 sizeof(struct kcm_psock), 0, 2342 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 2343 if (!kcm_psockp) 2344 goto fail; 2345 2346 kcm_wq = create_singlethread_workqueue("kkcmd"); 2347 if (!kcm_wq) 2348 goto fail; 2349 2350 err = proto_register(&kcm_proto, 1); 2351 if (err) 2352 goto fail; 2353 2354 err = sock_register(&kcm_family_ops); 2355 if (err) 2356 goto sock_register_fail; 2357 2358 err = register_pernet_device(&kcm_net_ops); 2359 if (err) 2360 goto net_ops_fail; 2361 2362 err = kcm_proc_init(); 2363 if (err) 2364 goto proc_init_fail; 2365 2366 return 0; 2367 2368 proc_init_fail: 2369 unregister_pernet_device(&kcm_net_ops); 2370 2371 net_ops_fail: 2372 sock_unregister(PF_KCM); 2373 2374 sock_register_fail: 2375 proto_unregister(&kcm_proto); 2376 2377 fail: 2378 kmem_cache_destroy(kcm_muxp); 2379 kmem_cache_destroy(kcm_psockp); 2380 2381 if (kcm_wq) 2382 destroy_workqueue(kcm_wq); 2383 2384 return err; 2385 } 2386 2387 static void __exit kcm_exit(void) 2388 { 2389 kcm_proc_exit(); 2390 unregister_pernet_device(&kcm_net_ops); 2391 sock_unregister(PF_KCM); 2392 proto_unregister(&kcm_proto); 2393 destroy_workqueue(kcm_wq); 2394 2395 kmem_cache_destroy(kcm_muxp); 2396 kmem_cache_destroy(kcm_psockp); 2397 } 2398 2399 module_init(kcm_init); 2400 module_exit(kcm_exit); 2401 2402 MODULE_LICENSE("GPL"); 2403 MODULE_ALIAS_NETPROTO(PF_KCM); 2404 2405
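/* Usage sketch (userspace, illustrative only): create a KCM socket and attach
 * a connected TCP socket to its multiplexor together with a previously loaded
 * BPF_PROG_TYPE_SOCKET_FILTER program that parses message lengths. tcp_fd and
 * bpf_prog_fd are assumed to already exist; the attach path corresponds to
 * kcm_ioctl/kcm_attach_ioctl above.
 *
 *	#include <linux/kcm.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *
 *	int kcm_fd = socket(AF_KCM, SOCK_SEQPACKET, KCMPROTO_CONNECTED);
 *	struct kcm_attach attach = {
 *		.fd = tcp_fd,		// established TCP socket
 *		.bpf_fd = bpf_prog_fd,	// message length parser
 *	};
 *
 *	if (ioctl(kcm_fd, SIOCKCMATTACH, &attach))
 *		perror("SIOCKCMATTACH");
 *
 *	// Messages can now be exchanged with send()/recv() on kcm_fd;
 *	// SIOCKCMUNATTACH detaches the TCP socket again.
 */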