/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation version 2 and no later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>

#include "vmci_transport_notify.h"

#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)

static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        bool retval;
        u64 notify_limit;

        if (!PKT_FIELD(vsk, peer_waiting_write))
                return false;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
        /* When the sender blocks, we take that as a sign that the sender is
         * faster than the receiver. To reduce the transmit rate of the sender,
         * we delay the sending of the read notification by decreasing the
         * write_notify_window. The notification is delayed until the number of
         * bytes used in the queue drops below the write_notify_window.
         */

        if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
                PKT_FIELD(vsk, peer_waiting_write_detected) = true;
                if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
                        PKT_FIELD(vsk, write_notify_window) =
                                PKT_FIELD(vsk, write_notify_min_window);
                } else {
                        PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
                        if (PKT_FIELD(vsk, write_notify_window) <
                            PKT_FIELD(vsk, write_notify_min_window))
                                PKT_FIELD(vsk, write_notify_window) =
                                        PKT_FIELD(vsk, write_notify_min_window);
                }
        }
        notify_limit = vmci_trans(vsk)->consume_size -
                PKT_FIELD(vsk, write_notify_window);
#else
        notify_limit = 0;
#endif

        /* For now we ignore the wait information and just see if the free
         * space exceeds the notify limit. Note that improving this function
         * to be more intelligent will not require a protocol change and will
         * retain compatibility between endpoints with mixed versions of this
         * function.
         *
         * The notify_limit is used to delay notifications in the case where
         * flow control is enabled. Below the test is expressed in terms of
         * free space in the queue: if free_space > ConsumeSize -
         * write_notify_window then notify. An alternate way of expressing this
         * is to rewrite the expression to use the data ready in the receive
         * queue: if write_notify_window > bufferReady then notify, as
         * free_space == ConsumeSize - bufferReady.
         */
        retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
                notify_limit;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
        if (retval) {
                /* Once we notify the peer, we reset the detected flag so the
                 * next wait will again cause a decrease in the window size.
                 */
                PKT_FIELD(vsk, peer_waiting_write_detected) = false;
        }
#endif
        return retval;
#else
        return true;
#endif
}
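
/* Illustrative numbers (chosen for exposition, not from any particular
 * configuration) for the notify_limit test above: with a 64 KiB consume
 * queue and a write_notify_window of 8 KiB, notify_limit is 56 KiB, so a
 * READ notification is sent only once more than 56 KiB of the queue is
 * free, i.e. once fewer than 8 KiB remain unread -- the same condition as
 * write_notify_window > bufferReady.
 */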

static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        if (!PKT_FIELD(vsk, peer_waiting_read))
                return false;

        /* For now we ignore the wait information and just see if there is any
         * data for our peer to read. Note that improving this function to be
         * more intelligent will not require a protocol change and will retain
         * compatibility between endpoints with mixed versions of this
         * function.
         */
        return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
#else
        return true;
#endif
}

static void
vmci_transport_handle_waiting_read(struct sock *sk,
                                   struct vmci_transport_packet *pkt,
                                   bool bottom_half,
                                   struct sockaddr_vm *dst,
                                   struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        struct vsock_sock *vsk;

        vsk = vsock_sk(sk);

        PKT_FIELD(vsk, peer_waiting_read) = true;
        memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
               sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));

        if (vmci_transport_notify_waiting_read(vsk)) {
                bool sent;

                if (bottom_half)
                        sent = vmci_transport_send_wrote_bh(dst, src) > 0;
                else
                        sent = vmci_transport_send_wrote(sk) > 0;

                if (sent)
                        PKT_FIELD(vsk, peer_waiting_read) = false;
        }
#endif
}

static void
vmci_transport_handle_waiting_write(struct sock *sk,
                                    struct vmci_transport_packet *pkt,
                                    bool bottom_half,
                                    struct sockaddr_vm *dst,
                                    struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        struct vsock_sock *vsk;

        vsk = vsock_sk(sk);

        PKT_FIELD(vsk, peer_waiting_write) = true;
        memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
               sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));

        if (vmci_transport_notify_waiting_write(vsk)) {
                bool sent;

                if (bottom_half)
                        sent = vmci_transport_send_read_bh(dst, src) > 0;
                else
                        sent = vmci_transport_send_read(sk) > 0;

                if (sent)
                        PKT_FIELD(vsk, peer_waiting_write) = false;
        }
#endif
}

static void
vmci_transport_handle_read(struct sock *sk,
                           struct vmci_transport_packet *pkt,
                           bool bottom_half,
                           struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        struct vsock_sock *vsk;

        vsk = vsock_sk(sk);
        PKT_FIELD(vsk, sent_waiting_write) = false;
#endif

        sk->sk_write_space(sk);
}

static bool send_waiting_read(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        struct vsock_sock *vsk;
        struct vmci_transport_waiting_info waiting_info;
        u64 tail;
        u64 head;
        u64 room_left;
        bool ret;

        vsk = vsock_sk(sk);

        if (PKT_FIELD(vsk, sent_waiting_read))
                return true;

        if (PKT_FIELD(vsk, write_notify_window) <
            vmci_trans(vsk)->consume_size)
                PKT_FIELD(vsk, write_notify_window) =
                        min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
                            vmci_trans(vsk)->consume_size);

        vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
        room_left = vmci_trans(vsk)->consume_size - head;
        if (room_needed >= room_left) {
                waiting_info.offset = room_needed - room_left;
                waiting_info.generation =
                        PKT_FIELD(vsk, consume_q_generation) + 1;
        } else {
                waiting_info.offset = head + room_needed;
                waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
        }

        ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
        if (ret)
                PKT_FIELD(vsk, sent_waiting_read) = true;

        return ret;
#else
        return true;
#endif
}
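
/* Send a WAITING_WRITE notification: we are blocked until at least
 * room_needed bytes of space open up in our produce queue. The
 * offset/generation pair describes where in the circular queue we are
 * waiting (the generation counts wrap-arounds of the queue); as noted in
 * vmci_transport_notify_waiting_write(), the peer currently ignores this
 * detail and simply checks free space before replying with a READ.
 */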

static bool send_waiting_write(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        struct vsock_sock *vsk;
        struct vmci_transport_waiting_info waiting_info;
        u64 tail;
        u64 head;
        u64 room_left;
        bool ret;

        vsk = vsock_sk(sk);

        if (PKT_FIELD(vsk, sent_waiting_write))
                return true;

        vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
        room_left = vmci_trans(vsk)->produce_size - tail;
        if (room_needed + 1 >= room_left) {
                /* Wraps around to current generation. */
                waiting_info.offset = room_needed + 1 - room_left;
                waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
        } else {
                waiting_info.offset = tail + room_needed + 1;
                waiting_info.generation =
                        PKT_FIELD(vsk, produce_q_generation) - 1;
        }

        ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
        if (ret)
                PKT_FIELD(vsk, sent_waiting_write) = true;

        return ret;
#else
        return true;
#endif
}

static int vmci_transport_send_read_notification(struct sock *sk)
{
        struct vsock_sock *vsk;
        bool sent_read;
        unsigned int retries;
        int err;

        vsk = vsock_sk(sk);
        sent_read = false;
        retries = 0;
        err = 0;

        if (vmci_transport_notify_waiting_write(vsk)) {
                /* Notify the peer that we have read, retrying the send on
                 * failure up to our maximum value. XXX For now we just log
                 * the failure, but later we should schedule a work item to
                 * handle the resend until it succeeds. That would require
                 * keeping track of work items in the vsk and cleaning them up
                 * upon socket close.
                 */
                while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
                       !sent_read &&
                       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
                        err = vmci_transport_send_read(sk);
                        if (err >= 0)
                                sent_read = true;

                        retries++;
                }

                if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
                        pr_err("%p unable to send read notify to peer\n", sk);
                } else {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
                        PKT_FIELD(vsk, peer_waiting_write) = false;
#endif
                }
        }
        return err;
}
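
/* The peer sent a WROTE notification: new data is available in our consume
 * queue. Any outstanding WAITING_READ from us has therefore been answered,
 * so clear sent_waiting_read and wake up readers.
 */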

static void
vmci_transport_handle_wrote(struct sock *sk,
                            struct vmci_transport_packet *pkt,
                            bool bottom_half,
                            struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        struct vsock_sock *vsk = vsock_sk(sk);

        PKT_FIELD(vsk, sent_waiting_read) = false;
#endif
        sk->sk_data_ready(sk);
}

static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
{
        struct vsock_sock *vsk = vsock_sk(sk);

        PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
        PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
        PKT_FIELD(vsk, peer_waiting_read) = false;
        PKT_FIELD(vsk, peer_waiting_write) = false;
        PKT_FIELD(vsk, peer_waiting_write_detected) = false;
        PKT_FIELD(vsk, sent_waiting_read) = false;
        PKT_FIELD(vsk, sent_waiting_write) = false;
        PKT_FIELD(vsk, produce_q_generation) = 0;
        PKT_FIELD(vsk, consume_q_generation) = 0;

        memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
               sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
        memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
               sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
}

static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
}

static int
vmci_transport_notify_pkt_poll_in(struct sock *sk,
                                  size_t target, bool *data_ready_now)
{
        struct vsock_sock *vsk = vsock_sk(sk);

        if (vsock_stream_has_data(vsk)) {
                *data_ready_now = true;
        } else {
                /* We can't read right now because there is nothing in the
                 * queue. Ask for notifications when there is something to
                 * read.
                 */
                if (sk->sk_state == TCP_ESTABLISHED) {
                        if (!send_waiting_read(sk, 1))
                                return -1;
                }
                *data_ready_now = false;
        }

        return 0;
}

static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
                                   size_t target, bool *space_avail_now)
{
        s64 produce_q_free_space;
        struct vsock_sock *vsk = vsock_sk(sk);

        produce_q_free_space = vsock_stream_has_space(vsk);
        if (produce_q_free_space > 0) {
                *space_avail_now = true;
                return 0;
        } else if (produce_q_free_space == 0) {
                /* This is a connected socket but we can't currently send data.
                 * Notify the peer that we are waiting if the queue is full. We
                 * only send a waiting write if the queue is full because
                 * otherwise we end up in an infinite WAITING_WRITE, READ,
                 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
                 * notification as a socket error, passing that back through
                 * the mask.
                 */
                if (!send_waiting_write(sk, 1))
                        return -1;

                *space_avail_now = false;
        }

        return 0;
}
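
/* Prepare the notify state for a receive. With flow control compiled in, a
 * caller waiting for target bytes raises write_notify_min_window to
 * target + 1; if the live write_notify_window has to be raised along with
 * it, the sender must be told before we block, which is what notify_on_block
 * tracks.
 */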

static int
vmci_transport_notify_pkt_recv_init(
                        struct sock *sk,
                        size_t target,
                        struct vmci_transport_recv_notify_data *data)
{
        struct vsock_sock *vsk = vsock_sk(sk);

#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
        data->consume_head = 0;
        data->produce_tail = 0;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
        data->notify_on_block = false;

        if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
                PKT_FIELD(vsk, write_notify_min_window) = target + 1;
                if (PKT_FIELD(vsk, write_notify_window) <
                    PKT_FIELD(vsk, write_notify_min_window)) {
                        /* If the current window is smaller than the new
                         * minimal window size, we need to reevaluate whether
                         * we need to notify the sender. If the number of ready
                         * bytes is smaller than the new window, we need to
                         * send a notification to the sender before we block.
                         */
                        PKT_FIELD(vsk, write_notify_window) =
                                PKT_FIELD(vsk, write_notify_min_window);
                        data->notify_on_block = true;
                }
        }
#endif
#endif

        return 0;
}

static int
vmci_transport_notify_pkt_recv_pre_block(
                        struct sock *sk,
                        size_t target,
                        struct vmci_transport_recv_notify_data *data)
{
        int err = 0;

        /* Notify our peer that we are waiting for data to read. */
        if (!send_waiting_read(sk, target)) {
                err = -EHOSTUNREACH;
                return err;
        }
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
        if (data->notify_on_block) {
                err = vmci_transport_send_read_notification(sk);
                if (err < 0)
                        return err;

                data->notify_on_block = false;
        }
#endif

        return err;
}

static int
vmci_transport_notify_pkt_recv_pre_dequeue(
                        struct sock *sk,
                        size_t target,
                        struct vmci_transport_recv_notify_data *data)
{
        struct vsock_sock *vsk = vsock_sk(sk);

        /* Now consume up to len bytes from the queue. Note that since we have
         * the socket locked we should copy at least ready bytes.
         */
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
                                       &data->produce_tail,
                                       &data->consume_head);
#endif

        return 0;
}

static int
vmci_transport_notify_pkt_recv_post_dequeue(
                        struct sock *sk,
                        size_t target,
                        ssize_t copied,
                        bool data_read,
                        struct vmci_transport_recv_notify_data *data)
{
        struct vsock_sock *vsk;
        int err;

        vsk = vsock_sk(sk);
        err = 0;

        if (data_read) {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
                /* Detect a wrap-around to maintain queue generation. Note
                 * that this is safe since we hold the socket lock across the
                 * two queue pair operations.
                 */
                if (copied >=
                    vmci_trans(vsk)->consume_size - data->consume_head)
                        PKT_FIELD(vsk, consume_q_generation)++;
#endif

                err = vmci_transport_send_read_notification(sk);
                if (err < 0)
                        return err;
        }
        return err;
}
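
/* Illustrative numbers (chosen for exposition) for the wrap-around check
 * above: with a 64 KiB consume queue and consume_head at 60 KiB before the
 * dequeue, copying 4 KiB or more moves the head past the end of the ring, so
 * consume_q_generation is bumped. send_waiting_read() pairs this generation
 * with the offset it advertises to the peer.
 */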

static int
vmci_transport_notify_pkt_send_init(
                        struct sock *sk,
                        struct vmci_transport_send_notify_data *data)
{
#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
        data->consume_head = 0;
        data->produce_tail = 0;
#endif

        return 0;
}

static int
vmci_transport_notify_pkt_send_pre_block(
                        struct sock *sk,
                        struct vmci_transport_send_notify_data *data)
{
        /* Notify our peer that we are waiting for room to write. */
        if (!send_waiting_write(sk, 1))
                return -EHOSTUNREACH;

        return 0;
}

static int
vmci_transport_notify_pkt_send_pre_enqueue(
                        struct sock *sk,
                        struct vmci_transport_send_notify_data *data)
{
        struct vsock_sock *vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
                                       &data->produce_tail,
                                       &data->consume_head);
#endif

        return 0;
}

static int
vmci_transport_notify_pkt_send_post_enqueue(
                        struct sock *sk,
                        ssize_t written,
                        struct vmci_transport_send_notify_data *data)
{
        int err = 0;
        struct vsock_sock *vsk;
        bool sent_wrote = false;
        int retries = 0;

        vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
        /* Detect a wrap-around to maintain queue generation. Note that this
         * is safe since we hold the socket lock across the two queue pair
         * operations.
         */
        if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
                PKT_FIELD(vsk, produce_q_generation)++;
#endif

        if (vmci_transport_notify_waiting_read(vsk)) {
                /* Notify the peer that we have written, retrying the send on
                 * failure up to our maximum value. See the XXX comment for the
                 * corresponding piece of code in StreamRecvmsg() for potential
                 * improvements.
                 */
                while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
                       !sent_wrote &&
                       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
                        err = vmci_transport_send_wrote(sk);
                        if (err >= 0)
                                sent_wrote = true;

                        retries++;
                }

                if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
                        pr_err("%p unable to send wrote notify to peer\n", sk);
                        return err;
                } else {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
                        PKT_FIELD(vsk, peer_waiting_read) = false;
#endif
                }
        }
        return err;
}
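
/* Dispatch an incoming notification packet. WROTE and READ update our own
 * bookkeeping and wake up readers/writers respectively; WAITING_WRITE and
 * WAITING_READ record the peer's wait state so we know when a READ or WROTE
 * notification is due. *pkt_processed tells the caller whether the packet
 * type was handled here.
 */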

static void
vmci_transport_notify_pkt_handle_pkt(
                        struct sock *sk,
                        struct vmci_transport_packet *pkt,
                        bool bottom_half,
                        struct sockaddr_vm *dst,
                        struct sockaddr_vm *src, bool *pkt_processed)
{
        bool processed = false;

        switch (pkt->type) {
        case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
                vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
                processed = true;
                break;
        case VMCI_TRANSPORT_PACKET_TYPE_READ:
                vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
                processed = true;
                break;
        case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
                vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
                                                    dst, src);
                processed = true;
                break;
        case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
                vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
                                                   dst, src);
                processed = true;
                break;
        }

        if (pkt_processed)
                *pkt_processed = processed;
}

static void vmci_transport_notify_pkt_process_request(struct sock *sk)
{
        struct vsock_sock *vsk = vsock_sk(sk);

        PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
        if (vmci_trans(vsk)->consume_size <
            PKT_FIELD(vsk, write_notify_min_window))
                PKT_FIELD(vsk, write_notify_min_window) =
                        vmci_trans(vsk)->consume_size;
}

static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
{
        struct vsock_sock *vsk = vsock_sk(sk);

        PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
        if (vmci_trans(vsk)->consume_size <
            PKT_FIELD(vsk, write_notify_min_window))
                PKT_FIELD(vsk, write_notify_min_window) =
                        vmci_trans(vsk)->consume_size;
}

/* Socket control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
        .socket_init = vmci_transport_notify_pkt_socket_init,
        .socket_destruct = vmci_transport_notify_pkt_socket_destruct,
        .poll_in = vmci_transport_notify_pkt_poll_in,
        .poll_out = vmci_transport_notify_pkt_poll_out,
        .handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
        .recv_init = vmci_transport_notify_pkt_recv_init,
        .recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
        .recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
        .recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
        .send_init = vmci_transport_notify_pkt_send_init,
        .send_pre_block = vmci_transport_notify_pkt_send_pre_block,
        .send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
        .send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
        .process_request = vmci_transport_notify_pkt_process_request,
        .process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};