1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * PACKET - implements raw packet sockets. 7 * 8 * Authors: Ross Biro 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 10 * Alan Cox, <gw4pts@gw4pts.ampr.org> 11 * 12 * Fixes: 13 * Alan Cox : verify_area() now used correctly 14 * Alan Cox : new skbuff lists, look ma no backlogs! 15 * Alan Cox : tidied skbuff lists. 16 * Alan Cox : Now uses generic datagram routines I 17 * added. Also fixed the peek/read crash 18 * from all old Linux datagram code. 19 * Alan Cox : Uses the improved datagram code. 20 * Alan Cox : Added NULL's for socket options. 21 * Alan Cox : Re-commented the code. 22 * Alan Cox : Use new kernel side addressing 23 * Rob Janssen : Correct MTU usage. 24 * Dave Platt : Counter leaks caused by incorrect 25 * interrupt locking and some slightly 26 * dubious gcc output. Can you read 27 * compiler: it said _VOLATILE_ 28 * Richard Kooijman : Timestamp fixes. 29 * Alan Cox : New buffers. Use sk->mac.raw. 30 * Alan Cox : sendmsg/recvmsg support. 31 * Alan Cox : Protocol setting support 32 * Alexey Kuznetsov : Untied from IPv4 stack. 33 * Cyrus Durgin : Fixed kerneld for kmod. 34 * Michal Ostrowski : Module initialization cleanup. 35 * Ulises Alonso : Frame number limit removal and 36 * packet_set_ring memory leak. 37 * Eric Biederman : Allow for > 8 byte hardware addresses. 38 * The convention is that longer addresses 39 * will simply extend the hardware address 40 * byte arrays at the end of sockaddr_ll 41 * and packet_mreq. 42 * Johann Baudy : Added TX RING. 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction 44 * layer. 
45 * Copyright (C) 2011, <lokec@ccs.neu.edu> 46 * 47 * 48 * This program is free software; you can redistribute it and/or 49 * modify it under the terms of the GNU General Public License 50 * as published by the Free Software Foundation; either version 51 * 2 of the License, or (at your option) any later version. 52 * 53 */ 54 55 #include <linux/types.h> 56 #include <linux/mm.h> 57 #include <linux/capability.h> 58 #include <linux/fcntl.h> 59 #include <linux/socket.h> 60 #include <linux/in.h> 61 #include <linux/inet.h> 62 #include <linux/netdevice.h> 63 #include <linux/if_packet.h> 64 #include <linux/wireless.h> 65 #include <linux/kernel.h> 66 #include <linux/kmod.h> 67 #include <linux/slab.h> 68 #include <linux/vmalloc.h> 69 #include <net/net_namespace.h> 70 #include <net/ip.h> 71 #include <net/protocol.h> 72 #include <linux/skbuff.h> 73 #include <net/sock.h> 74 #include <linux/errno.h> 75 #include <linux/timer.h> 76 #include <asm/uaccess.h> 77 #include <asm/ioctls.h> 78 #include <asm/page.h> 79 #include <asm/cacheflush.h> 80 #include <asm/io.h> 81 #include <linux/proc_fs.h> 82 #include <linux/seq_file.h> 83 #include <linux/poll.h> 84 #include <linux/module.h> 85 #include <linux/init.h> 86 #include <linux/mutex.h> 87 #include <linux/if_vlan.h> 88 #include <linux/virtio_net.h> 89 #include <linux/errqueue.h> 90 #include <linux/net_tstamp.h> 91 #include <linux/reciprocal_div.h> 92 #ifdef CONFIG_INET 93 #include <net/inet_common.h> 94 #endif 95 96 #include "internal.h" 97 98 /* 99 Assumptions: 100 - if device has no dev->hard_header routine, it adds and removes ll header 101 inside itself. In this case ll header is invisible outside of device, 102 but higher levels still should reserve dev->hard_header_len. 103 Some devices are enough clever to reallocate skb, when header 104 will not fit to reserved space (tunnel), another ones are silly 105 (PPP). 106 - packet socket receives packets with pulled ll header, 107 so that SOCK_RAW should push it back. 
On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the
		 ll header. PPP does this, which is wrong because it
		 introduces an asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 *	a longer address field.
153 */ 154 struct packet_mreq_max { 155 int mr_ifindex; 156 unsigned short mr_type; 157 unsigned short mr_alen; 158 unsigned char mr_address[MAX_ADDR_LEN]; 159 }; 160 161 union tpacket_uhdr { 162 struct tpacket_hdr *h1; 163 struct tpacket2_hdr *h2; 164 struct tpacket3_hdr *h3; 165 void *raw; 166 }; 167 168 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, 169 int closing, int tx_ring); 170 171 #define V3_ALIGNMENT (8) 172 173 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) 174 175 #define BLK_PLUS_PRIV(sz_of_priv) \ 176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) 177 178 #define PGV_FROM_VMALLOC 1 179 180 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) 181 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) 182 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) 183 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) 184 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) 185 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) 186 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x))) 187 188 struct packet_sock; 189 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); 190 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, 191 struct packet_type *pt, struct net_device *orig_dev); 192 193 static void *packet_previous_frame(struct packet_sock *po, 194 struct packet_ring_buffer *rb, 195 int status); 196 static void packet_increment_head(struct packet_ring_buffer *buff); 197 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *, 198 struct tpacket_block_desc *); 199 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, 200 struct packet_sock *); 201 static void prb_retire_current_block(struct tpacket_kbdq_core *, 202 struct packet_sock *, unsigned int status); 203 static int prb_queue_frozen(struct tpacket_kbdq_core *); 204 static void prb_open_block(struct tpacket_kbdq_core *, 205 struct tpacket_block_desc *); 206 static void prb_retire_rx_blk_timer_expired(unsigned 
long); 207 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); 208 static void prb_init_blk_timer(struct packet_sock *, 209 struct tpacket_kbdq_core *, 210 void (*func) (unsigned long)); 211 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); 212 static void prb_clear_rxhash(struct tpacket_kbdq_core *, 213 struct tpacket3_hdr *); 214 static void prb_fill_vlan_info(struct tpacket_kbdq_core *, 215 struct tpacket3_hdr *); 216 static void packet_flush_mclist(struct sock *sk); 217 218 struct packet_skb_cb { 219 unsigned int origlen; 220 union { 221 struct sockaddr_pkt pkt; 222 struct sockaddr_ll ll; 223 } sa; 224 }; 225 226 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) 227 228 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) 229 #define GET_PBLOCK_DESC(x, bid) \ 230 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) 231 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ 232 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) 233 #define GET_NEXT_PRB_BLK_NUM(x) \ 234 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? 
\ 235 ((x)->kactive_blk_num+1) : 0) 236 237 static void __fanout_unlink(struct sock *sk, struct packet_sock *po); 238 static void __fanout_link(struct sock *sk, struct packet_sock *po); 239 240 static int packet_direct_xmit(struct sk_buff *skb) 241 { 242 struct net_device *dev = skb->dev; 243 const struct net_device_ops *ops = dev->netdev_ops; 244 netdev_features_t features; 245 struct netdev_queue *txq; 246 u16 queue_map; 247 int ret; 248 249 if (unlikely(!netif_running(dev) || 250 !netif_carrier_ok(dev))) { 251 kfree_skb(skb); 252 return NET_XMIT_DROP; 253 } 254 255 features = netif_skb_features(skb); 256 if (skb_needs_linearize(skb, features) && 257 __skb_linearize(skb)) { 258 kfree_skb(skb); 259 return NET_XMIT_DROP; 260 } 261 262 queue_map = skb_get_queue_mapping(skb); 263 txq = netdev_get_tx_queue(dev, queue_map); 264 265 __netif_tx_lock_bh(txq); 266 if (unlikely(netif_xmit_frozen_or_stopped(txq))) { 267 ret = NETDEV_TX_BUSY; 268 kfree_skb(skb); 269 goto out; 270 } 271 272 ret = ops->ndo_start_xmit(skb, dev); 273 if (likely(dev_xmit_complete(ret))) 274 txq_trans_update(txq); 275 else 276 kfree_skb(skb); 277 out: 278 __netif_tx_unlock_bh(txq); 279 return ret; 280 } 281 282 static struct net_device *packet_cached_dev_get(struct packet_sock *po) 283 { 284 struct net_device *dev; 285 286 rcu_read_lock(); 287 dev = rcu_dereference(po->cached_dev); 288 if (likely(dev)) 289 dev_hold(dev); 290 rcu_read_unlock(); 291 292 return dev; 293 } 294 295 static void packet_cached_dev_assign(struct packet_sock *po, 296 struct net_device *dev) 297 { 298 rcu_assign_pointer(po->cached_dev, dev); 299 } 300 301 static void packet_cached_dev_reset(struct packet_sock *po) 302 { 303 RCU_INIT_POINTER(po->cached_dev, NULL); 304 } 305 306 static bool packet_use_direct_xmit(const struct packet_sock *po) 307 { 308 return po->xmit == packet_direct_xmit; 309 } 310 311 static u16 packet_pick_tx_queue(struct net_device *dev) 312 { 313 return (u16) raw_smp_processor_id() % 
dev->real_num_tx_queues; 314 } 315 316 /* register_prot_hook must be invoked with the po->bind_lock held, 317 * or from a context in which asynchronous accesses to the packet 318 * socket is not possible (packet_create()). 319 */ 320 static void register_prot_hook(struct sock *sk) 321 { 322 struct packet_sock *po = pkt_sk(sk); 323 324 if (!po->running) { 325 if (po->fanout) 326 __fanout_link(sk, po); 327 else 328 dev_add_pack(&po->prot_hook); 329 330 sock_hold(sk); 331 po->running = 1; 332 } 333 } 334 335 /* {,__}unregister_prot_hook() must be invoked with the po->bind_lock 336 * held. If the sync parameter is true, we will temporarily drop 337 * the po->bind_lock and do a synchronize_net to make sure no 338 * asynchronous packet processing paths still refer to the elements 339 * of po->prot_hook. If the sync parameter is false, it is the 340 * callers responsibility to take care of this. 341 */ 342 static void __unregister_prot_hook(struct sock *sk, bool sync) 343 { 344 struct packet_sock *po = pkt_sk(sk); 345 346 po->running = 0; 347 348 if (po->fanout) 349 __fanout_unlink(sk, po); 350 else 351 __dev_remove_pack(&po->prot_hook); 352 353 __sock_put(sk); 354 355 if (sync) { 356 spin_unlock(&po->bind_lock); 357 synchronize_net(); 358 spin_lock(&po->bind_lock); 359 } 360 } 361 362 static void unregister_prot_hook(struct sock *sk, bool sync) 363 { 364 struct packet_sock *po = pkt_sk(sk); 365 366 if (po->running) 367 __unregister_prot_hook(sk, sync); 368 } 369 370 static inline __pure struct page *pgv_to_page(void *addr) 371 { 372 if (is_vmalloc_addr(addr)) 373 return vmalloc_to_page(addr); 374 return virt_to_page(addr); 375 } 376 377 static void __packet_set_status(struct packet_sock *po, void *frame, int status) 378 { 379 union tpacket_uhdr h; 380 381 h.raw = frame; 382 switch (po->tp_version) { 383 case TPACKET_V1: 384 h.h1->tp_status = status; 385 flush_dcache_page(pgv_to_page(&h.h1->tp_status)); 386 break; 387 case TPACKET_V2: 388 h.h2->tp_status = status; 389 
flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 390 break; 391 case TPACKET_V3: 392 default: 393 WARN(1, "TPACKET version not supported.\n"); 394 BUG(); 395 } 396 397 smp_wmb(); 398 } 399 400 static int __packet_get_status(struct packet_sock *po, void *frame) 401 { 402 union tpacket_uhdr h; 403 404 smp_rmb(); 405 406 h.raw = frame; 407 switch (po->tp_version) { 408 case TPACKET_V1: 409 flush_dcache_page(pgv_to_page(&h.h1->tp_status)); 410 return h.h1->tp_status; 411 case TPACKET_V2: 412 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 413 return h.h2->tp_status; 414 case TPACKET_V3: 415 default: 416 WARN(1, "TPACKET version not supported.\n"); 417 BUG(); 418 return 0; 419 } 420 } 421 422 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, 423 unsigned int flags) 424 { 425 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); 426 427 if (shhwtstamps) { 428 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && 429 ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) 430 return TP_STATUS_TS_SYS_HARDWARE; 431 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && 432 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) 433 return TP_STATUS_TS_RAW_HARDWARE; 434 } 435 436 if (ktime_to_timespec_cond(skb->tstamp, ts)) 437 return TP_STATUS_TS_SOFTWARE; 438 439 return 0; 440 } 441 442 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, 443 struct sk_buff *skb) 444 { 445 union tpacket_uhdr h; 446 struct timespec ts; 447 __u32 ts_status; 448 449 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) 450 return 0; 451 452 h.raw = frame; 453 switch (po->tp_version) { 454 case TPACKET_V1: 455 h.h1->tp_sec = ts.tv_sec; 456 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; 457 break; 458 case TPACKET_V2: 459 h.h2->tp_sec = ts.tv_sec; 460 h.h2->tp_nsec = ts.tv_nsec; 461 break; 462 case TPACKET_V3: 463 default: 464 WARN(1, "TPACKET version not supported.\n"); 465 BUG(); 466 } 467 468 /* one flush is safe, as both fields always lie on the 
same cacheline */ 469 flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); 470 smp_wmb(); 471 472 return ts_status; 473 } 474 475 static void *packet_lookup_frame(struct packet_sock *po, 476 struct packet_ring_buffer *rb, 477 unsigned int position, 478 int status) 479 { 480 unsigned int pg_vec_pos, frame_offset; 481 union tpacket_uhdr h; 482 483 pg_vec_pos = position / rb->frames_per_block; 484 frame_offset = position % rb->frames_per_block; 485 486 h.raw = rb->pg_vec[pg_vec_pos].buffer + 487 (frame_offset * rb->frame_size); 488 489 if (status != __packet_get_status(po, h.raw)) 490 return NULL; 491 492 return h.raw; 493 } 494 495 static void *packet_current_frame(struct packet_sock *po, 496 struct packet_ring_buffer *rb, 497 int status) 498 { 499 return packet_lookup_frame(po, rb, rb->head, status); 500 } 501 502 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) 503 { 504 del_timer_sync(&pkc->retire_blk_timer); 505 } 506 507 static void prb_shutdown_retire_blk_timer(struct packet_sock *po, 508 int tx_ring, 509 struct sk_buff_head *rb_queue) 510 { 511 struct tpacket_kbdq_core *pkc; 512 513 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : 514 GET_PBDQC_FROM_RB(&po->rx_ring); 515 516 spin_lock_bh(&rb_queue->lock); 517 pkc->delete_blk_timer = 1; 518 spin_unlock_bh(&rb_queue->lock); 519 520 prb_del_retire_blk_timer(pkc); 521 } 522 523 static void prb_init_blk_timer(struct packet_sock *po, 524 struct tpacket_kbdq_core *pkc, 525 void (*func) (unsigned long)) 526 { 527 init_timer(&pkc->retire_blk_timer); 528 pkc->retire_blk_timer.data = (long)po; 529 pkc->retire_blk_timer.function = func; 530 pkc->retire_blk_timer.expires = jiffies; 531 } 532 533 static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) 534 { 535 struct tpacket_kbdq_core *pkc; 536 537 if (tx_ring) 538 BUG(); 539 540 pkc = tx_ring ? 
GET_PBDQC_FROM_RB(&po->tx_ring) : 541 GET_PBDQC_FROM_RB(&po->rx_ring); 542 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); 543 } 544 545 static int prb_calc_retire_blk_tmo(struct packet_sock *po, 546 int blk_size_in_bytes) 547 { 548 struct net_device *dev; 549 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; 550 struct ethtool_cmd ecmd; 551 int err; 552 u32 speed; 553 554 rtnl_lock(); 555 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); 556 if (unlikely(!dev)) { 557 rtnl_unlock(); 558 return DEFAULT_PRB_RETIRE_TOV; 559 } 560 err = __ethtool_get_settings(dev, &ecmd); 561 speed = ethtool_cmd_speed(&ecmd); 562 rtnl_unlock(); 563 if (!err) { 564 /* 565 * If the link speed is so slow you don't really 566 * need to worry about perf anyways 567 */ 568 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) { 569 return DEFAULT_PRB_RETIRE_TOV; 570 } else { 571 msec = 1; 572 div = speed / 1000; 573 } 574 } 575 576 mbits = (blk_size_in_bytes * 8) / (1024 * 1024); 577 578 if (div) 579 mbits /= div; 580 581 tmo = mbits * msec; 582 583 if (div) 584 return tmo+1; 585 return tmo; 586 } 587 588 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, 589 union tpacket_req_u *req_u) 590 { 591 p1->feature_req_word = req_u->req3.tp_feature_req_word; 592 } 593 594 static void init_prb_bdqc(struct packet_sock *po, 595 struct packet_ring_buffer *rb, 596 struct pgv *pg_vec, 597 union tpacket_req_u *req_u, int tx_ring) 598 { 599 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); 600 struct tpacket_block_desc *pbd; 601 602 memset(p1, 0x0, sizeof(*p1)); 603 604 p1->knxt_seq_num = 1; 605 p1->pkbdq = pg_vec; 606 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; 607 p1->pkblk_start = pg_vec[0].buffer; 608 p1->kblk_size = req_u->req3.tp_block_size; 609 p1->knum_blocks = req_u->req3.tp_block_nr; 610 p1->hdrlen = po->tp_hdrlen; 611 p1->version = po->tp_version; 612 p1->last_kactive_blk_num = 0; 613 po->stats.stats3.tp_freeze_q_cnt = 0; 614 if 
(req_u->req3.tp_retire_blk_tov) 615 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; 616 else 617 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, 618 req_u->req3.tp_block_size); 619 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); 620 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; 621 622 prb_init_ft_ops(p1, req_u); 623 prb_setup_retire_blk_timer(po, tx_ring); 624 prb_open_block(p1, pbd); 625 } 626 627 /* Do NOT update the last_blk_num first. 628 * Assumes sk_buff_head lock is held. 629 */ 630 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) 631 { 632 mod_timer(&pkc->retire_blk_timer, 633 jiffies + pkc->tov_in_jiffies); 634 pkc->last_kactive_blk_num = pkc->kactive_blk_num; 635 } 636 637 /* 638 * Timer logic: 639 * 1) We refresh the timer only when we open a block. 640 * By doing this we don't waste cycles refreshing the timer 641 * on packet-by-packet basis. 642 * 643 * With a 1MB block-size, on a 1Gbps line, it will take 644 * i) ~8 ms to fill a block + ii) memcpy etc. 645 * In this cut we are not accounting for the memcpy time. 646 * 647 * So, if the user sets the 'tmo' to 10ms then the timer 648 * will never fire while the block is still getting filled 649 * (which is what we want). However, the user could choose 650 * to close a block early and that's fine. 651 * 652 * But when the timer does fire, we check whether or not to refresh it. 653 * Since the tmo granularity is in msecs, it is not too expensive 654 * to refresh the timer, lets say every '8' msecs. 655 * Either the user can set the 'tmo' or we can derive it based on 656 * a) line-speed and b) block-size. 657 * prb_calc_retire_blk_tmo() calculates the tmo. 
658 * 659 */ 660 static void prb_retire_rx_blk_timer_expired(unsigned long data) 661 { 662 struct packet_sock *po = (struct packet_sock *)data; 663 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); 664 unsigned int frozen; 665 struct tpacket_block_desc *pbd; 666 667 spin_lock(&po->sk.sk_receive_queue.lock); 668 669 frozen = prb_queue_frozen(pkc); 670 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 671 672 if (unlikely(pkc->delete_blk_timer)) 673 goto out; 674 675 /* We only need to plug the race when the block is partially filled. 676 * tpacket_rcv: 677 * lock(); increment BLOCK_NUM_PKTS; unlock() 678 * copy_bits() is in progress ... 679 * timer fires on other cpu: 680 * we can't retire the current block because copy_bits 681 * is in progress. 682 * 683 */ 684 if (BLOCK_NUM_PKTS(pbd)) { 685 while (atomic_read(&pkc->blk_fill_in_prog)) { 686 /* Waiting for skb_copy_bits to finish... */ 687 cpu_relax(); 688 } 689 } 690 691 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { 692 if (!frozen) { 693 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); 694 if (!prb_dispatch_next_block(pkc, po)) 695 goto refresh_timer; 696 else 697 goto out; 698 } else { 699 /* Case 1. Queue was frozen because user-space was 700 * lagging behind. 701 */ 702 if (prb_curr_blk_in_use(pkc, pbd)) { 703 /* 704 * Ok, user-space is still behind. 705 * So just refresh the timer. 706 */ 707 goto refresh_timer; 708 } else { 709 /* Case 2. queue was frozen,user-space caught up, 710 * now the link went idle && the timer fired. 711 * We don't have a block to close.So we open this 712 * block and restart the timer. 713 * opening a block thaws the queue,restarts timer 714 * Thawing/timer-refresh is a side effect. 
715 */ 716 prb_open_block(pkc, pbd); 717 goto out; 718 } 719 } 720 } 721 722 refresh_timer: 723 _prb_refresh_rx_retire_blk_timer(pkc); 724 725 out: 726 spin_unlock(&po->sk.sk_receive_queue.lock); 727 } 728 729 static void prb_flush_block(struct tpacket_kbdq_core *pkc1, 730 struct tpacket_block_desc *pbd1, __u32 status) 731 { 732 /* Flush everything minus the block header */ 733 734 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 735 u8 *start, *end; 736 737 start = (u8 *)pbd1; 738 739 /* Skip the block header(we know header WILL fit in 4K) */ 740 start += PAGE_SIZE; 741 742 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); 743 for (; start < end; start += PAGE_SIZE) 744 flush_dcache_page(pgv_to_page(start)); 745 746 smp_wmb(); 747 #endif 748 749 /* Now update the block status. */ 750 751 BLOCK_STATUS(pbd1) = status; 752 753 /* Flush the block header */ 754 755 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 756 start = (u8 *)pbd1; 757 flush_dcache_page(pgv_to_page(start)); 758 759 smp_wmb(); 760 #endif 761 } 762 763 /* 764 * Side effect: 765 * 766 * 1) flush the block 767 * 2) Increment active_blk_num 768 * 769 * Note:We DONT refresh the timer on purpose. 770 * Because almost always the next block will be opened. 
771 */ 772 static void prb_close_block(struct tpacket_kbdq_core *pkc1, 773 struct tpacket_block_desc *pbd1, 774 struct packet_sock *po, unsigned int stat) 775 { 776 __u32 status = TP_STATUS_USER | stat; 777 778 struct tpacket3_hdr *last_pkt; 779 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; 780 781 if (po->stats.stats3.tp_drops) 782 status |= TP_STATUS_LOSING; 783 784 last_pkt = (struct tpacket3_hdr *)pkc1->prev; 785 last_pkt->tp_next_offset = 0; 786 787 /* Get the ts of the last pkt */ 788 if (BLOCK_NUM_PKTS(pbd1)) { 789 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; 790 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; 791 } else { 792 /* Ok, we tmo'd - so get the current time */ 793 struct timespec ts; 794 getnstimeofday(&ts); 795 h1->ts_last_pkt.ts_sec = ts.tv_sec; 796 h1->ts_last_pkt.ts_nsec = ts.tv_nsec; 797 } 798 799 smp_wmb(); 800 801 /* Flush the block */ 802 prb_flush_block(pkc1, pbd1, status); 803 804 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); 805 } 806 807 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) 808 { 809 pkc->reset_pending_on_curr_blk = 0; 810 } 811 812 /* 813 * Side effect of opening a block: 814 * 815 * 1) prb_queue is thawed. 816 * 2) retire_blk_timer is refreshed. 
817 * 818 */ 819 static void prb_open_block(struct tpacket_kbdq_core *pkc1, 820 struct tpacket_block_desc *pbd1) 821 { 822 struct timespec ts; 823 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; 824 825 smp_rmb(); 826 827 /* We could have just memset this but we will lose the 828 * flexibility of making the priv area sticky 829 */ 830 831 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; 832 BLOCK_NUM_PKTS(pbd1) = 0; 833 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); 834 835 getnstimeofday(&ts); 836 837 h1->ts_first_pkt.ts_sec = ts.tv_sec; 838 h1->ts_first_pkt.ts_nsec = ts.tv_nsec; 839 840 pkc1->pkblk_start = (char *)pbd1; 841 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); 842 843 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); 844 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; 845 846 pbd1->version = pkc1->version; 847 pkc1->prev = pkc1->nxt_offset; 848 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; 849 850 prb_thaw_queue(pkc1); 851 _prb_refresh_rx_retire_blk_timer(pkc1); 852 853 smp_wmb(); 854 } 855 856 /* 857 * Queue freeze logic: 858 * 1) Assume tp_block_nr = 8 blocks. 859 * 2) At time 't0', user opens Rx ring. 860 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 861 * 4) user-space is either sleeping or processing block '0'. 862 * 5) tpacket_rcv is currently filling block '7', since there is no space left, 863 * it will close block-7,loop around and try to fill block '0'. 864 * call-flow: 865 * __packet_lookup_frame_in_block 866 * prb_retire_current_block() 867 * prb_dispatch_next_block() 868 * |->(BLOCK_STATUS == USER) evaluates to true 869 * 5.1) Since block-0 is currently in-use, we just freeze the queue. 870 * 6) Now there are two cases: 871 * 6.1) Link goes idle right after the queue is frozen. 872 * But remember, the last open_block() refreshed the timer. 873 * When this timer expires,it will refresh itself so that we can 874 * re-open block-0 in near future. 
875 * 6.2) Link is busy and keeps on receiving packets. This is a simple 876 * case and __packet_lookup_frame_in_block will check if block-0 877 * is free and can now be re-used. 878 */ 879 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, 880 struct packet_sock *po) 881 { 882 pkc->reset_pending_on_curr_blk = 1; 883 po->stats.stats3.tp_freeze_q_cnt++; 884 } 885 886 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) 887 888 /* 889 * If the next block is free then we will dispatch it 890 * and return a good offset. 891 * Else, we will freeze the queue. 892 * So, caller must check the return value. 893 */ 894 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, 895 struct packet_sock *po) 896 { 897 struct tpacket_block_desc *pbd; 898 899 smp_rmb(); 900 901 /* 1. Get current block num */ 902 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 903 904 /* 2. If this block is currently in_use then freeze the queue */ 905 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { 906 prb_freeze_queue(pkc, po); 907 return NULL; 908 } 909 910 /* 911 * 3. 912 * open this block and return the offset where the first packet 913 * needs to get stored. 914 */ 915 prb_open_block(pkc, pbd); 916 return (void *)pkc->nxt_offset; 917 } 918 919 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, 920 struct packet_sock *po, unsigned int status) 921 { 922 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 923 924 /* retire/close the current block */ 925 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { 926 /* 927 * Plug the case where copy_bits() is in progress on 928 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't 929 * have space to copy the pkt in the current block and 930 * called prb_retire_current_block() 931 * 932 * We don't need to worry about the TMO case because 933 * the timer-handler already handled this case. 
934 */ 935 if (!(status & TP_STATUS_BLK_TMO)) { 936 while (atomic_read(&pkc->blk_fill_in_prog)) { 937 /* Waiting for skb_copy_bits to finish... */ 938 cpu_relax(); 939 } 940 } 941 prb_close_block(pkc, pbd, po, status); 942 return; 943 } 944 } 945 946 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc, 947 struct tpacket_block_desc *pbd) 948 { 949 return TP_STATUS_USER & BLOCK_STATUS(pbd); 950 } 951 952 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) 953 { 954 return pkc->reset_pending_on_curr_blk; 955 } 956 957 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) 958 { 959 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); 960 atomic_dec(&pkc->blk_fill_in_prog); 961 } 962 963 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, 964 struct tpacket3_hdr *ppd) 965 { 966 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); 967 } 968 969 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, 970 struct tpacket3_hdr *ppd) 971 { 972 ppd->hv1.tp_rxhash = 0; 973 } 974 975 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, 976 struct tpacket3_hdr *ppd) 977 { 978 if (vlan_tx_tag_present(pkc->skb)) { 979 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); 980 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); 981 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; 982 } else { 983 ppd->hv1.tp_vlan_tci = 0; 984 ppd->hv1.tp_vlan_tpid = 0; 985 ppd->tp_status = TP_STATUS_AVAILABLE; 986 } 987 } 988 989 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, 990 struct tpacket3_hdr *ppd) 991 { 992 ppd->hv1.tp_padding = 0; 993 prb_fill_vlan_info(pkc, ppd); 994 995 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) 996 prb_fill_rxhash(pkc, ppd); 997 else 998 prb_clear_rxhash(pkc, ppd); 999 } 1000 1001 static void prb_fill_curr_block(char *curr, 1002 struct tpacket_kbdq_core *pkc, 1003 struct tpacket_block_desc *pbd, 1004 unsigned int len) 1005 { 1006 struct tpacket3_hdr *ppd; 1007 1008 ppd = (struct 
tpacket3_hdr *)curr; 1009 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); 1010 pkc->prev = curr; 1011 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); 1012 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); 1013 BLOCK_NUM_PKTS(pbd) += 1; 1014 atomic_inc(&pkc->blk_fill_in_prog); 1015 prb_run_all_ft_ops(pkc, ppd); 1016 } 1017 1018 /* Assumes caller has the sk->rx_queue.lock */ 1019 static void *__packet_lookup_frame_in_block(struct packet_sock *po, 1020 struct sk_buff *skb, 1021 int status, 1022 unsigned int len 1023 ) 1024 { 1025 struct tpacket_kbdq_core *pkc; 1026 struct tpacket_block_desc *pbd; 1027 char *curr, *end; 1028 1029 pkc = GET_PBDQC_FROM_RB(&po->rx_ring); 1030 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 1031 1032 /* Queue is frozen when user space is lagging behind */ 1033 if (prb_queue_frozen(pkc)) { 1034 /* 1035 * Check if that last block which caused the queue to freeze, 1036 * is still in_use by user-space. 1037 */ 1038 if (prb_curr_blk_in_use(pkc, pbd)) { 1039 /* Can't record this packet */ 1040 return NULL; 1041 } else { 1042 /* 1043 * Ok, the block was released by user-space. 1044 * Now let's open that block. 1045 * opening a block also thaws the queue. 1046 * Thawing is a side effect. 1047 */ 1048 prb_open_block(pkc, pbd); 1049 } 1050 } 1051 1052 smp_mb(); 1053 curr = pkc->nxt_offset; 1054 pkc->skb = skb; 1055 end = (char *)pbd + pkc->kblk_size; 1056 1057 /* first try the current block */ 1058 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { 1059 prb_fill_curr_block(curr, pkc, pbd, len); 1060 return (void *)curr; 1061 } 1062 1063 /* Ok, close the current block */ 1064 prb_retire_current_block(pkc, po, 0); 1065 1066 /* Now, try to dispatch the next block */ 1067 curr = (char *)prb_dispatch_next_block(pkc, po); 1068 if (curr) { 1069 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 1070 prb_fill_curr_block(curr, pkc, pbd, len); 1071 return (void *)curr; 1072 } 1073 1074 /* 1075 * No free blocks are available.user_space hasn't caught up yet. 
1076 * Queue was just frozen and now this packet will get dropped. 1077 */ 1078 return NULL; 1079 } 1080 1081 static void *packet_current_rx_frame(struct packet_sock *po, 1082 struct sk_buff *skb, 1083 int status, unsigned int len) 1084 { 1085 char *curr = NULL; 1086 switch (po->tp_version) { 1087 case TPACKET_V1: 1088 case TPACKET_V2: 1089 curr = packet_lookup_frame(po, &po->rx_ring, 1090 po->rx_ring.head, status); 1091 return curr; 1092 case TPACKET_V3: 1093 return __packet_lookup_frame_in_block(po, skb, status, len); 1094 default: 1095 WARN(1, "TPACKET version not supported\n"); 1096 BUG(); 1097 return NULL; 1098 } 1099 } 1100 1101 static void *prb_lookup_block(struct packet_sock *po, 1102 struct packet_ring_buffer *rb, 1103 unsigned int idx, 1104 int status) 1105 { 1106 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); 1107 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); 1108 1109 if (status != BLOCK_STATUS(pbd)) 1110 return NULL; 1111 return pbd; 1112 } 1113 1114 static int prb_previous_blk_num(struct packet_ring_buffer *rb) 1115 { 1116 unsigned int prev; 1117 if (rb->prb_bdqc.kactive_blk_num) 1118 prev = rb->prb_bdqc.kactive_blk_num-1; 1119 else 1120 prev = rb->prb_bdqc.knum_blocks-1; 1121 return prev; 1122 } 1123 1124 /* Assumes caller has held the rx_queue.lock */ 1125 static void *__prb_previous_block(struct packet_sock *po, 1126 struct packet_ring_buffer *rb, 1127 int status) 1128 { 1129 unsigned int previous = prb_previous_blk_num(rb); 1130 return prb_lookup_block(po, rb, previous, status); 1131 } 1132 1133 static void *packet_previous_rx_frame(struct packet_sock *po, 1134 struct packet_ring_buffer *rb, 1135 int status) 1136 { 1137 if (po->tp_version <= TPACKET_V2) 1138 return packet_previous_frame(po, rb, status); 1139 1140 return __prb_previous_block(po, rb, status); 1141 } 1142 1143 static void packet_increment_rx_head(struct packet_sock *po, 1144 struct packet_ring_buffer *rb) 1145 { 1146 switch (po->tp_version) { 1147 case 
TPACKET_V1: 1148 case TPACKET_V2: 1149 return packet_increment_head(rb); 1150 case TPACKET_V3: 1151 default: 1152 WARN(1, "TPACKET version not supported.\n"); 1153 BUG(); 1154 return; 1155 } 1156 } 1157 1158 static void *packet_previous_frame(struct packet_sock *po, 1159 struct packet_ring_buffer *rb, 1160 int status) 1161 { 1162 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; 1163 return packet_lookup_frame(po, rb, previous, status); 1164 } 1165 1166 static void packet_increment_head(struct packet_ring_buffer *buff) 1167 { 1168 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; 1169 } 1170 1171 static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) 1172 { 1173 struct sock *sk = &po->sk; 1174 bool has_room; 1175 1176 if (po->prot_hook.func != tpacket_rcv) 1177 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) 1178 <= sk->sk_rcvbuf; 1179 1180 spin_lock(&sk->sk_receive_queue.lock); 1181 if (po->tp_version == TPACKET_V3) 1182 has_room = prb_lookup_block(po, &po->rx_ring, 1183 po->rx_ring.prb_bdqc.kactive_blk_num, 1184 TP_STATUS_KERNEL); 1185 else 1186 has_room = packet_lookup_frame(po, &po->rx_ring, 1187 po->rx_ring.head, 1188 TP_STATUS_KERNEL); 1189 spin_unlock(&sk->sk_receive_queue.lock); 1190 1191 return has_room; 1192 } 1193 1194 static void packet_sock_destruct(struct sock *sk) 1195 { 1196 skb_queue_purge(&sk->sk_error_queue); 1197 1198 WARN_ON(atomic_read(&sk->sk_rmem_alloc)); 1199 WARN_ON(atomic_read(&sk->sk_wmem_alloc)); 1200 1201 if (!sock_flag(sk, SOCK_DEAD)) { 1202 pr_err("Attempt to release alive packet socket: %p\n", sk); 1203 return; 1204 } 1205 1206 sk_refcnt_debug_dec(sk); 1207 } 1208 1209 static int fanout_rr_next(struct packet_fanout *f, unsigned int num) 1210 { 1211 int x = atomic_read(&f->rr_cur) + 1; 1212 1213 if (x >= num) 1214 x = 0; 1215 1216 return x; 1217 } 1218 1219 static unsigned int fanout_demux_hash(struct packet_fanout *f, 1220 struct sk_buff *skb, 1221 unsigned int num) 1222 { 
1223 return reciprocal_divide(skb->rxhash, num); 1224 } 1225 1226 static unsigned int fanout_demux_lb(struct packet_fanout *f, 1227 struct sk_buff *skb, 1228 unsigned int num) 1229 { 1230 int cur, old; 1231 1232 cur = atomic_read(&f->rr_cur); 1233 while ((old = atomic_cmpxchg(&f->rr_cur, cur, 1234 fanout_rr_next(f, num))) != cur) 1235 cur = old; 1236 return cur; 1237 } 1238 1239 static unsigned int fanout_demux_cpu(struct packet_fanout *f, 1240 struct sk_buff *skb, 1241 unsigned int num) 1242 { 1243 return smp_processor_id() % num; 1244 } 1245 1246 static unsigned int fanout_demux_rnd(struct packet_fanout *f, 1247 struct sk_buff *skb, 1248 unsigned int num) 1249 { 1250 return reciprocal_divide(prandom_u32(), num); 1251 } 1252 1253 static unsigned int fanout_demux_rollover(struct packet_fanout *f, 1254 struct sk_buff *skb, 1255 unsigned int idx, unsigned int skip, 1256 unsigned int num) 1257 { 1258 unsigned int i, j; 1259 1260 i = j = min_t(int, f->next[idx], num - 1); 1261 do { 1262 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { 1263 if (i != j) 1264 f->next[idx] = i; 1265 return i; 1266 } 1267 if (++i == num) 1268 i = 0; 1269 } while (i != j); 1270 1271 return idx; 1272 } 1273 1274 static bool fanout_has_flag(struct packet_fanout *f, u16 flag) 1275 { 1276 return f->flags & (flag >> 8); 1277 } 1278 1279 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, 1280 struct packet_type *pt, struct net_device *orig_dev) 1281 { 1282 struct packet_fanout *f = pt->af_packet_priv; 1283 unsigned int num = f->num_members; 1284 struct packet_sock *po; 1285 unsigned int idx; 1286 1287 if (!net_eq(dev_net(dev), read_pnet(&f->net)) || 1288 !num) { 1289 kfree_skb(skb); 1290 return 0; 1291 } 1292 1293 switch (f->type) { 1294 case PACKET_FANOUT_HASH: 1295 default: 1296 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { 1297 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); 1298 if (!skb) 1299 return 0; 1300 } 1301 skb_get_hash(skb); 1302 idx = 
fanout_demux_hash(f, skb, num); 1303 break; 1304 case PACKET_FANOUT_LB: 1305 idx = fanout_demux_lb(f, skb, num); 1306 break; 1307 case PACKET_FANOUT_CPU: 1308 idx = fanout_demux_cpu(f, skb, num); 1309 break; 1310 case PACKET_FANOUT_RND: 1311 idx = fanout_demux_rnd(f, skb, num); 1312 break; 1313 case PACKET_FANOUT_ROLLOVER: 1314 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); 1315 break; 1316 } 1317 1318 po = pkt_sk(f->arr[idx]); 1319 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && 1320 unlikely(!packet_rcv_has_room(po, skb))) { 1321 idx = fanout_demux_rollover(f, skb, idx, idx, num); 1322 po = pkt_sk(f->arr[idx]); 1323 } 1324 1325 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); 1326 } 1327 1328 DEFINE_MUTEX(fanout_mutex); 1329 EXPORT_SYMBOL_GPL(fanout_mutex); 1330 static LIST_HEAD(fanout_list); 1331 1332 static void __fanout_link(struct sock *sk, struct packet_sock *po) 1333 { 1334 struct packet_fanout *f = po->fanout; 1335 1336 spin_lock(&f->lock); 1337 f->arr[f->num_members] = sk; 1338 smp_wmb(); 1339 f->num_members++; 1340 spin_unlock(&f->lock); 1341 } 1342 1343 static void __fanout_unlink(struct sock *sk, struct packet_sock *po) 1344 { 1345 struct packet_fanout *f = po->fanout; 1346 int i; 1347 1348 spin_lock(&f->lock); 1349 for (i = 0; i < f->num_members; i++) { 1350 if (f->arr[i] == sk) 1351 break; 1352 } 1353 BUG_ON(i >= f->num_members); 1354 f->arr[i] = f->arr[f->num_members - 1]; 1355 f->num_members--; 1356 spin_unlock(&f->lock); 1357 } 1358 1359 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) 1360 { 1361 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout) 1362 return true; 1363 1364 return false; 1365 } 1366 1367 static int fanout_add(struct sock *sk, u16 id, u16 type_flags) 1368 { 1369 struct packet_sock *po = pkt_sk(sk); 1370 struct packet_fanout *f, *match; 1371 u8 type = type_flags & 0xff; 1372 u8 flags = type_flags >> 8; 1373 int err; 1374 1375 switch (type) { 1376 
case PACKET_FANOUT_ROLLOVER: 1377 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) 1378 return -EINVAL; 1379 case PACKET_FANOUT_HASH: 1380 case PACKET_FANOUT_LB: 1381 case PACKET_FANOUT_CPU: 1382 case PACKET_FANOUT_RND: 1383 break; 1384 default: 1385 return -EINVAL; 1386 } 1387 1388 if (!po->running) 1389 return -EINVAL; 1390 1391 if (po->fanout) 1392 return -EALREADY; 1393 1394 mutex_lock(&fanout_mutex); 1395 match = NULL; 1396 list_for_each_entry(f, &fanout_list, list) { 1397 if (f->id == id && 1398 read_pnet(&f->net) == sock_net(sk)) { 1399 match = f; 1400 break; 1401 } 1402 } 1403 err = -EINVAL; 1404 if (match && match->flags != flags) 1405 goto out; 1406 if (!match) { 1407 err = -ENOMEM; 1408 match = kzalloc(sizeof(*match), GFP_KERNEL); 1409 if (!match) 1410 goto out; 1411 write_pnet(&match->net, sock_net(sk)); 1412 match->id = id; 1413 match->type = type; 1414 match->flags = flags; 1415 atomic_set(&match->rr_cur, 0); 1416 INIT_LIST_HEAD(&match->list); 1417 spin_lock_init(&match->lock); 1418 atomic_set(&match->sk_ref, 0); 1419 match->prot_hook.type = po->prot_hook.type; 1420 match->prot_hook.dev = po->prot_hook.dev; 1421 match->prot_hook.func = packet_rcv_fanout; 1422 match->prot_hook.af_packet_priv = match; 1423 match->prot_hook.id_match = match_fanout_group; 1424 dev_add_pack(&match->prot_hook); 1425 list_add(&match->list, &fanout_list); 1426 } 1427 err = -EINVAL; 1428 if (match->type == type && 1429 match->prot_hook.type == po->prot_hook.type && 1430 match->prot_hook.dev == po->prot_hook.dev) { 1431 err = -ENOSPC; 1432 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) { 1433 __dev_remove_pack(&po->prot_hook); 1434 po->fanout = match; 1435 atomic_inc(&match->sk_ref); 1436 __fanout_link(sk, po); 1437 err = 0; 1438 } 1439 } 1440 out: 1441 mutex_unlock(&fanout_mutex); 1442 return err; 1443 } 1444 1445 static void fanout_release(struct sock *sk) 1446 { 1447 struct packet_sock *po = pkt_sk(sk); 1448 struct packet_fanout *f; 1449 1450 f = po->fanout; 1451 if (!f) 
1452 return; 1453 1454 mutex_lock(&fanout_mutex); 1455 po->fanout = NULL; 1456 1457 if (atomic_dec_and_test(&f->sk_ref)) { 1458 list_del(&f->list); 1459 dev_remove_pack(&f->prot_hook); 1460 kfree(f); 1461 } 1462 mutex_unlock(&fanout_mutex); 1463 } 1464 1465 static const struct proto_ops packet_ops; 1466 1467 static const struct proto_ops packet_ops_spkt; 1468 1469 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, 1470 struct packet_type *pt, struct net_device *orig_dev) 1471 { 1472 struct sock *sk; 1473 struct sockaddr_pkt *spkt; 1474 1475 /* 1476 * When we registered the protocol we saved the socket in the data 1477 * field for just this event. 1478 */ 1479 1480 sk = pt->af_packet_priv; 1481 1482 /* 1483 * Yank back the headers [hope the device set this 1484 * right or kerboom...] 1485 * 1486 * Incoming packets have ll header pulled, 1487 * push it back. 1488 * 1489 * For outgoing ones skb->data == skb_mac_header(skb) 1490 * so that this procedure is noop. 1491 */ 1492 1493 if (skb->pkt_type == PACKET_LOOPBACK) 1494 goto out; 1495 1496 if (!net_eq(dev_net(dev), sock_net(sk))) 1497 goto out; 1498 1499 skb = skb_share_check(skb, GFP_ATOMIC); 1500 if (skb == NULL) 1501 goto oom; 1502 1503 /* drop any routing info */ 1504 skb_dst_drop(skb); 1505 1506 /* drop conntrack reference */ 1507 nf_reset(skb); 1508 1509 spkt = &PACKET_SKB_CB(skb)->sa.pkt; 1510 1511 skb_push(skb, skb->data - skb_mac_header(skb)); 1512 1513 /* 1514 * The SOCK_PACKET socket receives _all_ frames. 1515 */ 1516 1517 spkt->spkt_family = dev->type; 1518 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); 1519 spkt->spkt_protocol = skb->protocol; 1520 1521 /* 1522 * Charge the memory to the socket. This is done specifically 1523 * to prevent sockets using all the memory up. 
1524 */ 1525 1526 if (sock_queue_rcv_skb(sk, skb) == 0) 1527 return 0; 1528 1529 out: 1530 kfree_skb(skb); 1531 oom: 1532 return 0; 1533 } 1534 1535 1536 /* 1537 * Output a raw packet to a device layer. This bypasses all the other 1538 * protocol layers and you must therefore supply it with a complete frame 1539 */ 1540 1541 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, 1542 struct msghdr *msg, size_t len) 1543 { 1544 struct sock *sk = sock->sk; 1545 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name; 1546 struct sk_buff *skb = NULL; 1547 struct net_device *dev; 1548 __be16 proto = 0; 1549 int err; 1550 int extra_len = 0; 1551 1552 /* 1553 * Get and verify the address. 1554 */ 1555 1556 if (saddr) { 1557 if (msg->msg_namelen < sizeof(struct sockaddr)) 1558 return -EINVAL; 1559 if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) 1560 proto = saddr->spkt_protocol; 1561 } else 1562 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ 1563 1564 /* 1565 * Find the device first to size check it 1566 */ 1567 1568 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; 1569 retry: 1570 rcu_read_lock(); 1571 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); 1572 err = -ENODEV; 1573 if (dev == NULL) 1574 goto out_unlock; 1575 1576 err = -ENETDOWN; 1577 if (!(dev->flags & IFF_UP)) 1578 goto out_unlock; 1579 1580 /* 1581 * You may not queue a frame bigger than the mtu. This is the lowest level 1582 * raw protocol and you must do your own fragmentation at this level. 
1583 */ 1584 1585 if (unlikely(sock_flag(sk, SOCK_NOFCS))) { 1586 if (!netif_supports_nofcs(dev)) { 1587 err = -EPROTONOSUPPORT; 1588 goto out_unlock; 1589 } 1590 extra_len = 4; /* We're doing our own CRC */ 1591 } 1592 1593 err = -EMSGSIZE; 1594 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) 1595 goto out_unlock; 1596 1597 if (!skb) { 1598 size_t reserved = LL_RESERVED_SPACE(dev); 1599 int tlen = dev->needed_tailroom; 1600 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; 1601 1602 rcu_read_unlock(); 1603 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); 1604 if (skb == NULL) 1605 return -ENOBUFS; 1606 /* FIXME: Save some space for broken drivers that write a hard 1607 * header at transmission time by themselves. PPP is the notable 1608 * one here. This should really be fixed at the driver level. 1609 */ 1610 skb_reserve(skb, reserved); 1611 skb_reset_network_header(skb); 1612 1613 /* Try to align data part correctly */ 1614 if (hhlen) { 1615 skb->data -= hhlen; 1616 skb->tail -= hhlen; 1617 if (len < hhlen) 1618 skb_reset_network_header(skb); 1619 } 1620 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); 1621 if (err) 1622 goto out_free; 1623 goto retry; 1624 } 1625 1626 if (len > (dev->mtu + dev->hard_header_len + extra_len)) { 1627 /* Earlier code assumed this would be a VLAN pkt, 1628 * double-check this now that we have the actual 1629 * packet in hand. 
1630 */ 1631 struct ethhdr *ehdr; 1632 skb_reset_mac_header(skb); 1633 ehdr = eth_hdr(skb); 1634 if (ehdr->h_proto != htons(ETH_P_8021Q)) { 1635 err = -EMSGSIZE; 1636 goto out_unlock; 1637 } 1638 } 1639 1640 skb->protocol = proto; 1641 skb->dev = dev; 1642 skb->priority = sk->sk_priority; 1643 skb->mark = sk->sk_mark; 1644 1645 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); 1646 1647 if (unlikely(extra_len == 4)) 1648 skb->no_fcs = 1; 1649 1650 skb_probe_transport_header(skb, 0); 1651 1652 dev_queue_xmit(skb); 1653 rcu_read_unlock(); 1654 return len; 1655 1656 out_unlock: 1657 rcu_read_unlock(); 1658 out_free: 1659 kfree_skb(skb); 1660 return err; 1661 } 1662 1663 static unsigned int run_filter(const struct sk_buff *skb, 1664 const struct sock *sk, 1665 unsigned int res) 1666 { 1667 struct sk_filter *filter; 1668 1669 rcu_read_lock(); 1670 filter = rcu_dereference(sk->sk_filter); 1671 if (filter != NULL) 1672 res = SK_RUN_FILTER(filter, skb); 1673 rcu_read_unlock(); 1674 1675 return res; 1676 } 1677 1678 /* 1679 * This function makes lazy skb cloning in hope that most of packets 1680 * are discarded by BPF. 1681 * 1682 * Note tricky part: we DO mangle shared skb! skb->data, skb->len 1683 * and skb->cb are mangled. It works because (and until) packets 1684 * falling here are owned by current CPU. Output packets are cloned 1685 * by dev_queue_xmit_nit(), input packets are processed by net_bh 1686 * sequencially, so that if we return skb to original state on exit, 1687 * we will not harm anyone. 
1688 */ 1689 1690 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, 1691 struct packet_type *pt, struct net_device *orig_dev) 1692 { 1693 struct sock *sk; 1694 struct sockaddr_ll *sll; 1695 struct packet_sock *po; 1696 u8 *skb_head = skb->data; 1697 int skb_len = skb->len; 1698 unsigned int snaplen, res; 1699 1700 if (skb->pkt_type == PACKET_LOOPBACK) 1701 goto drop; 1702 1703 sk = pt->af_packet_priv; 1704 po = pkt_sk(sk); 1705 1706 if (!net_eq(dev_net(dev), sock_net(sk))) 1707 goto drop; 1708 1709 skb->dev = dev; 1710 1711 if (dev->header_ops) { 1712 /* The device has an explicit notion of ll header, 1713 * exported to higher levels. 1714 * 1715 * Otherwise, the device hides details of its frame 1716 * structure, so that corresponding packet head is 1717 * never delivered to user. 1718 */ 1719 if (sk->sk_type != SOCK_DGRAM) 1720 skb_push(skb, skb->data - skb_mac_header(skb)); 1721 else if (skb->pkt_type == PACKET_OUTGOING) { 1722 /* Special case: outgoing packets have ll header at head */ 1723 skb_pull(skb, skb_network_offset(skb)); 1724 } 1725 } 1726 1727 snaplen = skb->len; 1728 1729 res = run_filter(skb, sk, snaplen); 1730 if (!res) 1731 goto drop_n_restore; 1732 if (snaplen > res) 1733 snaplen = res; 1734 1735 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 1736 goto drop_n_acct; 1737 1738 if (skb_shared(skb)) { 1739 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); 1740 if (nskb == NULL) 1741 goto drop_n_acct; 1742 1743 if (skb_head != skb->data) { 1744 skb->data = skb_head; 1745 skb->len = skb_len; 1746 } 1747 consume_skb(skb); 1748 skb = nskb; 1749 } 1750 1751 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 > 1752 sizeof(skb->cb)); 1753 1754 sll = &PACKET_SKB_CB(skb)->sa.ll; 1755 sll->sll_family = AF_PACKET; 1756 sll->sll_hatype = dev->type; 1757 sll->sll_protocol = skb->protocol; 1758 sll->sll_pkttype = skb->pkt_type; 1759 if (unlikely(po->origdev)) 1760 sll->sll_ifindex = orig_dev->ifindex; 1761 else 1762 
sll->sll_ifindex = dev->ifindex; 1763 1764 sll->sll_halen = dev_parse_header(skb, sll->sll_addr); 1765 1766 PACKET_SKB_CB(skb)->origlen = skb->len; 1767 1768 if (pskb_trim(skb, snaplen)) 1769 goto drop_n_acct; 1770 1771 skb_set_owner_r(skb, sk); 1772 skb->dev = NULL; 1773 skb_dst_drop(skb); 1774 1775 /* drop conntrack reference */ 1776 nf_reset(skb); 1777 1778 spin_lock(&sk->sk_receive_queue.lock); 1779 po->stats.stats1.tp_packets++; 1780 skb->dropcount = atomic_read(&sk->sk_drops); 1781 __skb_queue_tail(&sk->sk_receive_queue, skb); 1782 spin_unlock(&sk->sk_receive_queue.lock); 1783 sk->sk_data_ready(sk, skb->len); 1784 return 0; 1785 1786 drop_n_acct: 1787 spin_lock(&sk->sk_receive_queue.lock); 1788 po->stats.stats1.tp_drops++; 1789 atomic_inc(&sk->sk_drops); 1790 spin_unlock(&sk->sk_receive_queue.lock); 1791 1792 drop_n_restore: 1793 if (skb_head != skb->data && skb_shared(skb)) { 1794 skb->data = skb_head; 1795 skb->len = skb_len; 1796 } 1797 drop: 1798 consume_skb(skb); 1799 return 0; 1800 } 1801 1802 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, 1803 struct packet_type *pt, struct net_device *orig_dev) 1804 { 1805 struct sock *sk; 1806 struct packet_sock *po; 1807 struct sockaddr_ll *sll; 1808 union tpacket_uhdr h; 1809 u8 *skb_head = skb->data; 1810 int skb_len = skb->len; 1811 unsigned int snaplen, res; 1812 unsigned long status = TP_STATUS_USER; 1813 unsigned short macoff, netoff, hdrlen; 1814 struct sk_buff *copy_skb = NULL; 1815 struct timespec ts; 1816 __u32 ts_status; 1817 1818 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. 1819 * We may add members to them until current aligned size without forcing 1820 * userspace to call getsockopt(..., PACKET_HDRLEN, ...). 
1821 */ 1822 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); 1823 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); 1824 1825 if (skb->pkt_type == PACKET_LOOPBACK) 1826 goto drop; 1827 1828 sk = pt->af_packet_priv; 1829 po = pkt_sk(sk); 1830 1831 if (!net_eq(dev_net(dev), sock_net(sk))) 1832 goto drop; 1833 1834 if (dev->header_ops) { 1835 if (sk->sk_type != SOCK_DGRAM) 1836 skb_push(skb, skb->data - skb_mac_header(skb)); 1837 else if (skb->pkt_type == PACKET_OUTGOING) { 1838 /* Special case: outgoing packets have ll header at head */ 1839 skb_pull(skb, skb_network_offset(skb)); 1840 } 1841 } 1842 1843 if (skb->ip_summed == CHECKSUM_PARTIAL) 1844 status |= TP_STATUS_CSUMNOTREADY; 1845 1846 snaplen = skb->len; 1847 1848 res = run_filter(skb, sk, snaplen); 1849 if (!res) 1850 goto drop_n_restore; 1851 if (snaplen > res) 1852 snaplen = res; 1853 1854 if (sk->sk_type == SOCK_DGRAM) { 1855 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + 1856 po->tp_reserve; 1857 } else { 1858 unsigned int maclen = skb_network_offset(skb); 1859 netoff = TPACKET_ALIGN(po->tp_hdrlen + 1860 (maclen < 16 ? 
16 : maclen)) + 1861 po->tp_reserve; 1862 macoff = netoff - maclen; 1863 } 1864 if (po->tp_version <= TPACKET_V2) { 1865 if (macoff + snaplen > po->rx_ring.frame_size) { 1866 if (po->copy_thresh && 1867 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { 1868 if (skb_shared(skb)) { 1869 copy_skb = skb_clone(skb, GFP_ATOMIC); 1870 } else { 1871 copy_skb = skb_get(skb); 1872 skb_head = skb->data; 1873 } 1874 if (copy_skb) 1875 skb_set_owner_r(copy_skb, sk); 1876 } 1877 snaplen = po->rx_ring.frame_size - macoff; 1878 if ((int)snaplen < 0) 1879 snaplen = 0; 1880 } 1881 } 1882 spin_lock(&sk->sk_receive_queue.lock); 1883 h.raw = packet_current_rx_frame(po, skb, 1884 TP_STATUS_KERNEL, (macoff+snaplen)); 1885 if (!h.raw) 1886 goto ring_is_full; 1887 if (po->tp_version <= TPACKET_V2) { 1888 packet_increment_rx_head(po, &po->rx_ring); 1889 /* 1890 * LOSING will be reported till you read the stats, 1891 * because it's COR - Clear On Read. 1892 * Anyways, moving it for V1/V2 only as V3 doesn't need this 1893 * at packet level. 
1894 */ 1895 if (po->stats.stats1.tp_drops) 1896 status |= TP_STATUS_LOSING; 1897 } 1898 po->stats.stats1.tp_packets++; 1899 if (copy_skb) { 1900 status |= TP_STATUS_COPY; 1901 __skb_queue_tail(&sk->sk_receive_queue, copy_skb); 1902 } 1903 spin_unlock(&sk->sk_receive_queue.lock); 1904 1905 skb_copy_bits(skb, 0, h.raw + macoff, snaplen); 1906 1907 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) 1908 getnstimeofday(&ts); 1909 1910 status |= ts_status; 1911 1912 switch (po->tp_version) { 1913 case TPACKET_V1: 1914 h.h1->tp_len = skb->len; 1915 h.h1->tp_snaplen = snaplen; 1916 h.h1->tp_mac = macoff; 1917 h.h1->tp_net = netoff; 1918 h.h1->tp_sec = ts.tv_sec; 1919 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; 1920 hdrlen = sizeof(*h.h1); 1921 break; 1922 case TPACKET_V2: 1923 h.h2->tp_len = skb->len; 1924 h.h2->tp_snaplen = snaplen; 1925 h.h2->tp_mac = macoff; 1926 h.h2->tp_net = netoff; 1927 h.h2->tp_sec = ts.tv_sec; 1928 h.h2->tp_nsec = ts.tv_nsec; 1929 if (vlan_tx_tag_present(skb)) { 1930 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); 1931 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); 1932 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; 1933 } else { 1934 h.h2->tp_vlan_tci = 0; 1935 h.h2->tp_vlan_tpid = 0; 1936 } 1937 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); 1938 hdrlen = sizeof(*h.h2); 1939 break; 1940 case TPACKET_V3: 1941 /* tp_nxt_offset,vlan are already populated above. 
1942 * So DONT clear those fields here 1943 */ 1944 h.h3->tp_status |= status; 1945 h.h3->tp_len = skb->len; 1946 h.h3->tp_snaplen = snaplen; 1947 h.h3->tp_mac = macoff; 1948 h.h3->tp_net = netoff; 1949 h.h3->tp_sec = ts.tv_sec; 1950 h.h3->tp_nsec = ts.tv_nsec; 1951 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); 1952 hdrlen = sizeof(*h.h3); 1953 break; 1954 default: 1955 BUG(); 1956 } 1957 1958 sll = h.raw + TPACKET_ALIGN(hdrlen); 1959 sll->sll_halen = dev_parse_header(skb, sll->sll_addr); 1960 sll->sll_family = AF_PACKET; 1961 sll->sll_hatype = dev->type; 1962 sll->sll_protocol = skb->protocol; 1963 sll->sll_pkttype = skb->pkt_type; 1964 if (unlikely(po->origdev)) 1965 sll->sll_ifindex = orig_dev->ifindex; 1966 else 1967 sll->sll_ifindex = dev->ifindex; 1968 1969 smp_mb(); 1970 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 1971 { 1972 u8 *start, *end; 1973 1974 if (po->tp_version <= TPACKET_V2) { 1975 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw 1976 + macoff + snaplen); 1977 for (start = h.raw; start < end; start += PAGE_SIZE) 1978 flush_dcache_page(pgv_to_page(start)); 1979 } 1980 smp_wmb(); 1981 } 1982 #endif 1983 if (po->tp_version <= TPACKET_V2) 1984 __packet_set_status(po, h.raw, status); 1985 else 1986 prb_clear_blk_fill_status(&po->rx_ring); 1987 1988 sk->sk_data_ready(sk, 0); 1989 1990 drop_n_restore: 1991 if (skb_head != skb->data && skb_shared(skb)) { 1992 skb->data = skb_head; 1993 skb->len = skb_len; 1994 } 1995 drop: 1996 kfree_skb(skb); 1997 return 0; 1998 1999 ring_is_full: 2000 po->stats.stats1.tp_drops++; 2001 spin_unlock(&sk->sk_receive_queue.lock); 2002 2003 sk->sk_data_ready(sk, 0); 2004 kfree_skb(copy_skb); 2005 goto drop_n_restore; 2006 } 2007 2008 static void tpacket_destruct_skb(struct sk_buff *skb) 2009 { 2010 struct packet_sock *po = pkt_sk(skb->sk); 2011 void *ph; 2012 2013 if (likely(po->tx_ring.pg_vec)) { 2014 __u32 ts; 2015 2016 ph = skb_shinfo(skb)->destructor_arg; 2017 BUG_ON(atomic_read(&po->tx_ring.pending) == 0); 2018 
atomic_dec(&po->tx_ring.pending); 2019 2020 ts = __packet_set_timestamp(po, ph, skb); 2021 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); 2022 } 2023 2024 sock_wfree(skb); 2025 } 2026 2027 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, 2028 void *frame, struct net_device *dev, int size_max, 2029 __be16 proto, unsigned char *addr, int hlen) 2030 { 2031 union tpacket_uhdr ph; 2032 int to_write, offset, len, tp_len, nr_frags, len_max; 2033 struct socket *sock = po->sk.sk_socket; 2034 struct page *page; 2035 void *data; 2036 int err; 2037 2038 ph.raw = frame; 2039 2040 skb->protocol = proto; 2041 skb->dev = dev; 2042 skb->priority = po->sk.sk_priority; 2043 skb->mark = po->sk.sk_mark; 2044 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); 2045 skb_shinfo(skb)->destructor_arg = ph.raw; 2046 2047 switch (po->tp_version) { 2048 case TPACKET_V2: 2049 tp_len = ph.h2->tp_len; 2050 break; 2051 default: 2052 tp_len = ph.h1->tp_len; 2053 break; 2054 } 2055 if (unlikely(tp_len > size_max)) { 2056 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); 2057 return -EMSGSIZE; 2058 } 2059 2060 skb_reserve(skb, hlen); 2061 skb_reset_network_header(skb); 2062 2063 if (!packet_use_direct_xmit(po)) 2064 skb_probe_transport_header(skb, 0); 2065 if (unlikely(po->tp_tx_has_off)) { 2066 int off_min, off_max, off; 2067 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); 2068 off_max = po->tx_ring.frame_size - tp_len; 2069 if (sock->type == SOCK_DGRAM) { 2070 switch (po->tp_version) { 2071 case TPACKET_V2: 2072 off = ph.h2->tp_net; 2073 break; 2074 default: 2075 off = ph.h1->tp_net; 2076 break; 2077 } 2078 } else { 2079 switch (po->tp_version) { 2080 case TPACKET_V2: 2081 off = ph.h2->tp_mac; 2082 break; 2083 default: 2084 off = ph.h1->tp_mac; 2085 break; 2086 } 2087 } 2088 if (unlikely((off < off_min) || (off_max < off))) 2089 return -EINVAL; 2090 data = ph.raw + off; 2091 } else { 2092 data = ph.raw + po->tp_hdrlen - sizeof(struct 
sockaddr_ll); 2093 } 2094 to_write = tp_len; 2095 2096 if (sock->type == SOCK_DGRAM) { 2097 err = dev_hard_header(skb, dev, ntohs(proto), addr, 2098 NULL, tp_len); 2099 if (unlikely(err < 0)) 2100 return -EINVAL; 2101 } else if (dev->hard_header_len) { 2102 /* net device doesn't like empty head */ 2103 if (unlikely(tp_len <= dev->hard_header_len)) { 2104 pr_err("packet size is too short (%d < %d)\n", 2105 tp_len, dev->hard_header_len); 2106 return -EINVAL; 2107 } 2108 2109 skb_push(skb, dev->hard_header_len); 2110 err = skb_store_bits(skb, 0, data, 2111 dev->hard_header_len); 2112 if (unlikely(err)) 2113 return err; 2114 2115 data += dev->hard_header_len; 2116 to_write -= dev->hard_header_len; 2117 } 2118 2119 offset = offset_in_page(data); 2120 len_max = PAGE_SIZE - offset; 2121 len = ((to_write > len_max) ? len_max : to_write); 2122 2123 skb->data_len = to_write; 2124 skb->len += to_write; 2125 skb->truesize += to_write; 2126 atomic_add(to_write, &po->sk.sk_wmem_alloc); 2127 2128 while (likely(to_write)) { 2129 nr_frags = skb_shinfo(skb)->nr_frags; 2130 2131 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { 2132 pr_err("Packet exceed the number of skb frags(%lu)\n", 2133 MAX_SKB_FRAGS); 2134 return -EFAULT; 2135 } 2136 2137 page = pgv_to_page(data); 2138 data += len; 2139 flush_dcache_page(page); 2140 get_page(page); 2141 skb_fill_page_desc(skb, nr_frags, page, offset, len); 2142 to_write -= len; 2143 offset = 0; 2144 len_max = PAGE_SIZE; 2145 len = ((to_write > len_max) ? 
len_max : to_write); 2146 } 2147 2148 return tp_len; 2149 } 2150 2151 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) 2152 { 2153 struct sk_buff *skb; 2154 struct net_device *dev; 2155 __be16 proto; 2156 int err, reserve = 0; 2157 void *ph; 2158 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; 2159 int tp_len, size_max; 2160 unsigned char *addr; 2161 int len_sum = 0; 2162 int status = TP_STATUS_AVAILABLE; 2163 int hlen, tlen; 2164 2165 mutex_lock(&po->pg_vec_lock); 2166 2167 if (likely(saddr == NULL)) { 2168 dev = packet_cached_dev_get(po); 2169 proto = po->num; 2170 addr = NULL; 2171 } else { 2172 err = -EINVAL; 2173 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) 2174 goto out; 2175 if (msg->msg_namelen < (saddr->sll_halen 2176 + offsetof(struct sockaddr_ll, 2177 sll_addr))) 2178 goto out; 2179 proto = saddr->sll_protocol; 2180 addr = saddr->sll_addr; 2181 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); 2182 } 2183 2184 err = -ENXIO; 2185 if (unlikely(dev == NULL)) 2186 goto out; 2187 err = -ENETDOWN; 2188 if (unlikely(!(dev->flags & IFF_UP))) 2189 goto out_put; 2190 2191 reserve = dev->hard_header_len; 2192 2193 size_max = po->tx_ring.frame_size 2194 - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); 2195 2196 if (size_max > dev->mtu + reserve) 2197 size_max = dev->mtu + reserve; 2198 2199 do { 2200 ph = packet_current_frame(po, &po->tx_ring, 2201 TP_STATUS_SEND_REQUEST); 2202 2203 if (unlikely(ph == NULL)) { 2204 schedule(); 2205 continue; 2206 } 2207 2208 status = TP_STATUS_SEND_REQUEST; 2209 hlen = LL_RESERVED_SPACE(dev); 2210 tlen = dev->needed_tailroom; 2211 skb = sock_alloc_send_skb(&po->sk, 2212 hlen + tlen + sizeof(struct sockaddr_ll), 2213 0, &err); 2214 2215 if (unlikely(skb == NULL)) 2216 goto out_status; 2217 2218 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, 2219 addr, hlen); 2220 2221 if (unlikely(tp_len < 0)) { 2222 if (po->tp_loss) { 2223 __packet_set_status(po, ph, 2224 
TP_STATUS_AVAILABLE); 2225 packet_increment_head(&po->tx_ring); 2226 kfree_skb(skb); 2227 continue; 2228 } else { 2229 status = TP_STATUS_WRONG_FORMAT; 2230 err = tp_len; 2231 goto out_status; 2232 } 2233 } 2234 2235 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); 2236 skb->destructor = tpacket_destruct_skb; 2237 __packet_set_status(po, ph, TP_STATUS_SENDING); 2238 atomic_inc(&po->tx_ring.pending); 2239 2240 status = TP_STATUS_SEND_REQUEST; 2241 err = po->xmit(skb); 2242 if (unlikely(err > 0)) { 2243 err = net_xmit_errno(err); 2244 if (err && __packet_get_status(po, ph) == 2245 TP_STATUS_AVAILABLE) { 2246 /* skb was destructed already */ 2247 skb = NULL; 2248 goto out_status; 2249 } 2250 /* 2251 * skb was dropped but not destructed yet; 2252 * let's treat it like congestion or err < 0 2253 */ 2254 err = 0; 2255 } 2256 packet_increment_head(&po->tx_ring); 2257 len_sum += tp_len; 2258 } while (likely((ph != NULL) || 2259 ((!(msg->msg_flags & MSG_DONTWAIT)) && 2260 (atomic_read(&po->tx_ring.pending)))) 2261 ); 2262 2263 err = len_sum; 2264 goto out_put; 2265 2266 out_status: 2267 __packet_set_status(po, ph, status); 2268 kfree_skb(skb); 2269 out_put: 2270 dev_put(dev); 2271 out: 2272 mutex_unlock(&po->pg_vec_lock); 2273 return err; 2274 } 2275 2276 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, 2277 size_t reserve, size_t len, 2278 size_t linear, int noblock, 2279 int *err) 2280 { 2281 struct sk_buff *skb; 2282 2283 /* Under a page? Don't bother with paged skb. 
*/ 2284 if (prepad + len < PAGE_SIZE || !linear) 2285 linear = len; 2286 2287 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, 2288 err, 0); 2289 if (!skb) 2290 return NULL; 2291 2292 skb_reserve(skb, reserve); 2293 skb_put(skb, linear); 2294 skb->data_len = len - linear; 2295 skb->len += len - linear; 2296 2297 return skb; 2298 } 2299 2300 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) 2301 { 2302 struct sock *sk = sock->sk; 2303 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; 2304 struct sk_buff *skb; 2305 struct net_device *dev; 2306 __be16 proto; 2307 unsigned char *addr; 2308 int err, reserve = 0; 2309 struct virtio_net_hdr vnet_hdr = { 0 }; 2310 int offset = 0; 2311 int vnet_hdr_len; 2312 struct packet_sock *po = pkt_sk(sk); 2313 unsigned short gso_type = 0; 2314 int hlen, tlen; 2315 int extra_len = 0; 2316 2317 /* 2318 * Get and verify the address. 2319 */ 2320 2321 if (likely(saddr == NULL)) { 2322 dev = packet_cached_dev_get(po); 2323 proto = po->num; 2324 addr = NULL; 2325 } else { 2326 err = -EINVAL; 2327 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) 2328 goto out; 2329 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) 2330 goto out; 2331 proto = saddr->sll_protocol; 2332 addr = saddr->sll_addr; 2333 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); 2334 } 2335 2336 err = -ENXIO; 2337 if (unlikely(dev == NULL)) 2338 goto out_unlock; 2339 err = -ENETDOWN; 2340 if (unlikely(!(dev->flags & IFF_UP))) 2341 goto out_unlock; 2342 2343 if (sock->type == SOCK_RAW) 2344 reserve = dev->hard_header_len; 2345 if (po->has_vnet_hdr) { 2346 vnet_hdr_len = sizeof(vnet_hdr); 2347 2348 err = -EINVAL; 2349 if (len < vnet_hdr_len) 2350 goto out_unlock; 2351 2352 len -= vnet_hdr_len; 2353 2354 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov, 2355 vnet_hdr_len); 2356 if (err < 0) 2357 goto out_unlock; 2358 2359 if ((vnet_hdr.flags & 
VIRTIO_NET_HDR_F_NEEDS_CSUM) && 2360 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > 2361 vnet_hdr.hdr_len)) 2362 vnet_hdr.hdr_len = vnet_hdr.csum_start + 2363 vnet_hdr.csum_offset + 2; 2364 2365 err = -EINVAL; 2366 if (vnet_hdr.hdr_len > len) 2367 goto out_unlock; 2368 2369 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { 2370 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { 2371 case VIRTIO_NET_HDR_GSO_TCPV4: 2372 gso_type = SKB_GSO_TCPV4; 2373 break; 2374 case VIRTIO_NET_HDR_GSO_TCPV6: 2375 gso_type = SKB_GSO_TCPV6; 2376 break; 2377 case VIRTIO_NET_HDR_GSO_UDP: 2378 gso_type = SKB_GSO_UDP; 2379 break; 2380 default: 2381 goto out_unlock; 2382 } 2383 2384 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) 2385 gso_type |= SKB_GSO_TCP_ECN; 2386 2387 if (vnet_hdr.gso_size == 0) 2388 goto out_unlock; 2389 2390 } 2391 } 2392 2393 if (unlikely(sock_flag(sk, SOCK_NOFCS))) { 2394 if (!netif_supports_nofcs(dev)) { 2395 err = -EPROTONOSUPPORT; 2396 goto out_unlock; 2397 } 2398 extra_len = 4; /* We're doing our own CRC */ 2399 } 2400 2401 err = -EMSGSIZE; 2402 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) 2403 goto out_unlock; 2404 2405 err = -ENOBUFS; 2406 hlen = LL_RESERVED_SPACE(dev); 2407 tlen = dev->needed_tailroom; 2408 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len, 2409 msg->msg_flags & MSG_DONTWAIT, &err); 2410 if (skb == NULL) 2411 goto out_unlock; 2412 2413 skb_set_network_header(skb, reserve); 2414 2415 err = -EINVAL; 2416 if (sock->type == SOCK_DGRAM && 2417 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) 2418 goto out_free; 2419 2420 /* Returns -EFAULT on error */ 2421 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); 2422 if (err) 2423 goto out_free; 2424 2425 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); 2426 2427 if (!gso_type && (len > dev->mtu + reserve + extra_len)) { 2428 /* Earlier code assumed this would be a VLAN pkt, 2429 * double-check this now 
that we have the actual 2430 * packet in hand. 2431 */ 2432 struct ethhdr *ehdr; 2433 skb_reset_mac_header(skb); 2434 ehdr = eth_hdr(skb); 2435 if (ehdr->h_proto != htons(ETH_P_8021Q)) { 2436 err = -EMSGSIZE; 2437 goto out_free; 2438 } 2439 } 2440 2441 skb->protocol = proto; 2442 skb->dev = dev; 2443 skb->priority = sk->sk_priority; 2444 skb->mark = sk->sk_mark; 2445 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); 2446 2447 if (po->has_vnet_hdr) { 2448 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { 2449 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start, 2450 vnet_hdr.csum_offset)) { 2451 err = -EINVAL; 2452 goto out_free; 2453 } 2454 } 2455 2456 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size; 2457 skb_shinfo(skb)->gso_type = gso_type; 2458 2459 /* Header must be checked, and gso_segs computed. */ 2460 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 2461 skb_shinfo(skb)->gso_segs = 0; 2462 2463 len += vnet_hdr_len; 2464 } 2465 2466 if (!packet_use_direct_xmit(po)) 2467 skb_probe_transport_header(skb, reserve); 2468 if (unlikely(extra_len == 4)) 2469 skb->no_fcs = 1; 2470 2471 err = po->xmit(skb); 2472 if (err > 0 && (err = net_xmit_errno(err)) != 0) 2473 goto out_unlock; 2474 2475 dev_put(dev); 2476 2477 return len; 2478 2479 out_free: 2480 kfree_skb(skb); 2481 out_unlock: 2482 if (dev) 2483 dev_put(dev); 2484 out: 2485 return err; 2486 } 2487 2488 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, 2489 struct msghdr *msg, size_t len) 2490 { 2491 struct sock *sk = sock->sk; 2492 struct packet_sock *po = pkt_sk(sk); 2493 2494 if (po->tx_ring.pg_vec) 2495 return tpacket_snd(po, msg); 2496 else 2497 return packet_snd(sock, msg, len); 2498 } 2499 2500 /* 2501 * Close a PACKET socket. This is fairly simple. We immediately go 2502 * to 'closed' state and remove our protocol entry in the device list. 
2503 */ 2504 2505 static int packet_release(struct socket *sock) 2506 { 2507 struct sock *sk = sock->sk; 2508 struct packet_sock *po; 2509 struct net *net; 2510 union tpacket_req_u req_u; 2511 2512 if (!sk) 2513 return 0; 2514 2515 net = sock_net(sk); 2516 po = pkt_sk(sk); 2517 2518 mutex_lock(&net->packet.sklist_lock); 2519 sk_del_node_init_rcu(sk); 2520 mutex_unlock(&net->packet.sklist_lock); 2521 2522 preempt_disable(); 2523 sock_prot_inuse_add(net, sk->sk_prot, -1); 2524 preempt_enable(); 2525 2526 spin_lock(&po->bind_lock); 2527 unregister_prot_hook(sk, false); 2528 packet_cached_dev_reset(po); 2529 2530 if (po->prot_hook.dev) { 2531 dev_put(po->prot_hook.dev); 2532 po->prot_hook.dev = NULL; 2533 } 2534 spin_unlock(&po->bind_lock); 2535 2536 packet_flush_mclist(sk); 2537 2538 if (po->rx_ring.pg_vec) { 2539 memset(&req_u, 0, sizeof(req_u)); 2540 packet_set_ring(sk, &req_u, 1, 0); 2541 } 2542 2543 if (po->tx_ring.pg_vec) { 2544 memset(&req_u, 0, sizeof(req_u)); 2545 packet_set_ring(sk, &req_u, 1, 1); 2546 } 2547 2548 fanout_release(sk); 2549 2550 synchronize_net(); 2551 /* 2552 * Now the socket is dead. No more input will appear. 2553 */ 2554 sock_orphan(sk); 2555 sock->sk = NULL; 2556 2557 /* Purge queues */ 2558 2559 skb_queue_purge(&sk->sk_receive_queue); 2560 sk_refcnt_debug_release(sk); 2561 2562 sock_put(sk); 2563 return 0; 2564 } 2565 2566 /* 2567 * Attach a packet hook. 2568 */ 2569 2570 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) 2571 { 2572 struct packet_sock *po = pkt_sk(sk); 2573 2574 if (po->fanout) { 2575 if (dev) 2576 dev_put(dev); 2577 2578 return -EINVAL; 2579 } 2580 2581 lock_sock(sk); 2582 2583 spin_lock(&po->bind_lock); 2584 unregister_prot_hook(sk, true); 2585 2586 po->num = protocol; 2587 po->prot_hook.type = protocol; 2588 if (po->prot_hook.dev) 2589 dev_put(po->prot_hook.dev); 2590 2591 po->prot_hook.dev = dev; 2592 po->ifindex = dev ? 
dev->ifindex : 0; 2593 2594 packet_cached_dev_assign(po, dev); 2595 2596 if (protocol == 0) 2597 goto out_unlock; 2598 2599 if (!dev || (dev->flags & IFF_UP)) { 2600 register_prot_hook(sk); 2601 } else { 2602 sk->sk_err = ENETDOWN; 2603 if (!sock_flag(sk, SOCK_DEAD)) 2604 sk->sk_error_report(sk); 2605 } 2606 2607 out_unlock: 2608 spin_unlock(&po->bind_lock); 2609 release_sock(sk); 2610 return 0; 2611 } 2612 2613 /* 2614 * Bind a packet socket to a device 2615 */ 2616 2617 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, 2618 int addr_len) 2619 { 2620 struct sock *sk = sock->sk; 2621 char name[15]; 2622 struct net_device *dev; 2623 int err = -ENODEV; 2624 2625 /* 2626 * Check legality 2627 */ 2628 2629 if (addr_len != sizeof(struct sockaddr)) 2630 return -EINVAL; 2631 strlcpy(name, uaddr->sa_data, sizeof(name)); 2632 2633 dev = dev_get_by_name(sock_net(sk), name); 2634 if (dev) 2635 err = packet_do_bind(sk, dev, pkt_sk(sk)->num); 2636 return err; 2637 } 2638 2639 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 2640 { 2641 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; 2642 struct sock *sk = sock->sk; 2643 struct net_device *dev = NULL; 2644 int err; 2645 2646 2647 /* 2648 * Check legality 2649 */ 2650 2651 if (addr_len < sizeof(struct sockaddr_ll)) 2652 return -EINVAL; 2653 if (sll->sll_family != AF_PACKET) 2654 return -EINVAL; 2655 2656 if (sll->sll_ifindex) { 2657 err = -ENODEV; 2658 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex); 2659 if (dev == NULL) 2660 goto out; 2661 } 2662 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); 2663 2664 out: 2665 return err; 2666 } 2667 2668 static struct proto packet_proto = { 2669 .name = "PACKET", 2670 .owner = THIS_MODULE, 2671 .obj_size = sizeof(struct packet_sock), 2672 }; 2673 2674 /* 2675 * Create a packet of type SOCK_PACKET. 
2676 */ 2677 2678 static int packet_create(struct net *net, struct socket *sock, int protocol, 2679 int kern) 2680 { 2681 struct sock *sk; 2682 struct packet_sock *po; 2683 __be16 proto = (__force __be16)protocol; /* weird, but documented */ 2684 int err; 2685 2686 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 2687 return -EPERM; 2688 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && 2689 sock->type != SOCK_PACKET) 2690 return -ESOCKTNOSUPPORT; 2691 2692 sock->state = SS_UNCONNECTED; 2693 2694 err = -ENOBUFS; 2695 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); 2696 if (sk == NULL) 2697 goto out; 2698 2699 sock->ops = &packet_ops; 2700 if (sock->type == SOCK_PACKET) 2701 sock->ops = &packet_ops_spkt; 2702 2703 sock_init_data(sock, sk); 2704 2705 po = pkt_sk(sk); 2706 sk->sk_family = PF_PACKET; 2707 po->num = proto; 2708 po->xmit = dev_queue_xmit; 2709 2710 packet_cached_dev_reset(po); 2711 2712 sk->sk_destruct = packet_sock_destruct; 2713 sk_refcnt_debug_inc(sk); 2714 2715 /* 2716 * Attach a protocol block 2717 */ 2718 2719 spin_lock_init(&po->bind_lock); 2720 mutex_init(&po->pg_vec_lock); 2721 po->prot_hook.func = packet_rcv; 2722 2723 if (sock->type == SOCK_PACKET) 2724 po->prot_hook.func = packet_rcv_spkt; 2725 2726 po->prot_hook.af_packet_priv = sk; 2727 2728 if (proto) { 2729 po->prot_hook.type = proto; 2730 register_prot_hook(sk); 2731 } 2732 2733 mutex_lock(&net->packet.sklist_lock); 2734 sk_add_node_rcu(sk, &net->packet.sklist); 2735 mutex_unlock(&net->packet.sklist_lock); 2736 2737 preempt_disable(); 2738 sock_prot_inuse_add(net, &packet_proto, 1); 2739 preempt_enable(); 2740 2741 return 0; 2742 out: 2743 return err; 2744 } 2745 2746 /* 2747 * Pull a packet from our receive queue and hand it to the user. 2748 * If necessary we block. 
2749 */ 2750 2751 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, 2752 struct msghdr *msg, size_t len, int flags) 2753 { 2754 struct sock *sk = sock->sk; 2755 struct sk_buff *skb; 2756 int copied, err; 2757 int vnet_hdr_len = 0; 2758 2759 err = -EINVAL; 2760 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) 2761 goto out; 2762 2763 #if 0 2764 /* What error should we return now? EUNATTACH? */ 2765 if (pkt_sk(sk)->ifindex < 0) 2766 return -ENODEV; 2767 #endif 2768 2769 if (flags & MSG_ERRQUEUE) { 2770 err = sock_recv_errqueue(sk, msg, len, 2771 SOL_PACKET, PACKET_TX_TIMESTAMP); 2772 goto out; 2773 } 2774 2775 /* 2776 * Call the generic datagram receiver. This handles all sorts 2777 * of horrible races and re-entrancy so we can forget about it 2778 * in the protocol layers. 2779 * 2780 * Now it will return ENETDOWN, if device have just gone down, 2781 * but then it will block. 2782 */ 2783 2784 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); 2785 2786 /* 2787 * An error occurred so return it. Because skb_recv_datagram() 2788 * handles the blocking we don't see and worry about blocking 2789 * retries. 2790 */ 2791 2792 if (skb == NULL) 2793 goto out; 2794 2795 if (pkt_sk(sk)->has_vnet_hdr) { 2796 struct virtio_net_hdr vnet_hdr = { 0 }; 2797 2798 err = -EINVAL; 2799 vnet_hdr_len = sizeof(vnet_hdr); 2800 if (len < vnet_hdr_len) 2801 goto out_free; 2802 2803 len -= vnet_hdr_len; 2804 2805 if (skb_is_gso(skb)) { 2806 struct skb_shared_info *sinfo = skb_shinfo(skb); 2807 2808 /* This is a hint as to how much should be linear. 
*/ 2809 vnet_hdr.hdr_len = skb_headlen(skb); 2810 vnet_hdr.gso_size = sinfo->gso_size; 2811 if (sinfo->gso_type & SKB_GSO_TCPV4) 2812 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; 2813 else if (sinfo->gso_type & SKB_GSO_TCPV6) 2814 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; 2815 else if (sinfo->gso_type & SKB_GSO_UDP) 2816 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; 2817 else if (sinfo->gso_type & SKB_GSO_FCOE) 2818 goto out_free; 2819 else 2820 BUG(); 2821 if (sinfo->gso_type & SKB_GSO_TCP_ECN) 2822 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; 2823 } else 2824 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; 2825 2826 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2827 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; 2828 vnet_hdr.csum_start = skb_checksum_start_offset(skb); 2829 vnet_hdr.csum_offset = skb->csum_offset; 2830 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { 2831 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; 2832 } /* else everything is zero */ 2833 2834 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr, 2835 vnet_hdr_len); 2836 if (err < 0) 2837 goto out_free; 2838 } 2839 2840 /* You lose any data beyond the buffer you gave. If it worries 2841 * a user program they can ask the device for its MTU 2842 * anyway. 2843 */ 2844 copied = skb->len; 2845 if (copied > len) { 2846 copied = len; 2847 msg->msg_flags |= MSG_TRUNC; 2848 } 2849 2850 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 2851 if (err) 2852 goto out_free; 2853 2854 sock_recv_ts_and_drops(msg, sk, skb); 2855 2856 if (msg->msg_name) { 2857 /* If the address length field is there to be filled 2858 * in, we fill it in now. 
2859 */ 2860 if (sock->type == SOCK_PACKET) { 2861 msg->msg_namelen = sizeof(struct sockaddr_pkt); 2862 } else { 2863 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; 2864 msg->msg_namelen = sll->sll_halen + 2865 offsetof(struct sockaddr_ll, sll_addr); 2866 } 2867 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, 2868 msg->msg_namelen); 2869 } 2870 2871 if (pkt_sk(sk)->auxdata) { 2872 struct tpacket_auxdata aux; 2873 2874 aux.tp_status = TP_STATUS_USER; 2875 if (skb->ip_summed == CHECKSUM_PARTIAL) 2876 aux.tp_status |= TP_STATUS_CSUMNOTREADY; 2877 aux.tp_len = PACKET_SKB_CB(skb)->origlen; 2878 aux.tp_snaplen = skb->len; 2879 aux.tp_mac = 0; 2880 aux.tp_net = skb_network_offset(skb); 2881 if (vlan_tx_tag_present(skb)) { 2882 aux.tp_vlan_tci = vlan_tx_tag_get(skb); 2883 aux.tp_vlan_tpid = ntohs(skb->vlan_proto); 2884 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; 2885 } else { 2886 aux.tp_vlan_tci = 0; 2887 aux.tp_vlan_tpid = 0; 2888 } 2889 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); 2890 } 2891 2892 /* 2893 * Free or return the buffer as appropriate. Again this 2894 * hides all the races and re-entrancy issues from us. 2895 */ 2896 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? 
skb->len : copied); 2897 2898 out_free: 2899 skb_free_datagram(sk, skb); 2900 out: 2901 return err; 2902 } 2903 2904 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, 2905 int *uaddr_len, int peer) 2906 { 2907 struct net_device *dev; 2908 struct sock *sk = sock->sk; 2909 2910 if (peer) 2911 return -EOPNOTSUPP; 2912 2913 uaddr->sa_family = AF_PACKET; 2914 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); 2915 rcu_read_lock(); 2916 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); 2917 if (dev) 2918 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); 2919 rcu_read_unlock(); 2920 *uaddr_len = sizeof(*uaddr); 2921 2922 return 0; 2923 } 2924 2925 static int packet_getname(struct socket *sock, struct sockaddr *uaddr, 2926 int *uaddr_len, int peer) 2927 { 2928 struct net_device *dev; 2929 struct sock *sk = sock->sk; 2930 struct packet_sock *po = pkt_sk(sk); 2931 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); 2932 2933 if (peer) 2934 return -EOPNOTSUPP; 2935 2936 sll->sll_family = AF_PACKET; 2937 sll->sll_ifindex = po->ifindex; 2938 sll->sll_protocol = po->num; 2939 sll->sll_pkttype = 0; 2940 rcu_read_lock(); 2941 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); 2942 if (dev) { 2943 sll->sll_hatype = dev->type; 2944 sll->sll_halen = dev->addr_len; 2945 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); 2946 } else { 2947 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ 2948 sll->sll_halen = 0; 2949 } 2950 rcu_read_unlock(); 2951 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; 2952 2953 return 0; 2954 } 2955 2956 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, 2957 int what) 2958 { 2959 switch (i->type) { 2960 case PACKET_MR_MULTICAST: 2961 if (i->alen != dev->addr_len) 2962 return -EINVAL; 2963 if (what > 0) 2964 return dev_mc_add(dev, i->addr); 2965 else 2966 return dev_mc_del(dev, i->addr); 2967 break; 2968 case PACKET_MR_PROMISC: 2969 return 
dev_set_promiscuity(dev, what); 2970 break; 2971 case PACKET_MR_ALLMULTI: 2972 return dev_set_allmulti(dev, what); 2973 break; 2974 case PACKET_MR_UNICAST: 2975 if (i->alen != dev->addr_len) 2976 return -EINVAL; 2977 if (what > 0) 2978 return dev_uc_add(dev, i->addr); 2979 else 2980 return dev_uc_del(dev, i->addr); 2981 break; 2982 default: 2983 break; 2984 } 2985 return 0; 2986 } 2987 2988 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) 2989 { 2990 for ( ; i; i = i->next) { 2991 if (i->ifindex == dev->ifindex) 2992 packet_dev_mc(dev, i, what); 2993 } 2994 } 2995 2996 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) 2997 { 2998 struct packet_sock *po = pkt_sk(sk); 2999 struct packet_mclist *ml, *i; 3000 struct net_device *dev; 3001 int err; 3002 3003 rtnl_lock(); 3004 3005 err = -ENODEV; 3006 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); 3007 if (!dev) 3008 goto done; 3009 3010 err = -EINVAL; 3011 if (mreq->mr_alen > dev->addr_len) 3012 goto done; 3013 3014 err = -ENOBUFS; 3015 i = kmalloc(sizeof(*i), GFP_KERNEL); 3016 if (i == NULL) 3017 goto done; 3018 3019 err = 0; 3020 for (ml = po->mclist; ml; ml = ml->next) { 3021 if (ml->ifindex == mreq->mr_ifindex && 3022 ml->type == mreq->mr_type && 3023 ml->alen == mreq->mr_alen && 3024 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { 3025 ml->count++; 3026 /* Free the new element ... 
*/ 3027 kfree(i); 3028 goto done; 3029 } 3030 } 3031 3032 i->type = mreq->mr_type; 3033 i->ifindex = mreq->mr_ifindex; 3034 i->alen = mreq->mr_alen; 3035 memcpy(i->addr, mreq->mr_address, i->alen); 3036 i->count = 1; 3037 i->next = po->mclist; 3038 po->mclist = i; 3039 err = packet_dev_mc(dev, i, 1); 3040 if (err) { 3041 po->mclist = i->next; 3042 kfree(i); 3043 } 3044 3045 done: 3046 rtnl_unlock(); 3047 return err; 3048 } 3049 3050 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) 3051 { 3052 struct packet_mclist *ml, **mlp; 3053 3054 rtnl_lock(); 3055 3056 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { 3057 if (ml->ifindex == mreq->mr_ifindex && 3058 ml->type == mreq->mr_type && 3059 ml->alen == mreq->mr_alen && 3060 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { 3061 if (--ml->count == 0) { 3062 struct net_device *dev; 3063 *mlp = ml->next; 3064 dev = __dev_get_by_index(sock_net(sk), ml->ifindex); 3065 if (dev) 3066 packet_dev_mc(dev, ml, -1); 3067 kfree(ml); 3068 } 3069 rtnl_unlock(); 3070 return 0; 3071 } 3072 } 3073 rtnl_unlock(); 3074 return -EADDRNOTAVAIL; 3075 } 3076 3077 static void packet_flush_mclist(struct sock *sk) 3078 { 3079 struct packet_sock *po = pkt_sk(sk); 3080 struct packet_mclist *ml; 3081 3082 if (!po->mclist) 3083 return; 3084 3085 rtnl_lock(); 3086 while ((ml = po->mclist) != NULL) { 3087 struct net_device *dev; 3088 3089 po->mclist = ml->next; 3090 dev = __dev_get_by_index(sock_net(sk), ml->ifindex); 3091 if (dev != NULL) 3092 packet_dev_mc(dev, ml, -1); 3093 kfree(ml); 3094 } 3095 rtnl_unlock(); 3096 } 3097 3098 static int 3099 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) 3100 { 3101 struct sock *sk = sock->sk; 3102 struct packet_sock *po = pkt_sk(sk); 3103 int ret; 3104 3105 if (level != SOL_PACKET) 3106 return -ENOPROTOOPT; 3107 3108 switch (optname) { 3109 case PACKET_ADD_MEMBERSHIP: 3110 case PACKET_DROP_MEMBERSHIP: 3111 
{ 3112 struct packet_mreq_max mreq; 3113 int len = optlen; 3114 memset(&mreq, 0, sizeof(mreq)); 3115 if (len < sizeof(struct packet_mreq)) 3116 return -EINVAL; 3117 if (len > sizeof(mreq)) 3118 len = sizeof(mreq); 3119 if (copy_from_user(&mreq, optval, len)) 3120 return -EFAULT; 3121 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) 3122 return -EINVAL; 3123 if (optname == PACKET_ADD_MEMBERSHIP) 3124 ret = packet_mc_add(sk, &mreq); 3125 else 3126 ret = packet_mc_drop(sk, &mreq); 3127 return ret; 3128 } 3129 3130 case PACKET_RX_RING: 3131 case PACKET_TX_RING: 3132 { 3133 union tpacket_req_u req_u; 3134 int len; 3135 3136 switch (po->tp_version) { 3137 case TPACKET_V1: 3138 case TPACKET_V2: 3139 len = sizeof(req_u.req); 3140 break; 3141 case TPACKET_V3: 3142 default: 3143 len = sizeof(req_u.req3); 3144 break; 3145 } 3146 if (optlen < len) 3147 return -EINVAL; 3148 if (pkt_sk(sk)->has_vnet_hdr) 3149 return -EINVAL; 3150 if (copy_from_user(&req_u.req, optval, len)) 3151 return -EFAULT; 3152 return packet_set_ring(sk, &req_u, 0, 3153 optname == PACKET_TX_RING); 3154 } 3155 case PACKET_COPY_THRESH: 3156 { 3157 int val; 3158 3159 if (optlen != sizeof(val)) 3160 return -EINVAL; 3161 if (copy_from_user(&val, optval, sizeof(val))) 3162 return -EFAULT; 3163 3164 pkt_sk(sk)->copy_thresh = val; 3165 return 0; 3166 } 3167 case PACKET_VERSION: 3168 { 3169 int val; 3170 3171 if (optlen != sizeof(val)) 3172 return -EINVAL; 3173 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3174 return -EBUSY; 3175 if (copy_from_user(&val, optval, sizeof(val))) 3176 return -EFAULT; 3177 switch (val) { 3178 case TPACKET_V1: 3179 case TPACKET_V2: 3180 case TPACKET_V3: 3181 po->tp_version = val; 3182 return 0; 3183 default: 3184 return -EINVAL; 3185 } 3186 } 3187 case PACKET_RESERVE: 3188 { 3189 unsigned int val; 3190 3191 if (optlen != sizeof(val)) 3192 return -EINVAL; 3193 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3194 return -EBUSY; 3195 if (copy_from_user(&val, optval, 
sizeof(val))) 3196 return -EFAULT; 3197 po->tp_reserve = val; 3198 return 0; 3199 } 3200 case PACKET_LOSS: 3201 { 3202 unsigned int val; 3203 3204 if (optlen != sizeof(val)) 3205 return -EINVAL; 3206 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3207 return -EBUSY; 3208 if (copy_from_user(&val, optval, sizeof(val))) 3209 return -EFAULT; 3210 po->tp_loss = !!val; 3211 return 0; 3212 } 3213 case PACKET_AUXDATA: 3214 { 3215 int val; 3216 3217 if (optlen < sizeof(val)) 3218 return -EINVAL; 3219 if (copy_from_user(&val, optval, sizeof(val))) 3220 return -EFAULT; 3221 3222 po->auxdata = !!val; 3223 return 0; 3224 } 3225 case PACKET_ORIGDEV: 3226 { 3227 int val; 3228 3229 if (optlen < sizeof(val)) 3230 return -EINVAL; 3231 if (copy_from_user(&val, optval, sizeof(val))) 3232 return -EFAULT; 3233 3234 po->origdev = !!val; 3235 return 0; 3236 } 3237 case PACKET_VNET_HDR: 3238 { 3239 int val; 3240 3241 if (sock->type != SOCK_RAW) 3242 return -EINVAL; 3243 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3244 return -EBUSY; 3245 if (optlen < sizeof(val)) 3246 return -EINVAL; 3247 if (copy_from_user(&val, optval, sizeof(val))) 3248 return -EFAULT; 3249 3250 po->has_vnet_hdr = !!val; 3251 return 0; 3252 } 3253 case PACKET_TIMESTAMP: 3254 { 3255 int val; 3256 3257 if (optlen != sizeof(val)) 3258 return -EINVAL; 3259 if (copy_from_user(&val, optval, sizeof(val))) 3260 return -EFAULT; 3261 3262 po->tp_tstamp = val; 3263 return 0; 3264 } 3265 case PACKET_FANOUT: 3266 { 3267 int val; 3268 3269 if (optlen != sizeof(val)) 3270 return -EINVAL; 3271 if (copy_from_user(&val, optval, sizeof(val))) 3272 return -EFAULT; 3273 3274 return fanout_add(sk, val & 0xffff, val >> 16); 3275 } 3276 case PACKET_TX_HAS_OFF: 3277 { 3278 unsigned int val; 3279 3280 if (optlen != sizeof(val)) 3281 return -EINVAL; 3282 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3283 return -EBUSY; 3284 if (copy_from_user(&val, optval, sizeof(val))) 3285 return -EFAULT; 3286 po->tp_tx_has_off = !!val; 3287 return 0; 3288 } 
3289 case PACKET_QDISC_BYPASS: 3290 { 3291 int val; 3292 3293 if (optlen != sizeof(val)) 3294 return -EINVAL; 3295 if (copy_from_user(&val, optval, sizeof(val))) 3296 return -EFAULT; 3297 3298 po->xmit = val ? packet_direct_xmit : dev_queue_xmit; 3299 return 0; 3300 } 3301 default: 3302 return -ENOPROTOOPT; 3303 } 3304 } 3305 3306 static int packet_getsockopt(struct socket *sock, int level, int optname, 3307 char __user *optval, int __user *optlen) 3308 { 3309 int len; 3310 int val, lv = sizeof(val); 3311 struct sock *sk = sock->sk; 3312 struct packet_sock *po = pkt_sk(sk); 3313 void *data = &val; 3314 union tpacket_stats_u st; 3315 3316 if (level != SOL_PACKET) 3317 return -ENOPROTOOPT; 3318 3319 if (get_user(len, optlen)) 3320 return -EFAULT; 3321 3322 if (len < 0) 3323 return -EINVAL; 3324 3325 switch (optname) { 3326 case PACKET_STATISTICS: 3327 spin_lock_bh(&sk->sk_receive_queue.lock); 3328 memcpy(&st, &po->stats, sizeof(st)); 3329 memset(&po->stats, 0, sizeof(po->stats)); 3330 spin_unlock_bh(&sk->sk_receive_queue.lock); 3331 3332 if (po->tp_version == TPACKET_V3) { 3333 lv = sizeof(struct tpacket_stats_v3); 3334 st.stats3.tp_packets += st.stats3.tp_drops; 3335 data = &st.stats3; 3336 } else { 3337 lv = sizeof(struct tpacket_stats); 3338 st.stats1.tp_packets += st.stats1.tp_drops; 3339 data = &st.stats1; 3340 } 3341 3342 break; 3343 case PACKET_AUXDATA: 3344 val = po->auxdata; 3345 break; 3346 case PACKET_ORIGDEV: 3347 val = po->origdev; 3348 break; 3349 case PACKET_VNET_HDR: 3350 val = po->has_vnet_hdr; 3351 break; 3352 case PACKET_VERSION: 3353 val = po->tp_version; 3354 break; 3355 case PACKET_HDRLEN: 3356 if (len > sizeof(int)) 3357 len = sizeof(int); 3358 if (copy_from_user(&val, optval, len)) 3359 return -EFAULT; 3360 switch (val) { 3361 case TPACKET_V1: 3362 val = sizeof(struct tpacket_hdr); 3363 break; 3364 case TPACKET_V2: 3365 val = sizeof(struct tpacket2_hdr); 3366 break; 3367 case TPACKET_V3: 3368 val = sizeof(struct tpacket3_hdr); 3369 break; 3370 
default: 3371 return -EINVAL; 3372 } 3373 break; 3374 case PACKET_RESERVE: 3375 val = po->tp_reserve; 3376 break; 3377 case PACKET_LOSS: 3378 val = po->tp_loss; 3379 break; 3380 case PACKET_TIMESTAMP: 3381 val = po->tp_tstamp; 3382 break; 3383 case PACKET_FANOUT: 3384 val = (po->fanout ? 3385 ((u32)po->fanout->id | 3386 ((u32)po->fanout->type << 16) | 3387 ((u32)po->fanout->flags << 24)) : 3388 0); 3389 break; 3390 case PACKET_TX_HAS_OFF: 3391 val = po->tp_tx_has_off; 3392 break; 3393 case PACKET_QDISC_BYPASS: 3394 val = packet_use_direct_xmit(po); 3395 break; 3396 default: 3397 return -ENOPROTOOPT; 3398 } 3399 3400 if (len > lv) 3401 len = lv; 3402 if (put_user(len, optlen)) 3403 return -EFAULT; 3404 if (copy_to_user(optval, data, len)) 3405 return -EFAULT; 3406 return 0; 3407 } 3408 3409 3410 static int packet_notifier(struct notifier_block *this, 3411 unsigned long msg, void *ptr) 3412 { 3413 struct sock *sk; 3414 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 3415 struct net *net = dev_net(dev); 3416 3417 rcu_read_lock(); 3418 sk_for_each_rcu(sk, &net->packet.sklist) { 3419 struct packet_sock *po = pkt_sk(sk); 3420 3421 switch (msg) { 3422 case NETDEV_UNREGISTER: 3423 if (po->mclist) 3424 packet_dev_mclist(dev, po->mclist, -1); 3425 /* fallthrough */ 3426 3427 case NETDEV_DOWN: 3428 if (dev->ifindex == po->ifindex) { 3429 spin_lock(&po->bind_lock); 3430 if (po->running) { 3431 __unregister_prot_hook(sk, false); 3432 sk->sk_err = ENETDOWN; 3433 if (!sock_flag(sk, SOCK_DEAD)) 3434 sk->sk_error_report(sk); 3435 } 3436 if (msg == NETDEV_UNREGISTER) { 3437 packet_cached_dev_reset(po); 3438 po->ifindex = -1; 3439 if (po->prot_hook.dev) 3440 dev_put(po->prot_hook.dev); 3441 po->prot_hook.dev = NULL; 3442 } 3443 spin_unlock(&po->bind_lock); 3444 } 3445 break; 3446 case NETDEV_UP: 3447 if (dev->ifindex == po->ifindex) { 3448 spin_lock(&po->bind_lock); 3449 if (po->num) 3450 register_prot_hook(sk); 3451 spin_unlock(&po->bind_lock); 3452 } 3453 break; 3454 } 
3455 } 3456 rcu_read_unlock(); 3457 return NOTIFY_DONE; 3458 } 3459 3460 3461 static int packet_ioctl(struct socket *sock, unsigned int cmd, 3462 unsigned long arg) 3463 { 3464 struct sock *sk = sock->sk; 3465 3466 switch (cmd) { 3467 case SIOCOUTQ: 3468 { 3469 int amount = sk_wmem_alloc_get(sk); 3470 3471 return put_user(amount, (int __user *)arg); 3472 } 3473 case SIOCINQ: 3474 { 3475 struct sk_buff *skb; 3476 int amount = 0; 3477 3478 spin_lock_bh(&sk->sk_receive_queue.lock); 3479 skb = skb_peek(&sk->sk_receive_queue); 3480 if (skb) 3481 amount = skb->len; 3482 spin_unlock_bh(&sk->sk_receive_queue.lock); 3483 return put_user(amount, (int __user *)arg); 3484 } 3485 case SIOCGSTAMP: 3486 return sock_get_timestamp(sk, (struct timeval __user *)arg); 3487 case SIOCGSTAMPNS: 3488 return sock_get_timestampns(sk, (struct timespec __user *)arg); 3489 3490 #ifdef CONFIG_INET 3491 case SIOCADDRT: 3492 case SIOCDELRT: 3493 case SIOCDARP: 3494 case SIOCGARP: 3495 case SIOCSARP: 3496 case SIOCGIFADDR: 3497 case SIOCSIFADDR: 3498 case SIOCGIFBRDADDR: 3499 case SIOCSIFBRDADDR: 3500 case SIOCGIFNETMASK: 3501 case SIOCSIFNETMASK: 3502 case SIOCGIFDSTADDR: 3503 case SIOCSIFDSTADDR: 3504 case SIOCSIFFLAGS: 3505 return inet_dgram_ops.ioctl(sock, cmd, arg); 3506 #endif 3507 3508 default: 3509 return -ENOIOCTLCMD; 3510 } 3511 return 0; 3512 } 3513 3514 static unsigned int packet_poll(struct file *file, struct socket *sock, 3515 poll_table *wait) 3516 { 3517 struct sock *sk = sock->sk; 3518 struct packet_sock *po = pkt_sk(sk); 3519 unsigned int mask = datagram_poll(file, sock, wait); 3520 3521 spin_lock_bh(&sk->sk_receive_queue.lock); 3522 if (po->rx_ring.pg_vec) { 3523 if (!packet_previous_rx_frame(po, &po->rx_ring, 3524 TP_STATUS_KERNEL)) 3525 mask |= POLLIN | POLLRDNORM; 3526 } 3527 spin_unlock_bh(&sk->sk_receive_queue.lock); 3528 spin_lock_bh(&sk->sk_write_queue.lock); 3529 if (po->tx_ring.pg_vec) { 3530 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) 3531 mask |= 
POLLOUT | POLLWRNORM; 3532 } 3533 spin_unlock_bh(&sk->sk_write_queue.lock); 3534 return mask; 3535 } 3536 3537 3538 /* Dirty? Well, I still did not learn better way to account 3539 * for user mmaps. 3540 */ 3541 3542 static void packet_mm_open(struct vm_area_struct *vma) 3543 { 3544 struct file *file = vma->vm_file; 3545 struct socket *sock = file->private_data; 3546 struct sock *sk = sock->sk; 3547 3548 if (sk) 3549 atomic_inc(&pkt_sk(sk)->mapped); 3550 } 3551 3552 static void packet_mm_close(struct vm_area_struct *vma) 3553 { 3554 struct file *file = vma->vm_file; 3555 struct socket *sock = file->private_data; 3556 struct sock *sk = sock->sk; 3557 3558 if (sk) 3559 atomic_dec(&pkt_sk(sk)->mapped); 3560 } 3561 3562 static const struct vm_operations_struct packet_mmap_ops = { 3563 .open = packet_mm_open, 3564 .close = packet_mm_close, 3565 }; 3566 3567 static void free_pg_vec(struct pgv *pg_vec, unsigned int order, 3568 unsigned int len) 3569 { 3570 int i; 3571 3572 for (i = 0; i < len; i++) { 3573 if (likely(pg_vec[i].buffer)) { 3574 if (is_vmalloc_addr(pg_vec[i].buffer)) 3575 vfree(pg_vec[i].buffer); 3576 else 3577 free_pages((unsigned long)pg_vec[i].buffer, 3578 order); 3579 pg_vec[i].buffer = NULL; 3580 } 3581 } 3582 kfree(pg_vec); 3583 } 3584 3585 static char *alloc_one_pg_vec_page(unsigned long order) 3586 { 3587 char *buffer = NULL; 3588 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | 3589 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; 3590 3591 buffer = (char *) __get_free_pages(gfp_flags, order); 3592 3593 if (buffer) 3594 return buffer; 3595 3596 /* 3597 * __get_free_pages failed, fall back to vmalloc 3598 */ 3599 buffer = vzalloc((1 << order) * PAGE_SIZE); 3600 3601 if (buffer) 3602 return buffer; 3603 3604 /* 3605 * vmalloc failed, lets dig into swap here 3606 */ 3607 gfp_flags &= ~__GFP_NORETRY; 3608 buffer = (char *)__get_free_pages(gfp_flags, order); 3609 if (buffer) 3610 return buffer; 3611 3612 /* 3613 * complete and utter failure 3614 */ 3615 return NULL; 
3616 } 3617 3618 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) 3619 { 3620 unsigned int block_nr = req->tp_block_nr; 3621 struct pgv *pg_vec; 3622 int i; 3623 3624 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL); 3625 if (unlikely(!pg_vec)) 3626 goto out; 3627 3628 for (i = 0; i < block_nr; i++) { 3629 pg_vec[i].buffer = alloc_one_pg_vec_page(order); 3630 if (unlikely(!pg_vec[i].buffer)) 3631 goto out_free_pgvec; 3632 } 3633 3634 out: 3635 return pg_vec; 3636 3637 out_free_pgvec: 3638 free_pg_vec(pg_vec, order, block_nr); 3639 pg_vec = NULL; 3640 goto out; 3641 } 3642 3643 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, 3644 int closing, int tx_ring) 3645 { 3646 struct pgv *pg_vec = NULL; 3647 struct packet_sock *po = pkt_sk(sk); 3648 int was_running, order = 0; 3649 struct packet_ring_buffer *rb; 3650 struct sk_buff_head *rb_queue; 3651 __be16 num; 3652 int err = -EINVAL; 3653 /* Added to avoid minimal code churn */ 3654 struct tpacket_req *req = &req_u->req; 3655 3656 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ 3657 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { 3658 WARN(1, "Tx-ring is not supported.\n"); 3659 goto out; 3660 } 3661 3662 rb = tx_ring ? &po->tx_ring : &po->rx_ring; 3663 rb_queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; 3664 3665 err = -EBUSY; 3666 if (!closing) { 3667 if (atomic_read(&po->mapped)) 3668 goto out; 3669 if (atomic_read(&rb->pending)) 3670 goto out; 3671 } 3672 3673 if (req->tp_block_nr) { 3674 /* Sanity tests and some calculations */ 3675 err = -EBUSY; 3676 if (unlikely(rb->pg_vec)) 3677 goto out; 3678 3679 switch (po->tp_version) { 3680 case TPACKET_V1: 3681 po->tp_hdrlen = TPACKET_HDRLEN; 3682 break; 3683 case TPACKET_V2: 3684 po->tp_hdrlen = TPACKET2_HDRLEN; 3685 break; 3686 case TPACKET_V3: 3687 po->tp_hdrlen = TPACKET3_HDRLEN; 3688 break; 3689 } 3690 3691 err = -EINVAL; 3692 if (unlikely((int)req->tp_block_size <= 0)) 3693 goto out; 3694 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) 3695 goto out; 3696 if (unlikely(req->tp_frame_size < po->tp_hdrlen + 3697 po->tp_reserve)) 3698 goto out; 3699 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) 3700 goto out; 3701 3702 rb->frames_per_block = req->tp_block_size/req->tp_frame_size; 3703 if (unlikely(rb->frames_per_block <= 0)) 3704 goto out; 3705 if (unlikely((rb->frames_per_block * req->tp_block_nr) != 3706 req->tp_frame_nr)) 3707 goto out; 3708 3709 err = -ENOMEM; 3710 order = get_order(req->tp_block_size); 3711 pg_vec = alloc_pg_vec(req, order); 3712 if (unlikely(!pg_vec)) 3713 goto out; 3714 switch (po->tp_version) { 3715 case TPACKET_V3: 3716 /* Transmit path is not supported. 
We checked 3717 * it above but just being paranoid 3718 */ 3719 if (!tx_ring) 3720 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); 3721 break; 3722 default: 3723 break; 3724 } 3725 } 3726 /* Done */ 3727 else { 3728 err = -EINVAL; 3729 if (unlikely(req->tp_frame_nr)) 3730 goto out; 3731 } 3732 3733 lock_sock(sk); 3734 3735 /* Detach socket from network */ 3736 spin_lock(&po->bind_lock); 3737 was_running = po->running; 3738 num = po->num; 3739 if (was_running) { 3740 po->num = 0; 3741 __unregister_prot_hook(sk, false); 3742 } 3743 spin_unlock(&po->bind_lock); 3744 3745 synchronize_net(); 3746 3747 err = -EBUSY; 3748 mutex_lock(&po->pg_vec_lock); 3749 if (closing || atomic_read(&po->mapped) == 0) { 3750 err = 0; 3751 spin_lock_bh(&rb_queue->lock); 3752 swap(rb->pg_vec, pg_vec); 3753 rb->frame_max = (req->tp_frame_nr - 1); 3754 rb->head = 0; 3755 rb->frame_size = req->tp_frame_size; 3756 spin_unlock_bh(&rb_queue->lock); 3757 3758 swap(rb->pg_vec_order, order); 3759 swap(rb->pg_vec_len, req->tp_block_nr); 3760 3761 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; 3762 po->prot_hook.func = (po->rx_ring.pg_vec) ? 
3763 tpacket_rcv : packet_rcv; 3764 skb_queue_purge(rb_queue); 3765 if (atomic_read(&po->mapped)) 3766 pr_err("packet_mmap: vma is busy: %d\n", 3767 atomic_read(&po->mapped)); 3768 } 3769 mutex_unlock(&po->pg_vec_lock); 3770 3771 spin_lock(&po->bind_lock); 3772 if (was_running) { 3773 po->num = num; 3774 register_prot_hook(sk); 3775 } 3776 spin_unlock(&po->bind_lock); 3777 if (closing && (po->tp_version > TPACKET_V2)) { 3778 /* Because we don't support block-based V3 on tx-ring */ 3779 if (!tx_ring) 3780 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue); 3781 } 3782 release_sock(sk); 3783 3784 if (pg_vec) 3785 free_pg_vec(pg_vec, order, req->tp_block_nr); 3786 out: 3787 return err; 3788 } 3789 3790 static int packet_mmap(struct file *file, struct socket *sock, 3791 struct vm_area_struct *vma) 3792 { 3793 struct sock *sk = sock->sk; 3794 struct packet_sock *po = pkt_sk(sk); 3795 unsigned long size, expected_size; 3796 struct packet_ring_buffer *rb; 3797 unsigned long start; 3798 int err = -EINVAL; 3799 int i; 3800 3801 if (vma->vm_pgoff) 3802 return -EINVAL; 3803 3804 mutex_lock(&po->pg_vec_lock); 3805 3806 expected_size = 0; 3807 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { 3808 if (rb->pg_vec) { 3809 expected_size += rb->pg_vec_len 3810 * rb->pg_vec_pages 3811 * PAGE_SIZE; 3812 } 3813 } 3814 3815 if (expected_size == 0) 3816 goto out; 3817 3818 size = vma->vm_end - vma->vm_start; 3819 if (size != expected_size) 3820 goto out; 3821 3822 start = vma->vm_start; 3823 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { 3824 if (rb->pg_vec == NULL) 3825 continue; 3826 3827 for (i = 0; i < rb->pg_vec_len; i++) { 3828 struct page *page; 3829 void *kaddr = rb->pg_vec[i].buffer; 3830 int pg_num; 3831 3832 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { 3833 page = pgv_to_page(kaddr); 3834 err = vm_insert_page(vma, start, page); 3835 if (unlikely(err)) 3836 goto out; 3837 start += PAGE_SIZE; 3838 kaddr += PAGE_SIZE; 3839 } 3840 } 3841 } 3842 3843 
atomic_inc(&po->mapped); 3844 vma->vm_ops = &packet_mmap_ops; 3845 err = 0; 3846 3847 out: 3848 mutex_unlock(&po->pg_vec_lock); 3849 return err; 3850 } 3851 3852 static const struct proto_ops packet_ops_spkt = { 3853 .family = PF_PACKET, 3854 .owner = THIS_MODULE, 3855 .release = packet_release, 3856 .bind = packet_bind_spkt, 3857 .connect = sock_no_connect, 3858 .socketpair = sock_no_socketpair, 3859 .accept = sock_no_accept, 3860 .getname = packet_getname_spkt, 3861 .poll = datagram_poll, 3862 .ioctl = packet_ioctl, 3863 .listen = sock_no_listen, 3864 .shutdown = sock_no_shutdown, 3865 .setsockopt = sock_no_setsockopt, 3866 .getsockopt = sock_no_getsockopt, 3867 .sendmsg = packet_sendmsg_spkt, 3868 .recvmsg = packet_recvmsg, 3869 .mmap = sock_no_mmap, 3870 .sendpage = sock_no_sendpage, 3871 }; 3872 3873 static const struct proto_ops packet_ops = { 3874 .family = PF_PACKET, 3875 .owner = THIS_MODULE, 3876 .release = packet_release, 3877 .bind = packet_bind, 3878 .connect = sock_no_connect, 3879 .socketpair = sock_no_socketpair, 3880 .accept = sock_no_accept, 3881 .getname = packet_getname, 3882 .poll = packet_poll, 3883 .ioctl = packet_ioctl, 3884 .listen = sock_no_listen, 3885 .shutdown = sock_no_shutdown, 3886 .setsockopt = packet_setsockopt, 3887 .getsockopt = packet_getsockopt, 3888 .sendmsg = packet_sendmsg, 3889 .recvmsg = packet_recvmsg, 3890 .mmap = packet_mmap, 3891 .sendpage = sock_no_sendpage, 3892 }; 3893 3894 static const struct net_proto_family packet_family_ops = { 3895 .family = PF_PACKET, 3896 .create = packet_create, 3897 .owner = THIS_MODULE, 3898 }; 3899 3900 static struct notifier_block packet_netdev_notifier = { 3901 .notifier_call = packet_notifier, 3902 }; 3903 3904 #ifdef CONFIG_PROC_FS 3905 3906 static void *packet_seq_start(struct seq_file *seq, loff_t *pos) 3907 __acquires(RCU) 3908 { 3909 struct net *net = seq_file_net(seq); 3910 3911 rcu_read_lock(); 3912 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); 3913 } 3914 3915 
static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3916 { 3917 struct net *net = seq_file_net(seq); 3918 return seq_hlist_next_rcu(v, &net->packet.sklist, pos); 3919 } 3920 3921 static void packet_seq_stop(struct seq_file *seq, void *v) 3922 __releases(RCU) 3923 { 3924 rcu_read_unlock(); 3925 } 3926 3927 static int packet_seq_show(struct seq_file *seq, void *v) 3928 { 3929 if (v == SEQ_START_TOKEN) 3930 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); 3931 else { 3932 struct sock *s = sk_entry(v); 3933 const struct packet_sock *po = pkt_sk(s); 3934 3935 seq_printf(seq, 3936 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", 3937 s, 3938 atomic_read(&s->sk_refcnt), 3939 s->sk_type, 3940 ntohs(po->num), 3941 po->ifindex, 3942 po->running, 3943 atomic_read(&s->sk_rmem_alloc), 3944 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), 3945 sock_i_ino(s)); 3946 } 3947 3948 return 0; 3949 } 3950 3951 static const struct seq_operations packet_seq_ops = { 3952 .start = packet_seq_start, 3953 .next = packet_seq_next, 3954 .stop = packet_seq_stop, 3955 .show = packet_seq_show, 3956 }; 3957 3958 static int packet_seq_open(struct inode *inode, struct file *file) 3959 { 3960 return seq_open_net(inode, file, &packet_seq_ops, 3961 sizeof(struct seq_net_private)); 3962 } 3963 3964 static const struct file_operations packet_seq_fops = { 3965 .owner = THIS_MODULE, 3966 .open = packet_seq_open, 3967 .read = seq_read, 3968 .llseek = seq_lseek, 3969 .release = seq_release_net, 3970 }; 3971 3972 #endif 3973 3974 static int __net_init packet_net_init(struct net *net) 3975 { 3976 mutex_init(&net->packet.sklist_lock); 3977 INIT_HLIST_HEAD(&net->packet.sklist); 3978 3979 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops)) 3980 return -ENOMEM; 3981 3982 return 0; 3983 } 3984 3985 static void __net_exit packet_net_exit(struct net *net) 3986 { 3987 remove_proc_entry("packet", net->proc_net); 3988 } 3989 3990 static struct pernet_operations 
packet_net_ops = { 3991 .init = packet_net_init, 3992 .exit = packet_net_exit, 3993 }; 3994 3995 3996 static void __exit packet_exit(void) 3997 { 3998 unregister_netdevice_notifier(&packet_netdev_notifier); 3999 unregister_pernet_subsys(&packet_net_ops); 4000 sock_unregister(PF_PACKET); 4001 proto_unregister(&packet_proto); 4002 } 4003 4004 static int __init packet_init(void) 4005 { 4006 int rc = proto_register(&packet_proto, 0); 4007 4008 if (rc != 0) 4009 goto out; 4010 4011 sock_register(&packet_family_ops); 4012 register_pernet_subsys(&packet_net_ops); 4013 register_netdevice_notifier(&packet_netdev_notifier); 4014 out: 4015 return rc; 4016 } 4017 4018 module_init(packet_init); 4019 module_exit(packet_exit); 4020 MODULE_LICENSE("GPL"); 4021 MODULE_ALIAS_NETPROTO(PF_PACKET); 4022