in_pcb.c (a0e610c43975ca0ec0bfc7d1df88d8b7a3cb871c) in_pcb.c (f3e7afe2d7b262ab55ab818445d4dfdb6e0c70a9)
1/*-
2 * Copyright (c) 1982, 1986, 1991, 1993, 1995
3 * The Regents of the University of California.
4 * Copyright (c) 2007-2009 Robert N. M. Watson
5 * Copyright (c) 2010-2011 Juniper Networks, Inc.
6 * All rights reserved.
7 *
8 * Portions of this software were developed by Robert N. M. Watson under

--- 28 unchanged lines hidden (view full) ---

37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD$");
40
41#include "opt_ddb.h"
42#include "opt_ipsec.h"
43#include "opt_inet.h"
44#include "opt_inet6.h"
1/*-
2 * Copyright (c) 1982, 1986, 1991, 1993, 1995
3 * The Regents of the University of California.
4 * Copyright (c) 2007-2009 Robert N. M. Watson
5 * Copyright (c) 2010-2011 Juniper Networks, Inc.
6 * All rights reserved.
7 *
8 * Portions of this software were developed by Robert N. M. Watson under

--- 28 unchanged lines hidden (view full) ---

37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD$");
40
41#include "opt_ddb.h"
42#include "opt_ipsec.h"
43#include "opt_inet.h"
44#include "opt_inet6.h"
45#include "opt_ratelimit.h"
45#include "opt_pcbgroup.h"
46#include "opt_rss.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/lock.h>
51#include <sys/malloc.h>
52#include <sys/mbuf.h>
53#include <sys/callout.h>
54#include <sys/eventhandler.h>
55#include <sys/domain.h>
56#include <sys/protosw.h>
57#include <sys/rmlock.h>
58#include <sys/socket.h>
59#include <sys/socketvar.h>
46#include "opt_pcbgroup.h"
47#include "opt_rss.h"
48
49#include <sys/param.h>
50#include <sys/systm.h>
51#include <sys/lock.h>
52#include <sys/malloc.h>
53#include <sys/mbuf.h>
54#include <sys/callout.h>
55#include <sys/eventhandler.h>
56#include <sys/domain.h>
57#include <sys/protosw.h>
58#include <sys/rmlock.h>
59#include <sys/socket.h>
60#include <sys/socketvar.h>
61#include <sys/sockio.h>
60#include <sys/priv.h>
61#include <sys/proc.h>
62#include <sys/refcount.h>
63#include <sys/jail.h>
64#include <sys/kernel.h>
65#include <sys/sysctl.h>
66
67#ifdef DDB

--- 1067 unchanged lines hidden (view full) ---

1135 * socket, in which case in_pcbfree() is deferred.
1136 */
1137void
1138in_pcbdetach(struct inpcb *inp)
1139{
1140
1141 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1142
62#include <sys/priv.h>
63#include <sys/proc.h>
64#include <sys/refcount.h>
65#include <sys/jail.h>
66#include <sys/kernel.h>
67#include <sys/sysctl.h>
68
69#ifdef DDB

--- 1067 unchanged lines hidden (view full) ---

1137 * socket, in which case in_pcbfree() is deferred.
1138 */
1139void
1140in_pcbdetach(struct inpcb *inp)
1141{
1142
1143 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1144
1145#ifdef RATELIMIT
1146 if (inp->inp_snd_tag != NULL)
1147 in_pcbdetach_txrtlmt(inp);
1148#endif
1143 inp->inp_socket->so_pcb = NULL;
1144 inp->inp_socket = NULL;
1145}
1146
1147/*
1148 * in_pcbref() bumps the reference count on an inpcb in order to maintain
1149 * stability of an inpcb pointer despite the inpcb lock being released. This
1150 * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,

--- 1521 unchanged lines hidden (view full) ---

2672 db_printf("usage: show inpcb <addr>\n");
2673 return;
2674 }
2675 inp = (struct inpcb *)addr;
2676
2677 db_print_inpcb(inp, "inpcb", 0);
2678}
2679#endif /* DDB */
1149 inp->inp_socket->so_pcb = NULL;
1150 inp->inp_socket = NULL;
1151}
1152
1153/*
1154 * in_pcbref() bumps the reference count on an inpcb in order to maintain
1155 * stability of an inpcb pointer despite the inpcb lock being released. This
1156 * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,

--- 1521 unchanged lines hidden (view full) ---

2678 db_printf("usage: show inpcb <addr>\n");
2679 return;
2680 }
2681 inp = (struct inpcb *)addr;
2682
2683 db_print_inpcb(inp, "inpcb", 0);
2684}
2685#endif /* DDB */
2686
2687#ifdef RATELIMIT
2688/*
2689 * Modify TX rate limit based on the existing "inp->inp_snd_tag",
2690 * if any.
2691 */
2692int
2693in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
2694{
2695 union if_snd_tag_modify_params params = {
2696 .rate_limit.max_rate = max_pacing_rate,
2697 };
2698 struct m_snd_tag *mst;
2699 struct ifnet *ifp;
2700 int error;
2701
2702 mst = inp->inp_snd_tag;
2703 if (mst == NULL)
2704 return (EINVAL);
2705
2706 ifp = mst->ifp;
2707 if (ifp == NULL)
2708 return (EINVAL);
2709
2710 if (ifp->if_snd_tag_modify == NULL) {
2711 error = EOPNOTSUPP;
2712 } else {
2713 error = ifp->if_snd_tag_modify(mst, &params);
2714 }
2715 return (error);
2716}
2717
2718/*
2719 * Query existing TX rate limit based on the existing
2720 * "inp->inp_snd_tag", if any.
2721 */
2722int
2723in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
2724{
2725 union if_snd_tag_query_params params = { };
2726 struct m_snd_tag *mst;
2727 struct ifnet *ifp;
2728 int error;
2729
2730 mst = inp->inp_snd_tag;
2731 if (mst == NULL)
2732 return (EINVAL);
2733
2734 ifp = mst->ifp;
2735 if (ifp == NULL)
2736 return (EINVAL);
2737
2738 if (ifp->if_snd_tag_query == NULL) {
2739 error = EOPNOTSUPP;
2740 } else {
2741 error = ifp->if_snd_tag_query(mst, &params);
2742 if (error == 0 && p_max_pacing_rate != NULL)
2743 *p_max_pacing_rate = params.rate_limit.max_rate;
2744 }
2745 return (error);
2746}
2747
2748/*
2749 * Allocate a new TX rate limit send tag from the network interface
2750 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
2751 */
2752int
2753in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
2754 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
2755{
2756 union if_snd_tag_alloc_params params = {
2757 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
2758 .rate_limit.hdr.flowid = flowid,
2759 .rate_limit.hdr.flowtype = flowtype,
2760 .rate_limit.max_rate = max_pacing_rate,
2761 };
2762 int error;
2763
2764 INP_WLOCK_ASSERT(inp);
2765
2766 if (inp->inp_snd_tag != NULL)
2767 return (EINVAL);
2768
2769 if (ifp->if_snd_tag_alloc == NULL) {
2770 error = EOPNOTSUPP;
2771 } else {
2772 error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
2773
2774 /*
2775 * At success increment the refcount on
2776 * the send tag's network interface:
2777 */
2778 if (error == 0)
2779 if_ref(inp->inp_snd_tag->ifp);
2780 }
2781 return (error);
2782}
2783
2784/*
2785 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
2786 * if any:
2787 */
2788void
2789in_pcbdetach_txrtlmt(struct inpcb *inp)
2790{
2791 struct m_snd_tag *mst;
2792 struct ifnet *ifp;
2793
2794 INP_WLOCK_ASSERT(inp);
2795
2796 mst = inp->inp_snd_tag;
2797 inp->inp_snd_tag = NULL;
2798
2799 if (mst == NULL)
2800 return;
2801
2802 ifp = mst->ifp;
2803 if (ifp == NULL)
2804 return;
2805
2806 /*
2807 * If the device was detached while we still had reference(s)
2808 * on the ifp, we assume if_snd_tag_free() was replaced with
2809 * stubs.
2810 */
2811 ifp->if_snd_tag_free(mst);
2812
2813 /* release reference count on network interface */
2814 if_rele(ifp);
2815}
2816
2817/*
2818 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
2819 * is set in the fast path and will attach/detach/modify the TX rate
2820 * limit send tag based on the socket's so_max_pacing_rate value.
2821 */
2822void
2823in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
2824{
2825 struct socket *socket;
2826 uint32_t max_pacing_rate;
2827 bool did_upgrade;
2828 int error;
2829
2830 if (inp == NULL)
2831 return;
2832
2833 socket = inp->inp_socket;
2834 if (socket == NULL)
2835 return;
2836
2837 if (!INP_WLOCKED(inp)) {
2838 /*
2839 * NOTE: If the write locking fails, we need to bail
2840 * out and use the non-ratelimited ring for the
2841 * transmit until there is a new chance to get the
2842 * write lock.
2843 */
2844 if (!INP_TRY_UPGRADE(inp))
2845 return;
2846 did_upgrade = 1;
2847 } else {
2848 did_upgrade = 0;
2849 }
2850
2851 /*
2852 * NOTE: The so_max_pacing_rate value is read unlocked,
2853 * because atomic updates are not required since the variable
2854 * is checked at every mbuf we send. It is assumed that the
2855 * variable read itself will be atomic.
2856 */
2857 max_pacing_rate = socket->so_max_pacing_rate;
2858
2859 /*
2860 * NOTE: When attaching to a network interface a reference is
2861 * made to ensure the network interface doesn't go away until
2862 * all ratelimit connections are gone. The network interface
2863 * pointers compared below represent valid network interfaces,
2864 * except when comparing towards NULL.
2865 */
2866 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
2867 error = 0;
2868 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
2869 if (inp->inp_snd_tag != NULL)
2870 in_pcbdetach_txrtlmt(inp);
2871 error = 0;
2872 } else if (inp->inp_snd_tag == NULL) {
2873 /*
2874 * In order to utilize packet pacing with RSS, we need
2875 * to wait until there is a valid RSS hash before we
2876 * can proceed:
2877 */
2878 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
2879 error = EAGAIN;
2880 } else {
2881 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
2882 mb->m_pkthdr.flowid, max_pacing_rate);
2883 }
2884 } else {
2885 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
2886 }
2887 if (error == 0 || error == EOPNOTSUPP)
2888 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
2889 if (did_upgrade)
2890 INP_DOWNGRADE(inp);
2891}
2892
2893/*
2894 * Track route changes for TX rate limiting.
2895 */
2896void
2897in_pcboutput_eagain(struct inpcb *inp)
2898{
2899 struct socket *socket;
2900 bool did_upgrade;
2901
2902 if (inp == NULL)
2903 return;
2904
2905 socket = inp->inp_socket;
2906 if (socket == NULL)
2907 return;
2908
2909 if (inp->inp_snd_tag == NULL)
2910 return;
2911
2912 if (!INP_WLOCKED(inp)) {
2913 /*
2914 * NOTE: If the write locking fails, we need to bail
2915 * out and use the non-ratelimited ring for the
2916 * transmit until there is a new chance to get the
2917 * write lock.
2918 */
2919 if (!INP_TRY_UPGRADE(inp))
2920 return;
2921 did_upgrade = 1;
2922 } else {
2923 did_upgrade = 0;
2924 }
2925
2926 /* detach rate limiting */
2927 in_pcbdetach_txrtlmt(inp);
2928
2929 /* make sure new mbuf send tag allocation is made */
2930 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
2931
2932 if (did_upgrade)
2933 INP_DOWNGRADE(inp);
2934}
2935#endif /* RATELIMIT */