netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
@ 2017-08-31 23:27 Sridhar Samudrala
  2017-09-10  1:28 ` Alexander Duyck
                   ` (4 more replies)
  0 siblings, 5 replies; 21+ messages in thread
From: Sridhar Samudrala @ 2017-08-31 23:27 UTC (permalink / raw)
  To: alexander.h.duyck, netdev

This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
to enable symmetric tx and rx queues on a socket.

This option is specifically useful for epoll based multi threaded workloads
where each thread handles packets received on a single RX queue . In this model,
we have noticed that it helps to send the packets on the same TX queue
corresponding to the queue-pair associated with the RX queue specifically when
busy poll is enabled with epoll().

Two new fields are added to struct sock_common to cache the last rx ifindex and
the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
rx queue when this option is enabled and the TX is happening on the same device.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 include/net/request_sock.h        |  1 +
 include/net/sock.h                | 17 +++++++++++++++++
 include/uapi/asm-generic/socket.h |  2 ++
 net/core/dev.c                    |  8 +++++++-
 net/core/sock.c                   | 10 ++++++++++
 net/ipv4/tcp_input.c              |  1 +
 net/ipv4/tcp_ipv4.c               |  1 +
 net/ipv4/tcp_minisocks.c          |  1 +
 8 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 23e2205..c3bc12e 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
 	req_to_sk(req)->sk_prot = sk_listener->sk_prot;
 	sk_node_init(&req_to_sk(req)->sk_node);
 	sk_tx_queue_clear(req_to_sk(req));
+	req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
 	req->saved_syn = NULL;
 	refcount_set(&req->rsk_refcnt, 0);
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 03a3625..3421809 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_rx_queue_mapping: rx queue number for this connection
+ *	@skc_rx_ifindex: rx ifindex for this connection
  *	@skc_flags: place holder for sk_flags
  *		%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *		%SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *	@skc_incoming_cpu: record/match cpu processing incoming packets
  *	@skc_refcnt: reference count
+ *	@skc_symmetric_queues: symmetric tx/rx queues
  *
  *	This is the minimal network layer representation of sockets, the header
  *	for struct sock and struct inet_timewait_sock.
@@ -177,6 +180,7 @@ struct sock_common {
 	unsigned char		skc_reuseport:1;
 	unsigned char		skc_ipv6only:1;
 	unsigned char		skc_net_refcnt:1;
+	unsigned char		skc_symmetric_queues:1;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
@@ -214,6 +218,8 @@ struct sock_common {
 		struct hlist_nulls_node skc_nulls_node;
 	};
 	int			skc_tx_queue_mapping;
+	int			skc_rx_queue_mapping;
+	int			skc_rx_ifindex;
 	union {
 		int		skc_incoming_cpu;
 		u32		skc_rcv_wnd;
@@ -324,6 +330,8 @@ struct sock {
 #define sk_nulls_node		__sk_common.skc_nulls_node
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping
+#define sk_rx_queue_mapping	__sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex		__sk_common.skc_rx_ifindex
 
 #define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end		__sk_common.skc_dontcopy_end
@@ -340,6 +348,7 @@ struct sock {
 #define sk_reuseport		__sk_common.skc_reuseport
 #define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
+#define sk_symmetric_queues	__sk_common.skc_symmetric_queues
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
@@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
 	return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+	if (sk->sk_symmetric_queues) {
+		sk->sk_rx_ifindex = skb->skb_iif;
+		sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+	}
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
 	sk_tx_queue_clear(sk);
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index e47c9e4..f6b416e 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -106,4 +106,6 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_SYMMETRIC_QUEUES	61
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 270b547..d96cda8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 
 	if (queue_index < 0 || skb->ooo_okay ||
 	    queue_index >= dev->real_num_tx_queues) {
-		int new_index = get_xps_queue(dev, skb);
+		int new_index = -1;
+
+		if (sk && sk->sk_symmetric_queues && dev->ifindex == sk->sk_rx_ifindex)
+			new_index = sk->sk_rx_queue_mapping;
+
+		if (new_index < 0 || new_index >= dev->real_num_tx_queues)
+			new_index = get_xps_queue(dev, skb);
 
 		if (new_index < 0)
 			new_index = skb_tx_hash(dev, skb);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9b7b6bb..3876cce 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
 		break;
 
+	case SO_SYMMETRIC_QUEUES:
+		sk->sk_symmetric_queues = valbool;
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
 		break;
 
+	case SO_SYMMETRIC_QUEUES:
+		v.val = sk->sk_symmetric_queues;
+		break;
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
@@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_max_pacing_rate = ~0U;
 	sk->sk_pacing_rate = ~0U;
 	sk->sk_incoming_cpu = -1;
+	sk->sk_rx_ifindex = -1;
+	sk->sk_rx_queue_mapping = -1;
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5d7656..12381e0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_openreq_init_rwin(req, sk, dst);
+	sk_mark_rx_queue(req_to_sk(req), skb);
 	if (!want_cookie) {
 		tcp_reqsk_record_syn(sk, req, skb);
 		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a63486a..82f9af4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_mark_rx_queue(sk, skb);
 		if (dst) {
 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 			    !dst->ops->check(dst, 0)) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 188a6f3..2b5efd5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 
 	/* record NAPI ID of child */
 	sk_mark_napi_id(child, skb);
+	sk_mark_rx_queue(child, skb);
 
 	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-08-31 23:27 [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue Sridhar Samudrala
@ 2017-09-10  1:28 ` Alexander Duyck
  2017-09-11 16:48   ` Samudrala, Sridhar
  2017-09-10  5:32 ` Tom Herbert
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 21+ messages in thread
From: Alexander Duyck @ 2017-09-10  1:28 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: Duyck, Alexander H, Netdev

On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
<sridhar.samudrala@intel.com> wrote:
> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
> to enable symmetric tx and rx queues on a socket.
>
> This option is specifically useful for epoll based multi threaded workloads
> where each thread handles packets received on a single RX queue . In this model,
> we have noticed that it helps to send the packets on the same TX queue
> corresponding to the queue-pair associated with the RX queue specifically when
> busy poll is enabled with epoll().
>
> Two new fields are added to struct sock_common to cache the last rx ifindex and
> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
> rx queue when this option is enabled and the TX is happening on the same device.
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> ---
>  include/net/request_sock.h        |  1 +
>  include/net/sock.h                | 17 +++++++++++++++++
>  include/uapi/asm-generic/socket.h |  2 ++
>  net/core/dev.c                    |  8 +++++++-
>  net/core/sock.c                   | 10 ++++++++++
>  net/ipv4/tcp_input.c              |  1 +
>  net/ipv4/tcp_ipv4.c               |  1 +
>  net/ipv4/tcp_minisocks.c          |  1 +
>  8 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> index 23e2205..c3bc12e 100644
> --- a/include/net/request_sock.h
> +++ b/include/net/request_sock.h
> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
>         req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>         sk_node_init(&req_to_sk(req)->sk_node);
>         sk_tx_queue_clear(req_to_sk(req));
> +       req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
>         req->saved_syn = NULL;
>         refcount_set(&req->rsk_refcnt, 0);
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 03a3625..3421809 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_rx_queue_mapping: rx queue number for this connection
> + *     @skc_rx_ifindex: rx ifindex for this connection
>   *     @skc_flags: place holder for sk_flags
>   *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>   *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>   *     @skc_incoming_cpu: record/match cpu processing incoming packets
>   *     @skc_refcnt: reference count
> + *     @skc_symmetric_queues: symmetric tx/rx queues
>   *
>   *     This is the minimal network layer representation of sockets, the header
>   *     for struct sock and struct inet_timewait_sock.
> @@ -177,6 +180,7 @@ struct sock_common {
>         unsigned char           skc_reuseport:1;
>         unsigned char           skc_ipv6only:1;
>         unsigned char           skc_net_refcnt:1;
> +       unsigned char           skc_symmetric_queues:1;
>         int                     skc_bound_dev_if;
>         union {
>                 struct hlist_node       skc_bind_node;
> @@ -214,6 +218,8 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +       int                     skc_rx_queue_mapping;
> +       int                     skc_rx_ifindex;
>         union {
>                 int             skc_incoming_cpu;
>                 u32             skc_rcv_wnd;
> @@ -324,6 +330,8 @@ struct sock {
>  #define sk_nulls_node          __sk_common.skc_nulls_node
>  #define sk_refcnt              __sk_common.skc_refcnt
>  #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
>
>  #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>  #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
> @@ -340,6 +348,7 @@ struct sock {
>  #define sk_reuseport           __sk_common.skc_reuseport
>  #define sk_ipv6only            __sk_common.skc_ipv6only
>  #define sk_net_refcnt          __sk_common.skc_net_refcnt
> +#define sk_symmetric_queues    __sk_common.skc_symmetric_queues
>  #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
>  #define sk_bind_node           __sk_common.skc_bind_node
>  #define sk_prot                        __sk_common.skc_prot
> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
>         return sk ? sk->sk_tx_queue_mapping : -1;
>  }
>
> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
> +{
> +       if (sk->sk_symmetric_queues) {
> +               sk->sk_rx_ifindex = skb->skb_iif;
> +               sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
> +       }
> +}
> +
>  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>  {
>         sk_tx_queue_clear(sk);
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index e47c9e4..f6b416e 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -106,4 +106,6 @@
>
>  #define SO_ZEROCOPY            60
>
> +#define SO_SYMMETRIC_QUEUES    61
> +
>  #endif /* __ASM_GENERIC_SOCKET_H */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 270b547..d96cda8 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
>
>         if (queue_index < 0 || skb->ooo_okay ||
>             queue_index >= dev->real_num_tx_queues) {
> -               int new_index = get_xps_queue(dev, skb);
> +               int new_index = -1;
> +
> +               if (sk && sk->sk_symmetric_queues && dev->ifindex == sk->sk_rx_ifindex)
> +                       new_index = sk->sk_rx_queue_mapping;
> +
> +               if (new_index < 0 || new_index >= dev->real_num_tx_queues)
> +                       new_index = get_xps_queue(dev, skb);
>
>                 if (new_index < 0)
>                         new_index = skb_tx_hash(dev, skb);

So one thing I am not sure about is if we should be overriding XPS. It
might make sense to instead place this after XPS so that if the root
user configures it then it applies, otherwise if the socket is
requesting symmetric queues you could fall back to that, and then
finally just use hashing as the final solution for distributing the
workload.

That way if somebody decides to reserve queues for some sort of
specific traffic like AF_PACKET then they can configure the Tx via
XPS, configure the Rx via RSS redirection table reprogramming, and
then setup a filters on the hardware to direct the traffic they want
to the queues that are running AF_PACKET.

> diff --git a/net/core/sock.c b/net/core/sock.c
> index 9b7b6bb..3876cce 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
>                         sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
>                 break;
>
> +       case SO_SYMMETRIC_QUEUES:
> +               sk->sk_symmetric_queues = valbool;
> +               break;
> +
>         default:
>                 ret = -ENOPROTOOPT;
>                 break;
> @@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
>                 break;
>
> +       case SO_SYMMETRIC_QUEUES:
> +               v.val = sk->sk_symmetric_queues;
> +               break;
> +
>         default:
>                 /* We implement the SO_SNDLOWAT etc to not be settable
>                  * (1003.1g 7).
> @@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>         sk->sk_max_pacing_rate = ~0U;
>         sk->sk_pacing_rate = ~0U;
>         sk->sk_incoming_cpu = -1;
> +       sk->sk_rx_ifindex = -1;
> +       sk->sk_rx_queue_mapping = -1;
>         /*
>          * Before updating sk_refcnt, we must commit prior changes to memory
>          * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c5d7656..12381e0 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>         tcp_rsk(req)->snt_isn = isn;
>         tcp_rsk(req)->txhash = net_tx_rndhash();
>         tcp_openreq_init_rwin(req, sk, dst);
> +       sk_mark_rx_queue(req_to_sk(req), skb);
>         if (!want_cookie) {
>                 tcp_reqsk_record_syn(sk, req, skb);
>                 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index a63486a..82f9af4 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
>
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_mark_rx_queue(sk, skb);
>                 if (dst) {
>                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
>                             !dst->ops->check(dst, 0)) {
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 188a6f3..2b5efd5 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
>
>         /* record NAPI ID of child */
>         sk_mark_napi_id(child, skb);
> +       sk_mark_rx_queue(child, skb);
>
>         tcp_segs_in(tcp_sk(child), skb);
>         if (!sock_owned_by_user(child)) {
> --
> 1.8.3.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-08-31 23:27 [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue Sridhar Samudrala
  2017-09-10  1:28 ` Alexander Duyck
@ 2017-09-10  5:32 ` Tom Herbert
  2017-09-10 15:19 ` Tom Herbert
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 21+ messages in thread
From: Tom Herbert @ 2017-09-10  5:32 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: Alexander Duyck, Linux Kernel Network Developers

On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
<sridhar.samudrala@intel.com> wrote:
> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
> to enable symmetric tx and rx queues on a socket.
>
> This option is specifically useful for epoll based multi threaded workloads
> where each thread handles packets received on a single RX queue . In this model,
> we have noticed that it helps to send the packets on the same TX queue
> corresponding to the queue-pair associated with the RX queue specifically when
> busy poll is enabled with epoll().
>
Please provide more details, test results on exactly how this helps.
Why would this better than than optimized XPS?

Thanks,
Tom

> Two new fields are added to struct sock_common to cache the last rx ifindex and
> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
> rx queue when this option is enabled and the TX is happening on the same device.
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> ---
>  include/net/request_sock.h        |  1 +
>  include/net/sock.h                | 17 +++++++++++++++++
>  include/uapi/asm-generic/socket.h |  2 ++
>  net/core/dev.c                    |  8 +++++++-
>  net/core/sock.c                   | 10 ++++++++++
>  net/ipv4/tcp_input.c              |  1 +
>  net/ipv4/tcp_ipv4.c               |  1 +
>  net/ipv4/tcp_minisocks.c          |  1 +
>  8 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> index 23e2205..c3bc12e 100644
> --- a/include/net/request_sock.h
> +++ b/include/net/request_sock.h
> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
>         req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>         sk_node_init(&req_to_sk(req)->sk_node);
>         sk_tx_queue_clear(req_to_sk(req));
> +       req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
>         req->saved_syn = NULL;
>         refcount_set(&req->rsk_refcnt, 0);
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 03a3625..3421809 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_rx_queue_mapping: rx queue number for this connection
> + *     @skc_rx_ifindex: rx ifindex for this connection
>   *     @skc_flags: place holder for sk_flags
>   *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>   *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>   *     @skc_incoming_cpu: record/match cpu processing incoming packets
>   *     @skc_refcnt: reference count
> + *     @skc_symmetric_queues: symmetric tx/rx queues
>   *
>   *     This is the minimal network layer representation of sockets, the header
>   *     for struct sock and struct inet_timewait_sock.
> @@ -177,6 +180,7 @@ struct sock_common {
>         unsigned char           skc_reuseport:1;
>         unsigned char           skc_ipv6only:1;
>         unsigned char           skc_net_refcnt:1;
> +       unsigned char           skc_symmetric_queues:1;
>         int                     skc_bound_dev_if;
>         union {
>                 struct hlist_node       skc_bind_node;
> @@ -214,6 +218,8 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +       int                     skc_rx_queue_mapping;
> +       int                     skc_rx_ifindex;
>         union {
>                 int             skc_incoming_cpu;
>                 u32             skc_rcv_wnd;
> @@ -324,6 +330,8 @@ struct sock {
>  #define sk_nulls_node          __sk_common.skc_nulls_node
>  #define sk_refcnt              __sk_common.skc_refcnt
>  #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
>
>  #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>  #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
> @@ -340,6 +348,7 @@ struct sock {
>  #define sk_reuseport           __sk_common.skc_reuseport
>  #define sk_ipv6only            __sk_common.skc_ipv6only
>  #define sk_net_refcnt          __sk_common.skc_net_refcnt
> +#define sk_symmetric_queues    __sk_common.skc_symmetric_queues
>  #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
>  #define sk_bind_node           __sk_common.skc_bind_node
>  #define sk_prot                        __sk_common.skc_prot
> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
>         return sk ? sk->sk_tx_queue_mapping : -1;
>  }
>
> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
> +{
> +       if (sk->sk_symmetric_queues) {
> +               sk->sk_rx_ifindex = skb->skb_iif;
> +               sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
> +       }
> +}
> +
>  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>  {
>         sk_tx_queue_clear(sk);
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index e47c9e4..f6b416e 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -106,4 +106,6 @@
>
>  #define SO_ZEROCOPY            60
>
> +#define SO_SYMMETRIC_QUEUES    61
> +
>  #endif /* __ASM_GENERIC_SOCKET_H */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 270b547..d96cda8 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
>
>         if (queue_index < 0 || skb->ooo_okay ||
>             queue_index >= dev->real_num_tx_queues) {
> -               int new_index = get_xps_queue(dev, skb);
> +               int new_index = -1;
> +
> +               if (sk && sk->sk_symmetric_queues && dev->ifindex == sk->sk_rx_ifindex)
> +                       new_index = sk->sk_rx_queue_mapping;
> +
> +               if (new_index < 0 || new_index >= dev->real_num_tx_queues)
> +                       new_index = get_xps_queue(dev, skb);
>
>                 if (new_index < 0)
>                         new_index = skb_tx_hash(dev, skb);
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 9b7b6bb..3876cce 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
>                         sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
>                 break;
>
> +       case SO_SYMMETRIC_QUEUES:
> +               sk->sk_symmetric_queues = valbool;
> +               break;
> +
>         default:
>                 ret = -ENOPROTOOPT;
>                 break;
> @@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
>                 break;
>
> +       case SO_SYMMETRIC_QUEUES:
> +               v.val = sk->sk_symmetric_queues;
> +               break;
> +
>         default:
>                 /* We implement the SO_SNDLOWAT etc to not be settable
>                  * (1003.1g 7).
> @@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>         sk->sk_max_pacing_rate = ~0U;
>         sk->sk_pacing_rate = ~0U;
>         sk->sk_incoming_cpu = -1;
> +       sk->sk_rx_ifindex = -1;
> +       sk->sk_rx_queue_mapping = -1;
>         /*
>          * Before updating sk_refcnt, we must commit prior changes to memory
>          * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c5d7656..12381e0 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>         tcp_rsk(req)->snt_isn = isn;
>         tcp_rsk(req)->txhash = net_tx_rndhash();
>         tcp_openreq_init_rwin(req, sk, dst);
> +       sk_mark_rx_queue(req_to_sk(req), skb);
>         if (!want_cookie) {
>                 tcp_reqsk_record_syn(sk, req, skb);
>                 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index a63486a..82f9af4 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
>
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_mark_rx_queue(sk, skb);
>                 if (dst) {
>                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
>                             !dst->ops->check(dst, 0)) {
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 188a6f3..2b5efd5 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
>
>         /* record NAPI ID of child */
>         sk_mark_napi_id(child, skb);
> +       sk_mark_rx_queue(child, skb);
>
>         tcp_segs_in(tcp_sk(child), skb);
>         if (!sock_owned_by_user(child)) {
> --
> 1.8.3.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-08-31 23:27 [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue Sridhar Samudrala
  2017-09-10  1:28 ` Alexander Duyck
  2017-09-10  5:32 ` Tom Herbert
@ 2017-09-10 15:19 ` Tom Herbert
  2017-09-11 16:49   ` Samudrala, Sridhar
  2017-09-12  3:12 ` Tom Herbert
  2017-09-20 15:30 ` Hannes Frederic Sowa
  4 siblings, 1 reply; 21+ messages in thread
From: Tom Herbert @ 2017-09-10 15:19 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: Alexander Duyck, Linux Kernel Network Developers

On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
<sridhar.samudrala@intel.com> wrote:
> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
> to enable symmetric tx and rx queues on a socket.
>
> This option is specifically useful for epoll based multi threaded workloads
> where each thread handles packets received on a single RX queue . In this model,
> we have noticed that it helps to send the packets on the same TX queue
> corresponding to the queue-pair associated with the RX queue specifically when
> busy poll is enabled with epoll().
>
> Two new fields are added to struct sock_common to cache the last rx ifindex and
> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
> rx queue when this option is enabled and the TX is happening on the same device.
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> ---
>  include/net/request_sock.h        |  1 +
>  include/net/sock.h                | 17 +++++++++++++++++
>  include/uapi/asm-generic/socket.h |  2 ++
>  net/core/dev.c                    |  8 +++++++-
>  net/core/sock.c                   | 10 ++++++++++
>  net/ipv4/tcp_input.c              |  1 +
>  net/ipv4/tcp_ipv4.c               |  1 +
>  net/ipv4/tcp_minisocks.c          |  1 +
>  8 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> index 23e2205..c3bc12e 100644
> --- a/include/net/request_sock.h
> +++ b/include/net/request_sock.h
> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
>         req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>         sk_node_init(&req_to_sk(req)->sk_node);
>         sk_tx_queue_clear(req_to_sk(req));
> +       req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
>         req->saved_syn = NULL;
>         refcount_set(&req->rsk_refcnt, 0);
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 03a3625..3421809 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_rx_queue_mapping: rx queue number for this connection
> + *     @skc_rx_ifindex: rx ifindex for this connection
>   *     @skc_flags: place holder for sk_flags
>   *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>   *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>   *     @skc_incoming_cpu: record/match cpu processing incoming packets
>   *     @skc_refcnt: reference count
> + *     @skc_symmetric_queues: symmetric tx/rx queues
>   *
>   *     This is the minimal network layer representation of sockets, the header
>   *     for struct sock and struct inet_timewait_sock.
> @@ -177,6 +180,7 @@ struct sock_common {
>         unsigned char           skc_reuseport:1;
>         unsigned char           skc_ipv6only:1;
>         unsigned char           skc_net_refcnt:1;
> +       unsigned char           skc_symmetric_queues:1;
>         int                     skc_bound_dev_if;
>         union {
>                 struct hlist_node       skc_bind_node;
> @@ -214,6 +218,8 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +       int                     skc_rx_queue_mapping;
> +       int                     skc_rx_ifindex;
>         union {
>                 int             skc_incoming_cpu;
>                 u32             skc_rcv_wnd;
> @@ -324,6 +330,8 @@ struct sock {
>  #define sk_nulls_node          __sk_common.skc_nulls_node
>  #define sk_refcnt              __sk_common.skc_refcnt
>  #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
>
>  #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>  #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
> @@ -340,6 +348,7 @@ struct sock {
>  #define sk_reuseport           __sk_common.skc_reuseport
>  #define sk_ipv6only            __sk_common.skc_ipv6only
>  #define sk_net_refcnt          __sk_common.skc_net_refcnt
> +#define sk_symmetric_queues    __sk_common.skc_symmetric_queues
>  #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
>  #define sk_bind_node           __sk_common.skc_bind_node
>  #define sk_prot                        __sk_common.skc_prot
> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
>         return sk ? sk->sk_tx_queue_mapping : -1;
>  }
>
> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
> +{
> +       if (sk->sk_symmetric_queues) {
> +               sk->sk_rx_ifindex = skb->skb_iif;
> +               sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
> +       }
> +}
> +
>  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>  {
>         sk_tx_queue_clear(sk);
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index e47c9e4..f6b416e 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -106,4 +106,6 @@
>
>  #define SO_ZEROCOPY            60
>
> +#define SO_SYMMETRIC_QUEUES    61
> +
>  #endif /* __ASM_GENERIC_SOCKET_H */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 270b547..d96cda8 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
>
>         if (queue_index < 0 || skb->ooo_okay ||
>             queue_index >= dev->real_num_tx_queues) {
> -               int new_index = get_xps_queue(dev, skb);
> +               int new_index = -1;
> +
> +               if (sk && sk->sk_symmetric_queues && dev->ifindex == sk->sk_rx_ifindex)
> +                       new_index = sk->sk_rx_queue_mapping;
> +
> +               if (new_index < 0 || new_index >= dev->real_num_tx_queues)
> +                       new_index = get_xps_queue(dev, skb);

This enforces that notion of queue pairs which is not universal
concept to NICs. There are many devices and instances where we
purposely avoid having a 1-1 relationship between rx and tx queues.
An alternative might be to create a rx queue to tx queue map, add the
rx queue argument to get_xps_queue, and then that function can
consider the mapping. The administrator can configure the mapping as
appropriate and can select which rx queues are subject to the mapping.

>
>                 if (new_index < 0)
>                         new_index = skb_tx_hash(dev, skb);
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 9b7b6bb..3876cce 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
>                         sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
>                 break;
>
> +       case SO_SYMMETRIC_QUEUES:
> +               sk->sk_symmetric_queues = valbool;
> +               break;
> +
Allowing users control over this seems problematic to me. The intent
of packet steering is to provide good loading across the whole
systems, not just for individual applications. Exposing this control
makes that mission harder.

>         default:
>                 ret = -ENOPROTOOPT;
>                 break;
> @@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
>                 break;
>
> +       case SO_SYMMETRIC_QUEUES:
> +               v.val = sk->sk_symmetric_queues;
> +               break;
> +
>         default:
>                 /* We implement the SO_SNDLOWAT etc to not be settable
>                  * (1003.1g 7).
> @@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>         sk->sk_max_pacing_rate = ~0U;
>         sk->sk_pacing_rate = ~0U;
>         sk->sk_incoming_cpu = -1;
> +       sk->sk_rx_ifindex = -1;
> +       sk->sk_rx_queue_mapping = -1;
>         /*
>          * Before updating sk_refcnt, we must commit prior changes to memory
>          * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c5d7656..12381e0 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>         tcp_rsk(req)->snt_isn = isn;
>         tcp_rsk(req)->txhash = net_tx_rndhash();
>         tcp_openreq_init_rwin(req, sk, dst);
> +       sk_mark_rx_queue(req_to_sk(req), skb);
>         if (!want_cookie) {
>                 tcp_reqsk_record_syn(sk, req, skb);
>                 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index a63486a..82f9af4 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
>
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_mark_rx_queue(sk, skb);

This could be part of sock_rps_save_rxhash instead of new functions in
core receive path. That could be renamed sock_save_rx_info or
something like that. UDP support is also lacking with this patch so
that gets solved by a common function also.

>                 if (dst) {
>                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
>                             !dst->ops->check(dst, 0)) {
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 188a6f3..2b5efd5 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
>
>         /* record NAPI ID of child */
>         sk_mark_napi_id(child, skb);
> +       sk_mark_rx_queue(child, skb);
>
>         tcp_segs_in(tcp_sk(child), skb);
>         if (!sock_owned_by_user(child)) {
> --
> 1.8.3.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-10  1:28 ` Alexander Duyck
@ 2017-09-11 16:48   ` Samudrala, Sridhar
  2017-09-11 17:48     ` Alexander Duyck
  0 siblings, 1 reply; 21+ messages in thread
From: Samudrala, Sridhar @ 2017-09-11 16:48 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Duyck, Alexander H, Netdev



On 9/9/2017 6:28 PM, Alexander Duyck wrote:
> On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
> <sridhar.samudrala@intel.com> wrote:
>> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
>> to enable symmetric tx and rx queues on a socket.
>>
>> This option is specifically useful for epoll based multi threaded workloads
>> where each thread handles packets received on a single RX queue . In this model,
>> we have noticed that it helps to send the packets on the same TX queue
>> corresponding to the queue-pair associated with the RX queue specifically when
>> busy poll is enabled with epoll().
>>
>> Two new fields are added to struct sock_common to cache the last rx ifindex and
>> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
>> rx queue when this option is enabled and the TX is happening on the same device.
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> ---
>>   include/net/request_sock.h        |  1 +
>>   include/net/sock.h                | 17 +++++++++++++++++
>>   include/uapi/asm-generic/socket.h |  2 ++
>>   net/core/dev.c                    |  8 +++++++-
>>   net/core/sock.c                   | 10 ++++++++++
>>   net/ipv4/tcp_input.c              |  1 +
>>   net/ipv4/tcp_ipv4.c               |  1 +
>>   net/ipv4/tcp_minisocks.c          |  1 +
>>   8 files changed, 40 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
>> index 23e2205..c3bc12e 100644
>> --- a/include/net/request_sock.h
>> +++ b/include/net/request_sock.h
>> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
>>          req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>>          sk_node_init(&req_to_sk(req)->sk_node);
>>          sk_tx_queue_clear(req_to_sk(req));
>> +       req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
>>          req->saved_syn = NULL;
>>          refcount_set(&req->rsk_refcnt, 0);
>>
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 03a3625..3421809 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
>>    *     @skc_node: main hash linkage for various protocol lookup tables
>>    *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>    *     @skc_tx_queue_mapping: tx queue number for this connection
>> + *     @skc_rx_queue_mapping: rx queue number for this connection
>> + *     @skc_rx_ifindex: rx ifindex for this connection
>>    *     @skc_flags: place holder for sk_flags
>>    *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>    *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>>    *     @skc_incoming_cpu: record/match cpu processing incoming packets
>>    *     @skc_refcnt: reference count
>> + *     @skc_symmetric_queues: symmetric tx/rx queues
>>    *
>>    *     This is the minimal network layer representation of sockets, the header
>>    *     for struct sock and struct inet_timewait_sock.
>> @@ -177,6 +180,7 @@ struct sock_common {
>>          unsigned char           skc_reuseport:1;
>>          unsigned char           skc_ipv6only:1;
>>          unsigned char           skc_net_refcnt:1;
>> +       unsigned char           skc_symmetric_queues:1;
>>          int                     skc_bound_dev_if;
>>          union {
>>                  struct hlist_node       skc_bind_node;
>> @@ -214,6 +218,8 @@ struct sock_common {
>>                  struct hlist_nulls_node skc_nulls_node;
>>          };
>>          int                     skc_tx_queue_mapping;
>> +       int                     skc_rx_queue_mapping;
>> +       int                     skc_rx_ifindex;
>>          union {
>>                  int             skc_incoming_cpu;
>>                  u32             skc_rcv_wnd;
>> @@ -324,6 +330,8 @@ struct sock {
>>   #define sk_nulls_node          __sk_common.skc_nulls_node
>>   #define sk_refcnt              __sk_common.skc_refcnt
>>   #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
>> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
>> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
>>
>>   #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>>   #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
>> @@ -340,6 +348,7 @@ struct sock {
>>   #define sk_reuseport           __sk_common.skc_reuseport
>>   #define sk_ipv6only            __sk_common.skc_ipv6only
>>   #define sk_net_refcnt          __sk_common.skc_net_refcnt
>> +#define sk_symmetric_queues    __sk_common.skc_symmetric_queues
>>   #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
>>   #define sk_bind_node           __sk_common.skc_bind_node
>>   #define sk_prot                        __sk_common.skc_prot
>> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
>>          return sk ? sk->sk_tx_queue_mapping : -1;
>>   }
>>
>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
>> +{
>> +       if (sk->sk_symmetric_queues) {
>> +               sk->sk_rx_ifindex = skb->skb_iif;
>> +               sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>> +       }
>> +}
>> +
>>   static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>   {
>>          sk_tx_queue_clear(sk);
>> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
>> index e47c9e4..f6b416e 100644
>> --- a/include/uapi/asm-generic/socket.h
>> +++ b/include/uapi/asm-generic/socket.h
>> @@ -106,4 +106,6 @@
>>
>>   #define SO_ZEROCOPY            60
>>
>> +#define SO_SYMMETRIC_QUEUES    61
>> +
>>   #endif /* __ASM_GENERIC_SOCKET_H */
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 270b547..d96cda8 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
>>
>>          if (queue_index < 0 || skb->ooo_okay ||
>>              queue_index >= dev->real_num_tx_queues) {
>> -               int new_index = get_xps_queue(dev, skb);
>> +               int new_index = -1;
>> +
>> +               if (sk && sk->sk_symmetric_queues && dev->ifindex == sk->sk_rx_ifindex)
>> +                       new_index = sk->sk_rx_queue_mapping;
>> +
>> +               if (new_index < 0 || new_index >= dev->real_num_tx_queues)
>> +                       new_index = get_xps_queue(dev, skb);
>>
>>                  if (new_index < 0)
>>                          new_index = skb_tx_hash(dev, skb);
> So one thing I am not sure about is if we should be overriding XPS. It
> might make sense to instead place this after XPS so that if the root
> user configures it then it applies, otherwise if the socket is
> requesting symmetric queues you could fall back to that, and then
> finally just use hashing as the final solution for distributing the
> workload.
Isn't XPS on by default and all the devices that support XPS setup XPS 
maps as part of
the initialization?
Are you suggesting that the root user needs to disable XPS on the 
specific queues before an
application can use this option to enable symmetric queues?


>
> That way if somebody decides to reserve queues for some sort of
> specific traffic like AF_PACKET then they can configure the Tx via
> XPS, configure the Rx via RSS redirection table reprogramming, and
> then setup a filters on the hardware to direct the traffic they want
> to the queues that are running AF_PACKET.
>
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 9b7b6bb..3876cce 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
>>                          sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
>>                  break;
>>
>> +       case SO_SYMMETRIC_QUEUES:
>> +               sk->sk_symmetric_queues = valbool;
>> +               break;
>> +
>>          default:
>>                  ret = -ENOPROTOOPT;
>>                  break;
>> @@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>>                  v.val = sock_flag(sk, SOCK_ZEROCOPY);
>>                  break;
>>
>> +       case SO_SYMMETRIC_QUEUES:
>> +               v.val = sk->sk_symmetric_queues;
>> +               break;
>> +
>>          default:
>>                  /* We implement the SO_SNDLOWAT etc to not be settable
>>                   * (1003.1g 7).
>> @@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>>          sk->sk_max_pacing_rate = ~0U;
>>          sk->sk_pacing_rate = ~0U;
>>          sk->sk_incoming_cpu = -1;
>> +       sk->sk_rx_ifindex = -1;
>> +       sk->sk_rx_queue_mapping = -1;
>>          /*
>>           * Before updating sk_refcnt, we must commit prior changes to memory
>>           * (Documentation/RCU/rculist_nulls.txt for details)
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index c5d7656..12381e0 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>>          tcp_rsk(req)->snt_isn = isn;
>>          tcp_rsk(req)->txhash = net_tx_rndhash();
>>          tcp_openreq_init_rwin(req, sk, dst);
>> +       sk_mark_rx_queue(req_to_sk(req), skb);
>>          if (!want_cookie) {
>>                  tcp_reqsk_record_syn(sk, req, skb);
>>                  fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index a63486a..82f9af4 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
>>
>>                  sock_rps_save_rxhash(sk, skb);
>>                  sk_mark_napi_id(sk, skb);
>> +               sk_mark_rx_queue(sk, skb);
>>                  if (dst) {
>>                          if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
>>                              !dst->ops->check(dst, 0)) {
>> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
>> index 188a6f3..2b5efd5 100644
>> --- a/net/ipv4/tcp_minisocks.c
>> +++ b/net/ipv4/tcp_minisocks.c
>> @@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
>>
>>          /* record NAPI ID of child */
>>          sk_mark_napi_id(child, skb);
>> +       sk_mark_rx_queue(child, skb);
>>
>>          tcp_segs_in(tcp_sk(child), skb);
>>          if (!sock_owned_by_user(child)) {
>> --
>> 1.8.3.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-10 15:19 ` Tom Herbert
@ 2017-09-11 16:49   ` Samudrala, Sridhar
  2017-09-11 22:07     ` Tom Herbert
  0 siblings, 1 reply; 21+ messages in thread
From: Samudrala, Sridhar @ 2017-09-11 16:49 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Alexander Duyck, Linux Kernel Network Developers

On 9/10/2017 8:19 AM, Tom Herbert wrote:
> On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
> <sridhar.samudrala@intel.com> wrote:
>> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
>> to enable symmetric tx and rx queues on a socket.
>>
>> This option is specifically useful for epoll based multi threaded workloads
>> where each thread handles packets received on a single RX queue . In this model,
>> we have noticed that it helps to send the packets on the same TX queue
>> corresponding to the queue-pair associated with the RX queue specifically when
>> busy poll is enabled with epoll().
>>
>> Two new fields are added to struct sock_common to cache the last rx ifindex and
>> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
>> rx queue when this option is enabled and the TX is happening on the same device.
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> ---
>>   include/net/request_sock.h        |  1 +
>>   include/net/sock.h                | 17 +++++++++++++++++
>>   include/uapi/asm-generic/socket.h |  2 ++
>>   net/core/dev.c                    |  8 +++++++-
>>   net/core/sock.c                   | 10 ++++++++++
>>   net/ipv4/tcp_input.c              |  1 +
>>   net/ipv4/tcp_ipv4.c               |  1 +
>>   net/ipv4/tcp_minisocks.c          |  1 +
>>   8 files changed, 40 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
>> index 23e2205..c3bc12e 100644
>> --- a/include/net/request_sock.h
>> +++ b/include/net/request_sock.h
>> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
>>          req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>>          sk_node_init(&req_to_sk(req)->sk_node);
>>          sk_tx_queue_clear(req_to_sk(req));
>> +       req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
>>          req->saved_syn = NULL;
>>          refcount_set(&req->rsk_refcnt, 0);
>>
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 03a3625..3421809 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
>>    *     @skc_node: main hash linkage for various protocol lookup tables
>>    *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>    *     @skc_tx_queue_mapping: tx queue number for this connection
>> + *     @skc_rx_queue_mapping: rx queue number for this connection
>> + *     @skc_rx_ifindex: rx ifindex for this connection
>>    *     @skc_flags: place holder for sk_flags
>>    *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>    *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>>    *     @skc_incoming_cpu: record/match cpu processing incoming packets
>>    *     @skc_refcnt: reference count
>> + *     @skc_symmetric_queues: symmetric tx/rx queues
>>    *
>>    *     This is the minimal network layer representation of sockets, the header
>>    *     for struct sock and struct inet_timewait_sock.
>> @@ -177,6 +180,7 @@ struct sock_common {
>>          unsigned char           skc_reuseport:1;
>>          unsigned char           skc_ipv6only:1;
>>          unsigned char           skc_net_refcnt:1;
>> +       unsigned char           skc_symmetric_queues:1;
>>          int                     skc_bound_dev_if;
>>          union {
>>                  struct hlist_node       skc_bind_node;
>> @@ -214,6 +218,8 @@ struct sock_common {
>>                  struct hlist_nulls_node skc_nulls_node;
>>          };
>>          int                     skc_tx_queue_mapping;
>> +       int                     skc_rx_queue_mapping;
>> +       int                     skc_rx_ifindex;
>>          union {
>>                  int             skc_incoming_cpu;
>>                  u32             skc_rcv_wnd;
>> @@ -324,6 +330,8 @@ struct sock {
>>   #define sk_nulls_node          __sk_common.skc_nulls_node
>>   #define sk_refcnt              __sk_common.skc_refcnt
>>   #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
>> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
>> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
>>
>>   #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>>   #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
>> @@ -340,6 +348,7 @@ struct sock {
>>   #define sk_reuseport           __sk_common.skc_reuseport
>>   #define sk_ipv6only            __sk_common.skc_ipv6only
>>   #define sk_net_refcnt          __sk_common.skc_net_refcnt
>> +#define sk_symmetric_queues    __sk_common.skc_symmetric_queues
>>   #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
>>   #define sk_bind_node           __sk_common.skc_bind_node
>>   #define sk_prot                        __sk_common.skc_prot
>> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
>>          return sk ? sk->sk_tx_queue_mapping : -1;
>>   }
>>
>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
>> +{
>> +       if (sk->sk_symmetric_queues) {
>> +               sk->sk_rx_ifindex = skb->skb_iif;
>> +               sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>> +       }
>> +}
>> +
>>   static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>   {
>>          sk_tx_queue_clear(sk);
>> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
>> index e47c9e4..f6b416e 100644
>> --- a/include/uapi/asm-generic/socket.h
>> +++ b/include/uapi/asm-generic/socket.h
>> @@ -106,4 +106,6 @@
>>
>>   #define SO_ZEROCOPY            60
>>
>> +#define SO_SYMMETRIC_QUEUES    61
>> +
>>   #endif /* __ASM_GENERIC_SOCKET_H */
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 270b547..d96cda8 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
>>
>>          if (queue_index < 0 || skb->ooo_okay ||
>>              queue_index >= dev->real_num_tx_queues) {
>> -               int new_index = get_xps_queue(dev, skb);
>> +               int new_index = -1;
>> +
>> +               if (sk && sk->sk_symmetric_queues && dev->ifindex == sk->sk_rx_ifindex)
>> +                       new_index = sk->sk_rx_queue_mapping;
>> +
>> +               if (new_index < 0 || new_index >= dev->real_num_tx_queues)
>> +                       new_index = get_xps_queue(dev, skb);
> This enforces that notion of queue pairs which is not universal
> concept to NICs. There are many devices and instances where we
> purposely avoid having a 1-1 relationship between rx and tx queues.

Yes. This patch assumes that TX and RX queues come in pairs.

> An alternative might be to create a rx queue to tx queue map, add the
> rx queue argument to get_xps_queue, and then that function can
> consider the mapping. The administrator can configure the mapping as
> appropriate and can select which rx queues are subject to the mapping.
This alternative looks much cleaner and doesn't require the apps to 
configure the
queues. Do we need to support 1 to many rx to tx queue mappings?
For our symmetric queues usecase, where a single application thread is 
associated with
1 queue-pair,  1-1 mapping is sufficient.
Do you see any usecase where it is useful to support 1-many mappings?
I guess i can add a sysfs entry per rx-queue to setup a tx-queue OR  
tx-queue-map.


>
>>                  if (new_index < 0)
>>                          new_index = skb_tx_hash(dev, skb);
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 9b7b6bb..3876cce 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
>>                          sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
>>                  break;
>>
>> +       case SO_SYMMETRIC_QUEUES:
>> +               sk->sk_symmetric_queues = valbool;
>> +               break;
>> +
> Allowing users control over this seems problematic to me. The intent
> of packet steering is to provide good loading across the whole
> systems, not just for individual applications. Exposing this control
> makes that mission harder.

Sure. If we can do this on a per rxqueue basis that can be configured by 
the administrator,
it would be a better option.

>
>>          default:
>>                  ret = -ENOPROTOOPT;
>>                  break;
>> @@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>>                  v.val = sock_flag(sk, SOCK_ZEROCOPY);
>>                  break;
>>
>> +       case SO_SYMMETRIC_QUEUES:
>> +               v.val = sk->sk_symmetric_queues;
>> +               break;
>> +
>>          default:
>>                  /* We implement the SO_SNDLOWAT etc to not be settable
>>                   * (1003.1g 7).
>> @@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>>          sk->sk_max_pacing_rate = ~0U;
>>          sk->sk_pacing_rate = ~0U;
>>          sk->sk_incoming_cpu = -1;
>> +       sk->sk_rx_ifindex = -1;
>> +       sk->sk_rx_queue_mapping = -1;
>>          /*
>>           * Before updating sk_refcnt, we must commit prior changes to memory
>>           * (Documentation/RCU/rculist_nulls.txt for details)
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index c5d7656..12381e0 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>>          tcp_rsk(req)->snt_isn = isn;
>>          tcp_rsk(req)->txhash = net_tx_rndhash();
>>          tcp_openreq_init_rwin(req, sk, dst);
>> +       sk_mark_rx_queue(req_to_sk(req), skb);
>>          if (!want_cookie) {
>>                  tcp_reqsk_record_syn(sk, req, skb);
>>                  fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index a63486a..82f9af4 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
>>
>>                  sock_rps_save_rxhash(sk, skb);
>>                  sk_mark_napi_id(sk, skb);
>> +               sk_mark_rx_queue(sk, skb);
> This could be part of sock_rps_save_rxhash instead of new functions in
> core receive path. That could be renamed sock_save_rx_info or
> something like that. UDP support is also lacking with this patch so
> that gets solved by a common function also.

Sure. Will look into re factoring sock_rps_save_rxhash() to save the rx 
queue info.

>
>>                  if (dst) {
>>                          if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
>>                              !dst->ops->check(dst, 0)) {
>> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
>> index 188a6f3..2b5efd5 100644
>> --- a/net/ipv4/tcp_minisocks.c
>> +++ b/net/ipv4/tcp_minisocks.c
>> @@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
>>
>>          /* record NAPI ID of child */
>>          sk_mark_napi_id(child, skb);
>> +       sk_mark_rx_queue(child, skb);
>>
>>          tcp_segs_in(tcp_sk(child), skb);
>>          if (!sock_owned_by_user(child)) {
>> --
>> 1.8.3.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-11 16:48   ` Samudrala, Sridhar
@ 2017-09-11 17:48     ` Alexander Duyck
  0 siblings, 0 replies; 21+ messages in thread
From: Alexander Duyck @ 2017-09-11 17:48 UTC (permalink / raw)
  To: Samudrala, Sridhar; +Cc: Duyck, Alexander H, Netdev

On Mon, Sep 11, 2017 at 9:48 AM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
>
> On 9/9/2017 6:28 PM, Alexander Duyck wrote:
>>
>> On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
>> <sridhar.samudrala@intel.com> wrote:
>>>
>>> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be
>>> used
>>> to enable symmetric tx and rx queues on a socket.
>>>
>>> This option is specifically useful for epoll based multi threaded
>>> workloads
>>> where each thread handles packets received on a single RX queue . In this
>>> model,
>>> we have noticed that it helps to send the packets on the same TX queue
>>> corresponding to the queue-pair associated with the RX queue specifically
>>> when
>>> busy poll is enabled with epoll().
>>>
>>> Two new fields are added to struct sock_common to cache the last rx
>>> ifindex and
>>> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns
>>> the cached
>>> rx queue when this option is enabled and the TX is happening on the same
>>> device.
>>>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>> ---
>>>   include/net/request_sock.h        |  1 +
>>>   include/net/sock.h                | 17 +++++++++++++++++
>>>   include/uapi/asm-generic/socket.h |  2 ++
>>>   net/core/dev.c                    |  8 +++++++-
>>>   net/core/sock.c                   | 10 ++++++++++
>>>   net/ipv4/tcp_input.c              |  1 +
>>>   net/ipv4/tcp_ipv4.c               |  1 +
>>>   net/ipv4/tcp_minisocks.c          |  1 +
>>>   8 files changed, 40 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
>>> index 23e2205..c3bc12e 100644
>>> --- a/include/net/request_sock.h
>>> +++ b/include/net/request_sock.h
>>> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct
>>> request_sock *req)
>>>          req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>>>          sk_node_init(&req_to_sk(req)->sk_node);
>>>          sk_tx_queue_clear(req_to_sk(req));
>>> +       req_to_sk(req)->sk_symmetric_queues =
>>> sk_listener->sk_symmetric_queues;
>>>          req->saved_syn = NULL;
>>>          refcount_set(&req->rsk_refcnt, 0);
>>>
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index 03a3625..3421809 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char
>>> *msg, ...)
>>>    *     @skc_node: main hash linkage for various protocol lookup tables
>>>    *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>>    *     @skc_tx_queue_mapping: tx queue number for this connection
>>> + *     @skc_rx_queue_mapping: rx queue number for this connection
>>> + *     @skc_rx_ifindex: rx ifindex for this connection
>>>    *     @skc_flags: place holder for sk_flags
>>>    *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>>    *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>>>    *     @skc_incoming_cpu: record/match cpu processing incoming packets
>>>    *     @skc_refcnt: reference count
>>> + *     @skc_symmetric_queues: symmetric tx/rx queues
>>>    *
>>>    *     This is the minimal network layer representation of sockets, the
>>> header
>>>    *     for struct sock and struct inet_timewait_sock.
>>> @@ -177,6 +180,7 @@ struct sock_common {
>>>          unsigned char           skc_reuseport:1;
>>>          unsigned char           skc_ipv6only:1;
>>>          unsigned char           skc_net_refcnt:1;
>>> +       unsigned char           skc_symmetric_queues:1;
>>>          int                     skc_bound_dev_if;
>>>          union {
>>>                  struct hlist_node       skc_bind_node;
>>> @@ -214,6 +218,8 @@ struct sock_common {
>>>                  struct hlist_nulls_node skc_nulls_node;
>>>          };
>>>          int                     skc_tx_queue_mapping;
>>> +       int                     skc_rx_queue_mapping;
>>> +       int                     skc_rx_ifindex;
>>>          union {
>>>                  int             skc_incoming_cpu;
>>>                  u32             skc_rcv_wnd;
>>> @@ -324,6 +330,8 @@ struct sock {
>>>   #define sk_nulls_node          __sk_common.skc_nulls_node
>>>   #define sk_refcnt              __sk_common.skc_refcnt
>>>   #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
>>> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
>>> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
>>>
>>>   #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>>>   #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
>>> @@ -340,6 +348,7 @@ struct sock {
>>>   #define sk_reuseport           __sk_common.skc_reuseport
>>>   #define sk_ipv6only            __sk_common.skc_ipv6only
>>>   #define sk_net_refcnt          __sk_common.skc_net_refcnt
>>> +#define sk_symmetric_queues    __sk_common.skc_symmetric_queues
>>>   #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
>>>   #define sk_bind_node           __sk_common.skc_bind_node
>>>   #define sk_prot                        __sk_common.skc_prot
>>> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct
>>> sock *sk)
>>>          return sk ? sk->sk_tx_queue_mapping : -1;
>>>   }
>>>
>>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff
>>> *skb)
>>> +{
>>> +       if (sk->sk_symmetric_queues) {
>>> +               sk->sk_rx_ifindex = skb->skb_iif;
>>> +               sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>>> +       }
>>> +}
>>> +
>>>   static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>>   {
>>>          sk_tx_queue_clear(sk);
>>> diff --git a/include/uapi/asm-generic/socket.h
>>> b/include/uapi/asm-generic/socket.h
>>> index e47c9e4..f6b416e 100644
>>> --- a/include/uapi/asm-generic/socket.h
>>> +++ b/include/uapi/asm-generic/socket.h
>>> @@ -106,4 +106,6 @@
>>>
>>>   #define SO_ZEROCOPY            60
>>>
>>> +#define SO_SYMMETRIC_QUEUES    61
>>> +
>>>   #endif /* __ASM_GENERIC_SOCKET_H */
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index 270b547..d96cda8 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device
>>> *dev, struct sk_buff *skb)
>>>
>>>          if (queue_index < 0 || skb->ooo_okay ||
>>>              queue_index >= dev->real_num_tx_queues) {
>>> -               int new_index = get_xps_queue(dev, skb);
>>> +               int new_index = -1;
>>> +
>>> +               if (sk && sk->sk_symmetric_queues && dev->ifindex ==
>>> sk->sk_rx_ifindex)
>>> +                       new_index = sk->sk_rx_queue_mapping;
>>> +
>>> +               if (new_index < 0 || new_index >=
>>> dev->real_num_tx_queues)
>>> +                       new_index = get_xps_queue(dev, skb);
>>>
>>>                  if (new_index < 0)
>>>                          new_index = skb_tx_hash(dev, skb);
>>
>> So one thing I am not sure about is if we should be overriding XPS. It
>> might make sense to instead place this after XPS so that if the root
>> user configures it then it applies, otherwise if the socket is
>> requesting symmetric queues you could fall back to that, and then
>> finally just use hashing as the final solution for distributing the
>> workload.
>
> Isn't XPS on by default and all the devices that support XPS setup XPS maps
> as part of
> the initialization?
> Are you suggesting that the root user needs to disable XPS on the specific
> queues before an
> application can use this option to enable symmetric queues?
>

XPS and this symmetric queue logic won't really play well together
since they are both attempting to do the same thing but from different
ends. Some sort of priority needs to be defined, and I would place the
request of the kernel/root user above the request of a socket and/or
application.

I guess it comes down to if we let the Tx CPU pick the queue or Rx
queue, and I would argue with an XPS configuration in place the kernel
is requesting that the Tx CPU sets the Tx queue, not the Rx/Tx hash or
Rx queue. This is similar to how we handle this for routing today as
XPS will also override the Rx to Tx mapping used there.

>>
>> That way if somebody decides to reserve queues for some sort of
>> specific traffic like AF_PACKET then they can configure the Tx via
>> XPS, configure the Rx via RSS redirection table reprogramming, and
>> then setup a filters on the hardware to direct the traffic they want
>> to the queues that are running AF_PACKET.
>>
>>> diff --git a/net/core/sock.c b/net/core/sock.c
>>> index 9b7b6bb..3876cce 100644
>>> --- a/net/core/sock.c
>>> +++ b/net/core/sock.c
>>> @@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int
>>> level, int optname,
>>>                          sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
>>>                  break;
>>>
>>> +       case SO_SYMMETRIC_QUEUES:
>>> +               sk->sk_symmetric_queues = valbool;
>>> +               break;
>>> +
>>>          default:
>>>                  ret = -ENOPROTOOPT;
>>>                  break;
>>> @@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int
>>> level, int optname,
>>>                  v.val = sock_flag(sk, SOCK_ZEROCOPY);
>>>                  break;
>>>
>>> +       case SO_SYMMETRIC_QUEUES:
>>> +               v.val = sk->sk_symmetric_queues;
>>> +               break;
>>> +
>>>          default:
>>>                  /* We implement the SO_SNDLOWAT etc to not be settable
>>>                   * (1003.1g 7).
>>> @@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct
>>> sock *sk)
>>>          sk->sk_max_pacing_rate = ~0U;
>>>          sk->sk_pacing_rate = ~0U;
>>>          sk->sk_incoming_cpu = -1;
>>> +       sk->sk_rx_ifindex = -1;
>>> +       sk->sk_rx_queue_mapping = -1;
>>>          /*
>>>           * Before updating sk_refcnt, we must commit prior changes to
>>> memory
>>>           * (Documentation/RCU/rculist_nulls.txt for details)
>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>> index c5d7656..12381e0 100644
>>> --- a/net/ipv4/tcp_input.c
>>> +++ b/net/ipv4/tcp_input.c
>>> @@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops
>>> *rsk_ops,
>>>          tcp_rsk(req)->snt_isn = isn;
>>>          tcp_rsk(req)->txhash = net_tx_rndhash();
>>>          tcp_openreq_init_rwin(req, sk, dst);
>>> +       sk_mark_rx_queue(req_to_sk(req), skb);
>>>          if (!want_cookie) {
>>>                  tcp_reqsk_record_syn(sk, req, skb);
>>>                  fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>> index a63486a..82f9af4 100644
>>> --- a/net/ipv4/tcp_ipv4.c
>>> +++ b/net/ipv4/tcp_ipv4.c
>>> @@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff
>>> *skb)
>>>
>>>                  sock_rps_save_rxhash(sk, skb);
>>>                  sk_mark_napi_id(sk, skb);
>>> +               sk_mark_rx_queue(sk, skb);
>>>                  if (dst) {
>>>                          if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif
>>> ||
>>>                              !dst->ops->check(dst, 0)) {
>>> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
>>> index 188a6f3..2b5efd5 100644
>>> --- a/net/ipv4/tcp_minisocks.c
>>> +++ b/net/ipv4/tcp_minisocks.c
>>> @@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct
>>> sock *child,
>>>
>>>          /* record NAPI ID of child */
>>>          sk_mark_napi_id(child, skb);
>>> +       sk_mark_rx_queue(child, skb);
>>>
>>>          tcp_segs_in(tcp_sk(child), skb);
>>>          if (!sock_owned_by_user(child)) {
>>> --
>>> 1.8.3.1
>>>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-11 16:49   ` Samudrala, Sridhar
@ 2017-09-11 22:07     ` Tom Herbert
  2017-09-11 22:50       ` Alexander Duyck
  0 siblings, 1 reply; 21+ messages in thread
From: Tom Herbert @ 2017-09-11 22:07 UTC (permalink / raw)
  To: Samudrala, Sridhar; +Cc: Alexander Duyck, Linux Kernel Network Developers

On Mon, Sep 11, 2017 at 9:49 AM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
> On 9/10/2017 8:19 AM, Tom Herbert wrote:
>>
>> On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
>> <sridhar.samudrala@intel.com> wrote:
>>>
>>> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be
>>> used
>>> to enable symmetric tx and rx queues on a socket.
>>>
>>> This option is specifically useful for epoll based multi threaded
>>> workloads
>>> where each thread handles packets received on a single RX queue . In this
>>> model,
>>> we have noticed that it helps to send the packets on the same TX queue
>>> corresponding to the queue-pair associated with the RX queue specifically
>>> when
>>> busy poll is enabled with epoll().
>>>
>>> Two new fields are added to struct sock_common to cache the last rx
>>> ifindex and
>>> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns
>>> the cached
>>> rx queue when this option is enabled and the TX is happening on the same
>>> device.
>>>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>> ---
>>>   include/net/request_sock.h        |  1 +
>>>   include/net/sock.h                | 17 +++++++++++++++++
>>>   include/uapi/asm-generic/socket.h |  2 ++
>>>   net/core/dev.c                    |  8 +++++++-
>>>   net/core/sock.c                   | 10 ++++++++++
>>>   net/ipv4/tcp_input.c              |  1 +
>>>   net/ipv4/tcp_ipv4.c               |  1 +
>>>   net/ipv4/tcp_minisocks.c          |  1 +
>>>   8 files changed, 40 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
>>> index 23e2205..c3bc12e 100644
>>> --- a/include/net/request_sock.h
>>> +++ b/include/net/request_sock.h
>>> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct
>>> request_sock *req)
>>>          req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>>>          sk_node_init(&req_to_sk(req)->sk_node);
>>>          sk_tx_queue_clear(req_to_sk(req));
>>> +       req_to_sk(req)->sk_symmetric_queues =
>>> sk_listener->sk_symmetric_queues;
>>>          req->saved_syn = NULL;
>>>          refcount_set(&req->rsk_refcnt, 0);
>>>
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index 03a3625..3421809 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char
>>> *msg, ...)
>>>    *     @skc_node: main hash linkage for various protocol lookup tables
>>>    *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>>    *     @skc_tx_queue_mapping: tx queue number for this connection
>>> + *     @skc_rx_queue_mapping: rx queue number for this connection
>>> + *     @skc_rx_ifindex: rx ifindex for this connection
>>>    *     @skc_flags: place holder for sk_flags
>>>    *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>>    *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>>>    *     @skc_incoming_cpu: record/match cpu processing incoming packets
>>>    *     @skc_refcnt: reference count
>>> + *     @skc_symmetric_queues: symmetric tx/rx queues
>>>    *
>>>    *     This is the minimal network layer representation of sockets, the
>>> header
>>>    *     for struct sock and struct inet_timewait_sock.
>>> @@ -177,6 +180,7 @@ struct sock_common {
>>>          unsigned char           skc_reuseport:1;
>>>          unsigned char           skc_ipv6only:1;
>>>          unsigned char           skc_net_refcnt:1;
>>> +       unsigned char           skc_symmetric_queues:1;
>>>          int                     skc_bound_dev_if;
>>>          union {
>>>                  struct hlist_node       skc_bind_node;
>>> @@ -214,6 +218,8 @@ struct sock_common {
>>>                  struct hlist_nulls_node skc_nulls_node;
>>>          };
>>>          int                     skc_tx_queue_mapping;
>>> +       int                     skc_rx_queue_mapping;
>>> +       int                     skc_rx_ifindex;
>>>          union {
>>>                  int             skc_incoming_cpu;
>>>                  u32             skc_rcv_wnd;
>>> @@ -324,6 +330,8 @@ struct sock {
>>>   #define sk_nulls_node          __sk_common.skc_nulls_node
>>>   #define sk_refcnt              __sk_common.skc_refcnt
>>>   #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
>>> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
>>> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
>>>
>>>   #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>>>   #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
>>> @@ -340,6 +348,7 @@ struct sock {
>>>   #define sk_reuseport           __sk_common.skc_reuseport
>>>   #define sk_ipv6only            __sk_common.skc_ipv6only
>>>   #define sk_net_refcnt          __sk_common.skc_net_refcnt
>>> +#define sk_symmetric_queues    __sk_common.skc_symmetric_queues
>>>   #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
>>>   #define sk_bind_node           __sk_common.skc_bind_node
>>>   #define sk_prot                        __sk_common.skc_prot
>>> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct
>>> sock *sk)
>>>          return sk ? sk->sk_tx_queue_mapping : -1;
>>>   }
>>>
>>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff
>>> *skb)
>>> +{
>>> +       if (sk->sk_symmetric_queues) {
>>> +               sk->sk_rx_ifindex = skb->skb_iif;
>>> +               sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>>> +       }
>>> +}
>>> +
>>>   static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>>   {
>>>          sk_tx_queue_clear(sk);
>>> diff --git a/include/uapi/asm-generic/socket.h
>>> b/include/uapi/asm-generic/socket.h
>>> index e47c9e4..f6b416e 100644
>>> --- a/include/uapi/asm-generic/socket.h
>>> +++ b/include/uapi/asm-generic/socket.h
>>> @@ -106,4 +106,6 @@
>>>
>>>   #define SO_ZEROCOPY            60
>>>
>>> +#define SO_SYMMETRIC_QUEUES    61
>>> +
>>>   #endif /* __ASM_GENERIC_SOCKET_H */
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index 270b547..d96cda8 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device
>>> *dev, struct sk_buff *skb)
>>>
>>>          if (queue_index < 0 || skb->ooo_okay ||
>>>              queue_index >= dev->real_num_tx_queues) {
>>> -               int new_index = get_xps_queue(dev, skb);
>>> +               int new_index = -1;
>>> +
>>> +               if (sk && sk->sk_symmetric_queues && dev->ifindex ==
>>> sk->sk_rx_ifindex)
>>> +                       new_index = sk->sk_rx_queue_mapping;
>>> +
>>> +               if (new_index < 0 || new_index >=
>>> dev->real_num_tx_queues)
>>> +                       new_index = get_xps_queue(dev, skb);
>>
>> This enforces that notion of queue pairs which is not universal
>> concept to NICs. There are many devices and instances where we
>> purposely avoid having a 1-1 relationship between rx and tx queues.
>
>
> Yes. This patch assumes that TX and RX queues come in pairs.
>
>> An alternative might be to create a rx queue to tx queue map, add the
>> rx queue argument to get_xps_queue, and then that function can
>> consider the mapping. The administrator can configure the mapping as
>> appropriate and can select which rx queues are subject to the mapping.
>
> This alternative looks much cleaner and doesn't require the apps to
> configure the
> queues. Do we need to support 1 to many rx to tx queue mappings?
> For our symmetric queues usecase, where a single application thread is
> associated with
> 1 queue-pair,  1-1 mapping is sufficient.
> Do you see any usecase where it is useful to support 1-many mappings?
> I guess i can add a sysfs entry per rx-queue to setup a tx-queue OR
> tx-queue-map.

There is no reason do disallow 1 to many, XPS already does that. In
fact, the mapping algorithm in XSP is pretty much what is needed where
instead of mapping a CPU to a queue set, this just maps a rx queue to
queue set. ooo handling can still be done, although it might be less
critical in this case.

Tom

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-11 22:07     ` Tom Herbert
@ 2017-09-11 22:50       ` Alexander Duyck
  0 siblings, 0 replies; 21+ messages in thread
From: Alexander Duyck @ 2017-09-11 22:50 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Samudrala, Sridhar, Alexander Duyck, Linux Kernel Network Developers

On Mon, Sep 11, 2017 at 3:07 PM, Tom Herbert <tom@herbertland.com> wrote:
> On Mon, Sep 11, 2017 at 9:49 AM, Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>> On 9/10/2017 8:19 AM, Tom Herbert wrote:
>>>
>>> On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
>>> <sridhar.samudrala@intel.com> wrote:
>>>>
>>>> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be
>>>> used
>>>> to enable symmetric tx and rx queues on a socket.
>>>>
>>>> This option is specifically useful for epoll based multi threaded
>>>> workloads
>>>> where each thread handles packets received on a single RX queue . In this
>>>> model,
>>>> we have noticed that it helps to send the packets on the same TX queue
>>>> corresponding to the queue-pair associated with the RX queue specifically
>>>> when
>>>> busy poll is enabled with epoll().
>>>>
>>>> Two new fields are added to struct sock_common to cache the last rx
>>>> ifindex and
>>>> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns
>>>> the cached
>>>> rx queue when this option is enabled and the TX is happening on the same
>>>> device.
>>>>
>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>> ---
>>>>   include/net/request_sock.h        |  1 +
>>>>   include/net/sock.h                | 17 +++++++++++++++++
>>>>   include/uapi/asm-generic/socket.h |  2 ++
>>>>   net/core/dev.c                    |  8 +++++++-
>>>>   net/core/sock.c                   | 10 ++++++++++
>>>>   net/ipv4/tcp_input.c              |  1 +
>>>>   net/ipv4/tcp_ipv4.c               |  1 +
>>>>   net/ipv4/tcp_minisocks.c          |  1 +
>>>>   8 files changed, 40 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
>>>> index 23e2205..c3bc12e 100644
>>>> --- a/include/net/request_sock.h
>>>> +++ b/include/net/request_sock.h
>>>> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct
>>>> request_sock *req)
>>>>          req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>>>>          sk_node_init(&req_to_sk(req)->sk_node);
>>>>          sk_tx_queue_clear(req_to_sk(req));
>>>> +       req_to_sk(req)->sk_symmetric_queues =
>>>> sk_listener->sk_symmetric_queues;
>>>>          req->saved_syn = NULL;
>>>>          refcount_set(&req->rsk_refcnt, 0);
>>>>
>>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>>> index 03a3625..3421809 100644
>>>> --- a/include/net/sock.h
>>>> +++ b/include/net/sock.h
>>>> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char
>>>> *msg, ...)
>>>>    *     @skc_node: main hash linkage for various protocol lookup tables
>>>>    *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>>>    *     @skc_tx_queue_mapping: tx queue number for this connection
>>>> + *     @skc_rx_queue_mapping: rx queue number for this connection
>>>> + *     @skc_rx_ifindex: rx ifindex for this connection
>>>>    *     @skc_flags: place holder for sk_flags
>>>>    *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>>>    *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>>>>    *     @skc_incoming_cpu: record/match cpu processing incoming packets
>>>>    *     @skc_refcnt: reference count
>>>> + *     @skc_symmetric_queues: symmetric tx/rx queues
>>>>    *
>>>>    *     This is the minimal network layer representation of sockets, the
>>>> header
>>>>    *     for struct sock and struct inet_timewait_sock.
>>>> @@ -177,6 +180,7 @@ struct sock_common {
>>>>          unsigned char           skc_reuseport:1;
>>>>          unsigned char           skc_ipv6only:1;
>>>>          unsigned char           skc_net_refcnt:1;
>>>> +       unsigned char           skc_symmetric_queues:1;
>>>>          int                     skc_bound_dev_if;
>>>>          union {
>>>>                  struct hlist_node       skc_bind_node;
>>>> @@ -214,6 +218,8 @@ struct sock_common {
>>>>                  struct hlist_nulls_node skc_nulls_node;
>>>>          };
>>>>          int                     skc_tx_queue_mapping;
>>>> +       int                     skc_rx_queue_mapping;
>>>> +       int                     skc_rx_ifindex;
>>>>          union {
>>>>                  int             skc_incoming_cpu;
>>>>                  u32             skc_rcv_wnd;
>>>> @@ -324,6 +330,8 @@ struct sock {
>>>>   #define sk_nulls_node          __sk_common.skc_nulls_node
>>>>   #define sk_refcnt              __sk_common.skc_refcnt
>>>>   #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
>>>> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
>>>> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
>>>>
>>>>   #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>>>>   #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
>>>> @@ -340,6 +348,7 @@ struct sock {
>>>>   #define sk_reuseport           __sk_common.skc_reuseport
>>>>   #define sk_ipv6only            __sk_common.skc_ipv6only
>>>>   #define sk_net_refcnt          __sk_common.skc_net_refcnt
>>>> +#define sk_symmetric_queues    __sk_common.skc_symmetric_queues
>>>>   #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
>>>>   #define sk_bind_node           __sk_common.skc_bind_node
>>>>   #define sk_prot                        __sk_common.skc_prot
>>>> @@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct
>>>> sock *sk)
>>>>          return sk ? sk->sk_tx_queue_mapping : -1;
>>>>   }
>>>>
>>>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff
>>>> *skb)
>>>> +{
>>>> +       if (sk->sk_symmetric_queues) {
>>>> +               sk->sk_rx_ifindex = skb->skb_iif;
>>>> +               sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>>>> +       }
>>>> +}
>>>> +
>>>>   static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>>>   {
>>>>          sk_tx_queue_clear(sk);
>>>> diff --git a/include/uapi/asm-generic/socket.h
>>>> b/include/uapi/asm-generic/socket.h
>>>> index e47c9e4..f6b416e 100644
>>>> --- a/include/uapi/asm-generic/socket.h
>>>> +++ b/include/uapi/asm-generic/socket.h
>>>> @@ -106,4 +106,6 @@
>>>>
>>>>   #define SO_ZEROCOPY            60
>>>>
>>>> +#define SO_SYMMETRIC_QUEUES    61
>>>> +
>>>>   #endif /* __ASM_GENERIC_SOCKET_H */
>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>> index 270b547..d96cda8 100644
>>>> --- a/net/core/dev.c
>>>> +++ b/net/core/dev.c
>>>> @@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device
>>>> *dev, struct sk_buff *skb)
>>>>
>>>>          if (queue_index < 0 || skb->ooo_okay ||
>>>>              queue_index >= dev->real_num_tx_queues) {
>>>> -               int new_index = get_xps_queue(dev, skb);
>>>> +               int new_index = -1;
>>>> +
>>>> +               if (sk && sk->sk_symmetric_queues && dev->ifindex ==
>>>> sk->sk_rx_ifindex)
>>>> +                       new_index = sk->sk_rx_queue_mapping;
>>>> +
>>>> +               if (new_index < 0 || new_index >=
>>>> dev->real_num_tx_queues)
>>>> +                       new_index = get_xps_queue(dev, skb);
>>>
>>> This enforces that notion of queue pairs which is not universal
>>> concept to NICs. There are many devices and instances where we
>>> purposely avoid having a 1-1 relationship between rx and tx queues.
>>
>>
>> Yes. This patch assumes that TX and RX queues come in pairs.
>>
>>> An alternative might be to create a rx queue to tx queue map, add the
>>> rx queue argument to get_xps_queue, and then that function can
>>> consider the mapping. The administrator can configure the mapping as
>>> appropriate and can select which rx queues are subject to the mapping.
>>
>> This alternative looks much cleaner and doesn't require the apps to
>> configure the
>> queues. Do we need to support 1 to many rx to tx queue mappings?
>> For our symmetric queues usecase, where a single application thread is
>> associated with
>> 1 queue-pair,  1-1 mapping is sufficient.
>> Do you see any usecase where it is useful to support 1-many mappings?
>> I guess i can add a sysfs entry per rx-queue to setup a tx-queue OR
>> tx-queue-map.
>
> There is no reason do disallow 1 to many, XPS already does that. In
> fact, the mapping algorithm in XSP is pretty much what is needed where
> instead of mapping a CPU to a queue set, this just maps a rx queue to
> queue set. ooo handling can still be done, although it might be less
> critical in this case.
>
> Tom

Actually I wonder if we couldn't re-purpose the queue_mapping field
that is already there in the sk_buff for this. It might provide a more
elegant way to deal with the logic for already dealing with the
recorded Rx queue at the start of __skb_tx_hash. Then if the socket
wants this it could send down the packet with the queue_mapping field
set and we would be using some the map to figure out the Rx to Tx
mapping for either routing/bridging or this socket option.

Anyway just a thought.

- Alex

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-08-31 23:27 [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue Sridhar Samudrala
                   ` (2 preceding siblings ...)
  2017-09-10 15:19 ` Tom Herbert
@ 2017-09-12  3:12 ` Tom Herbert
  2017-09-12  3:53   ` Eric Dumazet
  2017-09-20 15:30 ` Hannes Frederic Sowa
  4 siblings, 1 reply; 21+ messages in thread
From: Tom Herbert @ 2017-09-12  3:12 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: Alexander Duyck, Linux Kernel Network Developers

On Thu, Aug 31, 2017 at 4:27 PM, Sridhar Samudrala
<sridhar.samudrala@intel.com> wrote:
> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
> to enable symmetric tx and rx queues on a socket.
>
> This option is specifically useful for epoll based multi threaded workloads
> where each thread handles packets received on a single RX queue . In this model,
> we have noticed that it helps to send the packets on the same TX queue
> corresponding to the queue-pair associated with the RX queue specifically when
> busy poll is enabled with epoll().
>
> Two new fields are added to struct sock_common to cache the last rx ifindex and
> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
> rx queue when this option is enabled and the TX is happening on the same device.
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> ---
>  include/net/request_sock.h        |  1 +
>  include/net/sock.h                | 17 +++++++++++++++++
>  include/uapi/asm-generic/socket.h |  2 ++
>  net/core/dev.c                    |  8 +++++++-
>  net/core/sock.c                   | 10 ++++++++++
>  net/ipv4/tcp_input.c              |  1 +
>  net/ipv4/tcp_ipv4.c               |  1 +
>  net/ipv4/tcp_minisocks.c          |  1 +
>  8 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> index 23e2205..c3bc12e 100644
> --- a/include/net/request_sock.h
> +++ b/include/net/request_sock.h
> @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
>         req_to_sk(req)->sk_prot = sk_listener->sk_prot;
>         sk_node_init(&req_to_sk(req)->sk_node);
>         sk_tx_queue_clear(req_to_sk(req));
> +       req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
>         req->saved_syn = NULL;
>         refcount_set(&req->rsk_refcnt, 0);
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 03a3625..3421809 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_rx_queue_mapping: rx queue number for this connection
> + *     @skc_rx_ifindex: rx ifindex for this connection
>   *     @skc_flags: place holder for sk_flags
>   *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>   *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>   *     @skc_incoming_cpu: record/match cpu processing incoming packets
>   *     @skc_refcnt: reference count
> + *     @skc_symmetric_queues: symmetric tx/rx queues
>   *
>   *     This is the minimal network layer representation of sockets, the header
>   *     for struct sock and struct inet_timewait_sock.
> @@ -177,6 +180,7 @@ struct sock_common {
>         unsigned char           skc_reuseport:1;
>         unsigned char           skc_ipv6only:1;
>         unsigned char           skc_net_refcnt:1;
> +       unsigned char           skc_symmetric_queues:1;
>         int                     skc_bound_dev_if;
>         union {
>                 struct hlist_node       skc_bind_node;
> @@ -214,6 +218,8 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +       int                     skc_rx_queue_mapping;
> +       int                     skc_rx_ifindex;

Two ints in sock_common for this purpose is quite expensive and the
use case for this is limited-- even if a RX->TX queue mapping were
introduced to eliminate the queue pair assumption this still won't
help if the receive and transmit interfaces are different for the
connection. I think we really need to see some very compelling results
to be able to justify this.

Thanks,
Tom

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-12  3:12 ` Tom Herbert
@ 2017-09-12  3:53   ` Eric Dumazet
  2017-09-12  6:27     ` Samudrala, Sridhar
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2017-09-12  3:53 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Sridhar Samudrala, Alexander Duyck, Linux Kernel Network Developers

On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:

> Two ints in sock_common for this purpose is quite expensive and the
> use case for this is limited-- even if a RX->TX queue mapping were
> introduced to eliminate the queue pair assumption this still won't
> help if the receive and transmit interfaces are different for the
> connection. I think we really need to see some very compelling results
> to be able to justify this.
> 

Yes, this is unreasonable cost.

XPS should really cover the case already.
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-12  3:53   ` Eric Dumazet
@ 2017-09-12  6:27     ` Samudrala, Sridhar
  2017-09-12 15:47       ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Samudrala, Sridhar @ 2017-09-12  6:27 UTC (permalink / raw)
  To: Eric Dumazet, Tom Herbert
  Cc: Alexander Duyck, Linux Kernel Network Developers



On 9/11/2017 8:53 PM, Eric Dumazet wrote:
> On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
>
>> Two ints in sock_common for this purpose is quite expensive and the
>> use case for this is limited-- even if a RX->TX queue mapping were
>> introduced to eliminate the queue pair assumption this still won't
>> help if the receive and transmit interfaces are different for the
>> connection. I think we really need to see some very compelling results
>> to be able to justify this.
Will try to collect and post some perf data with symmetric queue 
configuration.

> Yes, this is unreasonable cost.
>
> XPS should really cover the case already.
>   
Eric,

Can you clarify how XPS covers the RX-> TX queue mapping case?
Is it possible to configure XPS to select TX queue based on the RX queue 
of a flow?
IIUC, it is based on the CPU of the thread doing the transmit OR based 
on skb->priority to TC mapping?
It may be possible to get this effect if the the threads are pinned to a 
core, but if the app threads are
freely moving, i am not sure how XPS can be configured to select the TX 
queue based on the RX queue of a flow.

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-12  6:27     ` Samudrala, Sridhar
@ 2017-09-12 15:47       ` Eric Dumazet
  2017-09-12 22:31         ` Samudrala, Sridhar
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2017-09-12 15:47 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Tom Herbert, Alexander Duyck, Linux Kernel Network Developers

On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
> 
> On 9/11/2017 8:53 PM, Eric Dumazet wrote:
> > On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
> >
> >> Two ints in sock_common for this purpose is quite expensive and the
> >> use case for this is limited-- even if a RX->TX queue mapping were
> >> introduced to eliminate the queue pair assumption this still won't
> >> help if the receive and transmit interfaces are different for the
> >> connection. I think we really need to see some very compelling results
> >> to be able to justify this.
> Will try to collect and post some perf data with symmetric queue 
> configuration.
> 
> > Yes, this is unreasonable cost.
> >
> > XPS should really cover the case already.
> >   
> Eric,
> 
> Can you clarify how XPS covers the RX-> TX queue mapping case?
> Is it possible to configure XPS to select TX queue based on the RX queue 
> of a flow?
> IIUC, it is based on the CPU of the thread doing the transmit OR based 
> on skb->priority to TC mapping?
> It may be possible to get this effect if the the threads are pinned to a 
> core, but if the app threads are
> freely moving, i am not sure how XPS can be configured to select the TX 
> queue based on the RX queue of a flow.

If application is freely moving, how NIC can properly select the RX
queue so that packets are coming to the appropriate queue ?

This is called aRFS, and it does not scale to millions of flows.
We tried in the past, and this went nowhere really, since the setup cost
is prohibitive and DDOS vulnerable.

XPS will follow the thread, since selection is done on current cpu.

The problem is RX side. If application is free to migrate, then special
support (aRFS) is needed from the hardware.

At least for passive connections, we already have all the support in the
kernel so that you can have one thread per NIC queue, dealing with
sockets that have incoming packets all received on one NIC RX queue.
(And of course all TX packets will use the symmetric TX queue)

SO_REUSEPORT plus appropriate BPF filter can achieve that.

Say you have 32 queues, 32 cpus.

Simply use 32 listeners, 32 threads (or 32 pools of threads)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-12 15:47       ` Eric Dumazet
@ 2017-09-12 22:31         ` Samudrala, Sridhar
  2017-09-12 22:53           ` Tom Herbert
  0 siblings, 1 reply; 21+ messages in thread
From: Samudrala, Sridhar @ 2017-09-12 22:31 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tom Herbert, Alexander Duyck, Linux Kernel Network Developers



On 9/12/2017 8:47 AM, Eric Dumazet wrote:
> On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
>> On 9/11/2017 8:53 PM, Eric Dumazet wrote:
>>> On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
>>>
>>>> Two ints in sock_common for this purpose is quite expensive and the
>>>> use case for this is limited-- even if a RX->TX queue mapping were
>>>> introduced to eliminate the queue pair assumption this still won't
>>>> help if the receive and transmit interfaces are different for the
>>>> connection. I think we really need to see some very compelling results
>>>> to be able to justify this.
>> Will try to collect and post some perf data with symmetric queue
>> configuration.
>>
>>> Yes, this is unreasonable cost.
>>>
>>> XPS should really cover the case already.
>>>    
>> Eric,
>>
>> Can you clarify how XPS covers the RX-> TX queue mapping case?
>> Is it possible to configure XPS to select TX queue based on the RX queue
>> of a flow?
>> IIUC, it is based on the CPU of the thread doing the transmit OR based
>> on skb->priority to TC mapping?
>> It may be possible to get this effect if the the threads are pinned to a
>> core, but if the app threads are
>> freely moving, i am not sure how XPS can be configured to select the TX
>> queue based on the RX queue of a flow.
> If application is freely moving, how NIC can properly select the RX
> queue so that packets are coming to the appropriate queue ?
The RX queue is selected via RSS and we don't want to move the flow based on
where the thread is running.
>
> This is called aRFS, and it does not scale to millions of flows.
> We tried in the past, and this went nowhere really, since the setup cost
> is prohibitive and DDOS vulnerable.
>
> XPS will follow the thread, since selection is done on current cpu.
>
> The problem is RX side. If application is free to migrate, then special
> support (aRFS) is needed from the hardware.
This may be true if most of the rx processing is happening in the 
interrupt context.
But with busy polling,  i think we don't need aRFS as a thread should be 
able to poll
any queue irrespective of where it is running.
>
> At least for passive connections, we already have all the support in the
> kernel so that you can have one thread per NIC queue, dealing with
> sockets that have incoming packets all received on one NIC RX queue.
> (And of course all TX packets will use the symmetric TX queue)
>
> SO_REUSEPORT plus appropriate BPF filter can achieve that.
>
> Say you have 32 queues, 32 cpus.
>
> Simply use 32 listeners, 32 threads (or 32 pools of threads)
Yes. This will work if each thread is pinned to a core associated with 
the RX interrupt.
It may not be possible to pin the threads to a core.
Instead we want to associate a thread to a queue and do all the RX and 
TX completion
of a queue in the same thread context via busy polling.

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-12 22:31         ` Samudrala, Sridhar
@ 2017-09-12 22:53           ` Tom Herbert
  2017-09-20  0:34             ` Samudrala, Sridhar
  0 siblings, 1 reply; 21+ messages in thread
From: Tom Herbert @ 2017-09-12 22:53 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Eric Dumazet, Alexander Duyck, Linux Kernel Network Developers

On Tue, Sep 12, 2017 at 3:31 PM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
>
> On 9/12/2017 8:47 AM, Eric Dumazet wrote:
>>
>> On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
>>>
>>> On 9/11/2017 8:53 PM, Eric Dumazet wrote:
>>>>
>>>> On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
>>>>
>>>>> Two ints in sock_common for this purpose is quite expensive and the
>>>>> use case for this is limited-- even if a RX->TX queue mapping were
>>>>> introduced to eliminate the queue pair assumption this still won't
>>>>> help if the receive and transmit interfaces are different for the
>>>>> connection. I think we really need to see some very compelling results
>>>>> to be able to justify this.
>>>
>>> Will try to collect and post some perf data with symmetric queue
>>> configuration.
>>>
>>>> Yes, this is unreasonable cost.
>>>>
>>>> XPS should really cover the case already.
>>>>
>>>
>>> Eric,
>>>
>>> Can you clarify how XPS covers the RX-> TX queue mapping case?
>>> Is it possible to configure XPS to select TX queue based on the RX queue
>>> of a flow?
>>> IIUC, it is based on the CPU of the thread doing the transmit OR based
>>> on skb->priority to TC mapping?
>>> It may be possible to get this effect if the the threads are pinned to a
>>> core, but if the app threads are
>>> freely moving, i am not sure how XPS can be configured to select the TX
>>> queue based on the RX queue of a flow.
>>
>> If application is freely moving, how NIC can properly select the RX
>> queue so that packets are coming to the appropriate queue ?
>
> The RX queue is selected via RSS and we don't want to move the flow based on
> where the thread is running.

Unless flow director is enabled on the Intel device... This was, I
believe, one of the first attempts to introduce a queue pair notion to
general purpose NICs. The idea was that the device records the TX
queue for a flow and then uses that to determine receive queue in a
symmetric fashion. aRFS is similar, but was under SW control how the
mapping is done. As Eric mentioned there are scalability issues with
these mechanisms, but we also found that flow director can easily
reorder packets whenever the thread moves.

>>
>>
>> This is called aRFS, and it does not scale to millions of flows.
>> We tried in the past, and this went nowhere really, since the setup cost
>> is prohibitive and DDOS vulnerable.
>>
>> XPS will follow the thread, since selection is done on current cpu.
>>
>> The problem is RX side. If application is free to migrate, then special
>> support (aRFS) is needed from the hardware.
>
> This may be true if most of the rx processing is happening in the interrupt
> context.
> But with busy polling,  i think we don't need aRFS as a thread should be
> able to poll
> any queue irrespective of where it is running.

It's not just a problem with interrupt processing, in general we like
to have all receive processing an subsequent transmit of a reply to be
done on one CPU. Silo'ing is good for performance and parallelism.
This can sometimes be relaxed in situations where CPUs share a cache
so crossing CPUs is not not costly.

>>
>>
>> At least for passive connections, we already have all the support in the
>> kernel so that you can have one thread per NIC queue, dealing with
>> sockets that have incoming packets all received on one NIC RX queue.
>> (And of course all TX packets will use the symmetric TX queue)
>>
>> SO_REUSEPORT plus appropriate BPF filter can achieve that.
>>
>> Say you have 32 queues, 32 cpus.
>>
>> Simply use 32 listeners, 32 threads (or 32 pools of threads)
>
> Yes. This will work if each thread is pinned to a core associated with the
> RX interrupt.
> It may not be possible to pin the threads to a core.
> Instead we want to associate a thread to a queue and do all the RX and TX
> completion
> of a queue in the same thread context via busy polling.
>
When that happens it's possible for RX to be done on the completely
wrong CPU which we know is suboptimal. However, this shouldn't
negatively affect TX side since XPS will just use the queue
appropriate for running CPU. Like Eric said, this is really a receive
problem more than a transmit problem. Keeping them as independent
paths seems to be a good approach.

Tom

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-12 22:53           ` Tom Herbert
@ 2017-09-20  0:34             ` Samudrala, Sridhar
  2017-09-20  0:48               ` Tom Herbert
  0 siblings, 1 reply; 21+ messages in thread
From: Samudrala, Sridhar @ 2017-09-20  0:34 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Eric Dumazet, Alexander Duyck, Linux Kernel Network Developers

On 9/12/2017 3:53 PM, Tom Herbert wrote:
> On Tue, Sep 12, 2017 at 3:31 PM, Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>>
>> On 9/12/2017 8:47 AM, Eric Dumazet wrote:
>>> On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
>>>> On 9/11/2017 8:53 PM, Eric Dumazet wrote:
>>>>> On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
>>>>>
>>>>>> Two ints in sock_common for this purpose is quite expensive and the
>>>>>> use case for this is limited-- even if a RX->TX queue mapping were
>>>>>> introduced to eliminate the queue pair assumption this still won't
>>>>>> help if the receive and transmit interfaces are different for the
>>>>>> connection. I think we really need to see some very compelling results
>>>>>> to be able to justify this.
>>>> Will try to collect and post some perf data with symmetric queue
>>>> configuration.

Here is some performance data i collected with memcached workload over
ixgbe 10Gb NIC with mcblaster benchmark.
ixgbe is configured with 16 queues and rx-usecs is set to 1000 for a 
very low
interrupt rate.
       ethtool -L p1p1 combined 16
       ethtool -C p1p1 rx-usecs 1000
and busy poll is set to 1000usecs
       sysctl net.core.busy_poll = 1000

16 threads  800K requests/sec
=============================
                   rtt(min/avg/max)usecs     intr/sec contextswitch/sec
-----------------------------------------------------------------------
Default                2/182/10641            23391 61163
Symmetric Queues       2/50/6311              20457 32843

32 threads  800K requests/sec
=============================
                  rtt(min/avg/max)usecs     intr/sec contextswitch/sec
------------------------------------------------------------------------
Default                2/162/6390            32168 69450
Symmetric Queues        2/50/3853            35044 35847

>>>>
>>>>> Yes, this is unreasonable cost.
>>>>>
>>>>> XPS should really cover the case already.
>>>>>
>>>> Eric,
>>>>
>>>> Can you clarify how XPS covers the RX-> TX queue mapping case?
>>>> Is it possible to configure XPS to select TX queue based on the RX queue
>>>> of a flow?
>>>> IIUC, it is based on the CPU of the thread doing the transmit OR based
>>>> on skb->priority to TC mapping?
>>>> It may be possible to get this effect if the the threads are pinned to a
>>>> core, but if the app threads are
>>>> freely moving, i am not sure how XPS can be configured to select the TX
>>>> queue based on the RX queue of a flow.
>>> If application is freely moving, how NIC can properly select the RX
>>> queue so that packets are coming to the appropriate queue ?
>> The RX queue is selected via RSS and we don't want to move the flow based on
>> where the thread is running.
> Unless flow director is enabled on the Intel device... This was, I
> believe, one of the first attempts to introduce a queue pair notion to
> general purpose NICs. The idea was that the device records the TX
> queue for a flow and then uses that to determine receive queue in a
> symmetric fashion. aRFS is similar, but was under SW control how the
> mapping is done. As Eric mentioned there are scalability issues with
> these mechanisms, but we also found that flow director can easily
> reorder packets whenever the thread moves.

You must be referring to the ATR(application targeted routing) feature 
on Intel
NICs wherea flow director entry is added for a flow based on TX queue 
used for
that flow. Instead, we would like to select the TX queue based on the RX 
queue
of a flow.


>
>>>
>>> This is called aRFS, and it does not scale to millions of flows.
>>> We tried in the past, and this went nowhere really, since the setup cost
>>> is prohibitive and DDOS vulnerable.
>>>
>>> XPS will follow the thread, since selection is done on current cpu.
>>>
>>> The problem is RX side. If application is free to migrate, then special
>>> support (aRFS) is needed from the hardware.
>> This may be true if most of the rx processing is happening in the interrupt
>> context.
>> But with busy polling,  i think we don't need aRFS as a thread should be
>> able to poll
>> any queue irrespective of where it is running.
> It's not just a problem with interrupt processing, in general we like
> to have all receive processing an subsequent transmit of a reply to be
> done on one CPU. Silo'ing is good for performance and parallelism.
> This can sometimes be relaxed in situations where CPUs share a cache
> so crossing CPUs is not not costly.

Yes. We would like to get this behavior even without binding the app 
thread to a CPU.


>
>>>
>>> At least for passive connections, we already have all the support in the
>>> kernel so that you can have one thread per NIC queue, dealing with
>>> sockets that have incoming packets all received on one NIC RX queue.
>>> (And of course all TX packets will use the symmetric TX queue)
>>>
>>> SO_REUSEPORT plus appropriate BPF filter can achieve that.
>>>
>>> Say you have 32 queues, 32 cpus.
>>>
>>> Simply use 32 listeners, 32 threads (or 32 pools of threads)
>> Yes. This will work if each thread is pinned to a core associated with the
>> RX interrupt.
>> It may not be possible to pin the threads to a core.
>> Instead we want to associate a thread to a queue and do all the RX and TX
>> completion
>> of a queue in the same thread context via busy polling.
>>
> When that happens it's possible for RX to be done on the completely
> wrong CPU which we know is suboptimal. However, this shouldn't
> negatively affect TX side since XPS will just use the queue
> appropriate for running CPU. Like Eric said, this is really a receive
> problem more than a transmit problem. Keeping them as independent
> paths seems to be a good approach.
>
>

We are noticing that when majority of packets are received via busy 
polling, it
should not be an issue if RX processing is handled by a thread running 
on a core
that is different from the core that is associated with the RX 
interrupt. Also, as
the TX completions on the associated TX queue are processed along with 
the RX
processing via busy polling, we would like the Transmits also to happen 
in the same
thread context.

Would appreciate any feedback or thoughts on optional configuration to 
enable selection
of TX queue based on the RX queue of a flow.

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-20  0:34             ` Samudrala, Sridhar
@ 2017-09-20  0:48               ` Tom Herbert
       [not found]                 ` <fe565f14-156e-d703-c91d-d67136a0a0c0@intel.com>
  0 siblings, 1 reply; 21+ messages in thread
From: Tom Herbert @ 2017-09-20  0:48 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Eric Dumazet, Alexander Duyck, Linux Kernel Network Developers

On Tue, Sep 19, 2017 at 5:34 PM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
> On 9/12/2017 3:53 PM, Tom Herbert wrote:
>>
>> On Tue, Sep 12, 2017 at 3:31 PM, Samudrala, Sridhar
>> <sridhar.samudrala@intel.com> wrote:
>>>
>>>
>>> On 9/12/2017 8:47 AM, Eric Dumazet wrote:
>>>>
>>>> On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
>>>>>
>>>>> On 9/11/2017 8:53 PM, Eric Dumazet wrote:
>>>>>>
>>>>>> On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
>>>>>>
>>>>>>> Two ints in sock_common for this purpose is quite expensive and the
>>>>>>> use case for this is limited-- even if a RX->TX queue mapping were
>>>>>>> introduced to eliminate the queue pair assumption this still won't
>>>>>>> help if the receive and transmit interfaces are different for the
>>>>>>> connection. I think we really need to see some very compelling
>>>>>>> results
>>>>>>> to be able to justify this.
>>>>>
>>>>> Will try to collect and post some perf data with symmetric queue
>>>>> configuration.
>
>
> Here is some performance data i collected with memcached workload over
> ixgbe 10Gb NIC with mcblaster benchmark.
> ixgbe is configured with 16 queues and rx-usecs is set to 1000 for a very
> low
> interrupt rate.
>       ethtool -L p1p1 combined 16
>       ethtool -C p1p1 rx-usecs 1000
> and busy poll is set to 1000usecs
>       sysctl net.core.busy_poll = 1000
>
> 16 threads  800K requests/sec
> =============================
>                   rtt(min/avg/max)usecs     intr/sec contextswitch/sec
> -----------------------------------------------------------------------
> Default                2/182/10641            23391 61163
> Symmetric Queues       2/50/6311              20457 32843
>
> 32 threads  800K requests/sec
> =============================
>                  rtt(min/avg/max)usecs     intr/sec contextswitch/sec
> ------------------------------------------------------------------------
> Default                2/162/6390            32168 69450
> Symmetric Queues        2/50/3853            35044 35847
>
No idea what "Default" configuration is. Please report how xps_cpus is
being set, how many RSS queues there are, and what the mapping is
between RSS queues and CPUs and shared caches. Also, whether and
threads are pinned.

Thanks,
Tom

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
       [not found]                 ` <fe565f14-156e-d703-c91d-d67136a0a0c0@intel.com>
@ 2017-09-20  5:13                   ` Eric Dumazet
  2017-09-20 14:18                     ` Tom Herbert
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2017-09-20  5:13 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Tom Herbert, Alexander Duyck, Linux Kernel Network Developers

On Tue, 2017-09-19 at 21:59 -0700, Samudrala, Sridhar wrote:
> On 9/19/2017 5:48 PM, Tom Herbert wrote:
> > On Tue, Sep 19, 2017 at 5:34 PM, Samudrala, Sridhar
> > <sridhar.samudrala@intel.com> wrote:
> > > On 9/12/2017 3:53 PM, Tom Herbert wrote:
> > > > On Tue, Sep 12, 2017 at 3:31 PM, Samudrala, Sridhar
> > > > <sridhar.samudrala@intel.com> wrote:
> > > > > 
> > > > > On 9/12/2017 8:47 AM, Eric Dumazet wrote:
> > > > > > On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
> > > > > > > On 9/11/2017 8:53 PM, Eric Dumazet wrote:
> > > > > > > > On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
> > > > > > > > 
> > > > > > > > > Two ints in sock_common for this purpose is quite expensive and the
> > > > > > > > > use case for this is limited-- even if a RX->TX queue mapping were
> > > > > > > > > introduced to eliminate the queue pair assumption this still won't
> > > > > > > > > help if the receive and transmit interfaces are different for the
> > > > > > > > > connection. I think we really need to see some very compelling
> > > > > > > > > results
> > > > > > > > > to be able to justify this.
> > > > > > > Will try to collect and post some perf data with symmetric queue
> > > > > > > configuration.
> > > 
> > > Here is some performance data i collected with memcached workload over
> > > ixgbe 10Gb NIC with mcblaster benchmark.
> > > ixgbe is configured with 16 queues and rx-usecs is set to 1000 for a very
> > > low
> > > interrupt rate.
> > >       ethtool -L p1p1 combined 16
> > >       ethtool -C p1p1 rx-usecs 1000
> > > and busy poll is set to 1000usecs
> > >       sysctl net.core.busy_poll = 1000
> > > 
> > > 16 threads  800K requests/sec
> > > =============================
> > >                   rtt(min/avg/max)usecs     intr/sec contextswitch/sec
> > > -----------------------------------------------------------------------
> > > Default                2/182/10641            23391 61163
> > > Symmetric Queues       2/50/6311              20457 32843
> > > 
> > > 32 threads  800K requests/sec
> > > =============================
> > >                  rtt(min/avg/max)usecs     intr/sec contextswitch/sec
> > > ------------------------------------------------------------------------
> > > Default                2/162/6390            32168 69450
> > > Symmetric Queues        2/50/3853            35044 35847
> > > 
> > No idea what "Default" configuration is. Please report how xps_cpus is
> > being set, how many RSS queues there are, and what the mapping is
> > between RSS queues and CPUs and shared caches. Also, whether and
> > threads are pinned.
> Default is linux 4.13 with the settings i listed above.    
>         ethtool -L p1p1 combined 16
>         ethtool -C p1p1 rx-usecs 1000
>         sysctl net.core.busy_poll = 1000
> 
> # ethtool -x p1p1
> RX flow hash indirection table for p1p1 with 16 RX ring(s):
>     0:      0     1     2     3     4     5     6     7
>     8:      8     9    10    11    12    13    14    15
>    16:      0     1     2     3     4     5     6     7
>    24:      8     9    10    11    12    13    14    15
>    32:      0     1     2     3     4     5     6     7
>    40:      8     9    10    11    12    13    14    15
>    48:      0     1     2     3     4     5     6     7
>    56:      8     9    10    11    12    13    14    15
>    64:      0     1     2     3     4     5     6     7
>    72:      8     9    10    11    12    13    14    15
>    80:      0     1     2     3     4     5     6     7
>    88:      8     9    10    11    12    13    14    15
>    96:      0     1     2     3     4     5     6     7
>   104:      8     9    10    11    12    13    14    15
>   112:      0     1     2     3     4     5     6     7
>   120:      8     9    10    11    12    13    14    15
> 
> smp_affinity for the 16 queuepairs
>         141 p1p1-TxRx-0 0000,00000001
>         142 p1p1-TxRx-1 0000,00000002
>         143 p1p1-TxRx-2 0000,00000004
>         144 p1p1-TxRx-3 0000,00000008
>         145 p1p1-TxRx-4 0000,00000010
>         146 p1p1-TxRx-5 0000,00000020
>         147 p1p1-TxRx-6 0000,00000040
>         148 p1p1-TxRx-7 0000,00000080
>         149 p1p1-TxRx-8 0000,00000100
>         150 p1p1-TxRx-9 0000,00000200
>         151 p1p1-TxRx-10 0000,00000400
>         152 p1p1-TxRx-11 0000,00000800
>         153 p1p1-TxRx-12 0000,00001000
>         154 p1p1-TxRx-13 0000,00002000
>         155 p1p1-TxRx-14 0000,00004000
>         156 p1p1-TxRx-15 0000,00008000
> xps_cpus for the 16 Tx queues
>         0000,00000001
>         0000,00000002
>         0000,00000004
>         0000,00000008
>         0000,00000010
>         0000,00000020
>         0000,00000040
>         0000,00000080
>         0000,00000100
>         0000,00000200
>         0000,00000400
>         0000,00000800
>         0000,00001000
>         0000,00002000
>         0000,00004000
>         0000,00008000
> memcached threads are not pinned.
> 

...

I urge you to take the time to properly tune this host.

linux kernel does not do automagic configuration. This is user policy.

Documentation/networking/scaling.txt has everything you need.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-20  5:13                   ` Eric Dumazet
@ 2017-09-20 14:18                     ` Tom Herbert
  2017-09-20 16:51                       ` Samudrala, Sridhar
  0 siblings, 1 reply; 21+ messages in thread
From: Tom Herbert @ 2017-09-20 14:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Samudrala, Sridhar, Alexander Duyck, Linux Kernel Network Developers

On Tue, Sep 19, 2017 at 10:13 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2017-09-19 at 21:59 -0700, Samudrala, Sridhar wrote:
>> On 9/19/2017 5:48 PM, Tom Herbert wrote:
>> > On Tue, Sep 19, 2017 at 5:34 PM, Samudrala, Sridhar
>> > <sridhar.samudrala@intel.com> wrote:
>> > > On 9/12/2017 3:53 PM, Tom Herbert wrote:
>> > > > On Tue, Sep 12, 2017 at 3:31 PM, Samudrala, Sridhar
>> > > > <sridhar.samudrala@intel.com> wrote:
>> > > > >
>> > > > > On 9/12/2017 8:47 AM, Eric Dumazet wrote:
>> > > > > > On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
>> > > > > > > On 9/11/2017 8:53 PM, Eric Dumazet wrote:
>> > > > > > > > On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
>> > > > > > > >
>> > > > > > > > > Two ints in sock_common for this purpose is quite expensive and the
>> > > > > > > > > use case for this is limited-- even if a RX->TX queue mapping were
>> > > > > > > > > introduced to eliminate the queue pair assumption this still won't
>> > > > > > > > > help if the receive and transmit interfaces are different for the
>> > > > > > > > > connection. I think we really need to see some very compelling
>> > > > > > > > > results
>> > > > > > > > > to be able to justify this.
>> > > > > > > Will try to collect and post some perf data with symmetric queue
>> > > > > > > configuration.
>> > >
>> > > Here is some performance data i collected with memcached workload over
>> > > ixgbe 10Gb NIC with mcblaster benchmark.
>> > > ixgbe is configured with 16 queues and rx-usecs is set to 1000 for a very
>> > > low
>> > > interrupt rate.
>> > >       ethtool -L p1p1 combined 16
>> > >       ethtool -C p1p1 rx-usecs 1000
>> > > and busy poll is set to 1000usecs
>> > >       sysctl net.core.busy_poll = 1000
>> > >
>> > > 16 threads  800K requests/sec
>> > > =============================
>> > >                   rtt(min/avg/max)usecs     intr/sec contextswitch/sec
>> > > -----------------------------------------------------------------------
>> > > Default                2/182/10641            23391 61163
>> > > Symmetric Queues       2/50/6311              20457 32843
>> > >
>> > > 32 threads  800K requests/sec
>> > > =============================
>> > >                  rtt(min/avg/max)usecs     intr/sec contextswitch/sec
>> > > ------------------------------------------------------------------------
>> > > Default                2/162/6390            32168 69450
>> > > Symmetric Queues        2/50/3853            35044 35847
>> > >
>> > No idea what "Default" configuration is. Please report how xps_cpus is
>> > being set, how many RSS queues there are, and what the mapping is
>> > between RSS queues and CPUs and shared caches. Also, whether and
>> > threads are pinned.
>> Default is linux 4.13 with the settings i listed above.
>>         ethtool -L p1p1 combined 16
>>         ethtool -C p1p1 rx-usecs 1000
>>         sysctl net.core.busy_poll = 1000
>>
>> # ethtool -x p1p1
>> RX flow hash indirection table for p1p1 with 16 RX ring(s):
>>     0:      0     1     2     3     4     5     6     7
>>     8:      8     9    10    11    12    13    14    15
>>    16:      0     1     2     3     4     5     6     7
>>    24:      8     9    10    11    12    13    14    15
>>    32:      0     1     2     3     4     5     6     7
>>    40:      8     9    10    11    12    13    14    15
>>    48:      0     1     2     3     4     5     6     7
>>    56:      8     9    10    11    12    13    14    15
>>    64:      0     1     2     3     4     5     6     7
>>    72:      8     9    10    11    12    13    14    15
>>    80:      0     1     2     3     4     5     6     7
>>    88:      8     9    10    11    12    13    14    15
>>    96:      0     1     2     3     4     5     6     7
>>   104:      8     9    10    11    12    13    14    15
>>   112:      0     1     2     3     4     5     6     7
>>   120:      8     9    10    11    12    13    14    15
>>
>> smp_affinity for the 16 queuepairs
>>         141 p1p1-TxRx-0 0000,00000001
>>         142 p1p1-TxRx-1 0000,00000002
>>         143 p1p1-TxRx-2 0000,00000004
>>         144 p1p1-TxRx-3 0000,00000008
>>         145 p1p1-TxRx-4 0000,00000010
>>         146 p1p1-TxRx-5 0000,00000020
>>         147 p1p1-TxRx-6 0000,00000040
>>         148 p1p1-TxRx-7 0000,00000080
>>         149 p1p1-TxRx-8 0000,00000100
>>         150 p1p1-TxRx-9 0000,00000200
>>         151 p1p1-TxRx-10 0000,00000400
>>         152 p1p1-TxRx-11 0000,00000800
>>         153 p1p1-TxRx-12 0000,00001000
>>         154 p1p1-TxRx-13 0000,00002000
>>         155 p1p1-TxRx-14 0000,00004000
>>         156 p1p1-TxRx-15 0000,00008000
>> xps_cpus for the 16 Tx queues
>>         0000,00000001
>>         0000,00000002
>>         0000,00000004
>>         0000,00000008
>>         0000,00000010
>>         0000,00000020
>>         0000,00000040
>>         0000,00000080
>>         0000,00000100
>>         0000,00000200
>>         0000,00000400
>>         0000,00000800
>>         0000,00001000
>>         0000,00002000
>>         0000,00004000
>>         0000,00008000
>> memcached threads are not pinned.
>>
>
> ...
>
> I urge you to take the time to properly tune this host.
>
> linux kernel does not do automagic configuration. This is user policy.
>
> Documentation/networking/scaling.txt has everything you need.
>
Yes, tuning a system for optimal performance is difficult. Even if you
find a performance benefit for a configuration on one system, that
might not translate to another. In other words, if you've produced
some code that seems to perform better than previous implementation on
a test machine it's not enough to be satisfied with that. We want
understand _why_ there is a difference. If you can show there is
intrinsic benefits to the queue-pair model that we can't achieve with
existing implementation _and_ can show there are ill effects in other
circumstances, then you should have a good case to make changes.

In the case of memcached, threads inevitably migrate off the CPU they
were created on, the data follows the thread but the RX-queue does not
change which means that the receive path is crosses CPUs or caches.
But, then in the queuepair case that also means transmit completions
are crossing CPUs. We don't normally expect that to be a good thing.
However, transmit completion processing does not happen in the
critical path, so if that work is being deferred to a less busy CPU
there may benefits. That's only a theory, analysis and experimentation
should be able to get to the root cause.

Thanks,
Tom

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-08-31 23:27 [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue Sridhar Samudrala
                   ` (3 preceding siblings ...)
  2017-09-12  3:12 ` Tom Herbert
@ 2017-09-20 15:30 ` Hannes Frederic Sowa
  4 siblings, 0 replies; 21+ messages in thread
From: Hannes Frederic Sowa @ 2017-09-20 15:30 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: alexander.h.duyck, netdev

Sridhar Samudrala <sridhar.samudrala@intel.com> writes:

> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
> to enable symmetric tx and rx queues on a socket.
>
> This option is specifically useful for epoll based multi threaded workloads
> where each thread handles packets received on a single RX queue . In this model,
> we have noticed that it helps to send the packets on the same TX queue
> corresponding to the queue-pair associated with the RX queue specifically when
> busy poll is enabled with epoll().
>
> Two new fields are added to struct sock_common to cache the last rx ifindex and
> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
> rx queue when this option is enabled and the TX is happening on the same device.

Would it help to make the rx and tx skb hashes symmetric
(skb_get_hash_symmetric) on request?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
  2017-09-20 14:18                     ` Tom Herbert
@ 2017-09-20 16:51                       ` Samudrala, Sridhar
  0 siblings, 0 replies; 21+ messages in thread
From: Samudrala, Sridhar @ 2017-09-20 16:51 UTC (permalink / raw)
  To: Tom Herbert, Eric Dumazet
  Cc: Alexander Duyck, Linux Kernel Network Developers



On 9/20/2017 7:18 AM, Tom Herbert wrote:
> On Tue, Sep 19, 2017 at 10:13 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Tue, 2017-09-19 at 21:59 -0700, Samudrala, Sridhar wrote:
>>> On 9/19/2017 5:48 PM, Tom Herbert wrote:
>>>> On Tue, Sep 19, 2017 at 5:34 PM, Samudrala, Sridhar
>>>> <sridhar.samudrala@intel.com> wrote:
>>>>> On 9/12/2017 3:53 PM, Tom Herbert wrote:
>>>>>> On Tue, Sep 12, 2017 at 3:31 PM, Samudrala, Sridhar
>>>>>> <sridhar.samudrala@intel.com> wrote:
>>>>>>> On 9/12/2017 8:47 AM, Eric Dumazet wrote:
>>>>>>>> On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
>>>>>>>>> On 9/11/2017 8:53 PM, Eric Dumazet wrote:
>>>>>>>>>> On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
>>>>>>>>>>
>>>>>>>>>>> Two ints in sock_common for this purpose is quite expensive and the
>>>>>>>>>>> use case for this is limited-- even if a RX->TX queue mapping were
>>>>>>>>>>> introduced to eliminate the queue pair assumption this still won't
>>>>>>>>>>> help if the receive and transmit interfaces are different for the
>>>>>>>>>>> connection. I think we really need to see some very compelling
>>>>>>>>>>> results
>>>>>>>>>>> to be able to justify this.
>>>>>>>>> Will try to collect and post some perf data with symmetric queue
>>>>>>>>> configuration.
>>>>> Here is some performance data i collected with memcached workload over
>>>>> ixgbe 10Gb NIC with mcblaster benchmark.
>>>>> ixgbe is configured with 16 queues and rx-usecs is set to 1000 for a very
>>>>> low
>>>>> interrupt rate.
>>>>>        ethtool -L p1p1 combined 16
>>>>>        ethtool -C p1p1 rx-usecs 1000
>>>>> and busy poll is set to 1000usecs
>>>>>        sysctl net.core.busy_poll = 1000
>>>>>
>>>>> 16 threads  800K requests/sec
>>>>> =============================
>>>>>                    rtt(min/avg/max)usecs     intr/sec contextswitch/sec
>>>>> -----------------------------------------------------------------------
>>>>> Default                2/182/10641            23391 61163
>>>>> Symmetric Queues       2/50/6311              20457 32843
>>>>>
>>>>> 32 threads  800K requests/sec
>>>>> =============================
>>>>>                   rtt(min/avg/max)usecs     intr/sec contextswitch/sec
>>>>> ------------------------------------------------------------------------
>>>>> Default                2/162/6390            32168 69450
>>>>> Symmetric Queues        2/50/3853            35044 35847
>>>>>
>>>> No idea what "Default" configuration is. Please report how xps_cpus is
>>>> being set, how many RSS queues there are, and what the mapping is
>>>> between RSS queues and CPUs and shared caches. Also, whether and
>>>> threads are pinned.
>>> Default is linux 4.13 with the settings i listed above.
>>>          ethtool -L p1p1 combined 16
>>>          ethtool -C p1p1 rx-usecs 1000
>>>          sysctl net.core.busy_poll = 1000
>>>
>>> # ethtool -x p1p1
>>> RX flow hash indirection table for p1p1 with 16 RX ring(s):
>>>      0:      0     1     2     3     4     5     6     7
>>>      8:      8     9    10    11    12    13    14    15
>>>     16:      0     1     2     3     4     5     6     7
>>>     24:      8     9    10    11    12    13    14    15
>>>     32:      0     1     2     3     4     5     6     7
>>>     40:      8     9    10    11    12    13    14    15
>>>     48:      0     1     2     3     4     5     6     7
>>>     56:      8     9    10    11    12    13    14    15
>>>     64:      0     1     2     3     4     5     6     7
>>>     72:      8     9    10    11    12    13    14    15
>>>     80:      0     1     2     3     4     5     6     7
>>>     88:      8     9    10    11    12    13    14    15
>>>     96:      0     1     2     3     4     5     6     7
>>>    104:      8     9    10    11    12    13    14    15
>>>    112:      0     1     2     3     4     5     6     7
>>>    120:      8     9    10    11    12    13    14    15
>>>
>>> smp_affinity for the 16 queuepairs
>>>          141 p1p1-TxRx-0 0000,00000001
>>>          142 p1p1-TxRx-1 0000,00000002
>>>          143 p1p1-TxRx-2 0000,00000004
>>>          144 p1p1-TxRx-3 0000,00000008
>>>          145 p1p1-TxRx-4 0000,00000010
>>>          146 p1p1-TxRx-5 0000,00000020
>>>          147 p1p1-TxRx-6 0000,00000040
>>>          148 p1p1-TxRx-7 0000,00000080
>>>          149 p1p1-TxRx-8 0000,00000100
>>>          150 p1p1-TxRx-9 0000,00000200
>>>          151 p1p1-TxRx-10 0000,00000400
>>>          152 p1p1-TxRx-11 0000,00000800
>>>          153 p1p1-TxRx-12 0000,00001000
>>>          154 p1p1-TxRx-13 0000,00002000
>>>          155 p1p1-TxRx-14 0000,00004000
>>>          156 p1p1-TxRx-15 0000,00008000
>>> xps_cpus for the 16 Tx queues
>>>          0000,00000001
>>>          0000,00000002
>>>          0000,00000004
>>>          0000,00000008
>>>          0000,00000010
>>>          0000,00000020
>>>          0000,00000040
>>>          0000,00000080
>>>          0000,00000100
>>>          0000,00000200
>>>          0000,00000400
>>>          0000,00000800
>>>          0000,00001000
>>>          0000,00002000
>>>          0000,00004000
>>>          0000,00008000
>>> memcached threads are not pinned.
>>>
>> ...
>>
>> I urge you to take the time to properly tune this host.
>>
>> linux kernel does not do automagic configuration. This is user policy.
>>
>> Documentation/networking/scaling.txt has everything you need.
>>
> Yes, tuning a system for optimal performance is difficult. Even if you
> find a performance benefit for a configuration on one system, that
> might not translate to another. In other words, if you've produced
> some code that seems to perform better than previous implementation on
> a test machine it's not enough to be satisfied with that. We want
> understand _why_ there is a difference. If you can show there is
> intrinsic benefits to the queue-pair model that we can't achieve with
> existing implementation _and_ can show there are ill effects in other
> circumstances, then you should have a good case to make changes.
>
> In the case of memcached, threads inevitably migrate off the CPU they
> were created on, the data follows the thread but the RX-queue does not
> change which means that the receive path is crosses CPUs or caches.
> But, then in the queuepair case that also means transmit completions
> are crossing CPUs. We don't normally expect that to be a good thing.
> However, transmit completion processing does not happen in the
> critical path, so if that work is being deferred to a less busy CPU
> there may benefits. That's only a theory, analysis and experimentation
> should be able to get to the root cause.
>
With regards to tuning, forgot to mention that memcached is updated to
select thethread based on incoming queue via SO_INCOMING_NAPI_ID and
is started with16 threads to match the number of RX queues.
If i do pinning of memcached threads to each of the 16 cores, i do get
similar performance as symmetric queues. But this symmetric queues 
configuration
is to support scenarios where it is not possible to pin the threads of the
application.

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2017-09-20 16:51 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-31 23:27 [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue Sridhar Samudrala
2017-09-10  1:28 ` Alexander Duyck
2017-09-11 16:48   ` Samudrala, Sridhar
2017-09-11 17:48     ` Alexander Duyck
2017-09-10  5:32 ` Tom Herbert
2017-09-10 15:19 ` Tom Herbert
2017-09-11 16:49   ` Samudrala, Sridhar
2017-09-11 22:07     ` Tom Herbert
2017-09-11 22:50       ` Alexander Duyck
2017-09-12  3:12 ` Tom Herbert
2017-09-12  3:53   ` Eric Dumazet
2017-09-12  6:27     ` Samudrala, Sridhar
2017-09-12 15:47       ` Eric Dumazet
2017-09-12 22:31         ` Samudrala, Sridhar
2017-09-12 22:53           ` Tom Herbert
2017-09-20  0:34             ` Samudrala, Sridhar
2017-09-20  0:48               ` Tom Herbert
     [not found]                 ` <fe565f14-156e-d703-c91d-d67136a0a0c0@intel.com>
2017-09-20  5:13                   ` Eric Dumazet
2017-09-20 14:18                     ` Tom Herbert
2017-09-20 16:51                       ` Samudrala, Sridhar
2017-09-20 15:30 ` Hannes Frederic Sowa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).