* [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
@ 2013-05-20 10:15 ` Eliezer Tamir
  0 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-20 10:15 UTC (permalink / raw)
  To: Dave Miller
  Cc: linux-kernel, netdev, Jesse Brandeburg, Don Skidmore,
	e1000-devel, Willem de Bruijn, Andi Kleen, HPA, Eliezer Tamir

Updated with the comments I received so far.

Thanks,
Eliezer

^ permalink raw reply	[flat|nested] 69+ messages in thread

* [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-20 10:15 ` Eliezer Tamir
@ 2013-05-20 10:16   ` Eliezer Tamir
  -1 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-20 10:16 UTC (permalink / raw)
  To: Dave Miller
  Cc: linux-kernel, netdev, Jesse Brandeburg, Don Skidmore,
	e1000-devel, Willem de Bruijn, Andi Kleen, HPA, Eliezer Tamir

Adds a new ndo_ll_poll method and the code that supports and uses it.
This method can be used by low-latency applications to busy-poll Ethernet
device queues directly from the socket code. The ip_low_latency_poll sysctl
entry controls how long to busy-poll; set it to zero to disable.
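
For illustration only (not part of the patch): a minimal userspace sketch of
how an application benefits. No application changes are required; the admin
only has to enable the sysctl. The sysctl value, port number and buffer size
below are arbitrary examples.

/* Hypothetical UDP receiver -- illustration, not part of this series.
 * Assumes a kernel built with CONFIG_INET_LL_RX_POLL, a driver that
 * implements ndo_ll_poll, and busy polling enabled by the admin, e.g.
 * with: sysctl -w net.ipv4.ip_low_latency_poll=50
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	struct sockaddr_in addr;
	char buf[2048];
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(12345);	/* arbitrary example port */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	/* After the first packet is dequeued, sk->dev_ref is set via
	 * sk_mark_ll(), so each blocking recv() below may busy poll the
	 * device queue through sk_poll_ll() before going to sleep.
	 */
	for (;;)
		if (recv(fd, buf, sizeof(buf), 0) < 0)
			break;

	return 0;
}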

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
---

 Documentation/networking/ip-sysctl.txt |    5 ++
 include/linux/netdevice.h              |    3 +
 include/linux/skbuff.h                 |    8 ++-
 include/net/ll_poll.h                  |   86 ++++++++++++++++++++++++++++++++
 include/net/sock.h                     |    3 +
 net/core/datagram.c                    |    7 +++
 net/core/skbuff.c                      |    4 +
 net/core/sock.c                        |    6 ++
 net/ipv4/Kconfig                       |   12 ++++
 net/ipv4/sysctl_net_ipv4.c             |   10 ++++
 net/socket.c                           |   24 +++++++++
 11 files changed, 165 insertions(+), 3 deletions(-)
 create mode 100644 include/net/ll_poll.h

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f98ca63..cfcf0ea 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -19,6 +19,11 @@ ip_no_pmtu_disc - BOOLEAN
 	Disable Path MTU Discovery.
 	default FALSE
 
+ip_low_latency_poll - INTEGER
+	Low latency busy poll timeout. (needs CONFIG_INET_LL_RX_POLL)
+	Approximate time in us to spin waiting for packets on the device queue.
+	default 0
+
 min_pmtu - INTEGER
 	default 552 - minimum discovered Path MTU
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a94a5a0..e25f798 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -943,6 +943,9 @@ struct net_device_ops {
 						     gfp_t gfp);
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
 #endif
+#ifdef CONFIG_INET_LL_RX_POLL
+	int			(*ndo_ll_poll)(struct napi_struct *dev);
+#endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
 	int			(*ndo_set_vf_vlan)(struct net_device *dev,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2e0ced1..4047e1e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -384,6 +384,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
  *	@dma_cookie: a cookie to one of several possible DMA operations
  *		done by skb DMA functions
+ *	@dev_ref: the NAPI struct this skb came from
  *	@secmark: security marking
  *	@mark: Generic packet mark
  *	@dropcount: total number of sk_receive_queue overflows
@@ -497,8 +498,11 @@ struct sk_buff {
 	/* 7/9 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
 
-#ifdef CONFIG_NET_DMA
-	dma_cookie_t		dma_cookie;
+#if defined CONFIG_NET_DMA || defined CONFIG_INET_LL_RX_POLL
+	union {
+		struct napi_struct	*dev_ref;
+		dma_cookie_t		dma_cookie;
+	};
 #endif
 #ifdef CONFIG_NETWORK_SECMARK
 	__u32			secmark;
diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h
new file mode 100644
index 0000000..d3c86d6
--- /dev/null
+++ b/include/net/ll_poll.h
@@ -0,0 +1,86 @@
+/*
+ * low latency network device queue flush
+ * Copyright(c) 2013 Intel Corporation.
+ * Author: Eliezer Tamir
+ *
+ * For now this depends on CONFIG_X86_TSC
+ */
+
+#ifndef _LINUX_NET_LL_POLL_H
+#define _LINUX_NET_LL_POLL_H
+
+#ifdef CONFIG_INET_LL_RX_POLL
+#include <linux/netdevice.h>
+struct napi_struct;
+extern int sysctl_net_ll_poll __read_mostly;
+
+/* return values from ndo_ll_poll */
+#define LL_FLUSH_DONE		0
+#define LL_FLUSH_FAILED		1
+#define LL_FLUSH_BUSY		2
+
+/* we don't mind a ~2.5% imprecision */
+#define TSC_MHZ (tsc_khz >> 10)
+
+static inline bool sk_valid_ll(struct sock *sk)
+{
+	return sysctl_net_ll_poll && sk->dev_ref &&
+		!need_resched() && !signal_pending(current);
+}
+
+static inline bool sk_poll_ll(struct sock *sk, int nonblock)
+{
+	struct napi_struct *napi = sk->dev_ref;
+	const struct net_device_ops *ops;
+	unsigned long end_time = TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll)
+					+ get_cycles();
+
+	if (!napi->dev->netdev_ops->ndo_ll_poll)
+		return false;
+
+	local_bh_disable();
+
+	ops = napi->dev->netdev_ops;
+	while (skb_queue_empty(&sk->sk_receive_queue) &&
+			!time_after((unsigned long)get_cycles(), end_time)) {
+		if (ops->ndo_ll_poll(napi) == LL_FLUSH_FAILED)
+			break; /* permanent failure */
+		if (nonblock)
+			break;
+	}
+
+	local_bh_enable();
+
+	return !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
+{
+	skb->dev_ref = napi;
+}
+
+static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+{
+	sk->dev_ref = skb->dev_ref;
+}
+#else /* CONFIG_INET_LL_RX_POLL */
+
+static inline bool sk_valid_ll(struct sock *sk)
+{
+	return false;
+}
+
+static inline bool sk_poll_ll(struct sock *sk, int nonblock)
+{
+	return false;
+}
+
+static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
+{
+}
+
+static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+{
+}
+#endif /* CONFIG_INET_LL_RX_POLL */
+#endif /* _LINUX_NET_LL_POLL_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 66772cf..eeabdcd4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -399,6 +399,9 @@ struct sock {
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
 	void                    (*sk_destruct)(struct sock *sk);
+#ifdef CONFIG_INET_LL_RX_POLL
+	struct napi_struct	*dev_ref;
+#endif
 };
 
 /*
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b71423d..df3dab8 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -56,6 +56,7 @@
 #include <net/sock.h>
 #include <net/tcp_states.h>
 #include <trace/events/skb.h>
+#include <net/ll_poll.h>
 
 /*
  *	Is a socket 'connection oriented' ?
@@ -201,12 +202,18 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 			} else
 				__skb_unlink(skb, queue);
 
+			sk_mark_ll(sk, skb);
 			spin_unlock_irqrestore(&queue->lock, cpu_flags);
 			*off = _off;
 			return skb;
 		}
 		spin_unlock_irqrestore(&queue->lock, cpu_flags);
 
+#ifdef CONFIG_INET_LL_RX_POLL
+		if (sk_valid_ll(sk) && sk_poll_ll(sk, flags & MSG_DONTWAIT))
+			continue;
+#endif
+
 		/* User doesn't want to wait */
 		error = -EAGAIN;
 		if (!timeo)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index af9185d..4efd230 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -739,6 +739,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->vlan_tci		= old->vlan_tci;
 
 	skb_copy_secmark(new, old);
+
+#ifdef CONFIG_INET_LL_RX_POLL
+	new->dev_ref		= old->dev_ref;
+#endif
 }
 
 /*
diff --git a/net/core/sock.c b/net/core/sock.c
index 6ba327d..d8058ce 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,6 +139,8 @@
 #include <net/tcp.h>
 #endif
 
+#include <net/ll_poll.h>
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
@@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	sk->sk_stamp = ktime_set(-1L, 0);
 
+#ifdef CONFIG_INET_LL_RX_POLL
+	sk->dev_ref	=	NULL;
+#endif
+
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 8603ca8..d209f0f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -409,6 +409,18 @@ config INET_LRO
 
 	  If unsure, say Y.
 
+config INET_LL_RX_POLL
+	bool "Low Latency Receive Poll"
+	depends on X86_TSC
+	default n
+	---help---
+	  Support Low Latency Receive Queue Poll.
+	  (For network card drivers which support this option.)
+	  When waiting for data in read or poll, call directly into the device driver
+	  to flush packets which may be pending on the device queues into the stack.
+
+	  If unsure, say N.
+
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index fa2f63f..d0fcaaf 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -25,6 +25,7 @@
 #include <net/inet_frag.h>
 #include <net/ping.h>
 #include <net/tcp_memcontrol.h>
+#include <net/ll_poll.h>
 
 static int zero;
 static int one = 1;
@@ -326,6 +327,15 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_INET_LL_RX_POLL
+	{
+		.procname	= "ip_low_latency_poll",
+		.data		= &sysctl_net_ll_poll,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
 	{
 		.procname	= "tcp_syn_retries",
 		.data		= &sysctl_tcp_syn_retries,
diff --git a/net/socket.c b/net/socket.c
index 6b94633..e7cce5b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -105,6 +105,12 @@
 #include <linux/sockios.h>
 #include <linux/atalk.h>
 
+#ifdef CONFIG_INET_LL_RX_POLL
+#include <net/ll_poll.h>
+int sysctl_net_ll_poll __read_mostly;
+EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
+#endif
+
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos);
@@ -1142,13 +1148,29 @@ EXPORT_SYMBOL(sock_create_lite);
 /* No kernel lock held - perfect */
 static unsigned int sock_poll(struct file *file, poll_table *wait)
 {
+	unsigned int poll_result;
 	struct socket *sock;
 
 	/*
 	 *      We can't return errors to poll, so it's either yes or no.
 	 */
 	sock = file->private_data;
-	return sock->ops->poll(file, sock, wait);
+
+	poll_result = sock->ops->poll(file, sock, wait);
+
+#ifdef CONFIG_INET_LL_RX_POLL
+	if (wait &&
+	    !(poll_result & (POLLRDNORM | POLLERR | POLLRDHUP | POLLHUP))) {
+		struct sock *sk = sock->sk;
+
+		/* only try once per poll */
+		if (sk_valid_ll(sk) && sk_poll_ll(sk, 1))
+			poll_result = sock->ops->poll(file, sock, wait);
+
+	}
+#endif /* CONFIG_INET_LL_RX_POLL */
+
+	return poll_result;
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)


^ permalink raw reply related	[flat|nested] 69+ messages in thread

* [PATCH v3 net-next 2/4] tcp: add TCP support for low latency receive poll.
  2013-05-20 10:15 ` Eliezer Tamir
@ 2013-05-20 10:16   ` Eliezer Tamir
  -1 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-20 10:16 UTC (permalink / raw)
  To: Dave Miller
  Cc: linux-kernel, netdev, Jesse Brandeburg, Don Skidmore,
	e1000-devel, Willem de Bruijn, Andi Kleen, HPA, Eliezer Tamir

An example of how one could add support for ndo_ll_poll to TCP.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
---

 net/ipv4/Kconfig     |   11 +++++++++++
 net/ipv4/tcp.c       |    9 +++++++++
 net/ipv4/tcp_input.c |    4 ++++
 3 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index d209f0f..8a3239e 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -421,6 +421,17 @@ config INET_LL_RX_POLL
 
 	  If unsure, say N.
 
+config INET_LL_TCP_POLL
+	bool "Low Latency TCP Receive Poll"
+	depends on INET_LL_RX_POLL
+	default n
+	---help---
+	  TCP support for Low Latency TCP Queue Poll.
+	  (For network cards that support this option.)
+	  Add support to the TCP stack for direct polling of the network card.
+
+	  If unsure, say N.
+
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dcb116d..85b8040 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <net/ll_poll.h>
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
@@ -1504,6 +1505,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			if (offset + 1 != skb->len)
 				continue;
 		}
+		sk_mark_ll(sk, skb);
 		if (tcp_hdr(skb)->fin) {
 			sk_eat_skb(sk, skb, false);
 			++seq;
@@ -1551,6 +1553,12 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct sk_buff *skb;
 	u32 urg_hole = 0;
 
+#ifdef CONFIG_INET_LL_TCP_POLL
+	if (sk_valid_ll(sk) && skb_queue_empty(&sk->sk_receive_queue)
+	    && (sk->sk_state == TCP_ESTABLISHED))
+		sk_poll_ll(sk, nonblock);
+#endif
+
 	lock_sock(sk);
 
 	err = -ENOTCONN;
@@ -1855,6 +1863,7 @@ do_prequeue:
 					break;
 				}
 			}
+			sk_mark_ll(sk, skb);
 		}
 
 		*seq += used;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b358e8c..f3f293b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -74,6 +74,7 @@
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 #include <net/netdma.h>
+#include <net/ll_poll.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -4329,6 +4330,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 				tp->copied_seq += chunk;
 				eaten = (chunk == skb->len);
 				tcp_rcv_space_adjust(sk);
+				sk_mark_ll(sk, skb);
 			}
 			local_bh_disable();
 		}
@@ -4896,6 +4898,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 		tp->ucopy.len -= chunk;
 		tp->copied_seq += chunk;
 		tcp_rcv_space_adjust(sk);
+		sk_mark_ll(sk, skb);
 	}
 
 	local_bh_disable();
@@ -4955,6 +4958,7 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
 		tp->ucopy.len -= chunk;
 		tp->copied_seq += chunk;
 		tcp_rcv_space_adjust(sk);
+		sk_mark_ll(sk, skb);
 
 		if ((tp->ucopy.len == 0) ||
 		    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||


^ permalink raw reply related	[flat|nested] 69+ messages in thread

* [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-20 10:15 ` Eliezer Tamir
@ 2013-05-20 10:16   ` Eliezer Tamir
  -1 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-20 10:16 UTC (permalink / raw)
  To: Dave Miller
  Cc: linux-kernel, netdev, Jesse Brandeburg, Don Skidmore,
	e1000-devel, Willem de Bruijn, Andi Kleen, HPA, Eliezer Tamir

Add the ixgbe driver code implementing ndo_ll_poll.
It should be easy for other drivers to do something similar
in order to enable support for CONFIG_INET_LL_RX_POLL.
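
For reference, a purely hypothetical sketch of the shape such a handler takes
in another driver; the foo_* names are placeholders and not a real API, and
the ixgbe code below is the actual implementation.

/* Sketch only -- assumes foo_q_vector, foo_clean_rx_irq and the
 * foo_qv_*_poll helpers are driver-private analogues of the ixgbe ones
 * in this patch.  LL_FLUSH_* come from <net/ll_poll.h>.  Called from
 * sk_poll_ll() with local_bh_disable()d, so it must not sleep.
 */
static int foo_ll_poll(struct napi_struct *napi)
{
	struct foo_q_vector *qv = container_of(napi, struct foo_q_vector,
					       napi);

	if (test_bit(__FOO_DOWN, &qv->adapter->state))
		return LL_FLUSH_FAILED;		/* permanent: stop polling */

	if (!foo_qv_lock_poll(qv))		/* NAPI owns the queue now */
		return LL_FLUSH_BUSY;

	/* clean a small RX budget; skbs delivered here must have been
	 * marked with skb_mark_ll(skb, napi) so the socket learns which
	 * napi_struct to poll next time
	 */
	foo_clean_rx_irq(qv, qv->rx_ring, 4);

	foo_qv_unlock_poll(qv);
	return LL_FLUSH_DONE;
}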

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
---

 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |   96 +++++++++++++++++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   80 +++++++++++++++++++--
 2 files changed, 168 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index ca93238..72be661 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -356,9 +356,105 @@ struct ixgbe_q_vector {
 	struct rcu_head rcu;	/* to avoid race with update stats on free */
 	char name[IFNAMSIZ + 9];
 
+#ifdef CONFIG_INET_LL_RX_POLL
+	unsigned int state;
+#define IXGBE_QV_STATE_IDLE        0
+#define IXGBE_QV_STATE_NAPI	   1    /* NAPI owns this QV */
+#define IXGBE_QV_STATE_POLL	   2    /* poll owns this QV */
+#define IXGBE_QV_LOCKED (IXGBE_QV_STATE_NAPI | IXGBE_QV_STATE_POLL)
+#define IXGBE_QV_STATE_NAPI_YIELD  4    /* NAPI yielded this QV */
+#define IXGBE_QV_STATE_POLL_YIELD  8    /* poll yielded this QV */
+#define IXGBE_QV_YIELD (IXGBE_QV_STATE_NAPI_YIELD | IXGBE_QV_STATE_POLL_YIELD)
+#define IXGBE_QV_USER_PEND (IXGBE_QV_STATE_POLL | IXGBE_QV_STATE_POLL_YIELD)
+	spinlock_t lock;
+#endif  /* CONFIG_INET_LL_RX_POLL */
+
 	/* for dynamic allocation of rings associated with this q_vector */
 	struct ixgbe_ring ring[0] ____cacheline_internodealigned_in_smp;
 };
+#ifdef CONFIG_INET_LL_RX_POLL
+static inline void ixgbe_qv_init_lock(struct ixgbe_q_vector *q_vector)
+{
+
+	spin_lock_init(&q_vector->lock);
+	q_vector->state = IXGBE_QV_STATE_IDLE;
+}
+
+/* called from the device poll routine to get ownership of a q_vector */
+static inline int ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
+{
+	int rc = true;
+	spin_lock(&q_vector->lock);
+	if (q_vector->state & IXGBE_QV_LOCKED) {
+		WARN_ON(q_vector->state & IXGBE_QV_STATE_NAPI);
+		q_vector->state |= IXGBE_QV_STATE_NAPI_YIELD;
+		rc = false;
+	} else
+		/* we don't care if someone yielded */
+		q_vector->state = IXGBE_QV_STATE_NAPI;
+	spin_unlock(&q_vector->lock);
+	return rc;
+}
+
+/* returns true if someone tried to get the qv while napi had it */
+static inline int ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector)
+{
+	int rc = false;
+	spin_lock(&q_vector->lock);
+	WARN_ON(q_vector->state & (IXGBE_QV_STATE_POLL |
+			       IXGBE_QV_STATE_NAPI_YIELD));
+
+	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
+		rc = true;
+	q_vector->state = IXGBE_QV_STATE_IDLE;
+	spin_unlock(&q_vector->lock);
+	return rc;
+}
+
+/* called from ixgbe_low_latency_poll() */
+static inline int ixgbe_qv_lock_poll(struct ixgbe_q_vector *q_vector)
+{
+	int rc = true;
+	spin_lock_bh(&q_vector->lock);
+	if ((q_vector->state & IXGBE_QV_LOCKED)) {
+		q_vector->state |= IXGBE_QV_STATE_POLL_YIELD;
+		rc = false;
+	} else
+		/* preserve yield marks */
+		q_vector->state |= IXGBE_QV_STATE_POLL;
+	spin_unlock_bh(&q_vector->lock);
+	return rc;
+}
+
+/* returns true if someone tried to get the qv while it was locked */
+static inline int ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
+{
+	int rc = false;
+	spin_lock_bh(&q_vector->lock);
+	WARN_ON(q_vector->state & (IXGBE_QV_STATE_NAPI));
+
+	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
+		rc = true;
+	q_vector->state = IXGBE_QV_STATE_IDLE;
+	spin_unlock_bh(&q_vector->lock);
+	return rc;
+}
+
+/* true if a socket is polling, even if it did not get the lock */
+static inline int ixgbe_qv_ll_polling(struct ixgbe_q_vector *q_vector)
+{
+	WARN_ON(!(q_vector->state & IXGBE_QV_LOCKED));
+	return q_vector->state & IXGBE_QV_USER_PEND;
+}
+#else
+#define ixgbe_qv_init_lock(qv) do {} while (0)
+#define ixgbe_qv_lock_napi(qv) 1
+#define ixgbe_qv_unlock_napi(qv) 0
+#define ixgbe_qv_lock_poll(qv) 0
+#define ixgbe_qv_unlock_poll(qv) 0
+#define ixgbe_qv_ll_polling(qv) 0
+#endif /* CONFIG_INET_LL_RX_POLL */
+
 #ifdef CONFIG_IXGBE_HWMON
 
 #define IXGBE_HWMON_TYPE_LOC		0
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d30fbdd..628b7b1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -47,6 +47,7 @@
 #include <linux/if_bridge.h>
 #include <linux/prefetch.h>
 #include <scsi/fc/fc_fcoe.h>
+#include <net/ll_poll.h>
 
 #include "ixgbe.h"
 #include "ixgbe_common.h"
@@ -144,6 +145,14 @@ static int debug = -1;
 module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
 
+#ifdef CONFIG_INET_LL_RX_POLL
+static int allow_unsafe_removal;
+static int unsafe_to_remove;
+module_param(allow_unsafe_removal, int, 0);
+MODULE_PARM_DESC(allow_unsafe_removal,
+	"Allow removal of module after low latency receive was used");
+#endif
+
 MODULE_AUTHOR("Intel Corporation, <linux.nics@intel.com>");
 MODULE_DESCRIPTION("Intel(R) 10 Gigabit PCI Express Network Driver");
 MODULE_LICENSE("GPL");
@@ -1504,7 +1513,9 @@ static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector,
 {
 	struct ixgbe_adapter *adapter = q_vector->adapter;
 
-	if (!(adapter->flags & IXGBE_FLAG_IN_NETPOLL))
+	if (ixgbe_qv_ll_polling(q_vector))
+		netif_receive_skb(skb);
+	else if (!(adapter->flags & IXGBE_FLAG_IN_NETPOLL))
 		napi_gro_receive(&q_vector->napi, skb);
 	else
 		netif_rx(skb);
@@ -1892,9 +1903,9 @@ dma_sync:
  * expensive overhead for IOMMU access this provides a means of avoiding
  * it by maintaining the mapping of the page to the syste.
  *
- * Returns true if all work is completed without reaching budget
+ * Returns the amount of work completed
  **/
-static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
+static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			       struct ixgbe_ring *rx_ring,
 			       const int budget)
 {
@@ -1976,6 +1987,7 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		}
 
 #endif /* IXGBE_FCOE */
+		skb_mark_ll(skb, &q_vector->napi);
 		ixgbe_rx_skb(q_vector, skb);
 
 		/* update budget accounting */
@@ -1992,9 +2004,45 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 	if (cleaned_count)
 		ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
 
-	return (total_rx_packets < budget);
+	return total_rx_packets;
 }
 
+#ifdef CONFIG_INET_LL_RX_POLL
+/* must be called with local_bh_disable()d */
+static int ixgbe_low_latency_recv(struct napi_struct *napi)
+{
+	struct ixgbe_q_vector *q_vector =
+			container_of(napi, struct ixgbe_q_vector, napi);
+	struct ixgbe_adapter *adapter = q_vector->adapter;
+	struct ixgbe_ring  *ring;
+	int found;
+
+	if (unlikely(!unsafe_to_remove)) {
+		unsafe_to_remove = 1;
+		if (!allow_unsafe_removal) {
+			pr_info("module may no longer be removed\n");
+			try_module_get(THIS_MODULE);
+		}
+	}
+
+	if (test_bit(__IXGBE_DOWN, &adapter->state))
+		return LL_FLUSH_FAILED;
+
+	if (!ixgbe_qv_lock_poll(q_vector))
+		return LL_FLUSH_BUSY;
+
+	ixgbe_for_each_ring(ring, q_vector->rx) {
+		found = ixgbe_clean_rx_irq(q_vector, ring, 4);
+		if (found)
+			break;
+	}
+
+	ixgbe_qv_unlock_poll(q_vector);
+
+	return LL_FLUSH_DONE;
+}
+#endif	/* CONFIG_INET_LL_RX_POLL */
+
 /**
  * ixgbe_configure_msix - Configure MSI-X hardware
  * @adapter: board private structure
@@ -2550,6 +2598,9 @@ int ixgbe_poll(struct napi_struct *napi, int budget)
 	ixgbe_for_each_ring(ring, q_vector->tx)
 		clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring);
 
+	if (!ixgbe_qv_lock_napi(q_vector))
+		return budget;
+
 	/* attempt to distribute budget to each queue fairly, but don't allow
 	 * the budget to go below 1 because we'll exit polling */
 	if (q_vector->rx.count > 1)
@@ -2558,9 +2609,10 @@ int ixgbe_poll(struct napi_struct *napi, int budget)
 		per_ring_budget = budget;
 
 	ixgbe_for_each_ring(ring, q_vector->rx)
-		clean_complete &= ixgbe_clean_rx_irq(q_vector, ring,
-						     per_ring_budget);
+		clean_complete &= (ixgbe_clean_rx_irq(q_vector, ring,
+				   per_ring_budget) < per_ring_budget);
 
+	ixgbe_qv_unlock_napi(q_vector);
 	/* If all work not completed, return budget and keep polling */
 	if (!clean_complete)
 		return budget;
@@ -3747,16 +3799,25 @@ static void ixgbe_napi_enable_all(struct ixgbe_adapter *adapter)
 {
 	int q_idx;
 
-	for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++)
+	for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++) {
+		ixgbe_qv_init_lock(adapter->q_vector[q_idx]);
 		napi_enable(&adapter->q_vector[q_idx]->napi);
+	}
 }
 
 static void ixgbe_napi_disable_all(struct ixgbe_adapter *adapter)
 {
 	int q_idx;
 
-	for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++)
+	local_bh_disable(); /* for ixgbe_qv_lock_napi() */
+	for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++) {
 		napi_disable(&adapter->q_vector[q_idx]->napi);
+		while (!ixgbe_qv_lock_napi(adapter->q_vector[q_idx])) {
+			pr_info("QV %d locked\n", q_idx);
+			mdelay(1);
+		}
+	}
+	local_bh_enable();
 }
 
 #ifdef CONFIG_IXGBE_DCB
@@ -7177,6 +7238,9 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ixgbe_netpoll,
 #endif
+#ifdef CONFIG_INET_LL_RX_POLL
+	.ndo_ll_poll		= ixgbe_low_latency_recv,
+#endif
 #ifdef IXGBE_FCOE
 	.ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get,
 	.ndo_fcoe_ddp_target = ixgbe_fcoe_ddp_target,


^ permalink raw reply related	[flat|nested] 69+ messages in thread

* [PATCH v3 net-next 4/4] ixgbe: add extra stats for ndo_ll_poll
  2013-05-20 10:15 ` Eliezer Tamir
@ 2013-05-20 10:16   ` Eliezer Tamir
  -1 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-20 10:16 UTC (permalink / raw)
  To: Dave Miller
  Cc: linux-kernel, netdev, Jesse Brandeburg, Don Skidmore,
	e1000-devel, Willem de Bruijn, Andi Kleen, HPA, Eliezer Tamir

Add additional statistics to the ixgbe driver for ndo_ll_poll,
defined under LL_EXTENDED_STATS.
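
These show up as additional per-queue strings in the ethtool statistics
(tx_q_N_napi_yield, rx_q_N_ll_poll_yield, rx_q_N_misses, rx_q_N_cleaned),
so they can be read with something like the following; the interface name
is just an example:

  ethtool -S eth2 | grep -E 'yield|misses|cleaned'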

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
---

 drivers/net/ethernet/intel/ixgbe/ixgbe.h         |   14 ++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   40 ++++++++++++++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |    6 +++
 3 files changed, 60 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 72be661..2a7de7c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -52,6 +52,9 @@
 #include <linux/dca.h>
 #endif
 
+#ifdef CONFIG_INET_LL_RX_POLL
+#define LL_EXTENDED_STATS
+#endif  /* CONFIG_INET_LL_RX_POLL */
 /* common prefix used by pr_<> macros */
 #undef pr_fmt
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -182,6 +185,11 @@ struct ixgbe_rx_buffer {
 struct ixgbe_queue_stats {
 	u64 packets;
 	u64 bytes;
+#ifdef LL_EXTENDED_STATS
+	u64 yields;
+	u64 misses;
+	u64 cleaned;
+#endif  /* LL_EXTENDED_STATS */
 };
 
 struct ixgbe_tx_queue_stats {
@@ -389,6 +397,9 @@ static inline int ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
 		WARN_ON(q_vector->state & IXGBE_QV_STATE_NAPI);
 		q_vector->state |= IXGBE_QV_STATE_NAPI_YIELD;
 		rc = false;
+#ifdef LL_EXTENDED_STATS
+		q_vector->tx.ring->stats.yields++;
+#endif
 	} else
 		/* we don't care if someone yielded */
 		q_vector->state = IXGBE_QV_STATE_NAPI;
@@ -419,6 +430,9 @@ static inline int ixgbe_qv_lock_poll(struct ixgbe_q_vector *q_vector)
 	if ((q_vector->state & IXGBE_QV_LOCKED)) {
 		q_vector->state |= IXGBE_QV_STATE_POLL_YIELD;
 		rc = false;
+#ifdef LL_EXTENDED_STATS
+		q_vector->rx.ring->stats.yields++;
+#endif
 	} else
 		/* preserve yield marks */
 		q_vector->state |= IXGBE_QV_STATE_POLL;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index d375472..24e2e7a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -1054,6 +1054,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i] = 0;
 			data[i+1] = 0;
 			i += 2;
+#ifdef LL_EXTENDED_STATS
+			data[i] = 0;
+			data[i+1] = 0;
+			data[i+2] = 0;
+			i += 3;
+#endif
 			continue;
 		}
 
@@ -1063,6 +1069,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i+1] = ring->stats.bytes;
 		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
 		i += 2;
+#ifdef LL_EXTENDED_STATS
+		data[i] = ring->stats.yields;
+		data[i+1] = ring->stats.misses;
+		data[i+2] = ring->stats.cleaned;
+		i += 3;
+#endif
 	}
 	for (j = 0; j < IXGBE_NUM_RX_QUEUES; j++) {
 		ring = adapter->rx_ring[j];
@@ -1070,6 +1082,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i] = 0;
 			data[i+1] = 0;
 			i += 2;
+#ifdef LL_EXTENDED_STATS
+			data[i] = 0;
+			data[i+1] = 0;
+			data[i+2] = 0;
+			i += 3;
+#endif
 			continue;
 		}
 
@@ -1079,6 +1097,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i+1] = ring->stats.bytes;
 		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
 		i += 2;
+#ifdef LL_EXTENDED_STATS
+		data[i] = ring->stats.yields;
+		data[i+1] = ring->stats.misses;
+		data[i+2] = ring->stats.cleaned;
+		i += 3;
+#endif
 	}
 
 	for (j = 0; j < IXGBE_MAX_PACKET_BUFFERS; j++) {
@@ -1115,12 +1139,28 @@ static void ixgbe_get_strings(struct net_device *netdev, u32 stringset,
 			p += ETH_GSTRING_LEN;
 			sprintf(p, "tx_queue_%u_bytes", i);
 			p += ETH_GSTRING_LEN;
+#ifdef LL_EXTENDED_STATS
+			sprintf(p, "tx_q_%u_napi_yield", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "tx_q_%u_misses", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "tx_q_%u_cleaned", i);
+			p += ETH_GSTRING_LEN;
+#endif /* LL_EXTENDED_STATS */
 		}
 		for (i = 0; i < IXGBE_NUM_RX_QUEUES; i++) {
 			sprintf(p, "rx_queue_%u_packets", i);
 			p += ETH_GSTRING_LEN;
 			sprintf(p, "rx_queue_%u_bytes", i);
 			p += ETH_GSTRING_LEN;
+#ifdef LL_EXTENDED_STATS
+			sprintf(p, "rx_q_%u_ll_poll_yield", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "rx_q_%u_misses", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "rx_q_%u_cleaned", i);
+			p += ETH_GSTRING_LEN;
+#endif /* LL_EXTENDED_STATS */
 		}
 		for (i = 0; i < IXGBE_MAX_PACKET_BUFFERS; i++) {
 			sprintf(p, "tx_pb_%u_pxon", i);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 628b7b1..1b2214b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2033,6 +2033,12 @@ static int ixgbe_low_latency_recv(struct napi_struct *napi)
 
 	ixgbe_for_each_ring(ring, q_vector->rx) {
 		found = ixgbe_clean_rx_irq(q_vector, ring, 4);
+#ifdef LL_EXTENDED_STATS
+		if (found)
+			ring->stats.cleaned += found;
+		else
+			ring->stats.misses++;
+#endif
 		if (found)
 			break;
 	}


^ permalink raw reply related	[flat|nested] 69+ messages in thread

* [PATCH v3 net-next 4/4] ixgbe: add extra stats for ndo_ll_poll
@ 2013-05-20 10:16   ` Eliezer Tamir
  0 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-20 10:16 UTC (permalink / raw)
  To: Dave Miller
  Cc: Willem de Bruijn, e1000-devel, netdev, HPA, linux-kernel,
	Jesse Brandeburg, Andi Kleen, Eliezer Tamir

Add additional statistics to the ixgbe driver for ndo_ll_poll,
defined under LL_EXTENDED_STATS.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
---

 drivers/net/ethernet/intel/ixgbe/ixgbe.h         |   14 ++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   40 ++++++++++++++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |    6 +++
 3 files changed, 60 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 72be661..2a7de7c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -52,6 +52,9 @@
 #include <linux/dca.h>
 #endif
 
+#ifdef CONFIG_INET_LL_RX_POLL
+#define LL_EXTENDED_STATS
+#endif  /* CONFIG_INET_LL_RX_POLL */
 /* common prefix used by pr_<> macros */
 #undef pr_fmt
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -182,6 +185,11 @@ struct ixgbe_rx_buffer {
 struct ixgbe_queue_stats {
 	u64 packets;
 	u64 bytes;
+#ifdef LL_EXTENDED_STATS
+	u64 yields;
+	u64 misses;
+	u64 cleaned;
+#endif  /* LL_EXTENDED_STATS */
 };
 
 struct ixgbe_tx_queue_stats {
@@ -389,6 +397,9 @@ static inline int ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
 		WARN_ON(q_vector->state & IXGBE_QV_STATE_NAPI);
 		q_vector->state |= IXGBE_QV_STATE_NAPI_YIELD;
 		rc = false;
+#ifdef LL_EXTENDED_STATS
+		q_vector->tx.ring->stats.yields++;
+#endif
 	} else
 		/* we don't care if someone yielded */
 		q_vector->state = IXGBE_QV_STATE_NAPI;
@@ -419,6 +430,9 @@ static inline int ixgbe_qv_lock_poll(struct ixgbe_q_vector *q_vector)
 	if ((q_vector->state & IXGBE_QV_LOCKED)) {
 		q_vector->state |= IXGBE_QV_STATE_POLL_YIELD;
 		rc = false;
+#ifdef LL_EXTENDED_STATS
+		q_vector->rx.ring->stats.yields++;
+#endif
 	} else
 		/* preserve yield marks */
 		q_vector->state |= IXGBE_QV_STATE_POLL;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index d375472..24e2e7a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -1054,6 +1054,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i] = 0;
 			data[i+1] = 0;
 			i += 2;
+#ifdef LL_EXTENDED_STATS
+			data[i] = 0;
+			data[i+1] = 0;
+			data[i+2] = 0;
+			i += 3;
+#endif
 			continue;
 		}
 
@@ -1063,6 +1069,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i+1] = ring->stats.bytes;
 		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
 		i += 2;
+#ifdef LL_EXTENDED_STATS
+		data[i] = ring->stats.yields;
+		data[i+1] = ring->stats.misses;
+		data[i+2] = ring->stats.cleaned;
+		i += 3;
+#endif
 	}
 	for (j = 0; j < IXGBE_NUM_RX_QUEUES; j++) {
 		ring = adapter->rx_ring[j];
@@ -1070,6 +1082,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i] = 0;
 			data[i+1] = 0;
 			i += 2;
+#ifdef LL_EXTENDED_STATS
+			data[i] = 0;
+			data[i+1] = 0;
+			data[i+2] = 0;
+			i += 3;
+#endif
 			continue;
 		}
 
@@ -1079,6 +1097,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i+1] = ring->stats.bytes;
 		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
 		i += 2;
+#ifdef LL_EXTENDED_STATS
+		data[i] = ring->stats.yields;
+		data[i+1] = ring->stats.misses;
+		data[i+2] = ring->stats.cleaned;
+		i += 3;
+#endif
 	}
 
 	for (j = 0; j < IXGBE_MAX_PACKET_BUFFERS; j++) {
@@ -1115,12 +1139,28 @@ static void ixgbe_get_strings(struct net_device *netdev, u32 stringset,
 			p += ETH_GSTRING_LEN;
 			sprintf(p, "tx_queue_%u_bytes", i);
 			p += ETH_GSTRING_LEN;
+#ifdef LL_EXTENDED_STATS
+			sprintf(p, "tx_q_%u_napi_yield", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "tx_q_%u_misses", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "tx_q_%u_cleaned", i);
+			p += ETH_GSTRING_LEN;
+#endif /* LL_EXTENDED_STATS */
 		}
 		for (i = 0; i < IXGBE_NUM_RX_QUEUES; i++) {
 			sprintf(p, "rx_queue_%u_packets", i);
 			p += ETH_GSTRING_LEN;
 			sprintf(p, "rx_queue_%u_bytes", i);
 			p += ETH_GSTRING_LEN;
+#ifdef LL_EXTENDED_STATS
+			sprintf(p, "rx_q_%u_ll_poll_yield", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "rx_q_%u_misses", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "rx_q_%u_cleaned", i);
+			p += ETH_GSTRING_LEN;
+#endif /* LL_EXTENDED_STATS */
 		}
 		for (i = 0; i < IXGBE_MAX_PACKET_BUFFERS; i++) {
 			sprintf(p, "tx_pb_%u_pxon", i);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 628b7b1..1b2214b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2033,6 +2033,12 @@ static int ixgbe_low_latency_recv(struct napi_struct *napi)
 
 	ixgbe_for_each_ring(ring, q_vector->rx) {
 		found = ixgbe_clean_rx_irq(q_vector, ring, 4);
+#ifdef LL_EXTENDED_STATS
+		if (found)
+			ring->stats.cleaned += found;
+		else
+			ring->stats.misses++;
+#endif
 		if (found)
 			break;
 	}
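
For illustration (not part of the patch): with the kernel built with
CONFIG_INET_LL_RX_POLL, the per-queue counters defined above should show
up in the regular ethtool statistics. The interface name and the values
below are made up:

  $ ethtool -S eth2 | grep -E 'yield|misses|cleaned'
       tx_q_0_napi_yield: 0
       tx_q_0_misses: 0
       tx_q_0_cleaned: 0
       rx_q_0_ll_poll_yield: 12
       rx_q_0_misses: 3450
       rx_q_0_cleaned: 8921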



^ permalink raw reply related	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 2/4] tcp: add TCP support for low latency receive poll.
  2013-05-20 10:16   ` Eliezer Tamir
  (?)
@ 2013-05-20 13:49   ` Eric Dumazet
  2013-05-20 14:59       ` Eliezer Tamir
  -1 siblings, 1 reply; 69+ messages in thread
From: Eric Dumazet @ 2013-05-20 13:49 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On Mon, 2013-05-20 at 13:16 +0300, Eliezer Tamir wrote:
>  
> +config INET_LL_TCP_POLL
> +	bool "Low Latency TCP Receive Poll"
> +	depends on INET_LL_RX_POLL
> +	default n
> +	---help---
> +	  TCP support for Low Latency TCP Queue Poll.
> +	  (For network cards that support this option.)
> +	  Add support to the TCP stack for direct polling of the network card.
> +
> +	  If unsure, say N.
> +

Oh well, code is small and already guarded by CONFIG_INET_LL_RX_POLL




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 2/4] tcp: add TCP support for low latency receive poll.
  2013-05-20 13:49   ` Eric Dumazet
@ 2013-05-20 14:59       ` Eliezer Tamir
  0 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-20 14:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

on 20/05/2013 16:49, Eric Dumazet wrote:
> On Mon, 2013-05-20 at 13:16 +0300, Eliezer Tamir wrote:
>>
>> +config INET_LL_TCP_POLL
>> +	bool "Low Latency TCP Receive Poll"
>> +	depends on INET_LL_RX_POLL
>> +	default n
>> +	---help---
>> +	  TCP support for Low Latency TCP Queue Poll.
>> +	  (For network cards that support this option.)
>> +	  Add support to the TCP stack for direct polling of the network card.
>> +
>> +	  If unsure, say N.
>> +
>
> Oh well, code is small and already guarded by CONFIG_INET_LL_RX_POLL

I will remove the separate config option.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 2/4] tcp: add TCP support for low latency receive poll.
@ 2013-05-20 14:59       ` Eliezer Tamir
  0 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-20 14:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Willem de Bruijn, e1000-devel, netdev, HPA, Jesse Brandeburg,
	linux-kernel, Andi Kleen, Eliezer Tamir, Dave Miller

on 20/05/2013 16:49, Eric Dumazet wrote:
> On Mon, 2013-05-20 at 13:16 +0300, Eliezer Tamir wrote:
>>
>> +config INET_LL_TCP_POLL
>> +	bool "Low Latency TCP Receive Poll"
>> +	depends on INET_LL_RX_POLL
>> +	default n
>> +	---help---
>> +	  TCP support for Low Latency TCP Queue Poll.
>> +	  (For network cards that support this option.)
>> +	  Add support to the TCP stack for direct polling of the network card.
>> +
>> +	  If unsure, say N.
>> +
>
> Oh well, code is small and already guarded by CONFIG_INET_LL_RX_POLL

I will remove the separate config option.



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-20 10:16   ` Eliezer Tamir
  (?)
@ 2013-05-20 15:29   ` Eric Dumazet
  2013-05-20 19:40     ` David Miller
  2013-05-21  7:28     ` Eliezer Tamir
  -1 siblings, 2 replies; 69+ messages in thread
From: Eric Dumazet @ 2013-05-20 15:29 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On Mon, 2013-05-20 at 13:16 +0300, Eliezer Tamir wrote:
> Adds a new ndo_ll_poll method and the code that supports and uses it.
> This method can be used by low latency applications to busy poll ethernet
> device queues directly from the socket code. The ip_low_latency_poll sysctl
> entry controls how many cycles to poll. Set to zero to disable.
> 

This changelog lacks a lot of information, see below.

Part of this information was in your 0/4 text, but it won't be included
in the git tree.

> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Tested-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
> ---
> 

> +
> +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
> +{
> +	skb->dev_ref = napi;
> +}
> +
> +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
> +{
> +	sk->dev_ref = skb->dev_ref;
> +}

I do not see why it's safe to keep a pointer to a napi object without
taking a reference, or something to prevent the object from being removed.

Using a genid might be enough. (some counter incremented every time a
napi is dismantled)

Alternatively, use a napi_id instead of a pointer.
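
For illustration only, a minimal sketch of the napi_id alternative
(none of these names exist in the patch set, and the extra fields on
napi_struct are assumptions):

/* Give each napi instance a small unique id and keep it in a global
 * hash protected by RCU (uses <linux/hashtable.h>).  skb/sk then carry
 * the id instead of a raw pointer, so a stale id simply fails the
 * lookup instead of dereferencing a freed napi.
 */
static DEFINE_HASHTABLE(napi_hash, 8);

/* assumes napi_struct grows an 'unsigned int napi_id' and a
 * 'struct hlist_node hash_node' member */
static struct napi_struct *napi_by_id(unsigned int napi_id)
{
	struct napi_struct *napi;

	/* caller holds rcu_read_lock() */
	hash_for_each_possible_rcu(napi_hash, napi, hash_node, napi_id)
		if (napi->napi_id == napi_id)
			return napi;
	return NULL;	/* napi was dismantled; fall back to sleeping */
}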




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-20 15:29   ` Eric Dumazet
@ 2013-05-20 19:40     ` David Miller
  2013-05-21  7:28     ` Eliezer Tamir
  1 sibling, 0 replies; 69+ messages in thread
From: David Miller @ 2013-05-20 19:40 UTC (permalink / raw)
  To: eric.dumazet
  Cc: eliezer.tamir, linux-kernel, netdev, jesse.brandeburg,
	donald.c.skidmore, e1000-devel, willemb, andi, hpa, eliezer

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 20 May 2013 08:29:24 -0700

> Part of this information was in your 0/4 text, but it won't be included
> in the git tree.

Yes it will, in the merge commit I make when I merge this stuff in.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-20 10:16   ` Eliezer Tamir
@ 2013-05-20 20:20     ` Or Gerlitz
  -1 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-20 20:20 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On Mon, May 20, 2013 at 1:16 PM, Eliezer Tamir
<eliezer.tamir@linux.intel.com> wrote:
> Add the ixgbe driver code implementing ndo_ll_poll.
> It should be easy for other drivers to do something similar
> in order to enable support for CONFIG_INET_LL_RX_POLL

I am not sure,


> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
[...]
> @@ -144,6 +145,14 @@ static int debug = -1;
>  module_param(debug, int, 0);
>  MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
>
> +#ifdef CONFIG_INET_LL_RX_POLL
> +static int allow_unsafe_removal;
> +static int unsafe_to_remove;
> +module_param(allow_unsafe_removal, int, 0);
> +MODULE_PARM_DESC(allow_unsafe_removal,
> +       "Allow removal of module after low latency receive was used");
> +#endif

what?!

[...]

> +#ifdef CONFIG_INET_LL_RX_POLL
> +/* must be called with local_bh_disable()d */
> +static int ixgbe_low_latency_recv(struct napi_struct *napi)
> +{
> +       struct ixgbe_q_vector *q_vector =
> +                       container_of(napi, struct ixgbe_q_vector, napi);
> +       struct ixgbe_adapter *adapter = q_vector->adapter;
> +       struct ixgbe_ring  *ring;
> +       int found;
> +
> +       if (unlikely(!unsafe_to_remove)) {
> +               unsafe_to_remove = 1;
> +               if (!allow_unsafe_removal) {
> +                       pr_info("module may no longer be removed\n");
> +                       try_module_get(THIS_MODULE);
> +               }
> +       }

Guys, so what is going on here? You were asking to put this series in
net-next, and you expect every other driver implementing this ndo to
follow this undocumented hack? Or maybe this code was just left here
by mistake from previous implementations and just needs to be
removed? Please clarify.

Or.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-20 20:20     ` Or Gerlitz
  0 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-20 20:20 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Willem de Bruijn, e1000-devel, netdev, HPA, Jesse Brandeburg,
	linux-kernel, Andi Kleen, Eliezer Tamir, Dave Miller

On Mon, May 20, 2013 at 1:16 PM, Eliezer Tamir
<eliezer.tamir@linux.intel.com> wrote:
> Add the ixgbe driver code implementing ndo_ll_poll.
> It should be easy for other drivers to do something similar
> in order to enable support for CONFIG_INET_LL_RX_POLL

I am not sure,


> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
[...]
> @@ -144,6 +145,14 @@ static int debug = -1;
>  module_param(debug, int, 0);
>  MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
>
> +#ifdef CONFIG_INET_LL_RX_POLL
> +static int allow_unsafe_removal;
> +static int unsafe_to_remove;
> +module_param(allow_unsafe_removal, int, 0);
> +MODULE_PARM_DESC(allow_unsafe_removal,
> +       "Allow removal of module after low latency receive was used");
> +#endif

what?!

[...]

> +#ifdef CONFIG_INET_LL_RX_POLL
> +/* must be called with local_bh_disable()d */
> +static int ixgbe_low_latency_recv(struct napi_struct *napi)
> +{
> +       struct ixgbe_q_vector *q_vector =
> +                       container_of(napi, struct ixgbe_q_vector, napi);
> +       struct ixgbe_adapter *adapter = q_vector->adapter;
> +       struct ixgbe_ring  *ring;
> +       int found;
> +
> +       if (unlikely(!unsafe_to_remove)) {
> +               unsafe_to_remove = 1;
> +               if (!allow_unsafe_removal) {
> +                       pr_info("module may no longer be removed\n");
> +                       try_module_get(THIS_MODULE);
> +               }
> +       }

Guys, so what is going on here? You were asking to put this series in
net-next, and you expect every other driver implementing this ndo to
follow this undocumented hack? Or maybe this code was just left here
by mistake from previous implementations and just needs to be
removed? Please clarify.

Or.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-20 20:20     ` Or Gerlitz
  (?)
@ 2013-05-20 20:33     ` Andi Kleen
  2013-05-20 20:42       ` Or Gerlitz
  -1 siblings, 1 reply; 69+ messages in thread
From: Andi Kleen @ 2013-05-20 20:33 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

> Guys, so what is going on here? You were asking to put this series in
> net-next, and you expect every other driver implementing this ndo to
> follow this undocumented hack? Or maybe this code was just left here
> by mistake from previous implementations and just needs to be
> removed? Please clarify.

This is discussed in 0/x

-Andi

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-20 20:33     ` Andi Kleen
@ 2013-05-20 20:42       ` Or Gerlitz
  2013-05-20 21:01         ` Andi Kleen
  0 siblings, 1 reply; 69+ messages in thread
From: Or Gerlitz @ 2013-05-20 20:42 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Willem de Bruijn, Eliezer Tamir, e1000-devel, netdev, HPA,
	Jesse Brandeburg, linux-kernel, Eliezer Tamir, Dave Miller


On Mon, May 20, 2013 at 11:33 PM, Andi Kleen <andi@firstfloor.org> wrote:

>
> This is discussed in 0/x
>

I am not with you, V3's cover letter is empty, and in V2's cover letter I
don't see that


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-20 20:42       ` Or Gerlitz
@ 2013-05-20 21:01         ` Andi Kleen
  2013-05-21  6:23             ` Or Gerlitz
  0 siblings, 1 reply; 69+ messages in thread
From: Andi Kleen @ 2013-05-20 21:01 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Andi Kleen, Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	HPA, Eliezer Tamir

On Mon, May 20, 2013 at 11:42:37PM +0300, Or Gerlitz wrote:
> On Mon, May 20, 2013 at 11:33 PM, Andi Kleen <andi@firstfloor.org> wrote:
> 
> >
> > This is discussed in 0/x
> >
> 
> I am not with you, V3's cover letter is empty, and in V2's cover letter I
> don't see that

It was here. Looks like Eliezer didn't reuse the full cover letter.

http://www.gossamer-threads.com/lists/linux/kernel/1715294?page=last


-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-20 21:01         ` Andi Kleen
@ 2013-05-21  6:23             ` Or Gerlitz
  0 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21  6:23 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	HPA, Eliezer Tamir

On Tue, May 21, 2013 at 12:01 AM, Andi Kleen <andi@firstfloor.org> wrote:
>
> On Mon, May 20, 2013 at 11:42:37PM +0300, Or Gerlitz wrote:
> > On Mon, May 20, 2013 at 11:33 PM, Andi Kleen <andi@firstfloor.org> wrote:
> >
> > >
> > > This is discussed in 0/x
> > >
> >
> > I am not with you, V3's cover letter is empty, and in V2's cover letter I
> > don't see that
>
> It was here. Looks like Eliezer didn't reuse the full cover letter.
> http://www.gossamer-threads.com/lists/linux/kernel/1715294?page=last
>


Grepping for "ixgbe" in the link you sent only brings the following hits:

Patch 3 shows how this method would be implemented for the ixgbe driver.
Patch 4 adds statistics to the ixgbe driver for ndo_ll_poll events.

Does "shows how this method would be implemented" mean that patch #4
is just RFC or proof-of-concept code and is not actually submitted
for acceptance?

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  6:23             ` Or Gerlitz
  0 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21  6:23 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Willem de Bruijn, Eliezer Tamir, e1000-devel, netdev, HPA,
	Jesse Brandeburg, linux-kernel, Eliezer Tamir, Dave Miller

On Tue, May 21, 2013 at 12:01 AM, Andi Kleen <andi@firstfloor.org> wrote:
>
> On Mon, May 20, 2013 at 11:42:37PM +0300, Or Gerlitz wrote:
> > On Mon, May 20, 2013 at 11:33 PM, Andi Kleen <andi@firstfloor.org> wrote:
> >
> > >
> > > This is discussed in 0/x
> > >
> >
> > I am not with you, V3's cover letter is empty, and in V2's cover letter I
> > don't see that
>
> It was here. Looks like Eliezer didn't reuse the full cover letter.
> http://www.gossamer-threads.com/lists/linux/kernel/1715294?page=last
>


Grepping for "ixgbe" in the link you sent only brings the following hits:

Patch 3 shows how this method would be implemented for the ixgbe driver.
Patch 4 adds statistics to the ixgbe driver for ndo_ll_poll events.

Does "shows how this method would be implemented" mean that patch #4
is just RFC or proof-of-concept code and is not actually submitted
for acceptance?


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-20 20:20     ` Or Gerlitz
@ 2013-05-21  6:54       ` Eliezer Tamir
  -1 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21  6:54 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On 20/05/2013 23:20, Or Gerlitz wrote:
> On Mon, May 20, 2013 at 1:16 PM, Eliezer Tamir
> <eliezer.tamir@linux.intel.com> wrote:
>> Add the ixgbe driver code implementing ndo_ll_poll.
>> It should be easy for other drivers to do something similar
>> in order to enable support for CONFIG_INET_LL_RX_POLL
>
> I am not sure,

Willem ported this to <some undisclosed HW that they use at Google>, his 
feedback was that it was not a major effort.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  6:54       ` Eliezer Tamir
  0 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21  6:54 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Willem de Bruijn, e1000-devel, netdev, HPA, Jesse Brandeburg,
	linux-kernel, Andi Kleen, Eliezer Tamir, Dave Miller

On 20/05/2013 23:20, Or Gerlitz wrote:
> On Mon, May 20, 2013 at 1:16 PM, Eliezer Tamir
> <eliezer.tamir@linux.intel.com> wrote:
>> Add the ixgbe driver code implementing ndo_ll_poll.
>> It should be easy for other drivers to do something similar
>> in order to enable support for CONFIG_INET_LL_RX_POLL
>
> I am not sure,

Willem ported this to <some undisclosed HW that they use at Google>, his 
feedback was that it was not a major effort.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  6:54       ` Eliezer Tamir
@ 2013-05-21  7:06         ` Eilon Greenstein
  -1 siblings, 0 replies; 69+ messages in thread
From: Eilon Greenstein @ 2013-05-21  7:06 UTC (permalink / raw)
  To: Eliezer Tamir, Or Gerlitz
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On Tue, 2013-05-21 at 09:54 +0300, Eliezer Tamir wrote:
> On 20/05/2013 23:20, Or Gerlitz wrote:
> > On Mon, May 20, 2013 at 1:16 PM, Eliezer Tamir
> > <eliezer.tamir@linux.intel.com> wrote:
> >> Add the ixgbe driver code implementing ndo_ll_poll.
> >> It should be easy for other drivers to do something similar
> >> in order to enable support for CONFIG_INET_LL_RX_POLL
> >
> > I am not sure,
> 
> Willem ported this to <some undisclosed HW that they use at Google>, his 
> feedback was that it was not a major effort.

We also played with applying a similar patch on the bnx2x and it looks
great :) - Thanks Eliezer!

Or - at least for the bnx2x, it is easy to add support for this new ndo.

Hopefully this series will be accepted so we can send follow up support
for the bnx2x as well.

Thanks,
Eilon




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  7:06         ` Eilon Greenstein
  0 siblings, 0 replies; 69+ messages in thread
From: Eilon Greenstein @ 2013-05-21  7:06 UTC (permalink / raw)
  To: Eliezer Tamir, Or Gerlitz
  Cc: Willem de Bruijn, e1000-devel, netdev, HPA, Jesse Brandeburg,
	linux-kernel, Andi Kleen, Eliezer Tamir, Dave Miller

On Tue, 2013-05-21 at 09:54 +0300, Eliezer Tamir wrote:
> On 20/05/2013 23:20, Or Gerlitz wrote:
> > On Mon, May 20, 2013 at 1:16 PM, Eliezer Tamir
> > <eliezer.tamir@linux.intel.com> wrote:
> >> Add the ixgbe driver code implementing ndo_ll_poll.
> >> It should be easy for other drivers to do something similar
> >> in order to enable support for CONFIG_INET_LL_RX_POLL
> >
> > I am not sure,
> 
> Willem ported this to <some undisclosed HW that they use at Google>, his 
> feedback was that it was not a major effort.

We also played with applying a similar patch on the bnx2x and it looks
great :) - Thanks Eliezer!

Or - at least for the bnx2x, it is easy to add support for this new ndo.

Hopefully this series will be accepted so we can send follow up support
for the bnx2x as well.

Thanks,
Eilon





^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  7:06         ` Eilon Greenstein
  (?)
@ 2013-05-21  7:14         ` David Miller
  2013-05-21  8:24             ` Or Gerlitz
  -1 siblings, 1 reply; 69+ messages in thread
From: David Miller @ 2013-05-21  7:14 UTC (permalink / raw)
  To: eilong
  Cc: eliezer.tamir, or.gerlitz, linux-kernel, netdev,
	jesse.brandeburg, donald.c.skidmore, e1000-devel, willemb, andi,
	hpa, eliezer

From: "Eilon Greenstein" <eilong@broadcom.com>
Date: Tue, 21 May 2013 10:06:43 +0300

> Hopefully this series will be accepted so we can send follow up support
> for the bnx2x as well.

I think in two or three more iterations it will be merged.

There are no objections on the fundamentals, it's just implementation
details and coding style at this point.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-20 15:29   ` Eric Dumazet
  2013-05-20 19:40     ` David Miller
@ 2013-05-21  7:28     ` Eliezer Tamir
  2013-05-21 13:28       ` Eric Dumazet
  1 sibling, 1 reply; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21  7:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On 20/05/2013 18:29, Eric Dumazet wrote:
> On Mon, 2013-05-20 at 13:16 +0300, Eliezer Tamir wrote:
---
>> +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
>> +{
>> +	skb->dev_ref = napi;
>> +}
>> +
>> +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
>> +{
>> +	sk->dev_ref = skb->dev_ref;
>> +}
>
> I do not see why it's safe to keep a pointer to a napi object without
> taking a reference, or something to prevent the object from being removed.
>
> Using a genid might be enough. (some counter incremented every time a
> napi is dismantled)

I really like this approach and I tried it.
The main problem I had is that you need to increase the size of the skb 
to store the generation id unless you stuff it in the flags2 bitfield.
There appear to be only 7 useful bits left there.
Is it OK to use them all up?
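
For what it's worth, a rough sketch of the genid variant being
discussed, just to make the tradeoff concrete (the names and the 7-bit
field are hypothetical, not from the patches):

/* bumped whenever any napi is dismantled */
static atomic_t napi_genid = ATOMIC_INIT(0);

struct ll_ref {				/* stand-in for the skb/sk fields */
	struct napi_struct *dev_ref;
	unsigned int genid:7;		/* the ~7 spare flags2 bits */
};

static bool ll_ref_valid(const struct ll_ref *ref)
{
	/* a stale ref is only guaranteed to be caught while the counter
	 * has not wrapped - the obvious weakness of a 7-bit genid */
	return ref->genid == (atomic_read(&napi_genid) & 0x7f);
}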


> Alternatively, use a napi_id instead of a pointer.

I'm not sure I understand what you propose.

-Eliezer

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  7:06         ` Eilon Greenstein
@ 2013-05-21  8:21           ` Or Gerlitz
  -1 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21  8:21 UTC (permalink / raw)
  To: eilong
  Cc: Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On Tue, May 21, 2013 at 10:06 AM, Eilon Greenstein <eilong@broadcom.com> wrote:
> Or - at least for the bnx2x, it is easy to add support for this new ndo.

Do you understand what the equivalent of that mysterious module
param is for your driver/HW - or did you just copy and paste that
black magic code?

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  8:21           ` Or Gerlitz
  0 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21  8:21 UTC (permalink / raw)
  To: eilong
  Cc: Willem de Bruijn, Eliezer Tamir, e1000-devel, netdev, HPA,
	Jesse Brandeburg, linux-kernel, Andi Kleen, Eliezer Tamir,
	Dave Miller

On Tue, May 21, 2013 at 10:06 AM, Eilon Greenstein <eilong@broadcom.com> wrote:
> Or - at least for the bnx2x, it is easy to add support for this new ndo.

Do you understand what the equivalent of that mysterious module
param is for your driver/HW - or did you just copy and paste that
black magic code?


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  7:14         ` David Miller
@ 2013-05-21  8:24             ` Or Gerlitz
  0 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21  8:24 UTC (permalink / raw)
  To: David Miller
  Cc: eilong, eliezer.tamir, linux-kernel, netdev, jesse.brandeburg,
	donald.c.skidmore, e1000-devel, willemb, andi, hpa, eliezer

On Tue, May 21, 2013 at 10:14 AM, David Miller <davem@davemloft.net> wrote:
> From: "Eilon Greenstein" <eilong@broadcom.com>
> Date: Tue, 21 May 2013 10:06:43 +0300
>
>> Hopefully this series will be accepted so we can send follow up support
>> for the bnx2x as well.
>
> I think in two or three more iterations it will be merged.
>
> There are no objections on the fundamentals, it's just implementation
> details and coding style at this point.

Dave, sorry, I might be a bit behind the rest of the reviewers, but I
just fail to understand, or find any reference that explains, the
module param of ixgbe, nor does it make sense to me to merge that
piece of the code upstream (it's not for staging, correct?), as I
wrote here http://marc.info/?l=linux-netdev&m=136908123432072&w=2.
Basically, I know you're not a great fan of module params (to say the
least), and surely not of something named "allow_unsafe_removal".
Thoughts?

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  8:24             ` Or Gerlitz
  0 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21  8:24 UTC (permalink / raw)
  To: David Miller
  Cc: willemb, eliezer.tamir, e1000-devel, netdev, hpa,
	jesse.brandeburg, eliezer, linux-kernel, andi, eilong

On Tue, May 21, 2013 at 10:14 AM, David Miller <davem@davemloft.net> wrote:
> From: "Eilon Greenstein" <eilong@broadcom.com>
> Date: Tue, 21 May 2013 10:06:43 +0300
>
>> Hopefully this series will be accepted so we can send follow up support
>> for the bnx2x as well.
>
> I think in two or three more iterations it will be merged.
>
> There are no objections on the fundamentals, it's just implementation
> details and coding style at this point.

Dave, sorry, I might be a bit behind the rest of the reviewers, but I
just fail to understand, or find any reference that explains, the
module param of ixgbe, nor does it make sense to me to merge that
piece of the code upstream (it's not for staging, correct?), as I
wrote here http://marc.info/?l=linux-netdev&m=136908123432072&w=2.
Basically, I know you're not a great fan of module params (to say the
least), and surely not of something named "allow_unsafe_removal".
Thoughts?


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  8:21           ` Or Gerlitz
@ 2013-05-21  8:28             ` Eilon Greenstein
  -1 siblings, 0 replies; 69+ messages in thread
From: Eilon Greenstein @ 2013-05-21  8:28 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On Tue, 2013-05-21 at 11:21 +0300, Or Gerlitz wrote:
> On Tue, May 21, 2013 at 10:06 AM, Eilon Greenstein <eilong@broadcom.com> wrote:
> > Or - at least for the bnx2x, it is easy to add support for this new ndo.
> 
> Do you understand what the equivalent of that mysterious module
> param is for your driver/HW - or did you just copy and paste that
> black magic code?
> 

The module parameter is not the interesting part of this patch. It is
clear that unloading the module while this sort of traffic is running is
not safe and the alternative of adding a reference count or something
similar sounds too costly (after all, this patch is about performance).
I just played with it to get a feel for the latency improvement - I did
not try unloading the module during traffic so I do not care about the
module parameter part right now. I agree that the unload should be
looked into - but the general concept is great and it is a very nice
improvement.




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  8:28             ` Eilon Greenstein
  0 siblings, 0 replies; 69+ messages in thread
From: Eilon Greenstein @ 2013-05-21  8:28 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Willem de Bruijn, Eliezer Tamir, e1000-devel, netdev, HPA,
	Jesse Brandeburg, linux-kernel, Andi Kleen, Eliezer Tamir,
	Dave Miller

On Tue, 2013-05-21 at 11:21 +0300, Or Gerlitz wrote:
> On Tue, May 21, 2013 at 10:06 AM, Eilon Greenstein <eilong@broadcom.com> wrote:
> > Or - at least for the bnx2x, it is easy to add support for this new ndo.
> 
> Do you understand what the equivalent of that mysterious module
> param is for your driver/HW - or did you just copy and paste that
> black magic code?
> 

The module parameter is not the interesting part of this patch. It is
clear that unloading the module while this sort of traffic is running is
not safe and the alternative of adding a reference count or something
similar sounds too costly (after all, this patch is about performance).
I just played with it to get a feel for the latency improvement - I did
not try unloading the module during traffic so I do not care about the
module parameter part right now. I agree that the unload should be
looked into - but the general concept is great and it is a very nice
improvement.





^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  8:24             ` Or Gerlitz
@ 2013-05-21  8:31               ` Eliezer Tamir
  -1 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21  8:31 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David Miller, eilong, linux-kernel, netdev, jesse.brandeburg,
	donald.c.skidmore, e1000-devel, willemb, andi, hpa, eliezer

On 21/05/2013 11:24, Or Gerlitz wrote:
> On Tue, May 21, 2013 at 10:14 AM, David Miller <davem@davemloft.net> wrote:
>> From: "Eilon Greenstein" <eilong@broadcom.com>
>> Date: Tue, 21 May 2013 10:06:43 +0300
>>
>>> Hopefully this series will be accepted so we can send follow up support
>>> for the bnx2x as well.
>>
>> I think in two or three more iterations it will be merged.
>>
>> There are no objections on the fundamentals, it's just implementation
>> details and coding style at this point.
>
> Dave, sorry, I might be a bit behind the rest of the reviewers, but I
> just fail to understand, or find any reference that explains, the
> module param of ixgbe, nor does it make sense to me to merge that
> piece of the code upstream (it's not for staging, correct?), as I
> wrote here http://marc.info/?l=linux-netdev&m=136908123432072&w=2.
> Basically, I know you're not a great fan of module params (to say the
> least), and surely not of something named "allow_unsafe_removal".
> Thoughts?

from v2 0/4

6. To avoid the overhead of reference counting napi structs by skbs
and sockets in the fastpath, and increasing the size of the skb struct,
we no longer allow unloading the module once this feature has been used.

It seems that for most of the people interested in busy-polling, giving
up the ability to blindly remove the module for a slight but measurable
performance gain is a good tradeoff.
(There is a module parameter to override this behavior and if you know
what you are doing and are careful to stop the processes you can safely
unload, but we don't enforce this.)
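
For reference, the override described here maps to the module parameter
added in patch 3, which (with its zero permissions) can only be set at
load time; roughly:

  # opt out of the pinning, accepting that you must stop all
  # busy-polling processes yourself before rmmod
  modprobe ixgbe allow_unsafe_removal=1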


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  8:31               ` Eliezer Tamir
  0 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21  8:31 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: willemb, e1000-devel, netdev, hpa, jesse.brandeburg, eliezer,
	linux-kernel, andi, eilong, David Miller

On 21/05/2013 11:24, Or Gerlitz wrote:
> On Tue, May 21, 2013 at 10:14 AM, David Miller <davem@davemloft.net> wrote:
>> From: "Eilon Greenstein" <eilong@broadcom.com>
>> Date: Tue, 21 May 2013 10:06:43 +0300
>>
>>> Hopefully this series will be accepted so we can send follow up support
>>> for the bnx2x as well.
>>
>> I think in two or three more iterations it will be merged.
>>
>> There are no objections on the fundamentals, it's just implementation
>> details and coding style at this point.
>
> Dave, sorry, I might be a bit behind the rest of the reviewers, but I
> just fail to understand, or find any reference that explains, the
> module param of ixgbe, nor does it make sense to me to merge that
> piece of the code upstream (it's not for staging, correct?), as I
> wrote here http://marc.info/?l=linux-netdev&m=136908123432072&w=2.
> Basically, I know you're not a great fan of module params (to say the
> least), and surely not of something named "allow_unsafe_removal".
> Thoughts?

from v2 0/4

6. To avoid the overhead of reference counting napi structs by skbs
and sockets in the fastpath, and increasing the size of the skb struct,
we no longer allow unloading the module once this feature has been used.

It seems that for most of the people interested in busy-polling, giving
up the ability to blindly remove the module for a slight but measurable
performance gain is a good tradeoff.
(There is a module parameter to override this behavior and if you know
what you are doing and are careful to stop the processes you can safely
unload, but we don't enforce this.)



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  8:24             ` Or Gerlitz
  (?)
  (?)
@ 2013-05-21  8:39             ` David Miller
  2013-05-21  8:42                 ` Eliezer Tamir
  2013-05-21  8:43                 ` Or Gerlitz
  -1 siblings, 2 replies; 69+ messages in thread
From: David Miller @ 2013-05-21  8:39 UTC (permalink / raw)
  To: or.gerlitz
  Cc: eilong, eliezer.tamir, linux-kernel, netdev, jesse.brandeburg,
	donald.c.skidmore, e1000-devel, willemb, andi, hpa, eliezer

From: Or Gerlitz <or.gerlitz@gmail.com>
Date: Tue, 21 May 2013 11:24:41 +0300

> On Tue, May 21, 2013 at 10:14 AM, David Miller <davem@davemloft.net> wrote:
>> From: "Eilon Greenstein" <eilong@broadcom.com>
>> Date: Tue, 21 May 2013 10:06:43 +0300
>>
>>> Hopefully this series will be accepted so we can send follow up support
>>> for the bnx2x as well.
>>
>> I think in two or three more iterations it will be merged.
>>
>> There are no objections on the fundamentals, it's just implementation
>> details and coding style at this point.
> 
> Dave, sorry, I might be a bit behind the rest of the reviewers, but I
> just fail to understand, or find any reference that explains, the
> module param of ixgbe, nor does it make sense to me to merge that
> piece of the code upstream (it's not for staging, correct?), as I
> wrote here http://marc.info/?l=linux-netdev&m=136908123432072&w=2.
> Basically, I know you're not a great fan of module params (to say the
> least), and surely not of something named "allow_unsafe_removal".
> Thoughts?

It's one of those "implementation details", I hate it too.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  8:39             ` David Miller
@ 2013-05-21  8:42                 ` Eliezer Tamir
  2013-05-21  8:43                 ` Or Gerlitz
  1 sibling, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21  8:42 UTC (permalink / raw)
  To: David Miller
  Cc: or.gerlitz, eilong, linux-kernel, netdev, jesse.brandeburg,
	donald.c.skidmore, e1000-devel, willemb, andi, hpa, eliezer

On 21/05/2013 11:39, David Miller wrote:
> From: Or Gerlitz <or.gerlitz@gmail.com>
> Date: Tue, 21 May 2013 11:24:41 +0300
>
>> On Tue, May 21, 2013 at 10:14 AM, David Miller <davem@davemloft.net> wrote:
>>> From: "Eilon Greenstein" <eilong@broadcom.com>
>>> Date: Tue, 21 May 2013 10:06:43 +0300
>>>
>>>> Hopefully this series will be accepted so we can send follow up support
>>>> for the bnx2x as well.
>>>
>>> I think in two or three more iterations it will be merged.
>>>
>>> There are no objections on the fundamentals, it's just implementation
>>> details and coding style at this point.
>>
>> Dave, sorry, I might be a bit behind the rest of the reviewers, but I
>> just fail to understand, or find any reference that explains, the
>> module param of ixgbe, nor does it make sense to me to merge that
>> piece of the code upstream (it's not for staging, correct?), as I
>> wrote here http://marc.info/?l=linux-netdev&m=136908123432072&w=2.
>> Basically, I know you're not a great fan of module params (to say the
>> least), and surely not of something named "allow_unsafe_removal".
>> Thoughts?
>
> It's one of those "implementation details", I hate it too.

I'm open to suggestions.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  8:42                 ` Eliezer Tamir
  0 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21  8:42 UTC (permalink / raw)
  To: David Miller
  Cc: willemb, or.gerlitz, e1000-devel, netdev, hpa, jesse.brandeburg,
	eliezer, linux-kernel, andi, eilong

On 21/05/2013 11:39, David Miller wrote:
> From: Or Gerlitz <or.gerlitz@gmail.com>
> Date: Tue, 21 May 2013 11:24:41 +0300
>
>> On Tue, May 21, 2013 at 10:14 AM, David Miller <davem@davemloft.net> wrote:
>>> From: "Eilon Greenstein" <eilong@broadcom.com>
>>> Date: Tue, 21 May 2013 10:06:43 +0300
>>>
>>>> Hopefully this series will be accepted so we can send follow up support
>>>> for the bnx2x as well.
>>>
>>> I think in two or three more iterations it will be merged.
>>>
>>> There are no objections on the fundamentals, it's just implementation
>>> details and coding style at this point.
>>
>> Dave, sorry, I might be a bit behind the rest of the reviewers, but I
>> just fail to understand, or find any reference that explains, the
>> module param of ixgbe, nor does it make sense to me to merge that
>> piece of the code upstream (it's not for staging, correct?), as I
>> wrote here http://marc.info/?l=linux-netdev&m=136908123432072&w=2.
>> Basically, I know you're not a great fan of module params (to say the
>> least), and surely not of something named "allow_unsafe_removal".
>> Thoughts?
>
> It's one of those "implementation details", I hate it too.

I'm open to suggestions.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  8:39             ` David Miller
@ 2013-05-21  8:43                 ` Or Gerlitz
  2013-05-21  8:43                 ` Or Gerlitz
  1 sibling, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21  8:43 UTC (permalink / raw)
  To: David Miller
  Cc: eilong, eliezer.tamir, linux-kernel, netdev, jesse.brandeburg,
	donald.c.skidmore, e1000-devel, willemb, andi, hpa, eliezer

On Tue, May 21, 2013 at 11:39 AM, David Miller <davem@davemloft.net> wrote:

> It's one of those "implementation details", I hate it too.

Maybe if we bake it on this list a little further we can see how to
get away from that, or find the least ugly way to do it?

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21  8:43                 ` Or Gerlitz
  0 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21  8:43 UTC (permalink / raw)
  To: David Miller
  Cc: willemb, eliezer.tamir, e1000-devel, netdev, hpa,
	jesse.brandeburg, eliezer, linux-kernel, andi, eilong

On Tue, May 21, 2013 at 11:39 AM, David Miller <davem@davemloft.net> wrote:

> It's one of those "implementation details", I hate it too.

Maybe if we bake it on this list a little further we can see how to
get away from that, or find the least ugly way to do it?


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  8:43                 ` Or Gerlitz
  (?)
@ 2013-05-21 10:27                 ` Eliezer Tamir
  2013-05-21 10:41                   ` Or Gerlitz
  -1 siblings, 1 reply; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21 10:27 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David Miller, eilong, linux-kernel, netdev, jesse.brandeburg,
	donald.c.skidmore, e1000-devel, willemb, andi, hpa, eliezer

On 21/05/2013 11:43, Or Gerlitz wrote:
> On Tue, May 21, 2013 at 11:39 AM, David Miller <davem@davemloft.net> wrote:
>
>> It's one of those "implementation details", I hate it too.
>
> Maybe if we bake it on this list a little further we can see how to
> get away from that, or find the least ugly way to do it?

I'm all for proper review and fixing any issues before forcing 
"ugliness" and "black magic" on unsuspecting users.

Having said that, you failed to mention that your company sells 
userspace stack replacements.

Informal testing I did convinces me that for a given HW, the latencies 
you get with this patchset and with userspace busy-polling are about the 
same. (This is my personal opinion, I'm not authorized to talk on behalf 
of my company or anyone else.)

Why don't you try it out and tell us what you find.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21 10:27                 ` Eliezer Tamir
@ 2013-05-21 10:41                   ` Or Gerlitz
  0 siblings, 0 replies; 69+ messages in thread
From: Or Gerlitz @ 2013-05-21 10:41 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: David Miller, eilong, linux-kernel, netdev, jesse.brandeburg,
	donald.c.skidmore, e1000-devel, willemb, andi, hpa, eliezer

On Tue, May 21, 2013 at 1:27 PM, Eliezer Tamir
<eliezer.tamir@linux.intel.com> wrote:
> On 21/05/2013 11:43, Or Gerlitz wrote:

>> Maybe if we bake it on this list little further we can see how to get
>> away from that, or what's the most non ugly way for that?

> I'm all for proper review and fixing any issues before forcing "ugliness"
> and "black magic" on unsuspecting users.
>
> Having said that, you failed to mention that your company sells userspace
> stack replacements.

Ditto for your firm; heard of DPDK? But your comment is irrelevant. I
only raised the idea that if we bake this a little further we might find
a solution that avoids this nasty corner, no more and no less.

[...]

> Why don't you try it out and tell us what you find.

Sure, we are looking at that at the driver level; my comment was more general.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-20 10:15 ` Eliezer Tamir
                   ` (4 preceding siblings ...)
  (?)
@ 2013-05-21 12:06 ` Alex Rosenbaum
  2013-05-21 12:29   ` Eliezer Tamir
  -1 siblings, 1 reply; 69+ messages in thread
From: Alex Rosenbaum @ 2013-05-21 12:06 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On 5/20/2013 1:15 PM, Eliezer Tamir wrote:
> updated with the comments I got so far.
>
> Thanks,
> Eliezer
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Hello Eliezer,

I am working at Mellanox on a low latency user space offload technology 
and there are some similarities between the user space offload and your 
kernel implementation.

We have experience with similar ‘infinite polling’ issues with respect to 
real applications.

I am coming in a little late here, but wanted to check a few things:

1. It seems this patch does not cover epoll/select and similar IO muxing APIs?

Most real applications will be based on epoll or select, unlike netperf, 
which is a simple send/recv-per-thread network test. If you take the 
memcached application, you have an epoll per thread with a few sockets in 
each, running on each core.

In the IO mux cases you need to poll multiple driver rings while also 
polling other non-network fd’s (files, pipes, ...) without hurting their 
latency response.

2. How is the logic aware of RSS and RFS?

With TCP sockets, the driver knows the specific ring it needs to poll, so 
this should be mapped and provide the best latency.

For UDP (unicast and multicast) you can have all rings delivering 
packets to a single receive socket; is ndo_ll_poll expected to scan the 
driver rings?

3. I could not find any reference to multi-thread-on-single-core logic. 
This can cause the opposite effect and create contention and higher 
latencies.

Maybe you should ref_count the number of threads per core going into 
ndo_ll_poll. If a second or later thread wants to go down to ndo_ll_poll, 
you should block (sleep) it instead of creating contention.

In this mode at least the first thread will get very good latency and 
the others will not get hurt.

Or if they move to a different core they should go down to the driver 
for polling the ring.

Thanks,

Alex Rosenbaum

Director R&D Application Acceleration

Mellanox Technologies


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-21 12:06 ` [PATCH v3 net-next 0/4] net: low latency Ethernet device polling Alex Rosenbaum
@ 2013-05-21 12:29   ` Eliezer Tamir
  2013-05-21 13:15       ` Alex Rosenbaum
                       ` (2 more replies)
  0 siblings, 3 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21 12:29 UTC (permalink / raw)
  To: Alex Rosenbaum
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

Many of the questions you asked are covered in our RFC cover letter, but 
I will touch on them briefly.

On 21/05/2013 15:06, Alex Rosenbaum wrote:
> 1. It seem this patch does not cover epoll/select and such IO muxing APIs?

We are thinking about how to implement epoll support as one of the next 
steps.

What benchmarks are you using to test poll/select/epoll?


> 2. How is the logic aware of RSS and RFS?
>
> With TCP sockets, the driver knows the specific ring it need to poll so
> this should be mapped and provide the best latency.

This code is blissfully oblivious to RFS and RSS; it only assumes that 
the packets for a socket are likely to continue to come in on the same queue.
The code is designed to be correct even if you get your data on the 
wrong queue (your performance will suffer, but no more than that).


> 3. I could not find any reference to multi-thread on single core logic.
> This can causes the opposite effect and create contentions and higher
> latency’s.

Again, the only bad thing that will happen if you misconfigure this is a 
performance hit; we will not deadlock.

-Eliezer

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-21 12:29   ` Eliezer Tamir
@ 2013-05-21 13:15       ` Alex Rosenbaum
  2013-05-21 14:30       ` Willem de Bruijn
  2013-05-21 18:15       ` Ben Hutchings
  2 siblings, 0 replies; 69+ messages in thread
From: Alex Rosenbaum @ 2013-05-21 13:15 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On 5/21/2013 3:29 PM, Eliezer Tamir wrote:
> What benchmarks are you using to test poll/select/epoll?
For epoll/select latency tests we are using sockperf as the performance 
latency tool: https://code.google.com/p/sockperf/
It is a client-server based tool and it supports ping-pong, throughput, 
and under-load test types.
For epoll, you will need to define a 'feedfile' ("-f filepathname") 
which has a list of TCP and/or UDP sockets, and define your IO mux type 
("-F epoll").

thanks for all the clarifications.

- Alex Rosenbaum

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
@ 2013-05-21 13:15       ` Alex Rosenbaum
  0 siblings, 0 replies; 69+ messages in thread
From: Alex Rosenbaum @ 2013-05-21 13:15 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Willem de Bruijn, Don, e1000-devel, netdev, HPA,
	Jesse Brandeburg, linux-kernel, Andi Kleen, Eliezer Tamir,
	Dave Miller

On 5/21/2013 3:29 PM, Eliezer Tamir wrote:
> What benchmarks are you using to test poll/select/epoll?
For epoll/select latency tests we are using sockperf as the performance 
latency tool: https://code.google.com/p/sockperf/
It is a client-server based tool and it supports ping-pong, throughput, 
and under-load test types.
For epoll, you will need to define a 'feedfile' ("-f filepathname") 
which has a list of TCP and/or UDP sockets, and define your IO mux type 
("-F epoll").

thanks for all the clarifications.

- Alex Rosenbaum

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-21  7:28     ` Eliezer Tamir
@ 2013-05-21 13:28       ` Eric Dumazet
  2013-05-21 17:02         ` Pekka Riikonen
  0 siblings, 1 reply; 69+ messages in thread
From: Eric Dumazet @ 2013-05-21 13:28 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On Tue, 2013-05-21 at 10:28 +0300, Eliezer Tamir wrote:
> On 20/05/2013 18:29, Eric Dumazet wrote:
> > On Mon, 2013-05-20 at 13:16 +0300, Eliezer Tamir wrote:
> ---
> >> +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
> >> +{
> >> +	skb->dev_ref = napi;
> >> +}
> >> +
> >> +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
> >> +{
> >> +	sk->dev_ref = skb->dev_ref;
> >> +}
> >
> > I do not see why it's safe to keep a pointer to a napi object without
> > taking a reference, or something to prevent object being removed.
> >
> > Using a genid might be enough. (some counter incremented every time a
> > napi is dismantled)
> 
> I really like this approach and I tried it.
> The main problem I had is that you need to increase the size of the skb 
> to store the generation id unless you stuff it in the flags2 bitfield.
> There appear to be only 7 useful bit left there.
> Is it OK to use them all up?
> 
> 
> > Alternatively, use a napi_id instead of a pointer.
> 
> I'm not sure I understand what you propose.

Oh well.

To get a pointer to a struct net_device, we can use the ifindex, and do an
RCU lookup into a hash table to get the net_device. We do not need
{pointer,ifindex}; {ifindex} is enough.

My suggestion is to not have skb->dev_ref but skb->napi_index: it's safe
to copy its value from skb->napi_index to sk->napi_index without
refcounting.

All NAPI instances need to get a unique napi_index, and be inserted in a
hash table for immediate/fast lookup.
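
Roughly, and purely as a sketch, it could look like the following.
(The napi_id and napi_hash_node fields on struct napi_struct and these
helper names are hypothetical; nothing like this exists in the posted
patches.)

#include <linux/hashtable.h>
#include <linux/netdevice.h>
#include <linux/spinlock.h>

/* Global hash of live NAPI contexts, keyed by a unique id. */
static DEFINE_HASHTABLE(napi_hash_sketch, 8);
static DEFINE_SPINLOCK(napi_hash_sketch_lock);
static unsigned int napi_gen_id;

/* Called when a driver registers a NAPI context; the matching teardown
 * would hash_del_rcu() the node and wait for a grace period.
 */
static void napi_hash_add_sketch(struct napi_struct *napi)
{
	spin_lock(&napi_hash_sketch_lock);
	napi->napi_id = ++napi_gen_id;		/* hypothetical new field */
	hash_add_rcu(napi_hash_sketch, &napi->napi_hash_node, napi->napi_id);
	spin_unlock(&napi_hash_sketch_lock);
}

/* Lock-free lookup; the caller must hold rcu_read_lock(). */
static struct napi_struct *napi_by_id_sketch(unsigned int napi_id)
{
	struct napi_struct *napi;

	hash_for_each_possible_rcu(napi_hash_sketch, napi, napi_hash_node,
				   napi_id)
		if (napi->napi_id == napi_id)
			return napi;
	return NULL;
}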




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-21 13:15       ` Alex Rosenbaum
  (?)
@ 2013-05-21 13:31       ` Eric Dumazet
  -1 siblings, 0 replies; 69+ messages in thread
From: Eric Dumazet @ 2013-05-21 13:31 UTC (permalink / raw)
  To: Alex Rosenbaum
  Cc: Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On Tue, 2013-05-21 at 16:15 +0300, Alex Rosenbaum wrote:
> On 5/21/2013 3:29 PM, Eliezer Tamir wrote:
> > What benchmarks are you using to test poll/select/epoll?
> for epoll/select latency tests we are using sockperf as performance 
> latency tool: https://code.google.com/p/sockperf/
> It is a client-server based tool and it supported ping-pong, throughput, 
> and under-load test type.
> For epoll, you will need to define a 'feedfile' ("-f filepathname") 
> which has a list of TCP and/or UDP socket and defined your IO mux type 
> ("-F epoll").

I totally agree: most modern applications use poll/select/epoll,
a fair number of sockets per task, and sendfile()/vmsplice()/...;
netperf is not exercising the same paths.

Thanks Alex !




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
  2013-05-21  6:54       ` Eliezer Tamir
@ 2013-05-21 14:19         ` Willem de Bruijn
  -1 siblings, 0 replies; 69+ messages in thread
From: Willem de Bruijn @ 2013-05-21 14:19 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Or Gerlitz, Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Andi Kleen, HPA, Eliezer Tamir

On Tue, May 21, 2013 at 2:54 AM, Eliezer Tamir
<eliezer.tamir@linux.intel.com> wrote:
> On 20/05/2013 23:20, Or Gerlitz wrote:
>>
>> On Mon, May 20, 2013 at 1:16 PM, Eliezer Tamir
>> <eliezer.tamir@linux.intel.com> wrote:
>>>
>>> Add the ixgbe driver code implementing ndo_ll_poll.
>>> It should be easy for other drivers to do something similar
>>> in order to enable support for CONFIG_INET_LL_RX_POLL
>>
>>
>> I am not sure,
>
>
> Willem ported this to <some undisclosed HW that they use at Google>, his
> feedback was that it was not a major effort.

The core ndo_ll_poll implementation is generally a subset of a device
driver's existing napi callback. It cleans the queues, but it skips
napi_complete and unmasking of the IRQ.

+       ixgbe_for_each_ring(ring, q_vector->rx) {
+               found = ixgbe_clean_rx_irq(q_vector, ring, 4);
+               if (found)
+                       break;
+       }

A subtle difference in the above code vs ixgbe_poll is that the
callback returns as soon as some data has arrived on a queue, as opposed
to iterating over all queues. The budget is lower, too. Since not all
data arriving is necessarily destined for the polling socket, this may
or may not be an improvement.

Besides that, the driver has to mark the packet with
ll_mark_skb(&cq->napi, skb);

On devices where tx completion interrupts share the same IRQ as rx
interrupts, the driver may also have to clean the tx queue once in a
while (at obvious tail latency cost). LLS does not disable the IRQ,
but I think the suggestion was to set its moderation threshold very
high to avoid net_rx_action/LLS lock contention. If so, starvation may
occur.

The most difficult bit is handling mutual exclusion with the
interrupt-driven receive path. The ixgbe port has its own internal
locking mechanism in anticipation of future use cases that can be
lock-free. As a first approximation, I just took the napi->poll_lock,
similar to how netpoll handles mutual exclusion with net_rx_action.
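
As a very rough sketch of that shape (every name below, including the
q_vector and ring types, the lock, and my_clean_rx_irq(), is illustrative
and not taken from the actual ixgbe patch):

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/spinlock.h>

struct my_ring;				/* stand-in for the driver's rx ring type */

struct my_q_vector {
	struct napi_struct	napi;
	struct my_ring		*rx_ring;	/* one rx ring, for simplicity */
	spinlock_t		busy_poll_lock;	/* vs the interrupt-driven path */
};

/* Stand-in for the driver's existing rx clean routine; returns the
 * number of packets processed, up to 'budget'.
 */
int my_clean_rx_irq(struct my_q_vector *qv, struct my_ring *ring, int budget);

static int my_ndo_ll_poll(struct napi_struct *napi)
{
	struct my_q_vector *qv = container_of(napi, struct my_q_vector, napi);
	int found;

	/* Mutual exclusion with the regular napi poll / net_rx_action. */
	if (!spin_trylock(&qv->busy_poll_lock))
		return 0;

	/* Small budget; we only want to know whether anything arrived.
	 * The packets cleaned here would be marked for the socket code
	 * (the ll_mark_skb() call mentioned above) inside the clean routine.
	 */
	found = my_clean_rx_irq(qv, qv->rx_ring, 4);

	/* No napi_complete() and no IRQ unmasking here. */
	spin_unlock(&qv->busy_poll_lock);
	return found;
}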

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll
@ 2013-05-21 14:19         ` Willem de Bruijn
  0 siblings, 0 replies; 69+ messages in thread
From: Willem de Bruijn @ 2013-05-21 14:19 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Or Gerlitz, e1000-devel, netdev, HPA, Jesse Brandeburg,
	linux-kernel, Andi Kleen, Eliezer Tamir, Dave Miller

On Tue, May 21, 2013 at 2:54 AM, Eliezer Tamir
<eliezer.tamir@linux.intel.com> wrote:
> On 20/05/2013 23:20, Or Gerlitz wrote:
>>
>> On Mon, May 20, 2013 at 1:16 PM, Eliezer Tamir
>> <eliezer.tamir@linux.intel.com> wrote:
>>>
>>> Add the ixgbe driver code implementing ndo_ll_poll.
>>> It should be easy for other drivers to do something similar
>>> in order to enable support for CONFIG_INET_LL_RX_POLL
>>
>>
>> I am not sure,
>
>
> Willem ported this to <some undisclosed HW that they use at Google>, his
> feedback was that it was not a major effort.

The core ndo_ll_poll implementation is generally a subset of a device
driver's existing napi callback. It cleans the queues, but it skips
napi_complete and unmasking of the IRQ.

+       ixgbe_for_each_ring(ring, q_vector->rx) {
+               found = ixgbe_clean_rx_irq(q_vector, ring, 4);
+               if (found)
+                       break;
+       }

A subtle difference in the above code vs ixgbe_poll is that the
callback returns as soon as some data has arrived on a queue, as opposed
to iterating over all queues. The budget is lower, too. Since not all
data arriving is necessarily destined for the polling socket, this may
or may not be an improvement.

Besides that, the driver has to mark the packet with
ll_mark_skb(&cq->napi, skb);

On devices where tx completion interrupts share the same IRQ as rx
interrupts, the driver may also have to clean the tx queue once in a
while (at obvious tail latency cost). LLS does not disable the IRQ,
but I think the suggestion was to set its moderation threshold very
high to avoid net_rx_action/LLS lock contention. If so, starvation may
occur.

The most difficult bit is handling mutual exclusion with the
interrupt-driven receive path. The ixgbe port has its own internal
locking mechanism in anticipation of future use cases that can be
lock-free. As a first approximation, I just took the napi->poll_lock,
similar to how netpoll handles mutual exclusion with net_rx_action.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-21 12:29   ` Eliezer Tamir
@ 2013-05-21 14:30       ` Willem de Bruijn
  2013-05-21 14:30       ` Willem de Bruijn
  2013-05-21 18:15       ` Ben Hutchings
  2 siblings, 0 replies; 69+ messages in thread
From: Willem de Bruijn @ 2013-05-21 14:30 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Alex Rosenbaum, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Andi Kleen, HPA,
	Eliezer Tamir

>> 2. How is the logic aware of RSS and RFS?
>>
>> With TCP sockets, the driver knows the specific ring it need to poll so
>> this should be mapped and provide the best latency.
>
>
> This code is blissfully oblivious of RFS and RSS, it only assumes that the
> packets for a socket are likely to continue to come on the same queue.
> The code is designed to be correct even if you get your data on the wrong
> queue. (your performance will suffer but no more than that.)
>

For low latency, you don't want to have to wait for the IPI that RPS
sets to redirect packets to another CPU.  However, this feature works
extremely well with hardware flow steering (nfc/ntuple) and
accelerated RFS, where the device will enqueue directly on an rxqueue
owned exclusively by the destination cpu (if configured correctly).

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
@ 2013-05-21 14:30       ` Willem de Bruijn
  0 siblings, 0 replies; 69+ messages in thread
From: Willem de Bruijn @ 2013-05-21 14:30 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Andi Kleen, e1000-devel, netdev, HPA, Jesse Brandeburg,
	linux-kernel, Alex Rosenbaum, Eliezer Tamir, Dave Miller

>> 2. How is the logic aware of RSS and RFS?
>>
>> With TCP sockets, the driver knows the specific ring it need to poll so
>> this should be mapped and provide the best latency.
>
>
> This code is blissfully oblivious of RFS and RSS, it only assumes that the
> packets for a socket are likely to continue to come on the same queue.
> The code is designed to be correct even if you get your data on the wrong
> queue. (your performance will suffer but no more than that.)
>

For low latency, you don't want to have to wait for the IPI that RPS
sets to redirect packets to another CPU.  However, this feature works
extremely well with hardware flow steering (nfc/ntuple) and
accelerated RFS, where the device will enqueue directly on an rxqueue
owned exclusively by the destination cpu (if configured correctly).

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-21 13:28       ` Eric Dumazet
@ 2013-05-21 17:02         ` Pekka Riikonen
  2013-05-21 17:48           ` Eric Dumazet
  2013-05-21 18:49           ` David Miller
  0 siblings, 2 replies; 69+ messages in thread
From: Pekka Riikonen @ 2013-05-21 17:02 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On Tue, 21 May 2013, Eric Dumazet wrote:

: > > Alternatively, use a napi_id instead of a pointer.
: > 
: > I'm not sure I understand what you propose.
: 
: Oh well.
: 
: To get a pointer to a struct net_device, we can use ifindex, and do a
: rcu lookup into a hash table to get the net_device. We do not need
: {pointer,ifindex} but {ifindex} is enough
: 
: My suggestion is to not have skb->skb_ref but skb->napi_index : Its safe
: to copy its value from skb->napi_index to sk->napi_index without
: refcounting.
: 
: All NAPI need to get a unique napi_index, and be inserted in a hash
: table for immediate/fast lookup.
: 
Maybe even that's not needed.  Couldn't skb->queue_mapping give the 
correct NAPI instance in multiqueue nics?  The NAPI instance could be made 
easily available from skb->dev.  In any case an index is much better than 
a new pointer.

	Pekka

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-21 17:02         ` Pekka Riikonen
@ 2013-05-21 17:48           ` Eric Dumazet
  2013-05-21 17:51               ` Eric Dumazet
  2013-05-21 18:49           ` David Miller
  1 sibling, 1 reply; 69+ messages in thread
From: Eric Dumazet @ 2013-05-21 17:48 UTC (permalink / raw)
  To: Pekka Riikonen
  Cc: Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On Tue, 2013-05-21 at 19:02 +0200, Pekka Riikonen wrote:

> Maybe even that's not needed.  Couldn't skb->queue_mapping give the 
> correct NAPI instance in multiqueue nics?  The NAPI instance could be made 
> easily available from skb->dev.  In any case an index is much better than 
> a new pointer.

We do not keep skb->dev information once a packet leaves the
RCU-protected region.

Once a packet is queued to the tcp input queues, skb->dev is NULL.




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-21 17:48           ` Eric Dumazet
@ 2013-05-21 17:51               ` Eric Dumazet
  0 siblings, 0 replies; 69+ messages in thread
From: Eric Dumazet @ 2013-05-21 17:51 UTC (permalink / raw)
  To: Pekka Riikonen
  Cc: Eliezer Tamir, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On Tue, 2013-05-21 at 10:48 -0700, Eric Dumazet wrote:

> We do not keep skb->dev information once a packet leaves the rcu
> protected region.
> 
> Once packet is queued to tcp input queues, skb->dev is NULL.

This is done in tcp_v4_rcv() & tcp_v6_rcv()




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
@ 2013-05-21 17:51               ` Eric Dumazet
  0 siblings, 0 replies; 69+ messages in thread
From: Eric Dumazet @ 2013-05-21 17:51 UTC (permalink / raw)
  To: Pekka Riikonen
  Cc: Willem de Bruijn, Don, Eliezer Tamir, e1000-devel, netdev, HPA,
	Jesse Brandeburg, linux-kernel, Andi Kleen, Eliezer Tamir,
	Dave Miller

On Tue, 2013-05-21 at 10:48 -0700, Eric Dumazet wrote:

> We do not keep skb->dev information once a packet leaves the rcu
> protected region.
> 
> Once packet is queued to tcp input queues, skb->dev is NULL.

This is done in tcp_v4_rcv() & tcp_v6_rcv()




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-21 12:29   ` Eliezer Tamir
@ 2013-05-21 18:15       ` Ben Hutchings
  2013-05-21 14:30       ` Willem de Bruijn
  2013-05-21 18:15       ` Ben Hutchings
  2 siblings, 0 replies; 69+ messages in thread
From: Ben Hutchings @ 2013-05-21 18:15 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Alex Rosenbaum, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On Tue, 2013-05-21 at 15:29 +0300, Eliezer Tamir wrote:
> Many of the questions you asked are covered in our RFC cover letter, but 
> I will touch them briefly
> 
> On 21/05/2013 15:06, Alex Rosenbaum wrote:
> > 1. It seem this patch does not cover epoll/select and such IO muxing APIs?
> 
> We are thinking about how to implement epoll support as one of the next 
> steps.
>
> What benchmarks are you using to test poll/select/epoll?
[...]

I raised this issue back at Plumbers and I thought Jesse said poll() was
covered.  It's likely to be a bit of a toy without that.  Also, UDP?

Solarflare uses a wide range of benchmarks.  Of the publicly available
applications, I believe Ixia Chariot uses (or can use) epoll.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
@ 2013-05-21 18:15       ` Ben Hutchings
  0 siblings, 0 replies; 69+ messages in thread
From: Ben Hutchings @ 2013-05-21 18:15 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Willem de Bruijn, Andi Kleen, e1000-devel, netdev, HPA,
	Jesse Brandeburg, linux-kernel, Alex Rosenbaum, Eliezer Tamir,
	Dave Miller

On Tue, 2013-05-21 at 15:29 +0300, Eliezer Tamir wrote:
> Many of the questions you asked are covered in our RFC cover letter, but 
> I will touch them briefly
> 
> On 21/05/2013 15:06, Alex Rosenbaum wrote:
> > 1. It seem this patch does not cover epoll/select and such IO muxing APIs?
> 
> We are thinking about how to implement epoll support as one of the next 
> steps.
>
> What benchmarks are you using to test poll/select/epoll?
[...]

I raised this issue back at Plumbers and I thought Jesse said poll() was
covered.  It's likely to be a bit of a toy without that.  Also, UDP?

Solarflare uses a wide range of benchmarks.  Of the publicly available
applications, I believe Ixia Chariot uses (or can use) epoll.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-21 17:02         ` Pekka Riikonen
  2013-05-21 17:48           ` Eric Dumazet
@ 2013-05-21 18:49           ` David Miller
  2013-05-21 19:31               ` Pekka Riikonen
  1 sibling, 1 reply; 69+ messages in thread
From: David Miller @ 2013-05-21 18:49 UTC (permalink / raw)
  To: priikone
  Cc: eric.dumazet, eliezer.tamir, linux-kernel, netdev,
	jesse.brandeburg, donald.c.skidmore, e1000-devel, willemb, andi,
	hpa, eliezer

From: Pekka Riikonen <priikone@iki.fi>
Date: Tue, 21 May 2013 19:02:19 +0200 (CEST)

> On Tue, 21 May 2013, Eric Dumazet wrote:
> 
> : > > Alternatively, use a napi_id instead of a pointer.
> : > 
> : > I'm not sure I understand what you propose.
> : 
> : Oh well.
> : 
> : To get a pointer to a struct net_device, we can use ifindex, and do a
> : rcu lookup into a hash table to get the net_device. We do not need
> : {pointer,ifindex} but {ifindex} is enough
> : 
> : My suggestion is to not have skb->skb_ref but skb->napi_index : Its safe
> : to copy its value from skb->napi_index to sk->napi_index without
> : refcounting.
> : 
> : All NAPI need to get a unique napi_index, and be inserted in a hash
> : table for immediate/fast lookup.
> : 
> Maybe even that's not needed.  Couldn't skb->queue_mapping give the 
> correct NAPI instance in multiqueue nics?  The NAPI instance could be made 
> easily available from skb->dev.  In any case an index is much better than 
> a new pointer.

Queue mapping is more volatile, and consider layered devices.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-21 17:51               ` Eric Dumazet
@ 2013-05-21 19:25                 ` Eliezer Tamir
  -1 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21 19:25 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Pekka Riikonen, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On 21/05/2013 20:51, Eric Dumazet wrote:
> On Tue, 2013-05-21 at 10:48 -0700, Eric Dumazet wrote:
>
>> We do not keep skb->dev information once a packet leaves the rcu
>> protected region.
>>
>> Once packet is queued to tcp input queues, skb->dev is NULL.
>
> This is done in tcp_v4_rcv() & tcp_v6_rcv()

So if we move the sk_mark_ll() call into the RCU-protected region,
we would not need the gen id in the skb.
We could save skb->dev and skb->queue_mapping along with the generation 
id in the socket.

Then we don't need to change the skb struct at all.
skb_mark_ll() can go away.
We would also call sk_mark_ll() in one central place, in the 
net_protocol->handler, instead of all over the place.

Or am I missing something?

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
@ 2013-05-21 19:25                 ` Eliezer Tamir
  0 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-21 19:25 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Willem de Bruijn, e1000-devel, netdev, HPA, Jesse Brandeburg,
	linux-kernel, Andi Kleen, Eliezer Tamir, Dave Miller,
	Pekka Riikonen

On 21/05/2013 20:51, Eric Dumazet wrote:
> On Tue, 2013-05-21 at 10:48 -0700, Eric Dumazet wrote:
>
>> We do not keep skb->dev information once a packet leaves the rcu
>> protected region.
>>
>> Once packet is queued to tcp input queues, skb->dev is NULL.
>
> This is done in tcp_v4_rcv() & tcp_v6_rcv()

So if we move the sk_mark_ll() call into the RCU-protected region,
we would not need the gen id in the skb.
We could save skb->dev and skb->queue_mapping along with the generation 
id in the socket.

Then we don't need to change the skb struct at all.
skb_mark_ll() can go away.
We would also call sk_mark_ll() in one central place, in the 
net_protocol->handler, instead of all over the place.

Or am I missing something?

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-21 19:25                 ` Eliezer Tamir
@ 2013-05-21 19:29                   ` Eric Dumazet
  -1 siblings, 0 replies; 69+ messages in thread
From: Eric Dumazet @ 2013-05-21 19:29 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Pekka Riikonen, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir

On Tue, 2013-05-21 at 22:25 +0300, Eliezer Tamir wrote:
> On 21/05/2013 20:51, Eric Dumazet wrote:
> > On Tue, 2013-05-21 at 10:48 -0700, Eric Dumazet wrote:
> >
> >> We do not keep skb->dev information once a packet leaves the rcu
> >> protected region.
> >>
> >> Once packet is queued to tcp input queues, skb->dev is NULL.
> >
> > This is done in tcp_v4_rcv() & tcp_v6_rcv()
> 
> So if we move calling sk_mark_ll() into the rcu protected region,
> We would not need the gen id in the skb.
> We could save skb->dev and skb->queue_mapping along with the generation 
> id in the socket.
> 
> then we don't need to change the skb struct at all.
> skb_mark_ll() can go away.
> we also call sk_mark_ll in one central place in net_protocol->handler 
> instead of all over the place.
> 
> or am I missing something?


You cannot keep a pointer to some object without taking a reference on
the object.

That's why I suggested using a napi identifier instead; it is safe
and cheap.
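
As a sketch of what the id-based marking could look like (sk_napi_id and
skb->napi_id are hypothetical field names here, not what the posted patch
uses, and napi_by_id_sketch() is the hypothetical hash lookup from earlier
in this thread):

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>

/* Driver rx path, under RCU: record which NAPI context saw the skb.
 * This is a plain integer copy, so no refcounting is needed.
 */
static inline void skb_mark_napi_id_sketch(struct sk_buff *skb,
					   struct napi_struct *napi)
{
	skb->napi_id = napi->napi_id;		/* hypothetical fields */
}

/* Protocol rx handler: copy the id into the socket. */
static inline void sk_mark_napi_id_sketch(struct sock *sk,
					  const struct sk_buff *skb)
{
	sk->sk_napi_id = skb->napi_id;		/* hypothetical field */
}

At busy-poll time the socket's id would be resolved back to a napi
context with an RCU hash lookup, so a napi that has been dismantled
simply fails the lookup instead of leaving a dangling pointer.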




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
@ 2013-05-21 19:29                   ` Eric Dumazet
  0 siblings, 0 replies; 69+ messages in thread
From: Eric Dumazet @ 2013-05-21 19:29 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Willem de Bruijn, e1000-devel, netdev, HPA, Jesse Brandeburg,
	linux-kernel, Andi Kleen, Eliezer Tamir, Dave Miller,
	Pekka Riikonen

On Tue, 2013-05-21 at 22:25 +0300, Eliezer Tamir wrote:
> On 21/05/2013 20:51, Eric Dumazet wrote:
> > On Tue, 2013-05-21 at 10:48 -0700, Eric Dumazet wrote:
> >
> >> We do not keep skb->dev information once a packet leaves the rcu
> >> protected region.
> >>
> >> Once packet is queued to tcp input queues, skb->dev is NULL.
> >
> > This is done in tcp_v4_rcv() & tcp_v6_rcv()
> 
> So if we move calling sk_mark_ll() into the rcu protected region,
> We would not need the gen id in the skb.
> We could save skb->dev and skb->queue_mapping along with the generation 
> id in the socket.
> 
> then we don't need to change the skb struct at all.
> skb_mark_ll() can go away.
> we also call sk_mark_ll in one central place in net_protocol->handler 
> instead of all over the place.
> 
> or am I missing something?


You cannot keep a pointer to some object without taking a reference on
the object.

That's why I suggested using a napi identifier instead; it is safe
and cheap.




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
  2013-05-21 18:49           ` David Miller
@ 2013-05-21 19:31               ` Pekka Riikonen
  0 siblings, 0 replies; 69+ messages in thread
From: Pekka Riikonen @ 2013-05-21 19:31 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, eliezer.tamir, linux-kernel, netdev,
	jesse.brandeburg, donald.c.skidmore, e1000-devel, willemb, andi,
	hpa, eliezer

On Tue, 21 May 2013, David Miller wrote:

> From: Pekka Riikonen <priikone@iki.fi>
> Date: Tue, 21 May 2013 19:02:19 +0200 (CEST)
>
>> On Tue, 21 May 2013, Eric Dumazet wrote:
>>
>> : > > Alternatively, use a napi_id instead of a pointer.
>> : >
>> : > I'm not sure I understand what you propose.
>> :
>> : Oh well.
>> :
>> : To get a pointer to a struct net_device, we can use ifindex, and do a
>> : rcu lookup into a hash table to get the net_device. We do not need
>> : {pointer,ifindex} but {ifindex} is enough
>> :
>> : My suggestion is to not have skb->skb_ref but skb->napi_index : Its safe
>> : to copy its value from skb->napi_index to sk->napi_index without
>> : refcounting.
>> :
>> : All NAPI need to get a unique napi_index, and be inserted in a hash
>> : table for immediate/fast lookup.
>> :
>> Maybe even that's not needed.  Couldn't skb->queue_mapping give the
>> correct NAPI instance in multiqueue nics?  The NAPI instance could be made
>> easily available from skb->dev.  In any case an index is much better than
>> a new pointer.
>
> Queue mapping is more volatile, and consider layered devices.
>
Yes, true.  The napi_index then is probably the way to go.  Main thing for 
me is that it doesn't increase skb size when in union with dma_cookie (skb 
has been growing lately).
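
For example, something along these lines (layout illustrative only, not
taken from the posted patches):

#include <linux/dmaengine.h>		/* dma_cookie_t */

/* Sketch: the new index sharing storage with dma_cookie, so that
 * sizeof(struct sk_buff) does not grow.
 */
struct skb_cookie_sketch {
	union {
		unsigned int	napi_index;	/* id of the napi that received the skb */
		dma_cookie_t	dma_cookie;	/* existing 32-bit field, same storage */
	};
};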

 	Pekka


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 1/4] net: implement support for low latency socket polling
@ 2013-05-21 19:31               ` Pekka Riikonen
  0 siblings, 0 replies; 69+ messages in thread
From: Pekka Riikonen @ 2013-05-21 19:31 UTC (permalink / raw)
  To: David Miller
  Cc: willemb, eliezer.tamir, e1000-devel, netdev, hpa,
	jesse.brandeburg, linux-kernel, andi, eliezer

On Tue, 21 May 2013, David Miller wrote:

> From: Pekka Riikonen <priikone@iki.fi>
> Date: Tue, 21 May 2013 19:02:19 +0200 (CEST)
>
>> On Tue, 21 May 2013, Eric Dumazet wrote:
>>
>> : > > Alternatively, use a napi_id instead of a pointer.
>> : >
>> : > I'm not sure I understand what you propose.
>> :
>> : Oh well.
>> :
>> : To get a pointer to a struct net_device, we can use ifindex, and do a
>> : rcu lookup into a hash table to get the net_device. We do not need
>> : {pointer,ifindex} but {ifindex} is enough
>> :
>> : My suggestion is to not have skb->skb_ref but skb->napi_index : Its safe
>> : to copy its value from skb->napi_index to sk->napi_index without
>> : refcounting.
>> :
>> : All NAPI need to get a unique napi_index, and be inserted in a hash
>> : table for immediate/fast lookup.
>> :
>> Maybe even that's not needed.  Couldn't skb->queue_mapping give the
>> correct NAPI instance in multiqueue nics?  The NAPI instance could be made
>> easily available from skb->dev.  In any case an index is much better than
>> a new pointer.
>
> Queue mapping is more volatile, and consider layered devices.
>
Yes, true.  The napi_index then is probably the way to go.  Main thing for 
me is that it doesn't increase skb size when in union with dma_cookie (skb 
has been growing lately).

 	Pekka


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-21 18:15       ` Ben Hutchings
  (?)
@ 2013-05-22  9:35       ` Eliezer Tamir
  -1 siblings, 0 replies; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-22  9:35 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Alex Rosenbaum, Dave Miller, linux-kernel, netdev,
	Jesse Brandeburg, Don Skidmore, e1000-devel, Willem de Bruijn,
	Andi Kleen, HPA, Eliezer Tamir



On 21/05/2013 21:15, Ben Hutchings wrote:
> On Tue, 2013-05-21 at 15:29 +0300, Eliezer Tamir wrote:
>> Many of the questions you asked are covered in our RFC cover letter, but
>> I will touch them briefly
>>
>> On 21/05/2013 15:06, Alex Rosenbaum wrote:
>>> 1. It seem this patch does not cover epoll/select and such IO muxing APIs?
>>
>> We are thinking about how to implement epoll support as one of the next
>> steps.
>>
>> What benchmarks are you using to test poll/select/epoll?
> [...]
>
> I raised this issue back at Plumbers and I thought Jesse said poll() was
> covered.  It's likely to be a bit of a toy without that.  Also, UDP?

The current patchset covers poll/select for UDP.
Adding poll/select for TCP should be simple; I will try to include it in 
the next version.

Epoll should see a gain from the above in the case where all the polled 
file descriptors are associated with the same device queue.
A fuller implementation, in epoll itself, would be needed to cover 
multiple queues/devices.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-21 13:15       ` Alex Rosenbaum
  (?)
  (?)
@ 2013-05-23 11:06       ` Eliezer Tamir
  2013-05-23 11:45         ` Alex Rosenbaum
  -1 siblings, 1 reply; 69+ messages in thread
From: Eliezer Tamir @ 2013-05-23 11:06 UTC (permalink / raw)
  To: Alex Rosenbaum
  Cc: Dave Miller, linux-kernel, netdev, Jesse Brandeburg,
	Don Skidmore, e1000-devel, Willem de Bruijn, Andi Kleen, HPA,
	Eliezer Tamir

On 21/05/2013 16:15, Alex Rosenbaum wrote:
> On 5/21/2013 3:29 PM, Eliezer Tamir wrote:
>> What benchmarks are you using to test poll/select/epoll?
> for epoll/select latency tests we are using sockperf as performance
> latency tool: https://code.google.com/p/sockperf/
> It is a client-server based tool and it supported ping-pong, throughput,
> and under-load test type.
> For epoll, you will need to define a 'feedfile' ("-f filepathname")
> which has a list of TCP and/or UDP socket and defined your IO mux type
> ("-F epoll").

Thank you!
This is very helpful.

With sockperf I can directly observe how poll/select/epoll are behaving.
I can see some improvement in all of them, but clearly more work is 
needed here.

-Eliezer

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v3 net-next 0/4] net: low latency Ethernet device polling
  2013-05-23 11:06       ` Eliezer Tamir
@ 2013-05-23 11:45         ` Alex Rosenbaum
  0 siblings, 0 replies; 69+ messages in thread
From: Alex Rosenbaum @ 2013-05-23 11:45 UTC (permalink / raw)
  To: Eliezer Tamir
  Cc: Willem de Bruijn, Don, e1000-devel, netdev, HPA,
	Jesse Brandeburg, linux-kernel, Andi Kleen, Eliezer Tamir,
	Dave Miller


On 5/23/2013 2:06 PM, Eliezer Tamir wrote:
> On 21/05/2013 16:15, Alex Rosenbaum wrote:
>> On 5/21/2013 3:29 PM, Eliezer Tamir wrote:
>>> What benchmarks are you using to test poll/select/epoll?
>> for epoll/select latency tests we are using sockperf as performance
>> latency tool: https://code.google.com/p/sockperf/
>> It is a client-server based tool and it supported ping-pong, throughput,
>> and under-load test type.
>> For epoll, you will need to define a 'feedfile' ("-f filepathname")
>> which has a list of TCP and/or UDP socket and defined your IO mux type
>> ("-F epoll").
>
> Thank you!
> This is very helpful.
>
> With sockperf i can directly observe how poll/select/epoll are behaving.
> I can see some improvement in all of them but clearly more work is 
> needed here.
>
> -Eliezer

I'm happy you got it working.

If you add '--timeout=0' to the sockperf epoll command line you can see 
non-blocking epoll application behavior, and then you will get some 
improved latency due to the application thread not sleeping.

This is not a full-blown LLS epoll solution, but it is halfway there.


Alex


^ permalink raw reply	[flat|nested] 69+ messages in thread

end of thread, other threads:[~2013-05-23 11:45 UTC | newest]

Thread overview: 69+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-20 10:15 [PATCH v3 net-next 0/4] net: low latency Ethernet device polling Eliezer Tamir
2013-05-20 10:15 ` Eliezer Tamir
2013-05-20 10:16 ` [PATCH v3 net-next 1/4] net: implement support for low latency socket polling Eliezer Tamir
2013-05-20 10:16   ` Eliezer Tamir
2013-05-20 15:29   ` Eric Dumazet
2013-05-20 19:40     ` David Miller
2013-05-21  7:28     ` Eliezer Tamir
2013-05-21 13:28       ` Eric Dumazet
2013-05-21 17:02         ` Pekka Riikonen
2013-05-21 17:48           ` Eric Dumazet
2013-05-21 17:51             ` Eric Dumazet
2013-05-21 17:51               ` Eric Dumazet
2013-05-21 19:25               ` Eliezer Tamir
2013-05-21 19:25                 ` Eliezer Tamir
2013-05-21 19:29                 ` Eric Dumazet
2013-05-21 19:29                   ` Eric Dumazet
2013-05-21 18:49           ` David Miller
2013-05-21 19:31             ` Pekka Riikonen
2013-05-21 19:31               ` Pekka Riikonen
2013-05-20 10:16 ` [PATCH v3 net-next 2/4] tcp: add TCP support for low latency receive poll Eliezer Tamir
2013-05-20 10:16   ` Eliezer Tamir
2013-05-20 13:49   ` Eric Dumazet
2013-05-20 14:59     ` Eliezer Tamir
2013-05-20 14:59       ` Eliezer Tamir
2013-05-20 10:16 ` [PATCH v3 net-next 3/4] ixgbe: Add support for ndo_ll_poll Eliezer Tamir
2013-05-20 10:16   ` Eliezer Tamir
2013-05-20 20:20   ` Or Gerlitz
2013-05-20 20:20     ` Or Gerlitz
2013-05-20 20:33     ` Andi Kleen
2013-05-20 20:42       ` Or Gerlitz
2013-05-20 21:01         ` Andi Kleen
2013-05-21  6:23           ` Or Gerlitz
2013-05-21  6:23             ` Or Gerlitz
2013-05-21  6:54     ` Eliezer Tamir
2013-05-21  6:54       ` Eliezer Tamir
2013-05-21  7:06       ` Eilon Greenstein
2013-05-21  7:06         ` Eilon Greenstein
2013-05-21  7:14         ` David Miller
2013-05-21  8:24           ` Or Gerlitz
2013-05-21  8:24             ` Or Gerlitz
2013-05-21  8:31             ` Eliezer Tamir
2013-05-21  8:31               ` Eliezer Tamir
2013-05-21  8:39             ` David Miller
2013-05-21  8:42               ` Eliezer Tamir
2013-05-21  8:42                 ` Eliezer Tamir
2013-05-21  8:43               ` Or Gerlitz
2013-05-21  8:43                 ` Or Gerlitz
2013-05-21 10:27                 ` Eliezer Tamir
2013-05-21 10:41                   ` Or Gerlitz
2013-05-21  8:21         ` Or Gerlitz
2013-05-21  8:21           ` Or Gerlitz
2013-05-21  8:28           ` Eilon Greenstein
2013-05-21  8:28             ` Eilon Greenstein
2013-05-21 14:19       ` Willem de Bruijn
2013-05-21 14:19         ` Willem de Bruijn
2013-05-20 10:16 ` [PATCH v3 net-next 4/4] ixgbe: add extra stats " Eliezer Tamir
2013-05-20 10:16   ` Eliezer Tamir
2013-05-21 12:06 ` [PATCH v3 net-next 0/4] net: low latency Ethernet device polling Alex Rosenbaum
2013-05-21 12:29   ` Eliezer Tamir
2013-05-21 13:15     ` Alex Rosenbaum
2013-05-21 13:15       ` Alex Rosenbaum
2013-05-21 13:31       ` Eric Dumazet
2013-05-23 11:06       ` Eliezer Tamir
2013-05-23 11:45         ` Alex Rosenbaum
2013-05-21 14:30     ` Willem de Bruijn
2013-05-21 14:30       ` Willem de Bruijn
2013-05-21 18:15     ` Ben Hutchings
2013-05-21 18:15       ` Ben Hutchings
2013-05-22  9:35       ` Eliezer Tamir
