* [RFC][PATCH 0/4] RFS hardware acceleration
@ 2010-09-20 19:01 Ben Hutchings
  2010-09-20 19:08 ` [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices Ben Hutchings
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Ben Hutchings @ 2010-09-20 19:01 UTC (permalink / raw)
  To: Tom Herbert; +Cc: netdev, linux-net-drivers

Tom,

This patch series extends RFS to use hardware RX filters where
available.  Depending on the number of hardware RX queues and their
IRQs' affinity, this should reduce the need for IPIs or at least get
packets delivered to the right NUMA node.

I've implemented the driver side of this for our hardware, though I
don't know whether you have any of that to test on.  I would be very
interested to know how much this can help in the sort of cases where you
use RFS.

Ben.

Ben Hutchings (4):
  IRQ: IRQ groups for multiqueue devices
  net: RPS: Enable hardware acceleration
  sfc: Implement RFS acceleration
  sfc/RFS/irq_group debug output

 drivers/net/sfc/efx.c     |   49 +++++++++++---
 drivers/net/sfc/efx.h     |    9 +++
 drivers/net/sfc/filter.c  |  109 +++++++++++++++++++++++++++++
 include/linux/irq.h       |   52 ++++++++++++++
 include/linux/netdevice.h |   29 +++++++-
 kernel/irq/manage.c       |  170 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c            |   88 ++++++++++++++++++++++--
 7 files changed, 488 insertions(+), 18 deletions(-)

-- 
1.7.2.1


-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices
  2010-09-20 19:01 [RFC][PATCH 0/4] RFS hardware acceleration Ben Hutchings
@ 2010-09-20 19:08 ` Ben Hutchings
  2010-09-20 21:27   ` Thomas Gleixner
  2010-09-20 19:10 ` [RFC][PATCH 2/4] net: RPS: Enable hardware acceleration Ben Hutchings
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Ben Hutchings @ 2010-09-20 19:08 UTC (permalink / raw)
  To: Tom Herbert; +Cc: netdev, linux-net-drivers, linux-kernel

When initiating I/O on multiqueue devices, we usually want to select a
queue for which the response will be handled on the same or a nearby
CPU.  IRQ groups hold a mapping of CPU to IRQ which will be updated
based on the inverse of IRQ CPU-affinities plus CPU topology
information.
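
For illustration only (not part of this patch), a multiqueue driver might
use the interface roughly as follows; n_queues, msix[] and queue_index are
hypothetical driver-local names:

	/* At setup time, after pci_enable_msix() or similar has assigned
	 * one vector per queue, build the group in queue order:
	 */
	struct irq_group *group = alloc_irq_group(n_queues, GFP_KERNEL);

	if (!group)
		return -ENOMEM;
	for (i = 0; i < n_queues; i++)
		irq_group_add(group, msix[i].vector);

	/* On the I/O submission path, pick the queue whose IRQ is
	 * handled closest to the current CPU:
	 */
	queue_index = irq_group_get_index(group, raw_smp_processor_id());
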
---
 include/linux/irq.h |   52 ++++++++++++++++++
 kernel/irq/manage.c |  149 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 201 insertions(+), 0 deletions(-)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c03243a..bbddd5f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -196,6 +196,8 @@ struct irq_desc {
 #ifdef CONFIG_SMP
 	cpumask_var_t		affinity;
 	const struct cpumask	*affinity_hint;
+	struct irq_group	*group;
+	u16			group_index;
 	unsigned int		node;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_var_t		pending_mask;
@@ -498,6 +500,33 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
 #endif
 }
 
+/**
+ * struct irq_group - IRQ group for multiqueue devices
+ * @closest: For each CPU, the index and distance to the closest IRQ,
+ *	based on affinity masks
+ * @size: Size of the group
+ * @used: Number of IRQs currently included in the group
+ * @irq: Descriptors for IRQs in the group
+ */
+struct irq_group {
+	struct {
+		u16	index;
+		u16	dist;
+	} closest[NR_CPUS];
+	unsigned int	size, used;
+	struct irq_desc *irq[0];
+};
+#define IRQ_CPU_DIST_INF 0xffff
+
+extern struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags);
+extern void free_irq_group(struct irq_group *group);
+extern void irq_group_add(struct irq_group *group, unsigned int irq);
+
+static inline u16 irq_group_get_index(struct irq_group *group, int cpu)
+{
+	return group->closest[cpu].index;
+}
+
 #else /* !CONFIG_SMP */
 
 static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
@@ -519,6 +548,29 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
 				   struct irq_desc *new_desc)
 {
 }
+
+struct irq_group {
+};
+
+static inline struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags)
+{
+	static struct irq_group dummy;
+	return &dummy;
+}
+
+static inline void free_irq_group(struct irq_group *group)
+{
+}
+
+static inline void irq_group_add(struct irq_group *group, unsigned int irq)
+{
+}
+
+static inline u16 irq_group_get_index(struct irq_group *group, int cpu)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_SMP */
 
 #endif /* _LINUX_IRQ_H */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9..3f2b1a9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -100,6 +100,154 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 	}
 }
 
+static void irq_group_update_neigh(struct irq_group *group,
+				   const struct cpumask *mask,
+				   u16 index, u16 dist)
+{
+	int cpu;
+
+	for_each_cpu(cpu, mask) {
+		if (dist < group->closest[cpu].dist) {
+			group->closest[cpu].index = index;
+			group->closest[cpu].dist = dist;
+		}
+	}
+}
+
+static bool irq_group_copy_neigh(struct irq_group *group, int cpu,
+				 const struct cpumask *mask, u16 dist)
+{
+	int neigh;
+
+	for_each_cpu(neigh, mask) {
+		if (group->closest[neigh].dist <= dist) {
+			group->closest[cpu].index = group->closest[neigh].index;
+			group->closest[cpu].dist = dist;
+			return true;
+		}
+	}
+	return false;
+}
+
+/* Update the per-CPU closest IRQs following a change of affinity */
+static void
+irq_update_group(struct irq_desc *desc, const struct cpumask *affinity)
+{
+	struct irq_group *group = desc->group;
+	unsigned index = desc->group_index;
+	int cpu;
+
+	if (!group)
+		return;
+
+	/* Invalidate old distances to this IRQ */
+	for_each_online_cpu(cpu)
+		if (group->closest[cpu].index == index)
+			group->closest[cpu].dist = IRQ_CPU_DIST_INF;
+
+	/*
+	 * Set this as the closest IRQ for all CPUs in the affinity mask,
+	 * plus the following CPUs if they don't have a closer IRQ:
+	 * - all other threads in the same core (distance 1);
+	 * - all other cores in the same package (distance 2);
+	 * - all other packages in the same NUMA node (distance 3).
+	 */
+	for_each_cpu(cpu, affinity) {
+		group->closest[cpu].index = index;
+		group->closest[cpu].dist = 0;
+		irq_group_update_neigh(group, topology_thread_cpumask(cpu),
+				       index, 1);
+		irq_group_update_neigh(group, topology_core_cpumask(cpu),
+				       index, 2);
+		irq_group_update_neigh(group, cpumask_of_node(cpu_to_node(cpu)),
+				       index, 3);
+	}
+
+	/* Find new closest IRQ for any CPUs left with invalid distances */
+	for_each_online_cpu(cpu) {
+		if (!(group->closest[cpu].index == index &&
+		      group->closest[cpu].dist == IRQ_CPU_DIST_INF))
+			continue;
+		if (irq_group_copy_neigh(group, cpu,
+					 topology_thread_cpumask(cpu), 1))
+			continue;
+		if (irq_group_copy_neigh(group, cpu,
+					 topology_core_cpumask(cpu), 2))
+			continue;
+		if (irq_group_copy_neigh(group, cpu,
+					 cpumask_of_node(cpu_to_node(cpu)), 3))
+			continue;
+		/* We could continue into NUMA node distances, but for now
+		 * we give up. */
+	}
+}
+
+/**
+ *	alloc_irq_group - allocate IRQ group
+ *	@size:		Size of the group
+ *	@flags:		Allocation flags e.g. %GFP_KERNEL
+ */
+struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags)
+{
+	struct irq_group *group =
+		kzalloc(sizeof(*group) + size * sizeof(group->irq[0]), flags);
+	int cpu;
+
+	if (!group)
+		return NULL;
+
+	/* Initially assign CPUs to IRQs on a rota */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		group->closest[cpu].index = cpu % size;
+		group->closest[cpu].dist = IRQ_CPU_DIST_INF;
+	}
+
+	group->size = size;
+	return group;
+}
+EXPORT_SYMBOL(alloc_irq_group);
+
+/**
+ *	free_irq_group - free IRQ group
+ *	@group:		IRQ group allocated with alloc_irq_group(), or %NULL
+ */
+void free_irq_group(struct irq_group *group)
+{
+	struct irq_desc *desc;
+	unsigned int i;
+
+	if (!group)
+		return;
+
+	/* Remove all descriptors from the group */
+	for (i = 0; i < group->used; i++) {
+		desc = group->irq[i];
+		BUG_ON(desc->group != group || desc->group_index != i);
+		desc->group = NULL;
+	}
+
+	kfree(group);
+}
+EXPORT_SYMBOL(free_irq_group);
+
+/**
+ *	irq_group_add - add IRQ to a group
+ *	@group:		IRQ group allocated with alloc_irq_group()
+ *	@irq:		Interrupt to add to group
+ */
+void irq_group_add(struct irq_group *group, unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	BUG_ON(desc->group);
+	BUG_ON(group->used >= group->size);
+
+	desc->group = group;
+	desc->group_index = group->used;
+	group->irq[group->used++] = desc;
+}
+EXPORT_SYMBOL(irq_group_add);
+
 /**
  *	irq_set_affinity - Set the irq affinity of a given irq
  *	@irq:		Interrupt to set affinity
@@ -134,6 +282,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	}
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
+	irq_update_group(desc, cpumask);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
-- 
1.7.2.1



-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [RFC][PATCH 2/4] net: RPS: Enable hardware acceleration
  2010-09-20 19:01 [RFC][PATCH 0/4] RFS hardware acceleration Ben Hutchings
  2010-09-20 19:08 ` [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices Ben Hutchings
@ 2010-09-20 19:10 ` Ben Hutchings
  2010-09-20 19:11 ` [RFC][PATCH 3/4] sfc: Implement RFS acceleration Ben Hutchings
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Ben Hutchings @ 2010-09-20 19:10 UTC (permalink / raw)
  To: Tom Herbert; +Cc: netdev, linux-net-drivers

Allow drivers for multiqueue hardware with flow filter tables to
accelerate RFS.  The driver must:

1. Set net_device::rx_irq_group to an irq_group of the RX completion
IRQs (in queue order).  This will provide a mapping from CPUs to the
queues for which completions are handled nearest to them.

2. Implement net_device_ops::ndo_rx_flow_steer.  This operation adds
or replaces a filter steering the given flow to the given RX queue, if
possible.

3. Periodically remove filters for which rps_may_expire_flow() returns
true.
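
A rough driver-side sketch of these three steps follows; my_insert_filter(),
my_remove_filter() and my_flow_id[] are made-up helpers, while rx_irq_group,
ndo_rx_flow_steer and rps_may_expire_flow() are the interfaces added by this
series:

	/* 1. At probe time, after allocating one RX completion IRQ per
	 *    queue, build the IRQ group in queue order:
	 */
	net_dev->rx_irq_group = alloc_irq_group(n_rx_queues, GFP_KERNEL);
	for (i = 0; i < n_rx_queues; i++)
		irq_group_add(net_dev->rx_irq_group, rx_irq[i]);

	/* 2. Insert or replace a hardware filter steering the flow to
	 *    rxq_index; remember flow_id for the expiry check later.
	 */
	static int my_rx_flow_steer(struct net_device *dev,
				    const struct sk_buff *skb,
				    u16 rxq_index, u32 flow_id)
	{
		int filter_id = my_insert_filter(dev, skb, rxq_index);

		if (filter_id >= 0)
			my_flow_id[filter_id] = flow_id;
		return filter_id;	/* filter ID or negative error */
	}

	/* 3. Periodically, for each installed filter: */
	if (rps_may_expire_flow(dev, rxq_index, my_flow_id[filter_id],
				filter_id))
		my_remove_filter(dev, filter_id);
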
---
The heuristic in rps_may_expire_flow() is quite possibly bogus.  I'm not
even sure whether expiry of flows should be triggered by the driver or
from the RPS/RFS core.  Any better ideas on how to do this?
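
(For reference, the heuristic as implemented: a filter may expire once the
backlog head on the flow's assigned CPU has advanced by roughly ten times
the flow table size since this flow last enqueued a packet, i.e. the flow
looks idle relative to other traffic on that CPU.)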

Ben.

 include/linux/netdevice.h |   29 +++++++++++++--
 net/core/dev.c            |   88 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f7f1302..897118f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -524,14 +524,16 @@ struct rps_map {
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
 
 /*
- * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
- * tail pointer for that CPU's input queue at the time of last enqueue.
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
+ * tail pointer for that CPU's input queue at the time of last enqueue, and
+ * a hardware filter index.
  */
 struct rps_dev_flow {
 	u16 cpu;
-	u16 fill;
+	u16 filter;
 	unsigned int last_qtail;
 };
+#define RPS_NO_FILTER 0xffff
 
 /*
  * The rps_dev_flow_table structure contains a table of flow mappings.
@@ -581,6 +583,9 @@ static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
 
 extern struct rps_sock_flow_table *rps_sock_flow_table;
 
+extern bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+				u32 flow_id, u16 filter_id);
+
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
 	struct rps_map *rps_map;
@@ -701,6 +706,13 @@ struct netdev_rx_queue {
  * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
  *			  struct nlattr *port[]);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
+ *
+ *	RFS acceleration.
+ * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
+ *			    u16 rxq_index, u32 flow_id);
+ *	Set hardware filter for RFS.  rxq_index is the target queue index;
+ *	flow_id is a flow ID to be passed to rps_may_expire_flow() later.
+ *	Return the filter ID on success, or a negative error code.
  */
 #define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
@@ -773,6 +785,12 @@ struct net_device_ops {
 	int			(*ndo_fcoe_get_wwn)(struct net_device *dev,
 						    u64 *wwn, int type);
 #endif
+#ifdef CONFIG_RPS
+	int			(*ndo_rx_flow_steer)(struct net_device *dev,
+						     const struct sk_buff *skb,
+						     u16 rxq_index,
+						     u32 flow_id);
+#endif
 };
 
 /*
@@ -978,6 +996,11 @@ struct net_device {
 
 	/* Number of RX queues allocated at alloc_netdev_mq() time  */
 	unsigned int		num_rx_queues;
+
+	/* IRQ group for RX completion interrupts, indexed by RX queue
+	 * number.  Assigned by driver.  This must only be set if the
+	 * ndo_rx_flow_steer operation is defined. */
+	struct irq_group	*rx_irq_group;
 #endif
 
 	rx_handler_func_t	*rx_handler;
diff --git a/net/core/dev.c b/net/core/dev.c
index 2c7934f..50301eb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2335,6 +2335,49 @@ EXPORT_SYMBOL(__skb_get_rxhash);
 struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
 
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+	    struct rps_dev_flow *rflow, u16 next_cpu)
+{
+	struct netdev_rx_queue *rxqueue;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_dev_flow *old_rflow;
+	u32 flow_id;
+	u16 rxq_index;
+	u16 tcpu;
+	int rc;
+
+	tcpu = rflow->cpu = next_cpu;
+	if (tcpu == RPS_NO_CPU)
+		return rflow;
+
+	/* Should we steer this flow to a different hardware queue? */
+	if (!skb_rx_queue_recorded(skb) || !dev->rx_irq_group)
+		goto out;
+	rxq_index = irq_group_get_index(dev->rx_irq_group, next_cpu);
+	if (rxq_index == skb_get_rx_queue(skb))
+		goto out;
+
+	rxqueue = dev->_rx + rxq_index;
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
+	if (!flow_table)
+		goto out;
+	flow_id = skb->rxhash & flow_table->mask;
+	rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id);
+	if (rc < 0)
+		goto out;
+	old_rflow = rflow;
+	rflow = &flow_table->flows[flow_id];
+	rflow->cpu = next_cpu;
+	rflow->filter = rc;
+	if (old_rflow->filter == rflow->filter)
+		old_rflow->filter = RPS_NO_FILTER;
+
+out:
+	rflow->last_qtail = per_cpu(softnet_data, tcpu).input_queue_head;
+	return rflow;
+}
+
 /*
  * get_rps_cpu is called from netif_receive_skb and returns the target
  * CPU from the RPS map of the receiving queue for a given skb.
@@ -2404,12 +2447,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		if (unlikely(tcpu != next_cpu) &&
 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-		      rflow->last_qtail)) >= 0)) {
-			tcpu = rflow->cpu = next_cpu;
-			if (tcpu != RPS_NO_CPU)
-				rflow->last_qtail = per_cpu(softnet_data,
-				    tcpu).input_queue_head;
-		}
+		      rflow->last_qtail)) >= 0))
+			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+
 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
 			*rflowp = rflow;
 			cpu = tcpu;
@@ -2430,6 +2470,42 @@ done:
 	return cpu;
 }
 
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+			 u32 flow_id, u16 filter_id)
+{
+	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_dev_flow *rflow;
+	bool expire = true;
+	int cpu;
+
+	rcu_read_lock();
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
+	if (flow_table && flow_id <= flow_table->mask) {
+		rflow = &flow_table->flows[flow_id];
+		cpu = ACCESS_ONCE(rflow->cpu);
+		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+			   rflow->last_qtail) <
+		     (int)(10 * flow_table->mask)))
+			expire = false;
+	}
+	rcu_read_unlock();
+	return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
 /* Called from hardirq (IPI) context */
 static void rps_trigger_softirq(void *data)
 {
-- 
1.7.2.1



-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [RFC][PATCH 3/4] sfc: Implement RFS acceleration
  2010-09-20 19:01 [RFC][PATCH 0/4] RFS hardware acceleration Ben Hutchings
  2010-09-20 19:08 ` [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices Ben Hutchings
  2010-09-20 19:10 ` [RFC][PATCH 2/4] net: RPS: Enable hardware acceleration Ben Hutchings
@ 2010-09-20 19:11 ` Ben Hutchings
  2010-09-20 19:12 ` [RFC][PATCH 4/4] sfc/RFS/irq_group debug output Ben Hutchings
  2010-09-20 19:36 ` [RFC][PATCH 0/4] RFS hardware acceleration Tom Herbert
  4 siblings, 0 replies; 12+ messages in thread
From: Ben Hutchings @ 2010-09-20 19:11 UTC (permalink / raw)
  To: Tom Herbert; +Cc: netdev, linux-net-drivers

---
This depends on today's patch series for net-next-2.6.

Ben.

 drivers/net/sfc/efx.c    |   49 ++++++++++++++++++----
 drivers/net/sfc/efx.h    |    9 ++++
 drivers/net/sfc/filter.c |  100 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 149 insertions(+), 9 deletions(-)

diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 8a51c41..aea6283 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -127,6 +127,8 @@ static int napi_weight = 64;
  * monitor.  On Falcon-based NICs, this will:
  * - Check the on-board hardware monitor;
  * - Poll the link state and reconfigure the hardware as necessary.
+ * If RFS is enabled, this will scan part of the RX IP filter table and
+ * remove filters for inactive flows.
  */
 unsigned int efx_monitor_interval = 1 * HZ;
 
@@ -1174,7 +1176,7 @@ static int efx_wanted_channels(void)
 /* Probe the number and type of interrupts we are able to obtain, and
  * the resulting numbers of channels and RX queues.
  */
-static void efx_probe_interrupts(struct efx_nic *efx)
+static int efx_probe_interrupts(struct efx_nic *efx)
 {
 	int max_channels =
 		min_t(int, efx->type->phys_addr_channels, EFX_MAX_CHANNELS);
@@ -1216,6 +1218,17 @@ static void efx_probe_interrupts(struct efx_nic *efx)
 				efx->n_tx_channels = efx->n_channels;
 				efx->n_rx_channels = efx->n_channels;
 			}
+#ifdef CONFIG_RPS
+			efx->net_dev->rx_irq_group =
+				alloc_irq_group(efx->n_rx_channels, GFP_KERNEL);
+			if (!efx->net_dev->rx_irq_group) {
+				pci_disable_msix(efx->pci_dev);
+				return -ENOMEM;
+			}
+			for (i = 0; i < efx->n_rx_channels; i++)
+				irq_group_add(efx->net_dev->rx_irq_group,
+					      xentries[i].vector);
+#endif
 			for (i = 0; i < n_channels; i++)
 				efx_get_channel(efx, i)->irq =
 					xentries[i].vector;
@@ -1249,6 +1262,8 @@ static void efx_probe_interrupts(struct efx_nic *efx)
 		efx->n_tx_channels = 1;
 		efx->legacy_irq = efx->pci_dev->irq;
 	}
+
+	return 0;
 }
 
 static void efx_remove_interrupts(struct efx_nic *efx)
@@ -1258,6 +1273,10 @@ static void efx_remove_interrupts(struct efx_nic *efx)
 	/* Remove MSI/MSI-X interrupts */
 	efx_for_each_channel(channel, efx)
 		channel->irq = 0;
+#ifdef CONFIG_RPS
+	free_irq_group(efx->net_dev->rx_irq_group);
+	efx->net_dev->rx_irq_group = NULL;
+#endif
 	pci_disable_msi(efx->pci_dev);
 	pci_disable_msix(efx->pci_dev);
 
@@ -1307,7 +1326,9 @@ static int efx_probe_nic(struct efx_nic *efx)
 
 	/* Determine the number of channels and queues by trying to hook
 	 * in MSI-X interrupts. */
-	efx_probe_interrupts(efx);
+	rc = efx_probe_interrupts(efx);
+	if (rc)
+		goto fail;
 
 	if (efx->n_channels > 1)
 		get_random_bytes(&efx->rx_hash_key, sizeof(efx->rx_hash_key));
@@ -1322,6 +1343,10 @@ static int efx_probe_nic(struct efx_nic *efx)
 	efx_init_irq_moderation(efx, tx_irq_mod_usec, rx_irq_mod_usec, true);
 
 	return 0;
+
+fail:
+	efx->type->remove(efx);
+	return rc;
 }
 
 static void efx_remove_nic(struct efx_nic *efx)
@@ -1419,13 +1444,15 @@ static void efx_start_all(struct efx_nic *efx)
 	if (efx->reset_pending != RESET_TYPE_NONE)
 		efx_mcdi_mode_poll(efx);
 
-	/* Start the hardware monitor if there is one. Otherwise (we're link
-	 * event driven), we have to poll the PHY because after an event queue
-	 * flush, we could have a missed a link state change */
-	if (efx->type->monitor != NULL) {
+	/* Start the periodic monitor if necessary */
+	if (efx->type->monitor || efx_filter_rfs_enabled())
 		queue_delayed_work(efx->workqueue, &efx->monitor_work,
 				   efx_monitor_interval);
-	} else {
+
+	/* If we normally rely on link state events, we have to poll
+	 * the PHY because after an event queue flush, we could have a
+	 * missed a link state change */
+	if (!efx->type->monitor) {
 		mutex_lock(&efx->mac_lock);
 		if (efx->phy_op->poll(efx))
 			efx_link_status_changed(efx);
@@ -1556,17 +1583,18 @@ static void efx_monitor(struct work_struct *data)
 	netif_vdbg(efx, timer, efx->net_dev,
 		   "hardware monitor executing on CPU %d\n",
 		   raw_smp_processor_id());
-	BUG_ON(efx->type->monitor == NULL);
 
 	/* If the mac_lock is already held then it is likely a port
 	 * reconfiguration is already in place, which will likely do
 	 * most of the work of monitor() anyway. */
-	if (mutex_trylock(&efx->mac_lock)) {
+	if (efx->type->monitor && mutex_trylock(&efx->mac_lock)) {
 		if (efx->port_enabled)
 			efx->type->monitor(efx);
 		mutex_unlock(&efx->mac_lock);
 	}
 
+	efx_filter_rfs_expire(efx);
+
 	queue_delayed_work(efx->workqueue, &efx->monitor_work,
 			   efx_monitor_interval);
 }
@@ -1849,6 +1877,9 @@ static const struct net_device_ops efx_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = efx_netpoll,
 #endif
+#ifdef CONFIG_RPS
+	.ndo_rx_flow_steer	= efx_filter_rfs,
+#endif
 };
 
 static void efx_update_name(struct efx_nic *efx)
diff --git a/drivers/net/sfc/efx.h b/drivers/net/sfc/efx.h
index f502b14..88a43f1 100644
--- a/drivers/net/sfc/efx.h
+++ b/drivers/net/sfc/efx.h
@@ -77,6 +77,15 @@ extern int efx_filter_remove_filter(struct efx_nic *efx,
 extern void efx_filter_table_clear(struct efx_nic *efx,
 				   enum efx_filter_table_id table_id,
 				   enum efx_filter_priority priority);
+#ifdef CONFIG_RPS
+extern int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+			  u16 rxq_index, u32 flow_id);
+extern void efx_filter_rfs_expire(struct efx_nic *efx);
+#define efx_filter_rfs_enabled() 1
+#else
+static inline void efx_filter_rfs_expire(struct efx_nic *efx) {}
+#define efx_filter_rfs_enabled() 0
+#endif
 
 /* Channels */
 extern void efx_process_channel_now(struct efx_channel *channel);
diff --git a/drivers/net/sfc/filter.c b/drivers/net/sfc/filter.c
index abc884d..349b5d1 100644
--- a/drivers/net/sfc/filter.c
+++ b/drivers/net/sfc/filter.c
@@ -7,6 +7,8 @@
  * by the Free Software Foundation, incorporated herein by reference.
  */
 
+#include <net/ip.h>
+
 #include "efx.h"
 #include "filter.h"
 #include "io.h"
@@ -33,6 +35,10 @@ struct efx_filter_state {
 	spinlock_t	lock;
 	struct efx_filter_table table[EFX_FILTER_TABLE_COUNT];
 	unsigned	search_depth[EFX_FILTER_TYPE_COUNT];
+#ifdef CONFIG_RPS
+	u32		*rps_flow_id;
+	unsigned	rps_expire_index;
+#endif
 };
 
 /* The filter hash function is LFSR polynomial x^16 + x^3 + 1 of a 32-bit
@@ -397,6 +403,13 @@ int efx_probe_filters(struct efx_nic *efx)
 	spin_lock_init(&state->lock);
 
 	if (efx_nic_rev(efx) >= EFX_REV_FALCON_B0) {
+#ifdef CONFIG_RPS
+		state->rps_flow_id = kcalloc(FR_BZ_RX_FILTER_TBL0_ROWS,
+					     sizeof(*state->rps_flow_id),
+					     GFP_KERNEL);
+		if (!state->rps_flow_id)
+			goto fail;
+#endif
 		table = &state->table[EFX_FILTER_TABLE_RX_IP];
 		table->offset = FR_BZ_RX_FILTER_TBL0;
 		table->size = FR_BZ_RX_FILTER_TBL0_ROWS;
@@ -441,5 +454,92 @@ void efx_remove_filters(struct efx_nic *efx)
 		kfree(state->table[table_id].used_bitmap);
 		vfree(state->table[table_id].spec);
 	}
+#ifdef CONFIG_RPS
+	kfree(state->rps_flow_id);
+#endif
 	kfree(state);
 }
+
+#ifdef CONFIG_RPS
+
+int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+		   u16 rxq_index, u32 flow_id)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	struct efx_filter_state *state = efx->filter_state;
+	struct efx_filter_spec spec;
+	const struct iphdr *ip;
+	const __be16 *ports;
+	int nhoff;
+	int rc;
+
+	nhoff = skb_network_offset(skb);
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return -EPROTONOSUPPORT;
+
+	/* RFS must validate the IP header length before calling us */
+	EFX_BUG_ON_PARANOID(!pskb_may_pull(skb, nhoff + sizeof(*ip)));
+	ip = (const struct iphdr *)(skb->data + nhoff);
+	if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+		return -EPROTONOSUPPORT;
+	EFX_BUG_ON_PARANOID(!pskb_may_pull(skb, nhoff + 4 * ip->ihl + 4));
+	ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
+
+	switch (ip->protocol) {
+	case IPPROTO_TCP:
+		efx_filter_set_rx_tcp_full(&spec,
+					   ntohl(ip->saddr), ntohs(ports[0]),
+					   ntohl(ip->daddr), ntohs(ports[1]));
+		break;
+	case IPPROTO_UDP:
+		efx_filter_set_rx_udp_full(&spec,
+					   ntohl(ip->saddr), ntohs(ports[0]),
+					   ntohl(ip->daddr), ntohs(ports[1]));
+		break;
+	default:
+		return -EPROTONOSUPPORT;
+	}
+	spec.priority = EFX_FILTER_PRI_HINT;
+	spec.dmaq_id = rxq_index;
+
+	rc = efx_filter_insert_filter(efx, &spec, true);
+	if (rc >= 0)
+		state->rps_flow_id[rc] = flow_id;
+
+	return rc;
+}
+
+void efx_filter_rfs_expire(struct efx_nic *efx)
+{
+	struct efx_filter_state *state = efx->filter_state;
+	struct efx_filter_table *table = &state->table[EFX_FILTER_TABLE_RX_IP];
+	unsigned mask = table->size - 1;
+	unsigned index;
+	unsigned stop;
+
+	spin_lock_bh(&state->lock);
+
+	/* Check filters in batches of 1024 */
+	index = state->rps_expire_index;
+	stop = (index + 1024) & mask;
+	
+	while (index != stop) {
+		if (test_bit(index, table->used_bitmap) &&
+		    table->spec[index].priority == EFX_FILTER_PRI_HINT &&
+		    rps_may_expire_flow(efx->net_dev,
+					table->spec[index].dmaq_id,
+					state->rps_flow_id[index], index))
+			efx_filter_table_clear_entry(efx, table, index);
+		index = (index + 1) & mask;
+	}
+
+	state->rps_expire_index = stop;
+	if (table->used == 0)
+		efx_filter_table_reset_search_depth(state,
+						    EFX_FILTER_TABLE_RX_IP);
+
+	spin_unlock_bh(&state->lock);
+}
+
+#endif /* CONFIG_RPS */
-- 
1.7.2.1



-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [RFC][PATCH 4/4] sfc/RFS/irq_group debug output
  2010-09-20 19:01 [RFC][PATCH 0/4] RFS hardware acceleration Ben Hutchings
                   ` (2 preceding siblings ...)
  2010-09-20 19:11 ` [RFC][PATCH 3/4] sfc: Implement RFS acceleration Ben Hutchings
@ 2010-09-20 19:12 ` Ben Hutchings
  2010-09-20 19:36 ` [RFC][PATCH 0/4] RFS hardware acceleration Tom Herbert
  4 siblings, 0 replies; 12+ messages in thread
From: Ben Hutchings @ 2010-09-20 19:12 UTC (permalink / raw)
  To: Tom Herbert; +Cc: netdev, linux-net-drivers

Just some logging I found useful.

Ben.
---
 drivers/net/sfc/filter.c |   11 ++++++++++-
 kernel/irq/manage.c      |   21 +++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletions(-)

diff --git a/drivers/net/sfc/filter.c b/drivers/net/sfc/filter.c
index 349b5d1..db7fa46 100644
--- a/drivers/net/sfc/filter.c
+++ b/drivers/net/sfc/filter.c
@@ -506,6 +506,11 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 	rc = efx_filter_insert_filter(efx, &spec, true);
 	if (rc >= 0)
 		state->rps_flow_id[rc] = flow_id;
+	netif_info(efx, rx_status, efx->net_dev,
+		   "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d]\n",
+		   (ip->protocol == IPPROTO_TCP) ? "TCP" : "UDP",
+		   &ip->saddr, ntohs(ports[0]), &ip->daddr, ntohs(ports[1]),
+		   rxq_index, flow_id, rc);
 
 	return rc;
 }
@@ -529,8 +534,12 @@ void efx_filter_rfs_expire(struct efx_nic *efx)
 		    table->spec[index].priority == EFX_FILTER_PRI_HINT &&
 		    rps_may_expire_flow(efx->net_dev,
 					table->spec[index].dmaq_id,
-					state->rps_flow_id[index], index))
+					state->rps_flow_id[index], index)) {
+			netif_info(efx, rx_status, efx->net_dev,
+				   "expiring filter %d [flow %u]\n",
+				   index, state->rps_flow_id[index]);
 			efx_filter_table_clear_entry(efx, table, index);
+		}
 		index = (index + 1) & mask;
 	}
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3f2b1a9..7199dde 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -129,6 +129,21 @@ static bool irq_group_copy_neigh(struct irq_group *group, int cpu,
 	return false;
 }
 
+static void print_irq_group(const struct irq_group *group, const char *prefix)
+{
+	unsigned index;
+	int cpu;
+
+	pr_info("irq_group %p, %s:\n", group, prefix);
+
+	for_each_possible_cpu(cpu) {
+		index = group->closest[cpu].index;
+		pr_info("cpu %d -> index %u (IRQ %u; distance %u)\n",
+			cpu, index, group->irq[index]->irq,
+			group->closest[cpu].dist);
+	}
+}
+
 /* Update the per-CPU closest IRQs following a change of affinity */
 static void
 irq_update_group(struct irq_desc *desc, const struct cpumask *affinity)
@@ -145,6 +160,8 @@ irq_update_group(struct irq_desc *desc, const struct cpumask *affinity)
 		if (group->closest[cpu].index == index)
 			group->closest[cpu].dist = IRQ_CPU_DIST_INF;
 
+	print_irq_group(group, "after invalidating old distances");
+
 	/*
 	 * Set this as the closest IRQ for all CPUs in the affinity mask,
 	 * plus the following CPUs if they don't have a closer IRQ:
@@ -163,6 +180,8 @@ irq_update_group(struct irq_desc *desc, const struct cpumask *affinity)
 				       index, 3);
 	}
 
+	print_irq_group(group, "after updating neighbours");
+
 	/* Find new closest IRQ for any CPUs left with invalid distances */
 	for_each_online_cpu(cpu) {
 		if (!(group->closest[cpu].index == index &&
@@ -180,6 +199,8 @@ irq_update_group(struct irq_desc *desc, const struct cpumask *affinity)
 		/* We could continue into NUMA node distances, but for now
 		 * we give up. */
 	}
+
+	print_irq_group(group, "after copying neighbours");
 }
 
 /**
-- 
1.7.2.1


-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 0/4] RFS hardware acceleration
  2010-09-20 19:01 [RFC][PATCH 0/4] RFS hardware acceleration Ben Hutchings
                   ` (3 preceding siblings ...)
  2010-09-20 19:12 ` [RFC][PATCH 4/4] sfc/RFS/irq_group debug output Ben Hutchings
@ 2010-09-20 19:36 ` Tom Herbert
  4 siblings, 0 replies; 12+ messages in thread
From: Tom Herbert @ 2010-09-20 19:36 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, linux-net-drivers, Bob Felderman

Thanks Ben, this does look interesting.  We'll try to take a look.


On Mon, Sep 20, 2010 at 12:01 PM, Ben Hutchings
<bhutchings@solarflare.com> wrote:
> Tom,
>
> This patch series extends RFS to use hardware RX filters where
> available.  Depending on the number of hardware RX queues and their
> IRQs' affinity, this should reduce the need for IPIs or at least get
> packets delivered to the right NUMA node.
>
> I've implemented the driver side of this for our hardware, though I
> don't know whether you have any of that to test on.  I would be very
> interested to know how much this can help in the sort of cases where you
> use RFS.
>
> Ben.
>
> Ben Hutchings (4):
>  IRQ: IRQ groups for multiqueue devices
>  net: RPS: Enable hardware acceleration
>  sfc: Implement RFS acceleration
>  sfc/RFS/irq_group debug output
>
>  drivers/net/sfc/efx.c     |   49 +++++++++++---
>  drivers/net/sfc/efx.h     |    9 +++
>  drivers/net/sfc/filter.c  |  109 +++++++++++++++++++++++++++++
>  include/linux/irq.h       |   52 ++++++++++++++
>  include/linux/netdevice.h |   29 +++++++-
>  kernel/irq/manage.c       |  170 +++++++++++++++++++++++++++++++++++++++++++++
>  net/core/dev.c            |   88 ++++++++++++++++++++++--
>  7 files changed, 488 insertions(+), 18 deletions(-)
>
> --
> 1.7.2.1
>
>
> --
> Ben Hutchings, Senior Software Engineer, Solarflare Communications
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices
  2010-09-20 19:08 ` [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices Ben Hutchings
@ 2010-09-20 21:27   ` Thomas Gleixner
  2010-09-21 12:25     ` Ben Hutchings
  0 siblings, 1 reply; 12+ messages in thread
From: Thomas Gleixner @ 2010-09-20 21:27 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Tom Herbert, netdev, linux-net-drivers, linux-kernel,
	Peter Zijlstra, Ingo Molnar

Ben,

On Mon, 20 Sep 2010, Ben Hutchings wrote:

it would have been nice if I'd been cc'ed on that, but of course it's
my fault that there is no entry in MAINTAINERS. No, it's not.

> When initiating I/O on multiqueue devices, we usually want to select a

"we usually" is pretty useless for people not familiar with the
problem at hand. A [PATCH 0/4] intro would have been helpful along
with the users (aka [PATCH 2-4/4]) of the new facility.

Don't take it personally and I'm sure that you're solving a real world
problem, but I'm starting to get grumpy that especially networking
folks (aside of various driver developers) believe that adding random
stuff to kernel/irq is their private pleasure. Is it that hard to talk
to me upfront ?

> queue for which the response will be handled on the same or a nearby
> CPU.  IRQ groups hold a mapping of CPU to IRQ which will be updated
> based on the inverse of IRQ CPU-affinities plus CPU topology
> information.

Can you please explain, why you need that reverse mapping including
the below code ? What problem does this solve which can not be deduced
by the existing information/infrastructure ? And why is that reverse
mapping tied to interrupts and not something which we want to see in
some generic available (library) code ?

More comments inline.

> ---
>  include/linux/irq.h |   52 ++++++++++++++++++
>  kernel/irq/manage.c |  149 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 201 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index c03243a..bbddd5f 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -196,6 +196,8 @@ struct irq_desc {
>  #ifdef CONFIG_SMP
>  	cpumask_var_t		affinity;
>  	const struct cpumask	*affinity_hint;

  How about updating the kernel doc above the struct ?

> +	struct irq_group	*group;

  Grr, how does this compile ? That needs at least a forward
  declaration of struct irq_group. RFC is _NOT_ an excuse

> +	u16			group_index;

  What's group_index doing and what's the point of an u16 here ?

>  	unsigned int		node;
>  #ifdef CONFIG_GENERIC_PENDING_IRQ
>  	cpumask_var_t		pending_mask;
> @@ -498,6 +500,33 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
>  #endif
>  }
>  
> +/**
> + * struct irq_group - IRQ group for multiqueue devices
> + * @closest: For each CPU, the index and distance to the closest IRQ,
> + *	based on affinity masks

  index of what ?

  Btw, please follow the style of the other kernel doc comments in this
  file where the explanation is aligned for readability sake

> + * @size: Size of the group
> + * @used: Number of IRQs currently included in the group
> + * @irq: Descriptors for IRQs in the group

  That's an array of pointers to irq descriptors, right ?

  Insert some sensible description of what irq groups are and for which
  problem space they are useful if at all.

> + */
> +struct irq_group {
> +	struct {
> +		u16	index;
> +		u16	dist;
> +	} closest[NR_CPUS];
> +	unsigned int	size, used;

  Separate lines please

> +	struct irq_desc *irq[0];
> +};

  Please separate this with a newline and add some useful comment
  about the meaning and purpose of this non-self-explanatory constant.

> +#define IRQ_CPU_DIST_INF 0xffff

> +static inline u16 irq_group_get_index(struct irq_group *group, int cpu)
> +{
> +	return group->closest[cpu].index;
> +}
> +

  So you have an accessor function for closest[cpu].index. Are the
  other members meant to be accessible directly by random driver ?

>  #else /* !CONFIG_SMP */
>  
>  static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
> @@ -519,6 +548,29 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
>  				   struct irq_desc *new_desc)
>  {
>  }
> +
> +struct irq_group {
> +};
> +
> +static inline struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags)
> +{
> +	static struct irq_group dummy;

  That will create one static instance per callsite. Is that on
  purpose? If yes, it needs a damned good comment.

> +	return &dummy;

>  #endif	/* CONFIG_SMP */
>  
>  #endif /* _LINUX_IRQ_H */
> diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
> index c3003e9..3f2b1a9 100644
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -100,6 +100,154 @@ void irq_set_thread_affinity(struct irq_desc *desc)
>  	}
>  }
>  
> +static void irq_group_update_neigh(struct irq_group *group,
> +				   const struct cpumask *mask,
> +				   u16 index, u16 dist)
> +{
> +	int cpu;
> +
> +	for_each_cpu(cpu, mask) {
> +		if (dist < group->closest[cpu].dist) {
> +			group->closest[cpu].index = index;
> +			group->closest[cpu].dist = dist;
> +		}
> +	}
> +}

  I have not the faintest idea how group, index and dist are related
  to each other. And I have no intention to decode the information
  about that piecewise by reverse engineering that obfuscated code.

> +static bool irq_group_copy_neigh(struct irq_group *group, int cpu,
> +				 const struct cpumask *mask, u16 dist)
> +{
> +	int neigh;
> +
> +	for_each_cpu(neigh, mask) {
> +		if (group->closest[neigh].dist <= dist) {
> +			group->closest[cpu].index = group->closest[neigh].index;
> +			group->closest[cpu].dist = dist;
> +			return true;
> +		}
> +	}
> +	return false;

  What's the reason for copying or not ?

> +}
> +
> +/* Update the per-CPU closest IRQs following a change of affinity */
> +static void
> +irq_update_group(struct irq_desc *desc, const struct cpumask *affinity)
> +{
> +	struct irq_group *group = desc->group;
> +	unsigned index = desc->group_index;
> +	int cpu;
> +
> +	if (!group)
> +		return;
> +
> +	/* Invalidate old distances to this IRQ */
> +	for_each_online_cpu(cpu)
> +		if (group->closest[cpu].index == index)
> +			group->closest[cpu].dist = IRQ_CPU_DIST_INF;
> +
> +	/*
> +	 * Set this as the closest IRQ for all CPUs in the affinity mask,
> +	 * plus the following CPUs if they don't have a closer IRQ:
> +	 * - all other threads in the same core (distance 1);
> +	 * - all other cores in the same package (distance 2);
> +	 * - all other packages in the same NUMA node (distance 3).
> +	 */
> +	for_each_cpu(cpu, affinity) {
> +		group->closest[cpu].index = index;
> +		group->closest[cpu].dist = 0;
> +		irq_group_update_neigh(group, topology_thread_cpumask(cpu),
> +				       index, 1);
> +		irq_group_update_neigh(group, topology_core_cpumask(cpu),
> +				       index, 2);
> +		irq_group_update_neigh(group, cpumask_of_node(cpu_to_node(cpu)),
> +				       index, 3);
> +	}
> +
> +	/* Find new closest IRQ for any CPUs left with invalid distances */
> +	for_each_online_cpu(cpu) {
> +		if (!(group->closest[cpu].index == index &&
> +		      group->closest[cpu].dist == IRQ_CPU_DIST_INF))
> +			continue;
> +		if (irq_group_copy_neigh(group, cpu,
> +					 topology_thread_cpumask(cpu), 1))
> +			continue;
> +		if (irq_group_copy_neigh(group, cpu,
> +					 topology_core_cpumask(cpu), 2))
> +			continue;
> +		if (irq_group_copy_neigh(group, cpu,
> +					 cpumask_of_node(cpu_to_node(cpu)), 3))
> +			continue;
> +		/* We could continue into NUMA node distances, but for now
> +		 * we give up. */

  What are the consequences of giving up ? Does not happen ? Should
  not happen ? Will break ? Don't care ? ....

> +	}

  This is called from irq_set_affinity() with desc->lock held and
  interrupts disabled. You're not serious about that, are you ?

  Damned, there are two iterations over each online cpu and another
  one over the affinity mask. Did you ever extrapolate how long that
  runs on a really large machine ?

  If CPU affinity of an IRQ changes, then it does not matter if one or
  two interrupts end up in the wrong space. Those changes are not
  happening every other interrupt.

> +}
> +
> +/**
> + *	alloc_irq_group - allocate IRQ group
> + *	@size:		Size of the group

  size of what ? I guess number of interrupts, right ?

> + *	@flags:		Allocation flags e.g. %GFP_KERNEL
> + */
> +struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags)
> +{
> +	struct irq_group *group =
> +		kzalloc(sizeof(*group) + size * sizeof(group->irq[0]), flags);
> +	int cpu;
> +
> +	if (!group)
> +		return NULL;
> +
> +	/* Initially assign CPUs to IRQs on a rota */
> +	for (cpu = 0; cpu < NR_CPUS; cpu++) {
> +		group->closest[cpu].index = cpu % size;

  So here we randomly assign index with the lower cpu numbers no
  matter whether they are online or possible ?

> +		group->closest[cpu].dist = IRQ_CPU_DIST_INF;
> +	}
> +
> +	group->size = size;
> +	return group;
> +}
> +EXPORT_SYMBOL(alloc_irq_group);

  EXPORT_SYMBOL_GPL if at all. Same for the other exports

> +
> +/**
> + *	free_irq_group - free IRQ group
> + *	@group:		IRQ group allocated with alloc_irq_group(), or %NULL

  How is this serialized or sanity checked against free_irq() ?

> + */
> +void free_irq_group(struct irq_group *group)
> +{
> +	struct irq_desc *desc;
> +	unsigned int i;
> +
> +	if (!group)
> +		return;
> +
> +	/* Remove all descriptors from the group */
> +	for (i = 0; i < group->used; i++) {
> +		desc = group->irq[i];
> +		BUG_ON(desc->group != group || desc->group_index != i);
> +		desc->group = NULL;
> +	}
> +
> +	kfree(group);
> +}
> +EXPORT_SYMBOL(free_irq_group);
> +
> +/**
> + *	irq_group_add - add IRQ to a group
> + *	@group:		IRQ group allocated with alloc_irq_group()
> + *	@irq:		Interrupt to add to group
> + */
> +void irq_group_add(struct irq_group *group, unsigned int irq)
> +{
> +	struct irq_desc *desc = irq_to_desc(irq);

  Again, how is this serialized against anything else fiddling with
  irq_desc[irq]?

> +	BUG_ON(desc->group);
> +	BUG_ON(group->used >= group->size);
> +
> +	desc->group = group;
> +	desc->group_index = group->used;
> +	group->irq[group->used++] = desc;
> +}
> +EXPORT_SYMBOL(irq_group_add);

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices
  2010-09-20 21:27   ` Thomas Gleixner
@ 2010-09-21 12:25     ` Ben Hutchings
  2010-09-21 15:34       ` Thomas Gleixner
  0 siblings, 1 reply; 12+ messages in thread
From: Ben Hutchings @ 2010-09-21 12:25 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Tom Herbert, netdev, linux-net-drivers, linux-kernel,
	Peter Zijlstra, Ingo Molnar

On Mon, 2010-09-20 at 23:27 +0200, Thomas Gleixner wrote:
> Ben,
> 
> On Mon, 20 Sep 2010, Ben Hutchings wrote:
> 
> it would have been nice if I'd been cc'ed on that, but of course it's
> my fault that there is no entry in MAINTAINERS. No, it's not.
> 
> > When initiating I/O on multiqueue devices, we usually want to select a
> 
> "we usually" is pretty useless for people not familiar with the
> problem at hand. A [PATCH 0/4] intro would have been helpful along
> with the users (aka [PATCH 2-4/4]) of the new facility.

There was an intro on the netdev list
<http://article.gmane.org/gmane.linux.network/172427> but it probably
doesn't tell you what you want to know anyway.

> Don't take it personally and I'm sure that you're solving a real world
> problem, but I'm starting to get grumpy that especially networking
> folks (aside of various driver developers) believe that adding random
> stuff to kernel/irq is their private pleasure. Is it that hard to talk
> to me upfront ?
> 
> > queue for which the response will be handled on the same or a nearby
> > CPU.  IRQ groups hold a mapping of CPU to IRQ which will be updated
> > based on the inverse of IRQ CPU-affinities plus CPU topology
> > information.
> 
> Can you please explain, why you need that reverse mapping including
> the below code ? What problem does this solve which can not be deduced
> by the existing information/infrastructure ? And why is that reverse
> mapping tied to interrupts and not something which we want to see in
> some generic available (library) code ?

Are you thinking of a per-CPU distance map?  That seems like it would be
useful to have.  I wonder about the storage requirements on larger
systems.
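
(Rough arithmetic: a full table of u16 distances between all pairs of CPUs
would be 2 * NR_CPUS^2 bytes, i.e. about 128 KiB with NR_CPUS=256 but 32 MiB
with NR_CPUS=4096.)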

> More comments inline.
> 
> > ---
> >  include/linux/irq.h |   52 ++++++++++++++++++
> >  kernel/irq/manage.c |  149 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 201 insertions(+), 0 deletions(-)
> > 
> > diff --git a/include/linux/irq.h b/include/linux/irq.h
> > index c03243a..bbddd5f 100644
> > --- a/include/linux/irq.h
> > +++ b/include/linux/irq.h
> > @@ -196,6 +196,8 @@ struct irq_desc {
> >  #ifdef CONFIG_SMP
> >  	cpumask_var_t		affinity;
> >  	const struct cpumask	*affinity_hint;
> 
>   How about updating the kernel doc above the struct ?
> 
> > +	struct irq_group	*group;
> 
>   Grr, how does this compile ? That needs at least a forward
>   declaration of struct irq_group. RFC is _NOT_ an excuse

Naming a struct in a function parameter declarator is not a valid
forward declaration, but this is.

> > +	u16			group_index;
> 
>   What's group_index doing and what's the point of an u16 here ?

Index of this IRQ within the group.

> >  	unsigned int		node;
> >  #ifdef CONFIG_GENERIC_PENDING_IRQ
> >  	cpumask_var_t		pending_mask;
> > @@ -498,6 +500,33 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
> >  #endif
> >  }
> >  
> > +/**
> > + * struct irq_group - IRQ group for multiqueue devices
> > + * @closest: For each CPU, the index and distance to the closest IRQ,
> > + *	based on affinity masks
> 
>   index of what ?

Index within the group of the closest IRQ for this CPU; also the queue
index.

[...]
> > +static inline u16 irq_group_get_index(struct irq_group *group, int cpu)
> > +{
> > +	return group->closest[cpu].index;
> > +}
> > +
> 
>   So you have an accessor function for closest[cpu].index. Are the
>   other members meant to be accessible directly by random driver ?

No, they should not be touched at all.

> >  #else /* !CONFIG_SMP */
> >  
> >  static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
> > @@ -519,6 +548,29 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
> >  				   struct irq_desc *new_desc)
> >  {
> >  }
> > +
> > +struct irq_group {
> > +};
> > +
> > +static inline struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags)
> > +{
> > +	static struct irq_group dummy;
> 
>   That will create one static instance per callsite. Is that on
>   purpose? If yes, it needs a damned good comment.

Which uses no storage, or maybe 1 byte.  I'm open to suggestions for a
better dummy implementation.

> > +	return &dummy;
> 
> >  #endif	/* CONFIG_SMP */
> >  
> >  #endif /* _LINUX_IRQ_H */
> > diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
> > index c3003e9..3f2b1a9 100644
> > --- a/kernel/irq/manage.c
> > +++ b/kernel/irq/manage.c
> > @@ -100,6 +100,154 @@ void irq_set_thread_affinity(struct irq_desc *desc)
> >  	}
> >  }
> >  
> > +static void irq_group_update_neigh(struct irq_group *group,
> > +				   const struct cpumask *mask,
> > +				   u16 index, u16 dist)
> > +{
> > +	int cpu;
> > +
> > +	for_each_cpu(cpu, mask) {
> > +		if (dist < group->closest[cpu].dist) {
> > +			group->closest[cpu].index = index;
> > +			group->closest[cpu].dist = dist;
> > +		}
> > +	}
> > +}
> 
>   I have not the faintest idea how group, index and dist are related
>   to each other. And I have no intention to decode the information
>   about that piecewise by reverse engineering that obfuscated code.

I'm happy to write more comments.

index is the index of the IRQ being updated within the group.  dist is
the maximum distance of that IRQ from the CPUs in the mask.
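
As a worked example (made-up numbers): if IRQ index 2 in the group is given
affinity {cpu0} and cpu1 is cpu0's SMT sibling, irq_update_group() sets
closest[0] = {index 2, dist 0}, and the thread-sibling pass calls
irq_group_update_neigh(group, topology_thread_cpumask(0), 2, 1), which
records closest[1] = {index 2, dist 1} unless cpu1 already has some IRQ
recorded at distance 1 or less.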

> > +static bool irq_group_copy_neigh(struct irq_group *group, int cpu,
> > +				 const struct cpumask *mask, u16 dist)
> > +{
> > +	int neigh;
> > +
> > +	for_each_cpu(neigh, mask) {
> > +		if (group->closest[neigh].dist <= dist) {
> > +			group->closest[cpu].index = group->closest[neigh].index;
> > +			group->closest[cpu].dist = dist;
> > +			return true;
> > +		}
> > +	}
> > +	return false;
> 
>   What's the reason for copying or not ?

We're trying to find closer IRQs for the CPUs which were removed from
the affinity of the IRQ currently being moved.

[...]
> > +		/* We could continue into NUMA node distances, but for now
> > +		 * we give up. */
> 
>   What are the consequences of giving up ? Does not happen ? Should
>   not happen ? Will break ? Don't care ? ....

Poor choice of closest IRQ.  Nothing disastrous.

> > +	}
> 
>   This is called from irq_set_affinity() with desc->lock held and
>   interrupts disabled. You're not serious about that, are you ?
> 
>   Damned, there are two iterations over each online cpu and another
>   one over the affinity mask. Did you ever extrapolate how long that
>   runs on a really large machine ?

No, and I know this sucks.  So, do you want to suggest anything better?
Should I look at building those CPU distance maps (trading space for
time)?

>   If CPU affinity of an IRQ changes, then it does not matter if one or
>   two interrupts end up in the wrong space. Those changes are not
>   happening every other interrupt.
> 
> > +}
> > +
> > +/**
> > + *	alloc_irq_group - allocate IRQ group
> > + *	@size:		Size of the group
> 
>   size of what ? I guess number of interrupts, right ?

Right.

> > + *	@flags:		Allocation flags e.g. %GFP_KERNEL
> > + */
> > +struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags)
> > +{
> > +	struct irq_group *group =
> > +		kzalloc(sizeof(*group) + size * sizeof(group->irq[0]), flags);
> > +	int cpu;
> > +
> > +	if (!group)
> > +		return NULL;
> > +
> > +	/* Initially assign CPUs to IRQs on a rota */
> > +	for (cpu = 0; cpu < NR_CPUS; cpu++) {
> > +		group->closest[cpu].index = cpu % size;
> 
>   So here we randomly assign index with the lower cpu numbers no
>   matter whether they are online or possible ?

Doesn't matter, because we're mapping CPUs to IRQs not the other way
round.

> > +		group->closest[cpu].dist = IRQ_CPU_DIST_INF;
> > +	}
> > +
> > +	group->size = size;
> > +	return group;
> > +}
> > +EXPORT_SYMBOL(alloc_irq_group);
> 
>   EXPORT_SYMBOL_GPL if at all. Same for the other exports
> 
> > +
> > +/**
> > + *	free_irq_group - free IRQ group
> > + *	@group:		IRQ group allocated with alloc_irq_group(), or %NULL
> 
>   How is this serialized or sanity checked against free_irq() ?

free_irq() should be called first, for all IRQs in the group.  The IRQs
should then be really freed with pci_disable_msix() or similar.
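
So teardown would be roughly (illustrative names, following the ordering in
patch 3/4):

	for (i = 0; i < n_irqs; i++)
		free_irq(msix[i].vector, priv);	/* release handlers first */
	free_irq_group(dev->rx_irq_group);	/* drop the CPU -> IRQ map */
	dev->rx_irq_group = NULL;
	pci_disable_msix(pdev);			/* finally free the vectors */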

> > + */
> > +void free_irq_group(struct irq_group *group)
> > +{
> > +	struct irq_desc *desc;
> > +	unsigned int i;
> > +
> > +	if (!group)
> > +		return;
> > +
> > +	/* Remove all descriptors from the group */
> > +	for (i = 0; i < group->used; i++) {
> > +		desc = group->irq[i];
> > +		BUG_ON(desc->group != group || desc->group_index != i);
> > +		desc->group = NULL;
> > +	}
> > +
> > +	kfree(group);
> > +}
> > +EXPORT_SYMBOL(free_irq_group);
> > +
> > +/**
> > + *	irq_group_add - add IRQ to a group
> > + *	@group:		IRQ group allocated with alloc_irq_group()
> > + *	@irq:		Interrupt to add to group
> > + */
> > +void irq_group_add(struct irq_group *group, unsigned int irq)
> > +{
> > +	struct irq_desc *desc = irq_to_desc(irq);
> 
>   Again, how is this serialized against anything else fiddling with
>   irq_desc[irq]?

The driver should call this after allocating IRQs with pci_enable_msix()
or similar and before setting the handlers with request_irq().  Is that
sufficient?

Ben.

> > +	BUG_ON(desc->group);
> > +	BUG_ON(group->used >= group->size);
> > +
> > +	desc->group = group;
> > +	desc->group_index = group->used;
> > +	group->irq[group->used++] = desc;
> > +}
> > +EXPORT_SYMBOL(irq_group_add);
> 
> Thanks,
> 
> 	tglx

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices
  2010-09-21 12:25     ` Ben Hutchings
@ 2010-09-21 15:34       ` Thomas Gleixner
  2010-09-21 19:04         ` Thomas Gleixner
  0 siblings, 1 reply; 12+ messages in thread
From: Thomas Gleixner @ 2010-09-21 15:34 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Tom Herbert, netdev, linux-net-drivers, linux-kernel,
	Peter Zijlstra, Ingo Molnar

On Tue, 21 Sep 2010, Ben Hutchings wrote:
> On Mon, 2010-09-20 at 23:27 +0200, Thomas Gleixner wrote:
> > > queue for which the response will be handled on the same or a nearby
> > > CPU.  IRQ groups hold a mapping of CPU to IRQ which will be updated
> > > based on the inverse of IRQ CPU-affinities plus CPU topology
> > > information.
> > 
> > Can you please explain, why you need that reverse mapping including
> > the below code ? What problem does this solve which can not be deduced
> > by the existing information/infrastructure ? And why is that reverse
> > mapping tied to interrupts and not something which we want to see in
> > some generic available (library) code ?
> 
> Are you thinking of a per-CPU distance map?  That seems like it would be
> useful to have.  I wonder about the storage requirements on larger
> systems.

  That might be a problem, but we should at least look into this.

  But even if a prebuilt map is too expensive storage-wise, the
  functions which build the lookup map might be useful for similar
  distance lookup scenarios as well.

> > > +static inline struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags)
> > > +{
> > > +	static struct irq_group dummy;
> > 
> >   That will create one static instance per callsite. Is that on
> >   purpose? If yes, it needs a damned good comment.
> 
> Which uses no storage, or maybe 1 byte.  I'm open to suggestions for a
> better dummy implementation.
> 
> > > +	return &dummy;

  Why does it need to return a real struct instead of NULL ?
 
> > > +	}
> > 
> >   This is called from irq_set_affinity() with desc->lock held and
> >   interrupts disabled. You're not serious about that, are you ?
> > 
> >   Damned, there are two iterations over each online cpu and another
> >   one over the affinity mask. Did you ever extrapolate how long that
> >   runs on a really large machine ?
> 
> No, and I know this sucks.  So, do you want to suggest anything better?
> Should I look at building those CPU distance maps (trading space for
> time)?

  The point is that there is no reason why this code needs to run in a
  spinlocked, irq-disabled region. I know why your implementation needs
  it, because it protects you against module unload etc. Come on, we
  have enough infrastructure to control the lifetime of objects in
  sane ways and not by hijacking a heavy lock and disabling
  interrupts.

  Btw, how does this code deal with concurrent affinity setting of
  multiple irqs in the group ? Random number generator ?

> > > + *	@flags:		Allocation flags e.g. %GFP_KERNEL

  Why do we need flags here? That code gets called in a device setup
  routine and not from random context.

> > > + */
> > > +struct irq_group *alloc_irq_group(unsigned int size, gfp_t flags)
> > > +{

> > > +
> > > +/**
> > > + *	free_irq_group - free IRQ group
> > > + *	@group:		IRQ group allocated with alloc_irq_group(), or %NULL
> > 
> >   How is this serialized or sanity checked against free_irq() ?
> 
> free_irq() should be called first, for all IRQs in the group.  The IRQs
> should then be really freed with pci_disable_msix() or similar.

  I have explained several times that I do not care about what should
  be called in which order and what not. Driver coders get it wrong
  all the time, then it explodes in the genirq code and I get the crap
  to debug. No, thanks.

  Also if you call free_irq() first, then you still have a reference
  to that very irq descriptor in your group and a reference to the
  group in the irq descriptor. Brilliant, as the irq descriptor can be
  reused immediately after free_irq().

  Also it's not only problematic against free_irq, it's damned racy
  against a concurrent affinity setting as well. And that's not
  controlled by "should be called" at all.

> > > +/**
> > > + *	irq_group_add - add IRQ to a group
> > > + *	@group:		IRQ group allocated with alloc_irq_group()
> > > + *	@irq:		Interrupt to add to group
> > > + */
> > > +void irq_group_add(struct irq_group *group, unsigned int irq)
> > > +{
> > > +	struct irq_desc *desc = irq_to_desc(irq);
> > 
> >   Again, how is this serialized against anything else fiddling with
> >   irq_desc[irq]?
> 
> The driver should call this after allocating IRQs with pci_enable_msix()
> or similar and before setting the handlers with request_irq().  Is that
> sufficient?

  "should call" is _NEVER_ sufficient. fiddling with irq_desc w/o
  holding the lock is a nono, period.

> > > +	BUG_ON(desc->group);
> > > +	BUG_ON(group->used >= group->size);
> > > +
> > > +	desc->group = group;
> > > +	desc->group_index = group->used;
> > > +	group->irq[group->used++] = desc;

  Forgot to ask yesterday. Why do we need to store irq desc in the
  group ? For free_irq_group() I guess, but I explained already why
  this code is broken. And it's even more broken when an irq is moved
  to a different node and CONFIG_NUMA_IRQ_DESC=y.

Lemme summarize what I understand so far. For multiqueue devices with
several queues and interrupts you want to look up the closest queue/irq
to the cpu on which you are currently running, to avoid expensive
cross-node memory access. Now you need to follow affinity changes for this
lookup table.

The basic idea of your code is ok. The lookup table itself is fine,
but the integration into the genirq code sucks. What needs to be done:

1) Remove the references to irq_desc in the group. They are not needed
   and will never work. The group does not need any information about
   the irq number and the irq descriptor.

2) Add proper refcounting to struct irq_group so it can be accessed
   outside of irq_desc->lock.

3) Move the update of the map outside of irq_desc->lock and the irq
   disabled region. Update the map in setup_affinity() as well. That
   code can be called from various atomic contexts, so it's probably
   the best thing to delegate the update to a workqueue or such unless
   you come up with a nice prebuilt lookup table.

4) Add comments so the code is understandable for mere mortals

5) Add proper install/free handling. Hint: affinity_hint

6) All modifications of irq_desc need a check whether irq_to_desc
   returned a valid pointer and must hold desc->lock

7) Deal with concurrent updates of multiple irqs in a group (at least
   a comment why the code does not need serialization)

8) Move the code to kernel/irq/irqgroup.c or some other sensible name
   and make it configurable so embedded folks don't have to carry it
   around as useless binary bloat.
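
To make the queue/index map itself a bit more concrete, a minimal sketch
of the kind of per-cpu lookup under discussion could look like the
following (illustrative only -- simplified from the posted patch, all
names made up):

struct irq_cpu_map {
	u16 closest[NR_CPUS];	/* for each cpu: index of the nearest queue/irq */
};

/*
 * Naive rebuild: prefer a queue whose irq affinity already contains
 * the cpu, otherwise remember one whose affinity sits on the same
 * NUMA node, else fall back to queue 0.
 */
static void irq_cpu_map_rebuild(struct irq_cpu_map *map,
				const struct cpumask *affinity[],
				unsigned int nr_queues)
{
	unsigned int cpu, q;

	for_each_online_cpu(cpu) {
		map->closest[cpu] = 0;
		for (q = 0; q < nr_queues; q++) {
			if (cpumask_test_cpu(cpu, affinity[q])) {
				map->closest[cpu] = q;
				break;
			}
			if (cpumask_intersects(affinity[q],
					       cpumask_of_node(cpu_to_node(cpu))))
				map->closest[cpu] = q;
		}
	}
}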

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices
  2010-09-21 15:34       ` Thomas Gleixner
@ 2010-09-21 19:04         ` Thomas Gleixner
  2010-09-22 16:00           ` Ben Hutchings
  0 siblings, 1 reply; 12+ messages in thread
From: Thomas Gleixner @ 2010-09-21 19:04 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Tom Herbert, netdev, linux-net-drivers, linux-kernel,
	Peter Zijlstra, Ingo Molnar

On Tue, 21 Sep 2010, Thomas Gleixner wrote:
> On Tue, 21 Sep 2010, Ben Hutchings wrote:
> > On Mon, 2010-09-20 at 23:27 +0200, Thomas Gleixner wrote:
> Lemme summarize what I understand so far. For multiqueue devices with
> several queues and interrupts you want to look up the closest queue/irq
> to the cpu on which you are currently running, to avoid expensive
> cross-node memory access. Now you need to follow affinity changes for this
> lookup table.
> 
> The basic idea of your code is ok. The lookup table itself is fine,
> but the integration into the genirq code sucks. What needs to be done:
> 
> 1) Remove the references to irq_desc in the group. They are not needed
>    and will never work. The group does not need any information about
>    the irq number and the irq descriptor.
> 
> 2) Add proper refcounting to struct irq_group so it can be accessed
>    outside of irq_desc->lock.
> 
> 3) Move the update of the map outside of irq_desc->lock and the irq
>    disabled region. Update the map in setup_affinity() as well. That
>    code can be called from various atomic contexts, so it's probably
>    the best thing to delegate the update to a workqueue or such unless
>    you come up with a nice prebuilt lookup table.
> 
> 4) Add comments so the code is understandable for mere mortals
> 
> 5) Add proper install/free handling. Hint: affinity_hint
> 
> 6) All modifications of irq_desc need a check whether irq_to_desc
>    returned a valid pointer and must hold desc->lock
> 
> 7) Deal with concurrent updates of multiple irqs in a group (at least
>    a comment why the code does not need serialization)
> 
> 8) Move the code to kernel/irq/irqgroup.c or some other sensible name
>    and make it configurable so embedded folks don't have to carry it
>    around as useless binary bloat.

Talked to Peter about it and we came to the conclusion, that we should
just provide a callback infrastructure in the irq code which does not
care about the action behind it. That's going to solve #1,#2,#3,#5,#6
and parts of #8

That queue/index map code should move to lib/ or some other
appropriate place so it can be shared with storage or whatever is
going to grow multiqueue. comments #4, #7, #8 (s@kernel/irq@lib/@)
above still apply :)

The modification to the genirq code would be based on registering

struct irq_affinity_callback {
	unsigned int irq;
	struct kref kref;
	struct work_struct work;
	void (*callback)(struct irq_affinity_callback *, const cpumask_t *mask);
	void (*release)(struct kref *ref);
};

for an interrupt via 

int irq_set_affinity_callback(unsigned int irq,
    			      struct irq_affinity_callback *cb);

That function can be called with cb=NULL to remove the callback. if
cb!=NULL, irq, kref and work are initialized.

The genirq code schedules work when affinity changes and the callback
struct pointer is non-NULL. The callback is then called from the
worker thread with a copy of the irq affinity mask.

So on any affinity change, we do with desc->lock held:

   if (desc->affinity_callback) {
      	kref_get(&desc->affinity_callback->kref);
	schedule_work(&desc->affinity_callback->work);
   }

The worker function in the genirq code does:
{
    struct irq_affinity_callback *cb;
    struct irq_desc *desc;
    cpumask_var_t cpumask;
    unsigned long flags;

    cb = container_of(work, struct irq_affinity_callback, work);

    desc = irq_to_desc(cb->irq);
    if (!desc)
       goto out;

    if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
       goto out;

    raw_spin_lock_irqsave(&desc->lock, flags);
    cpumask_copy(cpumask, desc->affinity);
    raw_spin_unlock_irqrestore(&desc->lock, flags);

    cb->callback(cb, cpumask);

    free_cpumask_var(cpumask);
out:
    kref_put(&cb->kref, cb->release);
}

That allows you to do all kind of magic in thread context, updating
the queue map, reallocating queue memory when the node affinity
changes (I know that you want to), go wild.
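
On the driver side that could end up looking roughly like the sketch
below.  This is only an illustration of the interface proposed above
(which does not exist yet); the efx_* names are just stand-ins for the
driver side, efx_update_cpu_map() is a made-up placeholder for whatever
rebuilds the cpu-to-queue map, and error handling is omitted:

struct efx_irq_cb {
	struct irq_affinity_callback cb;
	struct efx_channel *channel;
};

static void efx_irq_affinity_changed(struct irq_affinity_callback *cb,
				     const cpumask_t *mask)
{
	struct efx_irq_cb *ecb = container_of(cb, struct efx_irq_cb, cb);

	/* runs in process context without desc->lock held */
	efx_update_cpu_map(ecb->channel, mask);
}

static void efx_irq_affinity_release(struct kref *ref)
{
	kfree(container_of(ref, struct efx_irq_cb, cb.kref));
}

	/* during channel setup, before request_irq(): */
	ecb = kzalloc(sizeof(*ecb), GFP_KERNEL);
	ecb->channel = channel;
	ecb->cb.callback = efx_irq_affinity_changed;
	ecb->cb.release = efx_irq_affinity_release;
	irq_set_affinity_callback(channel->irq, &ecb->cb);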

Thoughts ?

	 tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices
  2010-09-21 19:04         ` Thomas Gleixner
@ 2010-09-22 16:00           ` Ben Hutchings
  2010-09-22 16:06             ` Thomas Gleixner
  0 siblings, 1 reply; 12+ messages in thread
From: Ben Hutchings @ 2010-09-22 16:00 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Tom Herbert, netdev, linux-net-drivers, linux-kernel,
	Peter Zijlstra, Ingo Molnar

On Tue, 2010-09-21 at 21:04 +0200, Thomas Gleixner wrote:
[...]
> Talked to Peter about it and we came to the conclusion, that we should
> just provide a callback infrastructure in the irq code which does not
> care about the action behind it. That's going to solve #1,#2,#3,#5,#6
> and parts of #8
> 
> That queue/index map code should move to lib/ or some other
> appropriate place so it can be shared with storage or whatever is
> going to grow multiqueue. comments #4, #7, #8 (s@kernel/irq@lib/@)
> above still apply :)

OK.

> The modification to the genirq code would be based on registering
> 
> struct irq_affinity_callback {
> 	unsigned int irq;
> 	struct kref kref;
> 	struct work_struct work;
> 	void (*callback)(struct irq_affinity_callback *, const cpumask_t *mask);
> 	void (*release)(struct kref *ref);
> };
> 
> for an interrupt via 
> 
> int irq_set_affinity_callback(unsigned int irq,
>     			      struct irq_affinity_callback *cb);
> 
> That function can be called with cb=NULL to remove the callback. if
> cb!=NULL, irq, kref and work are initialized.

When should it be called, relative to {request,free}_irq() and
pci_{disable,enable}_msix()?

[...]
> That allows you to do all kind of magic in thread context, updating
> the queue map, reallocating queue memory when the node affinity
> changes (I know that you want to), go wild.

I definitely don't want to reallocate queues if node affinity of the IRQ
is changed by irqbalance, because this will disrupt traffic.  So
changing the node affinity of queues has to be a separate operation.

> Thoughts ?

This does look like something I can use, thanks.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices
  2010-09-22 16:00           ` Ben Hutchings
@ 2010-09-22 16:06             ` Thomas Gleixner
  0 siblings, 0 replies; 12+ messages in thread
From: Thomas Gleixner @ 2010-09-22 16:06 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Tom Herbert, netdev, linux-net-drivers, linux-kernel,
	Peter Zijlstra, Ingo Molnar

On Wed, 22 Sep 2010, Ben Hutchings wrote:

> On Tue, 2010-09-21 at 21:04 +0200, Thomas Gleixner wrote:
> [...]
> > Talked to Peter about it and we came to the conclusion, that we should
> > just provide a callback infrastructure in the irq code which does not
> > care about the action behind it. That's going to solve #1,#2,#3,#5,#6
> > and parts of #8
> > 
> > That queue/index map code should move to lib/ or some other
> > appropriate place so it can be shared with storage or whatever is
> > going to grow multiqueue. comments #4, #7, #8 (s@kernel/irq@lib/@)
> > above still apply :)
> 
> OK.
> 
> > The modification to the genirq code would be based on registering
> > 
> > struct irq_affinity_callback {
> > 	unsigned int irq;
> > 	struct kref kref;
> > 	struct work_struct work;
> > 	void (*callback)(struct irq_affinity_callback *, const cpumask_t *mask);
> > 	void (*release)(struct kref *ref);
> > };
> > 
> > for an interrupt via 
> > 
> > int irq_set_affinity_callback(unsigned int irq,
> >     			      struct irq_affinity_callback *cb);
> > 
> > That function can be called with cb=NULL to remove the callback. if
> > cb!=NULL, irq, kref and work are initialized.
> 
> When should it be called, relative to {request,free}_irq() and
> pci_{disable,enable}_msix()?

It should be called before request_irq and before free_irq. free_irq
will warn when the pointer is !NULL.
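
I.e. the ordering in the driver would roughly be (sketch, using the
proposed interface; handler/name/ctx stand for the usual request_irq()
arguments):

	/* setup */
	irq_set_affinity_callback(irq, &ecb->cb);
	request_irq(irq, handler, 0, name, ctx);

	/* teardown: drop the callback before the irq is freed */
	irq_set_affinity_callback(irq, NULL);
	free_irq(irq, ctx);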
 
> [...]
> > That allows you to do all kind of magic in thread context, updating
> > the queue map, reallocating queue memory when the node affinity
> > changes (I know that you want to), go wild.
> 
> I definitely don't want to reallocate queues if node affinity of the IRQ
> is changed by irqbalance, because this will disrupt traffic.  So
> changing the node affinity of queues has to be a separate operation.

Fair enough.
 
> > Thoughts ?
> 
> This does look like something I can use, thanks.

Will look into it in the next days.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2010-09-22 16:06 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-20 19:01 [RFC][PATCH 0/4] RFS hardware acceleration Ben Hutchings
2010-09-20 19:08 ` [RFC][PATCH 1/4] IRQ: IRQ groups for multiqueue devices Ben Hutchings
2010-09-20 21:27   ` Thomas Gleixner
2010-09-21 12:25     ` Ben Hutchings
2010-09-21 15:34       ` Thomas Gleixner
2010-09-21 19:04         ` Thomas Gleixner
2010-09-22 16:00           ` Ben Hutchings
2010-09-22 16:06             ` Thomas Gleixner
2010-09-20 19:10 ` [RFC][PATCH 2/4] net: RPS: Enable hardware acceleration Ben Hutchings
2010-09-20 19:11 ` [RFC][PATCH 3/4] sfc: Implement RFS acceleration Ben Hutchings
2010-09-20 19:12 ` [RFC][PATCH 4/4] sfc/RFS/irq_group debug output Ben Hutchings
2010-09-20 19:36 ` [RFC][PATCH 0/4] RFS hardware acceleration Tom Herbert
