* [PATCH -mm 0/2] Use llist in irq_work and xlist
@ 2011-08-30  5:16 Huang Ying
  2011-08-30  5:16 ` [PATCH -mm 1/2] irq_work, Use llist in irq_work Huang Ying
  2011-08-30  5:16 ` [PATCH -mm 2/2] net, rds, Replace xlist in net/rds/xlist.h with llist Huang Ying
  0 siblings, 2 replies; 20+ messages in thread
From: Huang Ying @ 2011-08-30  5:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ying.huang

To remove the code duplication among the lock-less list implementations
in irq_work and net/rds.

[PATCH -mm 1/2] irq_work, Use llist in irq_work
[PATCH -mm 2/2] net, rds, Replace xlist in net/rds/xlist.h with llist

Best Regards,
Huang Ying


* [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-08-30  5:16 [PATCH -mm 0/2] Use llist in irq_work and xlist Huang Ying
@ 2011-08-30  5:16 ` Huang Ying
  2011-08-31 10:10   ` Peter Zijlstra
  2011-08-30  5:16 ` [PATCH -mm 2/2] net, rds, Replace xlist in net/rds/xlist.h with llist Huang Ying
  1 sibling, 1 reply; 20+ messages in thread
From: Huang Ying @ 2011-08-30  5:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ying.huang, Peter Zijlstra

Use llist in irq_work instead of the open-coded lock-less linked list
implementation to avoid the code duplication.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/irq_work.h |   15 ++++---
 init/Kconfig             |    1 
 kernel/irq_work.c        |   92 ++++++++++++++++++-----------------------------
 3 files changed, 47 insertions(+), 61 deletions(-)

--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -1,20 +1,23 @@
 #ifndef _LINUX_IRQ_WORK_H
 #define _LINUX_IRQ_WORK_H
 
+#include <linux/llist.h>
+
 struct irq_work {
-	struct irq_work *next;
+	unsigned long flags;
+	struct llist_node llnode;
 	void (*func)(struct irq_work *);
 };
 
 static inline
-void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 {
-	entry->next = NULL;
-	entry->func = func;
+	work->flags = 0;
+	work->func = func;
 }
 
-bool irq_work_queue(struct irq_work *entry);
+bool irq_work_queue(struct irq_work *work);
 void irq_work_run(void);
-void irq_work_sync(struct irq_work *entry);
+void irq_work_sync(struct irq_work *work);
 
 #endif /* _LINUX_IRQ_WORK_H */
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -26,6 +26,7 @@ config HAVE_IRQ_WORK
 config IRQ_WORK
 	bool
 	depends on HAVE_IRQ_WORK
+	select LLIST
 
 menu "General setup"
 
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -19,49 +19,34 @@
  * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
  * pending   next, 3 -> {busy}          : queued, pending callback
  * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- *
- * We use the lower two bits of the next pointer to keep PENDING and BUSY
- * flags.
  */
 
 #define IRQ_WORK_PENDING	1UL
 #define IRQ_WORK_BUSY		2UL
 #define IRQ_WORK_FLAGS		3UL
 
-static inline bool irq_work_is_set(struct irq_work *entry, int flags)
-{
-	return (unsigned long)entry->next & flags;
-}
-
-static inline struct irq_work *irq_work_next(struct irq_work *entry)
-{
-	unsigned long next = (unsigned long)entry->next;
-	next &= ~IRQ_WORK_FLAGS;
-	return (struct irq_work *)next;
-}
+#define LIST_NONEMPTY_BIT	0
 
-static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
-{
-	unsigned long next = (unsigned long)entry;
-	next |= flags;
-	return (struct irq_work *)next;
-}
+struct irq_work_list {
+	unsigned long flags;
+	struct llist_head llist;
+};
 
-static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+static DEFINE_PER_CPU(struct irq_work_list, irq_work_lists);
 
 /*
  * Claim the entry so that no one else will poke at it.
  */
-static bool irq_work_claim(struct irq_work *entry)
+static bool irq_work_claim(struct irq_work *work)
 {
-	struct irq_work *next, *nflags;
+	unsigned long flags, nflags;
 
 	do {
-		next = entry->next;
-		if ((unsigned long)next & IRQ_WORK_PENDING)
+		flags = work->flags;
+		if (flags & IRQ_WORK_PENDING)
 			return false;
-		nflags = next_flags(next, IRQ_WORK_FLAGS);
-	} while (cmpxchg(&entry->next, next, nflags) != next);
+		nflags = flags | IRQ_WORK_FLAGS;
+	} while (cmpxchg(&work->flags, flags, nflags) != flags);
 
 	return true;
 }
@@ -77,23 +62,19 @@ void __weak arch_irq_work_raise(void)
 /*
  * Queue the entry and raise the IPI if needed.
  */
-static void __irq_work_queue(struct irq_work *entry)
+static void __irq_work_queue(struct irq_work *work)
 {
-	struct irq_work *next;
+	struct irq_work_list *irq_work_list;
 
-	preempt_disable();
+	irq_work_list = &get_cpu_var(irq_work_lists);
 
-	do {
-		next = __this_cpu_read(irq_work_list);
-		/* Can assign non-atomic because we keep the flags set. */
-		entry->next = next_flags(next, IRQ_WORK_FLAGS);
-	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
+	llist_add(&work->llnode, &irq_work_list->llist);
 
 	/* The list was empty, raise self-interrupt to start processing. */
-	if (!irq_work_next(entry))
+	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
 		arch_irq_work_raise();
 
-	preempt_enable();
+	put_cpu_var(irq_work_list);
 }
 
 /*
@@ -102,16 +83,16 @@ static void __irq_work_queue(struct irq_
  *
  * Can be re-enqueued while the callback is still in progress.
  */
-bool irq_work_queue(struct irq_work *entry)
+bool irq_work_queue(struct irq_work *work)
 {
-	if (!irq_work_claim(entry)) {
+	if (!irq_work_claim(work)) {
 		/*
 		 * Already enqueued, can't do!
 		 */
 		return false;
 	}
 
-	__irq_work_queue(entry);
+	__irq_work_queue(work);
 	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -122,34 +103,35 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
  */
 void irq_work_run(void)
 {
-	struct irq_work *list;
+	struct irq_work *work;
+	struct irq_work_list *irq_work_list;
+	struct llist_node *llnode;
 
-	if (this_cpu_read(irq_work_list) == NULL)
+	irq_work_list = &__get_cpu_var(irq_work_lists);
+	if (llist_empty(&irq_work_list->llist))
 		return;
 
 	BUG_ON(!in_irq());
 	BUG_ON(!irqs_disabled());
 
-	list = this_cpu_xchg(irq_work_list, NULL);
-
-	while (list != NULL) {
-		struct irq_work *entry = list;
+	clear_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags);
+	llnode = llist_del_all(&irq_work_list->llist);
+	while (llnode != NULL) {
+		work = llist_entry(llnode, struct irq_work, llnode);
 
-		list = irq_work_next(list);
+		llnode = llnode->next;
 
 		/*
-		 * Clear the PENDING bit, after this point the @entry
+		 * Clear the PENDING bit, after this point the @work
 		 * can be re-used.
 		 */
-		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
-		entry->func(entry);
+		work->flags = IRQ_WORK_BUSY;
+		work->func(work);
 		/*
 		 * Clear the BUSY bit and return to the free state if
 		 * no-one else claimed it meanwhile.
 		 */
-		(void)cmpxchg(&entry->next,
-			      next_flags(NULL, IRQ_WORK_BUSY),
-			      NULL);
+		(void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
 	}
 }
 EXPORT_SYMBOL_GPL(irq_work_run);
@@ -158,11 +140,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
  * Synchronize against the irq_work @entry, ensures the entry is not
  * currently in use.
  */
-void irq_work_sync(struct irq_work *entry)
+void irq_work_sync(struct irq_work *work)
 {
 	WARN_ON_ONCE(irqs_disabled());
 
-	while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+	while (work->flags & IRQ_WORK_BUSY)
 		cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);


* [PATCH -mm 2/2] net, rds, Replace xlist in net/rds/xlist.h with llist
  2011-08-30  5:16 [PATCH -mm 0/2] Use llist in irq_work and xlist Huang Ying
  2011-08-30  5:16 ` [PATCH -mm 1/2] irq_work, Use llist in irq_work Huang Ying
@ 2011-08-30  5:16 ` Huang Ying
  1 sibling, 0 replies; 20+ messages in thread
From: Huang Ying @ 2011-08-30  5:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ying.huang, Chris Mason

The functionality of xlist and llist is almost the same.  This patch
replaces xlist with llist to avoid code duplication.
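
The conversion maps the old xlist API onto llist roughly as follows
(summarizing the diff below):

  INIT_XLIST_HEAD()            ->  init_llist_head()
  xlist_add(new, tail, head)   ->  llist_add() / llist_add_batch()
  xlist_del_head()             ->  llist_del_first()
  xlist_splice()               ->  llist_del_all()
  xlist_empty()                ->  llist_empty()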

Known issue: I don't know how to test this; does it need special hardware?

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: Chris Mason <chris.mason@oracle.com>
---
 net/rds/Kconfig   |    1 
 net/rds/ib_rdma.c |  112 ++++++++++++++++++++++++------------------------------
 net/rds/xlist.h   |   80 --------------------------------------
 3 files changed, 52 insertions(+), 141 deletions(-)
 delete mode 100644 net/rds/xlist.h

--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -9,6 +9,7 @@ config RDS
 
 config RDS_RDMA
 	tristate "RDS over Infiniband and iWARP"
+	select LLIST
 	depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
 	---help---
 	  Allow RDS to use Infiniband and iWARP as a transport.
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -33,10 +33,10 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/rculist.h>
+#include <linux/llist.h>
 
 #include "rds.h"
 #include "ib.h"
-#include "xlist.h"
 
 static DEFINE_PER_CPU(unsigned long, clean_list_grace);
 #define CLEAN_LIST_BUSY_BIT 0
@@ -49,7 +49,7 @@ struct rds_ib_mr {
 	struct rds_ib_mr_pool	*pool;
 	struct ib_fmr		*fmr;
 
-	struct xlist_head	xlist;
+	struct llist_node	llnode;
 
 	/* unmap_list is for freeing */
 	struct list_head	unmap_list;
@@ -71,9 +71,9 @@ struct rds_ib_mr_pool {
 	atomic_t		item_count;		/* total # of MRs */
 	atomic_t		dirty_count;		/* # dirty of MRs */
 
-	struct xlist_head	drop_list;		/* MRs that have reached their max_maps limit */
-	struct xlist_head	free_list;		/* unused MRs */
-	struct xlist_head	clean_list;		/* global unused & unamapped MRs */
+	struct llist_head	drop_list;		/* MRs that have reached their max_maps limit */
+	struct llist_head	free_list;		/* unused MRs */
+	struct llist_head	clean_list;		/* global unused & unamapped MRs */
 	wait_queue_head_t	flush_wait;
 
 	atomic_t		free_pinned;		/* memory pinned by free MRs */
@@ -220,9 +220,9 @@ struct rds_ib_mr_pool *rds_ib_create_mr_
 	if (!pool)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_XLIST_HEAD(&pool->free_list);
-	INIT_XLIST_HEAD(&pool->drop_list);
-	INIT_XLIST_HEAD(&pool->clean_list);
+	init_llist_head(&pool->free_list);
+	init_llist_head(&pool->drop_list);
+	init_llist_head(&pool->clean_list);
 	mutex_init(&pool->flush_lock);
 	init_waitqueue_head(&pool->flush_wait);
 	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
@@ -260,26 +260,18 @@ void rds_ib_destroy_mr_pool(struct rds_i
 	kfree(pool);
 }
 
-static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
-			 struct rds_ib_mr **ibmr_ret)
-{
-	struct xlist_head *ibmr_xl;
-	ibmr_xl = xlist_del_head_fast(xl);
-	*ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
-}
-
 static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
-	struct xlist_head *ret;
+	struct llist_node *ret;
 	unsigned long *flag;
 
 	preempt_disable();
 	flag = &__get_cpu_var(clean_list_grace);
 	set_bit(CLEAN_LIST_BUSY_BIT, flag);
-	ret = xlist_del_head(&pool->clean_list);
+	ret = llist_del_first(&pool->clean_list);
 	if (ret)
-		ibmr = list_entry(ret, struct rds_ib_mr, xlist);
+		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
 
 	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
 	preempt_enable();
@@ -529,46 +521,44 @@ static inline unsigned int rds_ib_flush_
 }
 
 /*
- * given an xlist of mrs, put them all into the list_head for more processing
+ * given an llist of mrs, put them all into the list_head for more processing
  */
-static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
+static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
 {
 	struct rds_ib_mr *ibmr;
-	struct xlist_head splice;
-	struct xlist_head *cur;
-	struct xlist_head *next;
-
-	splice.next = NULL;
-	xlist_splice(xlist, &splice);
-	cur = splice.next;
-	while (cur) {
-		next = cur->next;
-		ibmr = list_entry(cur, struct rds_ib_mr, xlist);
+	struct llist_node *node;
+	struct llist_node *next;
+
+	node = llist_del_all(llist);
+	while (node) {
+		next = node->next;
+		ibmr = llist_entry(node, struct rds_ib_mr, llnode);
 		list_add_tail(&ibmr->unmap_list, list);
-		cur = next;
+		node = next;
 	}
 }
 
 /*
- * this takes a list head of mrs and turns it into an xlist of clusters.
- * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
- * reuse.
+ * this takes a list head of mrs and turns it into linked llist nodes
+ * of clusters.  Each cluster has linked llist nodes of
+ * MR_CLUSTER_SIZE mrs that are ready for reuse.
  */
-static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
-				struct list_head *list, struct xlist_head *xlist,
-				struct xlist_head **tail_ret)
+static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
+				struct list_head *list,
+				struct llist_node **nodes_head,
+				struct llist_node **nodes_tail)
 {
 	struct rds_ib_mr *ibmr;
-	struct xlist_head *cur_mr = xlist;
-	struct xlist_head *tail_mr = NULL;
+	struct llist_node *cur = NULL;
+	struct llist_node **next = nodes_head;
 
 	list_for_each_entry(ibmr, list, unmap_list) {
-		tail_mr = &ibmr->xlist;
-		tail_mr->next = NULL;
-		cur_mr->next = tail_mr;
-		cur_mr = tail_mr;
+		cur = &ibmr->llnode;
+		*next = cur;
+		next = &cur->next;
 	}
-	*tail_ret = tail_mr;
+	*next = NULL;
+	*nodes_tail = cur;
 }
 
 /*
@@ -581,8 +571,8 @@ static int rds_ib_flush_mr_pool(struct r
 			        int free_all, struct rds_ib_mr **ibmr_ret)
 {
 	struct rds_ib_mr *ibmr, *next;
-	struct xlist_head clean_xlist;
-	struct xlist_head *clean_tail;
+	struct llist_node *clean_nodes;
+	struct llist_node *clean_tail;
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(fmr_list);
 	unsigned long unpinned = 0;
@@ -603,7 +593,7 @@ static int rds_ib_flush_mr_pool(struct r
 
 			prepare_to_wait(&pool->flush_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
-			if (xlist_empty(&pool->clean_list))
+			if (llist_empty(&pool->clean_list))
 				schedule();
 
 			ibmr = rds_ib_reuse_fmr(pool);
@@ -628,10 +618,10 @@ static int rds_ib_flush_mr_pool(struct r
 	/* Get the list of all MRs to be dropped. Ordering matters -
 	 * we want to put drop_list ahead of free_list.
 	 */
-	xlist_append_to_list(&pool->drop_list, &unmap_list);
-	xlist_append_to_list(&pool->free_list, &unmap_list);
+	llist_append_to_list(&pool->drop_list, &unmap_list);
+	llist_append_to_list(&pool->free_list, &unmap_list);
 	if (free_all)
-		xlist_append_to_list(&pool->clean_list, &unmap_list);
+		llist_append_to_list(&pool->clean_list, &unmap_list);
 
 	free_goal = rds_ib_flush_goal(pool, free_all);
 
@@ -663,22 +653,22 @@ static int rds_ib_flush_mr_pool(struct r
 	if (!list_empty(&unmap_list)) {
 		/* we have to make sure that none of the things we're about
 		 * to put on the clean list would race with other cpus trying
-		 * to pull items off.  The xlist would explode if we managed to
+		 * to pull items off.  The llist would explode if we managed to
 		 * remove something from the clean list and then add it back again
-		 * while another CPU was spinning on that same item in xlist_del_head.
+		 * while another CPU was spinning on that same item in llist_del_first.
 		 *
-		 * This is pretty unlikely, but just in case  wait for an xlist grace period
+		 * This is pretty unlikely, but just in case  wait for an llist grace period
 		 * here before adding anything back into the clean list.
 		 */
 		wait_clean_list_grace();
 
-		list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
+		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
 		if (ibmr_ret)
-			refill_local(pool, &clean_xlist, ibmr_ret);
+			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
 
-		/* refill_local may have emptied our list */
-		if (!xlist_empty(&clean_xlist))
-			xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
+		/* more than one entry in llist nodes */
+		if (clean_nodes->next)
+			llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
 
 	}
 
@@ -711,9 +701,9 @@ void rds_ib_free_mr(void *trans_private,
 
 	/* Return it to the pool's free list */
 	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
-		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
+		llist_add(&ibmr->llnode, &pool->drop_list);
 	else
-		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
+		llist_add(&ibmr->llnode, &pool->free_list);
 
 	atomic_add(ibmr->sg_len, &pool->free_pinned);
 	atomic_inc(&pool->dirty_count);
--- a/net/rds/xlist.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef _LINUX_XLIST_H
-#define _LINUX_XLIST_H
-
-#include <linux/stddef.h>
-#include <linux/poison.h>
-#include <linux/prefetch.h>
-#include <asm/system.h>
-
-struct xlist_head {
-	struct xlist_head *next;
-};
-
-static inline void INIT_XLIST_HEAD(struct xlist_head *list)
-{
-	list->next = NULL;
-}
-
-static inline int xlist_empty(struct xlist_head *head)
-{
-	return head->next == NULL;
-}
-
-static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
-			     struct xlist_head *head)
-{
-	struct xlist_head *cur;
-	struct xlist_head *check;
-
-	while (1) {
-		cur = head->next;
-		tail->next = cur;
-		check = cmpxchg(&head->next, cur, new);
-		if (check == cur)
-			break;
-	}
-}
-
-static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
-{
-	struct xlist_head *cur;
-	struct xlist_head *check;
-	struct xlist_head *next;
-
-	while (1) {
-		cur = head->next;
-		if (!cur)
-			goto out;
-
-		next = cur->next;
-		check = cmpxchg(&head->next, cur, next);
-		if (check == cur)
-			goto out;
-	}
-out:
-	return cur;
-}
-
-static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
-{
-	struct xlist_head *cur;
-
-	cur = head->next;
-	if (!cur)
-		return NULL;
-
-	head->next = cur->next;
-	return cur;
-}
-
-static inline void xlist_splice(struct xlist_head *list,
-				struct xlist_head *head)
-{
-	struct xlist_head *cur;
-
-	WARN_ON(head->next);
-	cur = xchg(&list->next, NULL);
-	head->next = cur;
-}
-
-#endif


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-08-30  5:16 ` [PATCH -mm 1/2] irq_work, Use llist in irq_work Huang Ying
@ 2011-08-31 10:10   ` Peter Zijlstra
  2011-09-01  1:46     ` Huang Ying
  0 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2011-08-31 10:10 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andrew Morton, linux-kernel

On Tue, 2011-08-30 at 13:16 +0800, Huang Ying wrote:
> Use llist in irq_work instead of the lock-less linked list
> implementation in irq_work to avoid the code duplication.

Except you make code horrid as well.. both this and xlist don't have
additional function calls, whereas you do.

Also, WTFH do you have unconditional cpu_relax() calls inside the
cmpxchg() loops, that's just bloody insane.

Move all of lib/llist.c inline, create a new macro for the

#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG                                                    
        BUG_ON(in_nmi());                                                                    
#endif 

blurb and lose the LLIST Kconfig.
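
Something like this, say (a sketch; the macro name is made up):

#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
# define llist_nmi_safe_check()	BUG_ON(in_nmi())
#else
# define llist_nmi_safe_check()	do { } while (0)
#endif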

> Signed-off-by: Huang Ying <ying.huang@intel.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> ---
>  include/linux/irq_work.h |   15 ++++---
>  init/Kconfig             |    1 
>  kernel/irq_work.c        |   92 ++++++++++++++++++-----------------------------
>  3 files changed, 47 insertions(+), 61 deletions(-)
> 
> --- a/include/linux/irq_work.h
> +++ b/include/linux/irq_work.h
> @@ -1,20 +1,23 @@
>  #ifndef _LINUX_IRQ_WORK_H
>  #define _LINUX_IRQ_WORK_H
>  
> +#include <linux/llist.h>
> +
>  struct irq_work {
> -	struct irq_work *next;
> +	unsigned long flags;
> +	struct llist_node llnode;
>  	void (*func)(struct irq_work *);
>  };

Separating out the flags is unfortunate, but ok.


> +#define LIST_NONEMPTY_BIT	0

This is just sad, see below.
 
> -static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
> -{
> -	unsigned long next = (unsigned long)entry;
> -	next |= flags;
> -	return (struct irq_work *)next;
> -}
> +struct irq_work_list {
> +	unsigned long flags;
> +	struct llist_head llist;
> +};

which is superfluous


> @@ -77,23 +62,19 @@ void __weak arch_irq_work_raise(void)
>  /*
>   * Queue the entry and raise the IPI if needed.
>   */
> -static void __irq_work_queue(struct irq_work *entry)
> +static void __irq_work_queue(struct irq_work *work)
>  {
> -	struct irq_work *next;
> +	struct irq_work_list *irq_work_list;
>  
> -	preempt_disable();
> +	irq_work_list = &get_cpu_var(irq_work_lists);
>  
> -	do {
> -		next = __this_cpu_read(irq_work_list);
> -		/* Can assign non-atomic because we keep the flags set. */
> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
> +	llist_add(&work->llnode, &irq_work_list->llist);
>  
>  	/* The list was empty, raise self-interrupt to start processing. */
> -	if (!irq_work_next(entry))
> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
>  		arch_irq_work_raise();

So why can't you simply test work->llnode->next? And lose the get/put
cpu muck? The existing preempt_disable/enable() are already superfluous
and could be removed; you just made all this way more horrid than it
needs to be.

>  
> -	preempt_enable();
> +	put_cpu_var(irq_work_list);
>  }



* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-08-31 10:10   ` Peter Zijlstra
@ 2011-09-01  1:46     ` Huang Ying
  2011-09-01  3:20       ` Huang Ying
  2011-09-01  7:57       ` Peter Zijlstra
  0 siblings, 2 replies; 20+ messages in thread
From: Huang Ying @ 2011-09-01  1:46 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel

Hi, Peter,

Thanks for your comments.

On 08/31/2011 06:10 PM, Peter Zijlstra wrote:
> On Tue, 2011-08-30 at 13:16 +0800, Huang Ying wrote:
>> Use llist in irq_work instead of the lock-less linked list
>> implementation in irq_work to avoid the code duplication.
> 
> Except you make code horrid as well.. both this and xlist don't have
> additional function calls, whereas you do.
> 
> Also, WTFH do you have unconditinoal cpu_relax() calls inside the
> cmpxchg() loops, that's just bloody insane.

You mean we should not use cpu_relax() before the first cmpxchg?  You
suggest something like the following?

void llist_add(struct llist_node *new, struct llist_head *head)
{
        struct llist_node *entry, *old_entry;

#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
        BUG_ON(in_nmi());
#endif

        entry = head->first;
        for (;;) {
                old_entry = entry;
                new->next = entry;
                entry = cmpxchg(&head->first, old_entry, new);
                if (entry == old_entry)
                        break;
                cpu_relax();
        }
}

> Move all of lib/llist.c inline, create a new macro for the
> 
> #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG                                                    
>         BUG_ON(in_nmi());                                                                    
> #endif 
> 
> blurb and loose the LLIST Kconfig.

OK.  I will do that.

>> Signed-off-by: Huang Ying <ying.huang@intel.com>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> ---
>>  include/linux/irq_work.h |   15 ++++---
>>  init/Kconfig             |    1 
>>  kernel/irq_work.c        |   92 ++++++++++++++++++-----------------------------
>>  3 files changed, 47 insertions(+), 61 deletions(-)
>>
>> --- a/include/linux/irq_work.h
>> +++ b/include/linux/irq_work.h
>> @@ -1,20 +1,23 @@
>>  #ifndef _LINUX_IRQ_WORK_H
>>  #define _LINUX_IRQ_WORK_H
>>  
>> +#include <linux/llist.h>
>> +
>>  struct irq_work {
>> -	struct irq_work *next;
>> +	unsigned long flags;
>> +	struct llist_node llnode;
>>  	void (*func)(struct irq_work *);
>>  };
> 
> Separating out the flags is unfortunate, but ok.
> 
> 
>> +#define LIST_NONEMPTY_BIT	0
> 
> This is just sad, see below.
>  
>> -static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
>> -{
>> -	unsigned long next = (unsigned long)entry;
>> -	next |= flags;
>> -	return (struct irq_work *)next;
>> -}
>> +struct irq_work_list {
>> +	unsigned long flags;
>> +	struct llist_head llist;
>> +};
> 
> which is superfluous
> 
> 
>> @@ -77,23 +62,19 @@ void __weak arch_irq_work_raise(void)
>>  /*
>>   * Queue the entry and raise the IPI if needed.
>>   */
>> -static void __irq_work_queue(struct irq_work *entry)
>> +static void __irq_work_queue(struct irq_work *work)
>>  {
>> -	struct irq_work *next;
>> +	struct irq_work_list *irq_work_list;
>>  
>> -	preempt_disable();
>> +	irq_work_list = &get_cpu_var(irq_work_lists);
>>  
>> -	do {
>> -		next = __this_cpu_read(irq_work_list);
>> -		/* Can assign non-atomic because we keep the flags set. */
>> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
>> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
>> +	llist_add(&work->llnode, &irq_work_list->llist);
>>  
>>  	/* The list was empty, raise self-interrupt to start processing. */
>> -	if (!irq_work_next(entry))
>> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
>>  		arch_irq_work_raise();
> 
> So why can't you simply test work->llnode->next?

Yes.  That is better.  Even if there may be a small race window, it is
not a big issue to raise one extra self-interrupt occasionally.

> and loose the get/put
> cpu muck? The existing preempt_disable/enable() are already superfluous
> and could be removed, you just made all this way more horrid than need
> be.

Will removing preempt_disable/enable cause a race condition?
Consider something like the following:

- get irq_work_list of CPU A
- queue irq_work into irq_work_list of CPU A
- preempted and resumed execution on CPU B
- arch_irq_work_raise on CPU B

irq_work_run on CPU B will do nothing, while the irq_work has to wait
for the next timer interrupt.  Isn't that an issue?
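
In code, the problematic interleaving would look like this (a sketch of
__irq_work_queue() with the get/put_cpu_var pair removed):

	irq_work_list = &__get_cpu_var(irq_work_lists);   /* on CPU A */
	llist_add(&work->llnode, &irq_work_list->llist);  /* queued on CPU A */
	/* <-- preempted here, task resumes on CPU B */
	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
		arch_irq_work_raise();	/* self-IPI on CPU B, work is on CPU A */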

Best Regards,
Huang Ying


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  1:46     ` Huang Ying
@ 2011-09-01  3:20       ` Huang Ying
  2011-09-01  7:58         ` Peter Zijlstra
  2011-09-01  7:57       ` Peter Zijlstra
  1 sibling, 1 reply; 20+ messages in thread
From: Huang Ying @ 2011-09-01  3:20 UTC (permalink / raw)
  To: Huang Ying; +Cc: Peter Zijlstra, Andrew Morton, linux-kernel

On 09/01/2011 09:46 AM, Huang Ying wrote:
>>> -static void __irq_work_queue(struct irq_work *entry)
>>> +static void __irq_work_queue(struct irq_work *work)
>>>  {
>>> -	struct irq_work *next;
>>> +	struct irq_work_list *irq_work_list;
>>>  
>>> -	preempt_disable();
>>> +	irq_work_list = &get_cpu_var(irq_work_lists);
>>>  
>>> -	do {
>>> -		next = __this_cpu_read(irq_work_list);
>>> -		/* Can assign non-atomic because we keep the flags set. */
>>> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
>>> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
>>> +	llist_add(&work->llnode, &irq_work_list->llist);
>>>  
>>>  	/* The list was empty, raise self-interrupt to start processing. */
>>> -	if (!irq_work_next(entry))
>>> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
>>>  		arch_irq_work_raise();
>>
>> So why can't you simply test work->llnode->next?
> 
> Yes.  That is better.  Even if there may be a small race window, it is
> not a big issue to raise one extra self interrupt seldom.

Remember something about this.  I didn't test work->llnode->next here
because I didn't want to expose implementation details like that here.
How about making llist_add() return whether the list was empty before
adding?  Because it will be an inline function, that should be optimized
out if the caller does not need the information.
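
Something like this (a rough sketch; NMI check omitted):

static inline bool llist_add(struct llist_node *new, struct llist_head *head)
{
	struct llist_node *entry, *old_entry;

	entry = head->first;
	for (;;) {
		old_entry = entry;
		new->next = entry;
		entry = cmpxchg(&head->first, old_entry, new);
		if (entry == old_entry)
			break;
	}
	/* old_entry == NULL means the list was empty before this add */
	return old_entry == NULL;
}

Then __irq_work_queue() could simply do
"if (llist_add(&work->llnode, &irq_work_list->llist))
arch_irq_work_raise();".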

Best Regards,
Huang Ying


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  1:46     ` Huang Ying
  2011-09-01  3:20       ` Huang Ying
@ 2011-09-01  7:57       ` Peter Zijlstra
  2011-09-01  8:44         ` Huang Ying
  1 sibling, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2011-09-01  7:57 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andrew Morton, linux-kernel

On Thu, 2011-09-01 at 09:46 +0800, Huang Ying wrote:

> You mean we should not use cpu_relax before the first cmpxchg? 

Yeah, that's just wasting time for no reason..

>  You suggest something as follow?
> 
> void llist_add(struct llist_node *new, struct llist_head *head)
> {
>         struct llist_node *entry, *old_entry;
> 
> #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
>         BUG_ON(in_nmi());
> #endif
> 
>         entry = head->first;
>         for (;;) {
>                 old_entry = entry;
>                 new->next = entry;
>                 entry = cmpxchg(&head->first, old_entry, new);
>                 if (entry == old_entry)
>                         break;
>                 cpu_relax();
>         }
> }

If you insist on having cpu_relax(), then yes, that's lots better.  It
also avoids the assignment in your conditional.  The thing with
cpu_relax() is that it's only beneficial in the highly contended case
and degrades light/un-contended loads.

Also, just noticed, why do you have different list_head/list_node
structures? They're the same, a single pointer.

> > and loose the get/put
> > cpu muck? The existing preempt_disable/enable() are already superfluous
> > and could be removed, you just made all this way more horrid than need
> > be.
> 
> Will it cause race condition to remove preempt_disable/enable?
> Considering something as follow:
> 
> - get irq_work_list of CPU A
> - queue irq_work into irq_work_list of CPU A
> - preempted and resumed execution on CPU B
> - arch_irq_work_raise on CPU B
> 
> irq_work_run on CPU B will do nothing.  While irq_work need to wait for
> next timer interrupt.  Isn't it an issue?

Yes, that's unfortunate; the current version would work just fine
without preempt, but that's because of the this_cpu_* ops foo.

Not sure it would make sense to add a special this_cpu_llist_add() or
so.. esp seeing that this_cpu_* is basically x86-only.
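
Something like this, I suppose (purely hypothetical sketch, built on the
existing this_cpu_read()/this_cpu_cmpxchg() ops):

/* hypothetical; (head) must be a per-cpu llist_head */
#define this_cpu_llist_add(new, head)					\
({									\
	struct llist_node *_old;					\
	do {								\
		_old = __this_cpu_read((head).first);			\
		(new)->next = _old;					\
	} while (this_cpu_cmpxchg((head).first, _old, (new)) != _old);	\
	_old == NULL;	/* list was empty before the add */		\
})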


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  3:20       ` Huang Ying
@ 2011-09-01  7:58         ` Peter Zijlstra
  2011-09-01  8:56           ` Huang Ying
  0 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2011-09-01  7:58 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andrew Morton, linux-kernel

On Thu, 2011-09-01 at 11:20 +0800, Huang Ying wrote:
> On 09/01/2011 09:46 AM, Huang Ying wrote:
> >>> -static void __irq_work_queue(struct irq_work *entry)
> >>> +static void __irq_work_queue(struct irq_work *work)
> >>>  {
> >>> -	struct irq_work *next;
> >>> +	struct irq_work_list *irq_work_list;
> >>>  
> >>> -	preempt_disable();
> >>> +	irq_work_list = &get_cpu_var(irq_work_lists);
> >>>  
> >>> -	do {
> >>> -		next = __this_cpu_read(irq_work_list);
> >>> -		/* Can assign non-atomic because we keep the flags set. */
> >>> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
> >>> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
> >>> +	llist_add(&work->llnode, &irq_work_list->llist);
> >>>  
> >>>  	/* The list was empty, raise self-interrupt to start processing. */
> >>> -	if (!irq_work_next(entry))
> >>> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
> >>>  		arch_irq_work_raise();
> >>
> >> So why can't you simply test work->llnode->next?
> > 
> > Yes.  That is better.  Even if there may be a small race window, it is
> > not a big issue to raise one extra self interrupt seldom.
> 
> Remember something about this.  I didn't test work->llnode->next here
> because I didn't want expose the implementation details like that here.
>  How about make llist_add() return whether list is empty before adding?
>  Because it will be an inline function, that should be optimized out if
> the caller do not need the information.

You could also use llist_empty() although that brings me to that
ACCESS_ONCE thing in there, what's the point?


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  7:57       ` Peter Zijlstra
@ 2011-09-01  8:44         ` Huang Ying
  2011-09-01 10:00           ` Peter Zijlstra
  0 siblings, 1 reply; 20+ messages in thread
From: Huang Ying @ 2011-09-01  8:44 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel

On 09/01/2011 03:57 PM, Peter Zijlstra wrote:
> On Thu, 2011-09-01 at 09:46 +0800, Huang Ying wrote:
> 
>> You mean we should not use cpu_relax before the first cmpxchg? 
> 
> Yeah, that's just wasting time for no reason..
> 
>>  You suggest something as follow?
>>
>> void llist_add(struct llist_node *new, struct llist_head *head)
>> {
>>         struct llist_node *entry, *old_entry;
>>
>> #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
>>         BUG_ON(in_nmi());
>> #endif
>>
>>         entry = head->first;
>>         for (;;) {
>>                 old_entry = entry;
>>                 new->next = entry;
>>                 entry = cmpxchg(&head->first, old_entry, new);
>>                 if (entry == old_entry)
>>                         break;
>>                 cpu_relax();
>>         }
>> }
> 
> If you insist on having cpu_relax(), then yes that's lots better. Also
> avoids the assignment in your conditional. Thing with cpu_relax() is
> that its only beneficial in the highly contended case and degrade
> light/un-contended loads.

Because llist is in the library, it may be used both in highly
contended cases and under light/un-contended loads.  So maybe the code
above is the best choice for llist.

> Also, just noticed, why do you have different list_head/list_node
> structures? They're the same, a single pointer.

There was only one structure before (llist_head).  Linus gave me some
comments about that, suggesting the use of 2 structures.  I think his
comments are reasonable.  Copied here:

"
Btw, looking at the lockless list, I really think it's a mistake to
use "struct llist_head" for both the head and for the list linkage.

They are _very_ different. In the <linux/list.h> kind of model, the
list_head really *is* the same regardless of where in the list you
are, and you can have headless lists (ie a list that doesn't really
have an anchor, the list is just comprised of the elements
themselves).

But that's not true of the lockless list. There, the head is very much
magical, and as I mentioned, you can NOT do a "traverse starting at
head" thing. You can only do add/del/clear on the head, and then to
traverse you have to have a list *without* the head entry, because it
has been removed.

So from a pure type safety angle, I think the lockless list should
have clearly separate types for the head and for the actual list
entries, because they behave totally differently. When the normal
"list_add()" things can be used to add things in the middle - so each
list entry really can act as a head - that isn't true at all for the
lockless version.
"

>>> and loose the get/put
>>> cpu muck? The existing preempt_disable/enable() are already superfluous
>>> and could be removed, you just made all this way more horrid than need
>>> be.
>>
>> Will it cause race condition to remove preempt_disable/enable?
>> Considering something as follow:
>>
>> - get irq_work_list of CPU A
>> - queue irq_work into irq_work_list of CPU A
>> - preempted and resumed execution on CPU B
>> - arch_irq_work_raise on CPU B
>>
>> irq_work_run on CPU B will do nothing.  While irq_work need to wait for
>> next timer interrupt.  Isn't it an issue?
> 
> Yes that's unfortunate, the current version would work just fine without
> preempt but that's because of the this_cpu_* ops foo. 
> 
> Not sure it would make sense to add a special this_cpu_llist_add() or
> so.. esp seeing that this_cpu_* is basically x86-only.

Even with this_cpu_* ops, it seems that the race condition I described
is still possible.

- use this_cpu_cmpxchg to queue irq_work into irq_work_list of CPU A
- preempted and resumed execution on CPU B
- arch_irq_work_raise on CPU B

Do you think so?

Best Regards,
Huang Ying


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  7:58         ` Peter Zijlstra
@ 2011-09-01  8:56           ` Huang Ying
  2011-09-01  9:57             ` Peter Zijlstra
  2011-09-01 12:51             ` Mathieu Desnoyers
  0 siblings, 2 replies; 20+ messages in thread
From: Huang Ying @ 2011-09-01  8:56 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel, Mathieu Desnoyers

On 09/01/2011 03:58 PM, Peter Zijlstra wrote:
> On Thu, 2011-09-01 at 11:20 +0800, Huang Ying wrote:
>> On 09/01/2011 09:46 AM, Huang Ying wrote:
>>>>> -static void __irq_work_queue(struct irq_work *entry)
>>>>> +static void __irq_work_queue(struct irq_work *work)
>>>>>  {
>>>>> -	struct irq_work *next;
>>>>> +	struct irq_work_list *irq_work_list;
>>>>>  
>>>>> -	preempt_disable();
>>>>> +	irq_work_list = &get_cpu_var(irq_work_lists);
>>>>>  
>>>>> -	do {
>>>>> -		next = __this_cpu_read(irq_work_list);
>>>>> -		/* Can assign non-atomic because we keep the flags set. */
>>>>> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
>>>>> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
>>>>> +	llist_add(&work->llnode, &irq_work_list->llist);
>>>>>  
>>>>>  	/* The list was empty, raise self-interrupt to start processing. */
>>>>> -	if (!irq_work_next(entry))
>>>>> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
>>>>>  		arch_irq_work_raise();
>>>>
>>>> So why can't you simply test work->llnode->next?
>>>
>>> Yes.  That is better.  Even if there may be a small race window, it is
>>> not a big issue to raise one extra self interrupt seldom.
>>
>> Remember something about this.  I didn't test work->llnode->next here
>> because I didn't want expose the implementation details like that here.
>>  How about make llist_add() return whether list is empty before adding?
>>  Because it will be an inline function, that should be optimized out if
>> the caller do not need the information.
> 
> You could also use llist_empty() although that brings me to that
> ACCESS_ONCE thing in there, what's the point?

Something like the following with llist_empty() does not seem to work.

  empty = llist_empty(irq_work_list);
  llist_add(&work->llnode, irq_work_list);
  if (empty)
          arch_irq_work_raise();

Because the irq_work IRQ handler or the timer IRQ handler may be
executed just before "llist_add(&work->llnode, irq_work_list)",
arch_irq_work_raise() should still be executed even though
"empty == false".

Can you tell me how to do that with llist_empty()?


For ACCESS_ONCE, Mathieu suggested that I add it.

Hi, Mathieu,

Can you explain why ACCESS_ONCE should be used here?

Best Regards,
Huang Ying


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  8:56           ` Huang Ying
@ 2011-09-01  9:57             ` Peter Zijlstra
  2011-09-02  1:14               ` Huang Ying
  2011-09-01 12:51             ` Mathieu Desnoyers
  1 sibling, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2011-09-01  9:57 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andrew Morton, linux-kernel, Mathieu Desnoyers

On Thu, 2011-09-01 at 16:56 +0800, Huang Ying wrote:
> Something as follow with llist_empty() seems not work.
> 
>   empty = llist_empty(irq_work_list);
>   llist_add(&work->llnode, irq_work_list);
>   if (empty)
>           arch_irq_work_raise();
> 
> Because irq_work IRQ handler or timer IRQ handler may be executed just
> before "llist_add(&work->llnode, irq_work_list)", so that, although
> "empty == false", arch_irq_work_raise() still should be executed.

Right, I was thinking:

	llist_add(&work->llist, irq_work_list);
	if (llist_empty(&work->llist))
		arch_irq_work_raise();

And then ran into the difference between llist_node and llist_head. Now
we could sort that by introducing llist_next() and write it like:

	if (!llist_next(&work->llist))
		arch_irq_work_raise();
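
with llist_next() being a trivial accessor, e.g. (sketch):

static inline struct llist_node *llist_next(struct llist_node *node)
{
	return node->next;
}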




* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  8:44         ` Huang Ying
@ 2011-09-01 10:00           ` Peter Zijlstra
  2011-09-02  1:18             ` Huang Ying
  0 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2011-09-01 10:00 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andrew Morton, linux-kernel

On Thu, 2011-09-01 at 16:44 +0800, Huang Ying wrote:

> Because llist is in library, it may be used in highly contended case and
> light/un-contended loads.  So maybe code as above is best choice for llist.

Well the thing is, if you're heavily contended you should probably be
doing something else..

> > Also, just noticed, why do you have different list_head/list_node
> > structures? They're the same, a single pointer.
> 
> There is only one structure before (llist_head).  Linus give me some
> comments about that, suggests to use 2 structures.  I think his comments
> are reasonable.  Copied here:

OK

> Even with this_cpu_* ops, it seems that the race condition I described
> is still possible.
> 
> - use this_cpu_cmpxchg to queue irq_work into irq_work_list of CPU A
> - preempted and resumed execution on CPU B
> - arch_irq_work_raise on CPU B
> 
> Do you think so?

No you're right, got my brain inside-out today or so..


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  8:56           ` Huang Ying
  2011-09-01  9:57             ` Peter Zijlstra
@ 2011-09-01 12:51             ` Mathieu Desnoyers
  2011-09-01 13:00               ` Mathieu Desnoyers
  2011-09-02  1:08               ` Huang Ying
  1 sibling, 2 replies; 20+ messages in thread
From: Mathieu Desnoyers @ 2011-09-01 12:51 UTC (permalink / raw)
  To: Huang Ying; +Cc: Peter Zijlstra, Andrew Morton, linux-kernel, Paul E. McKenney

* Huang Ying (ying.huang@intel.com) wrote:
> On 09/01/2011 03:58 PM, Peter Zijlstra wrote:
> > On Thu, 2011-09-01 at 11:20 +0800, Huang Ying wrote:
> >> On 09/01/2011 09:46 AM, Huang Ying wrote:
> >>>>> -static void __irq_work_queue(struct irq_work *entry)
> >>>>> +static void __irq_work_queue(struct irq_work *work)
> >>>>>  {
> >>>>> -	struct irq_work *next;
> >>>>> +	struct irq_work_list *irq_work_list;
> >>>>>  
> >>>>> -	preempt_disable();
> >>>>> +	irq_work_list = &get_cpu_var(irq_work_lists);
> >>>>>  
> >>>>> -	do {
> >>>>> -		next = __this_cpu_read(irq_work_list);
> >>>>> -		/* Can assign non-atomic because we keep the flags set. */
> >>>>> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
> >>>>> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
> >>>>> +	llist_add(&work->llnode, &irq_work_list->llist);
> >>>>>  
> >>>>>  	/* The list was empty, raise self-interrupt to start processing. */
> >>>>> -	if (!irq_work_next(entry))
> >>>>> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
> >>>>>  		arch_irq_work_raise();
> >>>>
> >>>> So why can't you simply test work->llnode->next?
> >>>
> >>> Yes.  That is better.  Even if there may be a small race window, it is
> >>> not a big issue to raise one extra self interrupt seldom.
> >>
> >> Remember something about this.  I didn't test work->llnode->next here
> >> because I didn't want expose the implementation details like that here.
> >>  How about make llist_add() return whether list is empty before adding?
> >>  Because it will be an inline function, that should be optimized out if
> >> the caller do not need the information.

Yes, that would make sense.

something like

static inline
struct llist_node *llist_add(struct llist_node *new, struct llist_head *head)
{
	struct llist_node *entry, *old_entry;

#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
	BUG_ON(in_nmi());
#endif

	entry = head->first;
	do {
		old_entry = entry;
		new->next = entry;
		cpu_relax();
	} while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
        return old_entry;
}

Where llist_add returns NULL if the list was initially empty, else
returns non-null. This return value usage should be documented in the
function header, of course.

BUT, big warning here: this whole thing _should_ be renamed as a
"lockless stack" rather than a "lockless list". Really. I said it in the
past, and here I see another example showing why we should. The only
reason why we can get away this easily with knowing atomically if the
structure was empty prior to the insertion is because this "list"
behaves like a stack (LIFO). I foresee we'll need to add "lockless
queues" at some point (FIFO), which has the ability to enqueue/dequeue
concurrently without sharing the same cache-lines when the queue is
non-empty. Within that kind of queue structure, knowing if the queue was
empty prior to insertion would become a bottleneck, so I would not
advise to make that part of _that_ API, which would require to add a new
"llqueue" API. And at that point, the "llist" vs "llqueue" semantic
would become really confusing. Maybe "llstack" would be more appropriate
here instead of llist ? Or llfifo ? The API we can provide with a
lock-less structure does not only depend on how the elements are
organized, but also on the operations allowed on the structure. So the
API should reflect that. So I'm saying this warning again: if we don't
fix that now, I think we're heading into a lock-free structure
namespacing trainwreck that will limit our ability to add other
lock-free operations due vague naming that does not take the operations
allowed on the structure into consideration, combined with API
constraints permitted by a specific given behavior (e.g. FIFO semantic)
that tend to define these lock-free APIs.

I've been working on lock-free structures myself in Userspace RCU: I
have lock-free stack and queue, wait-free stack, wait-free queue, and
(work in progress), RCU red-black tree (not lock-free), and lock-free
RCU expandable hash table. If we plan namespacing of lock-free data
structure correctly, I think there will be room for very interesting
performance and scalability improvements in the future.


> > 
> > You could also use llist_empty() although that brings me to that
> > ACCESS_ONCE thing in there, what's the point?
> 
> Something as follow with llist_empty() seems not work.
> 
>   empty = llist_empty(irq_work_list);
>   llist_add(&work->llnode, irq_work_list);
>   if (empty)
>           arch_irq_work_raise();
> 
> Because irq_work IRQ handler or timer IRQ handler may be executed just
> before "llist_add(&work->llnode, irq_work_list)", so that, although
> "empty == false", arch_irq_work_raise() still should be executed.
> 
> Can you tell me how to that with llist_empty()?
> 
> 
> For ACCESS_ONCE, Mathiew suggest me to add it,
> 
> Hi, Mathiew,
> 
> Can you explain why ACCESS_ONCE should be used here?

It was mainly to force the compiler to re-fetch the head->first value
from memory in busy-waiting loops. So even though the right practice is
to have a cpu_relax() in the body of the loop (which would force the
refetch due to the compiler barrier), having the ACCESS_ONCE in
llist_empty() should not hurt much. It also seems to be what atomic*.h
does (volatile read), so I guess the expected behavior wrt atomic
accesses is to behave as volatile, hence my recommendation to make this
llist_empty a volatile access. Quoting my previous email on that topic:

"Would it make sense to do:

 return ACCESS_ONCE(head->first) == NULL;

instead ? Otherwise the compiler can choose to keep the result around in
registers without re-reading (e.g. busy waiting loop)."

So I was not implying it was strictly required, merely wondering whether
it should be added.

Best regards,

Mathieu

> 
> Best Regards,
> Huang Ying

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01 12:51             ` Mathieu Desnoyers
@ 2011-09-01 13:00               ` Mathieu Desnoyers
  2011-09-02  1:08               ` Huang Ying
  1 sibling, 0 replies; 20+ messages in thread
From: Mathieu Desnoyers @ 2011-09-01 13:00 UTC (permalink / raw)
  To: Huang Ying; +Cc: Peter Zijlstra, Andrew Morton, linux-kernel, Paul E. McKenney

* Mathieu Desnoyers (mathieu.desnoyers@efficios.com) wrote:
> * Huang Ying (ying.huang@intel.com) wrote:
> > On 09/01/2011 03:58 PM, Peter Zijlstra wrote:
> > > On Thu, 2011-09-01 at 11:20 +0800, Huang Ying wrote:
> > >> On 09/01/2011 09:46 AM, Huang Ying wrote:
> > >>>>> -static void __irq_work_queue(struct irq_work *entry)
> > >>>>> +static void __irq_work_queue(struct irq_work *work)
> > >>>>>  {
> > >>>>> -	struct irq_work *next;
> > >>>>> +	struct irq_work_list *irq_work_list;
> > >>>>>  
> > >>>>> -	preempt_disable();
> > >>>>> +	irq_work_list = &get_cpu_var(irq_work_lists);
> > >>>>>  
> > >>>>> -	do {
> > >>>>> -		next = __this_cpu_read(irq_work_list);
> > >>>>> -		/* Can assign non-atomic because we keep the flags set. */
> > >>>>> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
> > >>>>> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
> > >>>>> +	llist_add(&work->llnode, &irq_work_list->llist);
> > >>>>>  
> > >>>>>  	/* The list was empty, raise self-interrupt to start processing. */
> > >>>>> -	if (!irq_work_next(entry))
> > >>>>> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
> > >>>>>  		arch_irq_work_raise();
> > >>>>
> > >>>> So why can't you simply test work->llnode->next?
> > >>>
> > >>> Yes.  That is better.  Even if there may be a small race window, it is
> > >>> not a big issue to raise one extra self interrupt seldom.
> > >>
> > >> Remember something about this.  I didn't test work->llnode->next here
> > >> because I didn't want expose the implementation details like that here.
> > >>  How about make llist_add() return whether list is empty before adding?
> > >>  Because it will be an inline function, that should be optimized out if
> > >> the caller do not need the information.
> 
> Yes, that would make sense.
> 
> something like
> 
> static inline
> struct llist_node *llist_add(struct llist_node *new, struct llist_head *head)
> {
> 	struct llist_node *entry, *old_entry;
> 
> #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
> 	BUG_ON(in_nmi());
> #endif
> 
> 	entry = head->first;
> 	do {
> 		old_entry = entry;
> 		new->next = entry;
> 		cpu_relax();
> 	} while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
>         return old_entry;
> }
> 
> Where llist_add returns NULL if the list was initially empty, else
> returns non-null. This return value usage should be documented in the
> function header, of course.
> 
> BUT, big warning here: this whole thing _should_ be renamed as a
> "lockless stack" rather than a "lockless list". Really. I said it in the
> past, and here I see another example showing why we should. The only
> reason why we can get away this easily with knowing atomically if the
> structure was empty prior to the insertion is because this "list"
> hehaves like a stack (LIFO). I foresee we'll need to add "lockless
> queues" at some point (FIFO), which has the ability to enqueue/dequeue
> concurrently without sharing the same cache-lines when the queue is
> non-empty. Within that kind of queue structure, knowing if the queue was
> empty prior to insertion would become a bottleneck, so I would not
> advise to make that part of _that_ API, which would require to add a new
> "llqueue" API. And at that point, the "llist" vs "llqueue" semantic
> would become really confusing. Maybe "llstack" would be more appropriate
> here instead of llist ? Or llfifo ? The API we can provide with a

I meant lllifo here, sorry. (3 'l's in a row is kind of messy though).

Thanks,

Mathieu

> lock-less structure does not only depend on how the elements are
> organized, but also on the operations allowed on the structure. So the
> API should reflect that. So I'm saying this warning again: if we don't
> fix that now, I think we're heading into a lock-free structure
> namespacing trainwreck that will limit our ability to add other
> lock-free operations due vague naming that does not take the operations
> allowed on the structure into consideration, combined with API
> constraints permitted by a specific given behavior (e.g. FIFO semantic)
> that tend to define these lock-free APIs.
> 
> I've been working on lock-free structures myself in Userspace RCU: I
> have lock-free stack and queue, wait-free stack, wait-free queue, and
> (work in progress), RCU red-black tree (not lock-free), and lock-free
> RCU expandable hash table. If we plan namespacing of lock-free data
> structure correctly, I think there will be room for very interesting
> performance and scalability improvements in the future.
> 
> 
> > > 
> > > You could also use llist_empty() although that brings me to that
> > > ACCESS_ONCE thing in there, what's the point?
> > 
> > Something as follow with llist_empty() seems not work.
> > 
> >   empty = llist_empty(irq_work_list);
> >   llist_add(&work->llnode, irq_work_list);
> >   if (empty)
> >           arch_irq_work_raise();
> > 
> > Because irq_work IRQ handler or timer IRQ handler may be executed just
> > before "llist_add(&work->llnode, irq_work_list)", so that, although
> > "empty == false", arch_irq_work_raise() still should be executed.
> > 
> > Can you tell me how to that with llist_empty()?
> > 
> > 
> > For ACCESS_ONCE, Mathiew suggest me to add it,
> > 
> > Hi, Mathiew,
> > 
> > Can you explain why ACCESS_ONCE should be used here?
> 
> It was mainly to force the compiler to re-fetch the head->first value
> from memory in busy-waiting loops. So even though the right practice is
> to have a cpu_relax() in the body of the loop (which would force the
> refetch due to the compiler barrier), having the ACCESS_ONCE in
> llist_empty() should not hurt much. It also seems to be what atomic*.h
> does (volatile read), so I guess the expected behavior wrt atomic
> accesses are to behave as volatile, hence my recommendation to make this
> llist_empty a volatile access. Quoting my previous email on that topic:
> 
> "Would it make sense to do:
> 
>  return ACCESS_ONCE(head->first) == NULL;
> 
> instead ? Otherwise the compiler can choose to keep the result around in
> registers without re-reading (e.g. busy waiting loop)."
> 
> So I was not implying it was strictly required, merely wondering whether
> it should be added.
> 
> Best regards,
> 
> Mathieu
> 
> > 
> > Best Regards,
> > Huang Ying
> 
> -- 
> Mathieu Desnoyers
> Operating System Efficiency R&D Consultant
> EfficiOS Inc.
> http://www.efficios.com

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com


* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01 12:51             ` Mathieu Desnoyers
  2011-09-01 13:00               ` Mathieu Desnoyers
@ 2011-09-02  1:08               ` Huang Ying
  2011-09-03 16:33                 ` Mathieu Desnoyers
  1 sibling, 1 reply; 20+ messages in thread
From: Huang Ying @ 2011-09-02  1:08 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, Andrew Morton, linux-kernel, Paul E. McKenney

On 09/01/2011 08:51 PM, Mathieu Desnoyers wrote:
> * Huang Ying (ying.huang@intel.com) wrote:
>> On 09/01/2011 03:58 PM, Peter Zijlstra wrote:
>>> On Thu, 2011-09-01 at 11:20 +0800, Huang Ying wrote:
>>>> On 09/01/2011 09:46 AM, Huang Ying wrote:
>>>>>>> -static void __irq_work_queue(struct irq_work *entry)
>>>>>>> +static void __irq_work_queue(struct irq_work *work)
>>>>>>>  {
>>>>>>> -	struct irq_work *next;
>>>>>>> +	struct irq_work_list *irq_work_list;
>>>>>>>  
>>>>>>> -	preempt_disable();
>>>>>>> +	irq_work_list = &get_cpu_var(irq_work_lists);
>>>>>>>  
>>>>>>> -	do {
>>>>>>> -		next = __this_cpu_read(irq_work_list);
>>>>>>> -		/* Can assign non-atomic because we keep the flags set. */
>>>>>>> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
>>>>>>> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
>>>>>>> +	llist_add(&work->llnode, &irq_work_list->llist);
>>>>>>>  
>>>>>>>  	/* The list was empty, raise self-interrupt to start processing. */
>>>>>>> -	if (!irq_work_next(entry))
>>>>>>> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
>>>>>>>  		arch_irq_work_raise();
>>>>>>
>>>>>> So why can't you simply test work->llnode->next?
>>>>>
>>>>> Yes.  That is better.  Even if there may be a small race window, it is
>>>>> not a big issue to raise one extra self interrupt seldom.
>>>>
>>>> I remembered something about this.  I didn't test work->llnode->next
>>>> here because I didn't want to expose implementation details like that.
>>>> How about making llist_add() return whether the list was empty before
>>>> adding?  Because it will be an inline function, that should be
>>>> optimized out if the caller does not need the information.
> 
> Yes, that would make sense.
> 
> something like
> 
> static inline
> struct llist_node *llist_add(struct llist_node *new, struct llist_head *head)
> {
> 	struct llist_node *entry, *old_entry;
> 
> #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
> 	BUG_ON(in_nmi());
> #endif
> 
> 	entry = head->first;
> 	do {
> 		old_entry = entry;
> 		new->next = entry;
> 		cpu_relax();
> 	} while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
>         return old_entry;
> }
> 
> Where llist_add returns NULL if the list was initially empty, else
> returns non-null. This return value usage should be documented in the
> function header, of course.
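
For illustration, the irq_work enqueue path could then consume the
return value directly and drop the separate flag test (a sketch under
the llist_add() semantics proposed above, where a NULL return means the
list was empty before the insertion, and assuming irq_work_list becomes
a plain per-CPU struct llist_head since LIST_NONEMPTY_BIT would no
longer be needed):

	/* Raise the self-interrupt only if we were the first entry. */
	if (!llist_add(&work->llnode, irq_work_list))
		arch_irq_work_raise();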
> 
> BUT, big warning here: this whole thing _should_ be renamed as a
> "lockless stack" rather than a "lockless list". Really. I said it in the
> past, and here I see another example showing why we should. The only
> reason why we can get away this easily with knowing atomically if the
> structure was empty prior to the insertion is because this "list"
> behaves like a stack (LIFO). I foresee we'll need to add "lockless
> queues" at some point (FIFO), which has the ability to enqueue/dequeue
> concurrently without sharing the same cache-lines when the queue is
> non-empty. Within that kind of queue structure, knowing if the queue was
> empty prior to insertion would become a bottleneck, so I would not
> advise to make that part of _that_ API, which would require to add a new
> "llqueue" API. And at that point, the "llist" vs "llqueue" semantic
> would become really confusing. Maybe "llstack" would be more appropriate
> here instead of llist ? Or llfifo ? The API we can provide with a
> lock-less structure does not only depend on how the elements are
> organized, but also on the operations allowed on the structure. So the
> API should reflect that. So I'm saying this warning again: if we don't
> fix that now, I think we're heading into a lock-free structure
> namespacing trainwreck that will limit our ability to add other
> lock-free operations due to vague naming that does not take the operations
> allowed on the structure into consideration, combined with API
> constraints permitted by a specific given behavior (e.g. FIFO semantic)
> that tend to define these lock-free APIs.

I remember the previous consensus between us was to keep the llist API
and possibly change its implementation in the future.  But it seems that
defining a really general llist API is too hard.  Fortunately, because
llist lives only inside the kernel (unlike a user-space library, which is
used by various applications), we can change its name in the future if
needed.

> I've been working on lock-free structures myself in Userspace RCU: I
> have lock-free stack and queue, wait-free stack, wait-free queue, and
> (work in progress), RCU red-black tree (not lock-free), and lock-free
> RCU expandable hash table. If we plan namespacing of lock-free data
> structure correctly, I think there will be room for very interesting
> performance and scalability improvements in the future.

That is interesting.  But we need to find users in the Linux kernel first.

>>>
>>> You could also use llist_empty() although that brings me to that
>>> ACCESS_ONCE thing in there, what's the point?
>>
>> Something like the following with llist_empty() does not seem to work.
>>
>>   empty = llist_empty(irq_work_list);
>>   llist_add(&work->llnode, irq_work_list);
>>   if (empty)
>>           arch_irq_work_raise();
>>
>> Because the irq_work IRQ handler or a timer IRQ handler may run just
>> before "llist_add(&work->llnode, irq_work_list)", arch_irq_work_raise()
>> may still need to be executed even though "empty == false".
>>
>> Can you tell me how to do that with llist_empty()?
>>
>>
>> For ACCESS_ONCE, Mathieu suggested that I add it.
>>
>> Hi, Mathieu,
>>
>> Can you explain why ACCESS_ONCE should be used here?
> 
> It was mainly to force the compiler to re-fetch the head->first value
> from memory in busy-waiting loops. So even though the right practice is
> to have a cpu_relax() in the body of the loop (which would force the
> refetch due to the compiler barrier), having the ACCESS_ONCE in
> llist_empty() should not hurt much. It also seems to be what atomic*.h
> does (volatile read), so I guess the expected behavior wrt atomic
> accesses is to behave as volatile, hence my recommendation to make this
> llist_empty a volatile access. Quoting my previous email on that topic:
> 
> "Would it make sense to do:
> 
>  return ACCESS_ONCE(head->first) == NULL;
> 
> instead ? Otherwise the compiler can choose to keep the result around in
> registers without re-reading (e.g. busy waiting loop)."
> 
> So I was not implying it was strictly required, merely wondering whether
> it should be added.

Thanks for your explanation.  I should have referred to our previous
emails first.  I think your point is reasonable.

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01  9:57             ` Peter Zijlstra
@ 2011-09-02  1:14               ` Huang Ying
  2011-09-03 17:35                 ` Mathieu Desnoyers
  0 siblings, 1 reply; 20+ messages in thread
From: Huang Ying @ 2011-09-02  1:14 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel, Mathieu Desnoyers

On 09/01/2011 05:57 PM, Peter Zijlstra wrote:
> On Thu, 2011-09-01 at 16:56 +0800, Huang Ying wrote:
>> Something like the following with llist_empty() does not seem to work.
>>
>>   empty = llist_empty(irq_work_list);
>>   llist_add(&work->llnode, irq_work_list);
>>   if (empty)
>>           arch_irq_work_raise();
>>
>> Because the irq_work IRQ handler or a timer IRQ handler may run just
>> before "llist_add(&work->llnode, irq_work_list)", arch_irq_work_raise()
>> may still need to be executed even though "empty == false".
> 
> Right, I was thinking:
> 
> 	llist_add(&work->llist, irq_work_list);
> 	if (llist_empty(&work->llist))
> 		arch_irq_work_raise();
> 
> And then ran into the difference between llist_node and llist_head. Now
> we could sort that by introducing llist_next() and write it like:
> 
> 	if (!llist_next(&work->list))
> 		arch_irq_work_raise();
> 

This reveals some implementation details of llist.  But making
llist_add() return whether the list was empty before adding, as Mathieu
pointed out, reveals some implementation details too.  So I think
something like this, or just checking work->llnode->next, should be
acceptable.
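
For illustration, such a llist_next() accessor would be trivial (a
sketch of what Peter suggests, not an existing helper at this point):

static inline struct llist_node *llist_next(struct llist_node *node)
{
	return node->next;
}

and the enqueue path would then read:

	llist_add(&work->llnode, irq_work_list);
	/* nobody behind us on the list == it was empty before the add */
	if (!llist_next(&work->llnode))
		arch_irq_work_raise();

(an occasional spurious raise due to the race discussed above would be
harmless).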

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-01 10:00           ` Peter Zijlstra
@ 2011-09-02  1:18             ` Huang Ying
  2011-09-02 13:26               ` Peter Zijlstra
  0 siblings, 1 reply; 20+ messages in thread
From: Huang Ying @ 2011-09-02  1:18 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andrew Morton, linux-kernel

On 09/01/2011 06:00 PM, Peter Zijlstra wrote:
> On Thu, 2011-09-01 at 16:44 +0800, Huang Ying wrote:
> 
>> Because llist is a library, it may be used both in highly contended
>> cases and under light/uncontended loads.  So maybe the code above is the
>> best choice for llist.
> 
> Well the thing is, if you're heavily contended you should probably be
> doing something else..

So which solution is preferable?

1) no cpu_relax
2) cpu_relax after first cmpxchg

Personally, I prefer 2).  It should have acceptable overhead under
light/uncontended loads.  Do you agree?
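
For reference, option 2) would make the add loop look roughly like this
(a sketch only; the first attempt pays no cpu_relax() penalty, and only
retries back off):

	entry = head->first;
	for (;;) {
		old_entry = entry;
		new->next = entry;
		entry = cmpxchg(&head->first, old_entry, new);
		if (entry == old_entry)
			break;
		/* contended: another CPU moved head->first, back off */
		cpu_relax();
	}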

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-02  1:18             ` Huang Ying
@ 2011-09-02 13:26               ` Peter Zijlstra
  0 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2011-09-02 13:26 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andrew Morton, linux-kernel

On Fri, 2011-09-02 at 09:18 +0800, Huang Ying wrote:
> On 09/01/2011 06:00 PM, Peter Zijlstra wrote:
> > On Thu, 2011-09-01 at 16:44 +0800, Huang Ying wrote:
> > 
> >> Because llist is a library, it may be used both in highly contended
> >> cases and under light/uncontended loads.  So maybe the code above is the
> >> best choice for llist.
> > 
> > Well the thing is, if you're heavily contended you should probably be
> > doing something else..
> 
> So which solution is preferable?
> 
> 1) no cpu_relax
> 2) cpu_relax after first cmpxchg
> 
> Personally, I prefer 2).  It should have acceptable overhead under
> light/uncontended loads.  Do you agree?

I guess so; we can always poke at it again when someone finds something
to measure.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-02  1:08               ` Huang Ying
@ 2011-09-03 16:33                 ` Mathieu Desnoyers
  0 siblings, 0 replies; 20+ messages in thread
From: Mathieu Desnoyers @ 2011-09-03 16:33 UTC (permalink / raw)
  To: Huang Ying; +Cc: Peter Zijlstra, Andrew Morton, linux-kernel, Paul E. McKenney

* Huang Ying (ying.huang@intel.com) wrote:
> On 09/01/2011 08:51 PM, Mathieu Desnoyers wrote:
> > * Huang Ying (ying.huang@intel.com) wrote:
> >> On 09/01/2011 03:58 PM, Peter Zijlstra wrote:
> >>> On Thu, 2011-09-01 at 11:20 +0800, Huang Ying wrote:
> >>>> On 09/01/2011 09:46 AM, Huang Ying wrote:
> >>>>>>> -static void __irq_work_queue(struct irq_work *entry)
> >>>>>>> +static void __irq_work_queue(struct irq_work *work)
> >>>>>>>  {
> >>>>>>> -	struct irq_work *next;
> >>>>>>> +	struct irq_work_list *irq_work_list;
> >>>>>>>  
> >>>>>>> -	preempt_disable();
> >>>>>>> +	irq_work_list = &get_cpu_var(irq_work_lists);
> >>>>>>>  
> >>>>>>> -	do {
> >>>>>>> -		next = __this_cpu_read(irq_work_list);
> >>>>>>> -		/* Can assign non-atomic because we keep the flags set. */
> >>>>>>> -		entry->next = next_flags(next, IRQ_WORK_FLAGS);
> >>>>>>> -	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
> >>>>>>> +	llist_add(&work->llnode, &irq_work_list->llist);
> >>>>>>>  
> >>>>>>>  	/* The list was empty, raise self-interrupt to start processing. */
> >>>>>>> -	if (!irq_work_next(entry))
> >>>>>>> +	if (!test_and_set_bit(LIST_NONEMPTY_BIT, &irq_work_list->flags))
> >>>>>>>  		arch_irq_work_raise();
> >>>>>>
> >>>>>> So why can't you simply test work->llnode->next?
> >>>>>
> >>>>> Yes.  That is better.  Even if there may be a small race window, it is
> >>>>> not a big issue to raise one extra self-interrupt occasionally.
> >>>>
> >>>> I remembered something about this.  I didn't test work->llnode->next
> >>>> here because I didn't want to expose implementation details like that.
> >>>> How about making llist_add() return whether the list was empty before
> >>>> adding?  Because it will be an inline function, that should be
> >>>> optimized out if the caller does not need the information.
> > 
> > Yes, that would make sense.
> > 
> > something like
> > 
> > static inline
> > struct llist_node *llist_add(struct llist_node *new, struct llist_head *head)
> > {
> > 	struct llist_node *entry, *old_entry;
> > 
> > #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
> > 	BUG_ON(in_nmi());
> > #endif
> > 
> > 	entry = head->first;
> > 	do {
> > 		old_entry = entry;
> > 		new->next = entry;
> > 		cpu_relax();
> > 	} while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
> >         return old_entry;
> > }
> > 
> > Where llist_add returns NULL if the list was initially empty, else
> > returns non-null. This return value usage should be documented in the
> > function header, of course.
> > 
> > BUT, big warning here: this whole thing _should_ be renamed as a
> > "lockless stack" rather than a "lockless list". Really. I said it in the
> > past, and here I see another example showing why we should. The only
> > reason why we can get away this easily with knowing atomically if the
> > structure was empty prior to the insertion is because this "list"
> > behaves like a stack (LIFO). I foresee we'll need to add "lockless
> > queues" at some point (FIFO), which has the ability to enqueue/dequeue
> > concurrently without sharing the same cache-lines when the queue is
> > non-empty. Within that kind of queue structure, knowing if the queue was
> > empty prior to insertion would become a bottleneck, so I would not
> > advise to make that part of _that_ API, which would require to add a new
> > "llqueue" API. And at that point, the "llist" vs "llqueue" semantic
> > would become really confusing. Maybe "llstack" would be more appropriate
> > here instead of llist ? Or llfifo ? The API we can provide with a
> > lock-less structure does not only depend on how the elements are
> > organized, but also on the operations allowed on the structure. So the
> > API should reflect that. So I'm saying this warning again: if we don't
> > fix that now, I think we're heading into a lock-free structure
> > namespacing trainwreck that will limit our ability to add other
> > lock-free operations due to vague naming that does not take the operations
> > allowed on the structure into consideration, combined with API
> > constraints permitted by a specific given behavior (e.g. FIFO semantic)
> > that tend to define these lock-free APIs.
> 
> I remember the previous consensus between us was to keep the llist API
> and possibly change its implementation in the future.

Previously, the consensus was to keep the "llist" name until we could
gather evidence that this name is inappropriate. The feature request
from Peter Zijlstra outlines very well the need for an llstack API that
allows the push operation to return whether the stack was empty or not
before the operation.
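
To make the request concrete, the renamed push operation could look
like this (a purely hypothetical sketch -- llstack_push, llstack_node
and llstack_head are illustrative names, not an existing interface):

struct llstack_node {
	struct llstack_node *next;
};

struct llstack_head {
	struct llstack_node *first;
};

/* Push onto the stack; returns the old top, i.e. NULL if it was empty. */
static inline struct llstack_node *llstack_push(struct llstack_node *new,
						struct llstack_head *head)
{
	struct llstack_node *old, *prev;

	old = head->first;
	for (;;) {
		new->next = old;
		prev = cmpxchg(&head->first, old, new);
		if (prev == old)
			return old;
		old = prev;
		cpu_relax();
	}
}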

> But it seems that defining a really general llist API is too hard.

As I've been saying over and over: lock-free API naming needs to take
into account the operations allowed on the structure, so "list" really
isn't precise enough.

> Fortunately, because
> llist lives only inside the kernel (unlike a user-space library, which is
> used by various applications), we can change its name in the future if
> needed.

With the feature request we have here, it appears the time to change it
has already come.

> 
> > I've been working on lock-free structures myself in Userspace RCU: I
> > have lock-free stack and queue, wait-free stack, wait-free queue, and
> > (work in progress), RCU red-black tree (not lock-free), and lock-free
> > RCU expandable hash table. If we plan namespacing of lock-free data
> > structure correctly, I think there will be room for very interesting
> > performance and scalability improvements in the future.
> 
> That is interesting.  But we need to find users in the Linux kernel first.

Sure. That will happen in due time. :)

> 
> >>>
> >>> You could also use llist_empty() although that brings me to that
> >>> ACCESS_ONCE thing in there, what's the point?
> >>
> >> Something like the following with llist_empty() does not seem to work.
> >>
> >>   empty = llist_empty(irq_work_list);
> >>   llist_add(&work->llnode, irq_work_list);
> >>   if (empty)
> >>           arch_irq_work_raise();
> >>
> >> Because the irq_work IRQ handler or a timer IRQ handler may run just
> >> before "llist_add(&work->llnode, irq_work_list)", arch_irq_work_raise()
> >> may still need to be executed even though "empty == false".
> >>
> >> Can you tell me how to do that with llist_empty()?
> >>
> >>
> >> For ACCESS_ONCE, Mathieu suggested that I add it.
> >>
> >> Hi, Mathieu,
> >>
> >> Can you explain why ACCESS_ONCE should be used here?
> > 
> > It was mainly to force the compiler to re-fetch the head->first value
> > from memory in busy-waiting loops. So even though the right practice is
> > to have a cpu_relax() in the body of the loop (which would force the
> > refetch due to the compiler barrier), having the ACCESS_ONCE in
> > llist_empty() should not hurt much. It also seems to be what atomic*.h
> > does (volatile read), so I guess the expected behavior wrt atomic
> > accesses is to behave as volatile, hence my recommendation to make this
> > llist_empty a volatile access. Quoting my previous email on that topic:
> > 
> > "Would it make sense to do:
> > 
> >  return ACCESS_ONCE(head->first) == NULL;
> > 
> > instead ? Otherwise the compiler can choose to keep the result around in
> > registers without re-reading (e.g. busy waiting loop)."
> > 
> > So I was not implying it was strictly required, merely wondering whether
> > it should be added.
> 
> Thanks for your explanation.  I should have referred to our previous
> emails first.  I think your point is reasonable.

Thanks,

Mathieu

> 
> Best Regards,
> Huang Ying

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH -mm 1/2] irq_work, Use llist in irq_work
  2011-09-02  1:14               ` Huang Ying
@ 2011-09-03 17:35                 ` Mathieu Desnoyers
  0 siblings, 0 replies; 20+ messages in thread
From: Mathieu Desnoyers @ 2011-09-03 17:35 UTC (permalink / raw)
  To: Huang Ying; +Cc: Peter Zijlstra, Andrew Morton, linux-kernel

* Huang Ying (ying.huang@intel.com) wrote:
> On 09/01/2011 05:57 PM, Peter Zijlstra wrote:
> > On Thu, 2011-09-01 at 16:56 +0800, Huang Ying wrote:
> >> Something like the following with llist_empty() does not seem to work.
> >>
> >>   empty = llist_empty(irq_work_list);
> >>   llist_add(&work->llnode, irq_work_list);
> >>   if (empty)
> >>           arch_irq_work_raise();
> >>
> >> Because the irq_work IRQ handler or a timer IRQ handler may run just
> >> before "llist_add(&work->llnode, irq_work_list)", arch_irq_work_raise()
> >> may still need to be executed even though "empty == false".
> > 
> > Right, I was thinking:
> > 
> > 	llist_add(&work->llnode, irq_work_list);
> > 	if (llist_empty(&work->llnode))
> > 		arch_irq_work_raise();
> > 
> > And then ran into the difference between llist_node and llist_head. Now
> > we could sort that by introducing llist_next() and write it like:
> > 
> > 	if (!llist_next(&work->llnode))
> > 		arch_irq_work_raise();
> > 
> 
> This reveals some implementation details of llist.  But making
> llist_add() return whether the list was empty before adding, as Mathieu
> pointed out, reveals some implementation details too.  So I think
> something like this, or just checking work->llnode->next, should be
> acceptable.

No. These solutions all have a relatively high level of ugliness and
expose too much of the structure's internals.

I'll submit a patch to change the API from llist to llstack shortly for
comments.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2011-09-03 17:35 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-30  5:16 [PATCH -mm 0/2] Use llist in irq_work and xlist Huang Ying
2011-08-30  5:16 ` [PATCH -mm 1/2] irq_work, Use llist in irq_work Huang Ying
2011-08-31 10:10   ` Peter Zijlstra
2011-09-01  1:46     ` Huang Ying
2011-09-01  3:20       ` Huang Ying
2011-09-01  7:58         ` Peter Zijlstra
2011-09-01  8:56           ` Huang Ying
2011-09-01  9:57             ` Peter Zijlstra
2011-09-02  1:14               ` Huang Ying
2011-09-03 17:35                 ` Mathieu Desnoyers
2011-09-01 12:51             ` Mathieu Desnoyers
2011-09-01 13:00               ` Mathieu Desnoyers
2011-09-02  1:08               ` Huang Ying
2011-09-03 16:33                 ` Mathieu Desnoyers
2011-09-01  7:57       ` Peter Zijlstra
2011-09-01  8:44         ` Huang Ying
2011-09-01 10:00           ` Peter Zijlstra
2011-09-02  1:18             ` Huang Ying
2011-09-02 13:26               ` Peter Zijlstra
2011-08-30  5:16 ` [PATCH -mm 2/2] net, rds, Replace xlist in net/rds/xlist.h with llist Huang Ying
