From: "NeilBrown" <neilb@suse.de>
To: "Dave Chinner" <david@fromorbit.com>
Cc: "Trond Myklebust" <trondmy@hammerspace.com>,
	"bfields@fieldses.org" <bfields@fieldses.org>,
	"linux-nfs@vger.kernel.org" <linux-nfs@vger.kernel.org>,
	"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>,
	"chuck.lever@oracle.com" <chuck.lever@oracle.com>
Subject: Re: sporadic hangs on generic/186
Date: Tue, 12 Apr 2022 13:13:57 +1000	[thread overview]
Message-ID: <164973323782.10985.11874897668084187412@noble.neil.brown.name> (raw)
In-Reply-To: <20220410233415.GG1609613@dread.disaster.area>


On Mon, 11 Apr 2022, Dave Chinner wrote:
 
> Yup, and you've missed my point entirely,

Hmm... It seems that I did. Sorry.

Your point is that the flags passed to kmalloc et al should be relevant,
and that we should be focusing on process context - including for
access to reserves in memalloc context (using memalloc_noreclaim_save()
as appropriate).  I agree.

My point is that memalloc_noreclaim_save() isn't necessarily sufficient.
It provides access to a shared pool of reserves, which is hard to reason
about.  It *might* provide a guarantee that every allocation will
succeed without deadlock - but only if there is a reasonable limit on
the number of parallel allocation chains - and we don't have any
mechanism for tracking those.

So there is still a place for mempools - they complement __GFP_MEMALLOC
reserves and the two can be used together.  Mempools are particularly
appropriate when the allocation commonly (or always) occurs in
PF_MEMALLOC context, as using the mempool provides a better guarantee,
and removes a burden from the shared pool.

This suggests to me that a different interface to mempools could be
useful - one that uses the mempool preallocation instead of the shared
preallocation:
If PF_MEMALLOC or __GFP_MEMALLOC is set, then the mempool is used,
which avoids drawing on the shared reserves.
Otherwise (or if __GFP_NOMEMALLOC is set), follow the normal allocation
path and don't use the mempool reserves.

Something like the following.

Do you agree?

Thanks,
NeilBrown

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 0c964ac107c2..4414644c49d5 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -46,6 +46,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
 extern void mempool_destroy(mempool_t *pool);
 extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
+extern void *mempool_memalloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
 extern void mempool_free(void *element, mempool_t *pool);
 
 /*
diff --git a/mm/mempool.c b/mm/mempool.c
index b933d0fc21b8..1182bd3443cc 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -417,8 +417,12 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 		goto repeat_alloc;
 	}
 
-	/* We must not sleep if !__GFP_DIRECT_RECLAIM */
-	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
+	/*
+	 * We must not sleep if !__GFP_DIRECT_RECLAIM
+	 * and must not retry if __GFP_NORETRY is set
+	 */
+	if (!(gfp_mask & __GFP_DIRECT_RECLAIM) ||
+	    (gfp_mask & __GFP_NORETRY)) {
 		spin_unlock_irqrestore(&pool->lock, flags);
 		return NULL;
 	}
@@ -440,6 +444,30 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(mempool_alloc);
 
+/**
+ * mempool_memalloc - allocate, using mempool rather than shared reserves
+ * @pool:	mempool to allocate from
+ * @gfp_mask:	GFP allocation flags
+ *
+ *
+ * If the process context or gfp flags permit allocation from reserves
+ * (i.e. PF_MEMALLOC or GFP_MEMALLOC), then use the mempool for allocation,
+ * otherwise allocate directly using the underlying allocator.
+ *
+ * Return: pointer to allocated element, or %NULL on failure.
+ */
+
+void *mempool_memalloc(mempool_t *pool, gfp_t gfp_mask)
+{
+	if (gfp_mask & __GFP_NOMEMALLOC ||
+	    (!(current->flags & PF_MEMALLOC) &&
+	     !(gfp_mask & __GFP_MEMALLOC)))
+		/* No reserves requested */
+		return pool->alloc(gfp_mask, pool->pool_data);
+	else
+		return mempool_alloc(pool, gfp_mask);
+}
+
 /**
  * mempool_free - return an element to the pool.
  * @element:   pool element pointer.
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 682fcd24bf43..2ecbe6b89fbb 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -615,8 +615,6 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
 	};
 	struct rpc_cred *ret;
 
-	if (RPC_IS_ASYNC(task))
-		lookupflags |= RPCAUTH_LOOKUP_ASYNC;
 	ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags);
 	put_cred(acred.cred);
 	return ret;
@@ -633,8 +631,6 @@ rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags)
 
 	if (!acred.principal)
 		return NULL;
-	if (RPC_IS_ASYNC(task))
-		lookupflags |= RPCAUTH_LOOKUP_ASYNC;
 	return auth->au_ops->lookup_cred(auth, &acred, lookupflags);
 }
 
@@ -658,7 +654,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags)
 	};
 
 	if (flags & RPC_TASK_ASYNC)
-		lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC;
+		lookupflags |= RPCAUTH_LOOKUP_NEW;
 	if (task->tk_op_cred)
 		/* Task must use exactly this rpc_cred */
 		new = get_rpccred(task->tk_op_cred);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index de7e5b41ab8f..4f68934dbeb5 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1343,11 +1343,7 @@ gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
 static struct rpc_cred *
 gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	gfp_t gfp = GFP_KERNEL;
-
-	if (flags & RPCAUTH_LOOKUP_ASYNC)
-		gfp = GFP_NOWAIT | __GFP_NOWARN;
-	return rpcauth_lookup_credcache(auth, acred, flags, gfp);
+	return rpcauth_lookup_credcache(auth, acred, flags, GFP_KERNEL);
 }
 
 static struct rpc_cred *
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 1e091d3fa607..6170d4d34687 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -45,14 +45,10 @@ static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth,
 {
 	struct rpc_cred *ret;
 
-	ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask());
-	if (!ret) {
-		if (!(flags & RPCAUTH_LOOKUP_ASYNC))
-			return ERR_PTR(-ENOMEM);
-		ret = mempool_alloc(unix_pool, GFP_NOWAIT);
-		if (!ret)
-			return ERR_PTR(-ENOMEM);
-	}
+	ret = mempool_memalloc(unix_pool, rpc_task_gfp_mask());
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
 	rpcauth_init_cred(ret, acred, auth, &unix_credops);
 	ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
 	return ret;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 7f70c1e608b7..4138aa62d3f3 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1040,12 +1040,9 @@ int rpc_malloc(struct rpc_task *task)
 	gfp_t gfp = rpc_task_gfp_mask();
 
 	size += sizeof(struct rpc_buffer);
-	if (size <= RPC_BUFFER_MAXSIZE) {
-		buf = kmem_cache_alloc(rpc_buffer_slabp, gfp);
-		/* Reach for the mempool if dynamic allocation fails */
-		if (!buf && RPC_IS_ASYNC(task))
-			buf = mempool_alloc(rpc_buffer_mempool, GFP_NOWAIT);
-	} else
+	if (size <= RPC_BUFFER_MAXSIZE)
+		buf = mempool_memalloc(rpc_buffer_mempool, gfp);
+	else
 		buf = kmalloc(size, gfp);
 
 	if (!buf)
@@ -1110,12 +1107,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 
 static struct rpc_task *rpc_alloc_task(void)
 {
-	struct rpc_task *task;
-
-	task = kmem_cache_alloc(rpc_task_slabp, rpc_task_gfp_mask());
-	if (task)
-		return task;
-	return mempool_alloc(rpc_task_mempool, GFP_NOWAIT);
+	return mempool_memalloc(rpc_task_mempool, rpc_task_gfp_mask());
 }
 
 /*

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 3e6ce288a7fc..98da816b5fc2 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -99,7 +99,6 @@ struct rpc_auth_create_args {
 
 /* Flags for rpcauth_lookupcred() */
 #define RPCAUTH_LOOKUP_NEW		0x01	/* Accept an uninitialised cred */
-#define RPCAUTH_LOOKUP_ASYNC		0x02	/* Don't block waiting for memory */
 
 /*
  * Client authentication ops

Thread overview: 15+ messages
2022-04-06 19:54 sporadic hangs on generic/186 J. Bruce Fields
2022-04-06 19:58 ` Chuck Lever III
2022-04-07  0:14 ` Dave Chinner
2022-04-07  0:27   ` NeilBrown
2022-04-07  1:19     ` NeilBrown
2022-04-07  1:49       ` J. Bruce Fields
2022-04-07  4:23         ` NeilBrown
2022-04-07  1:54       ` Trond Myklebust
2022-04-07  4:11         ` NeilBrown
2022-04-07 13:01           ` Trond Myklebust
2022-04-08  2:46             ` NeilBrown
2022-04-08  5:03               ` Dave Chinner
2022-04-08  5:32                 ` NeilBrown
2022-04-10 23:34                   ` Dave Chinner
2022-04-12  3:13                     ` NeilBrown [this message]
