linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [POC] recoverable fault injection
@ 2012-11-21 23:44 Johannes Berg
  2012-11-22 11:40 ` Akinobu Mita
  0 siblings, 1 reply; 5+ messages in thread
From: Johannes Berg @ 2012-11-21 23:44 UTC (permalink / raw)
  To: Akinobu Mita; +Cc: linux-kernel, linux-wireless

This idea has been floating around in my head for a long time now ... 

I was thinking: what if we could do fault injection during regular
testing, at least on those code paths that are not supposed to have side
effects when they fail? Now obviously this isn't all code paths, and
many probably erroneously *do* have side effects even if they're not
supposed to, but it does apply to a number of code paths.

So I decided to play with this, and the result it the patch below. It
adds a new knob "recoverable-only" to the slab and page_alloc fault
attributes. If enabled, then a single fault can be injected if the task
executing it is in a "recoverable section", this is implemented by some
new fields in struct task_struct and the (very ugly!) macro
FAULT_INJECT_CALL_RECOVERABLE_FUNCTION.

Now obviously this isn't polished at all, etc. However, it seems to work
for the single function (a wireless API call) that I annotated, it has
various memory allocations in the call chain, and when any one of them
fails it doesn't seem to leak memory or crash etc. which is the point of
the testing.

Does it make any sense at all to invest more work on this and try to put
it into mainline? Maybe somebody tried something like this before and
decided it wasn't worth it, or maybe it didn't really work at all? Is
the task_struct usage completely broken? Any other thoughts?

johannes


diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index c6f996f..1820a4b 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -7,6 +7,12 @@
 #include <linux/debugfs.h>
 #include <linux/atomic.h>
 
+enum fault_attr_id {
+	FAULT_ATTR_UNKNOWN,
+	FAULT_ATTR_SLAB,
+	FAULT_ATTR_PAGE_ALLOC,
+};
+
 /*
  * For explanation of the elements of this struct, see
  * Documentation/fault-injection/fault-injection.txt
@@ -18,6 +24,7 @@ struct fault_attr {
 	atomic_t space;
 	unsigned long verbose;
 	u32 task_filter;
+	u32 recoverable_only;
 	unsigned long stacktrace_depth;
 	unsigned long require_start;
 	unsigned long require_end;
@@ -25,16 +32,20 @@ struct fault_attr {
 	unsigned long reject_end;
 
 	unsigned long count;
+	enum fault_attr_id id;
 };
 
-#define FAULT_ATTR_INITIALIZER {				\
+#define FAULT_ATTR_ID_INITIALIZER(_id) {			\
 		.interval = 1,					\
 		.times = ATOMIC_INIT(1),			\
 		.require_end = ULONG_MAX,			\
 		.stacktrace_depth = 32,				\
 		.verbose = 2,					\
+		.id = FAULT_ATTR_ ## _id,			\
 	}
 
+#define FAULT_ATTR_INITIALIZER FAULT_ATTR_ID_INITIALIZER(UNKNOWN)
+
 #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER
 int setup_fault_attr(struct fault_attr *attr, char *str);
 bool should_fail(struct fault_attr *attr, ssize_t size);
@@ -54,6 +65,23 @@ static inline struct dentry *fault_create_debugfs_attr(const char *name,
 
 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 
+#define FAULT_INJECT_CALL_RECOVERABLE_FUNCTION(ids, fn, args...)\
+	({							\
+		int ___fi_ret;					\
+		current->failed_need_recovery = false;		\
+		current->fail_recoverable = (ids);		\
+		___fi_ret = fn(args);				\
+		current->fail_recoverable = 0;			\
+		if (current->failed_need_recovery && ___fi_ret)	\
+			___fi_ret = fn(args);			\
+		___fi_ret;					\
+	});
+
+#else /* CONFIG_FAULT_INJECTION */
+
+#define FAULT_INJECT_CALL_RECOVERABLE_FUNCTION(ids, fn, args...)\
+	fn(args);
+
 #endif /* CONFIG_FAULT_INJECTION */
 
 #ifdef CONFIG_FAILSLAB
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a0..6ee0069 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1492,7 +1492,9 @@ struct task_struct {
 	struct task_delay_info *delays;
 #endif
 #ifdef CONFIG_FAULT_INJECTION
-	int make_it_fail;
+	u8 fail_recoverable;
+	bool make_it_fail;
+	bool failed_need_recovery;
 #endif
 	/*
 	 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index f7210ad..63cde14 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -108,6 +108,10 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
 	if (attr->task_filter && !fail_task(attr, current))
 		return false;
 
+	if (attr->recoverable_only &&
+	    (in_interrupt() || !(current->fail_recoverable & BIT(attr->id))))
+			return false;
+
 	if (atomic_read(&attr->times) == 0)
 		return false;
 
@@ -130,6 +134,11 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
 
 	fail_dump(attr);
 
+	if (attr->recoverable_only) {
+		current->fail_recoverable = 0;
+		current->failed_need_recovery = true;
+	}
+
 	if (atomic_read(&attr->times) != -1)
 		atomic_dec_not_zero(&attr->times);
 
@@ -225,6 +234,10 @@ struct dentry *fault_create_debugfs_attr(const char *name,
 		goto fail;
 	if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter))
 		goto fail;
+	if (attr->id != FAULT_ATTR_UNKNOWN &&
+	    !debugfs_create_bool("recoverable-only", mode, dir,
+				 &attr->recoverable_only))
+		goto fail;
 
 #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
 
diff --git a/mm/failslab.c b/mm/failslab.c
index fefaaba..9cdfe2f 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -6,7 +6,7 @@ static struct {
 	u32 ignore_gfp_wait;
 	int cache_filter;
 } failslab = {
-	.attr = FAULT_ATTR_INITIALIZER,
+	.attr = FAULT_ATTR_ID_INITIALIZER(SLAB),
 	.ignore_gfp_wait = 1,
 	.cache_filter = 0,
 };
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb90971..f3568fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1536,7 +1536,7 @@ static struct {
 	u32 ignore_gfp_wait;
 	u32 min_order;
 } fail_page_alloc = {
-	.attr = FAULT_ATTR_INITIALIZER,
+	.attr = FAULT_ATTR_ID_INITIALIZER(PAGE_ALLOC),
 	.ignore_gfp_wait = 1,
 	.ignore_gfp_highmem = 1,
 	.min_order = 1,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f0aaf5c..d74c604 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -15,6 +15,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/netlink.h>
 #include <linux/etherdevice.h>
+#include <linux/fault-inject.h>
 #include <net/net_namespace.h>
 #include <net/genetlink.h>
 #include <net/cfg80211.h>
@@ -6129,8 +6130,8 @@ static int nl80211_tdls_oper(struct sk_buff *skb, struct genl_info *info)
 	return rdev_tdls_oper(rdev, dev, peer, operation);
 }
 
-static int nl80211_remain_on_channel(struct sk_buff *skb,
-				     struct genl_info *info)
+static int _nl80211_remain_on_channel(struct sk_buff *skb,
+				      struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct wireless_dev *wdev = info->user_ptr[1];
@@ -6195,6 +6196,14 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 	return err;
 }
 
+static int nl80211_remain_on_channel(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	return FAULT_INJECT_CALL_RECOVERABLE_FUNCTION(
+		BIT(FAULT_ATTR_SLAB) | BIT(FAULT_ATTR_PAGE_ALLOC),
+		_nl80211_remain_on_channel, skb, info);
+}
+
 static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
 					    struct genl_info *info)
 {



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [POC] recoverable fault injection
  2012-11-21 23:44 [POC] recoverable fault injection Johannes Berg
@ 2012-11-22 11:40 ` Akinobu Mita
  2012-11-22 11:52   ` Johannes Berg
  0 siblings, 1 reply; 5+ messages in thread
From: Akinobu Mita @ 2012-11-22 11:40 UTC (permalink / raw)
  To: Johannes Berg; +Cc: linux-kernel, linux-wireless

2012/11/22 Johannes Berg <johannes@sipsolutions.net>:
> This idea has been floating around in my head for a long time now ...
>
> I was thinking: what if we could do fault injection during regular
> testing, at least on those code paths that are not supposed to have side
> effects when they fail? Now obviously this isn't all code paths, and
> many probably erroneously *do* have side effects even if they're not
> supposed to, but it does apply to a number of code paths.

It sounds interesting. I have never thought of this idea.

> So I decided to play with this, and the result it the patch below. It
> adds a new knob "recoverable-only" to the slab and page_alloc fault
> attributes. If enabled, then a single fault can be injected if the task
> executing it is in a "recoverable section", this is implemented by some
> new fields in struct task_struct and the (very ugly!) macro
> FAULT_INJECT_CALL_RECOVERABLE_FUNCTION.

I suggest introducing a pair of function like:

void fault_recoverable_enable(unsigned long fault_ids);
void fault_recoverable_disable();

fault_recoverable_enable() sets current task's recoverable state to the
value specified with the argument.  fault_recoverable_disable() resets
the recoverable state.

I think this can be more readable than FAULT_INJECT_CALL_RECOVERABLE_FUNCTION
macro.  In case of nl80211_remain_on_channel(), you can just put
them in exit and entrance:

fault_recoverable_enable(FAULT_ATTR_SLAB | FAULT_ATTR_PAGE_ALLOC);
...
fault_recoverable_disable();

return err;

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [POC] recoverable fault injection
  2012-11-22 11:40 ` Akinobu Mita
@ 2012-11-22 11:52   ` Johannes Berg
  2012-11-22 13:29     ` Akinobu Mita
  0 siblings, 1 reply; 5+ messages in thread
From: Johannes Berg @ 2012-11-22 11:52 UTC (permalink / raw)
  To: Akinobu Mita; +Cc: linux-kernel, linux-wireless

On Thu, 2012-11-22 at 20:40 +0900, Akinobu Mita wrote:

> > I was thinking: what if we could do fault injection during regular
> > testing, at least on those code paths that are not supposed to have side
> > effects when they fail? Now obviously this isn't all code paths, and
> > many probably erroneously *do* have side effects even if they're not
> > supposed to, but it does apply to a number of code paths.
> 
> It sounds interesting. I have never thought of this idea.

:-)
It also occurred to me that if the function is defined to not have side
effects when failing, this actually also lets you test that in a way.
Anyway, it's really just an idea at this point.

> > So I decided to play with this, and the result it the patch below. It
> > adds a new knob "recoverable-only" to the slab and page_alloc fault
> > attributes. If enabled, then a single fault can be injected if the task
> > executing it is in a "recoverable section", this is implemented by some
> > new fields in struct task_struct and the (very ugly!) macro
> > FAULT_INJECT_CALL_RECOVERABLE_FUNCTION.
> 
> I suggest introducing a pair of function like:
> 
> void fault_recoverable_enable(unsigned long fault_ids);
> void fault_recoverable_disable();
> [...]

I thought about something like that, I actually initially played with
macros like this:

#define FAULT_RECOVERABLE_START(ids)	\
	/* set up the task state */	\
 fault_recovery_retry:

#define FAULT_RECOVERABLE_END(ids)	\
	if (current->encountered_fault)	\
		goto fault_recovery_retry;

or so. However, the problem is that if you exit the function between
these points, and this is true for your functions as well, you leave the
task's fault injection enabled which isn't what you want. So adding
functions or macros like this didn't really seem right. Also, functions
(rather than macros) have the problem that the retry can't be
encapsulated -- note how my macro calls the function again if it failed.
So with functions like that, you introduce new manually-coded error and
retry paths, that seemed undesirable.

As you can see in my macro, it's also possible for an allocation to fail
but the function to succeed, so the function that is called must have a
return value indicating success or failure. I ran into this with debug
objects, their allocation failed all the time but obviously the function
succeeded as debug objects fail gracefully if they can't allocate
memory.

Now, I'm not saying I'm happy with this, but I haven't found a better
solution yet, but I'll admit that I haven't thought about it for long.
If this was python I'd add a decorator to the function ;-)

Oh another thing I realized: when a fault is injected, I currently set
current->fail_recoverable = 0, but I could just unset the failed bit
instead which would allow multiple different failures. Not sure which is
better though.

Thanks for looking at this :-)

johannes


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [POC] recoverable fault injection
  2012-11-22 11:52   ` Johannes Berg
@ 2012-11-22 13:29     ` Akinobu Mita
  2012-11-22 13:41       ` Johannes Berg
  0 siblings, 1 reply; 5+ messages in thread
From: Akinobu Mita @ 2012-11-22 13:29 UTC (permalink / raw)
  To: Johannes Berg; +Cc: linux-kernel, linux-wireless

2012/11/22 Johannes Berg <johannes@sipsolutions.net>:

> I thought about something like that, I actually initially played with
> macros like this:
>
> #define FAULT_RECOVERABLE_START(ids)    \
>         /* set up the task state */     \
>  fault_recovery_retry:
>
> #define FAULT_RECOVERABLE_END(ids)      \
>         if (current->encountered_fault) \
>                 goto fault_recovery_retry;
>
> or so. However, the problem is that if you exit the function between
> these points, and this is true for your functions as well, you leave the
> task's fault injection enabled which isn't what you want. So adding
> functions or macros like this didn't really seem right. Also, functions
> (rather than macros) have the problem that the retry can't be
> encapsulated -- note how my macro calls the function again if it failed.
> So with functions like that, you introduce new manually-coded error and
> retry paths, that seemed undesirable.
>
> As you can see in my macro, it's also possible for an allocation to fail
> but the function to succeed, so the function that is called must have a
> return value indicating success or failure. I ran into this with debug
> objects, their allocation failed all the time but obviously the function
> succeeded as debug objects fail gracefully if they can't allocate
> memory.

Oh, I completely missed retrying part in your macro.

I looked into FAULT_INJECT_CALL_RECOVERABLE_FUNCTION again,
then I realized that it is not necessary to be variadic macro.

You can define macro like wait_event() family and use it like below:

        return FAULT_INJECT_CALL_RECOVERABLE_FUNCTION(
                BIT(FAULT_ATTR_SLAB) | BIT(FAULT_ATTR_PAGE_ALLOC),
                _nl80211_remain_on_channel(skb, info));

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [POC] recoverable fault injection
  2012-11-22 13:29     ` Akinobu Mita
@ 2012-11-22 13:41       ` Johannes Berg
  0 siblings, 0 replies; 5+ messages in thread
From: Johannes Berg @ 2012-11-22 13:41 UTC (permalink / raw)
  To: Akinobu Mita; +Cc: linux-kernel, linux-wireless

On Thu, 2012-11-22 at 22:29 +0900, Akinobu Mita wrote:

> > As you can see in my macro, it's also possible for an allocation to fail
> > but the function to succeed, so the function that is called must have a
> > return value indicating success or failure. I ran into this with debug
> > objects, their allocation failed all the time but obviously the function
> > succeeded as debug objects fail gracefully if they can't allocate
> > memory.
> 
> Oh, I completely missed retrying part in your macro.
> 
> I looked into FAULT_INJECT_CALL_RECOVERABLE_FUNCTION again,
> then I realized that it is not necessary to be variadic macro.
> 
> You can define macro like wait_event() family and use it like below:
> 
>         return FAULT_INJECT_CALL_RECOVERABLE_FUNCTION(
>                 BIT(FAULT_ATTR_SLAB) | BIT(FAULT_ATTR_PAGE_ALLOC),
>                 _nl80211_remain_on_channel(skb, info));

Oh, right, good point. Maybe even pass the entire line, and the
evaluation of whether the function failed. Then we don't have to have
the assumption of it returning an int with the 0/-ERR behaviour:

int ret;

FAULT_INJECT_CALL_RECOVERABLE_FUNCTION(
	BIT(FAULT_ATTR_SLAB) | BIT(FAULT_ATTR_PAGE_ALLOC),
	ret = _nl80211_remain_on_channel(skb, info),
	ret != 0);

return ret;


Ok that's kinda ugly too ... hmm.

johannes


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2012-11-22 20:45 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-11-21 23:44 [POC] recoverable fault injection Johannes Berg
2012-11-22 11:40 ` Akinobu Mita
2012-11-22 11:52   ` Johannes Berg
2012-11-22 13:29     ` Akinobu Mita
2012-11-22 13:41       ` Johannes Berg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).