* [PATCH v2] RFC - Write Bypass Race Bug
@ 2021-04-30 14:44 Marc Smith
From: Marc Smith @ 2021-04-30 14:44 UTC (permalink / raw)
  To: linux-bcache

The problem:
If an inflight backing WRITE operation for a block is performed that
meets the criteria for bypassing the cache and that takes a long time
to complete, a READ operation for the same block may be fully
processed in the interim that populates the cache with the device
content from before the inflight WRITE. When the inflight WRITE
finally completes, since it was marked for bypass, the cache is not
subsequently updated, and the stale data populated by the READ request
remains in cache. While there is code in bcache for invalidating the
cache when a bypassed WRITE is performed, this is done prior to
issuing the backing I/O so it does not help.

The proposed fix:
Add two new lists to the cached_dev structure to track inflight
"bypass" write requests and inflight read requests that have have
missed cache. These are called "inflight_bypass_write_list" and
"inflight_read_list", respectively, and are protected by a spinlock
called the "inflight_lock".

When a WRITE is bypassing the cache, check whether there is an
overlapping inflight read. If so, set bypass = false to essentially
convert the bypass write into a writethrough write. Otherwise, if
there is no overlapping inflight read, then add the "search" structure
to the inflight bypass write list.

When a READ misses cache, check whether there is an overlapping
inflight write. If so, set a new flag in the search structure called
"do_not_cache" which causes cache population to be skipped after the
backing I/O completes. Otherwise, if there is no overlapping inflight
write, then add the "search" structure to the inflight read list.
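
In compact form, the two checks look roughly like this (distilled from
the diff below; the helper names are illustrative only, and the patch
actually does the lookup and the conditional insert together under
inflight_lock in check_inflight_overlapping()):

	/* write path (cached_dev_write), request already marked bypass */
	if (overlaps_inflight_read(s))
		s->iop.bypass = false;	/* fall back to writethrough */
	else
		track_inflight_bypass_write(s);

	/* read path (cached_dev_cache_miss), on a cache miss */
	if (overlaps_inflight_bypass_write(s))
		s->do_not_cache = 1;	/* skip cache population later */
	else
		track_inflight_read(s);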

The rest of the changes add a new stat called
"cache_bypass_races" to track how many times the race was
encountered. Example:
cat /sys/fs/bcache/0c9b7a62-b431-4328-9dcb-a81e322238af/bdev0/stats_total/cache_bypass_races
16577

Assuming this is the correct approach, areas to look at:
1) Searching linked lists doesn't scale. Can something like an
interval tree be used here instead?
2) Can this be restructured so that the inflight_lock doesn't have to
be accessed with interrupts disabled? Note that search_free() can be
called in interrupt context.
3) Can do_not_cache just be another (1-bit) flag in the search
structure instead of occupying its own "int"? (A possible shape is
sketched below.)
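
A 1-bit flag next to the existing bitfields in struct search would look
roughly like this (untested sketch, exact placement is arbitrary):

	unsigned int		do_not_cache:1;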

v1 -> v2 changes:
- Use interval trees instead of linked lists to track inflight requests.
- Change code order to avoid acquiring lock in search_free().
---
 drivers/md/bcache/bcache.h  |  4 ++
 drivers/md/bcache/request.c | 88 ++++++++++++++++++++++++++++++++++++-
 drivers/md/bcache/stats.c   | 14 ++++++
 drivers/md/bcache/stats.h   |  4 ++
 drivers/md/bcache/super.c   |  4 ++
 5 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 848dd4db1659..1a6afafaaa9a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -397,6 +397,10 @@ struct cached_dev {
 	unsigned int		error_limit;
 	unsigned int		offline_seconds;

+	struct rb_root_cached	inflight_bypass_write_root;
+	struct rb_root_cached	inflight_read_root;
+	spinlock_t		inflight_lock;
+
 	char			backing_dev_name[BDEVNAME_SIZE];
 };

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 29c231758293..d6587cdbf4db 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -17,12 +17,34 @@
 #include <linux/hash.h>
 #include <linux/random.h>
 #include <linux/backing-dev.h>
+#include <linux/interval_tree_generic.h>

 #include <trace/events/bcache.h>

 #define CUTOFF_CACHE_ADD	95
 #define CUTOFF_CACHE_READA	90

+struct bch_itree_node {
+	struct rb_node	rb;
+	uint64_t	start;
+	uint64_t	last;
+	uint64_t	__subtree_last;
+};
+
+static uint64_t bch_itree_node_start(struct bch_itree_node *node)
+{
+	return node->start;
+}
+
+static uint64_t bch_itree_node_last(struct bch_itree_node *node)
+{
+	return node->last;
+}
+
+INTERVAL_TREE_DEFINE(struct bch_itree_node, rb,
+	uint64_t, __subtree_last,
+	bch_itree_node_start, bch_itree_node_last,, bch_itree)
+
 struct kmem_cache *bch_search_cache;

 static void bch_data_insert_start(struct closure *cl);
@@ -480,8 +502,40 @@ struct search {

 	struct btree_op		op;
 	struct data_insert_op	iop;
+	struct bch_itree_node   itree_node;
+	int			do_not_cache;
 };

+static bool check_inflight_overlapping(struct search *s)
+{
+	struct bch_itree_node *node;
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+	struct rb_root_cached *search_root, *insert_root;
+	unsigned long flags;
+
+	if (s->write) {
+		search_root = &dc->inflight_read_root;
+		insert_root = &dc->inflight_bypass_write_root;
+	} else {
+		search_root = &dc->inflight_bypass_write_root;
+		insert_root = &dc->inflight_read_root;
+	}
+
+	spin_lock_irqsave(&dc->inflight_lock, flags);
+	node = bch_itree_iter_first(search_root,
+			s->itree_node.start, s->itree_node.last);
+	if (node == NULL) {
+		bch_itree_insert(&s->itree_node, insert_root);
+	}
+	spin_unlock_irqrestore(&dc->inflight_lock, flags);
+
+	if (node) {
+		bch_mark_cache_bypass_race(s->d->c, s->d);
+		return true;
+	}
+	return false;
+}
+
 static void bch_cache_read_endio(struct bio *bio)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
@@ -702,6 +756,21 @@ static void do_bio_hook(struct search *s,
 static void search_free(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
+	struct cached_dev *dc = container_of(s->d,
+					struct cached_dev, disk);
+	unsigned long flags;
+
+	if (!RB_EMPTY_NODE(&s->itree_node.rb)) {
+		spin_lock_irqsave(&dc->inflight_lock, flags);
+		if (s->write) {
+			bch_itree_remove(&s->itree_node,
+				&dc->inflight_bypass_write_root);
+		} else {
+			bch_itree_remove(&s->itree_node,
+				&dc->inflight_read_root);
+		}
+		spin_unlock_irqrestore(&dc->inflight_lock, flags);
+	}

 	atomic_dec(&s->iop.c->search_inflight);

@@ -735,6 +804,10 @@ static inline struct search *search_alloc(struct bio *bio,
 	/* Count on the bcache device */
 	s->orig_bdev		= orig_bdev;
 	s->start_time		= start_time;
+	RB_CLEAR_NODE(&s->itree_node.rb);
+	s->itree_node.start	= bio->bi_iter.bi_sector;
+	s->itree_node.last	= s->itree_node.start + bio_sectors(bio) - 1;
+	s->do_not_cache		= 0;
 	s->iop.c		= d->c;
 	s->iop.bio		= NULL;
 	s->iop.inode		= d->id;
@@ -850,7 +923,7 @@ static void cached_dev_read_done(struct closure *cl)
 	closure_get(&dc->disk.cl);
 	bio_complete(s);

-	if (s->iop.bio &&
+	if (s->iop.bio && s->do_not_cache == 0 &&
 	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
 		BUG_ON(!s->iop.replace);
 		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
@@ -886,6 +959,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,

 	s->cache_missed = 1;

+	if (s->do_not_cache == 0 && RB_EMPTY_NODE(&s->itree_node.rb)) {
+		if (check_inflight_overlapping(s)) {
+			s->do_not_cache = 1;
+		}
+	}
+
 	if (s->cache_miss || s->iop.bypass) {
 		miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
 		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
@@ -981,6 +1060,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)

 	bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);

+	if (s->iop.bypass) {
+		if (check_inflight_overlapping(s)) {
+			/* convert bypass write into writethrough write */
+			s->iop.bypass = false;
+		}
+	}
+
 	down_read_non_owner(&dc->writeback_lock);
 	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
 		/*
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 503aafe188dc..31dbf69c80d6 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -49,6 +49,7 @@ read_attribute(cache_hit_ratio);
 read_attribute(cache_readaheads);
 read_attribute(cache_miss_collisions);
 read_attribute(bypassed);
+read_attribute(cache_bypass_races);

 SHOW(bch_stats)
 {
@@ -66,6 +67,7 @@ SHOW(bch_stats)

 	var_print(cache_readaheads);
 	var_print(cache_miss_collisions);
+	var_print(cache_bypass_races);
 	sysfs_hprint(bypassed,	var(sectors_bypassed) << 9);
 #undef var
 	return 0;
@@ -89,6 +91,7 @@ static struct attribute *bch_stats_files[] = {
 	&sysfs_cache_readaheads,
 	&sysfs_cache_miss_collisions,
 	&sysfs_bypassed,
+	&sysfs_cache_bypass_races,
 	NULL
 };
 static KTYPE(bch_stats);
@@ -116,6 +119,7 @@ void bch_cache_accounting_clear(struct cache_accounting *acc)
 	acc->total.cache_readaheads = 0;
 	acc->total.cache_miss_collisions = 0;
 	acc->total.sectors_bypassed = 0;
+	acc->total.cache_bypass_races = 0;
 }

 void bch_cache_accounting_destroy(struct cache_accounting *acc)
@@ -148,6 +152,7 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
 		scale_stat(&stats->cache_readaheads);
 		scale_stat(&stats->cache_miss_collisions);
 		scale_stat(&stats->sectors_bypassed);
+		scale_stat(&stats->cache_bypass_races);
 	}
 }

@@ -171,6 +176,7 @@ static void scale_accounting(struct timer_list *t)
 	move_stat(cache_readaheads);
 	move_stat(cache_miss_collisions);
 	move_stat(sectors_bypassed);
+	move_stat(cache_bypass_races);

 	scale_stats(&acc->total, 0);
 	scale_stats(&acc->day, DAY_RESCALE);
@@ -232,6 +238,14 @@ void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
 	atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
 }

+void bch_mark_cache_bypass_race(struct cache_set *c, struct bcache_device *d)
+{
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
+	atomic_inc(&dc->accounting.collector.cache_bypass_races);
+	atomic_inc(&c->accounting.collector.cache_bypass_races);
+}
+
 void bch_cache_accounting_init(struct cache_accounting *acc,
 			       struct closure *parent)
 {
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index abfaabf7e7fc..4cd2dcea068b 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -10,6 +10,7 @@ struct cache_stat_collector {
 	atomic_t cache_readaheads;
 	atomic_t cache_miss_collisions;
 	atomic_t sectors_bypassed;
+	atomic_t cache_bypass_races;
 };

 struct cache_stats {
@@ -22,6 +23,7 @@ struct cache_stats {
 	unsigned long cache_readaheads;
 	unsigned long cache_miss_collisions;
 	unsigned long sectors_bypassed;
+	unsigned long cache_bypass_races;

 	unsigned int		rescale;
 };
@@ -61,5 +63,7 @@ void bch_mark_cache_miss_collision(struct cache_set *c,
 void bch_mark_sectors_bypassed(struct cache_set *c,
 			       struct cached_dev *dc,
 			       int sectors);
+void bch_mark_cache_bypass_race(struct cache_set *c,
+			       struct bcache_device *d);

 #endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 03e1fe4de53d..3bcc615f1216 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1495,6 +1498,10 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 			goto err;
 	}

+	spin_lock_init(&dc->inflight_lock);
+	dc->inflight_read_root = RB_ROOT_CACHED;
+	dc->inflight_bypass_write_root = RB_ROOT_CACHED;
+
 	return 0;
 err:
 	pr_notice("error %s: %s\n", dc->backing_dev_name, err);
-- 
2.20.1


* Re: [PATCH v2] RFC - Write Bypass Race Bug
@ 2021-05-06  4:09 ` Coly Li
From: Coly Li @ 2021-05-06  4:09 UTC (permalink / raw)
  To: Marc Smith; +Cc: linux-bcache

On 4/30/21 10:44 PM, Marc Smith wrote:
> The problem:
> If an inflight backing WRITE operation for a block is performed that
> meets the criteria for bypassing the cache and that takes a long time
> to complete, a READ operation for the same block may be fully
> processed in the interim that populates the cache with the device
> content from before the inflight WRITE. When the inflight WRITE
> finally completes, since it was marked for bypass, the cache is not
> subsequently updated, and the stale data populated by the READ request
> remains in cache. While there is code in bcache for invalidating the
> cache when a bypassed WRITE is performed, this is done prior to
> issuing the backing I/O so it does not help.
> 
> The proposed fix:
> Add two new lists to the cached_dev structure to track inflight
> "bypass" write requests and inflight read requests that have have
> missed cache. These are called "inflight_bypass_write_list" and
> "inflight_read_list", respectively, and are protected by a spinlock
> called the "inflight_lock"
> 
> When a WRITE is bypassing the cache, check whether there is an
> overlapping inflight read. If so, set bypass = false to essentially
> convert the bypass write into a writethrough write. Otherwise, if
> there is no overlapping inflight read, then add the "search" structure
> to the inflight bypass write list.

Hi Marc,

Could you please explain a bit more about why the above is necessary?
It would help me understand your idea more clearly :-)

> 
> When a READ misses cache, check whether there is an overlapping
> inflight write. If so, set a new flag in the search structure called
> "do_not_cache" which causes cache population to be skipped after the
> backing I/O completes. Otherwise, if there is no overlapping inflight
> write, then add the "search" structure to the inflight read list.
> 
> The rest of the changes are to add a new stat called
> "bypass_cache_insert_races" to track how many times the race was
> encountered. Example:
> cat /sys/fs/bcache/0c9b7a62-b431-4328-9dcb-a81e322238af/bdev0/stats_total/cache_bypass_races
> 16577
> 

The stat counters make sense.

> Assuming this is the correct approach, areas to look at:
> 1) Searching linked lists doesn't scale. Can something like an
> interval tree be used here instead?

A tree is not a good fit here. Heavy I/O may add and delete many nodes
in the tree, and those operations are expensive and contended,
especially since this is a balanced tree.


> 2) Can this be restructured so that the inflight_lock doesn't have to
> be accessed with interrupts disabled? Note that search_free() can be
> called in interrupt context.

Yes, it is possible with a lockless approach. What is needed is an array
of atomic counters. Each element of the array covers a range of LBAs;
when a bypass write hits an LBA range, it increments the atomic counter
of the corresponding element(s). For a cache-miss read, just do an
atomic read, with no tree iteration and no lock protection.

An array of atomic counters should work but is not space efficient:
memory has to be allocated for all LBA ranges even though most of the
counters are never touched during the system's uptime.

Maybe an xarray works, but I have not looked into the xarray API
carefully yet.
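
For illustration, a minimal sketch of the counter-array idea (the
bucket size, array size and all names are made up for the example; the
array has to be sized to cover the whole backing device, which is
exactly the space concern above, and the ordering between the
write-side increment and the read-side check still needs care):

#include <linux/atomic.h>
#include <linux/types.h>

#define INFLIGHT_BUCKET_SHIFT	10	/* 1024 sectors (512 KiB) per bucket */
#define INFLIGHT_NR_BUCKETS	65536	/* covers only ~32 GiB at this size */

static atomic_t inflight_bypass_writes[INFLIGHT_NR_BUCKETS];

static inline unsigned int inflight_bucket(u64 sector)
{
	return sector >> INFLIGHT_BUCKET_SHIFT;
}

/* bypass write: mark its buckets before issuing the backing I/O */
static void mark_bypass_write(u64 start, u64 last)
{
	unsigned int i;

	for (i = inflight_bucket(start); i <= inflight_bucket(last); i++)
		atomic_inc(&inflight_bypass_writes[i]);
}

/* bypass write completion: drop the marks again */
static void clear_bypass_write(u64 start, u64 last)
{
	unsigned int i;

	for (i = inflight_bucket(start); i <= inflight_bucket(last); i++)
		atomic_dec(&inflight_bypass_writes[i]);
}

/* cache-miss read: lockless overlap check, no tree walk, no spinlock */
static bool read_overlaps_bypass_write(u64 start, u64 last)
{
	unsigned int i;

	for (i = inflight_bucket(start); i <= inflight_bucket(last); i++)
		if (atomic_read(&inflight_bypass_writes[i]))
			return true;
	return false;
}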

> 3) Can do_not_cache just be another (1-bit) flag in the search
> structure instead of occupying its own "int" ?

At this moment, taking up the space of an int is OK.


Thanks for the patch.

Coly Li

> 
> v1 -> v2 changes:
> - Use interval trees instead of linked lists to track inflight requests.
> - Change code order to avoid acquiring lock in search_free().
> ---
>  drivers/md/bcache/bcache.h  |  4 ++
>  drivers/md/bcache/request.c | 88 ++++++++++++++++++++++++++++++++++++-
>  drivers/md/bcache/stats.c   | 14 ++++++
>  drivers/md/bcache/stats.h   |  4 ++
>  drivers/md/bcache/super.c   |  4 ++
>  5 files changed, 113 insertions(+), 1 deletion(-)
> 

[snipped]



* Re: [PATCH v2] RFC - Write Bypass Race Bug
@ 2021-05-10 21:06   ` Marc Smith
From: Marc Smith @ 2021-05-10 21:06 UTC (permalink / raw)
  To: Coly Li; +Cc: linux-bcache

On Thu, May 6, 2021 at 12:09 AM Coly Li <colyli@suse.de> wrote:
>
> On 4/30/21 10:44 PM, Marc Smith wrote:
> > The problem:
> > If an inflight backing WRITE operation for a block is performed that
> > meets the criteria for bypassing the cache and that takes a long time
> > to complete, a READ operation for the same block may be fully
> > processed in the interim that populates the cache with the device
> > content from before the inflight WRITE. When the inflight WRITE
> > finally completes, since it was marked for bypass, the cache is not
> > subsequently updated, and the stale data populated by the READ request
> > remains in cache. While there is code in bcache for invalidating the
> > cache when a bypassed WRITE is performed, this is done prior to
> > issuing the backing I/O so it does not help.
> >
> > The proposed fix:
> > Add two new lists to the cached_dev structure to track inflight
> > "bypass" write requests and inflight read requests that have have
> > missed cache. These are called "inflight_bypass_write_list" and
> > "inflight_read_list", respectively, and are protected by a spinlock
> > called the "inflight_lock"
> >
> > When a WRITE is bypassing the cache, check whether there is an
> > overlapping inflight read. If so, set bypass = false to essentially
> > convert the bypass write into a writethrough write. Otherwise, if
> > there is no overlapping inflight read, then add the "search" structure
> > to the inflight bypass write list.
>
> Hi Marc,
>
> Could you please explain a bit more why the above thing is necessary ?
> Please help me to understand your idea more clear :-)
>

As stated previously, the race condition involves two requests attempting
I/O on the same block. One is a bypass write and the other is a normal
read that misses the cache.

The simplest form of the race is the following:

1.  BYPASS WRITER starts processing bcache request
2.  BYPASS WRITER invalidates block in the cache
3.  NORMAL READER starts processing bcache request
4.  NORMAL READER misses cache, issues READ request to backing device
5.  NORMAL READER processes completion of backing dev request
6.  NORMAL READER populates cache with stale data
7.  NORMAL READER completes
8.  BYPASS WRITER issues WRITE request to backing device
9.  BYPASS WRITER processes completion of backing dev request
10. BYPASS WRITER completes

An approach to avoiding the race is to add the following steps:

1.5 BYPASS WRITER adds the sector range to a data structure that holds
    inflight writes. (As currently coded, this data structure is an
    interval tree.)

5.5 NORMAL READER checks the data structure to see if there are any
    inflight bypass writers having overlapping sectors. If so, skip
    step 6.

9.5 BYPASS WRITER removes sector range added in step 1.5

If this were the only order of operations for the race, then the 3 added
steps would be sufficient and only inflight BYPASS WRITES would need
to be tracked.

However, consider the following ordering with new steps above added:

1.  NORMAL READER starts processing bcache request
2.  NORMAL READER misses cache, issues READ request to backing device
3.  NORMAL READER processes completion of backing dev request
4.  NORMAL READER checks the data structure to see if there are any
    inflight bypass writers having overlapping sectors. None are found.
5.  BYPASS WRITER starts processing bcache request
6.  BYPASS WRITER adds the sector range to a data structure that holds
    inflight writes.
7.  BYPASS WRITER invalidates block in the cache
8.  BYPASS WRITER issues WRITE request to backing device
9.  NORMAL READER populates cache with stale data
10. NORMAL READER completes
11. BYPASS WRITER processes completion of backing dev request
12. BYPASS WRITER removes sector range added in step 6.
13. BYPASS WRITER completes

With this order of operations, the added steps don't solve the problem
as stale data is added in step #9 and never corrected.

So to cover both scenarios (and potentially others), both inflight bypass
WRITEs and normal READs are tracked. If a BYPASS WRITE is being performed,
it checks for an inflight overlapping read. If one exists, the BYPASS
WRITE converts itself to a WRITETHROUGH, which essentially "neutralizes" it
from the race. If, however, there are no overlapping READs, it adds itself
to a tracking data structure. When a normal read is issued that misses
the cache, a check is made for an inflight overlapping BYPASS WRITE. If
one exists, the READ sets the "do_not_cache" flag which neutralizes it
from the race. If, however, there are no inflight overlapping BYPASS
WRITEs, the sector range is added to the data structure holding
inflight reads.

This effectively adds/modifies the following steps:

1.5 NORMAL READER adds its sector range to a data structure that holds
    inflight reads.

5.5 BYPASS WRITER checks the data structure to see if there is an
    inflight overlapping READ. There is one! So it converts itself to
    a WRITETHROUGH write.

8.  [WRITETHROUGH] WRITER issues WRITE request to BOTH the cache
    and the backing device. A writethrough racing with a reader (step 9)
    is already correctly handled by bcache (I think.)

9.5 NORMAL READER removes the sector range added in step 1.5

In summary, when a BYPASS WRITE is racing with a normal READ,
one of them needs to take evasive maneuvers to avoid corruption. The
BYPASS WRITE can convert itself to a WRITETHROUGH WRITE or the READ
can skip populating the cache. To handle all orderings of operations, it
appears that support for both is needed, depending on whether the
BYPASS WRITE or normal READ arrives first. Therefore, both BYPASS WRITES
and normal READS need to be tracked.


> >
> > When a READ misses cache, check whether there is an overlapping
> > inflight write. If so, set a new flag in the search structure called
> > "do_not_cache" which causes cache population to be skipped after the
> > backing I/O completes. Otherwise, if there is no overlapping inflight
> > write, then add the "search" structure to the inflight read list.
> >
> > The rest of the changes are to add a new stat called
> > "bypass_cache_insert_races" to track how many times the race was
> > encountered. Example:
> > cat /sys/fs/bcache/0c9b7a62-b431-4328-9dcb-a81e322238af/bdev0/stats_total/cache_bypass_races
> > 16577
> >
>
> The stat counters make sense.
>
> > Assuming this is the correct approach, areas to look at:
> > 1) Searching linked lists doesn't scale. Can something like an
> > interval tree be used here instead?
>
> Tree is not good. Heavy I/O may add and delete many nodes in and from
> the tree, the operation is heavy and congested, especially this is a
> balanced tree.
>
>
> > 2) Can this be restructured so that the inflight_lock doesn't have to
> > be accessed with interrupts disabled? Note that search_free() can be
> > called in interrupt context.
>
> Yes it is possible, with a lockless approach. What is needed is an array
> to atomic counter. Each element of the array covers a range of LBA, if a
> bypass write hits a range of LBA, increases the atomic counter from
> corresponding element(s). For the cache miss read, just do an atomic
> read without tree iteration and any lock protection.
>
> An array of atomic counters should work but not space efficient, memory
> should be allocated for all LBA ranges even most of the counters not
> touched during the whole system up time.
>
> Maybe xarray works, but I don't look into the xarray api carefully yet.

Okay, we'll look into using this data structure, thanks.


>
> > 3) Can do_not_cache just be another (1-bit) flag in the search
> > structure instead of occupying its own "int" ?
>
> At this moment, taking an int space is OK.
>
>
> Thanks for the patch.
>
> Coly Li
>
> >
> > v1 -> v2 changes:
> > - Use interval trees instead of linked lists to track inflight requests.
> > - Change code order to avoid acquiring lock in search_free().
> > ---
> >  drivers/md/bcache/bcache.h  |  4 ++
> >  drivers/md/bcache/request.c | 88 ++++++++++++++++++++++++++++++++++++-
> >  drivers/md/bcache/stats.c   | 14 ++++++
> >  drivers/md/bcache/stats.h   |  4 ++
> >  drivers/md/bcache/super.c   |  4 ++
> >  5 files changed, 113 insertions(+), 1 deletion(-)
> >
>
> [snipped]
>


* Re: [PATCH v2] RFC - Write Bypass Race Bug
@ 2021-05-16 14:58     ` Coly Li
From: Coly Li @ 2021-05-16 14:58 UTC (permalink / raw)
  To: Marc Smith; +Cc: linux-bcache

On 5/11/21 5:06 AM, Marc Smith wrote:
> On Thu, May 6, 2021 at 12:09 AM Coly Li <colyli@suse.de> wrote:
>>
>> On 4/30/21 10:44 PM, Marc Smith wrote:
>>> The problem:
>>> If an inflight backing WRITE operation for a block is performed that
>>> meets the criteria for bypassing the cache and that takes a long time
>>> to complete, a READ operation for the same block may be fully
>>> processed in the interim that populates the cache with the device
>>> content from before the inflight WRITE. When the inflight WRITE
>>> finally completes, since it was marked for bypass, the cache is not
>>> subsequently updated, and the stale data populated by the READ request
>>> remains in cache. While there is code in bcache for invalidating the
>>> cache when a bypassed WRITE is performed, this is done prior to
>>> issuing the backing I/O so it does not help.
>>>
>>> The proposed fix:
>>> Add two new lists to the cached_dev structure to track inflight
>>> "bypass" write requests and inflight read requests that have have
>>> missed cache. These are called "inflight_bypass_write_list" and
>>> "inflight_read_list", respectively, and are protected by a spinlock
>>> called the "inflight_lock"
>>>
>>> When a WRITE is bypassing the cache, check whether there is an
>>> overlapping inflight read. If so, set bypass = false to essentially
>>> convert the bypass write into a writethrough write. Otherwise, if
>>> there is no overlapping inflight read, then add the "search" structure
>>> to the inflight bypass write list.
>>
>> Hi Marc,
>>
>> Could you please explain a bit more why the above thing is necessary ?
>> Please help me to understand your idea more clear :-)
>>
> 
> As stated previously, the race condition involves two requests attempting
> I/O on the same block. One is a bypass write and the other is a normal
> read that misses the cache.
> 
> The simplest form of the race is the following:
> 
> 1.  BYPASS WRITER starts processing bcache request
> 2.  BYPASS WRITER invalidates block in the cache
> 3.  NORMAL READER starts processing bcache request
> 4.  NORMAL READER misses cache, issues READ request to backing device
> 5.  NORMAL READER processes completion of backing dev request
> 6.  NORMAL READER populates cache with stale data
> 7.  NORMAL READER completes
> 8.  BYPASS WRITER issues WRITE request to backing device
> 9.  BYPASS WRITER processes completion of backing dev request
> 10. BYPASS WRITER completes
> 
> An approach to avoiding the race is to add the following steps:
> 
> 1.5 BYPASS WRITER adds the sector range to a data structure that holds
>     inflight writes. (As currently coded, this data structure is an
>     interval tree.)
> 
> 5.5 NORMAL READER checks the data structure to see if there are any
>     inflight bypass writers having overlapping sectors. If so, skip
>     step 6.
> 
> 9.5 BYPASS WRITER removes sector range added in step 1.5
> 
> If this were the only order of operations for the race, then the 3 added
> steps would be sufficient and only inflight BYPASS WRITES would need
> to be tracked.
> 
> However, consider the following ordering with new steps above added:
> 
> 1.  NORMAL READER starts processing bcache request
> 2.  NORMAL READER misses cache, issues READ request to backing device
> 3.  NORMAL READER processes completion of backing dev request
> 4.  NORMAL READER checks the data structure to see if there are any
>     inflight bypass writers having overlapping sectors. None are found.
> 5.  BYPASS WRITER starts processing bcache request
> 6.  BYPASS WRITER adds the sector range to a data structure that holds
>     inflight writes.
> 7.  BYPASS WRITER invalidates block in the cache
> 8.  BYPASS WRITER issues WRITE request to backing device
> 9 . NORMAL READER populates cache with stale data
> 10. NORMAL READER completes
> 11. BYPASS WRITER processes completion of backing dev request
> 12. BYPASS WRITER removes sector range added in step 6.
> 13. BYPASS WRITER completes
> 
> With this order of operations, the added steps don't solve the problem
> as stale data is added in step #9 and never corrected.
> 

I am not sure whether the above condition can take place. For a cache
miss, before the missing data is read from the backing device, a replace
key is inserted into the btree as a check key. After the missing data is
read from the backing device, the data is first returned to the caller
and then inserted into the btree.

There is a trick for the second insert: if a matching check key is
found, the read-miss data is inserted into the btree. If the check key
is not found, it means another key was inserted and overwrote the check
key, so the btree insert for the read-miss data gives up, to avoid a
race.

So I feel the current code may already avoid the above race condition.
For the first race condition, I agree that it exists and should be
fixed.
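
For reference, the check-key path described above is in
cached_dev_cache_miss(); roughly (paraphrased from memory, not quoted
verbatim):

	s->iop.replace_key = KEY(s->iop.inode,
				 bio->bi_iter.bi_sector + s->insert_bio_sectors,
				 s->insert_bio_sectors);

	ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
	if (ret)
		return ret;

	s->iop.replace = true;

	/*
	 * The later btree insert of the read-miss data is done as a
	 * "replace" of this check key; if the key was overwritten in the
	 * meantime, that insert is dropped instead of caching stale data.
	 */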


> So to cover both scenarios (and potentially others), both inflight bypass
> WRITEs and normal READs are tracked. If a BYPASS WRITE is being performed,
> it checks for an inflight overlapping read. If one exists, the BYPASS
> WRITE coverts itself to a WRITETHROUGH, which essentially "neutralizes" it
> from the race. If, however, there are no overlapping READs, it adds itself
> to a tracking data structure. When a normal read is issued that misses
> the cache, a check is made for an inflight overlapping BYPASS WRITE. If
> one exists, the READ sets the "do_not_cache" flag which neutralizes it
> from the race. If, however, there are no inflight overlapping BYPASS
> WRITEs, the sector range is added to the data structure holding
> inflight reads.
> 
> This effectively adds/modifies the following steps:
> 
> 1.5 NORMAL READER adds its sector range to a data structure that holds
>     inflight writes.
> 
> 5.5 BYPASS WRITER checks for the data structure to see if there is an
>     inflight overlapping READ. There is one! So it coverts itself to
>     a WRITETHROUGH write.
> 
> 8.  [WRITETHROUGH] WRITER issues WRITE request to BOTH the cache
>     and the backing device. A writethrough racing with a reader (step 9)
>     is already correctly handled by bcache (I think.)
> 
> 9.5 NORMAL READER removes the sector range added in step 1.5
> 
> In summary, when a BYPASS WRITE is racing with a normal READ,
> one of them needs to take evasive maneuvers to avoid corruption. The
> BYPASS WRITE can convert itself to a WRITETHROUGH WRITE or the READ
> can skip populating the cache. To handle all order of operations, it
> appears that support for both are needed depending on whether the
> BYPASS WRITE or normal READ arrives first. Therefore, both BYPASS WRITES
> and normal READS need to be tracked.
> 
> 
>>>
>>> When a READ misses cache, check whether there is an overlapping
>>> inflight write. If so, set a new flag in the search structure called
>>> "do_not_cache" which causes cache population to be skipped after the
>>> backing I/O completes. Otherwise, if there is no overlapping inflight
>>> write, then add the "search" structure to the inflight read list.
>>>
>>> The rest of the changes are to add a new stat called
>>> "bypass_cache_insert_races" to track how many times the race was
>>> encountered. Example:
>>> cat /sys/fs/bcache/0c9b7a62-b431-4328-9dcb-a81e322238af/bdev0/stats_total/cache_bypass_races
>>> 16577
>>>
>>
>> The stat counters make sense.
>>
>>> Assuming this is the correct approach, areas to look at:
>>> 1) Searching linked lists doesn't scale. Can something like an
>>> interval tree be used here instead?
>>
>> Tree is not good. Heavy I/O may add and delete many nodes in and from
>> the tree, the operation is heavy and congested, especially this is a
>> balanced tree.
>>
>>
>>> 2) Can this be restructured so that the inflight_lock doesn't have to
>>> be accessed with interrupts disabled? Note that search_free() can be
>>> called in interrupt context.
>>
>> Yes it is possible, with a lockless approach. What is needed is an array
>> to atomic counter. Each element of the array covers a range of LBA, if a
>> bypass write hits a range of LBA, increases the atomic counter from
>> corresponding element(s). For the cache miss read, just do an atomic
>> read without tree iteration and any lock protection.
>>
>> An array of atomic counters should work but not space efficient, memory
>> should be allocated for all LBA ranges even most of the counters not
>> touched during the whole system up time.
>>
>> Maybe xarray works, but I don't look into the xarray api carefully yet.
> 
> Okay, we'll look into using this data structure, thanks.
> 

There is a spinlock inside xarray; I am not sure whether it scales well
under random I/O on a large LBA range. We need to try it.

[snipped]

Coly Li

