Mike,

I'm NAKing this patch. You've taken the comment from my latest patch and attached it to something else. I don't know if you've taken an older patch and updated it, or taken the latest patch and downgraded it. For instance, the latest patch uses the name 'struct dm_buffer_cache', but it looks like you've renamed it to 'struct buffer_cache' and then posted another patch in the series that renames it back to dm_buffer_cache.

I also see a comment change: the original "A refcount_t for hold_count does not work." has become "NOTE: attempting to use refcount_t instead of atomic_t caused crashes!". Talk about over-dramatisation (if the hold_count is incorrect we detect it and BUG).

I can't stand by my code if I don't know what's being sent up in my name. Please use the patchset from this branch:

https://github.com/jthornber/linux/tree/2023-03-21-thin-concurrency-8

- Joe

On Wed, Mar 22, 2023 at 6:19 PM Mike Snitzer wrote:
> From: Joe Thornber
>
> When multiple threads perform IO to a thin device, the underlying
> dm_bufio object can become a bottleneck; slowing down access to btree
> nodes that store the thin metadata. Prior to this commit, each bufio
> instance had a single mutex that was taken for every bufio operation.
>
> This commit concentrates on improving the common case: in which a user
> of dm_bufio wishes to access, but not modify, a buffer which is already
> within the dm_bufio cache.
>
> Implementation::
>
> The code has been refactored; pulling out an 'lru' abstraction and a
> 'buffer cache' abstraction.
>
> LRU:
> A clock algorithm is used in the LRU abstraction. This avoids
> relinking list nodes, which would require a write lock protecting
> it.
>
> None of the lru methods are threadsafe; locking must be done at a
> higher level.
>
> Buffer cache:
> The buffer cache is responsible for managing the holder count,
> tracking clean/dirty state, and choosing buffers via predicates.
> Higher level code is responsible for allocation of buffers, IO and
> eviction/cache sizing.
>
> The buffer cache has thread safe methods for acquiring a reference
> to an existing buffer. All other methods in buffer cache are _not_
> threadsafe, and only contain enough locking to guarantee the safe
> methods.
>
> Rather than a single mutex, sharded rw_semaphores are used to allow
> concurrent threads to 'get' buffers. Each rw_semaphore protects its
> own rbtree of buffer entries.
>
> Testing::
>
> Some of the low level functions have been unit tested using dm-unit:
> https://github.com/jthornber/dm-unit/blob/main/src/tests/bufio.rs
>
> Higher level concurrency and IO is tested via a test only target
> found here:
>
> https://github.com/jthornber/linux/blob/2023-02-28-thin-concurrency-7/drivers/md/dm-bufio-test.c
>
> The associated userland side of these tests is here:
>
> https://github.com/jthornber/dmtest-python/blob/main/src/dmtest/bufio/bufio_tests.py
>
> In addition the full dmtest suite of tests (dm-thin, dm-cache, etc)
> has been run (~450 tests).
>
> Performance::
>
> Most bufio operations have unchanged performance. But if multiple
> threads are attempting to get buffers concurrently, and these
> buffers are already in the cache then there's a big speed up. Eg,
> one test has 16 'hotspot' threads simulating btree lookups while
> another thread dirties the whole device. In this case the hotspot
> threads acquire the buffers about 25 times faster.
> > Signed-off-by: Joe Thornber > Signed-off-by: Mike Snitzer > --- > drivers/md/dm-bufio.c | 1769 ++++++++++++++++++++++++++++++----------- > 1 file changed, 1292 insertions(+), 477 deletions(-) > > diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c > index 1de1bdcda1ce..56ef160331ce 100644 > --- a/drivers/md/dm-bufio.c > +++ b/drivers/md/dm-bufio.c > @@ -66,6 +66,247 @@ > #define LIST_DIRTY 1 > #define LIST_SIZE 2 > > +/*--------------------------------------------------------------*/ > + > +/* > + * Rather than use an LRU list, we use a clock algorithm where entries > + * are held in a circular list. When an entry is 'hit' a reference bit > + * is set. The least recently used entry is approximated by running a > + * cursor around the list selecting unreferenced entries. Referenced > + * entries have their reference bit cleared as the cursor passes them. > + */ > +struct lru_entry { > + struct list_head list; > + atomic_t referenced; > +}; > + > +struct lru_iter { > + struct lru *lru; > + struct list_head list; > + struct lru_entry *stop; > + struct lru_entry *e; > +}; > + > +struct lru { > + struct list_head *cursor; > + unsigned long count; > + > + struct list_head iterators; > +}; > + > +/*--------------*/ > + > +static void lru_init(struct lru *lru) > +{ > + lru->cursor = NULL; > + lru->count = 0; > + INIT_LIST_HEAD(&lru->iterators); > +} > + > +static void lru_destroy(struct lru *lru) > +{ > + BUG_ON(lru->cursor); > + BUG_ON(!list_empty(&lru->iterators)); > +} > + > +static inline unsigned int lru_count(struct lru *lru) > +{ > + return lru->count; > +} > + > +/* > + * Insert a new entry into the lru. > + */ > +static void lru_insert(struct lru *lru, struct lru_entry *le) > +{ > + /* > + * Don't be tempted to set to 1, makes the lru aspect > + * perform poorly. > + */ > + atomic_set(&le->referenced, 0); > + > + if (lru->cursor) > + list_add_tail(&le->list, lru->cursor); > + > + else { > + INIT_LIST_HEAD(&le->list); > + lru->cursor = &le->list; > + } > + lru->count++; > +} > + > +/*--------------*/ > + > +/* > + * Convert a list_head pointer to an lru_entry pointer. > + */ > +static inline struct lru_entry *to_le(struct list_head *l) > +{ > + return container_of(l, struct lru_entry, list); > +} > + > +/* > + * Initialize an lru_iter and add it to the list of cursors in the lru. > + */ > +static void lru_iter_begin(struct lru *lru, struct lru_iter *it) > +{ > + it->lru = lru; > + it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL; > + it->e = lru->cursor ? to_le(lru->cursor) : NULL; > + list_add(&it->list, &lru->iterators); > +} > + > +/* > + * Remove an lru_iter from the list of cursors in the lru. > + */ > +static void lru_iter_end(struct lru_iter *it) > +{ > + list_del(&it->list); > +} > + > +/* Predicate function type to be used with lru_iter_next */ > +typedef bool (*iter_predicate)(struct lru_entry *le, void *context); > + > +/* > + * Advance the cursor to the next entry that passes the > + * predicate, and return that entry. Returns NULL if the > + * iteration is complete. > + */ > +static struct lru_entry *lru_iter_next(struct lru_iter *it, > + iter_predicate pred, void *context) > +{ > + struct lru_entry *e; > + > + while (it->e) { > + e = it->e; > + > + /* advance the cursor */ > + if (it->e == it->stop) > + it->e = NULL; > + else > + it->e = to_le(it->e->list.next); > + > + if (pred(e, context)) > + return e; > + } > + > + return NULL; > +} > + > +/* > + * Invalidate a specific lru_entry and update all cursors in > + * the lru accordingly. 
> + */ > +static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e) > +{ > + struct lru_iter *it; > + > + list_for_each_entry(it, &lru->iterators, list) { > + /* Move c->e forwards if necc. */ > + if (it->e == e) { > + it->e = to_le(it->e->list.next); > + if (it->e == e) > + it->e = NULL; > + } > + > + /* Move it->stop backwards if necc. */ > + if (it->stop == e) { > + it->stop = to_le(it->stop->list.prev); > + if (it->stop == e) > + it->stop = NULL; > + } > + } > +} > + > +/*--------------*/ > + > +/* > + * Remove a specific entry from the lru. > + */ > +static void lru_remove(struct lru *lru, struct lru_entry *le) > +{ > + lru_iter_invalidate(lru, le); > + if (lru->count == 1) > + lru->cursor = NULL; > + else { > + if (lru->cursor == &le->list) > + lru->cursor = lru->cursor->next; > + list_del(&le->list); > + } > + lru->count--; > +} > + > +/* > + * Mark as referenced. > + */ > +static inline void lru_reference(struct lru_entry *le) > +{ > + atomic_set(&le->referenced, 1); > +} > + > +/*--------------*/ > + > +/* > + * Remove the least recently used entry (approx), that passes the > predicate. > + * Returns NULL on failure. > + */ > +enum evict_result { > + ER_EVICT, > + ER_DONT_EVICT, > + ER_STOP, /* stop looking for something to evict */ > +}; > + > +typedef enum evict_result (*le_predicate)(struct lru_entry *le, void > *context); > + > +static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, > void *context) > +{ > + unsigned int tested = 0; > + struct list_head *h = lru->cursor; > + struct lru_entry *le; > + > + if (!h) > + return NULL; > + /* > + * In the worst case we have to loop around twice. Once to clear > + * the reference flags, and then again to discover the predicate > + * fails for all entries. > + */ > + while (tested < lru->count) { > + le = container_of(h, struct lru_entry, list); > + > + if (atomic_read(&le->referenced)) > + atomic_set(&le->referenced, 0); > + else { > + tested++; > + switch (pred(le, context)) { > + case ER_EVICT: > + /* > + * Adjust the cursor, so we start the next > + * search from here. > + */ > + lru->cursor = le->list.next; > + lru_remove(lru, le); > + return le; > + > + case ER_DONT_EVICT: > + break; > + > + case ER_STOP: > + lru->cursor = le->list.next; > + return NULL; > + } > + } > + > + h = h->next; > + > + cond_resched(); > + } > + > + return NULL; > +} > + > +/*--------------------------------------------------------------*/ > + > /* > * Buffer state bits. > */ > @@ -86,28 +327,37 @@ enum data_mode { > }; > > struct dm_buffer { > + /* protected by the locks in buffer_cache */ > struct rb_node node; > - struct list_head lru_list; > - struct list_head global_list; > > + /* immutable, so don't need protecting */ > sector_t block; > void *data; > unsigned char data_mode; /* DATA_MODE_* */ > > - unsigned int accessed; > - unsigned int hold_count; > + /* > + * These two fields are used in isolation, so do not need > + * a surrounding lock. NOTE: attempting to use refcount_t > + * instead of atomic_t caused crashes! 
> + */ > + atomic_t hold_count; > unsigned long last_accessed; > > + /* > + * Everything else is protected by the mutex in > + * dm_bufio_client > + */ > + unsigned long state; > + struct lru_entry lru; > unsigned char list_mode; /* LIST_* */ > blk_status_t read_error; > blk_status_t write_error; > - unsigned long state; > unsigned int dirty_start; > unsigned int dirty_end; > unsigned int write_start; > unsigned int write_end; > - struct dm_bufio_client *c; > struct list_head write_list; > + struct dm_bufio_client *c; > void (*end_io)(struct dm_buffer *b, blk_status_t bs); > #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING > #define MAX_STACK 10 > @@ -116,9 +366,579 @@ struct dm_buffer { > #endif > }; > > +/*--------------------------------------------------------------*/ > + > +/* > + * The buffer cache manages buffers, particularly: > + * - inc/dec of holder count > + * - setting the last_accessed field > + * - maintains clean/dirty state along with lru > + * - selecting buffers that match predicates > + * > + * It does *not* handle: > + * - allocation/freeing of buffers. > + * - IO > + * - Eviction or cache sizing. > + * > + * cache_get() and cache_put() are threadsafe, you do not need to > + * protect these calls with a surrounding mutex. All the other > + * methods are not threadsafe; they do use locking primitives, but > + * only enough to ensure get/put are threadsafe. > + */ > + > +#define NR_LOCKS 64 > +#define LOCKS_MASK (NR_LOCKS - 1) > + > +struct tree_lock { > + struct rw_semaphore lock; > +} ____cacheline_aligned_in_smp; > + > +struct buffer_cache { > + /* > + * We spread entries across multiple trees to reduce contention > + * on the locks. > + */ > + struct tree_lock locks[NR_LOCKS]; > + struct rb_root roots[NR_LOCKS]; > + struct lru lru[LIST_SIZE]; > +}; > + > +/* > + * We want to scatter the buffers across the different rbtrees > + * to improve concurrency, but we also want to encourage the > + * lock_history optimisation (below). Scattering runs of 16 > + * buffers seems to be a good compromise. > + */ > +static inline unsigned int cache_index(sector_t block) > +{ > + return block & LOCKS_MASK; > +} > + > +static inline void cache_read_lock(struct buffer_cache *bc, sector_t > block) > +{ > + down_read(&bc->locks[cache_index(block)].lock); > +} > + > +static inline void cache_read_unlock(struct buffer_cache *bc, sector_t > block) > +{ > + up_read(&bc->locks[cache_index(block)].lock); > +} > + > +static inline void cache_write_lock(struct buffer_cache *bc, sector_t > block) > +{ > + down_write(&bc->locks[cache_index(block)].lock); > +} > + > +static inline void cache_write_unlock(struct buffer_cache *bc, sector_t > block) > +{ > + up_write(&bc->locks[cache_index(block)].lock); > +} > + > +/* > + * Sometimes we want to repeatedly get and drop locks as part of an > iteration. > + * This struct helps avoid redundant drop and gets of the same lock. 
> + */ > +struct lock_history { > + struct buffer_cache *cache; > + bool write; > + unsigned int previous; > +}; > + > +static void lh_init(struct lock_history *lh, struct buffer_cache *cache, > bool write) > +{ > + lh->cache = cache; > + lh->write = write; > + lh->previous = NR_LOCKS; /* indicates no previous */ > +} > + > +static void __lh_lock(struct lock_history *lh, unsigned int index) > +{ > + if (lh->write) > + down_write(&lh->cache->locks[index].lock); > + else > + down_read(&lh->cache->locks[index].lock); > +} > + > +static void __lh_unlock(struct lock_history *lh, unsigned int index) > +{ > + if (lh->write) > + up_write(&lh->cache->locks[index].lock); > + else > + up_read(&lh->cache->locks[index].lock); > +} > + > +/* > + * Make sure you call this since it will unlock the final lock. > + */ > +static void lh_exit(struct lock_history *lh) > +{ > + if (lh->previous != NR_LOCKS) { > + __lh_unlock(lh, lh->previous); > + lh->previous = NR_LOCKS; > + } > +} > + > +/* > + * Named 'next' because there is no corresponding > + * 'up/unlock' call since it's done automatically. > + */ > +static void lh_next(struct lock_history *lh, sector_t b) > +{ > + unsigned int index = cache_index(b); > + > + if (lh->previous != NR_LOCKS) { > + if (lh->previous != index) { > + __lh_unlock(lh, lh->previous); > + __lh_lock(lh, index); > + lh->previous = index; > + } > + } else { > + __lh_lock(lh, index); > + lh->previous = index; > + } > +} > + > +static struct dm_buffer *le_to_buffer(struct lru_entry *le) > +{ > + return container_of(le, struct dm_buffer, lru); > +} > + > +static struct dm_buffer *list_to_buffer(struct list_head *l) > +{ > + struct lru_entry *le = list_entry(l, struct lru_entry, list); > + > + if (!le) > + return NULL; > + > + return le_to_buffer(le); > +} > + > +static void cache_init(struct buffer_cache *bc) > +{ > + unsigned int i; > + > + for (i = 0; i < NR_LOCKS; i++) { > + init_rwsem(&bc->locks[i].lock); > + bc->roots[i] = RB_ROOT; > + } > + > + lru_init(&bc->lru[LIST_CLEAN]); > + lru_init(&bc->lru[LIST_DIRTY]); > +} > + > +static void cache_destroy(struct buffer_cache *bc) > +{ > + unsigned int i; > + > + for (i = 0; i < NR_LOCKS; i++) > + BUG_ON(!RB_EMPTY_ROOT(&bc->roots[i])); > + > + lru_destroy(&bc->lru[LIST_CLEAN]); > + lru_destroy(&bc->lru[LIST_DIRTY]); > +} > + > +/* > + * not threadsafe, or racey depending how you look at it > + */ > +static unsigned long cache_count(struct buffer_cache *bc, int list_mode) > +{ > + return lru_count(&bc->lru[list_mode]); > +} > + > +/*--------------*/ > + > +/* > + * not threadsafe > + */ > +static unsigned long cache_total(struct buffer_cache *bc) > +{ > + return lru_count(&bc->lru[LIST_CLEAN]) + > + lru_count(&bc->lru[LIST_DIRTY]); > +} > + > +/*--------------*/ > + > +/* > + * Gets a specific buffer, indexed by block. > + * If the buffer is found then its holder count will be incremented and > + * lru_reference will be called. > + * > + * threadsafe > + */ > +static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t > block) > +{ > + struct rb_node *n = root->rb_node; > + struct dm_buffer *b; > + > + while (n) { > + b = container_of(n, struct dm_buffer, node); > + > + if (b->block == block) > + return b; > + > + n = block < b->block ? 
n->rb_left : n->rb_right; > + } > + > + return NULL; > +} > + > +static void __cache_inc_buffer(struct dm_buffer *b) > +{ > + atomic_inc(&b->hold_count); > + WRITE_ONCE(b->last_accessed, jiffies); > +} > + > +static struct dm_buffer *cache_get(struct buffer_cache *bc, sector_t > block) > +{ > + struct dm_buffer *b; > + > + cache_read_lock(bc, block); > + b = __cache_get(&bc->roots[cache_index(block)], block); > + if (b) { > + lru_reference(&b->lru); > + __cache_inc_buffer(b); > + } > + cache_read_unlock(bc, block); > + > + return b; > +} > + > +/*--------------*/ > + > +/* > + * Returns true if the hold count hits zero. > + * threadsafe > + */ > +static bool cache_put(struct buffer_cache *bc, struct dm_buffer *b) > +{ > + bool r; > + > + cache_read_lock(bc, b->block); > + BUG_ON(!atomic_read(&b->hold_count)); > + r = atomic_dec_and_test(&b->hold_count); > + cache_read_unlock(bc, b->block); > + > + return r; > +} > + > +/*--------------*/ > + > +typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *); > + > +/* > + * Evicts a buffer based on a predicate. The oldest buffer that > + * matches the predicate will be selected. In addition to the > + * predicate the hold_count of the selected buffer will be zero. > + */ > +struct evict_wrapper { > + struct lock_history *lh; > + b_predicate pred; > + void *context; > +}; > + > +/* > + * Wraps the buffer predicate turning it into an lru predicate. Adds > + * extra test for hold_count. > + */ > +static enum evict_result __evict_pred(struct lru_entry *le, void *context) > +{ > + struct evict_wrapper *w = context; > + struct dm_buffer *b = le_to_buffer(le); > + > + lh_next(w->lh, b->block); > + > + if (atomic_read(&b->hold_count)) > + return ER_DONT_EVICT; > + > + return w->pred(b, w->context); > +} > + > +static struct dm_buffer *__cache_evict(struct buffer_cache *bc, int > list_mode, > + b_predicate pred, void *context, > + struct lock_history *lh) > +{ > + struct evict_wrapper w = {.lh = lh, .pred = pred, .context = > context}; > + struct lru_entry *le; > + struct dm_buffer *b; > + > + le = lru_evict(&bc->lru[list_mode], __evict_pred, &w); > + if (!le) > + return NULL; > + > + b = le_to_buffer(le); > + /* __evict_pred will have locked the appropriate tree. */ > + rb_erase(&b->node, &bc->roots[cache_index(b->block)]); > + > + return b; > +} > + > +static struct dm_buffer *cache_evict(struct buffer_cache *bc, int > list_mode, > + b_predicate pred, void *context) > +{ > + struct dm_buffer *b; > + struct lock_history lh; > + > + lh_init(&lh, bc, true); > + b = __cache_evict(bc, list_mode, pred, context, &lh); > + lh_exit(&lh); > + > + return b; > +} > + > +/*--------------*/ > + > +/* > + * Mark a buffer as clean or dirty. Not threadsafe. > + */ > +static void cache_mark(struct buffer_cache *bc, struct dm_buffer *b, int > list_mode) > +{ > + cache_write_lock(bc, b->block); > + if (list_mode != b->list_mode) { > + lru_remove(&bc->lru[b->list_mode], &b->lru); > + b->list_mode = list_mode; > + lru_insert(&bc->lru[b->list_mode], &b->lru); > + } > + cache_write_unlock(bc, b->block); > +} > + > +/*--------------*/ > + > +/* > + * Runs through the lru associated with 'old_mode', if the predicate > matches then > + * it moves them to 'new_mode'. Not threadsafe. 
> + */ > +static void __cache_mark_many(struct buffer_cache *bc, int old_mode, int > new_mode, > + b_predicate pred, void *context, struct > lock_history *lh) > +{ > + struct lru_entry *le; > + struct dm_buffer *b; > + struct evict_wrapper w = {.lh = lh, .pred = pred, .context = > context}; > + > + while (true) { > + le = lru_evict(&bc->lru[old_mode], __evict_pred, &w); > + if (!le) > + break; > + > + b = le_to_buffer(le); > + b->list_mode = new_mode; > + lru_insert(&bc->lru[b->list_mode], &b->lru); > + } > +} > + > +static void cache_mark_many(struct buffer_cache *bc, int old_mode, int > new_mode, > + b_predicate pred, void *context) > +{ > + struct lock_history lh; > + > + lh_init(&lh, bc, true); > + __cache_mark_many(bc, old_mode, new_mode, pred, context, &lh); > + lh_exit(&lh); > +} > + > +/*--------------*/ > + > +/* > + * Iterates through all clean or dirty entries calling a function for each > + * entry. The callback may terminate the iteration early. Not > threadsafe. > + */ > + > +/* > + * Iterator functions should return one of these actions to indicate > + * how the iteration should proceed. > + */ > +enum it_action { > + IT_NEXT, > + IT_COMPLETE, > +}; > + > +typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context); > + > +static void __cache_iterate(struct buffer_cache *bc, int list_mode, > + iter_fn fn, void *context, struct lock_history > *lh) > +{ > + struct lru *lru = &bc->lru[list_mode]; > + struct lru_entry *le, *first; > + > + if (!lru->cursor) > + return; > + > + first = le = to_le(lru->cursor); > + do { > + struct dm_buffer *b = le_to_buffer(le); > + > + lh_next(lh, b->block); > + > + switch (fn(b, context)) { > + case IT_NEXT: > + break; > + > + case IT_COMPLETE: > + return; > + } > + cond_resched(); > + > + le = to_le(le->list.next); > + } while (le != first); > +} > + > +static void cache_iterate(struct buffer_cache *bc, int list_mode, > + iter_fn fn, void *context) > +{ > + struct lock_history lh; > + > + lh_init(&lh, bc, false); > + __cache_iterate(bc, list_mode, fn, context, &lh); > + lh_exit(&lh); > +} > + > +/*--------------*/ > + > +/* > + * Passes ownership of the buffer to the cache. Returns false if the > + * buffer was already present (in which case ownership does not pass). > + * eg, a race with another thread. > + * > + * Holder count should be 1 on insertion. > + * > + * Not threadsafe. > + */ > +static bool __cache_insert(struct rb_root *root, struct dm_buffer *b) > +{ > + struct rb_node **new = &root->rb_node, *parent = NULL; > + struct dm_buffer *found; > + > + while (*new) { > + found = container_of(*new, struct dm_buffer, node); > + > + if (found->block == b->block) > + return false; > + > + parent = *new; > + new = b->block < found->block ? > + &found->node.rb_left : &found->node.rb_right; > + } > + > + rb_link_node(&b->node, parent, new); > + rb_insert_color(&b->node, root); > + > + return true; > +} > + > +static bool cache_insert(struct buffer_cache *bc, struct dm_buffer *b) > +{ > + bool r; > + > + BUG_ON(b->list_mode >= LIST_SIZE); > + > + cache_write_lock(bc, b->block); > + BUG_ON(atomic_read(&b->hold_count) != 1); > + r = __cache_insert(&bc->roots[cache_index(b->block)], b); > + if (r) > + lru_insert(&bc->lru[b->list_mode], &b->lru); > + cache_write_unlock(bc, b->block); > + > + return r; > +} > + > +/*--------------*/ > + > +/* > + * Removes buffer from cache, ownership of the buffer passes back to the > caller. > + * Fails if the hold_count is not one (ie. the caller holds the only > reference). > + * > + * Not threadsafe. 
> + */ > +static bool cache_remove(struct buffer_cache *bc, struct dm_buffer *b) > +{ > + bool r; > + > + cache_write_lock(bc, b->block); > + > + if (atomic_read(&b->hold_count) != 1) > + r = false; > + > + else { > + r = true; > + rb_erase(&b->node, &bc->roots[cache_index(b->block)]); > + lru_remove(&bc->lru[b->list_mode], &b->lru); > + } > + > + cache_write_unlock(bc, b->block); > + > + return r; > +} > + > +/*--------------*/ > + > +typedef void (*b_release)(struct dm_buffer *); > + > +static struct dm_buffer *__find_next(struct rb_root *root, sector_t block) > +{ > + struct rb_node *n = root->rb_node; > + struct dm_buffer *b; > + struct dm_buffer *best = NULL; > + > + while (n) { > + b = container_of(n, struct dm_buffer, node); > + > + if (b->block == block) > + return b; > + > + if (block <= b->block) { > + n = n->rb_left; > + best = b; > + } else > + n = n->rb_right; > + } > + > + return best; > +} > + > +static void __remove_range(struct buffer_cache *bc, > + struct rb_root *root, > + sector_t begin, sector_t end, > + b_predicate pred, b_release release) > +{ > + struct dm_buffer *b; > + > + while (true) { > + cond_resched(); > + > + b = __find_next(root, begin); > + if (!b || (b->block >= end)) > + break; > + > + begin = b->block + 1; > + > + if (atomic_read(&b->hold_count)) > + continue; > + > + if (pred(b, NULL) == ER_EVICT) { > + rb_erase(&b->node, root); > + lru_remove(&bc->lru[b->list_mode], &b->lru); > + release(b); > + } > + } > +} > + > +static void cache_remove_range(struct buffer_cache *bc, > + sector_t begin, sector_t end, > + b_predicate pred, b_release release) > +{ > + unsigned int i; > + > + for (i = 0; i < NR_LOCKS; i++) { > + down_write(&bc->locks[i].lock); > + __remove_range(bc, &bc->roots[i], begin, end, pred, > release); > + up_write(&bc->locks[i].lock); > + } > +} > + > +/*----------------------------------------------------------------*/ > + > /* > * Linking of buffers: > - * All buffers are linked to buffer_tree with their node field. > + * All buffers are linked to buffer_cache with their node field. > * > * Clean buffers that are not being written (B_WRITING not set) > * are linked to lru[LIST_CLEAN] with their lru_list field. > @@ -136,9 +956,6 @@ struct dm_bufio_client { > spinlock_t spinlock; > bool no_sleep; > > - struct list_head lru[LIST_SIZE]; > - unsigned long n_buffers[LIST_SIZE]; > - > struct block_device *bdev; > unsigned int block_size; > s8 sectors_per_block_bits; > @@ -153,7 +970,7 @@ struct dm_bufio_client { > > unsigned int minimum_buffers; > > - struct rb_root buffer_tree; > + struct buffer_cache cache; > wait_queue_head_t free_buffer_wait; > > sector_t start; > @@ -165,6 +982,11 @@ struct dm_bufio_client { > struct shrinker shrinker; > struct work_struct shrink_work; > atomic_long_t need_shrink; > + > + /* > + * Used by global_cleanup to sort the clients list. 
> + */ > + unsigned long oldest_buffer; > }; > > static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); > @@ -181,14 +1003,6 @@ static void dm_bufio_lock(struct dm_bufio_client *c) > mutex_lock_nested(&c->lock, dm_bufio_in_request()); > } > > -static int dm_bufio_trylock(struct dm_bufio_client *c) > -{ > - if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) > - return spin_trylock_bh(&c->spinlock); > - else > - return mutex_trylock(&c->lock); > -} > - > static void dm_bufio_unlock(struct dm_bufio_client *c) > { > if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) > @@ -217,10 +1031,6 @@ static unsigned long dm_bufio_cache_size_latch; > > static DEFINE_SPINLOCK(global_spinlock); > > -static LIST_HEAD(global_queue); > - > -static unsigned long global_num; > - > /* > * Buffers are freed after this timeout > */ > @@ -254,7 +1064,6 @@ static struct workqueue_struct *dm_bufio_wq; > static struct delayed_work dm_bufio_cleanup_old_work; > static struct work_struct dm_bufio_replacement_work; > > - > #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING > static void buffer_record_stack(struct dm_buffer *b) > { > @@ -262,78 +1071,6 @@ static void buffer_record_stack(struct dm_buffer *b) > } > #endif > > -/* > - *---------------------------------------------------------------- > - * A red/black tree acts as an index for all the buffers. > - *---------------------------------------------------------------- > - */ > -static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) > -{ > - struct rb_node *n = c->buffer_tree.rb_node; > - struct dm_buffer *b; > - > - while (n) { > - b = container_of(n, struct dm_buffer, node); > - > - if (b->block == block) > - return b; > - > - n = block < b->block ? n->rb_left : n->rb_right; > - } > - > - return NULL; > -} > - > -static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t > block) > -{ > - struct rb_node *n = c->buffer_tree.rb_node; > - struct dm_buffer *b; > - struct dm_buffer *best = NULL; > - > - while (n) { > - b = container_of(n, struct dm_buffer, node); > - > - if (b->block == block) > - return b; > - > - if (block <= b->block) { > - n = n->rb_left; > - best = b; > - } else { > - n = n->rb_right; > - } > - } > - > - return best; > -} > - > -static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) > -{ > - struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; > - struct dm_buffer *found; > - > - while (*new) { > - found = container_of(*new, struct dm_buffer, node); > - > - if (found->block == b->block) { > - BUG_ON(found != b); > - return; > - } > - > - parent = *new; > - new = b->block < found->block ? 
> - &found->node.rb_left : &found->node.rb_right; > - } > - > - rb_link_node(&b->node, parent, new); > - rb_insert_color(&b->node, &c->buffer_tree); > -} > - > -static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) > -{ > - rb_erase(&b->node, &c->buffer_tree); > -} > - > /*----------------------------------------------------------------*/ > > static void adjust_total_allocated(struct dm_buffer *b, bool unlink) > @@ -361,16 +1098,9 @@ static void adjust_total_allocated(struct dm_buffer > *b, bool unlink) > if (dm_bufio_current_allocated > dm_bufio_peak_allocated) > dm_bufio_peak_allocated = dm_bufio_current_allocated; > > - b->accessed = 1; > - > if (!unlink) { > - list_add(&b->global_list, &global_queue); > - global_num++; > if (dm_bufio_current_allocated > dm_bufio_cache_size) > queue_work(dm_bufio_wq, > &dm_bufio_replacement_work); > - } else { > - list_del(&b->global_list); > - global_num--; > } > > spin_unlock(&global_spinlock); > @@ -486,8 +1216,9 @@ static void free_buffer_data(struct dm_bufio_client > *c, > */ > static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t > gfp_mask) > { > - struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask); > + struct dm_buffer *b; > > + b = kmem_cache_alloc(c->slab_buffer, gfp_mask); > if (!b) > return NULL; > > @@ -498,6 +1229,7 @@ static struct dm_buffer *alloc_buffer(struct > dm_bufio_client *c, gfp_t gfp_mask) > kmem_cache_free(c->slab_buffer, b); > return NULL; > } > + adjust_total_allocated(b, false); > > #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING > b->stack_len = 0; > @@ -512,61 +1244,11 @@ static void free_buffer(struct dm_buffer *b) > { > struct dm_bufio_client *c = b->c; > > + adjust_total_allocated(b, true); > free_buffer_data(c, b->data, b->data_mode); > kmem_cache_free(c->slab_buffer, b); > } > > -/* > - * Link buffer to the buffer tree and clean or dirty queue. > - */ > -static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) > -{ > - struct dm_bufio_client *c = b->c; > - > - c->n_buffers[dirty]++; > - b->block = block; > - b->list_mode = dirty; > - list_add(&b->lru_list, &c->lru[dirty]); > - __insert(b->c, b); > - b->last_accessed = jiffies; > - > - adjust_total_allocated(b, false); > -} > - > -/* > - * Unlink buffer from the buffer tree and dirty or clean queue. > - */ > -static void __unlink_buffer(struct dm_buffer *b) > -{ > - struct dm_bufio_client *c = b->c; > - > - BUG_ON(!c->n_buffers[b->list_mode]); > - > - c->n_buffers[b->list_mode]--; > - __remove(b->c, b); > - list_del(&b->lru_list); > - > - adjust_total_allocated(b, true); > -} > - > -/* > - * Place the buffer to the head of dirty or clean LRU queue. > - */ > -static void __relink_lru(struct dm_buffer *b, int dirty) > -{ > - struct dm_bufio_client *c = b->c; > - > - b->accessed = 1; > - > - BUG_ON(!c->n_buffers[b->list_mode]); > - > - c->n_buffers[b->list_mode]--; > - c->n_buffers[dirty]++; > - b->list_mode = dirty; > - list_move(&b->lru_list, &c->lru[dirty]); > - b->last_accessed = jiffies; > -} > - > /* > > *-------------------------------------------------------------------------- > * Submit I/O on the buffer. 
> @@ -806,7 +1488,7 @@ static void __flush_write_list(struct list_head > *write_list) > */ > static void __make_buffer_clean(struct dm_buffer *b) > { > - BUG_ON(b->hold_count); > + BUG_ON(atomic_read(&b->hold_count)); > > /* smp_load_acquire() pairs with read_endio()'s > smp_mb__before_atomic() */ > if (!smp_load_acquire(&b->state)) /* fast case */ > @@ -817,6 +1499,26 @@ static void __make_buffer_clean(struct dm_buffer *b) > wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); > } > > +static enum evict_result is_clean(struct dm_buffer *b, void *context) > +{ > + struct dm_bufio_client *c = context; > + > + BUG_ON(test_bit(B_WRITING, &b->state)); > + BUG_ON(test_bit(B_DIRTY, &b->state)); > + > + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep && > + unlikely(test_bit(B_READING, &b->state))) > + return ER_DONT_EVICT; > + > + return ER_EVICT; > +} > + > +static enum evict_result is_dirty(struct dm_buffer *b, void *context) > +{ > + BUG_ON(test_bit(B_READING, &b->state)); > + return ER_EVICT; > +} > + > /* > * Find some buffer that is not held by anybody, clean it, unlink it and > * return it. > @@ -825,34 +1527,20 @@ static struct dm_buffer > *__get_unclaimed_buffer(struct dm_bufio_client *c) > { > struct dm_buffer *b; > > - list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) { > - BUG_ON(test_bit(B_WRITING, &b->state)); > - BUG_ON(test_bit(B_DIRTY, &b->state)); > - > - if (static_branch_unlikely(&no_sleep_enabled) && > c->no_sleep && > - unlikely(test_bit_acquire(B_READING, &b->state))) > - continue; > - > - if (!b->hold_count) { > - __make_buffer_clean(b); > - __unlink_buffer(b); > - return b; > - } > - cond_resched(); > + b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c); > + if (b) { > + /* this also waits for pending reads */ > + __make_buffer_clean(b); > + return b; > } > > if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) > return NULL; > > - list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { > - BUG_ON(test_bit(B_READING, &b->state)); > - > - if (!b->hold_count) { > - __make_buffer_clean(b); > - __unlink_buffer(b); > - return b; > - } > - cond_resched(); > + b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL); > + if (b) { > + __make_buffer_clean(b); > + return b; > } > > return NULL; > @@ -873,7 +1561,12 @@ static void __wait_for_free_buffer(struct > dm_bufio_client *c) > set_current_state(TASK_UNINTERRUPTIBLE); > dm_bufio_unlock(c); > > - io_schedule(); > + /* > + * It's possible to miss a wake up event since we don't always > + * hold c->lock when wake_up is called. So we have a timeout here, > + * just in case. 
> + */ > + io_schedule_timeout(5 * HZ); > > remove_wait_queue(&c->free_buffer_wait, &wait); > > @@ -931,9 +1624,8 @@ static struct dm_buffer > *__alloc_buffer_wait_no_callback(struct dm_bufio_client > } > > if (!list_empty(&c->reserved_buffers)) { > - b = list_entry(c->reserved_buffers.next, > - struct dm_buffer, lru_list); > - list_del(&b->lru_list); > + b = list_to_buffer(c->reserved_buffers.next); > + list_del(&b->lru.list); > c->need_reserved_buffers++; > > return b; > @@ -967,36 +1659,55 @@ static void __free_buffer_wake(struct dm_buffer *b) > { > struct dm_bufio_client *c = b->c; > > + b->block = -1; > if (!c->need_reserved_buffers) > free_buffer(b); > else { > - list_add(&b->lru_list, &c->reserved_buffers); > + list_add(&b->lru.list, &c->reserved_buffers); > c->need_reserved_buffers--; > } > > wake_up(&c->free_buffer_wait); > } > > +static enum evict_result cleaned(struct dm_buffer *b, void *context) > +{ > + BUG_ON(test_bit(B_READING, &b->state)); > + > + if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state)) > + return ER_DONT_EVICT; > + else > + return ER_EVICT; > +} > + > +static void __move_clean_buffers(struct dm_bufio_client *c) > +{ > + cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL); > +} > + > +struct write_context { > + int no_wait; > + struct list_head *write_list; > +}; > + > +static enum it_action write_one(struct dm_buffer *b, void *context) > +{ > + struct write_context *wc = context; > + > + if (wc->no_wait && test_bit(B_WRITING, &b->state)) > + return IT_COMPLETE; > + > + __write_dirty_buffer(b, wc->write_list); > + return IT_NEXT; > +} > + > static void __write_dirty_buffers_async(struct dm_bufio_client *c, int > no_wait, > struct list_head *write_list) > { > - struct dm_buffer *b, *tmp; > + struct write_context wc = {.no_wait = no_wait, .write_list = > write_list}; > > - list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], > lru_list) { > - BUG_ON(test_bit(B_READING, &b->state)); > - > - if (!test_bit(B_DIRTY, &b->state) && > - !test_bit(B_WRITING, &b->state)) { > - __relink_lru(b, LIST_CLEAN); > - continue; > - } > - > - if (no_wait && test_bit(B_WRITING, &b->state)) > - return; > - > - __write_dirty_buffer(b, write_list); > - cond_resched(); > - } > + __move_clean_buffers(c); > + cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc); > } > > /* > @@ -1007,7 +1718,8 @@ static void __write_dirty_buffers_async(struct > dm_bufio_client *c, int no_wait, > static void __check_watermark(struct dm_bufio_client *c, > struct list_head *write_list) > { > - if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * > DM_BUFIO_WRITEBACK_RATIO) > + if (cache_count(&c->cache, LIST_DIRTY) > > + cache_count(&c->cache, LIST_CLEAN) * > DM_BUFIO_WRITEBACK_RATIO) > __write_dirty_buffers_async(c, 1, write_list); > } > > @@ -1017,20 +1729,30 @@ static void __check_watermark(struct > dm_bufio_client *c, > *-------------------------------------------------------------- > */ > > +static void cache_put_and_wake(struct dm_bufio_client *c, struct > dm_buffer *b) > +{ > + /* > + * Relying on waitqueue_active() is racey, but we sleep > + * with schedule_timeout anyway. > + */ > + if (cache_put(&c->cache, b) && > + unlikely(waitqueue_active(&c->free_buffer_wait))) > + wake_up(&c->free_buffer_wait); > +} > + > +/* > + * This assumes you have already checked the cache to see if the buffer > + * is already present (it will recheck after dropping the lock for > allocation). 
> + */ > static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t > block, > enum new_flag nf, int *need_submit, > struct list_head *write_list) > { > struct dm_buffer *b, *new_b = NULL; > - > *need_submit = 0; > > - b = __find(c, block); > - if (b) > - goto found_buffer; > - > - if (nf == NF_GET) > - return NULL; > + /* This can't be called with NF_GET */ > + BUG_ON(nf == NF_GET); > > new_b = __alloc_buffer_wait(c, nf); > if (!new_b) > @@ -1040,7 +1762,7 @@ static struct dm_buffer *__bufio_new(struct > dm_bufio_client *c, sector_t block, > * We've had a period where the mutex was unlocked, so need to > * recheck the buffer tree. > */ > - b = __find(c, block); > + b = cache_get(&c->cache, block); > if (b) { > __free_buffer_wake(new_b); > goto found_buffer; > @@ -1049,24 +1771,35 @@ static struct dm_buffer *__bufio_new(struct > dm_bufio_client *c, sector_t block, > __check_watermark(c, write_list); > > b = new_b; > - b->hold_count = 1; > + atomic_set(&b->hold_count, 1); > + WRITE_ONCE(b->last_accessed, jiffies); > + b->block = block; > b->read_error = 0; > b->write_error = 0; > - __link_buffer(b, block, LIST_CLEAN); > + b->list_mode = LIST_CLEAN; > > - if (nf == NF_FRESH) { > + if (nf == NF_FRESH) > b->state = 0; > - return b; > + else { > + b->state = 1 << B_READING; > + *need_submit = 1; > } > > - b->state = 1 << B_READING; > - *need_submit = 1; > + /* > + * We mustn't insert into the cache until the B_READING state > + * is set. Otherwise another thread could get it and use > + * it before it had been read. > + */ > + cache_insert(&c->cache, b); > > return b; > > found_buffer: > - if (nf == NF_PREFETCH) > + if (nf == NF_PREFETCH) { > + cache_put_and_wake(c, b); > return NULL; > + } > + > /* > * Note: it is essential that we don't wait for the buffer to be > * read if dm_bufio_get function is used. Both dm_bufio_get and > @@ -1074,12 +1807,11 @@ static struct dm_buffer *__bufio_new(struct > dm_bufio_client *c, sector_t block, > * If the user called both dm_bufio_prefetch and dm_bufio_get on > * the same buffer, it would deadlock if we waited. > */ > - if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, > &b->state))) > + if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, > &b->state))) { > + cache_put_and_wake(c, b); > return NULL; > + } > > - b->hold_count++; > - __relink_lru(b, test_bit(B_DIRTY, &b->state) || > - test_bit(B_WRITING, &b->state)); > return b; > } > > @@ -1109,18 +1841,49 @@ static void read_endio(struct dm_buffer *b, > blk_status_t status) > static void *new_read(struct dm_bufio_client *c, sector_t block, > enum new_flag nf, struct dm_buffer **bp) > { > - int need_submit; > + int need_submit = 0; > struct dm_buffer *b; > > LIST_HEAD(write_list); > + *bp = NULL; > + > + /* > + * Fast path, hopefully the block is already in the cache. No need > + * to get the client lock for this. > + */ > + b = cache_get(&c->cache, block); > + if (b) { > + if (nf == NF_PREFETCH) { > + cache_put_and_wake(c, b); > + return NULL; > + } > + > + /* > + * Note: it is essential that we don't wait for the buffer > to be > + * read if dm_bufio_get function is used. Both > dm_bufio_get and > + * dm_bufio_prefetch can be used in the driver request > routine. > + * If the user called both dm_bufio_prefetch and > dm_bufio_get on > + * the same buffer, it would deadlock if we waited. 
> + */ > + if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, > &b->state))) { > + cache_put_and_wake(c, b); > + return NULL; > + } > + } > + > + if (!b) { > + if (nf == NF_GET) > + return NULL; > + > + dm_bufio_lock(c); > + b = __bufio_new(c, block, nf, &need_submit, &write_list); > + dm_bufio_unlock(c); > + } > > - dm_bufio_lock(c); > - b = __bufio_new(c, block, nf, &need_submit, &write_list); > #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING > - if (b && b->hold_count == 1) > + if (b && (atomic_read(&b->hold_count) == 1)) > buffer_record_stack(b); > #endif > - dm_bufio_unlock(c); > > __flush_write_list(&write_list); > > @@ -1141,7 +1904,6 @@ static void *new_read(struct dm_bufio_client *c, > sector_t block, > } > > *bp = b; > - > return b->data; > } > > @@ -1156,7 +1918,6 @@ void *dm_bufio_read(struct dm_bufio_client *c, > sector_t block, > struct dm_buffer **bp) > { > BUG_ON(dm_bufio_in_request()); > - > return new_read(c, block, NF_READ, bp); > } > EXPORT_SYMBOL_GPL(dm_bufio_read); > @@ -1165,7 +1926,6 @@ void *dm_bufio_new(struct dm_bufio_client *c, > sector_t block, > struct dm_buffer **bp) > { > BUG_ON(dm_bufio_in_request()); > - > return new_read(c, block, NF_FRESH, bp); > } > EXPORT_SYMBOL_GPL(dm_bufio_new); > @@ -1180,12 +1940,19 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, > BUG_ON(dm_bufio_in_request()); > > blk_start_plug(&plug); > - dm_bufio_lock(c); > > for (; n_blocks--; block++) { > int need_submit; > struct dm_buffer *b; > > + b = cache_get(&c->cache, block); > + if (b) { > + /* already in cache */ > + cache_put_and_wake(c, b); > + continue; > + } > + > + dm_bufio_lock(c); > b = __bufio_new(c, block, NF_PREFETCH, &need_submit, > &write_list); > if (unlikely(!list_empty(&write_list))) { > @@ -1195,6 +1962,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, > blk_start_plug(&plug); > dm_bufio_lock(c); > } > + > if (unlikely(b != NULL)) { > dm_bufio_unlock(c); > > @@ -1208,10 +1976,9 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, > goto flush_plug; > dm_bufio_lock(c); > } > + dm_bufio_unlock(c); > } > > - dm_bufio_unlock(c); > - > flush_plug: > blk_finish_plug(&plug); > } > @@ -1221,29 +1988,28 @@ void dm_bufio_release(struct dm_buffer *b) > { > struct dm_bufio_client *c = b->c; > > - dm_bufio_lock(c); > + /* > + * If there were errors on the buffer, and the buffer is not > + * to be written, free the buffer. There is no point in caching > + * invalid buffer. > + */ > + if ((b->read_error || b->write_error) && > + !test_bit_acquire(B_READING, &b->state) && > + !test_bit(B_WRITING, &b->state) && > + !test_bit(B_DIRTY, &b->state)) { > + dm_bufio_lock(c); > > - BUG_ON(!b->hold_count); > - > - b->hold_count--; > - if (!b->hold_count) { > - wake_up(&c->free_buffer_wait); > - > - /* > - * If there were errors on the buffer, and the buffer is > not > - * to be written, free the buffer. There is no point in > caching > - * invalid buffer. 
> - */ > - if ((b->read_error || b->write_error) && > - !test_bit_acquire(B_READING, &b->state) && > - !test_bit(B_WRITING, &b->state) && > - !test_bit(B_DIRTY, &b->state)) { > - __unlink_buffer(b); > + /* cache remove can fail if there are other holders */ > + if (cache_remove(&c->cache, b)) { > __free_buffer_wake(b); > + dm_bufio_unlock(c); > + return; > } > + > + dm_bufio_unlock(c); > } > > - dm_bufio_unlock(c); > + cache_put_and_wake(c, b); > } > EXPORT_SYMBOL_GPL(dm_bufio_release); > > @@ -1262,7 +2028,7 @@ void dm_bufio_mark_partial_buffer_dirty(struct > dm_buffer *b, > if (!test_and_set_bit(B_DIRTY, &b->state)) { > b->dirty_start = start; > b->dirty_end = end; > - __relink_lru(b, LIST_DIRTY); > + cache_mark(&c->cache, b, LIST_DIRTY); > } else { > if (start < b->dirty_start) > b->dirty_start = start; > @@ -1300,11 +2066,19 @@ > EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); > * > * Finally, we flush hardware disk cache. > */ > +static bool is_writing(struct lru_entry *e, void *context) > +{ > + struct dm_buffer *b = le_to_buffer(e); > + > + return test_bit(B_WRITING, &b->state); > +} > + > int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) > { > int a, f; > - unsigned long buffers_processed = 0; > - struct dm_buffer *b, *tmp; > + unsigned long nr_buffers; > + struct lru_entry *e; > + struct lru_iter it; > > LIST_HEAD(write_list); > > @@ -1314,52 +2088,32 @@ int dm_bufio_write_dirty_buffers(struct > dm_bufio_client *c) > __flush_write_list(&write_list); > dm_bufio_lock(c); > > -again: > - list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], > lru_list) { > - int dropped_lock = 0; > - > - if (buffers_processed < c->n_buffers[LIST_DIRTY]) > - buffers_processed++; > + nr_buffers = cache_count(&c->cache, LIST_DIRTY); > + lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it); > + while ((e = lru_iter_next(&it, is_writing, c))) { > + struct dm_buffer *b = le_to_buffer(e); > + __cache_inc_buffer(b); > > BUG_ON(test_bit(B_READING, &b->state)); > > - if (test_bit(B_WRITING, &b->state)) { > - if (buffers_processed < c->n_buffers[LIST_DIRTY]) { > - dropped_lock = 1; > - b->hold_count++; > - dm_bufio_unlock(c); > - wait_on_bit_io(&b->state, B_WRITING, > - TASK_UNINTERRUPTIBLE); > - dm_bufio_lock(c); > - b->hold_count--; > - } else > - wait_on_bit_io(&b->state, B_WRITING, > - TASK_UNINTERRUPTIBLE); > - } > + if (nr_buffers) { > + nr_buffers--; > + dm_bufio_unlock(c); > + wait_on_bit_io(&b->state, B_WRITING, > TASK_UNINTERRUPTIBLE); > + dm_bufio_lock(c); > + } else > + wait_on_bit_io(&b->state, B_WRITING, > TASK_UNINTERRUPTIBLE); > > - if (!test_bit(B_DIRTY, &b->state) && > - !test_bit(B_WRITING, &b->state)) > - __relink_lru(b, LIST_CLEAN); > + if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, > &b->state)) > + cache_mark(&c->cache, b, LIST_CLEAN); > + > + cache_put_and_wake(c, b); > > cond_resched(); > - > - /* > - * If we dropped the lock, the list is no longer > consistent, > - * so we must restart the search. > - * > - * In the most common case, the buffer just processed is > - * relinked to the clean list, so we won't loop scanning > the > - * same buffer again and again. > - * > - * This may livelock if there is another thread > simultaneously > - * dirtying buffers, so we count the number of buffers > walked > - * and if it exceeds the total number of buffers, it means > that > - * someone is doing some writes simultaneously with us. In > - * this case, stop, dropping the lock. 
> - */ > - if (dropped_lock) > - goto again; > } > + > + lru_iter_end(&it); > + > wake_up(&c->free_buffer_wait); > dm_bufio_unlock(c); > > @@ -1418,12 +2172,22 @@ int dm_bufio_issue_discard(struct dm_bufio_client > *c, sector_t block, sector_t c > } > EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); > > -static void forget_buffer_locked(struct dm_buffer *b) > +static bool forget_buffer(struct dm_bufio_client *c, sector_t block) > { > - if (likely(!b->hold_count) && > likely(!smp_load_acquire(&b->state))) { > - __unlink_buffer(b); > - __free_buffer_wake(b); > + struct dm_buffer *b; > + > + b = cache_get(&c->cache, block); > + if (b) { > + if (likely(!smp_load_acquire(&b->state))) { > + if (cache_remove(&c->cache, b)) > + __free_buffer_wake(b); > + else > + cache_put_and_wake(c, b); > + } else > + cache_put_and_wake(c, b); > } > + > + return b ? true : false; > } > > /* > @@ -1434,38 +2198,22 @@ static void forget_buffer_locked(struct dm_buffer > *b) > */ > void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) > { > - struct dm_buffer *b; > - > dm_bufio_lock(c); > - > - b = __find(c, block); > - if (b) > - forget_buffer_locked(b); > - > + forget_buffer(c, block); > dm_bufio_unlock(c); > } > EXPORT_SYMBOL_GPL(dm_bufio_forget); > > +static enum evict_result idle(struct dm_buffer *b, void *context) > +{ > + return b->state ? ER_DONT_EVICT : ER_EVICT; > +} > + > void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, > sector_t n_blocks) > { > - struct dm_buffer *b; > - sector_t end_block = block + n_blocks; > - > - while (block < end_block) { > - dm_bufio_lock(c); > - > - b = __find_next(c, block); > - if (b) { > - block = b->block + 1; > - forget_buffer_locked(b); > - } > - > - dm_bufio_unlock(c); > - > - if (!b) > - break; > - } > - > + dm_bufio_lock(c); > + cache_remove_range(&c->cache, block, block + n_blocks, idle, > __free_buffer_wake); > + dm_bufio_unlock(c); > } > EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers); > > @@ -1527,11 +2275,26 @@ struct dm_bufio_client *dm_bufio_get_client(struct > dm_buffer *b) > } > EXPORT_SYMBOL_GPL(dm_bufio_get_client); > > +static enum it_action warn_leak(struct dm_buffer *b, void *context) > +{ > + bool *warned = context; > + > + WARN_ON(!(*warned)); > + *warned = true; > + DMERR("leaked buffer %llx, hold count %u, list %d", > + (unsigned long long)b->block, atomic_read(&b->hold_count), > b->list_mode); > +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING > + stack_trace_print(b->stack_entries, b->stack_len, 1); > + /* mark unclaimed to avoid BUG_ON below */ > + atomic_set(&b->hold_count, 0); > +#endif > + return IT_NEXT; > +} > + > static void drop_buffers(struct dm_bufio_client *c) > { > - struct dm_buffer *b; > int i; > - bool warned = false; > + struct dm_buffer *b; > > BUG_ON(dm_bufio_in_request()); > > @@ -1545,18 +2308,11 @@ static void drop_buffers(struct dm_bufio_client *c) > while ((b = __get_unclaimed_buffer(c))) > __free_buffer_wake(b); > > - for (i = 0; i < LIST_SIZE; i++) > - list_for_each_entry(b, &c->lru[i], lru_list) { > - WARN_ON(!warned); > - warned = true; > - DMERR("leaked buffer %llx, hold count %u, list %d", > - (unsigned long long)b->block, b->hold_count, > i); > -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING > - stack_trace_print(b->stack_entries, b->stack_len, > 1); > - /* mark unclaimed to avoid BUG_ON below */ > - b->hold_count = 0; > -#endif > - } > + for (i = 0; i < LIST_SIZE; i++) { > + bool warned = false; > + > + cache_iterate(&c->cache, i, warn_leak, &warned); > + } > > #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING > 
while ((b = __get_unclaimed_buffer(c))) > @@ -1564,39 +2320,11 @@ static void drop_buffers(struct dm_bufio_client *c) > #endif > > for (i = 0; i < LIST_SIZE; i++) > - BUG_ON(!list_empty(&c->lru[i])); > + BUG_ON(cache_count(&c->cache, i)); > > dm_bufio_unlock(c); > } > > -/* > - * We may not be able to evict this buffer if IO pending or the client > - * is still using it. Caller is expected to know buffer is too old. > - * > - * And if GFP_NOFS is used, we must not do any I/O because we hold > - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets > - * rerouted to different bufio client. > - */ > -static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) > -{ > - if (!(gfp & __GFP_FS) || > - (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) > { > - if (test_bit_acquire(B_READING, &b->state) || > - test_bit(B_WRITING, &b->state) || > - test_bit(B_DIRTY, &b->state)) > - return false; > - } > - > - if (b->hold_count) > - return false; > - > - __make_buffer_clean(b); > - __unlink_buffer(b); > - __free_buffer_wake(b); > - > - return true; > -} > - > static unsigned long get_retain_buffers(struct dm_bufio_client *c) > { > unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); > @@ -1612,22 +2340,30 @@ static unsigned long get_retain_buffers(struct > dm_bufio_client *c) > static void __scan(struct dm_bufio_client *c) > { > int l; > - struct dm_buffer *b, *tmp; > + struct dm_buffer *b; > unsigned long freed = 0; > - unsigned long count = c->n_buffers[LIST_CLEAN] + > - c->n_buffers[LIST_DIRTY]; > unsigned long retain_target = get_retain_buffers(c); > + unsigned long count = cache_total(&c->cache); > > for (l = 0; l < LIST_SIZE; l++) { > - list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], > lru_list) { > + while (true) { > if (count - freed <= retain_target) > atomic_long_set(&c->need_shrink, 0); > + > if (!atomic_long_read(&c->need_shrink)) > - return; > - if (__try_evict_buffer(b, GFP_KERNEL)) { > - atomic_long_dec(&c->need_shrink); > - freed++; > - } > + break; > + > + b = cache_evict(&c->cache, l, > + l == LIST_CLEAN ? 
>                                 is_clean : is_dirty, c);
> +                if (!b)
> +                        break;
> +
> +                BUG_ON(b->list_mode != l);
> +                __make_buffer_clean(b);
> +                __free_buffer_wake(b);
> +
> +                atomic_long_dec(&c->need_shrink);
> +                freed++;
>                  cond_resched();
>          }
>  }
> @@ -1656,8 +2392,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink
>  static unsigned long dm_bufio_shrink_count(struct shrinker *shrink,
>                                             struct shrink_control *sc)
>  {
>          struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
> -        unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
> -                              READ_ONCE(c->n_buffers[LIST_DIRTY]);
> +        unsigned long count = cache_total(&c->cache);
>          unsigned long retain_target = get_retain_buffers(c);
>          unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
>
> @@ -1685,7 +2420,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
>  {
>          int r;
>          struct dm_bufio_client *c;
> -        unsigned int i;
>          char slab_name[27];
>
>          if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
> @@ -1699,7 +2433,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
>                  r = -ENOMEM;
>                  goto bad_client;
>          }
> -        c->buffer_tree = RB_ROOT;
> +        cache_init(&c->cache);
>
>          c->bdev = bdev;
>          c->block_size = block_size;
> @@ -1716,11 +2450,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
>                  static_branch_inc(&no_sleep_enabled);
>          }
>
> -        for (i = 0; i < LIST_SIZE; i++) {
> -                INIT_LIST_HEAD(&c->lru[i]);
> -                c->n_buffers[i] = 0;
> -        }
> -
>          mutex_init(&c->lock);
>          spin_lock_init(&c->spinlock);
>          INIT_LIST_HEAD(&c->reserved_buffers);
> @@ -1792,9 +2521,9 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
>
>  bad:
>          while (!list_empty(&c->reserved_buffers)) {
> -                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
> -                                                 struct dm_buffer, lru_list);
> -                list_del(&b->lru_list);
> +                struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
> +
> +                list_del(&b->lru.list);
>                  free_buffer(b);
>          }
>          kmem_cache_destroy(c->slab_cache);
> @@ -1831,23 +2560,23 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
>
>          mutex_unlock(&dm_bufio_clients_lock);
>
> -        BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
>          BUG_ON(c->need_reserved_buffers);
>
>          while (!list_empty(&c->reserved_buffers)) {
> -                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
> -                                                 struct dm_buffer, lru_list);
> -                list_del(&b->lru_list);
> +                struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
> +
> +                list_del(&b->lru.list);
>                  free_buffer(b);
>          }
>
>          for (i = 0; i < LIST_SIZE; i++)
> -                if (c->n_buffers[i])
> -                        DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
> +                if (cache_count(&c->cache, i))
> +                        DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));
>
>          for (i = 0; i < LIST_SIZE; i++)
> -                BUG_ON(c->n_buffers[i]);
> +                BUG_ON(cache_count(&c->cache, i));
>
> +        cache_destroy(&c->cache);
>          kmem_cache_destroy(c->slab_cache);
>          kmem_cache_destroy(c->slab_buffer);
>          dm_io_client_destroy(c->dm_io);
> @@ -1864,6 +2593,8 @@ void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
>  }
>  EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
>
> +/*--------------------------------------------------------------*/
> +
>  static unsigned int get_max_age_hz(void)
>  {
>          unsigned int max_age = READ_ONCE(dm_bufio_max_age);
> @@ -1876,13 +2607,74 @@ static unsigned int get_max_age_hz(void)
>
>  static bool older_than(struct dm_buffer *b, unsigned long age_hz)
>  {
> -        return time_after_eq(jiffies, b->last_accessed + age_hz);
> +        return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
>  }
>
> -static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
> +struct evict_params {
> +        gfp_t gfp;
> +        unsigned long age_hz;
> +
> +        /*
> +         * This gets updated with the largest last_accessed (ie. most
> +         * recently used) of the evicted buffers. It will not be reinitialised
> +         * by __evict_many(), so you can use it across multiple invocations.
> +         */
> +        unsigned long last_accessed;
> +};
> +
> +/*
> + * We may not be able to evict this buffer if IO pending or the client
> + * is still using it.
> + *
> + * And if GFP_NOFS is used, we must not do any I/O because we hold
> + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
> + * rerouted to different bufio client.
> + */
> +static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
> +{
> +        struct evict_params *params = context;
> +
> +        if (!(params->gfp & __GFP_FS) ||
> +            (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
> +                if (test_bit_acquire(B_READING, &b->state) ||
> +                    test_bit(B_WRITING, &b->state) ||
> +                    test_bit(B_DIRTY, &b->state))
> +                        return ER_DONT_EVICT;
> +        }
> +
> +        return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
> +}
> +
> +static unsigned int __evict_many(struct dm_bufio_client *c,
> +                                 struct evict_params *params,
> +                                 int list_mode, unsigned int max_count)
> +{
> +        unsigned int count;
> +        unsigned long last_accessed;
> +        struct dm_buffer *b;
> +
> +        for (count = 0; count < max_count; count++) {
> +                b = cache_evict(&c->cache, list_mode, select_for_evict, params);
> +                if (!b)
> +                        break;
> +
> +                last_accessed = READ_ONCE(b->last_accessed);
> +                if (time_after_eq(params->last_accessed, last_accessed))
> +                        params->last_accessed = last_accessed;
> +
> +                __make_buffer_clean(b);
> +                __free_buffer_wake(b);
> +
> +                cond_resched();
> +        }
> +
> +        return count;
> +}
> +
> +static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
>  {
> -        struct dm_buffer *b, *tmp;
> -        unsigned long retain_target = get_retain_buffers(c);
> +        struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
> +        unsigned long retain = get_retain_buffers(c);
>          unsigned long count;
>          LIST_HEAD(write_list);
>
> @@ -1895,91 +2687,13 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
>                  dm_bufio_lock(c);
>          }
>
> -        count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
> -        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
> -                if (count <= retain_target)
> -                        break;
> -
> -                if (!older_than(b, age_hz))
> -                        break;
> -
> -                if (__try_evict_buffer(b, 0))
> -                        count--;
> -
> -                cond_resched();
> -        }
> +        count = cache_total(&c->cache);
> +        if (count > retain)
> +                __evict_many(c, &params, LIST_CLEAN, count - retain);
>
>          dm_bufio_unlock(c);
>  }
>
> -static void do_global_cleanup(struct work_struct *w)
> -{
> -        struct dm_bufio_client *locked_client = NULL;
> -        struct dm_bufio_client *current_client;
> -        struct dm_buffer *b;
> -        unsigned int spinlock_hold_count;
> -        unsigned long threshold = dm_bufio_cache_size -
> -                dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
> -        unsigned long loops = global_num * 2;
> -
> -        mutex_lock(&dm_bufio_clients_lock);
> -
> -        while (1) {
> -                cond_resched();
> -
> -                spin_lock(&global_spinlock);
> -                if (unlikely(dm_bufio_current_allocated <= threshold))
> -                        break;
> -
> -                spinlock_hold_count = 0;
> -get_next:
> -                if (!loops--)
> -                        break;
> -                if (unlikely(list_empty(&global_queue)))
> -                        break;
> -                b = list_entry(global_queue.prev, struct dm_buffer, global_list);
> -
> -                if (b->accessed) {
> -                        b->accessed = 0;
> -                        list_move(&b->global_list, &global_queue);
> -                        if (likely(++spinlock_hold_count < 16))
> -                                goto get_next;
> -                        spin_unlock(&global_spinlock);
> -                        continue;
> -                }
> -
> -                current_client = b->c;
> -                if (unlikely(current_client != locked_client)) {
> -                        if (locked_client)
> -                                dm_bufio_unlock(locked_client);
> -
> -                        if (!dm_bufio_trylock(current_client)) {
> -                                spin_unlock(&global_spinlock);
> -                                dm_bufio_lock(current_client);
> -                                locked_client = current_client;
> -                                continue;
> -                        }
> -
> -                        locked_client = current_client;
> -                }
> -
> -                spin_unlock(&global_spinlock);
> -
> -                if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
> -                        spin_lock(&global_spinlock);
> -                        list_move(&b->global_list, &global_queue);
> -                        spin_unlock(&global_spinlock);
> -                }
> -        }
> -
> -        spin_unlock(&global_spinlock);
> -
> -        if (locked_client)
> -                dm_bufio_unlock(locked_client);
> -
> -        mutex_unlock(&dm_bufio_clients_lock);
> -}
> -
>  static void cleanup_old_buffers(void)
>  {
>          unsigned long max_age_hz = get_max_age_hz();
> @@ -1990,7 +2704,7 @@ static void cleanup_old_buffers(void)
>                  __cache_size_refresh();
>
>          list_for_each_entry(c, &dm_bufio_all_clients, client_list)
> -                __evict_old_buffers(c, max_age_hz);
> +                evict_old_buffers(c, max_age_hz);
>
>          mutex_unlock(&dm_bufio_clients_lock);
>  }
> @@ -2003,6 +2717,107 @@ static void work_fn(struct work_struct *w)
>                             DM_BUFIO_WORK_TIMER_SECS * HZ);
>  }
>
> +/*--------------------------------------------------------------*/
> +
> +/*
> + * Global cleanup tries to evict the oldest buffers from across _all_
> + * the clients. It does this by repeatedly evicting a few buffers from
> + * the client that holds the oldest buffer. It's approximate, but hopefully
> + * good enough.
> + */
> +static struct dm_bufio_client *__pop_client(void)
> +{
> +        struct list_head *h;
> +
> +        if (list_empty(&dm_bufio_all_clients))
> +                return NULL;
> +
> +        h = dm_bufio_all_clients.next;
> +        list_del(h);
> +        return container_of(h, struct dm_bufio_client, client_list);
> +}
> +
> +/*
> + * Inserts the client in the global client list based on its
> + * 'oldest_buffer' field.
> + */
> +static void __insert_client(struct dm_bufio_client *new_client)
> +{
> +        struct dm_bufio_client *c;
> +        struct list_head *h = dm_bufio_all_clients.next;
> +
> +        while (h != &dm_bufio_all_clients) {
> +                c = container_of(h, struct dm_bufio_client, client_list);
> +                if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
> +                        break;
> +                h = h->next;
> +        }
> +
> +        list_add_tail(&new_client->client_list, h);
> +}
> +
> +static unsigned int __evict_a_few(unsigned int nr_buffers)
> +{
> +        unsigned int count;
> +        struct dm_bufio_client *c;
> +        struct evict_params params = {
> +                .gfp = GFP_KERNEL,
> +                .age_hz = 0,
> +                /* set to jiffies in case there are no buffers in this client */
> +                .last_accessed = jiffies
> +        };
> +
> +        c = __pop_client();
> +        if (!c)
> +                return 0;
> +
> +        dm_bufio_lock(c);
> +        count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
> +        dm_bufio_unlock(c);
> +
> +        if (count)
> +                c->oldest_buffer = params.last_accessed;
> +        __insert_client(c);
> +
> +        return count;
> +}
> +
> +static void check_watermarks(void)
> +{
> +        LIST_HEAD(write_list);
> +        struct dm_bufio_client *c;
> +
> +        mutex_lock(&dm_bufio_clients_lock);
> +        list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
> +                dm_bufio_lock(c);
> +                __check_watermark(c, &write_list);
> +                dm_bufio_unlock(c);
> +        }
> +        mutex_unlock(&dm_bufio_clients_lock);
> +
> +        __flush_write_list(&write_list);
> +}
> +
> +static void evict_old(void)
> +{
> +        unsigned long threshold = dm_bufio_cache_size -
> +                dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
> +
> +        mutex_lock(&dm_bufio_clients_lock);
> +        while (dm_bufio_current_allocated > threshold) {
> +                if (!__evict_a_few(64))
> +                        break;
> +                cond_resched();
> +        }
> +        mutex_unlock(&dm_bufio_clients_lock);
> +}
> +
> +static void do_global_cleanup(struct work_struct *w)
> +{
> +        check_watermarks();
> +        evict_old();
> +}
> +
>  /*
>   *--------------------------------------------------------------
>   * Module setup
> --
> 2.40.0
>
>
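P.S. For anyone skimming the diff: the new global cleanup keeps the clients ordered by (an estimate of) the age of their oldest buffer, evicts a small batch of clean buffers from whichever client currently looks oldest, takes the newest last_accessed among the evicted buffers as that client's new estimate, and repeats until dm_bufio_current_allocated drops below the threshold derived from dm_bufio_cache_size and DM_BUFIO_LOW_WATERMARK_RATIO. The rough user-space sketch below shows only that shape; it is illustrative, the names (client, evict_batch, oldest_client, BATCH) are made up for the example, and it uses plain arrays and a linear scan where the patch uses the kernel client list, __pop_client()/__insert_client() and __evict_a_few(64).

/*
 * Illustrative user-space sketch of the global cleanup idea above; not
 * kernel code.  Names (client, evict_batch, oldest_client) are made up.
 */
#include <stdio.h>

#define NR_CLIENTS      3
#define BUFS_PER_CLIENT 100
#define BATCH           8

struct client {
        unsigned long oldest_buffer;            /* estimate of its oldest stamp */
        unsigned long stamps[BUFS_PER_CLIENT];  /* ascending: oldest first */
        unsigned int head, nr;
};

static unsigned long total_allocated;

/*
 * Evict up to 'max' of this client's oldest buffers.  Returns how many were
 * evicted and records the newest stamp seen amongst them, which becomes the
 * client's new 'oldest_buffer' estimate (everything left is newer than that).
 */
static unsigned int evict_batch(struct client *c, unsigned int max,
                                unsigned long *newest_evicted)
{
        unsigned int n = 0;

        while (n < max && c->nr) {
                unsigned long stamp = c->stamps[c->head++];

                c->nr--;
                if (stamp > *newest_evicted)
                        *newest_evicted = stamp;
                total_allocated--;
                n++;
        }
        return n;
}

/* The kernel keeps a list sorted by 'oldest_buffer'; a linear scan does here. */
static struct client *oldest_client(struct client *cs, unsigned int nr)
{
        struct client *best = &cs[0];

        for (unsigned int i = 1; i < nr; i++)
                if (cs[i].oldest_buffer < best->oldest_buffer)
                        best = &cs[i];
        return best;
}

int main(void)
{
        struct client clients[NR_CLIENTS];
        unsigned long threshold = 150;

        /* Fake some buffers; stamps are interleaved across the clients. */
        for (int i = 0; i < NR_CLIENTS; i++) {
                clients[i].head = 0;
                clients[i].nr = BUFS_PER_CLIENT;
                clients[i].oldest_buffer = i;
                for (int j = 0; j < BUFS_PER_CLIENT; j++)
                        clients[i].stamps[j] = i + (unsigned long)j * NR_CLIENTS;
                total_allocated += BUFS_PER_CLIENT;
        }

        /*
         * The evict_old() shape: small batches from whichever client is
         * estimated to hold the oldest buffer, until below the threshold.
         */
        while (total_allocated > threshold) {
                struct client *c = oldest_client(clients, NR_CLIENTS);
                unsigned long newest = 0;

                if (!evict_batch(c, BATCH, &newest))
                        break;
                c->oldest_buffer = newest;      /* re-sort key for next round */
        }

        printf("buffers still allocated: %lu\n", total_allocated);
        return 0;
}

The 'newest evicted stamp' bookkeeping plays the same role as evict_params.last_accessed in the patch: it is carried out of the eviction loop so the client's position in the sorted list stays roughly right without rescanning its cache.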