Re: [PATCH v3 2/5] zswap: make shrinking memcg-aware

From: Nhat Pham <nphamcs@gmail.com>
To: Yosry Ahmed <yosryahmed@google.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>,
	akpm@linux-foundation.org, hannes@cmpxchg.org,
	sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com,
	mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com,
	muchun.song@linux.dev, linux-mm@kvack.org, kernel-team@meta.com,
	linux-kernel@vger.kernel.org, cgroups@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
	shuah@kernel.org
Subject: Re: [PATCH v3 2/5] zswap: make shrinking memcg-aware
Date: Fri, 20 Oct 2023 12:58:19 -0700	[thread overview]
Message-ID: <CAKEwX=OVcAEnOEmhy2dGwE7Zm-L-y8Mq=bx4BFUeQnTXaAA_FQ@mail.gmail.com> (raw)
In-Reply-To: <CAJD7tka_gvNPgu4gim9-dqx0Wf-zdGj+==nwx2yrmOuZoe=oyw@mail.gmail.com>

On Thu, Oct 19, 2023 at 9:15 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [..]
> > > >
> > > > +/*********************************
> > > > +* lru functions
> > > > +**********************************/
> > > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> > > > +{
> > > > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> > >
> > > Could we avoid the need for get/put with an rcu_read_lock() instead?
> >
> > I think we can, I'm not entirely sure of the consequences though. By the
> > look of it I'd say it's safe but I wouldn't trust my judgement on this.
>
> It just seems like we have a pattern of short-lived get/put. If RCU
> gives enough protection it should be simpler. IIUC taking a reference
> does not protect against offlining or reparenting, so I am not sure if
> taking a reference here would provide any more protection than
> rcu_read_lock() would.
>

I'd keep it for now. Sounds like an optimization to me, which could
always be done as a follow-up :)

Unless, of course, somebody has a really strong opinion regarding
this.

> >
> > >
> [..]
> > > > @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> > > >         zswap_entry_put(tree, entry);
> > > >  unlock:
> > > >         spin_unlock(&tree->lock);
> > > > -       return ret ? -EAGAIN : 0;
> > > > +       spin_lock(lock);
> > > > +       return ret;
> > > > +}
> > > > +
> > > > +static int shrink_memcg(struct mem_cgroup *memcg)
> > > > +{
> > > > +       struct zswap_pool *pool;
> > > > +       int nid, shrunk = 0;
> > > > +
> > > > +       pool = zswap_pool_current_get();
> > > > +       if (!pool)
> > > > +               return -EINVAL;
> > > > +
> > > > +       /*
> > > > +        * Skip zombies because their LRUs are reparented and we would be
> > > > +        * reclaiming from the parent instead of the dead memcgroup.
> > >
> > > nit: s/memcgroup/memcg.
> > >
> > > > +        */
> > > > +       if (memcg && !mem_cgroup_online(memcg))
> > > > +               goto out;
> > >
> > > If we move this above zswap_pool_current_get(), we can return directly
> > > and remove the label. I noticed we will return -EAGAIN if memcg is
> > > > offline. IIUC, on -EAGAIN the caller will move on to the next memcg,
> > > but I am wondering if a different errno would be clearer here.
> >
> > True, I remember spending some time staring at error codes but couldn't find a
> > better one. What if we use -EINVAL for retryable errors, and use something else
> > for the one where there is no pool? -ENODEV?
>
> Do you mean -EINVAL for non-retryable errors? Perhaps -ENOENT is more
> appropriate as a return for offline memcgs?
>
> >
> > >
> [..]
> > > >  static void shrink_worker(struct work_struct *w)
> > > > @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
> > > >                                                 shrink_work);
> > > >         int ret, failures = 0;
> > > >
> > > > +       /* global reclaim will select cgroup in a round-robin fashion. */
> > > >         do {
> > > > -               ret = zswap_reclaim_entry(pool);
> > > > +               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> > >
> > > Perhaps next_shrink_memcg is a better name here?
> >
> > Will change if you have a strong preference, I'd keep it shorter because it's
> > always used in conjunction with a memcg type or function.
>
> I'd rather have the more explicit name unless it causes some annoying
> line breaks or so.
>
> >
> > >
> > > > +
> > > > +               ret = shrink_memcg(pool->next_shrink);
> > > > +
> > > >                 if (ret) {
> > > > -                       zswap_reject_reclaim_fail++;
> > > >                         if (ret != -EAGAIN)
> > > >                                 break;
> > > >                         if (++failures == MAX_RECLAIM_RETRIES)
> > > > @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > > >          */
> > > >         kref_init(&pool->kref);
> > > >         INIT_LIST_HEAD(&pool->list);
> > > > -       INIT_LIST_HEAD(&pool->lru);
> > > > -       spin_lock_init(&pool->lru_lock);
> > > > +       list_lru_init_memcg(&pool->list_lru, NULL);
> > > >         INIT_WORK(&pool->shrink_work, shrink_worker);
> > > >
> > > >         zswap_pool_debug("created", pool);
> > > > @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> > > >
> > > >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> > > >         free_percpu(pool->acomp_ctx);
> > > > +       list_lru_destroy(&pool->list_lru);
> > > > +       if (pool->next_shrink)
> > > > +               mem_cgroup_put(pool->next_shrink);
> > > >         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > > >                 zpool_destroy_pool(pool->zpools[i]);
> > > >         kfree(pool);
> > > > @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > > >
> > > >         /* try to allocate swap cache page */
> > > >         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> > > > -                                      &page_was_allocated);
> > > > +                                      &page_was_allocated, true);
> > > >         if (!page) {
> > > >                 ret = -ENOMEM;
> > > >                 goto fail;
> > > > @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > > >         /* start writeback */
> > > >         __swap_writepage(page, &wbc);
> > > >         put_page(page);
> > > > -       zswap_written_back_pages++;
> > > >
> > > >         return ret;
> > > >
> > > > @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
> > > >         struct scatterlist input, output;
> > > >         struct crypto_acomp_ctx *acomp_ctx;
> > > >         struct obj_cgroup *objcg = NULL;
> > > > +       struct mem_cgroup *memcg = NULL;
> > > >         struct zswap_pool *pool;
> > > >         struct zpool *zpool;
> > > > +       int lru_alloc_ret;
> > > >         unsigned int dlen = PAGE_SIZE;
> > > >         unsigned long handle, value;
> > > >         char *buf;
> > > > @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
> > > >                 zswap_invalidate_entry(tree, dupentry);
> > > >         }
> > > >         spin_unlock(&tree->lock);
> > > > -
> > > > -       /*
> > > > -        * XXX: zswap reclaim does not work with cgroups yet. Without a
> > > > -        * cgroup-aware entry LRU, we will push out entries system-wide based on
> > > > -        * local cgroup limits.
> > > > -        */
> > > >         objcg = get_obj_cgroup_from_folio(folio);
> > > > -       if (objcg && !obj_cgroup_may_zswap(objcg))
> > > > -               goto reject;
> > > > +       if (objcg && !obj_cgroup_may_zswap(objcg)) {
> > > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > > +               if (shrink_memcg(memcg)) {
> > > > +                       mem_cgroup_put(memcg);
> > > > +                       goto reject;
> > > > +               }
> > > > +               mem_cgroup_put(memcg);
> > > > +       }
> > > >
> > > >         /* reclaim space if needed */
> > > >         if (zswap_is_full()) {
> > > > @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
> > > >                         zswap_pool_reached_full = false;
> > > >         }
> > > >
> > > > +       pool = zswap_pool_current_get();
> > > > +       if (!pool)
> > > > +               goto reject;
> > > > +
> > >
> > > Why do we need to move zswap_pool_current_get() up here?
> >
> > Ah, thanks. This is a leftover from a previous version where the pool was needed
> > to allocate the entry.
> >
> >
> > >
> > > >         /* allocate entry */
> > > > -       entry = zswap_entry_cache_alloc(GFP_KERNEL);
> > > > +       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> > > >         if (!entry) {
> > > >                 zswap_reject_kmemcache_fail++;
> > > > +               zswap_pool_put(pool);
> > > >                 goto reject;
> > > >         }
> > > >
> > > > @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
> > > >                         entry->length = 0;
> > > >                         entry->value = value;
> > > >                         atomic_inc(&zswap_same_filled_pages);
> > > > +                       zswap_pool_put(pool);
> > > >                         goto insert_entry;
> > > >                 }
> > > >                 kunmap_atomic(src);
> > > > @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
> > > >                 goto freepage;
> > > >
> > > >         /* if entry is successfully added, it keeps the reference */
> > > > -       entry->pool = zswap_pool_current_get();
> > > > -       if (!entry->pool)
> > > > -               goto freepage;
> > > > +       entry->pool = pool;
> > > > +       if (objcg) {
> > > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > > +               lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> > > > +               mem_cgroup_put(memcg);
> > > > +
> > > > +               if (lru_alloc_ret)
> > > > +                       goto freepage;
> > > > +       }
> > > >
> > > >         /* compress */
> > > >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > > > @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
> > > >                 zswap_invalidate_entry(tree, dupentry);
> > > >         }
> > > >         if (entry->length) {
> > > > -               spin_lock(&entry->pool->lru_lock);
> > > > -               list_add(&entry->lru, &entry->pool->lru);
> > > > -               spin_unlock(&entry->pool->lru_lock);
> > > > +               INIT_LIST_HEAD(&entry->lru);
> > > > +               zswap_lru_add(&pool->list_lru, entry);
> > > >         }
> > > >         spin_unlock(&tree->lock);
> > > >
> > > > @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
> > > >
> > > >  put_dstmem:
> > > >         mutex_unlock(acomp_ctx->mutex);
> > > > -       zswap_pool_put(entry->pool);
> > > >  freepage:
> > > > +       zswap_pool_put(entry->pool);
> > > >         zswap_entry_cache_free(entry);
> > > >  reject:
> > > >         if (objcg)
> > > > @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
> > > >                 zswap_invalidate_entry(tree, entry);
> > > >                 folio_mark_dirty(folio);
> > > >         } else if (entry->length) {
> > > > -               spin_lock(&entry->pool->lru_lock);
> > > > -               list_move(&entry->lru, &entry->pool->lru);
> > > > -               spin_unlock(&entry->pool->lru_lock);
> > > > +               zswap_lru_del(&entry->pool->list_lru, entry);
> > > > +               zswap_lru_add(&entry->pool->list_lru, entry);
> > > >         }
> > > >         zswap_entry_put(tree, entry);
> > > >         spin_unlock(&tree->lock);
> > > > --
> > > > 2.34.1