linux-kernel.vger.kernel.org archive mirror
* RE: [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset.
       [not found] <PH0PR11MB562641BC03630B4B7A227FD7E9189@PH0PR11MB5626.namprd11.prod.outlook.com>
@ 2022-12-05 15:05 ` Luther, Sven
  2022-12-05 16:06 ` Shakeel Butt
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Luther, Sven @ 2022-12-05 15:05 UTC (permalink / raw)
  To: linux-kernel, regressions, Roman Gushchin
  Cc: Andrew Morton, Christoph Lameter, Johannes Weiner, Michal Hocko,
	Shakeel Butt, linux-mm, Vlastimil Babka, kernel-team,
	Eric W. Biederman, Muchun Song, Waiman Long, Alexey Gladkov,
	Bonn, Jonas

#regzbot ^introduced 10befea91b61c4e2c2d1df06a2e978d182fcf792

We are making heavy use of mqueues, and noticed a degradation of performance between the 4.18 and 5.10 Linux kernels.

After a rough per-version comparison, we did a kernel bisection between 5.8 and 5.9
and traced the issue to a range of 10 patches (of which 9 were skipped because they didn't boot) between:


commit 10befea91b61c4e2c2d1df06a2e978d182fcf792 (HEAD, refs/bisect/bad)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:27 2020 -0700

    mm: memcg/slab: use a single set of kmem_caches for all allocations

and:

commit 286e04b8ed7a04279ae277f0f024430246ea5eec (refs/bisect/good-286e04b8ed7a04279ae277f0f024430246ea5eec)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:20:52 2020 -0700

    mm: memcg/slab: allocate obj_cgroups for non-root slab pages

All of them are part of the "The new cgroup slab memory controller" patchset:

  https://lore.kernel.org/all/20200623174037.3951353-18-guro@fb.com/T/

from Roman Gushchin, which moves the accounting from the page level to the object level.

Measurements were done using the test program included below, which measures the min/average/max time of mqueue_send/mqueue_rcv,
and the average time of getppid, each measured over 100 000 runs. Results are shown in the following table:

+----------+--------------------------+-------------------------+----------------+
| kernel   |    mqueue_rcv (ns)       | mqueue_send (ns)        |    getppid     |
| version  | min avg  max   variation | min avg max   variation | (ns) variation |
+----------+--------------------------+-------------------------+----------------+
| 4.18.45  | 351 382 17533     base   | 383 410 13178     base  | 149      base  |
| 5.8-good | 380 392  7156   -2,55%   | 376 384  6225    6,77%  | 169   -11,83%  |
| 5.8-bad  | 524 530  5310  -27,92%   | 512 519  8775  -21,00%  | 169   -11,83%  |
| 5.10     | 520 533  4078  -28,33%   | 518 534  8108  -23,22%  | 167   -10,78%  |
| 5.15     | 431 444  8440  -13,96%   | 425 437  6170   -6,18%  | 171   -12,87%  |
| 6.03     | 474 614  3881  -37,79%   | 482 693   931  -40,84%  | 171   -12,87%  |
+----------+--------------------------+-------------------------+----------------+

Below you will find the following appendices:

  - raw GOOD bisect results
  - raw BAD bisect results
  - test program source code
  - complete bisect results

Friendly,

Sven Luther

GOOD bisect results:
====================
# uname -a
Linux axxiax86-64 5.8.0-5-286e04b8ed7a04279ae277f0f024430246ea5eec-good #1 SMP PREEMPT Fri Jul 29 11:01:19 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
# ./mqtest
RX: 380/392/7156
TX: 376/384/6225
Accumulated time for 100000 getppid, 16924653ns
Average time per getppid: 169ns
# ./mqtest
RX: 383/397/6300
TX: 377/385/8258
Accumulated time for 100000 getppid, 16915580ns
Average time per getppid: 169ns
# ./mqtest
RX: 385/397/7690
TX: 377/385/4244
Accumulated time for 100000 getppid, 16922910ns
Average time per getppid: 169ns

BAD bisect results:
===================
# uname -a
Linux axxiax86-64 5.8.0-5-10befea91b61c4e2c2d1df06a2e978d182fcf792-bad #1 SMP PREEMPT Fri Jul 29 11:01:19 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
# ./mqtest
RX: 524/530/5310
TX: 512/519/8775
Accumulated time for 100000 getppid, 16921865ns
Average time per getppid: 169ns
# ./mqtest
RX: 520/525/8204
TX: 514/522/7420
Accumulated time for 100000 getppid, 16931493ns
Average time per getppid: 169ns
# ./mqtest
RX: 520/525/8128
TX: 512/519/5340
Accumulated time for 100000 getppid, 16921535ns
Average time per getppid: 169ns


Test program:
=============
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <mqueue.h>
#include <fcntl.h>      /* O_RDWR, O_NONBLOCK, O_CREAT used with mq_open() */

struct sysc_meas {
        int64_t min;
        int64_t max;
        int64_t sum;
        int64_t count;
};

static inline uint64_t __cputime_get(void)
{
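        /* Read the raw TSC via RDTSC: low 32 bits in EAX, high 32 bits in EDX */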
        unsigned long l, h;

        asm volatile("rdtsc" : "=a" (l), "=d" (h));

        return ((uint64_t) h << 32) | l;
}

static inline int64_t __cputime_ns(uint64_t cputime)
{
        /* Note: the TSC clock is assumed to run at 2.2 GHz, so ns = cycles / 2.2 = cycles * 5 / 11 */
        return (cputime * 5) / 11;
}

static inline uint64_t __cputime_get_ns()
{
        return __cputime_ns(__cputime_get());
}

static void sysc_meas_init(struct sysc_meas *sm)
{
        sm->min = INT64_MAX;
        sm->max = INT64_MIN;
        sm->sum = sm->count = 0;
}
static void sysc_meas_add(struct sysc_meas *sm, int64_t value)
{
        if (value > sm->max)
                sm->max = value;
        if (value < sm->min)
                sm->min = value;
        sm->sum += value;
        sm->count++;
}
static void sysc_mq(uint32_t count)
{
        int fd;
        struct mq_attr mq_attr;
        uint64_t ts;
        struct sysc_meas sm_tx, sm_rx;
        char dummy[256];

        if (!count)
                return;

        mq_unlink("/sysc_mq");

        memset(&mq_attr, 0, sizeof(mq_attr));
        mq_attr.mq_maxmsg = 10;
        mq_attr.mq_msgsize =
                sizeof(dummy);

        fd = mq_open("/sysc_mq", O_RDWR | O_NONBLOCK | O_CREAT, 0666, &mq_attr);
        if (fd < 0) {
                printf("ERROR: unable to create MQ, %d\n", -errno);
                exit(-1);
        }

        sysc_meas_init(&sm_rx);
        sysc_meas_init(&sm_tx);

        dummy[0] = 0;
        dummy[sizeof(dummy) - 1] = 0;

        while (count--) {
                ts = __cputime_get();
                mq_send(fd, dummy, sizeof(dummy), 0);
                sysc_meas_add(&sm_tx, __cputime_get() - ts);
                ts = __cputime_get();
                mq_receive(fd, dummy, sizeof(dummy), NULL);
                sysc_meas_add(&sm_rx, __cputime_get() - ts);
        }

        mq_close(fd);
        mq_unlink("/sysc_mq");

        printf("RX: %ld/%ld/%ld\n", __cputime_ns(sm_rx.min),
               __cputime_ns(sm_rx.sum / sm_rx.count),
               __cputime_ns(sm_rx.max));

        printf("TX: %ld/%ld/%ld\n", __cputime_ns(sm_tx.min),
               __cputime_ns(sm_tx.sum / sm_tx.count),
               __cputime_ns(sm_tx.max));
}
int main(int argc, char *argv[])
{
        uint32_t loops = 100000, cnt;
        uint64_t ts_a, ts_b, acc = 0;

        if (argc > 1)
                loops = strtoul(argv[1], (char **)NULL, 0);
        cnt = loops;
        sysc_mq(loops);
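        /* Time getppid() over the same number of iterations as a cheap reference syscall */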
        while (loops--) {
                ts_a = __cputime_get();
                (void)getppid();
                ts_b = __cputime_get();
                acc += (ts_b - ts_a);
        }
        printf("Accumulated time for %d getppid, %luns\n",
               cnt, __cputime_ns(acc));
        printf("Average time per getppid: %luns\n", (__cputime_ns(acc)/cnt));

        exit (0);
}
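
A minimal build/run sketch (assuming the source above is saved as mqtest.c; on older
glibc the POSIX mqueue functions live in librt, hence -lrt, which is unneeded but
harmless with glibc >= 2.34):

# gcc -O2 -o mqtest mqtest.c -lrt
# ./mqtest 100000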

Complete bisect result:
=======================
commit 10befea91b61c4e2c2d1df06a2e978d182fcf792 (HEAD, refs/bisect/bad)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:27 2020 -0700

    mm: memcg/slab: use a single set of kmem_caches for all allocations

commit 15999eef7f25e2ea6a1c33f026166f472c5714e9 (refs/bisect/skip-15999eef7f25e2ea6a1c33f026166f472c5714e9)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:24 2020 -0700

    mm: memcg/slab: remove redundant check in memcg_accumulate_slabinfo()

commit c7094406fcb7cdf4fe1de8893f0613b75349773d (refs/bisect/skip-c7094406fcb7cdf4fe1de8893f0613b75349773d)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:20 2020 -0700

    mm: memcg/slab: deprecate slab_root_caches

commit 272911a4ad18c48f8bc449a5db945a54987dd687 (refs/bisect/skip-272911a4ad18c48f8bc449a5db945a54987dd687)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:17 2020 -0700

    mm: memcg/slab: remove memcg_kmem_get_cache()

commit d797b7d05405c519f7b62ea69a75cea1883863b2 (refs/bisect/skip-d797b7d05405c519f7b62ea69a75cea1883863b2)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:14 2020 -0700

    mm: memcg/slab: simplify memcg cache creation

commit 9855609bde03e2472b99a95e869d29ee1e78a751 (refs/bisect/skip-9855609bde03e2472b99a95e869d29ee1e78a751)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:10 2020 -0700

    mm: memcg/slab: use a single set of kmem_caches for all accounted allocations

commit 0f876e4dc55db5fafef774917fd66e1373c0f390 (refs/bisect/skip-0f876e4dc55db5fafef774917fd66e1373c0f390)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:06 2020 -0700

    mm: memcg/slab: move memcg_kmem_bypass() to memcontrol.h

commit 4330a26bc4527f1d8918c398ebc983574f761cca (refs/bisect/skip-4330a26bc4527f1d8918c398ebc983574f761cca)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:21:03 2020 -0700

    mm: memcg/slab: deprecate memory.kmem.slabinfo

commit f2fe7b09a52bc898ec030d4fa6f78725439c7c2c (refs/bisect/skip-f2fe7b09a52bc898ec030d4fa6f78725439c7c2c)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:20:59 2020 -0700

    mm: memcg/slab: charge individual slab objects instead of pages

commit 964d4bd370d559d9bd8e4abc139e85d2753956fb (refs/bisect/skip-964d4bd370d559d9bd8e4abc139e85d2753956fb)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:20:56 2020 -0700

    mm: memcg/slab: save obj_cgroup for non-root slab objects

commit 286e04b8ed7a04279ae277f0f024430246ea5eec (refs/bisect/good-286e04b8ed7a04279ae277f0f024430246ea5eec)
Author: Roman Gushchin <guro@fb.com>
Date:   Thu Aug 6 23:20:52 2020 -0700

    mm: memcg/slab: allocate obj_cgroups for non-root slab pages


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset.
       [not found] <PH0PR11MB562641BC03630B4B7A227FD7E9189@PH0PR11MB5626.namprd11.prod.outlook.com>
  2022-12-05 15:05 ` [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset Luther, Sven
@ 2022-12-05 16:06 ` Shakeel Butt
  2022-12-05 16:30   ` Waiman Long
  2022-12-05 20:08 ` Roman Gushchin
  2022-12-06  2:15 ` Roman Gushchin
  3 siblings, 1 reply; 7+ messages in thread
From: Shakeel Butt @ 2022-12-05 16:06 UTC (permalink / raw)
  To: Luther, Sven
  Cc: linux-kernel, regressions, Roman Gushchin, Andrew Morton,
	Christoph Lameter, Johannes Weiner, Michal Hocko, linux-mm,
	Vlastimil Babka, kernel-team, Eric W. Biederman, Muchun Song,
	Waiman Long, Alexey Gladkov, Bonn, Jonas

Hi Sven,

On Mon, Dec 5, 2022 at 6:56 AM Luther, Sven <Sven.Luther@windriver.com> wrote:
>
> #regzbot ^introduced 10befea91b61c4e2c2d1df06a2e978d182fcf792
>
> We are making heavy use of mqueues, and noticed a degradation of performance between 4.18 & 5.10 linux kernels.
>
> After a gross per-version tracing, we did kernel bisection between 5.8 and 5.9
> and traced the issue to a 10 patches (of which 9 where skipped as they didn't boot) between:
>
>
> commit 10befea91b61c4e2c2d1df06a2e978d182fcf792 (HEAD, refs/bisect/bad)
> Author: Roman Gushchin <guro@fb.com>
> Date:   Thu Aug 6 23:21:27 2020 -0700
>
>     mm: memcg/slab: use a single set of kmem_caches for all allocations
>
> and:
>
> commit 286e04b8ed7a04279ae277f0f024430246ea5eec (refs/bisect/good-286e04b8ed7a04279ae277f0f024430246ea5eec)
> Author: Roman Gushchin <guro@fb.com>
> Date:   Thu Aug 6 23:20:52 2020 -0700
>
>     mm: memcg/slab: allocate obj_cgroups for non-root slab pages
>
> All of them are part of the "The new cgroup slab memory controller" patchset:
>
>   https://lore.kernel.org/all/20200623174037.3951353-18-guro@fb.com/T/
>
> from Roman Gushchin, which moves the accounting for page level to the object level.
>
> Measurements where done using the a test programmtest, which measures mix/average/max time mqueue_send/mqueue_rcv,
> and average for getppid, both measured over 100 000 runs. Results are shown in the following table
>
> +----------+--------------------------+-------------------------+----------------+
> | kernel   |    mqueue_rcv (ns)       | mqueue_send (ns)        |    getppid     |
> | version  | min avg  max   variation | min avg max   variation | (ns) variation |
> +----------+--------------------------+-------------------------+----------------+
> | 4.18.45  | 351 382 17533     base   | 383 410 13178     base  | 149      base  |
> | 5.8-good | 380 392  7156   -2,55%   | 376 384  6225    6,77%  | 169   -11,83%  |
> | 5.8-bad  | 524 530  5310  -27,92%   | 512 519  8775  -21,00%  | 169   -11,83%  |
> | 5.10     | 520 533  4078  -28,33%   | 518 534  8108  -23,22%  | 167   -10,78%  |
> | 5.15     | 431 444  8440  -13,96%   | 425 437  6170   -6,18%  | 171   -12,87%  |
> | 6.03     | 474 614  3881  -37,79%   | 482 693   931  -40,84%  | 171   -12,87%  |
> +----------+--------------------------+-------------------------+-----------------
>

Is the last kernel 6.0.3? Also, we know there is a performance impact from
per-object kmem accounting. Can you try the latest, i.e. 6.1-rc8? There
are a couple of memcg charging optimization patches merged in this
window.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset.
  2022-12-05 16:06 ` Shakeel Butt
@ 2022-12-05 16:30   ` Waiman Long
  2022-12-05 17:25     ` Shakeel Butt
  0 siblings, 1 reply; 7+ messages in thread
From: Waiman Long @ 2022-12-05 16:30 UTC (permalink / raw)
  To: Shakeel Butt, Luther, Sven
  Cc: linux-kernel, regressions, Roman Gushchin, Andrew Morton,
	Christoph Lameter, Johannes Weiner, Michal Hocko, linux-mm,
	Vlastimil Babka, kernel-team, Eric W. Biederman, Muchun Song,
	Alexey Gladkov, Bonn, Jonas

On 12/5/22 11:06, Shakeel Butt wrote:
> Hi Sven,
>
> On Mon, Dec 5, 2022 at 6:56 AM Luther, Sven <Sven.Luther@windriver.com> wrote:
>> #regzbot ^introduced 10befea91b61c4e2c2d1df06a2e978d182fcf792
>>
>> We are making heavy use of mqueues, and noticed a degradation of performance between 4.18 & 5.10 linux kernels.
>>
>> After a gross per-version tracing, we did kernel bisection between 5.8 and 5.9
>> and traced the issue to a 10 patches (of which 9 where skipped as they didn't boot) between:
>>
>>
>> commit 10befea91b61c4e2c2d1df06a2e978d182fcf792 (HEAD, refs/bisect/bad)
>> Author: Roman Gushchin <guro@fb.com>
>> Date:   Thu Aug 6 23:21:27 2020 -0700
>>
>>      mm: memcg/slab: use a single set of kmem_caches for all allocations
>>
>> and:
>>
>> commit 286e04b8ed7a04279ae277f0f024430246ea5eec (refs/bisect/good-286e04b8ed7a04279ae277f0f024430246ea5eec)
>> Author: Roman Gushchin <guro@fb.com>
>> Date:   Thu Aug 6 23:20:52 2020 -0700
>>
>>      mm: memcg/slab: allocate obj_cgroups for non-root slab pages
>>
>> All of them are part of the "The new cgroup slab memory controller" patchset:
>>
>>    https://lore.kernel.org/all/20200623174037.3951353-18-guro@fb.com/T/
>>
>> from Roman Gushchin, which moves the accounting for page level to the object level.
>>
>> Measurements where done using the a test programmtest, which measures mix/average/max time mqueue_send/mqueue_rcv,
>> and average for getppid, both measured over 100 000 runs. Results are shown in the following table
>>
>> +----------+--------------------------+-------------------------+----------------+
>> | kernel   |    mqueue_rcv (ns)       | mqueue_send (ns)        |    getppid     |
>> | version  | min avg  max   variation | min avg max   variation | (ns) variation |
>> +----------+--------------------------+-------------------------+----------------+
>> | 4.18.45  | 351 382 17533     base   | 383 410 13178     base  | 149      base  |
>> | 5.8-good | 380 392  7156   -2,55%   | 376 384  6225    6,77%  | 169   -11,83%  |
>> | 5.8-bad  | 524 530  5310  -27,92%   | 512 519  8775  -21,00%  | 169   -11,83%  |
>> | 5.10     | 520 533  4078  -28,33%   | 518 534  8108  -23,22%  | 167   -10,78%  |
>> | 5.15     | 431 444  8440  -13,96%   | 425 437  6170   -6,18%  | 171   -12,87%  |
>> | 6.03     | 474 614  3881  -37,79%   | 482 693   931  -40,84%  | 171   -12,87%  |
>> +----------+--------------------------+-------------------------+-----------------
>>
> Is the last kernel 6.0.3? Also we know there is performance impact of
> per-object kmem accounting. Can you try the latest i.e. 6.1-rc8? There
> are a couple of memcg charging optimization patches merged in this
> window.

It is known that per-object kmem accounting regresses performance. I had
submitted a number of optimization patches that were merged into v5.14,
so the regression is reduced in the 5.15 line above. It looks like there
are some additional regressions in the latest kernel. We will need to
figure out what causes them.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset.
  2022-12-05 16:30   ` Waiman Long
@ 2022-12-05 17:25     ` Shakeel Butt
  0 siblings, 0 replies; 7+ messages in thread
From: Shakeel Butt @ 2022-12-05 17:25 UTC (permalink / raw)
  To: Waiman Long
  Cc: Luther, Sven, linux-kernel, regressions, Roman Gushchin,
	Andrew Morton, Christoph Lameter, Johannes Weiner, Michal Hocko,
	linux-mm, Vlastimil Babka, kernel-team, Eric W. Biederman,
	Muchun Song, Alexey Gladkov, Bonn, Jonas

On Mon, Dec 5, 2022 at 8:31 AM Waiman Long <longman@redhat.com> wrote:
>
> On 12/5/22 11:06, Shakeel Butt wrote:
> > Hi Sven,
> >
> > On Mon, Dec 5, 2022 at 6:56 AM Luther, Sven <Sven.Luther@windriver.com> wrote:
> >> #regzbot ^introduced 10befea91b61c4e2c2d1df06a2e978d182fcf792
> >>
> >> We are making heavy use of mqueues, and noticed a degradation of performance between 4.18 & 5.10 linux kernels.
> >>
> >> After a gross per-version tracing, we did kernel bisection between 5.8 and 5.9
> >> and traced the issue to a 10 patches (of which 9 where skipped as they didn't boot) between:
> >>
> >>
> >> commit 10befea91b61c4e2c2d1df06a2e978d182fcf792 (HEAD, refs/bisect/bad)
> >> Author: Roman Gushchin <guro@fb.com>
> >> Date:   Thu Aug 6 23:21:27 2020 -0700
> >>
> >>      mm: memcg/slab: use a single set of kmem_caches for all allocations
> >>
> >> and:
> >>
> >> commit 286e04b8ed7a04279ae277f0f024430246ea5eec (refs/bisect/good-286e04b8ed7a04279ae277f0f024430246ea5eec)
> >> Author: Roman Gushchin <guro@fb.com>
> >> Date:   Thu Aug 6 23:20:52 2020 -0700
> >>
> >>      mm: memcg/slab: allocate obj_cgroups for non-root slab pages
> >>
> >> All of them are part of the "The new cgroup slab memory controller" patchset:
> >>
> >>    https://lore.kernel.org/all/20200623174037.3951353-18-guro@fb.com/T/
> >>
> >> from Roman Gushchin, which moves the accounting for page level to the object level.
> >>
> >> Measurements where done using the a test programmtest, which measures mix/average/max time mqueue_send/mqueue_rcv,
> >> and average for getppid, both measured over 100 000 runs. Results are shown in the following table
> >>
> >> +----------+--------------------------+-------------------------+----------------+
> >> | kernel   |    mqueue_rcv (ns)       | mqueue_send (ns)        |    getppid     |
> >> | version  | min avg  max   variation | min avg max   variation | (ns) variation |
> >> +----------+--------------------------+-------------------------+----------------+
> >> | 4.18.45  | 351 382 17533     base   | 383 410 13178     base  | 149      base  |
> >> | 5.8-good | 380 392  7156   -2,55%   | 376 384  6225    6,77%  | 169   -11,83%  |
> >> | 5.8-bad  | 524 530  5310  -27,92%   | 512 519  8775  -21,00%  | 169   -11,83%  |
> >> | 5.10     | 520 533  4078  -28,33%   | 518 534  8108  -23,22%  | 167   -10,78%  |
> >> | 5.15     | 431 444  8440  -13,96%   | 425 437  6170   -6,18%  | 171   -12,87%  |
> >> | 6.03     | 474 614  3881  -37,79%   | 482 693   931  -40,84%  | 171   -12,87%  |
> >> +----------+--------------------------+-------------------------+-----------------
> >>
> > Is the last kernel 6.0.3? Also we know there is performance impact of
> > per-object kmem accounting. Can you try the latest i.e. 6.1-rc8? There
> > are a couple of memcg charging optimization patches merged in this
> > window.
>
> It is known that per-object kmem accounting regresses performance. I had
> submitted a number of optimization patches that got merged into v5.14.
> So the regression is reduced in the 5.15 line above. It looks like there
> are some additional regressions in the latest kernel. We will need to
> figure out what causes it.
>

In 5.18, the PREEMPT_RT patches went in and 559271146efc ("mm/memcg:
optimize user context object stock access") got reverted.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset.
       [not found] <PH0PR11MB562641BC03630B4B7A227FD7E9189@PH0PR11MB5626.namprd11.prod.outlook.com>
  2022-12-05 15:05 ` [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset Luther, Sven
  2022-12-05 16:06 ` Shakeel Butt
@ 2022-12-05 20:08 ` Roman Gushchin
  2022-12-06  2:15 ` Roman Gushchin
  3 siblings, 0 replies; 7+ messages in thread
From: Roman Gushchin @ 2022-12-05 20:08 UTC (permalink / raw)
  To: Luther, Sven
  Cc: linux-kernel, regressions, Roman Gushchin, Andrew Morton,
	Christoph Lameter, Johannes Weiner, Michal Hocko, Shakeel Butt,
	linux-mm, Vlastimil Babka, kernel-team, Eric W. Biederman,
	Muchun Song, Waiman Long, Alexey Gladkov, Bonn, Jonas

On Mon, Dec 05, 2022 at 02:55:48PM +0000, Luther, Sven wrote:
> #regzbot ^introduced 10befea91b61c4e2c2d1df06a2e978d182fcf792
> 
> We are making heavy use of mqueues, and noticed a degradation of performance between 4.18 & 5.10 linux kernels.
> 
> After a gross per-version tracing, we did kernel bisection between 5.8 and 5.9
> and traced the issue to a 10 patches (of which 9 where skipped as they didn't boot) between:
> 
> 
> commit 10befea91b61c4e2c2d1df06a2e978d182fcf792 (HEAD, refs/bisect/bad)
> Author: Roman Gushchin <guro@fb.com>
> Date:   Thu Aug 6 23:21:27 2020 -0700
> 
>     mm: memcg/slab: use a single set of kmem_caches for all allocations
> 
> and:
> 
> commit 286e04b8ed7a04279ae277f0f024430246ea5eec (refs/bisect/good-286e04b8ed7a04279ae277f0f024430246ea5eec)
> Author: Roman Gushchin <guro@fb.com>
> Date:   Thu Aug 6 23:20:52 2020 -0700
> 
>     mm: memcg/slab: allocate obj_cgroups for non-root slab pages
> 
> All of them are part of the "The new cgroup slab memory controller" patchset:
> 
>   https://lore.kernel.org/all/20200623174037.3951353-18-guro@fb.com/T/
> 
> from Roman Gushchin, which moves the accounting for page level to the object level.
> 
> Measurements where done using the a test programmtest, which measures mix/average/max time mqueue_send/mqueue_rcv,
> and average for getppid, both measured over 100 000 runs. Results are shown in the following table
> 
> +----------+--------------------------+-------------------------+----------------+
> | kernel   |    mqueue_rcv (ns)       | mqueue_send (ns)        |    getppid     |
> | version  | min avg  max   variation | min avg max   variation | (ns) variation |
> +----------+--------------------------+-------------------------+----------------+
> | 4.18.45  | 351 382 17533     base   | 383 410 13178     base  | 149      base  |
> | 5.8-good | 380 392  7156   -2,55%   | 376 384  6225    6,77%  | 169   -11,83%  |
> | 5.8-bad  | 524 530  5310  -27,92%   | 512 519  8775  -21,00%  | 169   -11,83%  |
> | 5.10     | 520 533  4078  -28,33%   | 518 534  8108  -23,22%  | 167   -10,78%  |
> | 5.15     | 431 444  8440  -13,96%   | 425 437  6170   -6,18%  | 171   -12,87%  |
> | 6.03     | 474 614  3881  -37,79%   | 482 693   931  -40,84%  | 171   -12,87%  |
> +----------+--------------------------+-------------------------+-----------------

Hi Sven!

Thank you for the report! As Waiman said, it's not a secret that per-object tracking
makes individual allocations slower, but for the majority of workloads this is well
compensated by significant memory savings and lower fragmentation.

It seems there is another regression between 5.15 and 6.03, which is a separate
topic, but how big is the real regression between 4.18 and 5.15? The benchmark
shows about 14%, but is your real workload suffering at the same level?
If the answer is yes, the right thing to do is to introduce some sort of
mqueue-specific caching for allocated objects.

Thanks!

Roman

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset.
       [not found] <PH0PR11MB562641BC03630B4B7A227FD7E9189@PH0PR11MB5626.namprd11.prod.outlook.com>
                   ` (2 preceding siblings ...)
  2022-12-05 20:08 ` Roman Gushchin
@ 2022-12-06  2:15 ` Roman Gushchin
  2022-12-07  9:44   ` Luther, Sven
  3 siblings, 1 reply; 7+ messages in thread
From: Roman Gushchin @ 2022-12-06  2:15 UTC (permalink / raw)
  To: Luther, Sven
  Cc: linux-kernel, regressions, Roman Gushchin, Andrew Morton,
	Christoph Lameter, Johannes Weiner, Michal Hocko, Shakeel Butt,
	linux-mm, Vlastimil Babka, kernel-team, Eric W. Biederman,
	Muchun Song, Waiman Long, Alexey Gladkov, Bonn, Jonas

On Mon, Dec 05, 2022 at 02:55:48PM +0000, Luther, Sven wrote:
> #regzbot ^introduced 10befea91b61c4e2c2d1df06a2e978d182fcf792
> 
> We are making heavy use of mqueues, and noticed a degradation of performance between 4.18 & 5.10 linux kernels.
> 
> After a gross per-version tracing, we did kernel bisection between 5.8 and 5.9
> and traced the issue to a 10 patches (of which 9 where skipped as they didn't boot) between:
> 
> 
> commit 10befea91b61c4e2c2d1df06a2e978d182fcf792 (HEAD, refs/bisect/bad)
> Author: Roman Gushchin <guro@fb.com>
> Date:   Thu Aug 6 23:21:27 2020 -0700
> 
>     mm: memcg/slab: use a single set of kmem_caches for all allocations
> 
> and:
> 
> commit 286e04b8ed7a04279ae277f0f024430246ea5eec (refs/bisect/good-286e04b8ed7a04279ae277f0f024430246ea5eec)
> Author: Roman Gushchin <guro@fb.com>
> Date:   Thu Aug 6 23:20:52 2020 -0700
> 
>     mm: memcg/slab: allocate obj_cgroups for non-root slab pages
> 
> All of them are part of the "The new cgroup slab memory controller" patchset:
> 
>   https://lore.kernel.org/all/20200623174037.3951353-18-guro@fb.com/T/
> 
> from Roman Gushchin, which moves the accounting for page level to the object level.
> 
> Measurements where done using the a test programmtest, which measures mix/average/max time mqueue_send/mqueue_rcv,
> and average for getppid, both measured over 100 000 runs. Results are shown in the following table
> 
> +----------+--------------------------+-------------------------+----------------+
> | kernel   |    mqueue_rcv (ns)       | mqueue_send (ns)        |    getppid     |
> | version  | min avg  max   variation | min avg max   variation | (ns) variation |
> +----------+--------------------------+-------------------------+----------------+
> | 4.18.45  | 351 382 17533     base   | 383 410 13178     base  | 149      base  |
> | 5.8-good | 380 392  7156   -2,55%   | 376 384  6225    6,77%  | 169   -11,83%  |
> | 5.8-bad  | 524 530  5310  -27,92%   | 512 519  8775  -21,00%  | 169   -11,83%  |
> | 5.10     | 520 533  4078  -28,33%   | 518 534  8108  -23,22%  | 167   -10,78%  |
> | 5.15     | 431 444  8440  -13,96%   | 425 437  6170   -6,18%  | 171   -12,87%  |
> | 6.03     | 474 614  3881  -37,79%   | 482 693   931  -40,84%  | 171   -12,87%  |
> +----------+--------------------------+-------------------------+-----------------
>

Hi Sven!

To prove the concept of local msg caching, I've put together a patch (attached below).
In my test setup it seems to resolve most of the regression. Would you mind
giving it a try? (It's only tested on my local vm, don't treat it as production
code.) If it fixes the regression, I can invest more time into it and post
it in an upstreamable form.

Here are my results (5 runs each):

Original (current mm tree, 6.1+):
RX: 1122/1202/114001 1197/1267/26517 1109/1173/29613 1091/1165/54434 1091/1160/26302
TX: 1176/1255/38168  1252/1360/27683 1165/1226/41454 1145/1222/90040 1146/1214/26595

No accounting:
RX: 984/1053/31268    1024/1091/39105 1018/1077/61515  999/1065/30423  1008/1060/115284
TX: 1020/1097/137690  1065/1143/31448 1055/1130/133278 1032/1106/52372 1043/1099/25705

Patched:
RX: 1033/1165/38579 1030/1108/43703 1022/1114/25653 1008/1110/38462 1089/1136/29120
TX: 1047/1184/25373 1048/1116/25425 1034/1122/61275 1022/1121/24636 1105/1155/46600

Thanks!

--

From 7742e074c25eb51d8e606585f0f8b06228d907f5 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Mon, 5 Dec 2022 18:13:15 -0800
Subject: [PATCH] ipc/mqueue: introduce msg cache

Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
 ipc/mqueue.c  | 20 ++++++++++---
 ipc/msg.c     | 12 ++++----
 ipc/msgutil.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++----
 ipc/util.h    |  8 ++++--
 4 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 467a194b8a2e..5c6fec8e9701 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -131,6 +131,11 @@ struct ext_wait_queue {		/* queue of sleeping tasks */
 	int state;		/* one of STATE_* values */
 };
 
+struct pcpu_msg_cache;
+struct msg_cache {
+	struct pcpu_msg_cache __percpu *pcpu_cache;
+};
+
 struct mqueue_inode_info {
 	spinlock_t lock;
 	struct inode vfs_inode;
@@ -152,6 +157,8 @@ struct mqueue_inode_info {
 	/* for tasks waiting for free space and messages, respectively */
 	struct ext_wait_queue e_wait_q[2];
 
+	struct msg_cache msg_cache;
+
 	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
 };
 
@@ -368,6 +375,9 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize;
 		if (mq_bytes + mq_treesize < mq_bytes)
 			goto out_inode;
+		ret = init_msg_cache(&info->msg_cache);
+		if (ret)
+			goto out_inode;
 		mq_bytes += mq_treesize;
 		info->ucounts = get_ucounts(current_ucounts());
 		if (info->ucounts) {
@@ -531,9 +541,11 @@ static void mqueue_evict_inode(struct inode *inode)
 
 	list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
 		list_del(&msg->m_list);
-		free_msg(msg);
+		free_msg(msg, &info->msg_cache);
 	}
 
+	free_msg_cache(&info->msg_cache);
+
 	if (info->ucounts) {
 		unsigned long mq_bytes, mq_treesize;
 
@@ -1108,7 +1120,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 
 	/* First try to allocate memory, before doing anything with
 	 * existing queues. */
-	msg_ptr = load_msg(u_msg_ptr, msg_len);
+	msg_ptr = load_msg(u_msg_ptr, msg_len, &info->msg_cache);
 	if (IS_ERR(msg_ptr)) {
 		ret = PTR_ERR(msg_ptr);
 		goto out_fput;
@@ -1170,7 +1182,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 	wake_up_q(&wake_q);
 out_free:
 	if (ret)
-		free_msg(msg_ptr);
+		free_msg(msg_ptr, &info->msg_cache);
 out_fput:
 	fdput(f);
 out:
@@ -1273,7 +1285,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 			store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
 			ret = -EFAULT;
 		}
-		free_msg(msg_ptr);
+		free_msg(msg_ptr, &info->msg_cache);
 	}
 out_fput:
 	fdput(f);
diff --git a/ipc/msg.c b/ipc/msg.c
index fd08b3cb36d7..fcc09f848490 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -287,7 +287,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 
 	list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
 		percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1);
-		free_msg(msg);
+		free_msg(msg, NULL);
 	}
 	percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes);
 	ipc_update_pid(&msq->q_lspid, NULL);
@@ -861,7 +861,7 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
 	if (mtype < 1)
 		return -EINVAL;
 
-	msg = load_msg(mtext, msgsz);
+	msg = load_msg(mtext, msgsz, NULL);
 	if (IS_ERR(msg))
 		return PTR_ERR(msg);
 
@@ -954,7 +954,7 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
 out_unlock1:
 	rcu_read_unlock();
 	if (msg != NULL)
-		free_msg(msg);
+		free_msg(msg, NULL);
 	return err;
 }
 
@@ -1049,7 +1049,7 @@ static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
 	/*
 	 * Create dummy message to copy real message to.
 	 */
-	copy = load_msg(buf, bufsz);
+	copy = load_msg(buf, bufsz, NULL);
 	if (!IS_ERR(copy))
 		copy->m_ts = bufsz;
 	return copy;
@@ -1058,7 +1058,7 @@ static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
 static inline void free_copy(struct msg_msg *copy)
 {
 	if (copy)
-		free_msg(copy);
+		free_msg(copy, NULL);
 }
 #else
 static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
@@ -1256,7 +1256,7 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
 	}
 
 	bufsz = msg_handler(buf, msg, bufsz);
-	free_msg(msg);
+	free_msg(msg, NULL);
 
 	return bufsz;
 }
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index d0a0e877cadd..8fe64bb3f48d 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -39,16 +39,65 @@ struct msg_msgseg {
 	/* the next part of the message follows immediately */
 };
 
+struct pcpu_msg_cache {
+	struct msg_msg *msg;
+	struct task_struct *curr;
+	size_t len;
+};
+
+struct msg_cache {
+	struct pcpu_msg_cache __percpu *pcpu_cache;
+};
+
 #define DATALEN_MSG	((size_t)PAGE_SIZE-sizeof(struct msg_msg))
 #define DATALEN_SEG	((size_t)PAGE_SIZE-sizeof(struct msg_msgseg))
 
+int init_msg_cache(struct msg_cache *cache)
+{
+	cache->pcpu_cache = alloc_percpu(struct pcpu_msg_cache);
+	if (!cache->pcpu_cache)
+		return -ENOMEM;
 
-static struct msg_msg *alloc_msg(size_t len)
+	return 0;
+}
+
+void free_msg_cache(struct msg_cache *cache)
+{
+	int cpu;
+
+	if (!cache->pcpu_cache)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct pcpu_msg_cache *pc = per_cpu_ptr(cache->pcpu_cache, cpu);
+
+		if (pc->msg)
+			free_msg(pc->msg, NULL);
+	}
+
+	free_percpu(cache->pcpu_cache);
+}
+
+static struct msg_msg *alloc_msg(size_t len, struct msg_cache *cache)
 {
 	struct msg_msg *msg;
 	struct msg_msgseg **pseg;
 	size_t alen;
 
+	if (cache) {
+		struct pcpu_msg_cache *pc;
+
+		msg = NULL;
+		pc = get_cpu_ptr(cache->pcpu_cache);
+		if (pc->msg && pc->curr == get_current() && pc->len == len) {
+			msg = pc->msg;
+			pc->msg = NULL;
+		}
+		put_cpu_ptr(cache->pcpu_cache);
+		if (msg)
+			return msg;
+	}
+
 	alen = min(len, DATALEN_MSG);
 	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);
 	if (msg == NULL)
@@ -77,18 +126,19 @@ static struct msg_msg *alloc_msg(size_t len)
 	return msg;
 
 out_err:
-	free_msg(msg);
+	free_msg(msg, cache);
 	return NULL;
 }
 
-struct msg_msg *load_msg(const void __user *src, size_t len)
+struct msg_msg *load_msg(const void __user *src, size_t len,
+			 struct msg_cache *cache)
 {
 	struct msg_msg *msg;
 	struct msg_msgseg *seg;
 	int err = -EFAULT;
 	size_t alen;
 
-	msg = alloc_msg(len);
+	msg = alloc_msg(len, cache);
 	if (msg == NULL)
 		return ERR_PTR(-ENOMEM);
 
@@ -111,7 +161,7 @@ struct msg_msg *load_msg(const void __user *src, size_t len)
 	return msg;
 
 out_err:
-	free_msg(msg);
+	free_msg(msg, cache);
 	return ERR_PTR(err);
 }
 #ifdef CONFIG_CHECKPOINT_RESTORE
@@ -166,10 +216,27 @@ int store_msg(void __user *dest, struct msg_msg *msg, size_t len)
 	return 0;
 }
 
-void free_msg(struct msg_msg *msg)
+void free_msg(struct msg_msg *msg, struct msg_cache *cache)
 {
 	struct msg_msgseg *seg;
 
+	if (cache) {
+		struct pcpu_msg_cache *pc;
+		bool cached = false;
+
+		pc = get_cpu_ptr(cache->pcpu_cache);
+		if (!pc->msg) {
+			pc->curr = get_current();
+			pc->len = msg->m_ts;
+			pc->msg = msg;
+			cached = true;
+		}
+		put_cpu_ptr(cache->pcpu_cache);
+
+		if (cached)
+			return;
+	}
+
 	security_msg_msg_free(msg);
 
 	seg = msg->next;
diff --git a/ipc/util.h b/ipc/util.h
index b2906e366539..a2da266386aa 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -196,8 +196,12 @@ static inline void ipc_update_pid(struct pid **pos, struct pid *pid)
 int ipc_parse_version(int *cmd);
 #endif
 
-extern void free_msg(struct msg_msg *msg);
-extern struct msg_msg *load_msg(const void __user *src, size_t len);
+struct msg_cache;
+extern int init_msg_cache(struct msg_cache *cache);
+extern void free_msg_cache(struct msg_cache *cache);
+extern void free_msg(struct msg_msg *msg, struct msg_cache *cache);
+extern struct msg_msg *load_msg(const void __user *src, size_t len,
+				struct msg_cache *cache);
 extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst);
 extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len);
 
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset.
  2022-12-06  2:15 ` Roman Gushchin
@ 2022-12-07  9:44   ` Luther, Sven
  0 siblings, 0 replies; 7+ messages in thread
From: Luther, Sven @ 2022-12-07  9:44 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-kernel, regressions, Roman Gushchin, Andrew Morton,
	Christoph Lameter, Johannes Weiner, Michal Hocko, Shakeel Butt,
	linux-mm, Vlastimil Babka, kernel-team, Eric W. Biederman,
	Muchun Song, Waiman Long, Alexey Gladkov, Bonn, Jonas

Hi,

Thanks for your replies and suggestions.

First, 6.03 was indeed 6.0.3 (the latest stable at the time I did the measurements), so as suggested I redid the measurements with 6.1-rc8 (which seems a bit worse than 6.0.3).

I also added the patch provided by Roman Gushchin (the "6.1-rc8 p" row), which is indeed a substantial improvement on this test.

I also added the variation for the minimum values, in addition to the averages, since I have the feeling that the minimum shows less variance than the average, and may therefore be better for comparisons.

Concerning the test case, obviously our application doesn't do 100 000 mqueue calls in a row, but still makes heavy use of them, and we saw the degradation in it.

+-----------+---------------------------------+--------------------------------+----------------+
| kernel    | mqueue_rcv (ns)     variation   | mqueue_send (ns)   variation   |    getppid     |
| version   | min avg  max      min      avg  | min avg max       min     avg  | (ns) variation |
+-----------+---------------------------------+--------------------------------+----------------+
| 4.18.45   | 351 382 17533    base     base  | 383 410 13178     base    base | 149      base  |
| 5.8-good  | 380 392  7156   -7,63%  -2,55%  | 376 384  6225    1,86%   6,77% | 169   -11,83%  |
| 5.8-bad   | 524 530  5310  -33,02% -27,92%  | 512 519  8775  -25,20% -21,00% | 169   -11,83%  |
| 5.10      | 520 533  4078  -32,20% -28,33%  | 518 534  8108  -26,06% -23,22% | 167   -10,78%  |
| 5.15      | 431 444  8440  -18,56% -13,96%  | 425 437  6170   -9,88%  -6,18% | 171   -12,87%  |
| 6.0.3     | 474 614  3881  -25,95% -37,79%  | 482 693   931  -20,54% -40,84% | 171   -12,87%  |
| 6.1-rc8   | 496 509  8804  -29,23% -24,95%  | 493 512  5748  -22,31% -19,92% | 174   -14,37%  |
| 6.1-rc8 p | 377 381  6374   -6,90%   0,26%  | 339 343 13224   12,98%  19,53% | 174   -14,37%  |
+-----------+---------------------------------+--------------------------------+----------------+

So, to summarize: the patch introduced in 5.14 and reverted in 5.18 was an optimization for this issue (I'm not sure why it was reverted), and the caching patch Roman provided brings performance almost back to where we were in 4.18 (or even better in the TX case). I wonder whether a combination of both would further optimize mqueue performance.

Friendly,

Sven Luther

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-12-07  9:45 UTC | newest]

Thread overview: 7+ messages
     [not found] <PH0PR11MB562641BC03630B4B7A227FD7E9189@PH0PR11MB5626.namprd11.prod.outlook.com>
2022-12-05 15:05 ` [Regression] mqueue performance degradation after "The new cgroup slab memory controller" patchset Luther, Sven
2022-12-05 16:06 ` Shakeel Butt
2022-12-05 16:30   ` Waiman Long
2022-12-05 17:25     ` Shakeel Butt
2022-12-05 20:08 ` Roman Gushchin
2022-12-06  2:15 ` Roman Gushchin
2022-12-07  9:44   ` Luther, Sven
