Dear Linux folks, On 08/28/18 07:27, Paul Menzel wrote: > Using `sleepgraph.py` [1][2] to profile the suspend to RAM (STR) > times, shows that `ksys_enter` takes a noticeable amount of time. > > 13 ms on a TUXEDO Book BU1406 with the NVMe device *SAMSUNG > MZVKW512HMJP-00000*, which is quite good, and over a 60 ms on ASRock > E350M1 with an SSD SanDisk device. It’s 109 ms on the ASRock E350M1, which makes up one third of the total time to reach ACPI S3. > Adding `devicefilter: ksys_sync` to `config/suspend-callgraph.cfg`, > and running `sudo ./sleepgraph.py -config > config/suspend-callgraph.cfg`, the attached HTML output shows, that > `iterate_supers` takes 6 to 7 ms twice. > > •   `iterate_supers` (6.316 ms @ 388.944557) > •   `iterate_supers` (0.201 ms @ 388.950873) > •   `iterate_supers` (7.421 ms @ 388.951074) > > Normally, `sync_inodes_one_sb` only takes microseconds, but once in > both cases it takes several milliseconds. > > •   sync_inodes_one_sb (0.001 ms @ 388.944660) > •   sync_inodes_one_sb (5.978 ms @ 388.944665) > •   sync_inodes_one_sb (0.001 ms @ 388.950645) > > Please find an excerpt from the call graph from ftrace below, and > note the time increase between 388.944751 and 388.950636. > >>   388.944700 |   0) sleepgr-11355  |               |          wb_wait_for_completion() { >>   388.944701 |   0) sleepgr-11355  |               |            _cond_resched() { >>   388.944701 |   0) sleepgr-11355  |   0.064 us    |              rcu_all_qs(); >>   388.944702 |   0) sleepgr-11355  |   0.664 us    |            } /* _cond_resched */ >>   388.944702 |   0) sleepgr-11355  |   0.067 us    |            init_wait_entry(); >>   388.944703 |   0) sleepgr-11355  |               |            prepare_to_wait_event() { >>   388.944703 |   0) sleepgr-11355  |   0.080 us    |              _raw_spin_lock_irqsave(); >>   388.944704 |   0) sleepgr-11355  |   0.073 us    |              _raw_spin_unlock_irqrestore(); >>   388.944704 |   0) sleepgr-11355  |   1.388 us    |            } /* prepare_to_wait_event */ >>   388.944705 |   0) sleepgr-11355  |               |            schedule() { >>   388.944705 |   0) sleepgr-11355  |   0.085 us    |              rcu_note_context_switch(); >>   388.944706 |   0) sleepgr-11355  |   0.064 us    |              _raw_spin_lock(); >>   388.944707 |   0) sleepgr-11355  |   0.093 us    |              update_rq_clock(); >>   388.944708 |   0) sleepgr-11355  |               |              deactivate_task() { >>   388.944708 |   0) sleepgr-11355  |               |                dequeue_task_fair() { >>   388.944708 |   0) sleepgr-11355  |               |                  dequeue_entity() { >>   388.944709 |   0) sleepgr-11355  |               |                    update_curr() { >>   388.944709 |   0) sleepgr-11355  |   0.095 us    |                      update_min_vruntime(); >>   388.944710 |   0) sleepgr-11355  |   0.126 us    |                      cpuacct_charge(); >>   388.944710 |   0) sleepgr-11355  |               |                      __cgroup_account_cputime() { >>   388.944711 |   0) sleepgr-11355  |   0.055 us    |                        cgroup_rstat_updated(); >>   388.944711 |   0) sleepgr-11355  |   0.675 us    |                      } /* __cgroup_account_cputime */ >>   388.944712 |   0) sleepgr-11355  |   2.779 us    |                    } /* update_curr */ >>   388.944712 |   0) sleepgr-11355  |   0.121 us    |                    __update_load_avg_se(); >>   388.944713 |   0) sleepgr-11355  |   0.118 us    |                    __update_load_avg_cfs_rq(); >>   388.944713 |   0) sleepgr-11355  |   0.056 us    |                    clear_buddies(); >>   388.944714 |   0) sleepgr-11355  |   0.066 us    |                    account_entity_dequeue(); >>   388.944714 |   0) sleepgr-11355  |   0.050 us    |                    update_cfs_group(); >>   388.944715 |   0) sleepgr-11355  |   6.127 us    |                  } /* dequeue_entity */ >>   388.944715 |   0) sleepgr-11355  |               |                  dequeue_entity() { >>   388.944715 |   0) sleepgr-11355  |               |                    update_curr() { >>   388.944716 |   0) sleepgr-11355  |   0.097 us    |                      __calc_delta(); >>   388.944716 |   0) sleepgr-11355  |   0.071 us    |                      update_min_vruntime(); >>   388.944717 |   0) sleepgr-11355  |   1.473 us    |                    } /* update_curr */ >>   388.944717 |   0) sleepgr-11355  |   0.142 us    |                    __update_load_avg_se(); >>   388.944718 |   0) sleepgr-11355  |   0.100 us    |                    __update_load_avg_cfs_rq(); >>   388.944719 |   0) sleepgr-11355  |   0.069 us    |                    clear_buddies(); >>   388.944719 |   0) sleepgr-11355  |   0.098 us    |                    account_entity_dequeue(); >>   388.944720 |   0) sleepgr-11355  |               |                    update_cfs_group() { >>   388.944720 |   0) sleepgr-11355  |   0.110 us    |                      reweight_entity(); >>   388.944721 |   0) sleepgr-11355  |   0.847 us    |                    } /* update_cfs_group */ >>   388.944721 |   0) sleepgr-11355  |   6.197 us    |                  } /* dequeue_entity */ >>   388.944722 |   0) sleepgr-11355  |   0.053 us    |                  hrtick_update(); >>   388.944722 |   0) sleepgr-11355  |   14.141 us   |                } /* dequeue_task_fair */ >>   388.944723 |   0) sleepgr-11355  |   14.827 us   |              } /* deactivate_task */ >>   388.944723 |   0) sleepgr-11355  |               |              pick_next_task_fair() { >>   388.944723 |   0) sleepgr-11355  |               |                update_blocked_averages() { >>   388.944724 |   0) sleepgr-11355  |   0.070 us    |                  _raw_spin_lock_irqsave(); >>   388.944724 |   0) sleepgr-11355  |   0.090 us    |                  update_rq_clock(); >>   388.944725 |   0) sleepgr-11355  |   0.120 us    |                  __update_load_avg_cfs_rq(); >>   388.944726 |   0) sleepgr-11355  |   0.162 us    |                  __update_load_avg_se(); >>   388.944727 |   0) sleepgr-11355  |   0.090 us    |                  __update_load_avg_cfs_rq(); >>   388.944727 |   0) sleepgr-11355  |   0.081 us    |                  __update_load_avg_cfs_rq(); >>   388.944728 |   0) sleepgr-11355  |               |                  update_rt_rq_load_avg() { >>   388.944728 |   0) sleepgr-11355  |   0.063 us    |                    __accumulate_pelt_segments(); >>   388.944729 |   0) sleepgr-11355  |   0.785 us    |                  } /* update_rt_rq_load_avg */ >>   388.944729 |   0) sleepgr-11355  |               |                  update_dl_rq_load_avg() { >>   388.944730 |   0) sleepgr-11355  |   0.066 us    |                    __accumulate_pelt_segments(); >>   388.944730 |   0) sleepgr-11355  |   0.763 us    |                  } /* update_dl_rq_load_avg */ >>   388.944731 |   0) sleepgr-11355  |   0.061 us    |                  _raw_spin_unlock_irqrestore(); >>   388.944731 |   0) sleepgr-11355  |   7.452 us    |                } /* update_blocked_averages */ >>   388.944731 |   0) sleepgr-11355  |               |                load_balance() { >>   388.944732 |   0) sleepgr-11355  |               |                  find_busiest_group() { >>   388.944732 |   0) sleepgr-11355  |   0.100 us    |                    update_nohz_stats(); >>   388.944733 |   0) sleepgr-11355  |   0.057 us    |                    idle_cpu(); >>   388.944734 |   0) sleepgr-11355  |   0.098 us    |                    update_nohz_stats(); >>   388.944734 |   0) sleepgr-11355  |   2.407 us    |                  } /* find_busiest_group */ >>   388.944735 |   0) sleepgr-11355  |   3.120 us    |                } /* load_balance */ >>   388.944735 |   0) sleepgr-11355  |   0.065 us    |                __msecs_to_jiffies(); >>   388.944736 |   0) sleepgr-11355  |               |                load_balance() { >>   388.944736 |   0) sleepgr-11355  |               |                  find_busiest_group() { >>   388.944736 |   0) sleepgr-11355  |   0.087 us    |                    update_nohz_stats(); >>   388.944737 |   0) sleepgr-11355  |   0.063 us    |                    idle_cpu(); >>   388.944738 |   0) sleepgr-11355  |   0.086 us    |                    update_nohz_stats(); >>   388.944739 |   0) sleepgr-11355  |   0.068 us    |                    update_nohz_stats(); >>   388.944739 |   0) sleepgr-11355  |   0.098 us    |                    idle_cpu(); >>   388.944740 |   0) sleepgr-11355  |   0.094 us    |                    update_nohz_stats(); >>   388.944741 |   0) sleepgr-11355  |   0.065 us    |                    idle_cpu(); >>   388.944741 |   0) sleepgr-11355  |   5.160 us    |                  } /* find_busiest_group */ >>   388.944742 |   0) sleepgr-11355  |   5.831 us    |                } /* load_balance */ >>   388.944742 |   0) sleepgr-11355  |   0.062 us    |                __msecs_to_jiffies(); >>   388.944743 |   0) sleepgr-11355  |   0.065 us    |                _raw_spin_lock(); >>   388.944743 |   0) sleepgr-11355  |   20.205 us   |              } /* pick_next_task_fair */ >>   388.944744 |   0) sleepgr-11355  |               |              pick_next_task_idle() { >>   388.944744 |   0) sleepgr-11355  |               |                put_prev_task_fair() { >>   388.944745 |   0) sleepgr-11355  |               |                  put_prev_entity() { >>   388.944745 |   0) sleepgr-11355  |   0.064 us    |                    check_cfs_rq_runtime(); >>   388.944746 |   0) sleepgr-11355  |   0.713 us    |                  } /* put_prev_entity */ >>   388.944746 |   0) sleepgr-11355  |               |                  put_prev_entity() { >>   388.944746 |   0) sleepgr-11355  |   0.068 us    |                    check_cfs_rq_runtime(); >>   388.944747 |   0) sleepgr-11355  |   0.691 us    |                  } /* put_prev_entity */ >>   388.944747 |   0) sleepgr-11355  |   2.633 us    |                } /* put_prev_task_fair */ >>   388.944748 |   0) sleepgr-11355  |   0.094 us    |                __update_idle_core(); >>   388.944748 |   0) sleepgr-11355  |   3.899 us    |              } /* pick_next_task_idle */ >>   388.944749 |   0) sleepgr-11355  |               |              enter_lazy_tlb() { >>   388.944749 |   0) sleepgr-11355  |               |                switch_mm() { >>   388.944749 |   0) sleepgr-11355  |               |                  switch_mm_irqs_off() { >>   388.944750 |   0) sleepgr-11355  |   0.244 us    |                    load_new_mm_cr3(); >>   388.944751 |   0) sleepgr-11355  |   0.955 us    |                  } /* switch_mm_irqs_off */ >>   388.944751 |   0) sleepgr-11355  |   1.472 us    |                } /* switch_mm */ >>   388.944751 |   0) sleepgr-11355  |   2.023 us    |              } /* enter_lazy_tlb */ >>   388.950636 |   0) sleepgr-11355  |   0.103 us    |              finish_task_switch(); >>   388.950638 |   0) sleepgr-11355  |   5932.632 us |            } /* schedule */ >>   388.950638 |   0) sleepgr-11355  |               |            prepare_to_wait_event() { >>   388.950638 |   0) sleepgr-11355  |   0.068 us    |              _raw_spin_lock_irqsave(); >>   388.950638 |   0) sleepgr-11355  |   0.053 us    |              _raw_spin_unlock_irqrestore(); >>   388.950639 |   0) sleepgr-11355  |   0.866 us    |            } /* prepare_to_wait_event */ >>   388.950639 |   0) sleepgr-11355  |               |            finish_wait() { >>   388.950639 |   0) sleepgr-11355  |   0.046 us    |              _raw_spin_lock_irqsave(); >>   388.950640 |   0) sleepgr-11355  |   0.052 us    |              _raw_spin_unlock_irqrestore(); >>   388.950640 |   0) sleepgr-11355  |   0.788 us    |            } /* finish_wait */ >>   388.950640 |   0) sleepgr-11355  |   5939.443 us |          } /* wb_wait_for_completion */ > > Can these outliers be avoided? As this is in `schedule`, is it > related to the used scheduler? At least the Intel Skylake i7-7500U > CPU @ 2.70GHz processor with four threads in the TUXEDO laptop should > have enough performance. > > No scheduler seems to be used for the NVMe device. > >     $ more /sys/devices/pci0000:00/0000:00:1d.0/0000:04:00.0/nvme/nvme0/nvme0n1/queue/scheduler >     [none] Please find the data for the SSD SanDisk device (AHCI) attached. Here is an excerpt. iterate_supers (53.363 ms @ 921.357477) sync_inodes_one_sb (52.417 ms @ 921.357858) sync_inodes_sb (52.414 ms @ 921.357859) […] wb_wait_for_completion (17.226 ms @ 921.358335) […] […] filemap_fdatawait_keep_errors (34.120 ms @ 921.375575) __filemap_fdatawait_range (34.117 ms @ 921.375576) […] io_schedule (33.307 ms @ 921.375586) […] […] Kind regards, Paul > [1]: https://01.org/suspendresume > [2]: https://github.com/01org/pm-graph