From 10c5fd0412ab71c14cca7a66c2407bfe3bb861af Mon Sep 17 00:00:00 2001 From: Michael Stapelberg Date: Tue, 10 Mar 2020 15:48:20 +0100 Subject: [PATCH] backing_dev_info: introduce min_bw/max_bw limits This allows working around long-standing significant performance issues when using mmap with files on FUSE file systems such as ObjFS. The page-writeback code tries to measure how quick file system backing devices are able to write data. Unfortunately, our usage pattern seems to hit an unfortunate code path: the kernel only ever measures the (non-representative) rising slope of the starting transfer, but the transfer is already over before it could possibly measure the representative steady-state. As a consequence, the FUSE write bandwidth sinks steadily down to 0 (!) and heavily throttles page dirtying in programs trying to write to FUSE. This patch adds a knob which allows avoiding this situation entirely on a per-file-system basis by restricting the minimum/maximum bandwidth. There are no negative effects expected from applying this patch. See also the discussion on the Linux Kernel Mailing List: https://lore.kernel.org/linux-fsdevel/CANnVG6n=ySfe1gOr=0ituQidp56idGARDKHzP0hv=ERedeMrMA@mail.gmail.com/ To inspect the measured bandwidth, check the BdiWriteBandwidth field in e.g. /sys/kernel/debug/bdi/0:93/stats. To pin the measured bandwidth to its default of 100 MB/s, use: echo 25600 > /sys/class/bdi/0:42/min_bw echo 25600 > /sys/class/bdi/0:42/max_bw --- include/linux/backing-dev-defs.h | 2 ++ include/linux/backing-dev.h | 3 +++ mm/backing-dev.c | 40 ++++++++++++++++++++++++++++++++ mm/page-writeback.c | 29 +++++++++++++++++++++++ 4 files changed, 74 insertions(+) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 4fc87dee005a..a29bcb8a799d 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -200,6 +200,8 @@ struct backing_dev_info { unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; + u64 min_bw; + u64 max_bw; /* * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f88197c1ffc2..4490bd03aec1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -111,6 +111,9 @@ static inline unsigned long wb_stat_error(void) int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_bw(struct backing_dev_info *bdi, u64 min_bw); +int bdi_set_max_bw(struct backing_dev_info *bdi, u64 max_bw); + /* * Flags in backing_dev_info::capability * diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 62f05f605fb5..5c10d4425976 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -201,6 +201,44 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio) +static ssize_t min_bw_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned long long limit; + ssize_t ret; + + ret = kstrtoull(buf, 10, &limit); + if (ret < 0) + return ret; + + ret = bdi_set_min_bw(bdi, limit); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(min_bw, bdi->min_bw) + +static ssize_t max_bw_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned long long limit; + ssize_t ret; + + ret = kstrtoull(buf, 10, &limit); + if (ret < 0) + return ret; + + ret = bdi_set_max_bw(bdi, limit); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(max_bw, bdi->max_bw) + static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *page) @@ -216,6 +254,8 @@ static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, + &dev_attr_min_bw.attr, + &dev_attr_max_bw.attr, &dev_attr_stable_pages_required.attr, NULL, }; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2caf780a42e7..c7c9eebc4c56 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -713,6 +713,22 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +int bdi_set_min_bw(struct backing_dev_info *bdi, u64 min_bw) +{ + spin_lock_bh(&bdi_lock); + bdi->min_bw = min_bw; + spin_unlock_bh(&bdi_lock); + return 0; +} + +int bdi_set_max_bw(struct backing_dev_info *bdi, u64 max_bw) +{ + spin_lock_bh(&bdi_lock); + bdi->max_bw = max_bw; + spin_unlock_bh(&bdi_lock); + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { @@ -1080,6 +1096,16 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) dtc->pos_ratio = pos_ratio; } +static u64 clamp_bw(struct backing_dev_info *bdi, u64 bw) { + if (bdi->min_bw > 0 && bw < bdi->min_bw) { + bw = bdi->min_bw; + } + if (bdi->max_bw > 0 && bw > bdi->max_bw) { + bw = bdi->max_bw; + } + return bw; +} + static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written) @@ -1103,12 +1129,15 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, bw *= HZ; if (unlikely(elapsed > period)) { bw = div64_ul(bw, elapsed); + bw = clamp_bw(wb->bdi, bw); avg = bw; goto out; } bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); + bw = clamp_bw(wb->bdi, bw); + /* * one more level of smoothing, for filtering out sudden spikes */ -- 2.25.1