* [PATCH] [RFC] Add support for uevents on block device idle changes @ 2009-11-17 14:37 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-17 14:37 UTC (permalink / raw) To: linux-kernel; +Cc: axboe, linux-hotplug, Matthew Garrett Userspace may wish to know whether a given disk is active or idle, for example to modify power management policy based on access patterns. This patch adds a deferrable timer to the block layer which will fire if the disk is idle for a user-definable period of time, generating a uevent. A uevent will also be generated if an access is received while the disk is classified as idle. This patch seems to work as designed, but introduces a noticable amount of userspace overhead in udevd. I'm guessing that this is because change events on block devices are normally associated with disk removal/insertion, so a large quantity of complex rules end up getting run in order to deal with RAID setup or whatever. Is there a better way to deliver these events? --- Documentation/ABI/testing/sysfs-block | 9 +++++ block/blk-core.c | 9 +++++ block/genhd.c | 55 +++++++++++++++++++++++++++++++++ fs/partitions/check.c | 3 ++ include/linux/genhd.h | 6 +++ 5 files changed, 82 insertions(+), 0 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 5f3beda..5519720 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -128,3 +128,12 @@ Description: preferred request size for workloads where sustained throughput is desired. If no optimal I/O size is reported this file contains 0. + +What: /sys/block/<disk>/idle_hysteresis +Date: November 2009 +Contact: Matthew Garrett <mjg@redhat.com> +Description: + Contains the number of milliseconds to wait after an access + before declaring that a disk is idle. Any accesses during + this time will reset the timer. "0" (the default) indicates + that no events will be generated. \ No newline at end of file diff --git a/block/blk-core.c b/block/blk-core.c index 71da511..f278817 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio) if (should_fail_request(bio)) goto end_io; + if (bio->bi_bdev->bd_disk->hysteresis_time && + bio_has_data(bio) && + !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer, + jiffies+msecs_to_jiffies + (bio->bi_bdev->bd_disk->hysteresis_time))) { + bio->bi_bdev->bd_disk->idle = 0; + schedule_work(&bio->bi_bdev->bd_disk->idle_notify); + } + /* * If this device has partitions, remap block n * of partition p to block n+start(p) of the disk. diff --git a/block/genhd.c b/block/genhd.c index 517e433..f59fbe0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -504,6 +504,26 @@ static int exact_lock(dev_t devt, void *data) return 0; } +static void disk_idle(unsigned long data) +{ + struct gendisk *gd = (struct gendisk *)data; + + gd->idle = 1; + schedule_work(&gd->idle_notify); +} + +static void disk_idle_notify_thread(struct work_struct *work) +{ + struct gendisk *gd = container_of(work, struct gendisk, idle_notify); + char event[] = "IDLE=0"; + char *envp[] = { event, NULL }; + + if (gd->idle) + event[5] = '1'; + + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -543,6 +563,10 @@ void add_disk(struct gendisk *disk) blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); + + init_timer(&disk->hysteresis_timer); + setup_timer(&disk->hysteresis_timer, disk_idle, (unsigned long)disk); + register_disk(disk); blk_register_queue(disk); @@ -861,6 +885,32 @@ static ssize_t disk_alignment_offset_show(struct device *dev, return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); } +static ssize_t disk_idle_hysteresis_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->hysteresis_time); +} + +static ssize_t disk_idle_hysteresis_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + unsigned long timeout; + int res; + + res = strict_strtoul(buf, 10, &timeout); + if (res) + return -EINVAL; + + disk->hysteresis_time = timeout; + + return count; +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -870,6 +920,8 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(idle_hysteresis, 0644, disk_idle_hysteresis_show, + disk_idle_hysteresis_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -890,6 +942,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_idle_hysteresis.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1183,6 +1236,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); INIT_WORK(&disk->async_notify, media_change_notify_thread); + INIT_WORK(&disk->idle_notify, + disk_idle_notify_thread); } return disk; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e1..d55dd29 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -652,6 +652,9 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + cancel_work_sync(&disk->idle_notify); + del_timer_sync(&disk->hysteresis_timer); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 297df45..7e969a5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kdev_t.h> #include <linux/rcupdate.h> +#include <linux/timer.h> #ifdef CONFIG_BLOCK @@ -163,10 +164,15 @@ struct gendisk { atomic_t sync_io; /* RAID */ struct work_struct async_notify; + struct work_struct idle_notify; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif int node_id; + + bool idle; + int hysteresis_time; + struct timer_list hysteresis_timer; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) -- 1.6.5.2 ^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH] [RFC] Add support for uevents on block device idle changes @ 2009-11-17 14:37 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-17 14:37 UTC (permalink / raw) To: linux-kernel; +Cc: axboe, linux-hotplug, Matthew Garrett Userspace may wish to know whether a given disk is active or idle, for example to modify power management policy based on access patterns. This patch adds a deferrable timer to the block layer which will fire if the disk is idle for a user-definable period of time, generating a uevent. A uevent will also be generated if an access is received while the disk is classified as idle. This patch seems to work as designed, but introduces a noticable amount of userspace overhead in udevd. I'm guessing that this is because change events on block devices are normally associated with disk removal/insertion, so a large quantity of complex rules end up getting run in order to deal with RAID setup or whatever. Is there a better way to deliver these events? --- Documentation/ABI/testing/sysfs-block | 9 +++++ block/blk-core.c | 9 +++++ block/genhd.c | 55 +++++++++++++++++++++++++++++++++ fs/partitions/check.c | 3 ++ include/linux/genhd.h | 6 +++ 5 files changed, 82 insertions(+), 0 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 5f3beda..5519720 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -128,3 +128,12 @@ Description: preferred request size for workloads where sustained throughput is desired. If no optimal I/O size is reported this file contains 0. + +What: /sys/block/<disk>/idle_hysteresis +Date: November 2009 +Contact: Matthew Garrett <mjg@redhat.com> +Description: + Contains the number of milliseconds to wait after an access + before declaring that a disk is idle. Any accesses during + this time will reset the timer. "0" (the default) indicates + that no events will be generated. \ No newline at end of file diff --git a/block/blk-core.c b/block/blk-core.c index 71da511..f278817 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio) if (should_fail_request(bio)) goto end_io; + if (bio->bi_bdev->bd_disk->hysteresis_time && + bio_has_data(bio) && + !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer, + jiffies+msecs_to_jiffies + (bio->bi_bdev->bd_disk->hysteresis_time))) { + bio->bi_bdev->bd_disk->idle = 0; + schedule_work(&bio->bi_bdev->bd_disk->idle_notify); + } + /* * If this device has partitions, remap block n * of partition p to block n+start(p) of the disk. diff --git a/block/genhd.c b/block/genhd.c index 517e433..f59fbe0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -504,6 +504,26 @@ static int exact_lock(dev_t devt, void *data) return 0; } +static void disk_idle(unsigned long data) +{ + struct gendisk *gd = (struct gendisk *)data; + + gd->idle = 1; + schedule_work(&gd->idle_notify); +} + +static void disk_idle_notify_thread(struct work_struct *work) +{ + struct gendisk *gd = container_of(work, struct gendisk, idle_notify); + char event[] = "IDLE=0"; + char *envp[] = { event, NULL }; + + if (gd->idle) + event[5] = '1'; + + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -543,6 +563,10 @@ void add_disk(struct gendisk *disk) blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); + + init_timer(&disk->hysteresis_timer); + setup_timer(&disk->hysteresis_timer, disk_idle, (unsigned long)disk); + register_disk(disk); blk_register_queue(disk); @@ -861,6 +885,32 @@ static ssize_t disk_alignment_offset_show(struct device *dev, return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); } +static ssize_t disk_idle_hysteresis_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->hysteresis_time); +} + +static ssize_t disk_idle_hysteresis_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + unsigned long timeout; + int res; + + res = strict_strtoul(buf, 10, &timeout); + if (res) + return -EINVAL; + + disk->hysteresis_time = timeout; + + return count; +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -870,6 +920,8 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(idle_hysteresis, 0644, disk_idle_hysteresis_show, + disk_idle_hysteresis_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -890,6 +942,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_idle_hysteresis.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1183,6 +1236,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); INIT_WORK(&disk->async_notify, media_change_notify_thread); + INIT_WORK(&disk->idle_notify, + disk_idle_notify_thread); } return disk; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e1..d55dd29 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -652,6 +652,9 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + cancel_work_sync(&disk->idle_notify); + del_timer_sync(&disk->hysteresis_timer); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 297df45..7e969a5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kdev_t.h> #include <linux/rcupdate.h> +#include <linux/timer.h> #ifdef CONFIG_BLOCK @@ -163,10 +164,15 @@ struct gendisk { atomic_t sync_io; /* RAID */ struct work_struct async_notify; + struct work_struct idle_notify; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif int node_id; + + bool idle; + int hysteresis_time; + struct timer_list hysteresis_timer; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) -- 1.6.5.2 ^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-17 14:37 ` Matthew Garrett @ 2009-11-17 15:55 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-17 15:55 UTC (permalink / raw) To: Matthew Garrett; +Cc: linux-kernel, axboe, linux-hotplug On Tue, Nov 17, 2009 at 15:37, Matthew Garrett <mjg@redhat.com> wrote: > Userspace may wish to know whether a given disk is active or idle, for > example to modify power management policy based on access patterns. This > patch adds a deferrable timer to the block layer which will fire if the > disk is idle for a user-definable period of time, generating a uevent. A > uevent will also be generated if an access is received while the disk is > classified as idle. > > This patch seems to work as designed, but introduces a noticable amount of > userspace overhead in udevd. I'm guessing that this is because change events > on block devices are normally associated with disk removal/insertion, so > a large quantity of complex rules end up getting run in order to deal with > RAID setup or whatever. Is there a better way to deliver these events? I guess, at the moment the disk tells it's idle, udev will open() the disk and look for changed signatures, and end its idle state. :) Uevents might not be the right interface, they are usually used if the device needs to be (re-)examined, which will the idle thing into a loop with the current setups, I guess. Maybe we can use a sysfs file which can be open()'d and something can watch with poll(), and gets woken up by the kernel, after the drive changes its state? MD raid, as an example, has files like this in sysfs to allow monitoring. That way, there is also no overhead, if the requesting process goes away, which is usually the nicer interface, than a global switch, which does not care about if the requesting process still exists. Thanks, Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-17 15:55 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-17 15:55 UTC (permalink / raw) To: Matthew Garrett; +Cc: linux-kernel, axboe, linux-hotplug On Tue, Nov 17, 2009 at 15:37, Matthew Garrett <mjg@redhat.com> wrote: > Userspace may wish to know whether a given disk is active or idle, for > example to modify power management policy based on access patterns. This > patch adds a deferrable timer to the block layer which will fire if the > disk is idle for a user-definable period of time, generating a uevent. A > uevent will also be generated if an access is received while the disk is > classified as idle. > > This patch seems to work as designed, but introduces a noticable amount of > userspace overhead in udevd. I'm guessing that this is because change events > on block devices are normally associated with disk removal/insertion, so > a large quantity of complex rules end up getting run in order to deal with > RAID setup or whatever. Is there a better way to deliver these events? I guess, at the moment the disk tells it's idle, udev will open() the disk and look for changed signatures, and end its idle state. :) Uevents might not be the right interface, they are usually used if the device needs to be (re-)examined, which will the idle thing into a loop with the current setups, I guess. Maybe we can use a sysfs file which can be open()'d and something can watch with poll(), and gets woken up by the kernel, after the drive changes its state? MD raid, as an example, has files like this in sysfs to allow monitoring. That way, there is also no overhead, if the requesting process goes away, which is usually the nicer interface, than a global switch, which does not care about if the requesting process still exists. Thanks, Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-17 15:55 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-17 16:09 ` David Zeuthen -1 siblings, 0 replies; 73+ messages in thread From: David Zeuthen @ 2009-11-17 16:09 UTC (permalink / raw) To: Kay Sievers; +Cc: Matthew Garrett, linux-kernel, axboe, linux-hotplug On Tue, 2009-11-17 at 16:55 +0100, Kay Sievers wrote: > I guess, at the moment the disk tells it's idle, udev will open() the > disk and look for changed signatures, and end its idle state. :) Huh, this will probably cause interesting loops then ;-) > Uevents might not be the right interface, they are usually used if the > device needs to be (re-)examined, which will the idle thing into a > loop with the current setups, I guess. Yeah. > Maybe we can use a sysfs file which can be open()'d and something can > watch with poll(), and gets woken up by the kernel, after the drive > changes its state? MD raid, as an example, has files like this in > sysfs to allow monitoring. That way, there is also no overhead, if the > requesting process goes away, which is usually the nicer interface, > than a global switch, which does not care about if the requesting > process still exists. Can't we just use /sys/block/sdX/stat? I believe this file already has the property that the contents of the file will stay constant exactly when the device is idle. Being able to use poll() for change notifications seems like a good interface. Thanks, David ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-17 16:09 ` David Zeuthen 0 siblings, 0 replies; 73+ messages in thread From: David Zeuthen @ 2009-11-17 16:09 UTC (permalink / raw) To: Kay Sievers; +Cc: Matthew Garrett, linux-kernel, axboe, linux-hotplug On Tue, 2009-11-17 at 16:55 +0100, Kay Sievers wrote: > I guess, at the moment the disk tells it's idle, udev will open() the > disk and look for changed signatures, and end its idle state. :) Huh, this will probably cause interesting loops then ;-) > Uevents might not be the right interface, they are usually used if the > device needs to be (re-)examined, which will the idle thing into a > loop with the current setups, I guess. Yeah. > Maybe we can use a sysfs file which can be open()'d and something can > watch with poll(), and gets woken up by the kernel, after the drive > changes its state? MD raid, as an example, has files like this in > sysfs to allow monitoring. That way, there is also no overhead, if the > requesting process goes away, which is usually the nicer interface, > than a global switch, which does not care about if the requesting > process still exists. Can't we just use /sys/block/sdX/stat? I believe this file already has the property that the contents of the file will stay constant exactly when the device is idle. Being able to use poll() for change notifications seems like a good interface. Thanks, David ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-17 16:09 ` [PATCH] [RFC] Add support for uevents on block device idle David Zeuthen @ 2009-11-17 18:57 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-17 18:57 UTC (permalink / raw) To: David Zeuthen; +Cc: Kay Sievers, linux-kernel, axboe, linux-hotplug Ok. How about something like this? It adds an extra field to the stat file and introduces David's suggestion of making it pollable. commit ba6d4c7ab7940ae8dc11a884281d0a36b20455b9 Author: Matthew Garrett <mjg@redhat.com> Date: Mon Nov 16 17:44:03 2009 -0500 [RFC] Add support for uevents on block device idle changes Userspace may wish to know whether a given disk is active or idle, for example to modify power management policy based on access patterns. This patch adds a deferrable timer to the block layer which will fire if the disk is idle for a user-definable period of time, generating a uevent. A uevent will also be generated if an access is received while the disk is classified as idle. diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 5f3beda..8747f42 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -3,7 +3,7 @@ Date: February 2008 Contact: Jerome Marchand <jmarchan@redhat.com> Description: The /sys/block/<disk>/stat files displays the I/O - statistics of disk <disk>. They contain 11 fields: + statistics of disk <disk>. They contain 12 fields: 1 - reads completed succesfully 2 - reads merged 3 - sectors read @@ -15,6 +15,7 @@ Description: 9 - I/Os currently in progress 10 - time spent doing I/Os (ms) 11 - weighted time spent doing I/Os (ms) + 12 - 1 if the disk is idle (determined by idle_hysteresis) For more details refer Documentation/iostats.txt @@ -128,3 +129,12 @@ Description: preferred request size for workloads where sustained throughput is desired. If no optimal I/O size is reported this file contains 0. + +What: /sys/block/<disk>/idle_hysteresis +Date: November 2009 +Contact: Matthew Garrett <mjg@redhat.com> +Description: + Contains the number of milliseconds to wait after an access + before declaring that a disk is idle. Any accesses during + this time will reset the timer. "0" (the default) indicates + that no events will be generated. \ No newline at end of file diff --git a/block/blk-core.c b/block/blk-core.c index 71da511..f278817 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio) if (should_fail_request(bio)) goto end_io; + if (bio->bi_bdev->bd_disk->hysteresis_time && + bio_has_data(bio) && + !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer, + jiffies+msecs_to_jiffies + (bio->bi_bdev->bd_disk->hysteresis_time))) { + bio->bi_bdev->bd_disk->idle = 0; + schedule_work(&bio->bi_bdev->bd_disk->idle_notify); + } + /* * If this device has partitions, remap block n * of partition p to block n+start(p) of the disk. diff --git a/block/genhd.c b/block/genhd.c index 517e433..ea37e48 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -504,6 +504,21 @@ static int exact_lock(dev_t devt, void *data) return 0; } +static void disk_idle(unsigned long data) +{ + struct gendisk *gd = (struct gendisk *)data; + + gd->idle = 1; + schedule_work(&gd->idle_notify); +} + +static void disk_idle_notify_thread(struct work_struct *work) +{ + struct gendisk *gd = container_of(work, struct gendisk, idle_notify); + + sysfs_notify(&disk_to_dev(gd)->kobj, NULL, "stat"); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -543,6 +558,10 @@ void add_disk(struct gendisk *disk) blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); + + init_timer(&disk->hysteresis_timer); + setup_timer(&disk->hysteresis_timer, disk_idle, (unsigned long)disk); + register_disk(disk); blk_register_queue(disk); @@ -861,6 +880,32 @@ static ssize_t disk_alignment_offset_show(struct device *dev, return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); } +static ssize_t disk_idle_hysteresis_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->hysteresis_time); +} + +static ssize_t disk_idle_hysteresis_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + unsigned long timeout; + int res; + + res = strict_strtoul(buf, 10, &timeout); + if (res) + return -EINVAL; + + disk->hysteresis_time = timeout; + + return count; +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -870,6 +915,8 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(idle_hysteresis, 0644, disk_idle_hysteresis_show, + disk_idle_hysteresis_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -890,6 +937,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_idle_hysteresis.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1183,6 +1231,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); INIT_WORK(&disk->async_notify, media_change_notify_thread); + INIT_WORK(&disk->idle_notify, + disk_idle_notify_thread); } return disk; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e1..cccfb7d 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -230,6 +230,7 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + struct gendisk *gd = dev_to_disk(dev); int cpu; cpu = part_stat_lock(); @@ -238,7 +239,7 @@ ssize_t part_stat_show(struct device *dev, return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " - "%8u %8u %8u" + "%8u %8u %8u %1u" "\n", part_stat_read(p, ios[READ]), part_stat_read(p, merges[READ]), @@ -250,7 +251,8 @@ ssize_t part_stat_show(struct device *dev, jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue))); + jiffies_to_msecs(part_stat_read(p, time_in_queue)), + gd->idle); } ssize_t part_inflight_show(struct device *dev, @@ -652,6 +654,9 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + del_timer_sync(&disk->hysteresis_timer); + cancel_work_sync(&disk->idle_notify); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 297df45..7e969a5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kdev_t.h> #include <linux/rcupdate.h> +#include <linux/timer.h> #ifdef CONFIG_BLOCK @@ -163,10 +164,15 @@ struct gendisk { atomic_t sync_io; /* RAID */ struct work_struct async_notify; + struct work_struct idle_notify; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif int node_id; + + bool idle; + int hysteresis_time; + struct timer_list hysteresis_timer; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-17 18:57 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-17 18:57 UTC (permalink / raw) To: David Zeuthen; +Cc: Kay Sievers, linux-kernel, axboe, linux-hotplug Ok. How about something like this? It adds an extra field to the stat file and introduces David's suggestion of making it pollable. commit ba6d4c7ab7940ae8dc11a884281d0a36b20455b9 Author: Matthew Garrett <mjg@redhat.com> Date: Mon Nov 16 17:44:03 2009 -0500 [RFC] Add support for uevents on block device idle changes Userspace may wish to know whether a given disk is active or idle, for example to modify power management policy based on access patterns. This patch adds a deferrable timer to the block layer which will fire if the disk is idle for a user-definable period of time, generating a uevent. A uevent will also be generated if an access is received while the disk is classified as idle. diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 5f3beda..8747f42 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -3,7 +3,7 @@ Date: February 2008 Contact: Jerome Marchand <jmarchan@redhat.com> Description: The /sys/block/<disk>/stat files displays the I/O - statistics of disk <disk>. They contain 11 fields: + statistics of disk <disk>. They contain 12 fields: 1 - reads completed succesfully 2 - reads merged 3 - sectors read @@ -15,6 +15,7 @@ Description: 9 - I/Os currently in progress 10 - time spent doing I/Os (ms) 11 - weighted time spent doing I/Os (ms) + 12 - 1 if the disk is idle (determined by idle_hysteresis) For more details refer Documentation/iostats.txt @@ -128,3 +129,12 @@ Description: preferred request size for workloads where sustained throughput is desired. If no optimal I/O size is reported this file contains 0. + +What: /sys/block/<disk>/idle_hysteresis +Date: November 2009 +Contact: Matthew Garrett <mjg@redhat.com> +Description: + Contains the number of milliseconds to wait after an access + before declaring that a disk is idle. Any accesses during + this time will reset the timer. "0" (the default) indicates + that no events will be generated. \ No newline at end of file diff --git a/block/blk-core.c b/block/blk-core.c index 71da511..f278817 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio) if (should_fail_request(bio)) goto end_io; + if (bio->bi_bdev->bd_disk->hysteresis_time && + bio_has_data(bio) && + !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer, + jiffies+msecs_to_jiffies + (bio->bi_bdev->bd_disk->hysteresis_time))) { + bio->bi_bdev->bd_disk->idle = 0; + schedule_work(&bio->bi_bdev->bd_disk->idle_notify); + } + /* * If this device has partitions, remap block n * of partition p to block n+start(p) of the disk. diff --git a/block/genhd.c b/block/genhd.c index 517e433..ea37e48 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -504,6 +504,21 @@ static int exact_lock(dev_t devt, void *data) return 0; } +static void disk_idle(unsigned long data) +{ + struct gendisk *gd = (struct gendisk *)data; + + gd->idle = 1; + schedule_work(&gd->idle_notify); +} + +static void disk_idle_notify_thread(struct work_struct *work) +{ + struct gendisk *gd = container_of(work, struct gendisk, idle_notify); + + sysfs_notify(&disk_to_dev(gd)->kobj, NULL, "stat"); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -543,6 +558,10 @@ void add_disk(struct gendisk *disk) blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); + + init_timer(&disk->hysteresis_timer); + setup_timer(&disk->hysteresis_timer, disk_idle, (unsigned long)disk); + register_disk(disk); blk_register_queue(disk); @@ -861,6 +880,32 @@ static ssize_t disk_alignment_offset_show(struct device *dev, return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); } +static ssize_t disk_idle_hysteresis_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->hysteresis_time); +} + +static ssize_t disk_idle_hysteresis_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + unsigned long timeout; + int res; + + res = strict_strtoul(buf, 10, &timeout); + if (res) + return -EINVAL; + + disk->hysteresis_time = timeout; + + return count; +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -870,6 +915,8 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(idle_hysteresis, 0644, disk_idle_hysteresis_show, + disk_idle_hysteresis_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -890,6 +937,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_idle_hysteresis.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1183,6 +1231,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); INIT_WORK(&disk->async_notify, media_change_notify_thread); + INIT_WORK(&disk->idle_notify, + disk_idle_notify_thread); } return disk; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e1..cccfb7d 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -230,6 +230,7 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + struct gendisk *gd = dev_to_disk(dev); int cpu; cpu = part_stat_lock(); @@ -238,7 +239,7 @@ ssize_t part_stat_show(struct device *dev, return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " - "%8u %8u %8u" + "%8u %8u %8u %1u" "\n", part_stat_read(p, ios[READ]), part_stat_read(p, merges[READ]), @@ -250,7 +251,8 @@ ssize_t part_stat_show(struct device *dev, jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue))); + jiffies_to_msecs(part_stat_read(p, time_in_queue)), + gd->idle); } ssize_t part_inflight_show(struct device *dev, @@ -652,6 +654,9 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + del_timer_sync(&disk->hysteresis_timer); + cancel_work_sync(&disk->idle_notify); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 297df45..7e969a5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kdev_t.h> #include <linux/rcupdate.h> +#include <linux/timer.h> #ifdef CONFIG_BLOCK @@ -163,10 +164,15 @@ struct gendisk { atomic_t sync_io; /* RAID */ struct work_struct async_notify; + struct work_struct idle_notify; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif int node_id; + + bool idle; + int hysteresis_time; + struct timer_list hysteresis_timer; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-17 18:57 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-18 19:30 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 19:30 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Tue, Nov 17, 2009 at 19:57, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > Ok. How about something like this? It adds an extra field to the stat > file and introduces David's suggestion of making it pollable. Hmm, while thinking more about it, I'm still not sure if we should add an interface which can work for a single user only, and add a timer to the kernel which is only used by a userspace policy thing. Wouldn't it be good enough, if we add a file "idle_since" which contains the time of the actual disk idle time, and userspace can schedule a re-examination of that value at the actual end of the idle time it is looking for? Thanks, Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 19:30 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 19:30 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Tue, Nov 17, 2009 at 19:57, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > Ok. How about something like this? It adds an extra field to the stat > file and introduces David's suggestion of making it pollable. Hmm, while thinking more about it, I'm still not sure if we should add an interface which can work for a single user only, and add a timer to the kernel which is only used by a userspace policy thing. Wouldn't it be good enough, if we add a file "idle_since" which contains the time of the actual disk idle time, and userspace can schedule a re-examination of that value at the actual end of the idle time it is looking for? Thanks, Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 19:30 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-18 19:40 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 19:40 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote: > Wouldn't it be good enough, if we add a file "idle_since" which > contains the time of the actual disk idle time, and userspace can > schedule a re-examination of that value at the actual end of the idle > time it is looking for? That would require either polling or waking up a userspace application on every disk access. Doing it in-kernel involves only a single timer wakeup for every active/idle transition. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 19:40 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 19:40 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote: > Wouldn't it be good enough, if we add a file "idle_since" which > contains the time of the actual disk idle time, and userspace can > schedule a re-examination of that value at the actual end of the idle > time it is looking for? That would require either polling or waking up a userspace application on every disk access. Doing it in-kernel involves only a single timer wakeup for every active/idle transition. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 19:40 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-18 19:47 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 19:47 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <mjg@redhat.com> wrote: > On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote: > >> Wouldn't it be good enough, if we add a file "idle_since" which >> contains the time of the actual disk idle time, and userspace can >> schedule a re-examination of that value at the actual end of the idle >> time it is looking for? > > That would require either polling or waking up a userspace application > on every disk access. Doing it in-kernel involves only a single timer > wakeup for every active/idle transition. How would it? If you look for, like a 60 seconds timeout, and the file contains 20, you schedule a wakeup in 40 seconds. If the file after the 40 seconds contains 60, you reached your idle timeout exactly at that moment, if it's less, then you re-calculate and start from the beginning. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 19:47 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 19:47 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <mjg@redhat.com> wrote: > On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote: > >> Wouldn't it be good enough, if we add a file "idle_since" which >> contains the time of the actual disk idle time, and userspace can >> schedule a re-examination of that value at the actual end of the idle >> time it is looking for? > > That would require either polling or waking up a userspace application > on every disk access. Doing it in-kernel involves only a single timer > wakeup for every active/idle transition. How would it? If you look for, like a 60 seconds timeout, and the file contains 20, you schedule a wakeup in 40 seconds. If the file after the 40 seconds contains 60, you reached your idle timeout exactly at that moment, if it's less, then you re-calculate and start from the beginning. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 19:47 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-18 19:53 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 19:53 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 08:47:37PM +0100, Kay Sievers wrote: > On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <mjg@redhat.com> wrote: > > On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote: > > > >> Wouldn't it be good enough, if we add a file "idle_since" which > >> contains the time of the actual disk idle time, and userspace can > >> schedule a re-examination of that value at the actual end of the idle > >> time it is looking for? > > > > That would require either polling or waking up a userspace application > > on every disk access. Doing it in-kernel involves only a single timer > > wakeup for every active/idle transition. > > How would it? If you look for, like a 60 seconds timeout, and the file > contains 20, you schedule a wakeup in 40 seconds. If the file after > the 40 seconds contains 60, you reached your idle timeout exactly at > that moment, if it's less, then you re-calculate and start from the > beginning. How is that not polling? You'll repeatedly read a file looking for a value that may never appear - imagine the case where you're waiting for 60 seconds of idleness, but the disk always becomes active again after 50. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 19:53 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 19:53 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 08:47:37PM +0100, Kay Sievers wrote: > On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <mjg@redhat.com> wrote: > > On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote: > > > >> Wouldn't it be good enough, if we add a file "idle_since" which > >> contains the time of the actual disk idle time, and userspace can > >> schedule a re-examination of that value at the actual end of the idle > >> time it is looking for? > > > > That would require either polling or waking up a userspace application > > on every disk access. Doing it in-kernel involves only a single timer > > wakeup for every active/idle transition. > > How would it? If you look for, like a 60 seconds timeout, and the file > contains 20, you schedule a wakeup in 40 seconds. If the file after > the 40 seconds contains 60, you reached your idle timeout exactly at > that moment, if it's less, then you re-calculate and start from the > beginning. How is that not polling? You'll repeatedly read a file looking for a value that may never appear - imagine the case where you're waiting for 60 seconds of idleness, but the disk always becomes active again after 50. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 19:53 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-18 20:03 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 20:03 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 20:53, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 08:47:37PM +0100, Kay Sievers wrote: >> On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <mjg@redhat.com> wrote: >> > On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote: >> > >> >> Wouldn't it be good enough, if we add a file "idle_since" which >> >> contains the time of the actual disk idle time, and userspace can >> >> schedule a re-examination of that value at the actual end of the idle >> >> time it is looking for? >> > >> > That would require either polling or waking up a userspace application >> > on every disk access. Doing it in-kernel involves only a single timer >> > wakeup for every active/idle transition. >> >> How would it? If you look for, like a 60 seconds timeout, and the file >> contains 20, you schedule a wakeup in 40 seconds. If the file after >> the 40 seconds contains 60, you reached your idle timeout exactly at >> that moment, if it's less, then you re-calculate and start from the >> beginning. > > How is that not polling? You'll repeatedly read a file looking for a > value that may never appear - imagine the case where you're waiting for > 60 seconds of idleness, but the disk always becomes active again after > 50. Sure, but what's wrong with reading that file every 50 seconds? Almost all boxes poll for media changes of optical drives and usb card readers anyway, so it's not that we are not doing stuff like this already. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 20:03 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 20:03 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 20:53, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 08:47:37PM +0100, Kay Sievers wrote: >> On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <mjg@redhat.com> wrote: >> > On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote: >> > >> >> Wouldn't it be good enough, if we add a file "idle_since" which >> >> contains the time of the actual disk idle time, and userspace can >> >> schedule a re-examination of that value at the actual end of the idle >> >> time it is looking for? >> > >> > That would require either polling or waking up a userspace application >> > on every disk access. Doing it in-kernel involves only a single timer >> > wakeup for every active/idle transition. >> >> How would it? If you look for, like a 60 seconds timeout, and the file >> contains 20, you schedule a wakeup in 40 seconds. If the file after >> the 40 seconds contains 60, you reached your idle timeout exactly at >> that moment, if it's less, then you re-calculate and start from the >> beginning. > > How is that not polling? You'll repeatedly read a file looking for a > value that may never appear - imagine the case where you're waiting for > 60 seconds of idleness, but the disk always becomes active again after > 50. Sure, but what's wrong with reading that file every 50 seconds? Almost all boxes poll for media changes of optical drives and usb card readers anyway, so it's not that we are not doing stuff like this already. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 20:03 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-18 20:07 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 20:07 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > Sure, but what's wrong with reading that file every 50 seconds? Almost > all boxes poll for media changes of optical drives and usb card > readers anyway, so it's not that we are not doing stuff like this > already. We poll for media because there's no event-based way of avoiding it - in this case there is. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 20:07 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 20:07 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > Sure, but what's wrong with reading that file every 50 seconds? Almost > all boxes poll for media changes of optical drives and usb card > readers anyway, so it's not that we are not doing stuff like this > already. We poll for media because there's no event-based way of avoiding it - in this case there is. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 20:07 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-18 21:06 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 21:06 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 21:07, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > >> Sure, but what's wrong with reading that file every 50 seconds? Almost >> all boxes poll for media changes of optical drives and usb card >> readers anyway, so it's not that we are not doing stuff like this >> already. > > We poll for media because there's no event-based way of avoiding it - in > this case there is. That's true, but I think there is a significant difference between polling every one or two seconds for media changes, and usually one or two minutes for a disk idle. It's not that we poll in a rather hight frequency, in an arbitrary interval, and check if some condition is met. I still don't think that we should add new event interfaces which are single-subscriber only, and use global values for a specific user. What if there will be another independent user for this, which might want a different timeout? They fight over the trigger value to set in sysfs? >From my perspective, the once-at-timeout wakeup is more acceptable than an in-kernel policy setting for a single-subscriber event interface. Thanks, Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 21:06 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 21:06 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 21:07, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > >> Sure, but what's wrong with reading that file every 50 seconds? Almost >> all boxes poll for media changes of optical drives and usb card >> readers anyway, so it's not that we are not doing stuff like this >> already. > > We poll for media because there's no event-based way of avoiding it - in > this case there is. That's true, but I think there is a significant difference between polling every one or two seconds for media changes, and usually one or two minutes for a disk idle. It's not that we poll in a rather hight frequency, in an arbitrary interval, and check if some condition is met. I still don't think that we should add new event interfaces which are single-subscriber only, and use global values for a specific user. What if there will be another independent user for this, which might want a different timeout? They fight over the trigger value to set in sysfs? From my perspective, the once-at-timeout wakeup is more acceptable than an in-kernel policy setting for a single-subscriber event interface. Thanks, Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 21:06 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-18 21:29 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 21:29 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 22:06, Kay Sievers <kay.sievers@vrfy.org> wrote: > On Wed, Nov 18, 2009 at 21:07, Matthew Garrett <mjg59@srcf.ucam.org> wrote: >> On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: >> >>> Sure, but what's wrong with reading that file every 50 seconds? Almost >>> all boxes poll for media changes of optical drives and usb card >>> readers anyway, so it's not that we are not doing stuff like this >>> already. >> >> We poll for media because there's no event-based way of avoiding it - in >> this case there is. > > That's true, but I think there is a significant difference between > polling every one or two seconds for media changes, and usually one or > two minutes for a disk idle. It's not that we poll in a rather hight > frequency, in an arbitrary interval, and check if some condition is > met. > > I still don't think that we should add new event interfaces which are > single-subscriber only, and use global values for a specific user. > What if there will be another independent user for this, which might > want a different timeout? They fight over the trigger value to set in > sysfs? > > From my perspective, the once-at-timeout wakeup is more acceptable > than an in-kernel policy setting for a single-subscriber event > interface. I guess, the "idle_since" file could be made poll()able, and throw an event when the idle time is re-set to 0, so the value checking needs only to happen as long we wait for the disk to become idle. As long as it's busy anyway, the rare wakeups should not matter much. :) Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 21:29 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 21:29 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 22:06, Kay Sievers <kay.sievers@vrfy.org> wrote: > On Wed, Nov 18, 2009 at 21:07, Matthew Garrett <mjg59@srcf.ucam.org> wrote: >> On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: >> >>> Sure, but what's wrong with reading that file every 50 seconds? Almost >>> all boxes poll for media changes of optical drives and usb card >>> readers anyway, so it's not that we are not doing stuff like this >>> already. >> >> We poll for media because there's no event-based way of avoiding it - in >> this case there is. > > That's true, but I think there is a significant difference between > polling every one or two seconds for media changes, and usually one or > two minutes for a disk idle. It's not that we poll in a rather hight > frequency, in an arbitrary interval, and check if some condition is > met. > > I still don't think that we should add new event interfaces which are > single-subscriber only, and use global values for a specific user. > What if there will be another independent user for this, which might > want a different timeout? They fight over the trigger value to set in > sysfs? > > From my perspective, the once-at-timeout wakeup is more acceptable > than an in-kernel policy setting for a single-subscriber event > interface. I guess, the "idle_since" file could be made poll()able, and throw an event when the idle time is re-set to 0, so the value checking needs only to happen as long we wait for the disk to become idle. As long as it's busy anyway, the rare wakeups should not matter much. :) Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 21:29 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-18 21:35 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 21:35 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote: > I guess, the "idle_since" file could be made poll()able, and throw an > event when the idle time is re-set to 0, so the value checking needs > only to happen as long we wait for the disk to become idle. As long as > it's busy anyway, the rare wakeups should not matter much. :) That'd be a userspace wakeup every time something gets submitted to the block device, which sounds far from ideal... -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 21:35 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 21:35 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote: > I guess, the "idle_since" file could be made poll()able, and throw an > event when the idle time is re-set to 0, so the value checking needs > only to happen as long we wait for the disk to become idle. As long as > it's busy anyway, the rare wakeups should not matter much. :) That'd be a userspace wakeup every time something gets submitted to the block device, which sounds far from ideal... -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 21:35 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-18 21:39 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 21:39 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 22:35, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote: > >> I guess, the "idle_since" file could be made poll()able, and throw an >> event when the idle time is re-set to 0, so the value checking needs >> only to happen as long we wait for the disk to become idle. As long as >> it's busy anyway, the rare wakeups should not matter much. :) > > That'd be a userspace wakeup every time something gets submitted to the > block device, which sounds far from ideal... No, you would only poll() when you reached the timeout and the disk entered the idle state. This can not happen more frequently than the timeout itself. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 21:39 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 21:39 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 22:35, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote: > >> I guess, the "idle_since" file could be made poll()able, and throw an >> event when the idle time is re-set to 0, so the value checking needs >> only to happen as long we wait for the disk to become idle. As long as >> it's busy anyway, the rare wakeups should not matter much. :) > > That'd be a userspace wakeup every time something gets submitted to the > block device, which sounds far from ideal... No, you would only poll() when you reached the timeout and the disk entered the idle state. This can not happen more frequently than the timeout itself. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 21:39 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-18 21:45 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 21:45 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 10:39:26PM +0100, Kay Sievers wrote: > On Wed, Nov 18, 2009 at 22:35, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > > On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote: > > > >> I guess, the "idle_since" file could be made poll()able, and throw an > >> event when the idle time is re-set to 0, so the value checking needs > >> only to happen as long we wait for the disk to become idle. As long as > >> it's busy anyway, the rare wakeups should not matter much. :) > > > > That'd be a userspace wakeup every time something gets submitted to the > > block device, which sounds far from ideal... > > No, you would only poll() when you reached the timeout and the disk > entered the idle state. This can not happen more frequently than the > timeout itself. I don't understand. idle_since would be reset on every access to the block device. The alternative is to generate an event when the disk goes idle, but that goes back to requiring a timer in the kernel... -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 21:45 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 21:45 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 10:39:26PM +0100, Kay Sievers wrote: > On Wed, Nov 18, 2009 at 22:35, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > > On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote: > > > >> I guess, the "idle_since" file could be made poll()able, and throw an > >> event when the idle time is re-set to 0, so the value checking needs > >> only to happen as long we wait for the disk to become idle. As long as > >> it's busy anyway, the rare wakeups should not matter much. :) > > > > That'd be a userspace wakeup every time something gets submitted to the > > block device, which sounds far from ideal... > > No, you would only poll() when you reached the timeout and the disk > entered the idle state. This can not happen more frequently than the > timeout itself. I don't understand. idle_since would be reset on every access to the block device. The alternative is to generate an event when the disk goes idle, but that goes back to requiring a timer in the kernel... -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 21:06 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-18 21:33 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 21:33 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote: > That's true, but I think there is a significant difference between > polling every one or two seconds for media changes, and usually one or > two minutes for a disk idle. It's not that we poll in a rather hight > frequency, in an arbitrary interval, and check if some condition is > met. My use cases are on the order of a second. > I still don't think that we should add new event interfaces which are > single-subscriber only, and use global values for a specific user. > What if there will be another independent user for this, which might > want a different timeout? They fight over the trigger value to set in > sysfs? You can trivially multiplex without any additional wakeups. Something like devkit-disks can simply trigger on the lowest requested time and then schedule wakeups for subscribers who want a different timeout. > From my perspective, the once-at-timeout wakeup is more acceptable > than an in-kernel policy setting for a single-subscriber event > interface. I'd be open to it being something for multiple subscribers, though that would add to the complexity in the block code and I'm not sure that's needed. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 21:33 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-18 21:33 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote: > That's true, but I think there is a significant difference between > polling every one or two seconds for media changes, and usually one or > two minutes for a disk idle. It's not that we poll in a rather hight > frequency, in an arbitrary interval, and check if some condition is > met. My use cases are on the order of a second. > I still don't think that we should add new event interfaces which are > single-subscriber only, and use global values for a specific user. > What if there will be another independent user for this, which might > want a different timeout? They fight over the trigger value to set in > sysfs? You can trivially multiplex without any additional wakeups. Something like devkit-disks can simply trigger on the lowest requested time and then schedule wakeups for subscribers who want a different timeout. > From my perspective, the once-at-timeout wakeup is more acceptable > than an in-kernel policy setting for a single-subscriber event > interface. I'd be open to it being something for multiple subscribers, though that would add to the complexity in the block code and I'm not sure that's needed. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 21:33 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-18 21:40 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 21:40 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote: > >> That's true, but I think there is a significant difference between >> polling every one or two seconds for media changes, and usually one or >> two minutes for a disk idle. It's not that we poll in a rather hight >> frequency, in an arbitrary interval, and check if some condition is >> met. > > My use cases are on the order of a second. > >> I still don't think that we should add new event interfaces which are >> single-subscriber only, and use global values for a specific user. >> What if there will be another independent user for this, which might >> want a different timeout? They fight over the trigger value to set in >> sysfs? > > You can trivially multiplex without any additional wakeups. Something > like devkit-disks can simply trigger on the lowest requested time and > then schedule wakeups for subscribers who want a different timeout. No, it can't do this race-free. And it's far from trivial. It can not know when something changes the single global value. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-18 21:40 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-18 21:40 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote: > >> That's true, but I think there is a significant difference between >> polling every one or two seconds for media changes, and usually one or >> two minutes for a disk idle. It's not that we poll in a rather hight >> frequency, in an arbitrary interval, and check if some condition is >> met. > > My use cases are on the order of a second. > >> I still don't think that we should add new event interfaces which are >> single-subscriber only, and use global values for a specific user. >> What if there will be another independent user for this, which might >> want a different timeout? They fight over the trigger value to set in >> sysfs? > > You can trivially multiplex without any additional wakeups. Something > like devkit-disks can simply trigger on the lowest requested time and > then schedule wakeups for subscribers who want a different timeout. No, it can't do this race-free. And it's far from trivial. It can not know when something changes the single global value. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 21:33 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-19 11:09 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 11:09 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote: > >> That's true, but I think there is a significant difference between >> polling every one or two seconds for media changes, and usually one or >> two minutes for a disk idle. It's not that we poll in a rather hight >> frequency, in an arbitrary interval, and check if some condition is >> met. > > My use cases are on the order of a second. Ok, what's the specific use case, which should be triggered after a second? I thought you were thinking about disk spindown or similar. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 11:09 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 11:09 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote: > >> That's true, but I think there is a significant difference between >> polling every one or two seconds for media changes, and usually one or >> two minutes for a disk idle. It's not that we poll in a rather hight >> frequency, in an arbitrary interval, and check if some condition is >> met. > > My use cases are on the order of a second. Ok, what's the specific use case, which should be triggered after a second? I thought you were thinking about disk spindown or similar. Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 11:09 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-19 13:01 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-19 13:01 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote: > On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > > My use cases are on the order of a second. > > Ok, what's the specific use case, which should be triggered after a > second? I thought you were thinking about disk spindown or similar. The first is altering ALPM policy. ALPM will be initiated by the host if the number of queued requests hits zero - if there's no hysteresis implemented, then that can result in a significant performance hit. We don't need /much/ hysteresis, but it's the difference between a 50% performance hit and not having that. The other I'm currently looking at is monitoring disk usage in order to be able to apply an adaptive policy to disk spindown. This doesn't need the information to be as accurate, but it would still be better at closer to the second granularity than the minute level. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 13:01 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-19 13:01 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote: > On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > > My use cases are on the order of a second. > > Ok, what's the specific use case, which should be triggered after a > second? I thought you were thinking about disk spindown or similar. The first is altering ALPM policy. ALPM will be initiated by the host if the number of queued requests hits zero - if there's no hysteresis implemented, then that can result in a significant performance hit. We don't need /much/ hysteresis, but it's the difference between a 50% performance hit and not having that. The other I'm currently looking at is monitoring disk usage in order to be able to apply an adaptive policy to disk spindown. This doesn't need the information to be as accurate, but it would still be better at closer to the second granularity than the minute level. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 13:01 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-19 13:29 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 13:29 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <mjg@redhat.com> wrote: > On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote: >> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: >> > My use cases are on the order of a second. >> >> Ok, what's the specific use case, which should be triggered after a >> second? I thought you were thinking about disk spindown or similar. > > The first is altering ALPM policy. ALPM will be initiated by the host if > the number of queued requests hits zero - if there's no hysteresis > implemented, then that can result in a significant performance hit. We > don't need /much/ hysteresis, but it's the difference between a 50% > performance hit and not having that. Can't that logic live entirely in the kernel, instead of being a rather generic userspace event interface (with the current limitation to a single user)? Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 13:29 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 13:29 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <mjg@redhat.com> wrote: > On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote: >> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: >> > My use cases are on the order of a second. >> >> Ok, what's the specific use case, which should be triggered after a >> second? I thought you were thinking about disk spindown or similar. > > The first is altering ALPM policy. ALPM will be initiated by the host if > the number of queued requests hits zero - if there's no hysteresis > implemented, then that can result in a significant performance hit. We > don't need /much/ hysteresis, but it's the difference between a 50% > performance hit and not having that. Can't that logic live entirely in the kernel, instead of being a rather generic userspace event interface (with the current limitation to a single user)? Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 13:29 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-19 14:16 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-19 14:16 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 02:29:29PM +0100, Kay Sievers wrote: > On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <mjg@redhat.com> wrote: > > On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote: > >> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > >> > My use cases are on the order of a second. > >> > >> Ok, what's the specific use case, which should be triggered after a > >> second? I thought you were thinking about disk spindown or similar. > > > > The first is altering ALPM policy. ALPM will be initiated by the host if > > the number of queued requests hits zero - if there's no hysteresis > > implemented, then that can result in a significant performance hit. We > > don't need /much/ hysteresis, but it's the difference between a 50% > > performance hit and not having that. > > Can't that logic live entirely in the kernel, instead of being a > rather generic userspace event interface (with the current limitation > to a single user)? It could, but it seems a bit of a hack. It'd still also require the timer to be in the kernel, so we might as well expose that to userspace. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 14:16 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-19 14:16 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 02:29:29PM +0100, Kay Sievers wrote: > On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <mjg@redhat.com> wrote: > > On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote: > >> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > >> > My use cases are on the order of a second. > >> > >> Ok, what's the specific use case, which should be triggered after a > >> second? I thought you were thinking about disk spindown or similar. > > > > The first is altering ALPM policy. ALPM will be initiated by the host if > > the number of queued requests hits zero - if there's no hysteresis > > implemented, then that can result in a significant performance hit. We > > don't need /much/ hysteresis, but it's the difference between a 50% > > performance hit and not having that. > > Can't that logic live entirely in the kernel, instead of being a > rather generic userspace event interface (with the current limitation > to a single user)? It could, but it seems a bit of a hack. It'd still also require the timer to be in the kernel, so we might as well expose that to userspace. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 14:16 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-19 14:25 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 14:25 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Thu, Nov 19, 2009 at 02:29:29PM +0100, Kay Sievers wrote: >> On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <mjg@redhat.com> wrote: >> > On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote: >> >> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: >> >> > My use cases are on the order of a second. >> >> >> >> Ok, what's the specific use case, which should be triggered after a >> >> second? I thought you were thinking about disk spindown or similar. >> > >> > The first is altering ALPM policy. ALPM will be initiated by the host if >> > the number of queued requests hits zero - if there's no hysteresis >> > implemented, then that can result in a significant performance hit. We >> > don't need /much/ hysteresis, but it's the difference between a 50% >> > performance hit and not having that. >> >> Can't that logic live entirely in the kernel, instead of being a >> rather generic userspace event interface (with the current limitation >> to a single user)? > > It could, but it seems a bit of a hack. It'd still also require the > timer to be in the kernel, so we might as well expose that to userspace. Sure, but a userspace configurable policy for an in-kernel disk-idle powermanagent sounds fine, compared to a single-subscriber userspace-only disk-idle event interface. :) Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 14:25 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 14:25 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Thu, Nov 19, 2009 at 02:29:29PM +0100, Kay Sievers wrote: >> On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <mjg@redhat.com> wrote: >> > On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote: >> >> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <mjg59@srcf.ucam.org> wrote: >> >> > My use cases are on the order of a second. >> >> >> >> Ok, what's the specific use case, which should be triggered after a >> >> second? I thought you were thinking about disk spindown or similar. >> > >> > The first is altering ALPM policy. ALPM will be initiated by the host if >> > the number of queued requests hits zero - if there's no hysteresis >> > implemented, then that can result in a significant performance hit. We >> > don't need /much/ hysteresis, but it's the difference between a 50% >> > performance hit and not having that. >> >> Can't that logic live entirely in the kernel, instead of being a >> rather generic userspace event interface (with the current limitation >> to a single user)? > > It could, but it seems a bit of a hack. It'd still also require the > timer to be in the kernel, so we might as well expose that to userspace. Sure, but a userspace configurable policy for an in-kernel disk-idle powermanagent sounds fine, compared to a single-subscriber userspace-only disk-idle event interface. :) Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 14:25 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-19 14:30 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-19 14:30 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 03:25:59PM +0100, Kay Sievers wrote: > On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > > It could, but it seems a bit of a hack. It'd still also require the > > timer to be in the kernel, so we might as well expose that to userspace. > > Sure, but a userspace configurable policy for an in-kernel disk-idle > powermanagent sounds fine, compared to a single-subscriber > userspace-only disk-idle event interface. :) Well, we still need to expose this for the access pattern modifying. I really don't see the issue with the single subscriber being devkit-disks - none of the operations involved are atomic, so we're inherently racy here. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 14:30 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-19 14:30 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 03:25:59PM +0100, Kay Sievers wrote: > On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > > It could, but it seems a bit of a hack. It'd still also require the > > timer to be in the kernel, so we might as well expose that to userspace. > > Sure, but a userspace configurable policy for an in-kernel disk-idle > powermanagent sounds fine, compared to a single-subscriber > userspace-only disk-idle event interface. :) Well, we still need to expose this for the access pattern modifying. I really don't see the issue with the single subscriber being devkit-disks - none of the operations involved are atomic, so we're inherently racy here. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 14:30 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-19 14:34 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 14:34 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 15:30, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Thu, Nov 19, 2009 at 03:25:59PM +0100, Kay Sievers wrote: >> On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <mjg59@srcf.ucam.org> wrote: >> > It could, but it seems a bit of a hack. It'd still also require the >> > timer to be in the kernel, so we might as well expose that to userspace. >> >> Sure, but a userspace configurable policy for an in-kernel disk-idle >> powermanagent sounds fine, compared to a single-subscriber >> userspace-only disk-idle event interface. :) > > Well, we still need to expose this for the access pattern modifying. I > really don't see the issue with the single subscriber being devkit-disks > - none of the operations involved are atomic, so we're inherently racy > here. Single-subscriber event interfaces are usually a no-go for generic infrastructure like this. We still have the unmodified HAL running until it is dead, and this works only because there are no such awkward interfaces. In a few years we will probably have diskfoo replacing dk-disks, and then ... :) Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 14:34 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 14:34 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 15:30, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Thu, Nov 19, 2009 at 03:25:59PM +0100, Kay Sievers wrote: >> On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <mjg59@srcf.ucam.org> wrote: >> > It could, but it seems a bit of a hack. It'd still also require the >> > timer to be in the kernel, so we might as well expose that to userspace. >> >> Sure, but a userspace configurable policy for an in-kernel disk-idle >> powermanagent sounds fine, compared to a single-subscriber >> userspace-only disk-idle event interface. :) > > Well, we still need to expose this for the access pattern modifying. I > really don't see the issue with the single subscriber being devkit-disks > - none of the operations involved are atomic, so we're inherently racy > here. Single-subscriber event interfaces are usually a no-go for generic infrastructure like this. We still have the unmodified HAL running until it is dead, and this works only because there are no such awkward interfaces. In a few years we will probably have diskfoo replacing dk-disks, and then ... :) Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 14:34 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-19 14:48 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-19 14:48 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 03:34:53PM +0100, Kay Sievers wrote: > Single-subscriber event interfaces are usually a no-go for generic > infrastructure like this. We still have the unmodified HAL running > until it is dead, and this works only because there are no such > awkward interfaces. In a few years we will probably have diskfoo > replacing dk-disks, and then ... :) If you've got any ideas for what a multi-subscriber interface would look like, I'm happy look at it. I don't think there's an especially compelling use-case for one right now so I'm not enthusiastic about the additional complexity that'd be required, but as long as there's basic agreement that it's not practical to do this in userspace then we're at least on the same page. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 14:48 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-19 14:48 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 03:34:53PM +0100, Kay Sievers wrote: > Single-subscriber event interfaces are usually a no-go for generic > infrastructure like this. We still have the unmodified HAL running > until it is dead, and this works only because there are no such > awkward interfaces. In a few years we will probably have diskfoo > replacing dk-disks, and then ... :) If you've got any ideas for what a multi-subscriber interface would look like, I'm happy look at it. I don't think there's an especially compelling use-case for one right now so I'm not enthusiastic about the additional complexity that'd be required, but as long as there's basic agreement that it's not practical to do this in userspace then we're at least on the same page. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 14:48 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-19 15:00 ` Kay Sievers -1 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 15:00 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 15:48, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Thu, Nov 19, 2009 at 03:34:53PM +0100, Kay Sievers wrote: > >> Single-subscriber event interfaces are usually a no-go for generic >> infrastructure like this. We still have the unmodified HAL running >> until it is dead, and this works only because there are no such >> awkward interfaces. In a few years we will probably have diskfoo >> replacing dk-disks, and then ... :) > > If you've got any ideas for what a multi-subscriber interface would look > like, I'm happy look at it. Yeah, it would not be as simple as your patch. It probably involves a way to get a file descriptor per listener, to let the kernel know if anybody is interested, and to auto-cleanup when the listener dies, and to have per instance timers. > I don't think there's an especially > compelling use-case for one right now so I'm not enthusiastic about the > additional complexity that'd be required, Right, but we've been there, and it's a pain, if you can not subscribe to an interface because something else is already using it/expecting it is the only user ever. So there needs to be a good reason for adding something like this as a new interface, which will very likely hit us back some day. > but as long as there's basic > agreement that it's not practical to do this in userspace then we're at > least on the same page. I'm all for executing the policy inside the kernel and let userspace only enable/configure it. It think there is not much to disagr4ee about such an approach. Thanks, Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-19 15:00 ` Kay Sievers 0 siblings, 0 replies; 73+ messages in thread From: Kay Sievers @ 2009-11-19 15:00 UTC (permalink / raw) To: Matthew Garrett; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 15:48, Matthew Garrett <mjg59@srcf.ucam.org> wrote: > On Thu, Nov 19, 2009 at 03:34:53PM +0100, Kay Sievers wrote: > >> Single-subscriber event interfaces are usually a no-go for generic >> infrastructure like this. We still have the unmodified HAL running >> until it is dead, and this works only because there are no such >> awkward interfaces. In a few years we will probably have diskfoo >> replacing dk-disks, and then ... :) > > If you've got any ideas for what a multi-subscriber interface would look > like, I'm happy look at it. Yeah, it would not be as simple as your patch. It probably involves a way to get a file descriptor per listener, to let the kernel know if anybody is interested, and to auto-cleanup when the listener dies, and to have per instance timers. > I don't think there's an especially > compelling use-case for one right now so I'm not enthusiastic about the > additional complexity that'd be required, Right, but we've been there, and it's a pain, if you can not subscribe to an interface because something else is already using it/expecting it is the only user ever. So there needs to be a good reason for adding something like this as a new interface, which will very likely hit us back some day. > but as long as there's basic > agreement that it's not practical to do this in userspace then we're at > least on the same page. I'm all for executing the policy inside the kernel and let userspace only enable/configure it. It think there is not much to disagr4ee about such an approach. Thanks, Kay ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-19 15:00 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers @ 2009-11-20 20:29 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-20 20:29 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 04:00:46PM +0100, Kay Sievers wrote: > Yeah, it would not be as simple as your patch. It probably involves a > way to get a file descriptor per listener, to let the kernel know if > anybody is interested, and to auto-cleanup when the listener dies, and > to have per instance timers. This would seem to involve a lot of extra locking in the block submission and completion code. I don't think it's ideal. How about this: * idle_hysteresis contains a value. If it's greater than 0, attempting to increase it will give -EINVAL. It can be polled. * On idle state transition, applications listening to the stat sysfs node will get woken. The stat output will include the number of msecs that the disk has been idle. If this is less than the application requested, it can set a timer to wake it up again in the future and recheck. * When an application exits, if (and only if) it wrote a value to idle_hysteresis, it should set this back to 0. This will notify any other apps, which may then set their own wakeup time. It's not beautiful but it satisfies the constraints. There's a minimum of extra wakeups, it doesn't complicate the block path any further and multiple applications can take advantage of it. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-20 20:29 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-20 20:29 UTC (permalink / raw) To: Kay Sievers; +Cc: David Zeuthen, linux-kernel, axboe, linux-hotplug On Thu, Nov 19, 2009 at 04:00:46PM +0100, Kay Sievers wrote: > Yeah, it would not be as simple as your patch. It probably involves a > way to get a file descriptor per listener, to let the kernel know if > anybody is interested, and to auto-cleanup when the listener dies, and > to have per instance timers. This would seem to involve a lot of extra locking in the block submission and completion code. I don't think it's ideal. How about this: * idle_hysteresis contains a value. If it's greater than 0, attempting to increase it will give -EINVAL. It can be polled. * On idle state transition, applications listening to the stat sysfs node will get woken. The stat output will include the number of msecs that the disk has been idle. If this is less than the application requested, it can set a timer to wake it up again in the future and recheck. * When an application exits, if (and only if) it wrote a value to idle_hysteresis, it should set this back to 0. This will notify any other apps, which may then set their own wakeup time. It's not beautiful but it satisfies the constraints. There's a minimum of extra wakeups, it doesn't complicate the block path any further and multiple applications can take advantage of it. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-18 20:07 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-22 23:37 ` Pavel Machek -1 siblings, 0 replies; 73+ messages in thread From: Pavel Machek @ 2009-11-22 23:37 UTC (permalink / raw) To: Matthew Garrett Cc: Kay Sievers, David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed 2009-11-18 20:07:12, Matthew Garrett wrote: > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > > > Sure, but what's wrong with reading that file every 50 seconds? Almost > > all boxes poll for media changes of optical drives and usb card > > readers anyway, so it's not that we are not doing stuff like this > > already. > > We poll for media because there's no event-based way of avoiding it - in > this case there is. ...when you add overhead to every disk operation. I'd say that polling once in 50 seconds is preferable to that. -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-22 23:37 ` Pavel Machek 0 siblings, 0 replies; 73+ messages in thread From: Pavel Machek @ 2009-11-22 23:37 UTC (permalink / raw) To: Matthew Garrett Cc: Kay Sievers, David Zeuthen, linux-kernel, axboe, linux-hotplug On Wed 2009-11-18 20:07:12, Matthew Garrett wrote: > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > > > Sure, but what's wrong with reading that file every 50 seconds? Almost > > all boxes poll for media changes of optical drives and usb card > > readers anyway, so it's not that we are not doing stuff like this > > already. > > We poll for media because there's no event-based way of avoiding it - in > this case there is. ...when you add overhead to every disk operation. I'd say that polling once in 50 seconds is preferable to that. -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-22 23:37 ` [PATCH] [RFC] Add support for uevents on block device idle Pavel Machek @ 2009-11-23 14:12 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-23 14:12 UTC (permalink / raw) To: Pavel Machek Cc: Kay Sievers, David Zeuthen, linux-kernel, axboe, linux-hotplug On Mon, Nov 23, 2009 at 12:37:49AM +0100, Pavel Machek wrote: > On Wed 2009-11-18 20:07:12, Matthew Garrett wrote: > > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > > > > > Sure, but what's wrong with reading that file every 50 seconds? Almost > > > all boxes poll for media changes of optical drives and usb card > > > readers anyway, so it's not that we are not doing stuff like this > > > already. > > > > We poll for media because there's no event-based way of avoiding it - in > > this case there is. > > ...when you add overhead to every disk operation. I'd say that polling > once in 50 seconds is preferable to that. Yeah, but 50 seconds isn't the timescale we're talking about here. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-23 14:12 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-23 14:12 UTC (permalink / raw) To: Pavel Machek Cc: Kay Sievers, David Zeuthen, linux-kernel, axboe, linux-hotplug On Mon, Nov 23, 2009 at 12:37:49AM +0100, Pavel Machek wrote: > On Wed 2009-11-18 20:07:12, Matthew Garrett wrote: > > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > > > > > Sure, but what's wrong with reading that file every 50 seconds? Almost > > > all boxes poll for media changes of optical drives and usb card > > > readers anyway, so it's not that we are not doing stuff like this > > > already. > > > > We poll for media because there's no event-based way of avoiding it - in > > this case there is. > > ...when you add overhead to every disk operation. I'd say that polling > once in 50 seconds is preferable to that. Yeah, but 50 seconds isn't the timescale we're talking about here. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-22 23:37 ` [PATCH] [RFC] Add support for uevents on block device idle Pavel Machek @ 2009-11-23 14:17 ` Jens Axboe -1 siblings, 0 replies; 73+ messages in thread From: Jens Axboe @ 2009-11-23 14:17 UTC (permalink / raw) To: Pavel Machek Cc: Matthew Garrett, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23 2009, Pavel Machek wrote: > On Wed 2009-11-18 20:07:12, Matthew Garrett wrote: > > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > > > > > Sure, but what's wrong with reading that file every 50 seconds? Almost > > > all boxes poll for media changes of optical drives and usb card > > > readers anyway, so it's not that we are not doing stuff like this > > > already. > > > > We poll for media because there's no event-based way of avoiding it - in > > this case there is. > > ...when you add overhead to every disk operation. I'd say that polling > once in 50 seconds is preferable to that. I have to agree, doing a mod_timer() on every single IO is going to suck big time. I went to great lengths to avoid doing that even for timeout detection. So that's pretty much a non-starter to begin with. Additionally, as Bart also wrote, you are not doing this in the right place. You want to do this post-merge, not for each incoming IO. Have you looked at laptop mode? Looks like you are essentially re-inventing that, but in a bad way. -- Jens Axboe ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-23 14:17 ` Jens Axboe 0 siblings, 0 replies; 73+ messages in thread From: Jens Axboe @ 2009-11-23 14:17 UTC (permalink / raw) To: Pavel Machek Cc: Matthew Garrett, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23 2009, Pavel Machek wrote: > On Wed 2009-11-18 20:07:12, Matthew Garrett wrote: > > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote: > > > > > Sure, but what's wrong with reading that file every 50 seconds? Almost > > > all boxes poll for media changes of optical drives and usb card > > > readers anyway, so it's not that we are not doing stuff like this > > > already. > > > > We poll for media because there's no event-based way of avoiding it - in > > this case there is. > > ...when you add overhead to every disk operation. I'd say that polling > once in 50 seconds is preferable to that. I have to agree, doing a mod_timer() on every single IO is going to suck big time. I went to great lengths to avoid doing that even for timeout detection. So that's pretty much a non-starter to begin with. Additionally, as Bart also wrote, you are not doing this in the right place. You want to do this post-merge, not for each incoming IO. Have you looked at laptop mode? Looks like you are essentially re-inventing that, but in a bad way. -- Jens Axboe ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-23 14:17 ` [PATCH] [RFC] Add support for uevents on block device idle Jens Axboe @ 2009-11-23 14:25 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-23 14:25 UTC (permalink / raw) To: Jens Axboe Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23, 2009 at 03:17:54PM +0100, Jens Axboe wrote: > I have to agree, doing a mod_timer() on every single IO is going to suck > big time. I went to great lengths to avoid doing that even for timeout > detection. So that's pretty much a non-starter to begin with. It's conditional on a (default off) setting, so it's not a hit unless the user requests it. But yeah, the performance hit is obviously a concern. It may be that polling is the least bad way of doing this. > Additionally, as Bart also wrote, you are not doing this in the right > place. You want to do this post-merge, not for each incoming IO. Have > you looked at laptop mode? Looks like you are essentially re-inventing > that, but in a bad way. Right, that's mostly down to my having no familiarity with the block layer at all :) I can fix that up easily enough, but if a deferrable timer is going to be too expensive then it'll need some rethinking anyway. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-23 14:25 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-23 14:25 UTC (permalink / raw) To: Jens Axboe Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23, 2009 at 03:17:54PM +0100, Jens Axboe wrote: > I have to agree, doing a mod_timer() on every single IO is going to suck > big time. I went to great lengths to avoid doing that even for timeout > detection. So that's pretty much a non-starter to begin with. It's conditional on a (default off) setting, so it's not a hit unless the user requests it. But yeah, the performance hit is obviously a concern. It may be that polling is the least bad way of doing this. > Additionally, as Bart also wrote, you are not doing this in the right > place. You want to do this post-merge, not for each incoming IO. Have > you looked at laptop mode? Looks like you are essentially re-inventing > that, but in a bad way. Right, that's mostly down to my having no familiarity with the block layer at all :) I can fix that up easily enough, but if a deferrable timer is going to be too expensive then it'll need some rethinking anyway. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-23 14:25 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-23 14:31 ` Jens Axboe -1 siblings, 0 replies; 73+ messages in thread From: Jens Axboe @ 2009-11-23 14:31 UTC (permalink / raw) To: Matthew Garrett Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23 2009, Matthew Garrett wrote: > On Mon, Nov 23, 2009 at 03:17:54PM +0100, Jens Axboe wrote: > > > I have to agree, doing a mod_timer() on every single IO is going to suck > > big time. I went to great lengths to avoid doing that even for timeout > > detection. So that's pretty much a non-starter to begin with. > > It's conditional on a (default off) setting, so it's not a hit unless > the user requests it. But yeah, the performance hit is obviously a > concern. It may be that polling is the least bad way of doing this. Even if it's off by default, doesn't mean we shouldn't make the implementation correct or fast :-) > > Additionally, as Bart also wrote, you are not doing this in the right > > place. You want to do this post-merge, not for each incoming IO. Have > > you looked at laptop mode? Looks like you are essentially re-inventing > > that, but in a bad way. > > Right, that's mostly down to my having no familiarity with the block > layer at all :) I can fix that up easily enough, but if a deferrable > timer is going to be too expensive then it'll need some rethinking > anyway. Well, take a look at laptop mode. A timer per-io is probably unavoidable, but doing it at IO completion could mean a big decrease in timer activity as opposed to doing it for each incoming IO. And since you are looking at when the disk is idle, it makes a lot more sense to me to do that when the we complete a request (and have no further pending IO) rather than on incoming IO. Your biggest performance issue here is going to be sync IO, since the disk will go idle very briefly before being kicked into action again. -- Jens Axboe ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-23 14:31 ` Jens Axboe 0 siblings, 0 replies; 73+ messages in thread From: Jens Axboe @ 2009-11-23 14:31 UTC (permalink / raw) To: Matthew Garrett Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23 2009, Matthew Garrett wrote: > On Mon, Nov 23, 2009 at 03:17:54PM +0100, Jens Axboe wrote: > > > I have to agree, doing a mod_timer() on every single IO is going to suck > > big time. I went to great lengths to avoid doing that even for timeout > > detection. So that's pretty much a non-starter to begin with. > > It's conditional on a (default off) setting, so it's not a hit unless > the user requests it. But yeah, the performance hit is obviously a > concern. It may be that polling is the least bad way of doing this. Even if it's off by default, doesn't mean we shouldn't make the implementation correct or fast :-) > > Additionally, as Bart also wrote, you are not doing this in the right > > place. You want to do this post-merge, not for each incoming IO. Have > > you looked at laptop mode? Looks like you are essentially re-inventing > > that, but in a bad way. > > Right, that's mostly down to my having no familiarity with the block > layer at all :) I can fix that up easily enough, but if a deferrable > timer is going to be too expensive then it'll need some rethinking > anyway. Well, take a look at laptop mode. A timer per-io is probably unavoidable, but doing it at IO completion could mean a big decrease in timer activity as opposed to doing it for each incoming IO. And since you are looking at when the disk is idle, it makes a lot more sense to me to do that when the we complete a request (and have no further pending IO) rather than on incoming IO. Your biggest performance issue here is going to be sync IO, since the disk will go idle very briefly before being kicked into action again. -- Jens Axboe ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-23 14:31 ` [PATCH] [RFC] Add support for uevents on block device idle Jens Axboe @ 2009-11-23 14:42 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-23 14:42 UTC (permalink / raw) To: Jens Axboe Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23, 2009 at 03:31:40PM +0100, Jens Axboe wrote: > Well, take a look at laptop mode. A timer per-io is probably > unavoidable, but doing it at IO completion could mean a big decrease in > timer activity as opposed to doing it for each incoming IO. And since > you are looking at when the disk is idle, it makes a lot more sense to > me to do that when the we complete a request (and have no further > pending IO) rather than on incoming IO. Right. The current implementation I have does a del_timer() at submission (which should be moved to post-merge) - that should be cheap in the common case of a new command being submitted when there's already commands outstanding. There's then a mod_timer() at completion time. That's still a certain amount of expense, but it should be less. > Your biggest performance issue here is going to be sync IO, since the > disk will go idle very briefly before being kicked into action again. Ok, I'll try to benchmark that. The alternative (polling) method would be something much like Kay suggested - either add an extra field to stat or an extra sysfs file, then invalidate that on submission and set to jiffies on completion. It's not ideal from a wakeups perspective, but it's pretty low impact on the kernel side. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-23 14:42 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-23 14:42 UTC (permalink / raw) To: Jens Axboe Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23, 2009 at 03:31:40PM +0100, Jens Axboe wrote: > Well, take a look at laptop mode. A timer per-io is probably > unavoidable, but doing it at IO completion could mean a big decrease in > timer activity as opposed to doing it for each incoming IO. And since > you are looking at when the disk is idle, it makes a lot more sense to > me to do that when the we complete a request (and have no further > pending IO) rather than on incoming IO. Right. The current implementation I have does a del_timer() at submission (which should be moved to post-merge) - that should be cheap in the common case of a new command being submitted when there's already commands outstanding. There's then a mod_timer() at completion time. That's still a certain amount of expense, but it should be less. > Your biggest performance issue here is going to be sync IO, since the > disk will go idle very briefly before being kicked into action again. Ok, I'll try to benchmark that. The alternative (polling) method would be something much like Kay suggested - either add an extra field to stat or an extra sysfs file, then invalidate that on submission and set to jiffies on completion. It's not ideal from a wakeups perspective, but it's pretty low impact on the kernel side. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-23 14:42 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett @ 2009-11-23 19:50 ` Jens Axboe -1 siblings, 0 replies; 73+ messages in thread From: Jens Axboe @ 2009-11-23 19:50 UTC (permalink / raw) To: Matthew Garrett Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23 2009, Matthew Garrett wrote: > On Mon, Nov 23, 2009 at 03:31:40PM +0100, Jens Axboe wrote: > > > Well, take a look at laptop mode. A timer per-io is probably > > unavoidable, but doing it at IO completion could mean a big decrease in > > timer activity as opposed to doing it for each incoming IO. And since > > you are looking at when the disk is idle, it makes a lot more sense to > > me to do that when the we complete a request (and have no further > > pending IO) rather than on incoming IO. > > Right. The current implementation I have does a del_timer() at > submission (which should be moved to post-merge) - that should be cheap > in the common case of a new command being submitted when there's already > commands outstanding. There's then a mod_timer() at completion time. > That's still a certain amount of expense, but it should be less. > > > Your biggest performance issue here is going to be sync IO, since the > > disk will go idle very briefly before being kicked into action again. > > Ok, I'll try to benchmark that. > > The alternative (polling) method would be something much like Kay > suggested - either add an extra field to stat or an extra sysfs file, > then invalidate that on submission and set to jiffies on completion. > It's not ideal from a wakeups perspective, but it's pretty low impact on > the kernel side. If the polling works out, then yes that approach is certainly a lot better from a performance impact pov. What kind of time intervals are you targetting? -- Jens Axboe ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-23 19:50 ` Jens Axboe 0 siblings, 0 replies; 73+ messages in thread From: Jens Axboe @ 2009-11-23 19:50 UTC (permalink / raw) To: Matthew Garrett Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23 2009, Matthew Garrett wrote: > On Mon, Nov 23, 2009 at 03:31:40PM +0100, Jens Axboe wrote: > > > Well, take a look at laptop mode. A timer per-io is probably > > unavoidable, but doing it at IO completion could mean a big decrease in > > timer activity as opposed to doing it for each incoming IO. And since > > you are looking at when the disk is idle, it makes a lot more sense to > > me to do that when the we complete a request (and have no further > > pending IO) rather than on incoming IO. > > Right. The current implementation I have does a del_timer() at > submission (which should be moved to post-merge) - that should be cheap > in the common case of a new command being submitted when there's already > commands outstanding. There's then a mod_timer() at completion time. > That's still a certain amount of expense, but it should be less. > > > Your biggest performance issue here is going to be sync IO, since the > > disk will go idle very briefly before being kicked into action again. > > Ok, I'll try to benchmark that. > > The alternative (polling) method would be something much like Kay > suggested - either add an extra field to stat or an extra sysfs file, > then invalidate that on submission and set to jiffies on completion. > It's not ideal from a wakeups perspective, but it's pretty low impact on > the kernel side. If the polling works out, then yes that approach is certainly a lot better from a performance impact pov. What kind of time intervals are you targetting? -- Jens Axboe ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-23 19:50 ` [PATCH] [RFC] Add support for uevents on block device idle Jens Axboe @ 2009-11-23 19:54 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-23 19:54 UTC (permalink / raw) To: Jens Axboe Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23, 2009 at 08:50:00PM +0100, Jens Axboe wrote: > If the polling works out, then yes that approach is certainly a lot > better from a performance impact pov. > > What kind of time intervals are you targetting? On the order of a second. I'm doing benchmarking with my current implementation now, I'll let you know how it looks. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle @ 2009-11-23 19:54 ` Matthew Garrett 0 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-11-23 19:54 UTC (permalink / raw) To: Jens Axboe Cc: Pavel Machek, Kay Sievers, David Zeuthen, linux-kernel, linux-hotplug On Mon, Nov 23, 2009 at 08:50:00PM +0100, Jens Axboe wrote: > If the polling works out, then yes that approach is certainly a lot > better from a performance impact pov. > > What kind of time intervals are you targetting? On the order of a second. I'm doing benchmarking with my current implementation now, I'll let you know how it looks. -- Matthew Garrett | mjg59@srcf.ucam.org ^ permalink raw reply [flat|nested] 73+ messages in thread
* [RFC] Add support for events on block device idle changes 2009-11-23 19:50 ` [PATCH] [RFC] Add support for uevents on block device idle Jens Axboe (?) (?) @ 2009-12-11 21:20 ` Matthew Garrett -1 siblings, 0 replies; 73+ messages in thread From: Matthew Garrett @ 2009-12-11 21:20 UTC (permalink / raw) To: jens.axboe; +Cc: pavel, kay.sievers, david, linux-kernel, Matthew Garrett I looked into polling and realised that it's not going to work - reducing the performance hit requires the idle->active transition to be acknowleged as quickly as possible, so there needs to be an event for that. And for that to happen, we need to have an idea of whether the disk is idle or not. That could be done by just performing a comparison of the last request time against the current time, but it still ends up giving us several of the same problems. What I've done in this version is move the timer modification to the completion, with request submission just performing the timer deletion. This should make things less expensive than the previous versions of the patch. I've also added updates to the idle_hysteresis file, which makes it possible to implement a mechanism for multiple applications with different timeout requirements. This is somewhat hacky, but avoids putting any more complexity in fast path code. Thoughts? --- Userspace may wish to know whether a given disk is active or idle, for example to modify power management policy based on access patterns. This patch adds a deferrable timer to the block layer which will fire if the disk is idle for a user-definable period of time, generating a notification event on the device's stat node. An event will also be generated if an access is received while the disk is classified as idle. Documentation/ABI/testing/sysfs-block | 49 +++++++++++++++++++++++++++++- block/blk-core.c | 5 +++ block/elevator.c | 7 ++++ block/genhd.c | 54 +++++++++++++++++++++++++++++++++ fs/partitions/check.c | 9 ++++- include/linux/genhd.h | 6 ++++ 6 files changed, 127 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 5f3beda..03b411b 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -3,7 +3,7 @@ Date: February 2008 Contact: Jerome Marchand <jmarchan@redhat.com> Description: The /sys/block/<disk>/stat files displays the I/O - statistics of disk <disk>. They contain 11 fields: + statistics of disk <disk>. They contain 12 fields: 1 - reads completed succesfully 2 - reads merged 3 - sectors read @@ -15,6 +15,14 @@ Description: 9 - I/Os currently in progress 10 - time spent doing I/Os (ms) 11 - weighted time spent doing I/Os (ms) + 12 - -1 if the disk is active, otherwise the length of time in + milliseconds since the disk became idle (determined by + the idle_hysteresis setting) + + Applications that call poll() or select() on this attribute + will be woken when the block device undergoes a transition + between active and idle. + For more details refer Documentation/iostats.txt @@ -128,3 +136,42 @@ Description: preferred request size for workloads where sustained throughput is desired. If no optimal I/O size is reported this file contains 0. + +What: /sys/block/<disk>/idle_hysteresis +Date: November 2009 +Contact: Matthew Garrett <mjg@redhat.com> +Description: + Contains the number of milliseconds to wait after an + access before declaring that a disk is idle. Any + accesses to the block device during this time will + reset the timer. "0" (the default) indicates that no + events will be generated. If a value has already been + written to this file, then the only valid values are + either "0" (to disable notification) or a number + smaller than the current value. On write, a + notification will be sent to any userspace + applications poll()ing on this file. + + The intended use of this interface is to allow + applications to change power management policy based + on disk activity patterns. The first application to + use this interface should write its timeout value and + continue monitoring this file along with + "stat". Notifications will be sent to the "stat" file + when the disk switches from active to idle or + vice-versa. + + If more than one application uses this interface, the + second application should attempt to write its own + timeout. If this fails, it should read the current + timeout. It will then be woken on active to idle + transitions, at which point it should sleep for the + time difference between its desired timeout and the + programmed timeout. On waking, it should then re-read + the "state" file to determine if the disk has been + idle for long enough. + + If the write is successful, then the first application + to use the interface will be woken instead. It should + then modify its wakeup code to match the above + description. diff --git a/block/blk-core.c b/block/blk-core.c index 10e305f..16b501a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2115,6 +2115,11 @@ static void blk_finish_request(struct request *req, int error) if (unlikely(laptop_mode) && blk_fs_request(req)) laptop_io_completion(req); + if (blk_fs_request(req) && req->rq_disk->hysteresis_time) + mod_timer(&req->rq_disk->hysteresis_timer, + jiffies+msecs_to_jiffies + (req->rq_disk->hysteresis_time)); + blk_delete_timer(req); blk_account_io_done(req); diff --git a/block/elevator.c b/block/elevator.c index a847046..01f0bfb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -683,6 +683,13 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) BUG(); } + if (blk_fs_request(rq) && rq->rq_disk->hysteresis_time && + rq->rq_disk->idle) { + rq->rq_disk->idle = 0; + del_timer(&rq->rq_disk->hysteresis_timer); + schedule_work(&rq->rq_disk->idle_notify); + } + if (unplug_it && blk_queue_plugged(q)) { int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - queue_in_flight(q); diff --git a/block/genhd.c b/block/genhd.c index 517e433..802c142 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -504,6 +504,21 @@ static int exact_lock(dev_t devt, void *data) return 0; } +static void disk_idle(unsigned long data) +{ + struct gendisk *gd = (struct gendisk *)data; + + gd->idle = jiffies; + schedule_work(&gd->idle_notify); +} + +static void disk_idle_notify_thread(struct work_struct *work) +{ + struct gendisk *gd = container_of(work, struct gendisk, idle_notify); + + sysfs_notify(&disk_to_dev(gd)->kobj, NULL, "stat"); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -543,6 +558,10 @@ void add_disk(struct gendisk *disk) blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); + + init_timer(&disk->hysteresis_timer); + setup_timer(&disk->hysteresis_timer, disk_idle, (unsigned long)disk); + register_disk(disk); blk_register_queue(disk); @@ -861,6 +880,36 @@ static ssize_t disk_alignment_offset_show(struct device *dev, return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); } +static ssize_t disk_idle_hysteresis_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->hysteresis_time); +} + +static ssize_t disk_idle_hysteresis_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + unsigned long timeout; + int res; + + res = strict_strtoul(buf, 10, &timeout); + if (res) + return -EINVAL; + + if (disk->hysteresis_time && timeout > disk->hysteresis_time) + return -EINVAL; + + disk->hysteresis_time = timeout; + sysfs_notify(&dev->kobj, NULL, "idle_hysteresis"); + + return count; +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -870,6 +919,8 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(idle_hysteresis, 0644, disk_idle_hysteresis_show, + disk_idle_hysteresis_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -890,6 +941,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_idle_hysteresis.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1183,6 +1235,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); INIT_WORK(&disk->async_notify, media_change_notify_thread); + INIT_WORK(&disk->idle_notify, + disk_idle_notify_thread); } return disk; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e1..954dc32 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -230,6 +230,7 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + struct gendisk *gd = dev_to_disk(dev); int cpu; cpu = part_stat_lock(); @@ -238,7 +239,7 @@ ssize_t part_stat_show(struct device *dev, return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " - "%8u %8u %8u" + "%8u %8u %8u %8d" "\n", part_stat_read(p, ios[READ]), part_stat_read(p, merges[READ]), @@ -250,7 +251,8 @@ ssize_t part_stat_show(struct device *dev, jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue))); + jiffies_to_msecs(part_stat_read(p, time_in_queue)), + gd->idle ? jiffies_to_msecs(jiffies - gd->idle) : -1); } ssize_t part_inflight_show(struct device *dev, @@ -652,6 +654,9 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + del_timer_sync(&disk->hysteresis_timer); + cancel_work_sync(&disk->idle_notify); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 297df45..b8e6158 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kdev_t.h> #include <linux/rcupdate.h> +#include <linux/timer.h> #ifdef CONFIG_BLOCK @@ -163,10 +164,15 @@ struct gendisk { atomic_t sync_io; /* RAID */ struct work_struct async_notify; + struct work_struct idle_notify; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif int node_id; + + unsigned long idle; + int hysteresis_time; + struct timer_list hysteresis_timer; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) -- 1.6.5.2 ^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes 2009-11-17 14:37 ` Matthew Garrett @ 2009-11-18 22:10 ` Bartlomiej Zolnierkiewicz -1 siblings, 0 replies; 73+ messages in thread From: Bartlomiej Zolnierkiewicz @ 2009-11-18 22:10 UTC (permalink / raw) To: Matthew Garrett; +Cc: linux-kernel, axboe, linux-hotplug On Tuesday 17 November 2009 15:37:39 Matthew Garrett wrote: > @@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio) > if (should_fail_request(bio)) > goto end_io; > > + if (bio->bi_bdev->bd_disk->hysteresis_time && > + bio_has_data(bio) && > + !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer, > + jiffies+msecs_to_jiffies > + (bio->bi_bdev->bd_disk->hysteresis_time))) { > + bio->bi_bdev->bd_disk->idle = 0; > + schedule_work(&bio->bi_bdev->bd_disk->idle_notify); > + } > + Wouldn't it be more reliable to hook into places where block requests are issued/completed instead of queued? This way you will not miss special requests (some of them are quite common in practice and may take a relatively long time to execute, i.e. FLUSH CACHE). -- Bartlomiej Zolnierkiewicz ^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] [RFC] Add support for uevents on block device idle changes @ 2009-11-18 22:10 ` Bartlomiej Zolnierkiewicz 0 siblings, 0 replies; 73+ messages in thread From: Bartlomiej Zolnierkiewicz @ 2009-11-18 22:10 UTC (permalink / raw) To: Matthew Garrett; +Cc: linux-kernel, axboe, linux-hotplug On Tuesday 17 November 2009 15:37:39 Matthew Garrett wrote: > @@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio) > if (should_fail_request(bio)) > goto end_io; > > + if (bio->bi_bdev->bd_disk->hysteresis_time && > + bio_has_data(bio) && > + !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer, > + jiffies+msecs_to_jiffies > + (bio->bi_bdev->bd_disk->hysteresis_time))) { > + bio->bi_bdev->bd_disk->idle = 0; > + schedule_work(&bio->bi_bdev->bd_disk->idle_notify); > + } > + Wouldn't it be more reliable to hook into places where block requests are issued/completed instead of queued? This way you will not miss special requests (some of them are quite common in practice and may take a relatively long time to execute, i.e. FLUSH CACHE). -- Bartlomiej Zolnierkiewicz ^ permalink raw reply [flat|nested] 73+ messages in thread
end of thread, other threads:[~2009-12-11 21:21 UTC | newest] Thread overview: 73+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2009-11-17 14:37 [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-17 14:37 ` Matthew Garrett 2009-11-17 15:55 ` Kay Sievers 2009-11-17 15:55 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-17 16:09 ` [PATCH] [RFC] Add support for uevents on block device idle changes David Zeuthen 2009-11-17 16:09 ` [PATCH] [RFC] Add support for uevents on block device idle David Zeuthen 2009-11-17 18:57 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-17 18:57 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-18 19:30 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-18 19:30 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-18 19:40 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-18 19:40 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-18 19:47 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-18 19:47 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-18 19:53 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-18 19:53 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-18 20:03 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-18 20:03 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-18 20:07 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-18 20:07 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-18 21:06 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-18 21:06 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-18 21:29 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-18 21:29 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-18 21:35 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-18 21:35 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-18 21:39 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-18 21:39 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-18 21:45 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-18 21:45 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-18 21:33 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-18 21:33 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-18 21:40 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-18 21:40 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-19 11:09 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-19 11:09 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-19 13:01 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-19 13:01 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-19 13:29 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-19 13:29 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-19 14:16 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-19 14:16 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-19 14:25 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-19 14:25 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-19 14:30 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-19 14:30 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-19 14:34 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-19 14:34 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-19 14:48 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-19 14:48 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-19 15:00 ` [PATCH] [RFC] Add support for uevents on block device idle changes Kay Sievers 2009-11-19 15:00 ` [PATCH] [RFC] Add support for uevents on block device idle Kay Sievers 2009-11-20 20:29 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-20 20:29 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-22 23:37 ` [PATCH] [RFC] Add support for uevents on block device idle changes Pavel Machek 2009-11-22 23:37 ` [PATCH] [RFC] Add support for uevents on block device idle Pavel Machek 2009-11-23 14:12 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-23 14:12 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-23 14:17 ` [PATCH] [RFC] Add support for uevents on block device idle changes Jens Axboe 2009-11-23 14:17 ` [PATCH] [RFC] Add support for uevents on block device idle Jens Axboe 2009-11-23 14:25 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-23 14:25 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-23 14:31 ` [PATCH] [RFC] Add support for uevents on block device idle changes Jens Axboe 2009-11-23 14:31 ` [PATCH] [RFC] Add support for uevents on block device idle Jens Axboe 2009-11-23 14:42 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-23 14:42 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-11-23 19:50 ` [PATCH] [RFC] Add support for uevents on block device idle changes Jens Axboe 2009-11-23 19:50 ` [PATCH] [RFC] Add support for uevents on block device idle Jens Axboe 2009-11-23 19:54 ` [PATCH] [RFC] Add support for uevents on block device idle changes Matthew Garrett 2009-11-23 19:54 ` [PATCH] [RFC] Add support for uevents on block device idle Matthew Garrett 2009-12-11 21:20 ` [RFC] Add support for events on block device idle changes Matthew Garrett 2009-11-18 22:10 ` [PATCH] [RFC] Add support for uevents " Bartlomiej Zolnierkiewicz 2009-11-18 22:10 ` Bartlomiej Zolnierkiewicz
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.