* [PATCH] libceph: Complete stuck requests to OSD with EIO
@ 2017-02-09 13:04 Artur Molchanov
2017-02-09 14:27 ` Ilya Dryomov
2017-02-10 11:31 ` Jeff Layton
0 siblings, 2 replies; 14+ messages in thread
From: Artur Molchanov @ 2017-02-09 13:04 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: ceph-devel
From: Artur Molchanov <artur.molchanov@synesis.ru>
Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
If osd_request_timeout equals to 0 (default value) then do nothing with
hung requests (keep default behavior).
Create RBD map option osd_request_timeout to set timeout in seconds. Set
osd_request_timeout to 0 by default.
Signed-off-by: Artur Molchanov <artur.molchanov@synesis.ru>
---
include/linux/ceph/libceph.h | 8 +++--
include/linux/ceph/osd_client.h | 1 +
net/ceph/ceph_common.c | 12 +++++++
net/ceph/osd_client.c | 72 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 90 insertions(+), 3 deletions(-)
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 1816c5e..10d8acb 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -48,6 +48,7 @@ struct ceph_options {
unsigned long mount_timeout; /* jiffies */
unsigned long osd_idle_ttl; /* jiffies */
unsigned long osd_keepalive_timeout; /* jiffies */
+ unsigned long osd_request_timeout; /* jiffies */
/*
* any type that can't be simply compared or doesn't need need
@@ -65,9 +66,10 @@ struct ceph_options {
/*
* defaults
*/
-#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
-#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
-#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
+#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
+#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT msecs_to_jiffies(0 * 1000)
#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03a6653..e45cba0 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -279,6 +279,7 @@ struct ceph_osd_client {
atomic_t num_homeless;
struct delayed_work timeout_work;
struct delayed_work osds_timeout_work;
+ struct delayed_work osd_request_timeout_work;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_file;
#endif
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e885..81876c8 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@ enum {
Opt_osdkeepalivetimeout,
Opt_mount_timeout,
Opt_osd_idle_ttl,
+ Opt_osd_request_timeout,
Opt_last_int,
/* int args above */
Opt_fsid,
@@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
{Opt_mount_timeout, "mount_timeout=%d"},
{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+ {Opt_osd_request_timeout, "osd_request_timeout=%d"},
/* int args above */
{Opt_fsid, "fsid=%s"},
{Opt_name, "name=%s"},
@@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+ opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
/* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
}
opt->mount_timeout = msecs_to_jiffies(intval * 1000);
break;
+ case Opt_osd_request_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (intval < 0 || intval > INT_MAX / 1000) {
+ pr_err("osd_request_timeout out of range\n");
+ err = -EINVAL;
+ goto out;
+ }
+ opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
+ break;
case Opt_share:
opt->flags &= ~CEPH_OPT_NOSHARE;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 842f049..2f5a024 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work)
osdc->client->options->osd_keepalive_timeout);
}
+/*
+ * Complete request to OSD with error err if it started before cutoff.
+ */
+static void complete_osd_stuck_request(struct ceph_osd_request *req,
+ unsigned long cutoff,
+ int err)
+{
+ if (!time_before(req->r_stamp, cutoff))
+ return;
+
+ pr_warn_ratelimited("osd req on osd%d timeout expired\n",
+ req->r_osd->o_osd);
+
+ complete_request(req, err);
+}
+
+/*
+ * Complete all requests to OSD that has been active for more than timeout.
+ */
+static void complete_osd_stuck_requests(struct ceph_osd *osd,
+ unsigned long timeout,
+ int err)
+{
+ struct ceph_osd_request *req, *n;
+ const unsigned long cutoff = jiffies - timeout;
+
+ rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
+ complete_osd_stuck_request(req, cutoff, -EIO);
+ }
+}
+
+/*
+ * Timeout callback, called every N seconds. When 1 or more OSD
+ * requests that has been active for more than N seconds,
+ * we complete it with error EIO.
+ */
+static void handle_osd_request_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client,
+ osd_request_timeout_work.work);
+ struct ceph_osd *osd, *n;
+ struct ceph_options *opts = osdc->client->options;
+ const unsigned long timeout = opts->osd_request_timeout;
+
+ dout("%s osdc %p\n", __func__, osdc);
+
+ if (!timeout)
+ return;
+
+ down_write(&osdc->lock);
+
+ rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
+ complete_osd_stuck_requests(osd, timeout, -EIO);
+ }
+
+ up_write(&osdc->lock);
+
+ if (!atomic_read(&osdc->num_homeless))
+ goto out;
+
+ down_write(&osdc->lock);
+ complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
+ up_write(&osdc->lock);
+
+out:
+ schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
+}
+
static void handle_osds_timeout(struct work_struct *work)
{
struct ceph_osd_client *osdc =
@@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
ceph_client *client)
osdc->linger_map_checks = RB_ROOT;
INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+ INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);
err = -ENOMEM;
osdc->osdmap = ceph_osdmap_alloc();
@@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
ceph_client *client)
osdc->client->options->osd_keepalive_timeout);
schedule_delayed_work(&osdc->osds_timeout_work,
round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+ schedule_delayed_work(&osdc->osd_request_timeout_work,
+ round_jiffies_relative(osdc->client->options->osd_request_timeout));
return 0;
--
2.7.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-09 13:04 [PATCH] libceph: Complete stuck requests to OSD with EIO Artur Molchanov
@ 2017-02-09 14:27 ` Ilya Dryomov
2017-02-09 15:51 ` Artur Molchanov
2017-02-10 11:31 ` Jeff Layton
1 sibling, 1 reply; 14+ messages in thread
From: Ilya Dryomov @ 2017-02-09 14:27 UTC (permalink / raw)
To: Artur Molchanov; +Cc: Ceph Development
On Thu, Feb 9, 2017 at 2:04 PM, Artur Molchanov
<artur.molchanov@synesis.ru> wrote:
> From: Artur Molchanov <artur.molchanov@synesis.ru>
>
> Complete stuck requests to OSD with error EIO after osd_request_timeout
> expired.
> If osd_request_timeout equals to 0 (default value) then do nothing with
> hung requests (keep default behavior).
>
> Create RBD map option osd_request_timeout to set timeout in seconds. Set
> osd_request_timeout to 0 by default.
Hi Artur,
What is the use case? What do you expect this timeout to be in your
environment?
I actually have a slightly more compact version of this somewhere,
which I deliberately excluded when updating the OSD client a few kernel
releases ago. IIRC complete_request() wasn't meant to be called
arbitrarily, but we can probably fix that.
Thanks,
Ilya
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-09 14:27 ` Ilya Dryomov
@ 2017-02-09 15:51 ` Artur Molchanov
2017-02-09 16:11 ` Wido den Hollander
0 siblings, 1 reply; 14+ messages in thread
From: Artur Molchanov @ 2017-02-09 15:51 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Ceph Development
Hi Ilya,
Currently we're working on moving our product to kubernetes / ceph stack. We're
actively tuning ceph configuration and ceph cluster often need to be redeployed.
We have to warm reboot all machines with mounted RBDs as a result. This patch
allows us to remap and remount RBDs without time consuming reboot.
On 02/09/2017 05:27 PM, Ilya Dryomov wrote:
> What is the use case? What do you expect this timeout to be in your
> environment?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-09 15:51 ` Artur Molchanov
@ 2017-02-09 16:11 ` Wido den Hollander
0 siblings, 0 replies; 14+ messages in thread
From: Wido den Hollander @ 2017-02-09 16:11 UTC (permalink / raw)
To: Artur Molchanov, Ilya Dryomov; +Cc: Ceph Development
> Op 9 februari 2017 om 16:51 schreef Artur Molchanov <artur.molchanov@synesis.ru>:
>
>
> Hi Ilya,
>
> Currently we're working on moving our product to kubernetes / ceph stack. We're
> actively tuning ceph configuration and ceph cluster often need to be redeployed.
> We have to warm reboot all machines with mounted RBDs as a result. This patch
> allows us to remap and remount RBDs without time consuming reboot.
>
I understand the usecase. You don't always want to block, sometimes you want to timeout after some time.
Wido
> On 02/09/2017 05:27 PM, Ilya Dryomov wrote:
> > What is the use case? What do you expect this timeout to be in your
> > environment?
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-09 13:04 [PATCH] libceph: Complete stuck requests to OSD with EIO Artur Molchanov
2017-02-09 14:27 ` Ilya Dryomov
@ 2017-02-10 11:31 ` Jeff Layton
2017-02-10 12:25 ` Ilya Dryomov
2017-02-11 20:30 ` Artur Molchanov
1 sibling, 2 replies; 14+ messages in thread
From: Jeff Layton @ 2017-02-10 11:31 UTC (permalink / raw)
To: Artur Molchanov, Ilya Dryomov; +Cc: ceph-devel
On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
> From: Artur Molchanov <artur.molchanov@synesis.ru>
>
> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
> If osd_request_timeout equals to 0 (default value) then do nothing with
> hung requests (keep default behavior).
>
> Create RBD map option osd_request_timeout to set timeout in seconds. Set
> osd_request_timeout to 0 by default.
>
Also, what exactly are the requests blocked on when this occurs? Is the
ceph_osd_request_target ending up paused? I wonder if we might be better
off with something that returns a hard error under the circumstances
where you're hanging, rather than depending on timeouts.
> Signed-off-by: Artur Molchanov <artur.molchanov@synesis.ru>
> ---
> include/linux/ceph/libceph.h | 8 +++--
> include/linux/ceph/osd_client.h | 1 +
> net/ceph/ceph_common.c | 12 +++++++
> net/ceph/osd_client.c | 72 +++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 90 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 1816c5e..10d8acb 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -48,6 +48,7 @@ struct ceph_options {
> unsigned long mount_timeout; /* jiffies */
> unsigned long osd_idle_ttl; /* jiffies */
> unsigned long osd_keepalive_timeout; /* jiffies */
> + unsigned long osd_request_timeout; /* jiffies */
>
> /*
> * any type that can't be simply compared or doesn't need need
> @@ -65,9 +66,10 @@ struct ceph_options {
> /*
> * defaults
> */
> -#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
> -#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
> -#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
> +#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
> +#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
> +#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
> +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT msecs_to_jiffies(0 * 1000)
>
> #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
> #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 03a6653..e45cba0 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -279,6 +279,7 @@ struct ceph_osd_client {
> atomic_t num_homeless;
> struct delayed_work timeout_work;
> struct delayed_work osds_timeout_work;
> + struct delayed_work osd_request_timeout_work;
> #ifdef CONFIG_DEBUG_FS
> struct dentry *debugfs_file;
> #endif
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index 464e885..81876c8 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -230,6 +230,7 @@ enum {
> Opt_osdkeepalivetimeout,
> Opt_mount_timeout,
> Opt_osd_idle_ttl,
> + Opt_osd_request_timeout,
> Opt_last_int,
> /* int args above */
> Opt_fsid,
> @@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
> {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
> {Opt_mount_timeout, "mount_timeout=%d"},
> {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
> + {Opt_osd_request_timeout, "osd_request_timeout=%d"},
> /* int args above */
> {Opt_fsid, "fsid=%s"},
> {Opt_name, "name=%s"},
> @@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
> opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
> opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
> opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
> + opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
>
> /* get mon ip(s) */
> /* ip1[:port1][,ip2[:port2]...] */
> @@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
> }
> opt->mount_timeout = msecs_to_jiffies(intval * 1000);
> break;
> + case Opt_osd_request_timeout:
> + /* 0 is "wait forever" (i.e. infinite timeout) */
> + if (intval < 0 || intval > INT_MAX / 1000) {
> + pr_err("osd_request_timeout out of range\n");
> + err = -EINVAL;
> + goto out;
> + }
> + opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
> + break;
>
> case Opt_share:
> opt->flags &= ~CEPH_OPT_NOSHARE;
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 842f049..2f5a024 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work)
> osdc->client->options->osd_keepalive_timeout);
> }
>
> +/*
> + * Complete request to OSD with error err if it started before cutoff.
> + */
> +static void complete_osd_stuck_request(struct ceph_osd_request *req,
> + unsigned long cutoff,
> + int err)
> +{
> + if (!time_before(req->r_stamp, cutoff))
> + return;
> +
> + pr_warn_ratelimited("osd req on osd%d timeout expired\n",
> + req->r_osd->o_osd);
> +
> + complete_request(req, err);
> +}
> +
> +/*
> + * Complete all requests to OSD that has been active for more than timeout.
> + */
> +static void complete_osd_stuck_requests(struct ceph_osd *osd,
> + unsigned long timeout,
> + int err)
> +{
> + struct ceph_osd_request *req, *n;
> + const unsigned long cutoff = jiffies - timeout;
> +
> + rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
> + complete_osd_stuck_request(req, cutoff, -EIO);
> + }
> +}
> +
> +/*
> + * Timeout callback, called every N seconds. When 1 or more OSD
> + * requests that has been active for more than N seconds,
> + * we complete it with error EIO.
> + */
> +static void handle_osd_request_timeout(struct work_struct *work)
> +{
> + struct ceph_osd_client *osdc =
> + container_of(work, struct ceph_osd_client,
> + osd_request_timeout_work.work);
> + struct ceph_osd *osd, *n;
> + struct ceph_options *opts = osdc->client->options;
> + const unsigned long timeout = opts->osd_request_timeout;
> +
> + dout("%s osdc %p\n", __func__, osdc);
> +
> + if (!timeout)
> + return;
> +
> + down_write(&osdc->lock);
> +
> + rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
> + complete_osd_stuck_requests(osd, timeout, -EIO);
> + }
> +
> + up_write(&osdc->lock);
> +
> + if (!atomic_read(&osdc->num_homeless))
> + goto out;
> +
> + down_write(&osdc->lock);
> + complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
> + up_write(&osdc->lock);
> +
> +out:
> + schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
> +}
> +
> static void handle_osds_timeout(struct work_struct *work)
> {
> struct ceph_osd_client *osdc =
> @@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
> ceph_client *client)
> osdc->linger_map_checks = RB_ROOT;
> INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
> INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
> + INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);
>
> err = -ENOMEM;
> osdc->osdmap = ceph_osdmap_alloc();
> @@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
> ceph_client *client)
> osdc->client->options->osd_keepalive_timeout);
> schedule_delayed_work(&osdc->osds_timeout_work,
> round_jiffies_relative(osdc->client->options->osd_idle_ttl));
> + schedule_delayed_work(&osdc->osd_request_timeout_work,
> + round_jiffies_relative(osdc->client->options->osd_request_timeout));
>
> return 0;
>
Having a job that has to wake up every second or so isn't ideal. Perhaps
you would be better off scheduling the delayed work in the request
submission codepath, and only rearm it when the tree isn't empty after
calling complete_osd_stuck_requests?
Also, I don't see where this job is ever cancelled when the osdc is torn
down. That needs to occur or you'll cause a use-after-free oops...
--
Jeff Layton <jlayton@redhat.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-10 11:31 ` Jeff Layton
@ 2017-02-10 12:25 ` Ilya Dryomov
2017-02-13 9:11 ` Ilya Dryomov
2017-02-11 20:30 ` Artur Molchanov
1 sibling, 1 reply; 14+ messages in thread
From: Ilya Dryomov @ 2017-02-10 12:25 UTC (permalink / raw)
To: Jeff Layton; +Cc: Artur Molchanov, Ceph Development
On Fri, Feb 10, 2017 at 12:31 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>> From: Artur Molchanov <artur.molchanov@synesis.ru>
>>
>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>> If osd_request_timeout equals to 0 (default value) then do nothing with
>> hung requests (keep default behavior).
>>
>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>> osd_request_timeout to 0 by default.
>>
> Also, what exactly are the requests blocked on when this occurs? Is the
> ceph_osd_request_target ending up paused? I wonder if we might be better
> off with something that returns a hard error under the circumstances
> where you're hanging, rather than depending on timeouts.
>
>
>> Signed-off-by: Artur Molchanov <artur.molchanov@synesis.ru>
>> ---
>> include/linux/ceph/libceph.h | 8 +++--
>> include/linux/ceph/osd_client.h | 1 +
>> net/ceph/ceph_common.c | 12 +++++++
>> net/ceph/osd_client.c | 72 +++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 90 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
>> index 1816c5e..10d8acb 100644
>> --- a/include/linux/ceph/libceph.h
>> +++ b/include/linux/ceph/libceph.h
>> @@ -48,6 +48,7 @@ struct ceph_options {
>> unsigned long mount_timeout; /* jiffies */
>> unsigned long osd_idle_ttl; /* jiffies */
>> unsigned long osd_keepalive_timeout; /* jiffies */
>> + unsigned long osd_request_timeout; /* jiffies */
>>
>> /*
>> * any type that can't be simply compared or doesn't need need
>> @@ -65,9 +66,10 @@ struct ceph_options {
>> /*
>> * defaults
>> */
>> -#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
>> -#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
>> -#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
>> +#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
>> +#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
>> +#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
>> +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT msecs_to_jiffies(0 * 1000)
>>
>> #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
>> #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>> index 03a6653..e45cba0 100644
>> --- a/include/linux/ceph/osd_client.h
>> +++ b/include/linux/ceph/osd_client.h
>> @@ -279,6 +279,7 @@ struct ceph_osd_client {
>> atomic_t num_homeless;
>> struct delayed_work timeout_work;
>> struct delayed_work osds_timeout_work;
>> + struct delayed_work osd_request_timeout_work;
>> #ifdef CONFIG_DEBUG_FS
>> struct dentry *debugfs_file;
>> #endif
>> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
>> index 464e885..81876c8 100644
>> --- a/net/ceph/ceph_common.c
>> +++ b/net/ceph/ceph_common.c
>> @@ -230,6 +230,7 @@ enum {
>> Opt_osdkeepalivetimeout,
>> Opt_mount_timeout,
>> Opt_osd_idle_ttl,
>> + Opt_osd_request_timeout,
>> Opt_last_int,
>> /* int args above */
>> Opt_fsid,
>> @@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
>> {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
>> {Opt_mount_timeout, "mount_timeout=%d"},
>> {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
>> + {Opt_osd_request_timeout, "osd_request_timeout=%d"},
>> /* int args above */
>> {Opt_fsid, "fsid=%s"},
>> {Opt_name, "name=%s"},
>> @@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
>> opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
>> opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
>> opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
>> + opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
>>
>> /* get mon ip(s) */
>> /* ip1[:port1][,ip2[:port2]...] */
>> @@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
>> }
>> opt->mount_timeout = msecs_to_jiffies(intval * 1000);
>> break;
>> + case Opt_osd_request_timeout:
>> + /* 0 is "wait forever" (i.e. infinite timeout) */
>> + if (intval < 0 || intval > INT_MAX / 1000) {
>> + pr_err("osd_request_timeout out of range\n");
>> + err = -EINVAL;
>> + goto out;
>> + }
>> + opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
>> + break;
>>
>> case Opt_share:
>> opt->flags &= ~CEPH_OPT_NOSHARE;
>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> index 842f049..2f5a024 100644
>> --- a/net/ceph/osd_client.c
>> +++ b/net/ceph/osd_client.c
>> @@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work)
>> osdc->client->options->osd_keepalive_timeout);
>> }
>>
>> +/*
>> + * Complete request to OSD with error err if it started before cutoff.
>> + */
>> +static void complete_osd_stuck_request(struct ceph_osd_request *req,
>> + unsigned long cutoff,
>> + int err)
>> +{
>> + if (!time_before(req->r_stamp, cutoff))
>> + return;
>> +
>> + pr_warn_ratelimited("osd req on osd%d timeout expired\n",
>> + req->r_osd->o_osd);
>> +
>> + complete_request(req, err);
>> +}
>> +
>> +/*
>> + * Complete all requests to OSD that has been active for more than timeout.
>> + */
>> +static void complete_osd_stuck_requests(struct ceph_osd *osd,
>> + unsigned long timeout,
>> + int err)
>> +{
>> + struct ceph_osd_request *req, *n;
>> + const unsigned long cutoff = jiffies - timeout;
>> +
>> + rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
>> + complete_osd_stuck_request(req, cutoff, -EIO);
>> + }
>> +}
>> +
>> +/*
>> + * Timeout callback, called every N seconds. When 1 or more OSD
>> + * requests that has been active for more than N seconds,
>> + * we complete it with error EIO.
>> + */
>> +static void handle_osd_request_timeout(struct work_struct *work)
>> +{
>> + struct ceph_osd_client *osdc =
>> + container_of(work, struct ceph_osd_client,
>> + osd_request_timeout_work.work);
>> + struct ceph_osd *osd, *n;
>> + struct ceph_options *opts = osdc->client->options;
>> + const unsigned long timeout = opts->osd_request_timeout;
>> +
>> + dout("%s osdc %p\n", __func__, osdc);
>> +
>> + if (!timeout)
>> + return;
>> +
>> + down_write(&osdc->lock);
>> +
>> + rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
>> + complete_osd_stuck_requests(osd, timeout, -EIO);
>> + }
>> +
>> + up_write(&osdc->lock);
>> +
>> + if (!atomic_read(&osdc->num_homeless))
>> + goto out;
>> +
>> + down_write(&osdc->lock);
>> + complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
>> + up_write(&osdc->lock);
>> +
>> +out:
>> + schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
>> +}
>> +
>> static void handle_osds_timeout(struct work_struct *work)
>> {
>> struct ceph_osd_client *osdc =
>> @@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
>> ceph_client *client)
>> osdc->linger_map_checks = RB_ROOT;
>> INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
>> INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
>> + INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);
>>
>> err = -ENOMEM;
>> osdc->osdmap = ceph_osdmap_alloc();
>> @@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
>> ceph_client *client)
>> osdc->client->options->osd_keepalive_timeout);
>> schedule_delayed_work(&osdc->osds_timeout_work,
>> round_jiffies_relative(osdc->client->options->osd_idle_ttl));
>> + schedule_delayed_work(&osdc->osd_request_timeout_work,
>> + round_jiffies_relative(osdc->client->options->osd_request_timeout));
>>
>> return 0;
>>
>
> Having a job that has to wake up every second or so isn't ideal. Perhaps
> you would be better off scheduling the delayed work in the request
> submission codepath, and only rearm it when the tree isn't empty after
> calling complete_osd_stuck_requests?
>
> Also, I don't see where this job is ever cancelled when the osdc is torn
> down. That needs to occur or you'll cause a use-after-free oops...
Yeah, there are other bugs here: rbtree_postorder_for_each_entry_safe()
can't be used because complete_request() calls erase(), etc. The patch
I had just piggy-backed on handle_timeout(), which is called every five
seconds anyway.
Given that there is interest, I think it's worth adding.
Thanks,
Ilya
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-10 11:31 ` Jeff Layton
2017-02-10 12:25 ` Ilya Dryomov
@ 2017-02-11 20:30 ` Artur Molchanov
2017-02-12 12:34 ` Jeff Layton
1 sibling, 1 reply; 14+ messages in thread
From: Artur Molchanov @ 2017-02-11 20:30 UTC (permalink / raw)
To: Jeff Layton, Ilya Dryomov; +Cc: ceph-devel
Hi Jeff,
On Fri, 2017-02-10 at 14:31, Jeff Layton wrote:
> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>> From: Artur Molchanov <artur.molchanov@synesis.ru>
>>
>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>> If osd_request_timeout equals to 0 (default value) then do nothing with
>> hung requests (keep default behavior).
>>
>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>> osd_request_timeout to 0 by default.
>>
> Also, what exactly are the requests blocked on when this occurs? Is the
> ceph_osd_request_target ending up paused? I wonder if we might be better
> off with something that returns a hard error under the circumstances
> where you're hanging, rather than depending on timeouts.
I wonder that it is better to complete requests only after timeout expired, just
because a request can fail due to temporary network issues (e.g. router
restarted) or restarting machine/services.
> Having a job that has to wake up every second or so isn't ideal. Perhaps
> you would be better off scheduling the delayed work in the request
> submission codepath, and only rearm it when the tree isn't empty after
> calling complete_osd_stuck_requests?
Would you please tell me more about rearming work only if the tree is not empty
after calling complete_osd_stuck_requests? From what code we should call
complete_osd_stuck_requests?
As I understood, there are two primary cases:
1 - Requests to OSD failed, but monitors do not return new osdmap (because all
monitors are offline or monitors did not update osdmap yet).
In this case requests are retried by cyclically calling ceph_con_workfn -> con_fault
-> ceph_con_workfn. We can check the request timestamp and, instead of calling
con_fault, complete the request.
2 - Monitors return new osdmap which does not have any OSD for RBD.
In this case all requests to the last ready OSD will be linked on "homeless" OSD
and will not be retried until a new osdmap with an appropriate OSD is received. I think
that we need additional periodic checking of the timestamps of such requests.
Yes, there is already an existing job, handle_timeout. But the responsibility of
this job is to send keepalive requests to slow OSDs. I'm not sure that it is a
good idea to perform additional actions inside this job.
I decided that creating a specific job, handle_osd_request_timeout, is more appropriate.
This job will be run only once with the default value of osd_request_timeout (0).
At the same time, I think that users will not use too small a value for this
parameter. I expect that the typical value will be about 1 minute or greater.
> Also, I don't see where this job is ever cancelled when the osdc is torn
> down. That needs to occur or you'll cause a use-after-free oops...
It is my fault, thanks for the correction.
--
Artur
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-11 20:30 ` Artur Molchanov
@ 2017-02-12 12:34 ` Jeff Layton
2017-02-12 15:26 ` Artur Molchanov
0 siblings, 1 reply; 14+ messages in thread
From: Jeff Layton @ 2017-02-12 12:34 UTC (permalink / raw)
To: Artur Molchanov, Ilya Dryomov; +Cc: ceph-devel
On Sat, 2017-02-11 at 23:30 +0300, Artur Molchanov wrote:
> Hi Jeff,
>
> On Fri, 2017-02-10 at 14:31, Jeff Layton wrote:
> > On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
> > > From: Artur Molchanov <artur.molchanov@synesis.ru>
> > >
> > > Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
> > > If osd_request_timeout equals to 0 (default value) then do nothing with
> > > hung requests (keep default behavior).
> > >
> > > Create RBD map option osd_request_timeout to set timeout in seconds. Set
> > > osd_request_timeout to 0 by default.
> > >
> >
> > Also, what exactly are the requests blocked on when this occurs? Is the
> > ceph_osd_request_target ending up paused? I wonder if we might be better
> > off with something that returns a hard error under the circumstances
> > where you're hanging, rather than depending on timeouts.
>
> I wonder that it is better to complete requests only after timeout expired, just
> because a request can fail due to temporary network issues (e.g. router
> restarted) or restarting machine/services.
>
> > Having a job that has to wake up every second or so isn't ideal. Perhaps
> > you would be better off scheduling the delayed work in the request
> > submission codepath, and only rearm it when the tree isn't empty after
> > calling complete_osd_stuck_requests?
>
> Would you please tell me more about rearming work only if the tree is not empty
> after calling complete_osd_stuck_requests? From what code we should call
> complete_osd_stuck_requests?
>
Sure. I'm saying you would want to call schedule_delayed_work for the
timeout handler from the request submission path when you link a request
into the tree that has a timeout. Maybe in __submit_request?
Then, instead of unconditionally calling schedule_delayed_work at the
end of handle_request_timeout, you'd only call it if there were no
requests still sitting in the osdc trees.
> As I understood, there are two primary cases:
> 1 - Requests to OSD failed, but monitors do not return new osdmap (because all
> monitors are offline or monitors did not update osdmap yet).
> In this case requests are retried by cyclic calling ceph_con_workfn -> con_fault
> -> ceph_con_workfn. We can check request timestamp and does not call con_fault
> but complete it.
>
> 2 - Monitors return new osdmap which does not have any OSD for RBD.
> In this case all requests to the last ready OSD will be linked on "homeless" OSD
> and will not be retried until new osdmap with appropriate OSD received. I think
> that we need additional periodic checking timestamp such requests.
>
> Yes, there is already existing job handle_timeout. But the responsibility of
> this job is to sending keepalive requests to slow OSD. I'm not sure that it is a
> good idea to perform additional actions inside this job.
> I decided that creating specific job handle_osd_request_timeout is more applicable.
>
> This job will be run only once with a default value of osd_request_timeout (0).
Ahh, I missed that -- thanks.
> At the same time, I think that user will not use too small value for this
> parameter. I wonder that typical value will be about 1 minute or greater.
>
> > Also, I don't see where this job is ever cancelled when the osdc is torn
> > down. That needs to occur or you'll cause a use-after-free oops...
>
> It is my fault, thanks for the correction.
>
--
Jeff Layton <jlayton@redhat.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-12 12:34 ` Jeff Layton
@ 2017-02-12 15:26 ` Artur Molchanov
0 siblings, 0 replies; 14+ messages in thread
From: Artur Molchanov @ 2017-02-12 15:26 UTC (permalink / raw)
To: Jeff Layton, Ilya Dryomov; +Cc: ceph-devel
On Sun 2017-2-12 at 15:34, Jeff Layton wrote:
> On Sat, 2017-02-11 at 23:30 +0300, Artur Molchanov wrote:
>> Hi Jeff,
>>
>> On Fri, 2017-02-10 at 14:31, Jeff Layton wrote:
>>> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>>>> From: Artur Molchanov <artur.molchanov@synesis.ru>
>>>>
>>>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>>>> If osd_request_timeout equals to 0 (default value) then do nothing with
>>>> hung requests (keep default behavior).
>>>>
>>>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>>>> osd_request_timeout to 0 by default.
>>>>
>>>
>>> Also, what exactly are the requests blocked on when this occurs? Is the
>>> ceph_osd_request_target ending up paused? I wonder if we might be better
>>> off with something that returns a hard error under the circumstances
>>> where you're hanging, rather than depending on timeouts.
>>
>> I wonder that it is better to complete requests only after timeout expired, just
>> because a request can fail due to temporary network issues (e.g. router
>> restarted) or restarting machine/services.
>>
>>> Having a job that has to wake up every second or so isn't ideal. Perhaps
>>> you would be better off scheduling the delayed work in the request
>>> submission codepath, and only rearm it when the tree isn't empty after
>>> calling complete_osd_stuck_requests?
>>
>> Would you please tell me more about rearming work only if the tree is not empty
>> after calling complete_osd_stuck_requests? From what code we should call
>> complete_osd_stuck_requests?
>>
>
> Sure. I'm saying you would want to call schedule_delayed_work for the
> timeout handler from the request submission path when you link a request
> into the tree that has a timeout. Maybe in __submit_request?
>
> Then, instead of unconditionally calling schedule_delayed_work at the
> end of handle_request_timeout, you'd only call it if there were no
> requests still sitting in the osdc trees.
Thanks for the clarification.
But, unfortunately, requests which are linked to "homeless" OSD will not retry
until new osdmap with proper OSD received.
>> As I understood, there are two primary cases:
>> 1 - Requests to OSD failed, but monitors do not return new osdmap (because all
>> monitors are offline or monitors did not update osdmap yet).
>> In this case requests are retried by cyclic calling ceph_con_workfn -> con_fault
>> -> ceph_con_workfn. We can check request timestamp and does not call con_fault
>> but complete it.
>>
>> 2 - Monitors return new osdmap which does not have any OSD for RBD.
>> In this case all requests to the last ready OSD will be linked on "homeless" OSD
>> and will not be retried until new osdmap with appropriate OSD received. I think
>> that we need additional periodic checking timestamp such requests.
>>
>> Yes, there is already existing job handle_timeout. But the responsibility of
>> this job is to sending keepalive requests to slow OSD. I'm not sure that it is a
>> good idea to perform additional actions inside this job.
>> I decided that creating specific job handle_osd_request_timeout is more applicable.
>>
>> This job will be run only once with a default value of osd_request_timeout (0).
>
> Ahh, I missed that -- thanks.
>
>> At the same time, I think that user will not use too small value for this
>> parameter. I wonder that typical value will be about 1 minute or greater.
>>
>>> Also, I don't see where this job is ever cancelled when the osdc is torn
>>> down. That needs to occur or you'll cause a use-after-free oops...
>>
>> It is my fault, thanks for the correction.
>>
--
Artur
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-10 12:25 ` Ilya Dryomov
@ 2017-02-13 9:11 ` Ilya Dryomov
2017-02-13 9:54 ` Artur Molchanov
0 siblings, 1 reply; 14+ messages in thread
From: Ilya Dryomov @ 2017-02-13 9:11 UTC (permalink / raw)
To: Jeff Layton; +Cc: Artur Molchanov, Ceph Development
[-- Attachment #1: Type: text/plain, Size: 10072 bytes --]
On Fri, Feb 10, 2017 at 1:25 PM, Ilya Dryomov <idryomov@gmail.com> wrote:
> On Fri, Feb 10, 2017 at 12:31 PM, Jeff Layton <jlayton@redhat.com> wrote:
>> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>>> From: Artur Molchanov <artur.molchanov@synesis.ru>
>>>
>>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>>> If osd_request_timeout equals to 0 (default value) then do nothing with
>>> hung requests (keep default behavior).
>>>
>>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>>> osd_request_timeout to 0 by default.
>>>
>> Also, what exactly are the requests blocked on when this occurs? Is the
>> ceph_osd_request_target ending up paused? I wonder if we might be better
>> off with something that returns a hard error under the circumstances
>> where you're hanging, rather than depending on timeouts.
>>
>>
>>> Signed-off-by: Artur Molchanov <artur.molchanov@synesis.ru>
>>> ---
>>> include/linux/ceph/libceph.h | 8 +++--
>>> include/linux/ceph/osd_client.h | 1 +
>>> net/ceph/ceph_common.c | 12 +++++++
>>> net/ceph/osd_client.c | 72 +++++++++++++++++++++++++++++++++++++++++
>>> 4 files changed, 90 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
>>> index 1816c5e..10d8acb 100644
>>> --- a/include/linux/ceph/libceph.h
>>> +++ b/include/linux/ceph/libceph.h
>>> @@ -48,6 +48,7 @@ struct ceph_options {
>>> unsigned long mount_timeout; /* jiffies */
>>> unsigned long osd_idle_ttl; /* jiffies */
>>> unsigned long osd_keepalive_timeout; /* jiffies */
>>> + unsigned long osd_request_timeout; /* jiffies */
>>>
>>> /*
>>> * any type that can't be simply compared or doesn't need need
>>> @@ -65,9 +66,10 @@ struct ceph_options {
>>> /*
>>> * defaults
>>> */
>>> -#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
>>> -#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
>>> -#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
>>> +#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT msecs_to_jiffies(0 * 1000)
>>>
>>> #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
>>> #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
>>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>>> index 03a6653..e45cba0 100644
>>> --- a/include/linux/ceph/osd_client.h
>>> +++ b/include/linux/ceph/osd_client.h
>>> @@ -279,6 +279,7 @@ struct ceph_osd_client {
>>> atomic_t num_homeless;
>>> struct delayed_work timeout_work;
>>> struct delayed_work osds_timeout_work;
>>> + struct delayed_work osd_request_timeout_work;
>>> #ifdef CONFIG_DEBUG_FS
>>> struct dentry *debugfs_file;
>>> #endif
>>> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
>>> index 464e885..81876c8 100644
>>> --- a/net/ceph/ceph_common.c
>>> +++ b/net/ceph/ceph_common.c
>>> @@ -230,6 +230,7 @@ enum {
>>> Opt_osdkeepalivetimeout,
>>> Opt_mount_timeout,
>>> Opt_osd_idle_ttl,
>>> + Opt_osd_request_timeout,
>>> Opt_last_int,
>>> /* int args above */
>>> Opt_fsid,
>>> @@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
>>> {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
>>> {Opt_mount_timeout, "mount_timeout=%d"},
>>> {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
>>> + {Opt_osd_request_timeout, "osd_request_timeout=%d"},
>>> /* int args above */
>>> {Opt_fsid, "fsid=%s"},
>>> {Opt_name, "name=%s"},
>>> @@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
>>> opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
>>> opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
>>> opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
>>> + opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
>>>
>>> /* get mon ip(s) */
>>> /* ip1[:port1][,ip2[:port2]...] */
>>> @@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
>>> }
>>> opt->mount_timeout = msecs_to_jiffies(intval * 1000);
>>> break;
>>> + case Opt_osd_request_timeout:
>>> + /* 0 is "wait forever" (i.e. infinite timeout) */
>>> + if (intval < 0 || intval > INT_MAX / 1000) {
>>> + pr_err("osd_request_timeout out of range\n");
>>> + err = -EINVAL;
>>> + goto out;
>>> + }
>>> + opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
>>> + break;
>>>
>>> case Opt_share:
>>> opt->flags &= ~CEPH_OPT_NOSHARE;
>>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>>> index 842f049..2f5a024 100644
>>> --- a/net/ceph/osd_client.c
>>> +++ b/net/ceph/osd_client.c
>>> @@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work)
>>> osdc->client->options->osd_keepalive_timeout);
>>> }
>>>
>>> +/*
>>> + * Complete request to OSD with error err if it started before cutoff.
>>> + */
>>> +static void complete_osd_stuck_request(struct ceph_osd_request *req,
>>> + unsigned long cutoff,
>>> + int err)
>>> +{
>>> + if (!time_before(req->r_stamp, cutoff))
>>> + return;
>>> +
>>> + pr_warn_ratelimited("osd req on osd%d timeout expired\n",
>>> + req->r_osd->o_osd);
>>> +
>>> + complete_request(req, err);
>>> +}
>>> +
>>> +/*
>>> + * Complete all requests to OSD that has been active for more than timeout.
>>> + */
>>> +static void complete_osd_stuck_requests(struct ceph_osd *osd,
>>> + unsigned long timeout,
>>> + int err)
>>> +{
>>> + struct ceph_osd_request *req, *n;
>>> + const unsigned long cutoff = jiffies - timeout;
>>> +
>>> + rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
>>> + complete_osd_stuck_request(req, cutoff, -EIO);
>>> + }
>>> +}
>>> +
>>> +/*
>>> + * Timeout callback, called every N seconds. When 1 or more OSD
>>> + * requests that has been active for more than N seconds,
>>> + * we complete it with error EIO.
>>> + */
>>> +static void handle_osd_request_timeout(struct work_struct *work)
>>> +{
>>> + struct ceph_osd_client *osdc =
>>> + container_of(work, struct ceph_osd_client,
>>> + osd_request_timeout_work.work);
>>> + struct ceph_osd *osd, *n;
>>> + struct ceph_options *opts = osdc->client->options;
>>> + const unsigned long timeout = opts->osd_request_timeout;
>>> +
>>> + dout("%s osdc %p\n", __func__, osdc);
>>> +
>>> + if (!timeout)
>>> + return;
>>> +
>>> + down_write(&osdc->lock);
>>> +
>>> + rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
>>> + complete_osd_stuck_requests(osd, timeout, -EIO);
>>> + }
>>> +
>>> + up_write(&osdc->lock);
>>> +
>>> + if (!atomic_read(&osdc->num_homeless))
>>> + goto out;
>>> +
>>> + down_write(&osdc->lock);
>>> + complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
>>> + up_write(&osdc->lock);
>>> +
>>> +out:
>>> + schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
>>> +}
>>> +
>>> static void handle_osds_timeout(struct work_struct *work)
>>> {
>>> struct ceph_osd_client *osdc =
>>> @@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
>>> ceph_client *client)
>>> osdc->linger_map_checks = RB_ROOT;
>>> INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
>>> INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
>>> + INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);
>>>
>>> err = -ENOMEM;
>>> osdc->osdmap = ceph_osdmap_alloc();
>>> @@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
>>> ceph_client *client)
>>> osdc->client->options->osd_keepalive_timeout);
>>> schedule_delayed_work(&osdc->osds_timeout_work,
>>> round_jiffies_relative(osdc->client->options->osd_idle_ttl));
>>> + schedule_delayed_work(&osdc->osd_request_timeout_work,
>>> + round_jiffies_relative(osdc->client->options->osd_request_timeout));
>>>
>>> return 0;
>>>
>>
>> Having a job that has to wake up every second or so isn't ideal. Perhaps
>> you would be better off scheduling the delayed work in the request
>> submission codepath, and only rearm it when the tree isn't empty after
>> calling complete_osd_stuck_requests?
>>
>> Also, I don't see where this job is ever cancelled when the osdc is torn
>> down. That needs to occur or you'll cause a use-after-free oops...
>
> Yeah, there are other bugs here: rbtree_postorder_for_each_entry_safe()
> can't be used because complete_request() calls erase(), etc. The patch
> I had just piggy-backed on handle_timeout(), which is called every five
> seconds anyway.
>
> Given that there is interest, I think it's worth adding.
Hi Artur,
How about the attached patch? handle_timeout() is going to iterate
over all but the homeless OSD anyway; all it costs us is a couple of
tests, so I don't think a separate work is needed.
abort_request() is a simple wrapper around complete_request(), making
it safe to call at any time -- replace it with complete_request() for
now if you want to try this out.
Thanks,
Ilya
[-- Attachment #2: osd-request-timeout.diff --]
[-- Type: text/plain, Size: 5856 bytes --]
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 1816c5e26581..88cd5dc8e238 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -48,6 +48,7 @@ struct ceph_options {
unsigned long mount_timeout; /* jiffies */
unsigned long osd_idle_ttl; /* jiffies */
unsigned long osd_keepalive_timeout; /* jiffies */
+ unsigned long osd_request_timeout; /* jiffies */
/*
* any type that can't be simply compared or doesn't need need
@@ -68,6 +69,7 @@ struct ceph_options {
#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */
#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03a6653d329a..4db031dd748d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -193,6 +193,7 @@ struct ceph_osd_request {
/* internal */
unsigned long r_stamp; /* jiffies, send or check time */
+ unsigned long r_start_stamp; /* jiffies */
int r_attempts;
struct ceph_eversion r_replay_version; /* aka reassert_version */
u32 r_last_force_resend;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e88599b9d..108533859a53 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@ enum {
Opt_osdkeepalivetimeout,
Opt_mount_timeout,
Opt_osd_idle_ttl,
+ Opt_osd_request_timeout,
Opt_last_int,
/* int args above */
Opt_fsid,
@@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
{Opt_mount_timeout, "mount_timeout=%d"},
{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+ {Opt_osd_request_timeout, "osd_request_timeout=%d"},
/* int args above */
{Opt_fsid, "fsid=%s"},
{Opt_name, "name=%s"},
@@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+ opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
/* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
}
opt->mount_timeout = msecs_to_jiffies(intval * 1000);
break;
+ case Opt_osd_request_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (intval < 0 || intval > INT_MAX / 1000) {
+ pr_err("osd_request_timeout out of range\n");
+ err = -EINVAL;
+ goto out;
+ }
+ opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
+ break;
case Opt_share:
opt->flags &= ~CEPH_OPT_NOSHARE;
@@ -557,6 +569,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, "osdkeepalivetimeout=%d,",
jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
+ if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
+ seq_printf(m, "osd_request_timeout=%d,",
+ jiffies_to_msecs(opt->osd_request_timeout) / 1000);
/* drop redundant comma */
if (m->count != pos)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b8da4279e745..1d580530c700 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1717,6 +1717,8 @@ static void account_request(struct ceph_osd_request *req)
WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
atomic_inc(&req->r_osdc->num_requests);
+
+ req->r_start_stamp = jiffies;
}
static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -2504,6 +2506,7 @@ static void handle_timeout(struct work_struct *work)
container_of(work, struct ceph_osd_client, timeout_work.work);
struct ceph_options *opts = osdc->client->options;
unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+ unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
LIST_HEAD(slow_osds);
struct rb_node *n, *p;
@@ -2519,15 +2522,23 @@ static void handle_timeout(struct work_struct *work)
struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
bool found = false;
- for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ for (p = rb_first(&osd->o_requests); p; ) {
struct ceph_osd_request *req =
rb_entry(p, struct ceph_osd_request, r_node);
+ p = rb_next(p); /* abort_request() */
+
if (time_before(req->r_stamp, cutoff)) {
dout(" req %p tid %llu on osd%d is laggy\n",
req, req->r_tid, osd->o_osd);
found = true;
}
+ if (opts->osd_request_timeout &&
+ time_before(req->r_start_stamp, expiry_cutoff)) {
+ pr_err_ratelimited("tid %llu on osd%d timeout\n",
+ req->r_tid, osd->o_osd);
+ abort_request(req, -ETIMEDOUT);
+ }
}
for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
struct ceph_osd_linger_request *lreq =
@@ -2547,6 +2558,21 @@ static void handle_timeout(struct work_struct *work)
list_move_tail(&osd->o_keepalive_item, &slow_osds);
}
+ if (opts->osd_request_timeout) {
+ for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ p = rb_next(p); /* abort_request() */
+
+ if (time_before(req->r_start_stamp, expiry_cutoff)) {
+ pr_err_ratelimited("tid %llu on osd%d timeout\n",
+ req->r_tid, osdc->homeless_osd.o_osd);
+ abort_request(req, -ETIMEDOUT);
+ }
+ }
+ }
+
if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
maybe_request_map(osdc);
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-13 9:11 ` Ilya Dryomov
@ 2017-02-13 9:54 ` Artur Molchanov
2017-02-13 14:15 ` Ilya Dryomov
0 siblings, 1 reply; 14+ messages in thread
From: Artur Molchanov @ 2017-02-13 9:54 UTC (permalink / raw)
To: Ilya Dryomov, Jeff Layton; +Cc: Ceph Development
Hi Ilya,
On 02/13/2017 12:11 PM, Ilya Dryomov wrote:
> Hi Artur,
>
> How about the attached patch? handle_timeout() is going to iterate
> over all but the homeless OSD anyway; all it costs us is a couple of
> tests, so I don't think a separate work is needed.
>
> abort_request() is a simple wrapper around complete_request(), making
> it safe to call at any time -- replace it with complete_request() for
> now if you want to try this out.
Using one job for sending keepalive requests and completing stuck requests
brings us to the need to check that osd_keepalive_timeout is not larger than
osd_request_timeout. So we should not forget to say about it in the documentation.
Is it worth creating a correlation between osd_keepalive_timeout and
osd_request_timeout?
--
Artur
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-13 9:54 ` Artur Molchanov
@ 2017-02-13 14:15 ` Ilya Dryomov
2017-02-13 15:23 ` Artur Molchanov
0 siblings, 1 reply; 14+ messages in thread
From: Ilya Dryomov @ 2017-02-13 14:15 UTC (permalink / raw)
To: Artur Molchanov; +Cc: Jeff Layton, Ceph Development
On Mon, Feb 13, 2017 at 10:54 AM, Artur Molchanov
<artur.molchanov@synesis.ru> wrote:
> Hi Ilya,
>
> On 02/13/2017 12:11 PM, Ilya Dryomov wrote:
>>
>> Hi Artur,
>>
>> How about the attached patch? handle_timeout() is going to iterate
>> over all but the homeless OSD anyway; all it costs us is a couple of
>> tests, so I don't think a separate work is needed.
>>
>> abort_request() is a simple wrapper around complete_request(), making
>> it safe to call at any time -- replace it with complete_request() for
>> now if you want to try this out.
>
>
> Using one job for sending keepalive requests and completing stuck requests
> brings us to the need to check that osd_keepalive_timeout is not larger than
> osd_request_timeout. So we should not forget to say about it in the
> documentation.
I'll make a note to mention that it is osd_keepalive_timeout-precise.
> Is it worth creating a correlation between osd_keepalive_timeout and
> osd_request_timeout?
No, probably not. osd_keepalive_timeout and osd_idle_ttl are similarly
related and we don't have any.
Thanks,
Ilya
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-13 14:15 ` Ilya Dryomov
@ 2017-02-13 15:23 ` Artur Molchanov
2017-02-13 15:32 ` Ilya Dryomov
0 siblings, 1 reply; 14+ messages in thread
From: Artur Molchanov @ 2017-02-13 15:23 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Jeff Layton, Ceph Development
On 02/13/2017 05:15 PM, Ilya Dryomov wrote:
> On Mon, Feb 13, 2017 at 10:54 AM, Artur Molchanov
> <artur.molchanov@synesis.ru> wrote:
>> Hi Ilya,
>>
>> On 02/13/2017 12:11 PM, Ilya Dryomov wrote:
>>>
>>> Hi Artur,
>>>
>>> How about the attached patch? handle_timeout() is going to iterate
>>> over all but the homeless OSD anyway; all it costs us is a couple of
>>> tests, so I don't think a separate work is needed.
>>>
>>> abort_request() is a simple wrapper around complete_request(), making
>>> it safe to call at any time -- replace it with complete_request() for
>>> now if you want to try this out.
>>
>>
>> Using one job for sending keepalive requests and completing stuck requests
>> brings us to the need to check that osd_keepalive_timeout is not larger then
>> osd_request_timeout. So we should not forget to say about it in the
>> documentation.
>
> I'll make a note to mention that it is osd_keepalive_timeout-precise.
>
>> Is it worth creating a correlation between osd_keepalive_timeout and
>> osd_request_timeout?
>
> No, probably not. osd_keepalive_timeout and osd_idle_ttl are similarly
> related and we don't have any.
Your variant of patch works.
As I said, using job handle_timeout to send keepalive requests and aborting
stuck requests is not the perfect choice, but OK, it works.
What should I do to make this patch merged to upstream?
--
Artur
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
2017-02-13 15:23 ` Artur Molchanov
@ 2017-02-13 15:32 ` Ilya Dryomov
0 siblings, 0 replies; 14+ messages in thread
From: Ilya Dryomov @ 2017-02-13 15:32 UTC (permalink / raw)
To: Artur Molchanov; +Cc: Jeff Layton, Ceph Development
On Mon, Feb 13, 2017 at 4:23 PM, Artur Molchanov
<artur.molchanov@synesis.ru> wrote:
> On 02/13/2017 05:15 PM, Ilya Dryomov wrote:
>>
>> On Mon, Feb 13, 2017 at 10:54 AM, Artur Molchanov
>> <artur.molchanov@synesis.ru> wrote:
>>>
>>> Hi Ilya,
>>>
>>> On 02/13/2017 12:11 PM, Ilya Dryomov wrote:
>>>>
>>>>
>>>> Hi Artur,
>>>>
>>>> How about the attached patch? handle_timeout() is going to iterate
>>>> over all but the homeless OSD anyway; all it costs us is a couple of
>>>> tests, so I don't think a separate work is needed.
>>>>
>>>> abort_request() is a simple wrapper around complete_request(), making
>>>> it safe to call at any time -- replace it with complete_request() for
>>>> now if you want to try this out.
>>>
>>>
>>>
>>> Using one job for sending keepalive requests and completing stuck
>>> requests
>>> brings us to the need to check that osd_keepalive_timeout is not larger
>>> than
>>> osd_request_timeout. So we should not forget to say about it in the
>>> documentation.
>>
>>
>> I'll make a note to mention that it is osd_keepalive_timeout-precise.
>>
>>> Is it worth creating a correlation between osd_keepalive_timeout and
>>> osd_request_timeout?
>>
>>
>> No, probably not. osd_keepalive_timeout and osd_idle_ttl are similarly
>> related and we don't have any.
>
>
> Your variant of patch works.
> As I said, using job handle_timeout to send keepalive requests and aborting
> stuck requests is not the perfect choice, but OK, it works.
>
> What should I do to make this patch merged to upstream?
Nothing -- I'll apply it for 4.11 once we have abort_request().
Thanks,
Ilya
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2017-02-13 15:32 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-09 13:04 [PATCH] libceph: Complete stuck requests to OSD with EIO Artur Molchanov
2017-02-09 14:27 ` Ilya Dryomov
2017-02-09 15:51 ` Artur Molchanov
2017-02-09 16:11 ` Wido den Hollander
2017-02-10 11:31 ` Jeff Layton
2017-02-10 12:25 ` Ilya Dryomov
2017-02-13 9:11 ` Ilya Dryomov
2017-02-13 9:54 ` Artur Molchanov
2017-02-13 14:15 ` Ilya Dryomov
2017-02-13 15:23 ` Artur Molchanov
2017-02-13 15:32 ` Ilya Dryomov
2017-02-11 20:30 ` Artur Molchanov
2017-02-12 12:34 ` Jeff Layton
2017-02-12 15:26 ` Artur Molchanov
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.