All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ilya Dryomov <idryomov@gmail.com>
To: Jeff Layton <jlayton@redhat.com>
Cc: Artur Molchanov <artur.molchanov@synesis.ru>,
	Ceph Development <ceph-devel@vger.kernel.org>
Subject: Re: [PATCH] libceph: Complete stuck requests to OSD with EIO
Date: Mon, 13 Feb 2017 10:11:38 +0100	[thread overview]
Message-ID: <CAOi1vP-1EPasYZUPoqwh1Ykgq59xEAf1oeGZH5Y8Kpo9ooQMow@mail.gmail.com> (raw)
In-Reply-To: <CAOi1vP_SJD7Vh97WBU2=EsYfWruXqQe06BND+uaq10=V7ZGH1g@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 10072 bytes --]

On Fri, Feb 10, 2017 at 1:25 PM, Ilya Dryomov <idryomov@gmail.com> wrote:
> On Fri, Feb 10, 2017 at 12:31 PM, Jeff Layton <jlayton@redhat.com> wrote:
>> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>>> From: Artur Molchanov <artur.molchanov@synesis.ru>
>>>
>>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>>> If osd_request_timeout equals to 0 (default value) then do nothing with
>>> hung requests (keep default behavior).
>>>
>>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>>> osd_request_timeout to 0 by default.
>>>
>> Also, what exactly are the requests blocked on when this occurs? Is the
>> ceph_osd_request_target ending up paused? I wonder if we might be better
>> off with something that returns a hard error under the circumstances
>> where you're hanging, rather than depending on timeouts.
>>
>>
>>> Signed-off-by: Artur Molchanov <artur.molchanov@synesis.ru>
>>> ---
>>>   include/linux/ceph/libceph.h    |  8 +++--
>>>   include/linux/ceph/osd_client.h |  1 +
>>>   net/ceph/ceph_common.c          | 12 +++++++
>>>   net/ceph/osd_client.c           | 72 +++++++++++++++++++++++++++++++++++++++++
>>>   4 files changed, 90 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
>>> index 1816c5e..10d8acb 100644
>>> --- a/include/linux/ceph/libceph.h
>>> +++ b/include/linux/ceph/libceph.h
>>> @@ -48,6 +48,7 @@ struct ceph_options {
>>>       unsigned long mount_timeout;            /* jiffies */
>>>       unsigned long osd_idle_ttl;             /* jiffies */
>>>       unsigned long osd_keepalive_timeout;    /* jiffies */
>>> +     unsigned long osd_request_timeout;      /* jiffies */
>>>
>>>       /*
>>>        * any type that can't be simply compared or doesn't need need
>>> @@ -65,9 +66,10 @@ struct ceph_options {
>>>   /*
>>>    * defaults
>>>    */
>>> -#define CEPH_MOUNT_TIMEOUT_DEFAULT   msecs_to_jiffies(60 * 1000)
>>> -#define CEPH_OSD_KEEPALIVE_DEFAULT   msecs_to_jiffies(5 * 1000)
>>> -#define CEPH_OSD_IDLE_TTL_DEFAULT    msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_MOUNT_TIMEOUT_DEFAULT           msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_OSD_KEEPALIVE_DEFAULT           msecs_to_jiffies(5 * 1000)
>>> +#define CEPH_OSD_IDLE_TTL_DEFAULT            msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT     msecs_to_jiffies(0 * 1000)
>>>
>>>   #define CEPH_MONC_HUNT_INTERVAL             msecs_to_jiffies(3 * 1000)
>>>   #define CEPH_MONC_PING_INTERVAL             msecs_to_jiffies(10 * 1000)
>>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>>> index 03a6653..e45cba0 100644
>>> --- a/include/linux/ceph/osd_client.h
>>> +++ b/include/linux/ceph/osd_client.h
>>> @@ -279,6 +279,7 @@ struct ceph_osd_client {
>>>       atomic_t               num_homeless;
>>>       struct delayed_work    timeout_work;
>>>       struct delayed_work    osds_timeout_work;
>>> +     struct delayed_work    osd_request_timeout_work;
>>>   #ifdef CONFIG_DEBUG_FS
>>>       struct dentry          *debugfs_file;
>>>   #endif
>>> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
>>> index 464e885..81876c8 100644
>>> --- a/net/ceph/ceph_common.c
>>> +++ b/net/ceph/ceph_common.c
>>> @@ -230,6 +230,7 @@ enum {
>>>       Opt_osdkeepalivetimeout,
>>>       Opt_mount_timeout,
>>>       Opt_osd_idle_ttl,
>>> +     Opt_osd_request_timeout,
>>>       Opt_last_int,
>>>       /* int args above */
>>>       Opt_fsid,
>>> @@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
>>>       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
>>>       {Opt_mount_timeout, "mount_timeout=%d"},
>>>       {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
>>> +     {Opt_osd_request_timeout, "osd_request_timeout=%d"},
>>>       /* int args above */
>>>       {Opt_fsid, "fsid=%s"},
>>>       {Opt_name, "name=%s"},
>>> @@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
>>>       opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
>>>       opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
>>>       opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
>>> +     opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
>>>
>>>       /* get mon ip(s) */
>>>       /* ip1[:port1][,ip2[:port2]...] */
>>> @@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
>>>                       }
>>>                       opt->mount_timeout = msecs_to_jiffies(intval * 1000);
>>>                       break;
>>> +             case Opt_osd_request_timeout:
>>> +                     /* 0 is "wait forever" (i.e. infinite timeout) */
>>> +                     if (intval < 0 || intval > INT_MAX / 1000) {
>>> +                             pr_err("osd_request_timeout out of range\n");
>>> +                             err = -EINVAL;
>>> +                             goto out;
>>> +                     }
>>> +                     opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
>>> +                     break;
>>>
>>>               case Opt_share:
>>>                       opt->flags &= ~CEPH_OPT_NOSHARE;
>>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>>> index 842f049..2f5a024 100644
>>> --- a/net/ceph/osd_client.c
>>> +++ b/net/ceph/osd_client.c
>>> @@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work)
>>>                             osdc->client->options->osd_keepalive_timeout);
>>>   }
>>>
>>> +/*
>>> + * Complete request to OSD with error err if it started before cutoff.
>>> + */
>>> +static void complete_osd_stuck_request(struct ceph_osd_request *req,
>>> +                                    unsigned long cutoff,
>>> +                                    int err)
>>> +{
>>> +     if (!time_before(req->r_stamp, cutoff))
>>> +             return;
>>> +
>>> +     pr_warn_ratelimited("osd req on osd%d timeout expired\n",
>>> +                         req->r_osd->o_osd);
>>> +
>>> +     complete_request(req, err);
>>> +}
>>> +
>>> +/*
>>> + * Complete all requests to OSD that has been active for more than timeout.
>>> + */
>>> +static void complete_osd_stuck_requests(struct ceph_osd *osd,
>>> +                                     unsigned long timeout,
>>> +                                     int err)
>>> +{
>>> +     struct ceph_osd_request *req, *n;
>>> +     const unsigned long cutoff = jiffies - timeout;
>>> +
>>> +     rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
>>> +             complete_osd_stuck_request(req, cutoff, -EIO);
>>> +     }
>>> +}
>>> +
>>> +/*
>>> + * Timeout callback, called every N seconds. When 1 or more OSD
>>> + * requests that has been active for more than N seconds,
>>> + * we complete it with error EIO.
>>> + */
>>> +static void handle_osd_request_timeout(struct work_struct *work)
>>> +{
>>> +     struct ceph_osd_client *osdc =
>>> +             container_of(work, struct ceph_osd_client,
>>> +                          osd_request_timeout_work.work);
>>> +     struct ceph_osd *osd, *n;
>>> +     struct ceph_options *opts = osdc->client->options;
>>> +     const unsigned long timeout = opts->osd_request_timeout;
>>> +
>>> +     dout("%s osdc %p\n", __func__, osdc);
>>> +
>>> +     if (!timeout)
>>> +             return;
>>> +
>>> +     down_write(&osdc->lock);
>>> +
>>> +     rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
>>> +             complete_osd_stuck_requests(osd, timeout, -EIO);
>>> +     }
>>> +
>>> +     up_write(&osdc->lock);
>>> +
>>> +     if (!atomic_read(&osdc->num_homeless))
>>> +             goto out;
>>> +
>>> +     down_write(&osdc->lock);
>>> +     complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
>>> +     up_write(&osdc->lock);
>>> +
>>> +out:
>>> +     schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
>>> +}
>>> +
>>>   static void handle_osds_timeout(struct work_struct *work)
>>>   {
>>>       struct ceph_osd_client *osdc =
>>> @@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
>>> ceph_client *client)
>>>       osdc->linger_map_checks = RB_ROOT;
>>>       INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
>>>       INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
>>> +     INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);
>>>
>>>       err = -ENOMEM;
>>>       osdc->osdmap = ceph_osdmap_alloc();
>>> @@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
>>> ceph_client *client)
>>>                             osdc->client->options->osd_keepalive_timeout);
>>>       schedule_delayed_work(&osdc->osds_timeout_work,
>>>           round_jiffies_relative(osdc->client->options->osd_idle_ttl));
>>> +     schedule_delayed_work(&osdc->osd_request_timeout_work,
>>> +         round_jiffies_relative(osdc->client->options->osd_request_timeout));
>>>
>>>       return 0;
>>>
>>
>> Having a job that has to wake up every second or so isn't ideal. Perhaps
>> you would be better off scheduling the delayed work in the request
>> submission codepath, and only rearm it when the tree isn't empty after
>> calling complete_osd_stuck_requests?
>>
>> Also, I don't see where this job is ever cancelled when the osdc is torn
>> down. That needs to occur or you'll cause a use-after-free oops...
>
> Yeah, there are other bugs here: rbtree_postorder_for_each_entry_safe()
> can't be used because complete_request() calls erase(), etc.  The patch
> I had just piggy-backed on handle_timeout(), which is called every five
> seconds anyway.
>
> Given that there is interest, I think it's worth adding.

Hi Artur,

How about the attached patch?  handle_timeout() is going to iterate
over all but the homeless OSD anyway; all it costs us is a couple of
tests, so I don't think a separate work is needed.

abort_request() is a simple wrapper around complete_request(), making
it safe to call at any time -- replace it with complete_request() for
now if you want to try this out.

Thanks,

                Ilya

[-- Attachment #2: osd-request-timeout.diff --]
[-- Type: text/plain, Size: 5856 bytes --]

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 1816c5e26581..88cd5dc8e238 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -48,6 +48,7 @@ struct ceph_options {
 	unsigned long mount_timeout;		/* jiffies */
 	unsigned long osd_idle_ttl;		/* jiffies */
 	unsigned long osd_keepalive_timeout;	/* jiffies */
+	unsigned long osd_request_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -68,6 +69,7 @@ struct ceph_options {
 #define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
 #define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
 #define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0  /* no timeout */
 
 #define CEPH_MONC_HUNT_INTERVAL		msecs_to_jiffies(3 * 1000)
 #define CEPH_MONC_PING_INTERVAL		msecs_to_jiffies(10 * 1000)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03a6653d329a..4db031dd748d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -193,6 +193,7 @@ struct ceph_osd_request {
 
 	/* internal */
 	unsigned long r_stamp;                /* jiffies, send or check time */
+	unsigned long r_start_stamp;          /* jiffies */
 	int r_attempts;
 	struct ceph_eversion r_replay_version; /* aka reassert_version */
 	u32 r_last_force_resend;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e88599b9d..108533859a53 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@ enum {
 	Opt_osdkeepalivetimeout,
 	Opt_mount_timeout,
 	Opt_osd_idle_ttl,
+	Opt_osd_request_timeout,
 	Opt_last_int,
 	/* int args above */
 	Opt_fsid,
@@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
 	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
 	{Opt_mount_timeout, "mount_timeout=%d"},
 	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	{Opt_osd_request_timeout, "osd_request_timeout=%d"},
 	/* int args above */
 	{Opt_fsid, "fsid=%s"},
 	{Opt_name, "name=%s"},
@@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+	opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
 			}
 			opt->mount_timeout = msecs_to_jiffies(intval * 1000);
 			break;
+		case Opt_osd_request_timeout:
+			/* 0 is "wait forever" (i.e. infinite timeout) */
+			if (intval < 0 || intval > INT_MAX / 1000) {
+				pr_err("osd_request_timeout out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
+			break;
 
 		case Opt_share:
 			opt->flags &= ~CEPH_OPT_NOSHARE;
@@ -557,6 +569,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, "osdkeepalivetimeout=%d,",
 		    jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
+	if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
+		seq_printf(m, "osd_request_timeout=%d,",
+			   jiffies_to_msecs(opt->osd_request_timeout) / 1000);
 
 	/* drop redundant comma */
 	if (m->count != pos)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b8da4279e745..1d580530c700 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1717,6 +1717,8 @@ static void account_request(struct ceph_osd_request *req)
 
 	WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
 	atomic_inc(&req->r_osdc->num_requests);
+
+	req->r_start_stamp = jiffies;
 }
 
 static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -2504,6 +2506,7 @@ static void handle_timeout(struct work_struct *work)
 		container_of(work, struct ceph_osd_client, timeout_work.work);
 	struct ceph_options *opts = osdc->client->options;
 	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+	unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
 	LIST_HEAD(slow_osds);
 	struct rb_node *n, *p;
 
@@ -2519,15 +2522,23 @@ static void handle_timeout(struct work_struct *work)
 		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
 		bool found = false;
 
-		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+		for (p = rb_first(&osd->o_requests); p; ) {
 			struct ceph_osd_request *req =
 			    rb_entry(p, struct ceph_osd_request, r_node);
 
+			p = rb_next(p); /* abort_request() */
+
 			if (time_before(req->r_stamp, cutoff)) {
 				dout(" req %p tid %llu on osd%d is laggy\n",
 				     req, req->r_tid, osd->o_osd);
 				found = true;
 			}
+			if (opts->osd_request_timeout &&
+			    time_before(req->r_start_stamp, expiry_cutoff)) {
+				pr_err_ratelimited("tid %llu on osd%d timeout\n",
+				       req->r_tid, osd->o_osd);
+				abort_request(req, -ETIMEDOUT);
+			}
 		}
 		for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
 			struct ceph_osd_linger_request *lreq =
@@ -2547,6 +2558,21 @@ static void handle_timeout(struct work_struct *work)
 			list_move_tail(&osd->o_keepalive_item, &slow_osds);
 	}
 
+	if (opts->osd_request_timeout) {
+		for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
+			struct ceph_osd_request *req =
+			    rb_entry(p, struct ceph_osd_request, r_node);
+
+			p = rb_next(p); /* abort_request() */
+
+			if (time_before(req->r_start_stamp, expiry_cutoff)) {
+				pr_err_ratelimited("tid %llu on osd%d timeout\n",
+				       req->r_tid, osdc->homeless_osd.o_osd);
+				abort_request(req, -ETIMEDOUT);
+			}
+		}
+	}
+
 	if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
 		maybe_request_map(osdc);
 

  reply	other threads:[~2017-02-13  9:11 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-02-09 13:04 [PATCH] libceph: Complete stuck requests to OSD with EIO Artur Molchanov
2017-02-09 14:27 ` Ilya Dryomov
2017-02-09 15:51   ` Artur Molchanov
2017-02-09 16:11     ` Wido den Hollander
2017-02-10 11:31 ` Jeff Layton
2017-02-10 12:25   ` Ilya Dryomov
2017-02-13  9:11     ` Ilya Dryomov [this message]
2017-02-13  9:54       ` Artur Molchanov
2017-02-13 14:15         ` Ilya Dryomov
2017-02-13 15:23           ` Artur Molchanov
2017-02-13 15:32             ` Ilya Dryomov
2017-02-11 20:30   ` Artur Molchanov
2017-02-12 12:34     ` Jeff Layton
2017-02-12 15:26       ` Artur Molchanov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAOi1vP-1EPasYZUPoqwh1Ykgq59xEAf1oeGZH5Y8Kpo9ooQMow@mail.gmail.com \
    --to=idryomov@gmail.com \
    --cc=artur.molchanov@synesis.ru \
    --cc=ceph-devel@vger.kernel.org \
    --cc=jlayton@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.