* [PATCH] libceph: apply new_state before new_up_client on incrementals
@ 2016-07-19 21:26 Ilya Dryomov
2016-07-21 0:04 ` Josh Durgin
0 siblings, 1 reply; 4+ messages in thread
From: Ilya Dryomov @ 2016-07-19 21:26 UTC (permalink / raw)
To: ceph-devel; +Cc: Josh Durgin
Currently, osd_weight and osd_state fields are updated in the encoding
order. This is wrong, because an incremental map may look like e.g.
new_up_client: { osd=6, addr=... } # set osd_state and addr
new_state: { osd=6, xorstate=EXISTS } # clear osd_state
Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After
applying new_up_client, osd_state is changed to EXISTS | UP. Carrying
on with the new_state update, we flip EXISTS and leave osd6 in a weird
"!EXISTS but UP" state. A non-existent OSD is considered down by the
mapping code
2087 for (i = 0; i < pg->pg_temp.len; i++) {
2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
2089 if (ceph_can_shift_osds(pi))
2090 continue;
2091
2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
and so requests get directed to the second OSD in the set instead of
the first, resulting in OSD-side errors like:
[WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680
and hung rbds on the client:
[ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
[ 493.566805] rbd: rbd0: result -6 xferred 400000
[ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688
The fix is to decouple application from the decoding and:
- apply new_weight first
- apply new_state before new_up_client
- twiddle osd_state flags if marking in
- clear out some of the state if osd is destroyed
Fixes: http://tracker.ceph.com/issues/14901
Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd
Cc: stable@vger.kernel.org # 3.15+
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
net/ceph/osdmap.c | 156 +++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 113 insertions(+), 43 deletions(-)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 03062bb763b3..7e480bf75bcf 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1261,6 +1261,115 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
}
/*
+ * Encoding order is (new_up_client, new_state, new_weight). Need to
+ * apply in the (new_weight, new_state, new_up_client) order, because
+ * an incremental map may look like e.g.
+ *
+ * new_up_client: { osd=6, addr=... } # set osd_state and addr
+ * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
+ */
+static int decode_new_up_state_weight(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ void *new_up_client;
+ void *new_state;
+ void *new_weight_end;
+ u32 len;
+
+ new_up_client = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ new_state = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + sizeof(u8);
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ s32 osd;
+ u32 w;
+
+ ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
+ osd = ceph_decode_32(p);
+ w = ceph_decode_32(p);
+ BUG_ON(osd >= map->max_osd);
+ pr_info("osd%d weight 0x%x %s\n", osd, w,
+ w == CEPH_OSD_IN ? "(in)" :
+ (w == CEPH_OSD_OUT ? "(out)" : ""));
+ map->osd_weight[osd] = w;
+
+ /*
+ * If we are marking in, set the EXISTS, and clear the
+ * AUTOOUT and NEW bits.
+ */
+ if (w) {
+ map->osd_state[osd] |= CEPH_OSD_EXISTS;
+ map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
+ CEPH_OSD_NEW);
+ }
+ }
+ new_weight_end = *p;
+
+ /* new_state (up/down) */
+ *p = new_state;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ u8 xorstate;
+ int ret;
+
+ osd = ceph_decode_32(p);
+ xorstate = ceph_decode_8(p);
+ if (xorstate == 0)
+ xorstate = CEPH_OSD_UP;
+ BUG_ON(osd >= map->max_osd);
+ if ((map->osd_state[osd] & CEPH_OSD_UP) &&
+ (xorstate & CEPH_OSD_UP))
+ pr_info("osd%d down\n", osd);
+ if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
+ (xorstate & CEPH_OSD_EXISTS)) {
+ pr_info("osd%d does not exist\n", osd);
+ map->osd_weight[osd] = CEPH_OSD_IN;
+ ret = set_primary_affinity(map, osd,
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+ if (ret)
+ return ret;
+ memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
+ map->osd_state[osd] = 0;
+ } else {
+ map->osd_state[osd] ^= xorstate;
+ }
+ }
+
+ /* new_up_client */
+ *p = new_up_client;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ struct ceph_entity_addr addr;
+
+ osd = ceph_decode_32(p);
+ ceph_decode_copy(p, &addr, sizeof(addr));
+ ceph_decode_addr(&addr);
+ BUG_ON(osd >= map->max_osd);
+ pr_info("osd%d up\n", osd);
+ map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ *p = new_weight_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
* decode and apply an incremental map update.
*/
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
@@ -1358,49 +1467,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
__remove_pg_pool(&map->pg_pools, pi);
}
- /* new_up */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd;
- struct ceph_entity_addr addr;
- ceph_decode_32_safe(p, end, osd, e_inval);
- ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
- ceph_decode_addr(&addr);
- pr_info("osd%d up\n", osd);
- BUG_ON(osd >= map->max_osd);
- map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
- map->osd_addr[osd] = addr;
- }
-
- /* new_state */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd;
- u8 xorstate;
- ceph_decode_32_safe(p, end, osd, e_inval);
- xorstate = **(u8 **)p;
- (*p)++; /* clean flag */
- if (xorstate == 0)
- xorstate = CEPH_OSD_UP;
- if (xorstate & CEPH_OSD_UP)
- pr_info("osd%d down\n", osd);
- if (osd < map->max_osd)
- map->osd_state[osd] ^= xorstate;
- }
-
- /* new_weight */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd, off;
- ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
- osd = ceph_decode_32(p);
- off = ceph_decode_32(p);
- pr_info("osd%d weight 0x%x %s\n", osd, off,
- off == CEPH_OSD_IN ? "(in)" :
- (off == CEPH_OSD_OUT ? "(out)" : ""));
- if (osd < map->max_osd)
- map->osd_weight[osd] = off;
- }
+ /* new_up_client, new_state, new_weight */
+ err = decode_new_up_state_weight(p, end, map);
+ if (err)
+ goto bad;
/* new_pg_temp */
err = decode_new_pg_temp(p, end, map);
--
2.4.3
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] libceph: apply new_state before new_up_client on incrementals
2016-07-19 21:26 [PATCH] libceph: apply new_state before new_up_client on incrementals Ilya Dryomov
@ 2016-07-21 0:04 ` Josh Durgin
2016-07-21 15:27 ` Ilya Dryomov
0 siblings, 1 reply; 4+ messages in thread
From: Josh Durgin @ 2016-07-21 0:04 UTC (permalink / raw)
To: Ilya Dryomov, ceph-devel
On 07/19/2016 02:26 PM, Ilya Dryomov wrote:
> Currently, osd_weight and osd_state fields are updated in the encoding
> order. This is wrong, because an incremental map may look like e.g.
>
> new_up_client: { osd=6, addr=... } # set osd_state and addr
> new_state: { osd=6, xorstate=EXISTS } # clear osd_state
>
> Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After
> applying new_up_client, osd_state is changed to EXISTS | UP. Carrying
> on with the new_state update, we flip EXISTS and leave osd6 in a weird
> "!EXISTS but UP" state. A non-existent OSD is considered down by the
> mapping code
>
> 2087 for (i = 0; i < pg->pg_temp.len; i++) {
> 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
> 2089 if (ceph_can_shift_osds(pi))
> 2090 continue;
> 2091
> 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
>
> and so requests get directed to the second OSD in the set instead of
> the first, resulting in OSD-side errors like:
>
> [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680
>
> and hung rbds on the client:
>
> [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
> [ 493.566805] rbd: rbd0: result -6 xferred 400000
> [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688
>
> The fix is to decouple application from the decoding and:
> - apply new_weight first
> - apply new_state before new_up_client
> - twiddle osd_state flags if marking in
> - clear out some of the state if osd is destroyed
>
> Fixes: http://tracker.ceph.com/issues/14901
>
> Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd
> Cc: stable@vger.kernel.org # 3.15+
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> ---
> net/ceph/osdmap.c | 156 +++++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 113 insertions(+), 43 deletions(-)
This matches userspace's incremental handling now. Looks good.
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
>
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index 03062bb763b3..7e480bf75bcf 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -1261,6 +1261,115 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
> }
>
> /*
> + * Encoding order is (new_up_client, new_state, new_weight). Need to
> + * apply in the (new_weight, new_state, new_up_client) order, because
> + * an incremental map may look like e.g.
> + *
> + * new_up_client: { osd=6, addr=... } # set osd_state and addr
> + * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
> + */
> +static int decode_new_up_state_weight(void **p, void *end,
> + struct ceph_osdmap *map)
> +{
> + void *new_up_client;
> + void *new_state;
> + void *new_weight_end;
> + u32 len;
> +
> + new_up_client = *p;
> + ceph_decode_32_safe(p, end, len, e_inval);
> + len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
> + ceph_decode_need(p, end, len, e_inval);
> + *p += len;
> +
> + new_state = *p;
> + ceph_decode_32_safe(p, end, len, e_inval);
> + len *= sizeof(u32) + sizeof(u8);
> + ceph_decode_need(p, end, len, e_inval);
> + *p += len;
> +
> + /* new_weight */
> + ceph_decode_32_safe(p, end, len, e_inval);
> + while (len--) {
> + s32 osd;
> + u32 w;
> +
> + ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
> + osd = ceph_decode_32(p);
> + w = ceph_decode_32(p);
> + BUG_ON(osd >= map->max_osd);
> + pr_info("osd%d weight 0x%x %s\n", osd, w,
> + w == CEPH_OSD_IN ? "(in)" :
> + (w == CEPH_OSD_OUT ? "(out)" : ""));
> + map->osd_weight[osd] = w;
> +
> + /*
> + * If we are marking in, set the EXISTS, and clear the
> + * AUTOOUT and NEW bits.
> + */
> + if (w) {
> + map->osd_state[osd] |= CEPH_OSD_EXISTS;
> + map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
> + CEPH_OSD_NEW);
> + }
> + }
> + new_weight_end = *p;
> +
> + /* new_state (up/down) */
> + *p = new_state;
> + len = ceph_decode_32(p);
> + while (len--) {
> + s32 osd;
> + u8 xorstate;
> + int ret;
> +
> + osd = ceph_decode_32(p);
> + xorstate = ceph_decode_8(p);
> + if (xorstate == 0)
> + xorstate = CEPH_OSD_UP;
> + BUG_ON(osd >= map->max_osd);
> + if ((map->osd_state[osd] & CEPH_OSD_UP) &&
> + (xorstate & CEPH_OSD_UP))
> + pr_info("osd%d down\n", osd);
> + if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
> + (xorstate & CEPH_OSD_EXISTS)) {
> + pr_info("osd%d does not exist\n", osd);
> + map->osd_weight[osd] = CEPH_OSD_IN;
> + ret = set_primary_affinity(map, osd,
> + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
> + if (ret)
> + return ret;
> + memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
> + map->osd_state[osd] = 0;
> + } else {
> + map->osd_state[osd] ^= xorstate;
> + }
> + }
> +
> + /* new_up_client */
> + *p = new_up_client;
> + len = ceph_decode_32(p);
> + while (len--) {
> + s32 osd;
> + struct ceph_entity_addr addr;
> +
> + osd = ceph_decode_32(p);
> + ceph_decode_copy(p, &addr, sizeof(addr));
> + ceph_decode_addr(&addr);
> + BUG_ON(osd >= map->max_osd);
> + pr_info("osd%d up\n", osd);
> + map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
> + map->osd_addr[osd] = addr;
> + }
> +
> + *p = new_weight_end;
> + return 0;
> +
> +e_inval:
> + return -EINVAL;
> +}
> +
> +/*
> * decode and apply an incremental map update.
> */
> struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
> @@ -1358,49 +1467,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
> __remove_pg_pool(&map->pg_pools, pi);
> }
>
> - /* new_up */
> - ceph_decode_32_safe(p, end, len, e_inval);
> - while (len--) {
> - u32 osd;
> - struct ceph_entity_addr addr;
> - ceph_decode_32_safe(p, end, osd, e_inval);
> - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
> - ceph_decode_addr(&addr);
> - pr_info("osd%d up\n", osd);
> - BUG_ON(osd >= map->max_osd);
> - map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
> - map->osd_addr[osd] = addr;
> - }
> -
> - /* new_state */
> - ceph_decode_32_safe(p, end, len, e_inval);
> - while (len--) {
> - u32 osd;
> - u8 xorstate;
> - ceph_decode_32_safe(p, end, osd, e_inval);
> - xorstate = **(u8 **)p;
> - (*p)++; /* clean flag */
> - if (xorstate == 0)
> - xorstate = CEPH_OSD_UP;
> - if (xorstate & CEPH_OSD_UP)
> - pr_info("osd%d down\n", osd);
> - if (osd < map->max_osd)
> - map->osd_state[osd] ^= xorstate;
> - }
> -
> - /* new_weight */
> - ceph_decode_32_safe(p, end, len, e_inval);
> - while (len--) {
> - u32 osd, off;
> - ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
> - osd = ceph_decode_32(p);
> - off = ceph_decode_32(p);
> - pr_info("osd%d weight 0x%x %s\n", osd, off,
> - off == CEPH_OSD_IN ? "(in)" :
> - (off == CEPH_OSD_OUT ? "(out)" : ""));
> - if (osd < map->max_osd)
> - map->osd_weight[osd] = off;
> - }
> + /* new_up_client, new_state, new_weight */
> + err = decode_new_up_state_weight(p, end, map);
> + if (err)
> + goto bad;
>
> /* new_pg_temp */
> err = decode_new_pg_temp(p, end, map);
>
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] libceph: apply new_state before new_up_client on incrementals
2016-07-21 0:04 ` Josh Durgin
@ 2016-07-21 15:27 ` Ilya Dryomov
2016-07-21 19:22 ` Josh Durgin
0 siblings, 1 reply; 4+ messages in thread
From: Ilya Dryomov @ 2016-07-21 15:27 UTC (permalink / raw)
To: Josh Durgin; +Cc: Ceph Development
On Thu, Jul 21, 2016 at 2:04 AM, Josh Durgin <jdurgin@redhat.com> wrote:
> On 07/19/2016 02:26 PM, Ilya Dryomov wrote:
>>
>> Currently, osd_weight and osd_state fields are updated in the encoding
>> order. This is wrong, because an incremental map may look like e.g.
>>
>> new_up_client: { osd=6, addr=... } # set osd_state and addr
>> new_state: { osd=6, xorstate=EXISTS } # clear osd_state
>>
>> Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After
>> applying new_up_client, osd_state is changed to EXISTS | UP. Carrying
>> on with the new_state update, we flip EXISTS and leave osd6 in a weird
>> "!EXISTS but UP" state. A non-existent OSD is considered down by the
>> mapping code
>>
>> 2087 for (i = 0; i < pg->pg_temp.len; i++) {
>> 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
>> 2089 if (ceph_can_shift_osds(pi))
>> 2090 continue;
>> 2091
>> 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
>>
>> and so requests get directed to the second OSD in the set instead of
>> the first, resulting in OSD-side errors like:
>>
>> [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected
>> client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680
>>
>> and hung rbds on the client:
>>
>> [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
>> [ 493.566805] rbd: rbd0: result -6 xferred 400000
>> [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688
>>
>> The fix is to decouple application from the decoding and:
>> - apply new_weight first
>> - apply new_state before new_up_client
>> - twiddle osd_state flags if marking in
>> - clear out some of the state if osd is destroyed
>>
>> Fixes: http://tracker.ceph.com/issues/14901
>>
>> Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists'
>> flag for newly up osd
>> Cc: stable@vger.kernel.org # 3.15+
>> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
>> ---
>> net/ceph/osdmap.c | 156
>> +++++++++++++++++++++++++++++++++++++++---------------
>> 1 file changed, 113 insertions(+), 43 deletions(-)
>
>
> This matches userspace's incremental handling now. Looks good.
Technically, no. new_primary_affinity is decoded along with new_weight
in userspace and crushmap is applied at the very end, after weights and
states are applied. AFAICT crushmap is applied last only because of
the userspace-only adjust_osd_weights(), which needs to have the new
weights.
As for primary-affinity, the following
new_up_client: { osd=X, addr=... }
new_state: { osd=X, xorstate=EXISTS }
new_primary_affinity: { osd=X, aff=Y }
would be misinterpreted - the kernel client would stay on aff=Y instead
of aff=1 (reset), but I don't see a (obvious, at least) way to get such
a map, and, as I wanted this to be as small as possible for backporting,
I punted on that. Let me know if you see otherwise.
Thanks,
Ilya
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] libceph: apply new_state before new_up_client on incrementals
2016-07-21 15:27 ` Ilya Dryomov
@ 2016-07-21 19:22 ` Josh Durgin
0 siblings, 0 replies; 4+ messages in thread
From: Josh Durgin @ 2016-07-21 19:22 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Ceph Development
On 07/21/2016 08:27 AM, Ilya Dryomov wrote:
> On Thu, Jul 21, 2016 at 2:04 AM, Josh Durgin <jdurgin@redhat.com> wrote:
>> On 07/19/2016 02:26 PM, Ilya Dryomov wrote:
>>>
>>> Currently, osd_weight and osd_state fields are updated in the encoding
>>> order. This is wrong, because an incremental map may look like e.g.
>>>
>>> new_up_client: { osd=6, addr=... } # set osd_state and addr
>>> new_state: { osd=6, xorstate=EXISTS } # clear osd_state
>>>
>>> Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After
>>> applying new_up_client, osd_state is changed to EXISTS | UP. Carrying
>>> on with the new_state update, we flip EXISTS and leave osd6 in a weird
>>> "!EXISTS but UP" state. A non-existent OSD is considered down by the
>>> mapping code
>>>
>>> 2087 for (i = 0; i < pg->pg_temp.len; i++) {
>>> 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
>>> 2089 if (ceph_can_shift_osds(pi))
>>> 2090 continue;
>>> 2091
>>> 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
>>>
>>> and so requests get directed to the second OSD in the set instead of
>>> the first, resulting in OSD-side errors like:
>>>
>>> [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected
>>> client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680
>>>
>>> and hung rbds on the client:
>>>
>>> [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
>>> [ 493.566805] rbd: rbd0: result -6 xferred 400000
>>> [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688
>>>
>>> The fix is to decouple application from the decoding and:
>>> - apply new_weight first
>>> - apply new_state before new_up_client
>>> - twiddle osd_state flags if marking in
>>> - clear out some of the state if osd is destroyed
>>>
>>> Fixes: http://tracker.ceph.com/issues/14901
>>>
>>> Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists'
>>> flag for newly up osd
>>> Cc: stable@vger.kernel.org # 3.15+
>>> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
>>> ---
>>> net/ceph/osdmap.c | 156
>>> +++++++++++++++++++++++++++++++++++++++---------------
>>> 1 file changed, 113 insertions(+), 43 deletions(-)
>>
>>
>> This matches userspace's incremental handling now. Looks good.
>
> Technically, no. new_primary_affinity is decoded along with new_weight
> in userspace and crushmap is applied at the very end, after weights and
> states are applied. AFAICT crushmap is applied last only because of
> the userspace-only adjust_osd_weights(), which needs to have the new
> weights.
>
> As for primary-affinity, the following
>
> new_up_client: { osd=X, addr=... }
> new_state: { osd=X, xorstate=EXISTS }
> new_primary_affinity: { osd=X, aff=Y }
>
> would be misinterpreted - the kernel client would stay on aff=Y instead
> of aff=1 (reset), but I don't see a (obvious, at least) way to get such
> a map, and, as I wanted this to be as small as possible for backporting,
> I punted on that. Let me know if you see otherwise.
Yeah, I also see no way for this to happen, and it isn't likely to be
needed in the future either. So keeping this update small sounds good
to me.
Josh
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2016-07-21 19:22 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-19 21:26 [PATCH] libceph: apply new_state before new_up_client on incrementals Ilya Dryomov
2016-07-21 0:04 ` Josh Durgin
2016-07-21 15:27 ` Ilya Dryomov
2016-07-21 19:22 ` Josh Durgin
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.