* [PATCH v2 1/6] libceph: add an epoch_barrier field to struct ceph_osd_client
2017-02-06 13:29 [PATCH v2 0/6] ceph: implement new-style ENOSPC handling in kcephfs Jeff Layton
@ 2017-02-06 13:29 ` Jeff Layton
2017-02-06 13:29 ` [PATCH v2 2/6] libceph: add ceph_osdc_complete_writes Jeff Layton
` (4 subsequent siblings)
5 siblings, 0 replies; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 13:29 UTC (permalink / raw)
To: ceph-devel; +Cc: zyan, sage, idryomov, jspray
Cephfs can get cap update requests that contain a new epoch barrier in
them. When that happens, we want to pause all OSD traffic until the right
map epoch arrives. Add a way for the upper layers to set the epoch_barrier
in ceph_osd_client and fix libceph to pause requests until it has the
right map epoch.
Add an epoch_barrier field to ceph_osd_client that is protected by the
osdc->lock rwsem. When the barrier is set, and the current OSD map
epoch is below that, pause the request target when submitting the
request or when revisiting it.
If we get a new map, compare the new epoch against the barrier before
kicking requests and request another map if the map epoch is still lower
than the one we want.
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
include/linux/ceph/osd_client.h | 2 ++
net/ceph/osd_client.c | 42 +++++++++++++++++++++++++++++++++--------
2 files changed, 36 insertions(+), 8 deletions(-)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03a6653d329a..e7d7cf284cf4 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -269,6 +269,7 @@ struct ceph_osd_client {
struct rb_root osds; /* osds */
struct list_head osd_lru; /* idle osds */
spinlock_t osd_lru_lock;
+ u32 epoch_barrier;
struct ceph_osd homeless_osd;
atomic64_t last_tid; /* tid of last request */
u64 last_linger_id;
@@ -307,6 +308,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
+void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
extern void osd_req_op_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode, u32 flags);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 3a2417bb6ff0..90d190f8f791 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1298,8 +1298,10 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
__pool_full(pi);
WARN_ON(pi->id != t->base_oloc.pool);
- return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
- (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+ return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
+ ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
+ (osdc->epoch_barrier &&
+ osdc->osdmap->epoch < osdc->epoch_barrier);
}
enum calc_target_result {
@@ -1609,21 +1611,24 @@ static void send_request(struct ceph_osd_request *req)
static void maybe_request_map(struct ceph_osd_client *osdc)
{
bool continuous = false;
+ u32 epoch = osdc->osdmap->epoch;
verify_osdc_locked(osdc);
- WARN_ON(!osdc->osdmap->epoch);
+ WARN_ON_ONCE(epoch == 0);
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
- ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ (osdc->epoch_barrier && epoch < osdc->epoch_barrier)) {
dout("%s osdc %p continuous\n", __func__, osdc);
continuous = true;
} else {
dout("%s osdc %p onetime\n", __func__, osdc);
}
+ ++epoch;
if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
- osdc->osdmap->epoch + 1, continuous))
+ epoch, continuous))
ceph_monc_renew_subs(&osdc->client->monc);
}
@@ -1651,8 +1656,14 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
goto promote;
}
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
- ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ if (osdc->epoch_barrier &&
+ osdc->osdmap->epoch < osdc->epoch_barrier) {
+ dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
+ osdc->epoch_barrier);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
dout("req %p pausewr\n", req);
req->r_t.paused = true;
maybe_request_map(osdc);
@@ -3279,7 +3290,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
have_pool_full(osdc);
- if (was_pauserd || was_pausewr || pauserd || pausewr)
+ if (was_pauserd || was_pausewr || pauserd || pausewr ||
+ (osdc->epoch_barrier && osdc->osdmap->epoch < osdc->epoch_barrier))
maybe_request_map(osdc);
kick_requests(osdc, &need_resend, &need_resend_linger);
@@ -3296,6 +3308,20 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
up_write(&osdc->lock);
}
+void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ down_read(&osdc->lock);
+ if (unlikely(eb > osdc->epoch_barrier)) {
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ osdc->epoch_barrier = max(eb, osdc->epoch_barrier);
+ up_write(&osdc->lock);
+ } else {
+ up_read(&osdc->lock);
+ }
+}
+EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
+
/*
* Resubmit requests pending on the given osd.
*/
--
2.9.3
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 2/6] libceph: add ceph_osdc_complete_writes
2017-02-06 13:29 [PATCH v2 0/6] ceph: implement new-style ENOSPC handling in kcephfs Jeff Layton
2017-02-06 13:29 ` [PATCH v2 1/6] libceph: add an epoch_barrier field to struct ceph_osd_client Jeff Layton
@ 2017-02-06 13:29 ` Jeff Layton
2017-02-06 13:29 ` [PATCH v2 3/6] libceph: rename and export have_pool_full Jeff Layton
` (3 subsequent siblings)
5 siblings, 0 replies; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 13:29 UTC (permalink / raw)
To: ceph-devel; +Cc: zyan, sage, idryomov, jspray
From: John Spray <john.spray@redhat.com>
When a Ceph volume hits capacity, a flag is set in the OSD map to
indicate that, and a new map is sprayed around the cluster. When the
cephfs client sees that, we want it to shut down any in-progress OSD
write operations with an -ENOSPC error, as they'll otherwise just hang.
Add a callback to the osdc that gets called on map updates and a way
for upper layers to register that callback.
[ jlayton: code style cleanup and adaptation to new osd msg handling ]
Signed-off-by: John Spray <john.spray@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
include/linux/ceph/osd_client.h | 12 ++++++++++
net/ceph/osd_client.c | 50 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 62 insertions(+)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index e7d7cf284cf4..34010c86b307 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -21,6 +21,7 @@ struct ceph_osd_client;
/*
* completion callback for async writepages
*/
+typedef void (*ceph_osdc_map_callback_t)(struct ceph_osd_client *, void *);
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
@@ -290,6 +291,9 @@ struct ceph_osd_client {
struct ceph_msgpool msgpool_op_reply;
struct workqueue_struct *notify_wq;
+
+ ceph_osdc_map_callback_t map_cb;
+ void *map_p;
};
static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag)
@@ -393,6 +397,7 @@ extern void ceph_osdc_put_request(struct ceph_osd_request *req);
extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req,
bool nofail);
+extern u32 ceph_osdc_complete_writes(struct ceph_osd_client *osdc, int r);
extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
@@ -459,5 +464,12 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
struct ceph_object_locator *oloc,
struct ceph_watch_item **watchers,
u32 *num_watchers);
+
+static inline void ceph_osdc_register_map_cb(struct ceph_osd_client *osdc,
+ ceph_osdc_map_callback_t cb, void *data)
+{
+ osdc->map_cb = cb;
+ osdc->map_p = data;
+}
#endif
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 90d190f8f791..aeee87a0e0da 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -18,6 +18,7 @@
#include <linux/ceph/decode.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
+#include <linux/lockdep.h>
#define OSD_OPREPLY_FRONT_LEN 512
@@ -1782,6 +1783,51 @@ static void complete_request(struct ceph_osd_request *req, int err)
ceph_osdc_put_request(req);
}
+/*
+ * Drop all pending write/modify requests and complete
+ * them with the `r` as return code.
+ *
+ * Returns the highest OSD map epoch of a request that was
+ * cancelled, or 0 if none were cancelled.
+ */
+u32 ceph_osdc_complete_writes(struct ceph_osd_client *osdc, int r)
+{
+ struct ceph_osd_request *req;
+ struct ceph_osd *osd;
+ struct rb_node *m, *n;
+ u32 latest_epoch = 0;
+
+ lockdep_assert_held(&osdc->lock);
+
+ dout("enter complete_writes r=%d\n", r);
+
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ osd = rb_entry(n, struct ceph_osd, o_node);
+ m = rb_first(&osd->o_requests);
+ mutex_lock(&osd->lock);
+ while (m) {
+ req = rb_entry(m, struct ceph_osd_request, r_node);
+ m = rb_next(m);
+
+ if (req->r_flags & CEPH_OSD_FLAG_WRITE &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.base_oloc.pool))) {
+ u32 cur_epoch = le32_to_cpu(req->r_replay_version.epoch);
+
+ dout("%s: complete tid=%llu flags 0x%x\n", __func__, req->r_tid, req->r_flags);
+ complete_request(req, r);
+ if (cur_epoch > latest_epoch)
+ latest_epoch = cur_epoch;
+ }
+ }
+ mutex_unlock(&osd->lock);
+ }
+
+ dout("return complete_writes latest_epoch=%u\n", latest_epoch);
+ return latest_epoch;
+}
+EXPORT_SYMBOL(ceph_osdc_complete_writes);
+
static void cancel_map_check(struct ceph_osd_request *req)
{
struct ceph_osd_client *osdc = req->r_osdc;
@@ -3298,6 +3344,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
osdc->osdmap->epoch);
+ if (osdc->map_cb)
+ osdc->map_cb(osdc, osdc->map_p);
up_write(&osdc->lock);
wake_up_all(&osdc->client->auth_wq);
return;
@@ -4116,6 +4164,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
osdc->linger_requests = RB_ROOT;
osdc->map_checks = RB_ROOT;
osdc->linger_map_checks = RB_ROOT;
+ osdc->map_cb = NULL;
+ osdc->map_p = NULL;
INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
--
2.9.3
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 3/6] libceph: rename and export have_pool_full
2017-02-06 13:29 [PATCH v2 0/6] ceph: implement new-style ENOSPC handling in kcephfs Jeff Layton
2017-02-06 13:29 ` [PATCH v2 1/6] libceph: add an epoch_barrier field to struct ceph_osd_client Jeff Layton
2017-02-06 13:29 ` [PATCH v2 2/6] libceph: add ceph_osdc_complete_writes Jeff Layton
@ 2017-02-06 13:29 ` Jeff Layton
2017-02-06 13:29 ` [PATCH v2 4/6] ceph: register map callback to handle ENOSPC conditions Jeff Layton
` (2 subsequent siblings)
5 siblings, 0 replies; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 13:29 UTC (permalink / raw)
To: ceph-devel; +Cc: zyan, sage, idryomov, jspray
Cephfs needs to be able to call this as well. Rename have_pool_full to
ceph_osdc_have_pool_full, and export it.
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
include/linux/ceph/osd_client.h | 1 +
net/ceph/osd_client.c | 7 ++++---
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 34010c86b307..17bf1873bb01 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -313,6 +313,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
+extern bool ceph_osdc_have_pool_full(struct ceph_osd_client *osdc);
extern void osd_req_op_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode, u32 flags);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index aeee87a0e0da..d61d7a79fdb3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1259,7 +1259,7 @@ static bool __pool_full(struct ceph_pg_pool_info *pi)
return pi->flags & CEPH_POOL_FLAG_FULL;
}
-static bool have_pool_full(struct ceph_osd_client *osdc)
+bool ceph_osdc_have_pool_full(struct ceph_osd_client *osdc)
{
struct rb_node *n;
@@ -1273,6 +1273,7 @@ static bool have_pool_full(struct ceph_osd_client *osdc)
return false;
}
+EXPORT_SYMBOL(ceph_osdc_have_pool_full);
static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
{
@@ -3271,7 +3272,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
- have_pool_full(osdc);
+ ceph_osdc_have_pool_full(osdc);
/* incremental maps */
ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -3335,7 +3336,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
- have_pool_full(osdc);
+ ceph_osdc_have_pool_full(osdc);
if (was_pauserd || was_pausewr || pauserd || pausewr ||
(osdc->epoch_barrier && osdc->osdmap->epoch < osdc->epoch_barrier))
maybe_request_map(osdc);
--
2.9.3
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 4/6] ceph: register map callback to handle ENOSPC conditions
2017-02-06 13:29 [PATCH v2 0/6] ceph: implement new-style ENOSPC handling in kcephfs Jeff Layton
` (2 preceding siblings ...)
2017-02-06 13:29 ` [PATCH v2 3/6] libceph: rename and export have_pool_full Jeff Layton
@ 2017-02-06 13:29 ` Jeff Layton
2017-02-06 13:29 ` [PATCH v2 5/6] ceph: handle epoch barriers in cap messages Jeff Layton
2017-02-06 13:29 ` [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes Jeff Layton
5 siblings, 0 replies; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 13:29 UTC (permalink / raw)
To: ceph-devel; +Cc: zyan, sage, idryomov, jspray
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
fs/ceph/mds_client.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 93fc893e1930..43297c6b5a8b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -433,6 +433,21 @@ static int __verify_registered_session(struct ceph_mds_client *mdsc,
return 0;
}
+static void handle_osd_map(struct ceph_osd_client *osdc, void *p)
+{
+ u32 cancelled_epoch = 0;
+
+ lockdep_assert_held(&osdc->lock);
+
+ if ((osdc->osdmap->flags & CEPH_OSDMAP_FULL) ||
+ ceph_osdc_have_pool_full(osdc))
+ cancelled_epoch = ceph_osdc_complete_writes(osdc, -ENOSPC);
+
+ if (cancelled_epoch)
+ osdc->epoch_barrier = max(cancelled_epoch + 1,
+ osdc->epoch_barrier);
+}
+
/*
* create+register a new session for given mds.
* called under mdsc->mutex.
@@ -3486,6 +3501,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
ceph_caps_init(mdsc);
ceph_adjust_min_caps(mdsc, fsc->min_caps);
+ ceph_osdc_register_map_cb(&fsc->client->osdc,
+ handle_osd_map, (void*)mdsc);
+
init_rwsem(&mdsc->pool_perm_rwsem);
mdsc->pool_perm_tree = RB_ROOT;
--
2.9.3
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 5/6] ceph: handle epoch barriers in cap messages
2017-02-06 13:29 [PATCH v2 0/6] ceph: implement new-style ENOSPC handling in kcephfs Jeff Layton
` (3 preceding siblings ...)
2017-02-06 13:29 ` [PATCH v2 4/6] ceph: register map callback to handle ENOSPC conditions Jeff Layton
@ 2017-02-06 13:29 ` Jeff Layton
2017-02-06 13:29 ` [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes Jeff Layton
5 siblings, 0 replies; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 13:29 UTC (permalink / raw)
To: ceph-devel; +Cc: zyan, sage, idryomov, jspray
Have the client store and update the osdc epoch_barrier when a cap
message comes in with one. This allows clients to inform servers that
their released caps may not be used until a particular OSD map epoch.
Signed-off-by: John Spray <john.spray@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
fs/ceph/caps.c | 17 +++++++++++++----
fs/ceph/mds_client.c | 20 ++++++++++++++++++++
fs/ceph/mds_client.h | 7 +++++--
3 files changed, 38 insertions(+), 6 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3c2dfd72e5b2..d91d3f32a5b6 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
void *p;
size_t extra_len;
struct timespec zerotime = {0};
+ struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
@@ -1077,7 +1078,9 @@ static int send_cap_msg(struct cap_msg_args *arg)
/* inline data size */
ceph_encode_32(&p, 0);
/* osd_epoch_barrier (version 5) */
- ceph_encode_32(&p, 0);
+ down_read(&osdc->lock);
+ ceph_encode_32(&p, osdc->epoch_barrier);
+ up_read(&osdc->lock);
/* oldest_flush_tid (version 6) */
ceph_encode_64(&p, arg->oldest_flush_tid);
@@ -3635,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len;
}
+ if (le16_to_cpu(msg->hdr.version) >= 5) {
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+ u32 epoch_barrier;
+
+ ceph_decode_32_safe(&p, end, epoch_barrier, bad);
+ ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
+ }
+
if (le16_to_cpu(msg->hdr.version) >= 8) {
u64 flush_tid;
u32 caller_uid, caller_gid;
- u32 osd_epoch_barrier;
u32 pool_ns_len;
- /* version >= 5 */
- ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+
/* version >= 6 */
ceph_decode_64_safe(&p, end, flush_tid, bad);
/* version >= 7 */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 43297c6b5a8b..40f89c768bf0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1565,9 +1565,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
struct ceph_cap *cap;
LIST_HEAD(tmp_list);
int num_cap_releases;
+ __le32 barrier, *cap_barrier;
+
+ down_read(&osdc->lock);
+ barrier = cpu_to_le32(osdc->epoch_barrier);
+ up_read(&osdc->lock);
spin_lock(&session->s_cap_lock);
again:
@@ -1585,7 +1591,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
head = msg->front.iov_base;
head->num = cpu_to_le32(0);
msg->front.iov_len = sizeof(*head);
+
+ msg->hdr.version = cpu_to_le16(2);
+ msg->hdr.compat_version = cpu_to_le16(1);
}
+
cap = list_first_entry(&tmp_list, struct ceph_cap,
session_caps);
list_del(&cap->session_caps);
@@ -1603,6 +1613,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
ceph_put_cap(mdsc, cap);
if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
@@ -1618,6 +1633,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
spin_unlock(&session->s_cap_lock);
if (msg) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ac0475a2daa7..517684c7c5f0 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -104,10 +104,13 @@ struct ceph_mds_reply_info_parsed {
/*
* cap releases are batched and sent to the MDS en masse.
+ *
+ * Account for per-message overhead of mds_cap_release header
+ * and __le32 for osd epoch barrier trailing field.
*/
-#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \
sizeof(struct ceph_mds_cap_release)) / \
- sizeof(struct ceph_mds_cap_item))
+ sizeof(struct ceph_mds_cap_item))
/*
--
2.9.3
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 13:29 [PATCH v2 0/6] ceph: implement new-style ENOSPC handling in kcephfs Jeff Layton
` (4 preceding siblings ...)
2017-02-06 13:29 ` [PATCH v2 5/6] ceph: handle epoch barriers in cap messages Jeff Layton
@ 2017-02-06 13:29 ` Jeff Layton
2017-02-06 14:09 ` Ilya Dryomov
5 siblings, 1 reply; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 13:29 UTC (permalink / raw)
To: ceph-devel; +Cc: zyan, sage, idryomov, jspray
Right now, cephfs will cancel any in-flight OSD write operations when a
new map comes in that shows the OSD or pool as full, but nothing
prevents new requests from stalling out after that point.
If the caller knows that it will want an immediate error return instead
of blocking on a full or at-quota error condition then allow it to set a
flag to request that behavior. Cephfs write requests will always set
that flag.
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
fs/ceph/addr.c | 14 +++++++++-----
fs/ceph/file.c | 8 +++++---
include/linux/ceph/rados.h | 1 +
net/ceph/osd_client.c | 6 ++++++
4 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4547bbf80e4f..577fe6351de1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1019,7 +1019,8 @@ static int ceph_writepages_start(struct address_space *mapping,
offset, &len, 0, num_ops,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_FULL_CANCEL,
snapc, truncate_seq,
truncate_size, false);
if (IS_ERR(req)) {
@@ -1030,7 +1031,8 @@ static int ceph_writepages_start(struct address_space *mapping,
CEPH_OSD_SLAB_OPS),
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_FULL_CANCEL,
snapc, truncate_seq,
truncate_size, true);
BUG_ON(IS_ERR(req));
@@ -1681,7 +1683,9 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE,
- CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_FULL_CANCEL,
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -1699,7 +1703,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL,
NULL, ci->i_truncate_seq,
ci->i_truncate_size, false);
if (IS_ERR(req)) {
@@ -1872,7 +1876,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
goto out_unlock;
}
- wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
+ wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_FULL_CANCEL;
osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a91a4f1fc837..938dca02db7a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -692,7 +692,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE;
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL;
ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
@@ -849,7 +849,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE;
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL;
} else {
flags = CEPH_OSD_FLAG_READ;
}
@@ -1051,6 +1051,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_FULL_CANCEL |
CEPH_OSD_FLAG_ACK;
while ((len = iov_iter_count(from)) > 0) {
@@ -1549,7 +1550,8 @@ static int ceph_zero_partial_object(struct inode *inode,
offset, length,
0, 1, op,
CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_FULL_CANCEL,
NULL, 0, 0, false);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 5c0da61cb763..def43570a85a 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -401,6 +401,7 @@ enum {
CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
+ CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
};
enum {
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d61d7a79fdb3..3b0e1220b552 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
struct ceph_osd_linger_request *lreq);
static void unlink_linger(struct ceph_osd *osd,
struct ceph_osd_linger_request *lreq);
+static void complete_request(struct ceph_osd_request *req, int err);
#if 1
static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
@@ -1643,6 +1644,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
enum calc_target_result ct_res;
bool need_send = false;
bool promoted = false;
+ int ret = 0;
WARN_ON(req->r_tid || req->r_got_reply);
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
@@ -1683,6 +1685,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
pr_warn_ratelimited("FULL or reached pool quota\n");
req->r_t.paused = true;
maybe_request_map(osdc);
+ if (req->r_flags & CEPH_OSD_FLAG_FULL_CANCEL)
+ ret = -ENOSPC;
} else if (!osd_homeless(osd)) {
need_send = true;
} else {
@@ -1699,6 +1703,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
link_request(osd, req);
if (need_send)
send_request(req);
+ else if (ret)
+ complete_request(req, ret);
mutex_unlock(&osd->lock);
if (ct_res == CALC_TARGET_POOL_DNE)
--
2.9.3
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 13:29 ` [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes Jeff Layton
@ 2017-02-06 14:09 ` Ilya Dryomov
2017-02-06 15:28 ` Jeff Layton
2017-02-06 15:49 ` Jeff Layton
0 siblings, 2 replies; 15+ messages in thread
From: Ilya Dryomov @ 2017-02-06 14:09 UTC (permalink / raw)
To: Jeff Layton; +Cc: Ceph Development, Yan, Zheng, Sage Weil, John Spray
On Mon, Feb 6, 2017 at 2:29 PM, Jeff Layton <jlayton@redhat.com> wrote:
> Right now, cephfs will cancel any in-flight OSD write operations when a
> new map comes in that shows the OSD or pool as full, but nothing
> prevents new requests from stalling out after that point.
>
> If the caller knows that it will want an immediate error return instead
> of blocking on a full or at-quota error condition then allow it to set a
> flag to request that behavior. Cephfs write requests will always set
> that flag.
>
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
> fs/ceph/addr.c | 14 +++++++++-----
> fs/ceph/file.c | 8 +++++---
> include/linux/ceph/rados.h | 1 +
> net/ceph/osd_client.c | 6 ++++++
> 4 files changed, 21 insertions(+), 8 deletions(-)
>
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 4547bbf80e4f..577fe6351de1 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1019,7 +1019,8 @@ static int ceph_writepages_start(struct address_space *mapping,
> offset, &len, 0, num_ops,
> CEPH_OSD_OP_WRITE,
> CEPH_OSD_FLAG_WRITE |
> - CEPH_OSD_FLAG_ONDISK,
> + CEPH_OSD_FLAG_ONDISK |
> + CEPH_OSD_FLAG_FULL_CANCEL,
> snapc, truncate_seq,
> truncate_size, false);
> if (IS_ERR(req)) {
> @@ -1030,7 +1031,8 @@ static int ceph_writepages_start(struct address_space *mapping,
> CEPH_OSD_SLAB_OPS),
> CEPH_OSD_OP_WRITE,
> CEPH_OSD_FLAG_WRITE |
> - CEPH_OSD_FLAG_ONDISK,
> + CEPH_OSD_FLAG_ONDISK |
> + CEPH_OSD_FLAG_FULL_CANCEL,
> snapc, truncate_seq,
> truncate_size, true);
> BUG_ON(IS_ERR(req));
> @@ -1681,7 +1683,9 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
> ceph_vino(inode), 0, &len, 0, 1,
> CEPH_OSD_OP_CREATE,
> - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
> + CEPH_OSD_FLAG_ONDISK |
> + CEPH_OSD_FLAG_WRITE |
> + CEPH_OSD_FLAG_FULL_CANCEL,
> NULL, 0, 0, false);
> if (IS_ERR(req)) {
> err = PTR_ERR(req);
> @@ -1699,7 +1703,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
> ceph_vino(inode), 0, &len, 1, 3,
> CEPH_OSD_OP_WRITE,
> - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
> + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL,
> NULL, ci->i_truncate_seq,
> ci->i_truncate_size, false);
> if (IS_ERR(req)) {
> @@ -1872,7 +1876,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
> goto out_unlock;
> }
>
> - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
> + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_FULL_CANCEL;
> osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
> ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
> ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index a91a4f1fc837..938dca02db7a 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -692,7 +692,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
>
> req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
> CEPH_OSD_FLAG_ONDISK |
> - CEPH_OSD_FLAG_WRITE;
> + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL;
> ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
> ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
>
> @@ -849,7 +849,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>
> flags = CEPH_OSD_FLAG_ORDERSNAP |
> CEPH_OSD_FLAG_ONDISK |
> - CEPH_OSD_FLAG_WRITE;
> + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL;
> } else {
> flags = CEPH_OSD_FLAG_READ;
> }
> @@ -1051,6 +1051,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
> flags = CEPH_OSD_FLAG_ORDERSNAP |
> CEPH_OSD_FLAG_ONDISK |
> CEPH_OSD_FLAG_WRITE |
> + CEPH_OSD_FLAG_FULL_CANCEL |
> CEPH_OSD_FLAG_ACK;
>
> while ((len = iov_iter_count(from)) > 0) {
> @@ -1549,7 +1550,8 @@ static int ceph_zero_partial_object(struct inode *inode,
> offset, length,
> 0, 1, op,
> CEPH_OSD_FLAG_WRITE |
> - CEPH_OSD_FLAG_ONDISK,
> + CEPH_OSD_FLAG_ONDISK |
> + CEPH_OSD_FLAG_FULL_CANCEL,
> NULL, 0, 0, false);
> if (IS_ERR(req)) {
> ret = PTR_ERR(req);
> diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> index 5c0da61cb763..def43570a85a 100644
> --- a/include/linux/ceph/rados.h
> +++ b/include/linux/ceph/rados.h
> @@ -401,6 +401,7 @@ enum {
> CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
> CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
> CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
> + CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
Is this a new flag? This is the wire protocol and I don't see it in
ceph.git.
I'll look at epoch_barrier and callback stuff later.
Thanks,
Ilya
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 14:09 ` Ilya Dryomov
@ 2017-02-06 15:28 ` Jeff Layton
2017-02-06 15:49 ` Jeff Layton
1 sibling, 0 replies; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 15:28 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Ceph Development, Yan, Zheng, Sage Weil, John Spray
On Mon, 2017-02-06 at 15:09 +0100, Ilya Dryomov wrote:
> On Mon, Feb 6, 2017 at 2:29 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > Right now, cephfs will cancel any in-flight OSD write operations when a
> > new map comes in that shows the OSD or pool as full, but nothing
> > prevents new requests from stalling out after that point.
> >
> > If the caller knows that it will want an immediate error return instead
> > of blocking on a full or at-quota error condition then allow it to set a
> > flag to request that behavior. Cephfs write requests will always set
> > that flag.
> >
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> > fs/ceph/addr.c | 14 +++++++++-----
> > fs/ceph/file.c | 8 +++++---
> > include/linux/ceph/rados.h | 1 +
> > net/ceph/osd_client.c | 6 ++++++
> > 4 files changed, 21 insertions(+), 8 deletions(-)
> >
> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > index 4547bbf80e4f..577fe6351de1 100644
> > --- a/fs/ceph/addr.c
> > +++ b/fs/ceph/addr.c
> > @@ -1019,7 +1019,8 @@ static int ceph_writepages_start(struct address_space *mapping,
> > offset, &len, 0, num_ops,
> > CEPH_OSD_OP_WRITE,
> > CEPH_OSD_FLAG_WRITE |
> > - CEPH_OSD_FLAG_ONDISK,
> > + CEPH_OSD_FLAG_ONDISK |
> > + CEPH_OSD_FLAG_FULL_CANCEL,
> > snapc, truncate_seq,
> > truncate_size, false);
> > if (IS_ERR(req)) {
> > @@ -1030,7 +1031,8 @@ static int ceph_writepages_start(struct address_space *mapping,
> > CEPH_OSD_SLAB_OPS),
> > CEPH_OSD_OP_WRITE,
> > CEPH_OSD_FLAG_WRITE |
> > - CEPH_OSD_FLAG_ONDISK,
> > + CEPH_OSD_FLAG_ONDISK |
> > + CEPH_OSD_FLAG_FULL_CANCEL,
> > snapc, truncate_seq,
> > truncate_size, true);
> > BUG_ON(IS_ERR(req));
> > @@ -1681,7 +1683,9 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> > req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
> > ceph_vino(inode), 0, &len, 0, 1,
> > CEPH_OSD_OP_CREATE,
> > - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
> > + CEPH_OSD_FLAG_ONDISK |
> > + CEPH_OSD_FLAG_WRITE |
> > + CEPH_OSD_FLAG_FULL_CANCEL,
> > NULL, 0, 0, false);
> > if (IS_ERR(req)) {
> > err = PTR_ERR(req);
> > @@ -1699,7 +1703,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> > req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
> > ceph_vino(inode), 0, &len, 1, 3,
> > CEPH_OSD_OP_WRITE,
> > - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
> > + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL,
> > NULL, ci->i_truncate_seq,
> > ci->i_truncate_size, false);
> > if (IS_ERR(req)) {
> > @@ -1872,7 +1876,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
> > goto out_unlock;
> > }
> >
> > - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
> > + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_FULL_CANCEL;
> > osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
> > ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
> > ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index a91a4f1fc837..938dca02db7a 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -692,7 +692,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
> >
> > req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
> > CEPH_OSD_FLAG_ONDISK |
> > - CEPH_OSD_FLAG_WRITE;
> > + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL;
> > ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
> > ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
> >
> > @@ -849,7 +849,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
> >
> > flags = CEPH_OSD_FLAG_ORDERSNAP |
> > CEPH_OSD_FLAG_ONDISK |
> > - CEPH_OSD_FLAG_WRITE;
> > + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_FULL_CANCEL;
> > } else {
> > flags = CEPH_OSD_FLAG_READ;
> > }
> > @@ -1051,6 +1051,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
> > flags = CEPH_OSD_FLAG_ORDERSNAP |
> > CEPH_OSD_FLAG_ONDISK |
> > CEPH_OSD_FLAG_WRITE |
> > + CEPH_OSD_FLAG_FULL_CANCEL |
> > CEPH_OSD_FLAG_ACK;
> >
> > while ((len = iov_iter_count(from)) > 0) {
> > @@ -1549,7 +1550,8 @@ static int ceph_zero_partial_object(struct inode *inode,
> > offset, length,
> > 0, 1, op,
> > CEPH_OSD_FLAG_WRITE |
> > - CEPH_OSD_FLAG_ONDISK,
> > + CEPH_OSD_FLAG_ONDISK |
> > + CEPH_OSD_FLAG_FULL_CANCEL,
> > NULL, 0, 0, false);
> > if (IS_ERR(req)) {
> > ret = PTR_ERR(req);
> > diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> > index 5c0da61cb763..def43570a85a 100644
> > --- a/include/linux/ceph/rados.h
> > +++ b/include/linux/ceph/rados.h
> > @@ -401,6 +401,7 @@ enum {
> > CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
> > CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
> > CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
> > + CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
>
> Is this a new flag? This is the wire protocol and I don't see it in
> ceph.git.
>
> I'll look at epoch_barrier and callback stuff later.
>
> Thanks,
>
Oof, ok. I thought those were kernel-internal flags. I missed that they
get encoded onto the wire.
Yeah, this is probably the wrong place to pass that flag in then. What
we really want is to pass this along to the request submission code, but
there is no need to pass this to the server.
I'll look at the code to see if there's a more suitable place for this
flag. Worst case, I'll just add a new bool to ceph_osd_request for this.
Thanks,
--
Jeff Layton <jlayton@redhat.com>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 14:09 ` Ilya Dryomov
2017-02-06 15:28 ` Jeff Layton
@ 2017-02-06 15:49 ` Jeff Layton
2017-02-06 16:27 ` Ilya Dryomov
1 sibling, 1 reply; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 15:49 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Ceph Development, Yan, Zheng, Sage Weil, John Spray
On Mon, 2017-02-06 at 15:09 +0100, Ilya Dryomov wrote:
[...]
> > diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> > index 5c0da61cb763..def43570a85a 100644
> > --- a/include/linux/ceph/rados.h
> > +++ b/include/linux/ceph/rados.h
> > @@ -401,6 +401,7 @@ enum {
> > CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
> > CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
> > CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
> > + CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
>
> Is this a new flag? This is the wire protocol and I don't see it in
> ceph.git.
>
> I'll look at epoch_barrier and callback stuff later.
>
> Thanks,
>
> Ilya
Here's a respun version of the last patch in the set. This should avoid
adding an on-the-wire flag. I just added a new bool and changed the
code to set and look at that to indicate the desire for an immediate
error return in this case. Compiles but is otherwise untested. I'll give
it a go in a bit.
-----------------------------8<------------------------------
libceph: allow requests to return immediately on full
conditions if caller wishes
Right now, cephfs will cancel any in-flight OSD write operations when a
new map comes in that shows the OSD or pool as full, but nothing
prevents new requests from stalling out after that point.
If the caller knows that it will want an immediate error return instead
of blocking on a full or at-quota error condition then allow it to set a
flag to request that behavior. Cephfs write requests will always set
that flag.
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
fs/ceph/addr.c | 4 ++++
fs/ceph/file.c | 4 ++++
include/linux/ceph/osd_client.h | 1 +
net/ceph/osd_client.c | 6 ++++++
4 files changed, 15 insertions(+)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4547bbf80e4f..ef9c9bae7460 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1040,6 +1040,7 @@ static int ceph_writepages_start(struct address_space *mapping,
req->r_callback = writepages_finish;
req->r_inode = inode;
+ req->r_enospc_on_full = true;
/* Format the osd request message and submit the write */
len = 0;
@@ -1689,6 +1690,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
}
req->r_mtime = inode->i_mtime;
+ req->r_enospc_on_full = true;
err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1732,6 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
}
req->r_mtime = inode->i_mtime;
+ req->r_enospc_on_full = true;
err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1893,6 +1896,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
wr_req->r_mtime = ci->vfs_inode.i_mtime;
+ wr_req->r_enospc_on_full = true;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a91a4f1fc837..eaed17f90d5f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -714,6 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_callback = ceph_aio_complete_req;
req->r_inode = inode;
req->r_priv = aio_req;
+ req->r_enospc_on_full = true;
ret = ceph_osdc_start_request(req->r_osdc, req, false);
out:
@@ -912,6 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
req->r_mtime = mtime;
+ req->r_enospc_on_full = true;
}
osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
@@ -1105,6 +1107,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
false, true);
req->r_mtime = mtime;
+ req->r_enospc_on_full = true;
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1557,6 +1560,7 @@ static int ceph_zero_partial_object(struct inode *inode,
}
req->r_mtime = inode->i_mtime;
+ req->r_enospc_on_full = true;
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret) {
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 17bf1873bb01..f01e93ff03d5 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -172,6 +172,7 @@ struct ceph_osd_request {
int r_result;
bool r_got_reply;
+ bool r_enospc_on_full; /* return ENOSPC when full */
struct ceph_osd_client *r_osdc;
struct kref r_kref;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d61d7a79fdb3..9f40d11b3c68 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
struct ceph_osd_linger_request *lreq);
static void unlink_linger(struct ceph_osd *osd,
struct ceph_osd_linger_request *lreq);
+static void complete_request(struct ceph_osd_request *req, int err);
#if 1
static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
@@ -1643,6 +1644,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
enum calc_target_result ct_res;
bool need_send = false;
bool promoted = false;
+ int ret = 0;
WARN_ON(req->r_tid || req->r_got_reply);
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
@@ -1683,6 +1685,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
pr_warn_ratelimited("FULL or reached pool quota\n");
req->r_t.paused = true;
maybe_request_map(osdc);
+ if (req->r_enospc_on_full)
+ ret = -ENOSPC;
} else if (!osd_homeless(osd)) {
need_send = true;
} else {
@@ -1699,6 +1703,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
link_request(osd, req);
if (need_send)
send_request(req);
+ else if (ret)
+ complete_request(req, ret);
mutex_unlock(&osd->lock);
if (ct_res == CALC_TARGET_POOL_DNE)
--
2.9.3
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 15:49 ` Jeff Layton
@ 2017-02-06 16:27 ` Ilya Dryomov
2017-02-06 16:36 ` Jeff Layton
0 siblings, 1 reply; 15+ messages in thread
From: Ilya Dryomov @ 2017-02-06 16:27 UTC (permalink / raw)
To: Jeff Layton; +Cc: Ceph Development, Yan, Zheng, Sage Weil, John Spray
On Mon, Feb 6, 2017 at 4:49 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Mon, 2017-02-06 at 15:09 +0100, Ilya Dryomov wrote:
>
> [...]
>
>> > diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
>> > index 5c0da61cb763..def43570a85a 100644
>> > --- a/include/linux/ceph/rados.h
>> > +++ b/include/linux/ceph/rados.h
>> > @@ -401,6 +401,7 @@ enum {
>> > CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
>> > CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
>> > CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
>> > + CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
>>
>> Is this a new flag? This is the wire protocol and I don't see it in
>> ceph.git.
>>
>> I'll look at epoch_barrier and callback stuff later.
>>
>> Thanks,
>>
>> Ilya
>
> Here's a respun version of the last patch in the set. This should avoid
> adding an on the wire flag. I just added a new bool and changed the
> code to set and look at that to indicate the desire for an immediate
> error return in this case. Compiles but is otherwise untested. I'll give
> it a go in a bit.
>
> -----------------------------8<------------------------------
>
> libceph: allow requests to return immediately on full
> conditions if caller wishes
>
> Right now, cephfs will cancel any in-flight OSD write operations when a
> new map comes in that shows the OSD or pool as full, but nothing
> prevents new requests from stalling out after that point.
>
> If the caller knows that it will want an immediate error return instead
> of blocking on a full or at-quota error condition then allow it to set a
> flag to request that behavior. Cephfs write requests will always set
> that flag.
>
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
> fs/ceph/addr.c | 4 ++++
> fs/ceph/file.c | 4 ++++
> include/linux/ceph/osd_client.h | 1 +
> net/ceph/osd_client.c | 6 ++++++
> 4 files changed, 15 insertions(+)
>
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 4547bbf80e4f..ef9c9bae7460 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1040,6 +1040,7 @@ static int ceph_writepages_start(struct address_space *mapping,
>
> req->r_callback = writepages_finish;
> req->r_inode = inode;
> + req->r_enospc_on_full = true;
>
> /* Format the osd request message and submit the write */
> len = 0;
> @@ -1689,6 +1690,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> }
>
> req->r_mtime = inode->i_mtime;
> + req->r_enospc_on_full = true;
> err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> if (!err)
> err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> @@ -1732,6 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> }
>
> req->r_mtime = inode->i_mtime;
> + req->r_enospc_on_full = true;
> err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> if (!err)
> err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> @@ -1893,6 +1896,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
> err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
>
> wr_req->r_mtime = ci->vfs_inode.i_mtime;
> + wr_req->r_enospc_on_full = true;
> err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
>
> if (!err)
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index a91a4f1fc837..eaed17f90d5f 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -714,6 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
> req->r_callback = ceph_aio_complete_req;
> req->r_inode = inode;
> req->r_priv = aio_req;
> + req->r_enospc_on_full = true;
>
> ret = ceph_osdc_start_request(req->r_osdc, req, false);
> out:
> @@ -912,6 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>
> osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
> req->r_mtime = mtime;
> + req->r_enospc_on_full = true;
> }
>
> osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
> @@ -1105,6 +1107,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
> false, true);
>
> req->r_mtime = mtime;
> + req->r_enospc_on_full = true;
> ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> if (!ret)
> ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> @@ -1557,6 +1560,7 @@ static int ceph_zero_partial_object(struct inode *inode,
> }
>
> req->r_mtime = inode->i_mtime;
> + req->r_enospc_on_full = true;
> ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> if (!ret) {
> ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 17bf1873bb01..f01e93ff03d5 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -172,6 +172,7 @@ struct ceph_osd_request {
>
> int r_result;
> bool r_got_reply;
> + bool r_enospc_on_full; /* return ENOSPC when full */
>
> struct ceph_osd_client *r_osdc;
> struct kref r_kref;
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index d61d7a79fdb3..9f40d11b3c68 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
> struct ceph_osd_linger_request *lreq);
> static void unlink_linger(struct ceph_osd *osd,
> struct ceph_osd_linger_request *lreq);
> +static void complete_request(struct ceph_osd_request *req, int err);
>
> #if 1
> static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
> @@ -1643,6 +1644,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> enum calc_target_result ct_res;
> bool need_send = false;
> bool promoted = false;
> + int ret = 0;
>
> WARN_ON(req->r_tid || req->r_got_reply);
> dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
> @@ -1683,6 +1685,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> pr_warn_ratelimited("FULL or reached pool quota\n");
> req->r_t.paused = true;
> maybe_request_map(osdc);
> + if (req->r_enospc_on_full)
> + ret = -ENOSPC;
> } else if (!osd_homeless(osd)) {
> need_send = true;
> } else {
> @@ -1699,6 +1703,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> link_request(osd, req);
> if (need_send)
> send_request(req);
> + else if (ret)
> + complete_request(req, ret);
How is this handled in the userspace client? I don't see a similar
check in Objecter.
Thanks,
Ilya
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 16:27 ` Ilya Dryomov
@ 2017-02-06 16:36 ` Jeff Layton
2017-02-06 17:05 ` Ilya Dryomov
0 siblings, 1 reply; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 16:36 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Ceph Development, Yan, Zheng, Sage Weil, John Spray
On Mon, 2017-02-06 at 17:27 +0100, Ilya Dryomov wrote:
> On Mon, Feb 6, 2017 at 4:49 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > On Mon, 2017-02-06 at 15:09 +0100, Ilya Dryomov wrote:
> >
> > [...]
> >
> > > > diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> > > > index 5c0da61cb763..def43570a85a 100644
> > > > --- a/include/linux/ceph/rados.h
> > > > +++ b/include/linux/ceph/rados.h
> > > > @@ -401,6 +401,7 @@ enum {
> > > > CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
> > > > CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
> > > > CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
> > > > + CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
> > >
> > > Is this a new flag? This is the wire protocol and I don't see it in
> > > ceph.git.
> > >
> > > I'll look at epoch_barrier and callback stuff later.
> > >
> > > Thanks,
> > >
> > > Ilya
> >
> > Here's a respun version of the last patch in the set. This should avoid
> > adding an on the wire flag. I just added a new bool and changed the
> > code to set and look at that to indicate the desire for an immediate
> > error return in this case. Compiles but is otherwise untested. I'll give
> > it a go in a bit.
> >
> > -----------------------------8<------------------------------
> >
> > libceph: allow requests to return immediately on full
> > conditions if caller wishes
> >
> > Right now, cephfs will cancel any in-flight OSD write operations when a
> > new map comes in that shows the OSD or pool as full, but nothing
> > prevents new requests from stalling out after that point.
> >
> > If the caller knows that it will want an immediate error return instead
> > of blocking on a full or at-quota error condition then allow it to set a
> > flag to request that behavior. Cephfs write requests will always set
> > that flag.
> >
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> > fs/ceph/addr.c | 4 ++++
> > fs/ceph/file.c | 4 ++++
> > include/linux/ceph/osd_client.h | 1 +
> > net/ceph/osd_client.c | 6 ++++++
> > 4 files changed, 15 insertions(+)
> >
> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > index 4547bbf80e4f..ef9c9bae7460 100644
> > --- a/fs/ceph/addr.c
> > +++ b/fs/ceph/addr.c
> > @@ -1040,6 +1040,7 @@ static int ceph_writepages_start(struct address_space *mapping,
> >
> > req->r_callback = writepages_finish;
> > req->r_inode = inode;
> > + req->r_enospc_on_full = true;
> >
> > /* Format the osd request message and submit the write */
> > len = 0;
> > @@ -1689,6 +1690,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> > }
> >
> > req->r_mtime = inode->i_mtime;
> > + req->r_enospc_on_full = true;
> > err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > if (!err)
> > err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > @@ -1732,6 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> > }
> >
> > req->r_mtime = inode->i_mtime;
> > + req->r_enospc_on_full = true;
> > err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > if (!err)
> > err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > @@ -1893,6 +1896,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
> > err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
> >
> > wr_req->r_mtime = ci->vfs_inode.i_mtime;
> > + wr_req->r_enospc_on_full = true;
> > err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
> >
> > if (!err)
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index a91a4f1fc837..eaed17f90d5f 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -714,6 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
> > req->r_callback = ceph_aio_complete_req;
> > req->r_inode = inode;
> > req->r_priv = aio_req;
> > + req->r_enospc_on_full = true;
> >
> > ret = ceph_osdc_start_request(req->r_osdc, req, false);
> > out:
> > @@ -912,6 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
> >
> > osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
> > req->r_mtime = mtime;
> > + req->r_enospc_on_full = true;
> > }
> >
> > osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
> > @@ -1105,6 +1107,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
> > false, true);
> >
> > req->r_mtime = mtime;
> > + req->r_enospc_on_full = true;
> > ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > if (!ret)
> > ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > @@ -1557,6 +1560,7 @@ static int ceph_zero_partial_object(struct inode *inode,
> > }
> >
> > req->r_mtime = inode->i_mtime;
> > + req->r_enospc_on_full = true;
> > ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > if (!ret) {
> > ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > index 17bf1873bb01..f01e93ff03d5 100644
> > --- a/include/linux/ceph/osd_client.h
> > +++ b/include/linux/ceph/osd_client.h
> > @@ -172,6 +172,7 @@ struct ceph_osd_request {
> >
> > int r_result;
> > bool r_got_reply;
> > + bool r_enospc_on_full; /* return ENOSPC when full */
> >
> > struct ceph_osd_client *r_osdc;
> > struct kref r_kref;
> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > index d61d7a79fdb3..9f40d11b3c68 100644
> > --- a/net/ceph/osd_client.c
> > +++ b/net/ceph/osd_client.c
> > @@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
> > struct ceph_osd_linger_request *lreq);
> > static void unlink_linger(struct ceph_osd *osd,
> > struct ceph_osd_linger_request *lreq);
> > +static void complete_request(struct ceph_osd_request *req, int err);
> >
> > #if 1
> > static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
> > @@ -1643,6 +1644,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > enum calc_target_result ct_res;
> > bool need_send = false;
> > bool promoted = false;
> > + int ret = 0;
> >
> > WARN_ON(req->r_tid || req->r_got_reply);
> > dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
> > @@ -1683,6 +1685,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > pr_warn_ratelimited("FULL or reached pool quota\n");
> > req->r_t.paused = true;
> > maybe_request_map(osdc);
> > + if (req->r_enospc_on_full)
> > + ret = -ENOSPC;
> > } else if (!osd_homeless(osd)) {
> > need_send = true;
> > } else {
> > @@ -1699,6 +1703,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > link_request(osd, req);
> > if (need_send)
> > send_request(req);
> > + else if (ret)
> > + complete_request(req, ret);
>
> How is this handled in the userspace client? I don't see a similar
> check in Objecter.
>
> Thanks,
>
> Ilya
It seems to be handled at a much higher layer in libcephfs. In _write(),
for instance, we have:
if (objecter->osdmap_pool_full(in->layout.pool_id)) {
return -ENOSPC;
}
...and there's also some EDQUOT handling a little below there as well.
I don't think we can reasonably follow that model in the kernel client
though. The way it's done in userland seems to require the big client
mutex be held over large swaths of the code, and that's not the case in
the kernel client (thankfully).
--
Jeff Layton <jlayton@redhat.com>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 16:36 ` Jeff Layton
@ 2017-02-06 17:05 ` Ilya Dryomov
2017-02-06 18:30 ` Jeff Layton
0 siblings, 1 reply; 15+ messages in thread
From: Ilya Dryomov @ 2017-02-06 17:05 UTC (permalink / raw)
To: Jeff Layton; +Cc: Ceph Development, Yan, Zheng, Sage Weil, John Spray
On Mon, Feb 6, 2017 at 5:36 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Mon, 2017-02-06 at 17:27 +0100, Ilya Dryomov wrote:
>> On Mon, Feb 6, 2017 at 4:49 PM, Jeff Layton <jlayton@redhat.com> wrote:
>> > On Mon, 2017-02-06 at 15:09 +0100, Ilya Dryomov wrote:
>> >
>> > [...]
>> >
>> > > > diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
>> > > > index 5c0da61cb763..def43570a85a 100644
>> > > > --- a/include/linux/ceph/rados.h
>> > > > +++ b/include/linux/ceph/rados.h
>> > > > @@ -401,6 +401,7 @@ enum {
>> > > > CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
>> > > > CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
>> > > > CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
>> > > > + CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
>> > >
>> > > Is this a new flag? This is the wire protocol and I don't see it in
>> > > ceph.git.
>> > >
>> > > I'll look at epoch_barrier and callback stuff later.
>> > >
>> > > Thanks,
>> > >
>> > > Ilya
>> >
>> > Here's a respun version of the last patch in the set. This should avoid
>> > adding an on the wire flag. I just added a new bool and changed the
>> > code to set and look at that to indicate the desire for an immediate
>> > error return in this case. Compiles but is otherwise untested. I'll give
>> > it a go in a bit.
>> >
>> > -----------------------------8<------------------------------
>> >
>> > libceph: allow requests to return immediately on full
>> > conditions if caller wishes
>> >
>> > Right now, cephfs will cancel any in-flight OSD write operations when a
>> > new map comes in that shows the OSD or pool as full, but nothing
>> > prevents new requests from stalling out after that point.
>> >
>> > If the caller knows that it will want an immediate error return instead
>> > of blocking on a full or at-quota error condition then allow it to set a
>> > flag to request that behavior. Cephfs write requests will always set
>> > that flag.
>> >
>> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > ---
>> > fs/ceph/addr.c | 4 ++++
>> > fs/ceph/file.c | 4 ++++
>> > include/linux/ceph/osd_client.h | 1 +
>> > net/ceph/osd_client.c | 6 ++++++
>> > 4 files changed, 15 insertions(+)
>> >
>> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
>> > index 4547bbf80e4f..ef9c9bae7460 100644
>> > --- a/fs/ceph/addr.c
>> > +++ b/fs/ceph/addr.c
>> > @@ -1040,6 +1040,7 @@ static int ceph_writepages_start(struct address_space *mapping,
>> >
>> > req->r_callback = writepages_finish;
>> > req->r_inode = inode;
>> > + req->r_enospc_on_full = true;
>> >
>> > /* Format the osd request message and submit the write */
>> > len = 0;
>> > @@ -1689,6 +1690,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
>> > }
>> >
>> > req->r_mtime = inode->i_mtime;
>> > + req->r_enospc_on_full = true;
>> > err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>> > if (!err)
>> > err = ceph_osdc_wait_request(&fsc->client->osdc, req);
>> > @@ -1732,6 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
>> > }
>> >
>> > req->r_mtime = inode->i_mtime;
>> > + req->r_enospc_on_full = true;
>> > err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>> > if (!err)
>> > err = ceph_osdc_wait_request(&fsc->client->osdc, req);
>> > @@ -1893,6 +1896,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
>> > err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
>> >
>> > wr_req->r_mtime = ci->vfs_inode.i_mtime;
>> > + wr_req->r_enospc_on_full = true;
>> > err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
>> >
>> > if (!err)
>> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> > index a91a4f1fc837..eaed17f90d5f 100644
>> > --- a/fs/ceph/file.c
>> > +++ b/fs/ceph/file.c
>> > @@ -714,6 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
>> > req->r_callback = ceph_aio_complete_req;
>> > req->r_inode = inode;
>> > req->r_priv = aio_req;
>> > + req->r_enospc_on_full = true;
>> >
>> > ret = ceph_osdc_start_request(req->r_osdc, req, false);
>> > out:
>> > @@ -912,6 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>> >
>> > osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
>> > req->r_mtime = mtime;
>> > + req->r_enospc_on_full = true;
>> > }
>> >
>> > osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
>> > @@ -1105,6 +1107,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
>> > false, true);
>> >
>> > req->r_mtime = mtime;
>> > + req->r_enospc_on_full = true;
>> > ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>> > if (!ret)
>> > ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
>> > @@ -1557,6 +1560,7 @@ static int ceph_zero_partial_object(struct inode *inode,
>> > }
>> >
>> > req->r_mtime = inode->i_mtime;
>> > + req->r_enospc_on_full = true;
>> > ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>> > if (!ret) {
>> > ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
>> > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>> > index 17bf1873bb01..f01e93ff03d5 100644
>> > --- a/include/linux/ceph/osd_client.h
>> > +++ b/include/linux/ceph/osd_client.h
>> > @@ -172,6 +172,7 @@ struct ceph_osd_request {
>> >
>> > int r_result;
>> > bool r_got_reply;
>> > + bool r_enospc_on_full; /* return ENOSPC when full */
>> >
>> > struct ceph_osd_client *r_osdc;
>> > struct kref r_kref;
>> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> > index d61d7a79fdb3..9f40d11b3c68 100644
>> > --- a/net/ceph/osd_client.c
>> > +++ b/net/ceph/osd_client.c
>> > @@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
>> > struct ceph_osd_linger_request *lreq);
>> > static void unlink_linger(struct ceph_osd *osd,
>> > struct ceph_osd_linger_request *lreq);
>> > +static void complete_request(struct ceph_osd_request *req, int err);
>> >
>> > #if 1
>> > static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
>> > @@ -1643,6 +1644,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
>> > enum calc_target_result ct_res;
>> > bool need_send = false;
>> > bool promoted = false;
>> > + int ret = 0;
>> >
>> > WARN_ON(req->r_tid || req->r_got_reply);
>> > dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
>> > @@ -1683,6 +1685,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
>> > pr_warn_ratelimited("FULL or reached pool quota\n");
>> > req->r_t.paused = true;
>> > maybe_request_map(osdc);
>> > + if (req->r_enospc_on_full)
>> > + ret = -ENOSPC;
>> > } else if (!osd_homeless(osd)) {
>> > need_send = true;
>> > } else {
>> > @@ -1699,6 +1703,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
>> > link_request(osd, req);
>> > if (need_send)
>> > send_request(req);
>> > + else if (ret)
>> > + complete_request(req, ret);
>>
>> How is this handled in the userspace client? I don't see a similar
>> check in Objecter.
>>
>> Thanks,
>>
>> Ilya
>
> It seems to be handled at a much higher layer in libcephfs. In _write(),
> for instance, we have:
>
> if (objecter->osdmap_pool_full(in->layout.pool_id)) {
> return -ENOSPC;
> }
>
> ...and there's also some EDQUOT handling a little below there as well.
>
> I don't think we can reasonably follow that model in the kernel client
> though. The way it's done in userland seems to require the big client
> mutex be held over large swaths of the code, and that's not the case in
> the kernel client (thankfully).
All it's doing is checking a flag under a lock; the same lock is taken
in ceph_osdc_start_request(). Why would exporting
bool ceph_osdc_pool_full(osdc, pool_id)
{
bool ret;
down_read(&osdc->lock);
ret = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
pool_full(osdc, pool_id);
up_read(&osdc->lock);
return ret;
}
not work? IOW why special case request handling code if you know in
advance that the request is a goner?
Thanks,
Ilya
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 17:05 ` Ilya Dryomov
@ 2017-02-06 18:30 ` Jeff Layton
2017-02-06 21:46 ` Jeff Layton
0 siblings, 1 reply; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 18:30 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Ceph Development, Yan, Zheng, Sage Weil, John Spray
On Mon, 2017-02-06 at 18:05 +0100, Ilya Dryomov wrote:
> On Mon, Feb 6, 2017 at 5:36 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > On Mon, 2017-02-06 at 17:27 +0100, Ilya Dryomov wrote:
> > > On Mon, Feb 6, 2017 at 4:49 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > > > On Mon, 2017-02-06 at 15:09 +0100, Ilya Dryomov wrote:
> > > >
> > > > [...]
> > > >
> > > > > > diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> > > > > > index 5c0da61cb763..def43570a85a 100644
> > > > > > --- a/include/linux/ceph/rados.h
> > > > > > +++ b/include/linux/ceph/rados.h
> > > > > > @@ -401,6 +401,7 @@ enum {
> > > > > > CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
> > > > > > CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
> > > > > > CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
> > > > > > + CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
> > > > >
> > > > > Is this a new flag? This is the wire protocol and I don't see it in
> > > > > ceph.git.
> > > > >
> > > > > I'll look at epoch_barrier and callback stuff later.
> > > > >
> > > > > Thanks,
> > > > >
> > > > > Ilya
> > > >
> > > > Here's a respun version of the last patch in the set. This should avoid
> > > > adding an on-the-wire flag. I just added a new bool and changed the
> > > > code to set and look at that to indicate the desire for an immediate
> > > > error return in this case. Compiles but is otherwise untested. I'll give
> > > > it a go in a bit.
> > > >
> > > > -----------------------------8<------------------------------
> > > >
> > > > libceph: allow requests to return immediately on full
> > > > conditions if caller wishes
> > > >
> > > > Right now, cephfs will cancel any in-flight OSD write operations when a
> > > > new map comes in that shows the OSD or pool as full, but nothing
> > > > prevents new requests from stalling out after that point.
> > > >
> > > > If the caller knows that it will want an immediate error return instead
> > > > of blocking on a full or at-quota error condition then allow it to set a
> > > > flag to request that behavior. Cephfs write requests will always set
> > > > that flag.
> > > >
> > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > ---
> > > > fs/ceph/addr.c | 4 ++++
> > > > fs/ceph/file.c | 4 ++++
> > > > include/linux/ceph/osd_client.h | 1 +
> > > > net/ceph/osd_client.c | 6 ++++++
> > > > 4 files changed, 15 insertions(+)
> > > >
> > > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > > > index 4547bbf80e4f..ef9c9bae7460 100644
> > > > --- a/fs/ceph/addr.c
> > > > +++ b/fs/ceph/addr.c
> > > > @@ -1040,6 +1040,7 @@ static int ceph_writepages_start(struct address_space *mapping,
> > > >
> > > > req->r_callback = writepages_finish;
> > > > req->r_inode = inode;
> > > > + req->r_enospc_on_full = true;
> > > >
> > > > /* Format the osd request message and submit the write */
> > > > len = 0;
> > > > @@ -1689,6 +1690,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> > > > }
> > > >
> > > > req->r_mtime = inode->i_mtime;
> > > > + req->r_enospc_on_full = true;
> > > > err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > > > if (!err)
> > > > err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > > > @@ -1732,6 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> > > > }
> > > >
> > > > req->r_mtime = inode->i_mtime;
> > > > + req->r_enospc_on_full = true;
> > > > err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > > > if (!err)
> > > > err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > > > @@ -1893,6 +1896,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
> > > > err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
> > > >
> > > > wr_req->r_mtime = ci->vfs_inode.i_mtime;
> > > > + wr_req->r_enospc_on_full = true;
> > > > err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
> > > >
> > > > if (!err)
> > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > index a91a4f1fc837..eaed17f90d5f 100644
> > > > --- a/fs/ceph/file.c
> > > > +++ b/fs/ceph/file.c
> > > > @@ -714,6 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
> > > > req->r_callback = ceph_aio_complete_req;
> > > > req->r_inode = inode;
> > > > req->r_priv = aio_req;
> > > > + req->r_enospc_on_full = true;
> > > >
> > > > ret = ceph_osdc_start_request(req->r_osdc, req, false);
> > > > out:
> > > > @@ -912,6 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
> > > >
> > > > osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
> > > > req->r_mtime = mtime;
> > > > + req->r_enospc_on_full = true;
> > > > }
> > > >
> > > > osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
> > > > @@ -1105,6 +1107,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
> > > > false, true);
> > > >
> > > > req->r_mtime = mtime;
> > > > + req->r_enospc_on_full = true;
> > > > ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > > > if (!ret)
> > > > ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > > > @@ -1557,6 +1560,7 @@ static int ceph_zero_partial_object(struct inode *inode,
> > > > }
> > > >
> > > > req->r_mtime = inode->i_mtime;
> > > > + req->r_enospc_on_full = true;
> > > > ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > > > if (!ret) {
> > > > ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > > > index 17bf1873bb01..f01e93ff03d5 100644
> > > > --- a/include/linux/ceph/osd_client.h
> > > > +++ b/include/linux/ceph/osd_client.h
> > > > @@ -172,6 +172,7 @@ struct ceph_osd_request {
> > > >
> > > > int r_result;
> > > > bool r_got_reply;
> > > > + bool r_enospc_on_full; /* return ENOSPC when full */
> > > >
> > > > struct ceph_osd_client *r_osdc;
> > > > struct kref r_kref;
> > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > > > index d61d7a79fdb3..9f40d11b3c68 100644
> > > > --- a/net/ceph/osd_client.c
> > > > +++ b/net/ceph/osd_client.c
> > > > @@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
> > > > struct ceph_osd_linger_request *lreq);
> > > > static void unlink_linger(struct ceph_osd *osd,
> > > > struct ceph_osd_linger_request *lreq);
> > > > +static void complete_request(struct ceph_osd_request *req, int err);
> > > >
> > > > #if 1
> > > > static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
> > > > @@ -1643,6 +1644,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > > > enum calc_target_result ct_res;
> > > > bool need_send = false;
> > > > bool promoted = false;
> > > > + int ret = 0;
> > > >
> > > > WARN_ON(req->r_tid || req->r_got_reply);
> > > > dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
> > > > @@ -1683,6 +1685,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > > > pr_warn_ratelimited("FULL or reached pool quota\n");
> > > > req->r_t.paused = true;
> > > > maybe_request_map(osdc);
> > > > + if (req->r_enospc_on_full)
> > > > + ret = -ENOSPC;
> > > > } else if (!osd_homeless(osd)) {
> > > > need_send = true;
> > > > } else {
> > > > @@ -1699,6 +1703,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > > > link_request(osd, req);
> > > > if (need_send)
> > > > send_request(req);
> > > > + else if (ret)
> > > > + complete_request(req, ret);
> > >
> > > How is this handled in the userspace client? I don't see a similar
> > > check in Objecter.
> > >
> > > Thanks,
> > >
> > > Ilya
> >
> > It seems to be handled at a much higher layer in libcephfs. In _write(),
> > for instance, we have:
> >
> > if (objecter->osdmap_pool_full(in->layout.pool_id)) {
> > return -ENOSPC;
> > }
> >
> > ...and there's also some EDQUOT handling a little below there as well.
> >
> > I don't think we can reasonably follow that model in the kernel client
> > though. The way it's done in userland seems to require the big client
> > mutex be held over large swaths of the code, and that's not the case in
> > the kernel client (thankfully).
>
> All it's doing is checking a flag under a lock; the same lock is taken
> in ceph_osdc_start_request(). Why would exporting
>
> bool ceph_osdc_pool_full(osdc, pool_id)
> {
> bool ret;
>
> down_read(&osdc->lock);
> ret = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
> pool_full(osdc, pool_id);
> up_read(&osdc->lock);
> return ret;
> }
>
> not work? IOW why special case request handling code if you know in
> advance that the request is a goner?
>
You can't just call that before calling submit_request because once you
call up_read there, the map could have changed and the map_cb already
run. At that point we can't rely on the callback cancelling the now
hung request.
You could call that function after ceph_osdc_start_request, but before
we end up waiting on it (or whatever), but then we'd need to export
complete_request as well so we could cancel it. The other downside
there is that we end up with even more rwsem thrashing. We already end
up taking it for write in order to submit the thing, so having to take
it for read again afterward is not ideal.
We could remedy that by pushing the rwsem handling up into cephfs
(basically, export submit_request too), but that smells more like a
layering violation to me. The full flag is part of the osdmap, so it
doesn't seem wrong to have libceph handle it, and that's quite a bit
simpler and more efficient than trying to manage that from the cephfs
layer.
--
Jeff Layton <jlayton@redhat.com>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 6/6] libceph: allow requests to return immediately on full conditions if caller wishes
2017-02-06 18:30 ` Jeff Layton
@ 2017-02-06 21:46 ` Jeff Layton
0 siblings, 0 replies; 15+ messages in thread
From: Jeff Layton @ 2017-02-06 21:46 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Ceph Development, Yan, Zheng, Sage Weil, John Spray
On Mon, 2017-02-06 at 13:30 -0500, Jeff Layton wrote:
> On Mon, 2017-02-06 at 18:05 +0100, Ilya Dryomov wrote:
> > On Mon, Feb 6, 2017 at 5:36 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > > On Mon, 2017-02-06 at 17:27 +0100, Ilya Dryomov wrote:
> > > > On Mon, Feb 6, 2017 at 4:49 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > > > > On Mon, 2017-02-06 at 15:09 +0100, Ilya Dryomov wrote:
> > > > >
> > > > > [...]
> > > > >
> > > > > > > diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> > > > > > > index 5c0da61cb763..def43570a85a 100644
> > > > > > > --- a/include/linux/ceph/rados.h
> > > > > > > +++ b/include/linux/ceph/rados.h
> > > > > > > @@ -401,6 +401,7 @@ enum {
> > > > > > > CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
> > > > > > > CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
> > > > > > > CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
> > > > > > > + CEPH_OSD_FLAG_FULL_CANCEL = 0x2000000, /* cancel operation on full flag */
> > > > > >
> > > > > > Is this a new flag? This is the wire protocol and I don't see it in
> > > > > > ceph.git.
> > > > > >
> > > > > > I'll look at epoch_barrier and callback stuff later.
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > Ilya
> > > > >
> > > > > Here's a respun version of the last patch in the set. This should avoid
> > > > > adding an on-the-wire flag. I just added a new bool and changed the
> > > > > code to set and look at that to indicate the desire for an immediate
> > > > > error return in this case. Compiles but is otherwise untested. I'll give
> > > > > it a go in a bit.
> > > > >
> > > > > -----------------------------8<------------------------------
> > > > >
> > > > > libceph: allow requests to return immediately on full
> > > > > conditions if caller wishes
> > > > >
> > > > > Right now, cephfs will cancel any in-flight OSD write operations when a
> > > > > new map comes in that shows the OSD or pool as full, but nothing
> > > > > prevents new requests from stalling out after that point.
> > > > >
> > > > > If the caller knows that it will want an immediate error return instead
> > > > > of blocking on a full or at-quota error condition then allow it to set a
> > > > > flag to request that behavior. Cephfs write requests will always set
> > > > > that flag.
> > > > >
> > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > ---
> > > > > fs/ceph/addr.c | 4 ++++
> > > > > fs/ceph/file.c | 4 ++++
> > > > > include/linux/ceph/osd_client.h | 1 +
> > > > > net/ceph/osd_client.c | 6 ++++++
> > > > > 4 files changed, 15 insertions(+)
> > > > >
> > > > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > > > > index 4547bbf80e4f..ef9c9bae7460 100644
> > > > > --- a/fs/ceph/addr.c
> > > > > +++ b/fs/ceph/addr.c
> > > > > @@ -1040,6 +1040,7 @@ static int ceph_writepages_start(struct address_space *mapping,
> > > > >
> > > > > req->r_callback = writepages_finish;
> > > > > req->r_inode = inode;
> > > > > + req->r_enospc_on_full = true;
> > > > >
> > > > > /* Format the osd request message and submit the write */
> > > > > len = 0;
> > > > > @@ -1689,6 +1690,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> > > > > }
> > > > >
> > > > > req->r_mtime = inode->i_mtime;
> > > > > + req->r_enospc_on_full = true;
> > > > > err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > > > > if (!err)
> > > > > err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > > > > @@ -1732,6 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
> > > > > }
> > > > >
> > > > > req->r_mtime = inode->i_mtime;
> > > > > + req->r_enospc_on_full = true;
> > > > > err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > > > > if (!err)
> > > > > err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > > > > @@ -1893,6 +1896,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
> > > > > err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
> > > > >
> > > > > wr_req->r_mtime = ci->vfs_inode.i_mtime;
> > > > > + wr_req->r_enospc_on_full = true;
> > > > > err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
> > > > >
> > > > > if (!err)
> > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > > index a91a4f1fc837..eaed17f90d5f 100644
> > > > > --- a/fs/ceph/file.c
> > > > > +++ b/fs/ceph/file.c
> > > > > @@ -714,6 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
> > > > > req->r_callback = ceph_aio_complete_req;
> > > > > req->r_inode = inode;
> > > > > req->r_priv = aio_req;
> > > > > + req->r_enospc_on_full = true;
> > > > >
> > > > > ret = ceph_osdc_start_request(req->r_osdc, req, false);
> > > > > out:
> > > > > @@ -912,6 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
> > > > >
> > > > > osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
> > > > > req->r_mtime = mtime;
> > > > > + req->r_enospc_on_full = true;
> > > > > }
> > > > >
> > > > > osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
> > > > > @@ -1105,6 +1107,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
> > > > > false, true);
> > > > >
> > > > > req->r_mtime = mtime;
> > > > > + req->r_enospc_on_full = true;
> > > > > ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > > > > if (!ret)
> > > > > ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > > > > @@ -1557,6 +1560,7 @@ static int ceph_zero_partial_object(struct inode *inode,
> > > > > }
> > > > >
> > > > > req->r_mtime = inode->i_mtime;
> > > > > + req->r_enospc_on_full = true;
> > > > > ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> > > > > if (!ret) {
> > > > > ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > > > > index 17bf1873bb01..f01e93ff03d5 100644
> > > > > --- a/include/linux/ceph/osd_client.h
> > > > > +++ b/include/linux/ceph/osd_client.h
> > > > > @@ -172,6 +172,7 @@ struct ceph_osd_request {
> > > > >
> > > > > int r_result;
> > > > > bool r_got_reply;
> > > > > + bool r_enospc_on_full; /* return ENOSPC when full */
> > > > >
> > > > > struct ceph_osd_client *r_osdc;
> > > > > struct kref r_kref;
> > > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > > > > index d61d7a79fdb3..9f40d11b3c68 100644
> > > > > --- a/net/ceph/osd_client.c
> > > > > +++ b/net/ceph/osd_client.c
> > > > > @@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
> > > > > struct ceph_osd_linger_request *lreq);
> > > > > static void unlink_linger(struct ceph_osd *osd,
> > > > > struct ceph_osd_linger_request *lreq);
> > > > > +static void complete_request(struct ceph_osd_request *req, int err);
> > > > >
> > > > > #if 1
> > > > > static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
> > > > > @@ -1643,6 +1644,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > > > > enum calc_target_result ct_res;
> > > > > bool need_send = false;
> > > > > bool promoted = false;
> > > > > + int ret = 0;
> > > > >
> > > > > WARN_ON(req->r_tid || req->r_got_reply);
> > > > > dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
> > > > > @@ -1683,6 +1685,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > > > > pr_warn_ratelimited("FULL or reached pool quota\n");
> > > > > req->r_t.paused = true;
> > > > > maybe_request_map(osdc);
> > > > > + if (req->r_enospc_on_full)
> > > > > + ret = -ENOSPC;
> > > > > } else if (!osd_homeless(osd)) {
> > > > > need_send = true;
> > > > > } else {
> > > > > @@ -1699,6 +1703,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
> > > > > link_request(osd, req);
> > > > > if (need_send)
> > > > > send_request(req);
> > > > > + else if (ret)
> > > > > + complete_request(req, ret);
> > > >
> > > > How is this handled in the userspace client? I don't see a similar
> > > > check in Objecter.
> > > >
> > > > Thanks,
> > > >
> > > > Ilya
> > >
> > > It seems to be handled at a much higher layer in libcephfs. In _write(),
> > > for instance, we have:
> > >
> > > if (objecter->osdmap_pool_full(in->layout.pool_id)) {
> > > return -ENOSPC;
> > > }
> > >
> > > ...and there's also some EDQUOT handling a little below there as well.
> > >
> > > I don't think we can reasonably follow that model in the kernel client
> > > though. The way it's done in userland seems to require the big client
> > > mutex be held over large swaths of the code, and that's not the case in
> > > the kernel client (thankfully).
> >
> > All it's doing is checking a flag under a lock; the same lock is taken
> > in ceph_osdc_start_request(). Why would exporting
> >
> > bool ceph_osdc_pool_full(osdc, pool_id)
> > {
> > bool ret;
> >
> > down_read(&osdc->lock);
> > ret = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
> > pool_full(osdc, pool_id);
> > up_read(&osdc->lock);
> > return ret;
> > }
> >
> > not work? IOW why special case request handling code if you know in
> > advance that the request is a goner?
> >
>
>
> You can't just call that before calling submit_request because once you
> call up_read there, the map could have changed and the map_cb already
> run. At that point we can't rely on the callback cancelling the now
> hung request.
>
> You could call that function after ceph_osdc_start_request, but before
> we end up waiting on it (or whatever), but then we'd need to export
> complete_request as well so we could cancel it. The other downside
> there is that we end up with even more rwsem thrashing. We already end
> up taking it for write in order to submit the thing, so having to take
> it for read again afterward is not ideal.
>
> We could remedy that by pushing the rwsem handling up into cephfs
> (basically, export submit_request too), but that smells more like a
> layering violation to me. The full flag is part of the osdmap, so it
> doesn't seem wrong to have libceph handle it, and that's quite a bit
> simpler and more efficient than trying to manage that from the cephfs
> layer.
That said...I think it is better to use this new flag in a more
coherent fashion in the complete_writes callback. I've done some
cleanup and slimdown of the set today, so you may want to hold off
reviewing this just yet, and wait until I repost.
Thanks,
--
Jeff Layton <jlayton@redhat.com>
^ permalink raw reply [flat|nested] 15+ messages in thread