From: xiubli@redhat.com
To: idryomov@gmail.com, ceph-devel@vger.kernel.org
Cc: jlayton@kernel.org, lhenriques@suse.de, vshankar@redhat.com,
mchangir@redhat.com, Xiubo Li <xiubli@redhat.com>
Subject: [PATCH v16 03/68] libceph: add sparse read support to msgr2 crc state machine
Date: Mon, 27 Feb 2023 11:27:08 +0800 [thread overview]
Message-ID: <20230227032813.337906-4-xiubli@redhat.com> (raw)
In-Reply-To: <20230227032813.337906-1-xiubli@redhat.com>
From: Jeff Layton <jlayton@kernel.org>
Add support for a new sparse_read ceph_connection operation. The idea is
that the client driver can define this operation use it to do special
handling for incoming reads.
The alloc_msg routine will look at the request and determine whether the
reply is expected to be sparse. If it is, then we'll dispatch to a
different set of state machine states that will repeatedly call the
driver's sparse_read op to get length and placement info for reading the
extent map, and the extents themselves.
This necessitates adding some new field to some other structs:
- The msg gets a new bool to track whether it's a sparse_read request.
- A new field is added to the cursor to track the amount remaining in the
current extent. This is used to cap the read from the socket into the
msg_data
- Handing a revoke with all of this is particularly difficult, so I've
added a new data_len_remain field to the v2 connection info, and then
use that to skip that much on a revoke. We may want to expand the use of
that to the normal read path as well, just for consistency's sake.
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
include/linux/ceph/messenger.h | 28 ++++++
net/ceph/messenger.c | 1 +
net/ceph/messenger_v2.c | 167 +++++++++++++++++++++++++++++++--
3 files changed, 187 insertions(+), 9 deletions(-)
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 99c1726be6ee..8a6938fa324e 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -17,6 +17,7 @@
struct ceph_msg;
struct ceph_connection;
+struct ceph_msg_data_cursor;
/*
* Ceph defines these callbacks for handling connection events.
@@ -70,6 +71,30 @@ struct ceph_connection_operations {
int used_proto, int result,
const int *allowed_protos, int proto_cnt,
const int *allowed_modes, int mode_cnt);
+
+ /**
+ * sparse_read: read sparse data
+ * @con: connection we're reading from
+ * @cursor: data cursor for reading extents
+ * @buf: optional buffer to read into
+ *
+ * This should be called more than once, each time setting up to
+ * receive an extent into the current cursor position, and zeroing
+ * the holes between them.
+ *
+ * Returns amount of data to be read (in bytes), 0 if reading is
+ * complete, or -errno if there was an error.
+ *
+ * If @buf is set on a >0 return, then the data should be read into
+ * the provided buffer. Otherwise, it should be read into the cursor.
+ *
+ * The sparse read operation is expected to initialize the cursor
+ * with a length covering up to the end of the last extent.
+ */
+ int (*sparse_read)(struct ceph_connection *con,
+ struct ceph_msg_data_cursor *cursor,
+ char **buf);
+
};
/* use format string %s%lld */
@@ -207,6 +232,7 @@ struct ceph_msg_data_cursor {
struct ceph_msg_data *data; /* current data item */
size_t resid; /* bytes not yet consumed */
+ int sr_resid; /* residual sparse_read len */
bool need_crc; /* crc update needed */
union {
#ifdef CONFIG_BLOCK
@@ -251,6 +277,7 @@ struct ceph_msg {
struct kref kref;
bool more_to_follow;
bool needs_out_seq;
+ bool sparse_read;
int front_alloc_len;
struct ceph_msgpool *pool;
@@ -395,6 +422,7 @@ struct ceph_connection_v2_info {
void *conn_bufs[16];
int conn_buf_cnt;
+ int data_len_remain;
struct kvec in_sign_kvecs[8];
struct kvec out_sign_kvecs[8];
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1d06e114ba3f..7403a3a133cd 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1009,6 +1009,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
cursor->total_resid = length;
cursor->data = msg->data;
+ cursor->sr_resid = 0;
__ceph_msg_data_cursor_init(cursor);
}
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index 3009028c4fa2..027c0e439db8 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -52,14 +52,16 @@
#define FRAME_LATE_STATUS_COMPLETE 0xe
#define FRAME_LATE_STATUS_ABORTED_MASK 0xf
-#define IN_S_HANDLE_PREAMBLE 1
-#define IN_S_HANDLE_CONTROL 2
-#define IN_S_HANDLE_CONTROL_REMAINDER 3
-#define IN_S_PREPARE_READ_DATA 4
-#define IN_S_PREPARE_READ_DATA_CONT 5
-#define IN_S_PREPARE_READ_ENC_PAGE 6
-#define IN_S_HANDLE_EPILOGUE 7
-#define IN_S_FINISH_SKIP 8
+#define IN_S_HANDLE_PREAMBLE 1
+#define IN_S_HANDLE_CONTROL 2
+#define IN_S_HANDLE_CONTROL_REMAINDER 3
+#define IN_S_PREPARE_READ_DATA 4
+#define IN_S_PREPARE_READ_DATA_CONT 5
+#define IN_S_PREPARE_READ_ENC_PAGE 6
+#define IN_S_PREPARE_SPARSE_DATA 7
+#define IN_S_PREPARE_SPARSE_DATA_CONT 8
+#define IN_S_HANDLE_EPILOGUE 9
+#define IN_S_FINISH_SKIP 10
#define OUT_S_QUEUE_DATA 1
#define OUT_S_QUEUE_DATA_CONT 2
@@ -1819,6 +1821,123 @@ static void prepare_read_data_cont(struct ceph_connection *con)
con->v2.in_state = IN_S_HANDLE_EPILOGUE;
}
+static int prepare_sparse_read_cont(struct ceph_connection *con)
+{
+ int ret;
+ struct bio_vec bv;
+ char *buf = NULL;
+ struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor;
+
+ WARN_ON(con->v2.in_state != IN_S_PREPARE_SPARSE_DATA_CONT);
+
+ if (iov_iter_is_bvec(&con->v2.in_iter)) {
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ con->in_data_crc = crc32c(con->in_data_crc,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+ get_bvec_at(cursor, &bv);
+ memcpy_to_page(bv.bv_page, bv.bv_offset,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+ } else {
+ con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+ con->v2.in_bvec.bv_page,
+ con->v2.in_bvec.bv_offset,
+ con->v2.in_bvec.bv_len);
+ }
+
+ ceph_msg_data_advance(cursor, con->v2.in_bvec.bv_len);
+ cursor->sr_resid -= con->v2.in_bvec.bv_len;
+ dout("%s: advance by 0x%x sr_resid 0x%x\n", __func__,
+ con->v2.in_bvec.bv_len, cursor->sr_resid);
+ WARN_ON_ONCE(cursor->sr_resid > cursor->total_resid);
+ if (cursor->sr_resid) {
+ get_bvec_at(cursor, &bv);
+ if (bv.bv_len > cursor->sr_resid)
+ bv.bv_len = cursor->sr_resid;
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ con->v2.data_len_remain -= bv.bv_len;
+ return 0;
+ }
+ } else if (iov_iter_is_kvec(&con->v2.in_iter)) {
+ /* On first call, we have no kvec so don't compute crc */
+ if (con->v2.in_kvec_cnt) {
+ WARN_ON_ONCE(con->v2.in_kvec_cnt > 1);
+ con->in_data_crc = crc32c(con->in_data_crc,
+ con->v2.in_kvecs[0].iov_base,
+ con->v2.in_kvecs[0].iov_len);
+ }
+ } else {
+ return -EIO;
+ }
+
+ /* get next extent */
+ ret = con->ops->sparse_read(con, cursor, &buf);
+ if (ret <= 0) {
+ if (ret < 0)
+ return ret;
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+ return 0;
+ }
+
+ if (buf) {
+ /* receive into buffer */
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, ret);
+ con->v2.data_len_remain -= ret;
+ return 0;
+ }
+
+ if (ret > cursor->total_resid) {
+ pr_warn("%s: ret 0x%x total_resid 0x%zx resid 0x%zx\n",
+ __func__, ret, cursor->total_resid, cursor->resid);
+ return -EIO;
+ }
+ get_bvec_at(cursor, &bv);
+ if (bv.bv_len > cursor->sr_resid)
+ bv.bv_len = cursor->sr_resid;
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ if (unlikely(!con->bounce_page)) {
+ con->bounce_page = alloc_page(GFP_NOIO);
+ if (!con->bounce_page) {
+ pr_err("failed to allocate bounce page\n");
+ return -ENOMEM;
+ }
+ }
+
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ con->v2.data_len_remain -= ret;
+ return ret;
+}
+
+static int prepare_sparse_read_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->in_msg;
+
+ dout("%s: starting sparse read\n", __func__);
+
+ if (WARN_ON_ONCE(!con->ops->sparse_read))
+ return -EOPNOTSUPP;
+
+ if (!con_secure(con))
+ con->in_data_crc = -1;
+
+ reset_in_kvecs(con);
+ con->v2.in_state = IN_S_PREPARE_SPARSE_DATA_CONT;
+ con->v2.data_len_remain = data_len(msg);
+ return prepare_sparse_read_cont(con);
+}
+
static int prepare_read_tail_plain(struct ceph_connection *con)
{
struct ceph_msg *msg = con->in_msg;
@@ -1839,7 +1958,10 @@ static int prepare_read_tail_plain(struct ceph_connection *con)
}
if (data_len(msg)) {
- con->v2.in_state = IN_S_PREPARE_READ_DATA;
+ if (msg->sparse_read)
+ con->v2.in_state = IN_S_PREPARE_SPARSE_DATA;
+ else
+ con->v2.in_state = IN_S_PREPARE_READ_DATA;
} else {
add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
con->v2.in_state = IN_S_HANDLE_EPILOGUE;
@@ -2893,6 +3015,12 @@ static int populate_in_iter(struct ceph_connection *con)
prepare_read_enc_page(con);
ret = 0;
break;
+ case IN_S_PREPARE_SPARSE_DATA:
+ ret = prepare_sparse_read_data(con);
+ break;
+ case IN_S_PREPARE_SPARSE_DATA_CONT:
+ ret = prepare_sparse_read_cont(con);
+ break;
case IN_S_HANDLE_EPILOGUE:
ret = handle_epilogue(con);
break;
@@ -3485,6 +3613,23 @@ static void revoke_at_prepare_read_enc_page(struct ceph_connection *con)
con->v2.in_state = IN_S_FINISH_SKIP;
}
+static void revoke_at_prepare_sparse_data(struct ceph_connection *con)
+{
+ int resid; /* current piece of data */
+ int remaining;
+
+ WARN_ON(con_secure(con));
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ dout("%s con %p resid %d\n", __func__, con, resid);
+
+ remaining = CEPH_EPILOGUE_PLAIN_LEN + con->v2.data_len_remain;
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
static void revoke_at_handle_epilogue(struct ceph_connection *con)
{
int resid;
@@ -3501,6 +3646,7 @@ static void revoke_at_handle_epilogue(struct ceph_connection *con)
void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
{
switch (con->v2.in_state) {
+ case IN_S_PREPARE_SPARSE_DATA:
case IN_S_PREPARE_READ_DATA:
revoke_at_prepare_read_data(con);
break;
@@ -3510,6 +3656,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
case IN_S_PREPARE_READ_ENC_PAGE:
revoke_at_prepare_read_enc_page(con);
break;
+ case IN_S_PREPARE_SPARSE_DATA_CONT:
+ revoke_at_prepare_sparse_data(con);
+ break;
case IN_S_HANDLE_EPILOGUE:
revoke_at_handle_epilogue(con);
break;
--
2.31.1
next prev parent reply other threads:[~2023-02-27 3:29 UTC|newest]
Thread overview: 84+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-02-27 3:27 [PATCH v16 00/68] ceph+fscrypt: full support xiubli
2023-02-27 3:27 ` [PATCH v16 01/68] libceph: add spinlock around osd->o_requests xiubli
2023-02-27 3:27 ` [PATCH v16 02/68] libceph: define struct ceph_sparse_extent and add some helpers xiubli
2023-02-27 3:27 ` xiubli [this message]
2023-02-27 3:27 ` [PATCH v16 04/68] libceph: add sparse read support to OSD client xiubli
2023-02-27 3:27 ` [PATCH v16 05/68] libceph: support sparse reads on msgr2 secure codepath xiubli
2023-02-27 3:27 ` [PATCH v16 06/68] libceph: add sparse read support to msgr1 xiubli
2023-02-27 3:27 ` [PATCH v16 07/68] ceph: add new mount option to enable sparse reads xiubli
2023-02-27 3:27 ` [PATCH v16 08/68] ceph: preallocate inode for ops that may create one xiubli
2023-02-27 3:27 ` [PATCH v16 09/68] ceph: make ceph_msdc_build_path use ref-walk xiubli
2023-02-27 3:27 ` [PATCH v16 10/68] libceph: add new iov_iter-based ceph_msg_data_type and ceph_osd_data_type xiubli
2023-02-27 3:27 ` [PATCH v16 11/68] ceph: use osd_req_op_extent_osd_iter for netfs reads xiubli
2023-02-27 3:27 ` [PATCH v16 12/68] ceph: fscrypt_auth handling for ceph xiubli
2023-02-27 3:27 ` [PATCH v16 13/68] ceph: ensure that we accept a new context from MDS for new inodes xiubli
2023-02-27 3:27 ` [PATCH v16 14/68] ceph: add support for fscrypt_auth/fscrypt_file to cap messages xiubli
2023-02-27 3:27 ` [PATCH v16 15/68] ceph: implement -o test_dummy_encryption mount option xiubli
2023-02-27 3:27 ` [PATCH v16 16/68] ceph: decode alternate_name in lease info xiubli
2023-02-27 3:27 ` [PATCH v16 17/68] ceph: add fscrypt ioctls xiubli
2023-02-27 3:27 ` [PATCH v16 18/68] ceph: make the ioctl cmd more readable in debug log xiubli
2023-02-27 3:27 ` [PATCH v16 19/68] ceph: add base64 endcoding routines for encrypted names xiubli
2023-02-27 3:27 ` [PATCH v16 20/68] ceph: add encrypted fname handling to ceph_mdsc_build_path xiubli
2023-02-27 3:27 ` [PATCH v16 21/68] ceph: send altname in MClientRequest xiubli
2023-02-27 3:27 ` [PATCH v16 22/68] ceph: encode encrypted name in dentry release xiubli
2023-02-27 3:27 ` [PATCH v16 23/68] ceph: properly set DCACHE_NOKEY_NAME flag in lookup xiubli
2023-02-27 3:27 ` [PATCH v16 24/68] ceph: set DCACHE_NOKEY_NAME in atomic open xiubli
2023-02-27 3:27 ` [PATCH v16 25/68] ceph: make d_revalidate call fscrypt revalidator for encrypted dentries xiubli
2023-03-07 18:53 ` Luís Henriques
2023-03-08 1:50 ` Xiubo Li
2023-03-08 9:29 ` Luís Henriques
2023-03-08 10:42 ` Xiubo Li
2023-03-08 17:14 ` Luís Henriques
2023-03-08 17:54 ` Jeff Layton
2023-03-08 18:30 ` Luís Henriques
2023-03-08 19:32 ` Jeff Layton
2023-03-09 9:52 ` Luís Henriques
2023-03-09 7:06 ` Xiubo Li
2023-03-09 9:55 ` Luís Henriques
2023-03-09 11:41 ` Xiubo Li
2023-02-27 3:27 ` [PATCH v16 26/68] ceph: add helpers for converting names for userland presentation xiubli
2023-02-27 3:27 ` [PATCH v16 27/68] ceph: fix base64 encoded name's length check in ceph_fname_to_usr() xiubli
2023-02-27 3:27 ` [PATCH v16 28/68] ceph: add fscrypt support to ceph_fill_trace xiubli
2023-02-27 3:27 ` [PATCH v16 29/68] ceph: pass the request to parse_reply_info_readdir() xiubli
2023-02-27 3:27 ` [PATCH v16 30/68] ceph: add ceph_encode_encrypted_dname() helper xiubli
2023-02-27 3:27 ` [PATCH v16 31/68] ceph: add support to readdir for encrypted filenames xiubli
2023-02-27 3:27 ` [PATCH v16 32/68] ceph: create symlinks with encrypted and base64-encoded targets xiubli
2023-02-27 3:27 ` [PATCH v16 33/68] ceph: make ceph_get_name decrypt filenames xiubli
2023-02-27 3:27 ` [PATCH v16 34/68] ceph: add a new ceph.fscrypt.auth vxattr xiubli
2023-02-27 3:27 ` [PATCH v16 35/68] ceph: add some fscrypt guardrails xiubli
2023-02-27 3:27 ` [PATCH v16 36/68] ceph: allow encrypting a directory while not having Ax caps xiubli
2023-02-27 3:27 ` [PATCH v16 37/68] ceph: mark directory as non-complete after loading key xiubli
2023-02-27 3:27 ` [PATCH v16 38/68] ceph: don't allow changing layout on encrypted files/directories xiubli
2023-02-27 3:27 ` [PATCH v16 39/68] libceph: add CEPH_OSD_OP_ASSERT_VER support xiubli
2023-02-27 3:27 ` [PATCH v16 40/68] ceph: size handling for encrypted inodes in cap updates xiubli
2023-02-27 3:27 ` [PATCH v16 41/68] ceph: fscrypt_file field handling in MClientRequest messages xiubli
2023-02-27 3:27 ` [PATCH v16 42/68] ceph: get file size from fscrypt_file when present in inode traces xiubli
2023-02-27 3:27 ` [PATCH v16 43/68] ceph: handle fscrypt fields in cap messages from MDS xiubli
2023-02-27 3:27 ` [PATCH v16 44/68] ceph: update WARN_ON message to pr_warn xiubli
2023-02-27 3:27 ` [PATCH v16 45/68] ceph: add __ceph_get_caps helper support xiubli
2023-02-27 3:27 ` [PATCH v16 46/68] ceph: add __ceph_sync_read " xiubli
2023-02-27 3:27 ` [PATCH v16 47/68] ceph: add object version support for sync read xiubli
2023-02-27 3:27 ` [PATCH v16 48/68] ceph: add infrastructure for file encryption and decryption xiubli
2023-02-27 3:27 ` [PATCH v16 49/68] ceph: add truncate size handling support for fscrypt xiubli
2023-02-27 3:27 ` [PATCH v16 50/68] libceph: allow ceph_osdc_new_request to accept a multi-op read xiubli
2023-02-27 3:27 ` [PATCH v16 51/68] ceph: disable fallocate for encrypted inodes xiubli
2023-02-27 3:27 ` [PATCH v16 52/68] ceph: disable copy offload on " xiubli
2023-02-27 3:27 ` [PATCH v16 53/68] ceph: don't use special DIO path for " xiubli
2023-02-27 3:27 ` [PATCH v16 54/68] ceph: align data in pages in ceph_sync_write xiubli
2023-02-27 3:28 ` [PATCH v16 55/68] ceph: add read/modify/write to ceph_sync_write xiubli
2023-02-27 3:28 ` [PATCH v16 56/68] ceph: plumb in decryption during sync reads xiubli
2023-02-27 3:28 ` [PATCH v16 57/68] ceph: add fscrypt decryption support to ceph_netfs_issue_op xiubli
2023-02-27 3:28 ` [PATCH v16 58/68] ceph: set i_blkbits to crypto block size for encrypted inodes xiubli
2023-02-27 3:28 ` [PATCH v16 59/68] ceph: add encryption support to writepage xiubli
2023-02-27 3:28 ` [PATCH v16 60/68] ceph: fscrypt support for writepages xiubli
2023-02-27 3:28 ` [PATCH v16 61/68] ceph: invalidate pages when doing direct/sync writes xiubli
2023-02-27 3:28 ` [PATCH v16 62/68] ceph: add support for encrypted snapshot names xiubli
2023-02-27 3:28 ` [PATCH v16 63/68] ceph: add support for handling " xiubli
2023-02-27 3:28 ` [PATCH v16 64/68] ceph: update documentation regarding snapshot naming limitations xiubli
2023-02-27 3:28 ` [PATCH v16 65/68] ceph: prevent snapshots to be created in encrypted locked directories xiubli
2023-02-27 3:28 ` [PATCH v16 66/68] ceph: report STATX_ATTR_ENCRYPTED on encrypted inodes xiubli
2023-02-27 3:28 ` [PATCH v16 67/68] libceph: defer removing the req from osdc just after req->r_callback xiubli
2023-02-27 3:28 ` [PATCH v16 68/68] ceph: drop the messages from MDS when unmounting xiubli
2023-02-27 9:27 ` [PATCH v16 00/68] ceph+fscrypt: full support Luís Henriques
2023-02-27 9:58 ` Xiubo Li
2023-02-27 10:30 ` Luís Henriques
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230227032813.337906-4-xiubli@redhat.com \
--to=xiubli@redhat.com \
--cc=ceph-devel@vger.kernel.org \
--cc=idryomov@gmail.com \
--cc=jlayton@kernel.org \
--cc=lhenriques@suse.de \
--cc=mchangir@redhat.com \
--cc=vshankar@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).