All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3 v2] chunkd on-disk checksumming
@ 2010-07-17  5:46 Jeff Garzik
  2010-07-17  5:46 ` [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support Jeff Garzik
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Jeff Garzik @ 2010-07-17  5:46 UTC (permalink / raw)
  To: hail-devel


This patchset is rediffed and bug-fixed from previous posting.  Patches
are against hail.git commit 68f68e3630cae4bb2ca651b577c1d4d0292687b1.

See original posting for more info:
http://marc.info/?l=hail-devel&m=127921794109300&w=2

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support
  2010-07-17  5:46 [PATCH 0/3 v2] chunkd on-disk checksumming Jeff Garzik
@ 2010-07-17  5:46 ` Jeff Garzik
  2010-07-18  3:45   ` Steven Dake
  2010-07-17  5:47 ` [PATCH 2/3 v2] chunkd: Add checksum table to on-disk format, one sum per 64k of data Jeff Garzik
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 8+ messages in thread
From: Jeff Garzik @ 2010-07-17  5:46 UTC (permalink / raw)
  To: hail-devel


 chunkd/be-fs.c  |   60 --------------------------------------------------------
 chunkd/chunkd.h |   14 -------------
 chunkd/object.c |   31 ++++++++++++----------------
 chunkd/server.c |   28 --------------------------
 configure.ac    |    3 --
 5 files changed, 15 insertions(+), 121 deletions(-)
commit e87d06526fa2052a1cecbed65ec2fac263c5e7d8
Author: Jeff Garzik <jeff@garzik.org>
Date:   Sat Jul 17 00:26:41 2010 -0400

    chunkd: remove sendfile(2) zero-copy support
    
    chunkd will be soon checksumming data in main memory.  That removes
    the utility of a zero-copy interface which bypasses the on-heap
    data requirement.
    
    Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/chunkd/be-fs.c b/chunkd/be-fs.c
index 4b851a7..0a81134 100644
--- a/chunkd/be-fs.c
+++ b/chunkd/be-fs.c
@@ -25,9 +25,6 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/uio.h>
-#if defined(HAVE_SYS_SENDFILE_H)
-#include <sys/sendfile.h>
-#endif
 #include <stdlib.h>
 #include <unistd.h>
 #include <stdio.h>
@@ -52,7 +49,6 @@ struct fs_obj {
 
 	int			in_fd;
 	char			*in_fn;
-	off_t			sendfile_ofs;
 };
 
 struct be_fs_obj_hdr {
@@ -547,62 +543,6 @@ ssize_t fs_obj_write(struct backend_obj *bo, const void *ptr, size_t len)
 	return rc;
 }
 
-#if defined(HAVE_SENDFILE) && defined(__linux__)
-
-ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len)
-{
-	struct fs_obj *obj = bo->private;
-	ssize_t rc;
-
-	if (obj->sendfile_ofs == 0) {
-		obj->sendfile_ofs += sizeof(struct be_fs_obj_hdr);
-		obj->sendfile_ofs += bo->key_len;
-	}
-
-	rc = sendfile(out_fd, obj->in_fd, &obj->sendfile_ofs, len);
-	if (rc < 0)
-		applog(LOG_ERR, "obj sendfile(%s) failed: %s",
-		       obj->in_fn, strerror(errno));
-
-	return rc;
-}
-
-#elif defined(HAVE_SENDFILE) && defined(__FreeBSD__)
-
-ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len)
-{
-	struct fs_obj *obj = bo->private;
-	ssize_t rc;
-	off_t sbytes = 0;
-
-	if (obj->sendfile_ofs == 0) {
-		obj->sendfile_ofs += sizeof(struct be_fs_obj_hdr);
-		obj->sendfile_ofs += bo->key_len;
-	}
-
-	rc = sendfile(obj->in_fd, out_fd, obj->sendfile_ofs, len,
-		      NULL, &sbytes, 0);
-	if (rc < 0) {
-		applog(LOG_ERR, "obj sendfile(%s) failed: %s",
-		       obj->in_fn, strerror(errno));
-		return rc;
-	}
-
-	obj->sendfile_ofs += sbytes;
-
-	return sbytes;
-}
-
-#else
-
-ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len)
-{
-	applog(LOG_ERR, "BUG: sendfile used but not supported");
-	return -EOPNOTSUPP;
-}
-
-#endif /* HAVE_SENDFILE && HAVE_SYS_SENDFILE_H */
-
 bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
 			 unsigned char *md, bool sync_data)
 {
diff --git a/chunkd/chunkd.h b/chunkd/chunkd.h
index 72833f7..716704b 100644
--- a/chunkd/chunkd.h
+++ b/chunkd/chunkd.h
@@ -39,8 +39,6 @@ enum {
 	CLI_DATA_BUF_SZ		= 16 * 1024,
 
 	CHD_TRASH_MAX		= 1000,
-
-	CLI_MAX_SENDFILE_SZ	= 512 * 1024,
 };
 
 struct client;
@@ -54,7 +52,6 @@ struct client_write {
 	uint64_t		len;		/* write buffer length */
 	cli_write_func		cb;		/* callback */
 	void			*cb_data;	/* data passed to cb */
-	bool			sendfile;	/* using sendfile? */
 
 	struct list_head	node;
 };
@@ -268,7 +265,6 @@ extern bool fs_obj_delete(uint32_t table_id, const char *user,
 		          const void *kbuf, size_t klen,
 			  enum chunk_errcode *err_code);
 extern int fs_obj_disable(const char *fn);
-extern ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len);
 extern int fs_list_objs_open(struct fs_obj_lister *t,
 			     const char *root_path, uint32_t table_id);
 extern int fs_list_objs_next(struct fs_obj_lister *t, char **fnp);
@@ -323,7 +319,6 @@ extern void applog(int prio, const char *fmt, ...);
 extern bool cli_err(struct client *cli, enum chunk_errcode code, bool recycle_ok);
 extern int cli_writeq(struct client *cli, const void *buf, unsigned int buflen,
 		     cli_write_func cb, void *cb_data);
-extern bool cli_wr_sendfile(struct client *, cli_write_func);
 extern bool cli_rd_set_poll(struct client *cli, bool readable);
 extern void cli_wr_set_poll(struct client *cli, bool writable);
 extern bool cli_cb_free(struct client *cli, struct client_write *wr,
@@ -342,15 +337,6 @@ extern void read_config(void);
 /* selfcheck.c */
 extern int chk_spawn(TCHDB *hdb);
 
-static inline bool use_sendfile(struct client *cli)
-{
-#if defined(HAVE_SENDFILE) && defined(HAVE_SYS_SENDFILE_H)
-	return cli->ssl ? false : true;
-#else
-	return false;
-#endif
-}
-
 #ifndef HAVE_STRNLEN
 extern size_t strnlen(const char *s, size_t maxlen);
 #endif
diff --git a/chunkd/object.c b/chunkd/object.c
index d7d3cb6..a517558 100644
--- a/chunkd/object.c
+++ b/chunkd/object.c
@@ -253,28 +253,23 @@ void cli_in_end(struct client *cli)
 
 static bool object_read_bytes(struct client *cli)
 {
-	if (use_sendfile(cli)) {
-		if (!cli_wr_sendfile(cli, object_get_more))
-			return false;
-	} else {
-		ssize_t bytes;
+	ssize_t bytes;
 
-		bytes = fs_obj_read(cli->in_obj, cli->netbuf_out,
-				    MIN(cli->in_len, CLI_DATA_BUF_SZ));
-		if (bytes < 0)
-			return false;
-		if (bytes == 0 && cli->in_len != 0)
-			return false;
+	bytes = fs_obj_read(cli->in_obj, cli->netbuf_out,
+			    MIN(cli->in_len, CLI_DATA_BUF_SZ));
+	if (bytes < 0)
+		return false;
+	if (bytes == 0 && cli->in_len != 0)
+		return false;
 
-		cli->in_len -= bytes;
+	cli->in_len -= bytes;
 
-		if (!cli->in_len)
-			cli_in_end(cli);
+	if (!cli->in_len)
+		cli_in_end(cli);
 
-		if (cli_writeq(cli, cli->netbuf_out, bytes,
-			       cli->in_len ? object_get_more : NULL, NULL))
-			return false;
-	}
+	if (cli_writeq(cli, cli->netbuf_out, bytes,
+		       cli->in_len ? object_get_more : NULL, NULL))
+		return false;
 
 	return true;
 }
diff --git a/chunkd/server.c b/chunkd/server.c
index 3115592..833dcf8 100644
--- a/chunkd/server.c
+++ b/chunkd/server.c
@@ -504,14 +504,7 @@ restart:
 
 	/* execute non-blocking write */
 do_write:
-	if (tmp->sendfile) {
-		rc = fs_obj_sendfile(cli->in_obj, cli->fd,
-				     MIN(cli->in_len, CLI_MAX_SENDFILE_SZ));
-		if (rc < 0)
-			goto err_out;
-
-		cli->in_len -= rc;
-	} else if (cli->ssl) {
+	if (cli->ssl) {
 		rc = SSL_write(cli->ssl, tmp->buf, tmp->len);
 		if (rc <= 0) {
 			rc = SSL_get_error(cli->ssl, rc);
@@ -612,31 +605,12 @@ int cli_writeq(struct client *cli, const void *buf, unsigned int buflen,
 	wr->len = buflen;
 	wr->cb = cb;
 	wr->cb_data = cb_data;
-	wr->sendfile = false;
 
 	list_add_tail(&wr->node, &cli->write_q);
 
 	return 0;
 }
 
-bool cli_wr_sendfile(struct client *cli, cli_write_func cb)
-{
-	struct client_write *wr;
-
-	wr = calloc(1, sizeof(struct client_write));
-	if (!wr)
-		return false;
-
-	wr->len = cli->in_len;
-	wr->cb = cb;
-	wr->sendfile = true;
-	INIT_LIST_HEAD(&wr->node);
-
-	list_add_tail(&wr->node, &cli->write_q);
-
-	return true;
-}
-
 static int cli_read_data(struct client *cli, void *buf, size_t buflen)
 {
 	ssize_t rc;
diff --git a/configure.ac b/configure.ac
index 6b48f1e..db10242 100644
--- a/configure.ac
+++ b/configure.ac
@@ -65,7 +65,6 @@ AM_PROG_LIBTOOL
 dnl Checks for header files.
 AC_HEADER_STDC
 dnl AC_CHECK_HEADERS(sys/ioctl.h unistd.h)
-AC_CHECK_HEADERS(sys/sendfile.h sys/filio.h)
 AC_CHECK_HEADER(db.h,[],exit 1)
 
 dnl Checks for typedefs, structures, and compiler characteristics.
@@ -97,7 +96,7 @@ PKG_CHECK_MODULES(TOKYOCABINET, tokyocabinet)
 dnl -------------------------------------
 dnl Checks for optional library functions
 dnl -------------------------------------
-AC_CHECK_FUNCS(strnlen daemon memmem memrchr sendfile)
+AC_CHECK_FUNCS(strnlen daemon memmem memrchr)
 AC_CHECK_FUNC(xdr_sizeof,
 	[AC_DEFINE([HAVE_XDR_SIZEOF], [1],
 		[Define to 1 if you have xdr_sizeof.])],

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/3 v2] chunkd: Add checksum table to on-disk format, one sum per 64k of data
  2010-07-17  5:46 [PATCH 0/3 v2] chunkd on-disk checksumming Jeff Garzik
  2010-07-17  5:46 ` [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support Jeff Garzik
@ 2010-07-17  5:47 ` Jeff Garzik
  2010-07-17  5:47 ` [PATCH 3/3 v2] [chunkd] match network buffer size to checksum buffer size (64k) Jeff Garzik
  2010-07-18  7:09 ` [PATCH 4/4] chunkd checksums each block, as it is read from disk Jeff Garzik
  3 siblings, 0 replies; 8+ messages in thread
From: Jeff Garzik @ 2010-07-17  5:47 UTC (permalink / raw)
  To: hail-devel


 chunkd/be-fs.c  |  145 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 chunkd/chunkd.h |    3 +
 2 files changed, 131 insertions(+), 17 deletions(-)
commit 394109d5c2fc2d15d91c2d36eecd57594922c1b3
Author: Jeff Garzik <jeff@garzik.org>
Date:   Sat Jul 17 01:05:15 2010 -0400

    chunkd: Add checksum table to on-disk format, one sum per 64k of data
    
    Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/chunkd/be-fs.c b/chunkd/be-fs.c
index 0a81134..5955afa 100644
--- a/chunkd/be-fs.c
+++ b/chunkd/be-fs.c
@@ -49,14 +49,23 @@ struct fs_obj {
 
 	int			in_fd;
 	char			*in_fn;
+
+	size_t			checked_bytes;
+	SHA_CTX			checksum;
+	unsigned int		csum_idx;
+	void			*csum_tbl;
+	size_t			csum_tbl_sz;
+
+	unsigned int		n_blk;
 };
 
 struct be_fs_obj_hdr {
 	char			magic[4];
 	uint32_t		key_len;
 	uint64_t		value_len;
+	uint32_t		n_blk;
 
-	char			reserved[16];
+	char			reserved[12];
 
 	unsigned char		hash[CHD_CSUM_SZ];
 	char			owner[128];
@@ -204,6 +213,8 @@ static struct fs_obj *fs_obj_alloc(void)
 	obj->out_fd = -1;
 	obj->in_fd = -1;
 
+	SHA1_Init(&obj->checksum);
+
 	return obj;
 }
 
@@ -314,6 +325,17 @@ static bool key_valid(const void *key, size_t key_len)
 	return true;
 }
 
+static unsigned int fs_blk_count(uint64_t data_len)
+{
+	uint64_t n_blk;
+
+	n_blk = data_len >> CHUNK_BLK_ORDER;
+	if (data_len & (CHUNK_BLK_SZ - 1))
+		n_blk++;
+
+	return (unsigned int) n_blk;
+}
+
 struct backend_obj *fs_obj_new(uint32_t table_id,
 			       const void *key, size_t key_len,
 			       uint64_t data_len,
@@ -321,6 +343,7 @@ struct backend_obj *fs_obj_new(uint32_t table_id,
 {
 	struct fs_obj *obj;
 	char *fn = NULL;
+	size_t csum_bytes;
 	enum chunk_errcode erc = che_InternalError;
 	off_t skip_len;
 
@@ -335,6 +358,13 @@ struct backend_obj *fs_obj_new(uint32_t table_id,
 		return NULL;
 	}
 
+	obj->n_blk = fs_blk_count(data_len);
+	csum_bytes = obj->n_blk * CHD_CSUM_SZ;
+	obj->csum_tbl = malloc(csum_bytes);
+	if (!obj->csum_tbl)
+		goto err_out;
+	obj->csum_tbl_sz = csum_bytes;
+
 	/* build local fs pathname */
 	fn = fs_obj_pathname(table_id, key, key_len);
 	if (!fn)
@@ -355,7 +385,7 @@ struct backend_obj *fs_obj_new(uint32_t table_id,
 	obj->out_fn = fn;
 
 	/* calculate size of front-of-file metadata area */
-	skip_len = sizeof(struct be_fs_obj_hdr) + key_len;
+	skip_len = sizeof(struct be_fs_obj_hdr) + key_len + csum_bytes;
 
 	/* position file pointer where object data (as in, not metadata)
 	 * will begin
@@ -393,7 +423,10 @@ struct backend_obj *fs_obj_open(uint32_t table_id, const char *user,
 	struct be_fs_obj_hdr hdr;
 	ssize_t rrc;
 	uint64_t value_len, tmp64;
+	size_t csum_bytes;
 	enum chunk_errcode erc = che_InternalError;
+	struct iovec iov[2];
+	size_t total_rd_len;
 
 	if (!key_valid(key, key_len)) {
 		*err_code = che_InvalidKey;
@@ -453,23 +486,45 @@ struct backend_obj *fs_obj_open(uint32_t table_id, const char *user,
 		goto err_out;
 
 	value_len = GUINT64_FROM_LE(hdr.value_len);
+	obj->n_blk = GUINT32_FROM_LE(hdr.n_blk);
+	csum_bytes = obj->n_blk * CHD_CSUM_SZ;
 
 	/* verify file size large enough to contain value */
-	tmp64 = value_len + sizeof(hdr) + key_len;
+	tmp64 = value_len + sizeof(hdr) + key_len + csum_bytes;
 	if (G_UNLIKELY(st.st_size < tmp64)) {
 		applog(LOG_ERR, "obj(%s) size error, too small", obj->in_fn);
 		goto err_out;
 	}
 
+	/* verify expected size of checksum table */
+	if (G_UNLIKELY(fs_blk_count(value_len) != obj->n_blk)) {
+		applog(LOG_ERR, "obj(%s) unexpected blk count "
+		       "(%u from val sz, %u from hdr)",
+		       obj->in_fn, fs_blk_count(value_len), obj->n_blk);
+		goto err_out;
+	}
+
+	obj->csum_tbl = malloc(csum_bytes);
+	if (!obj->csum_tbl)
+		goto err_out;
+	obj->csum_tbl_sz = csum_bytes;
+
 	obj->bo.key = malloc(key_len);
 	obj->bo.key_len = key_len;
 	if (!obj->bo.key)
 		goto err_out;
 
-	/* read object variable-length header */
-	rrc = read(obj->in_fd, obj->bo.key, key_len);
-	if ((rrc != key_len) || (memcmp(key, obj->bo.key, key_len))) {
-		applog(LOG_ERR, "read hdr key obj(%s) failed: %s",
+	/* init additional header segment list */
+	iov[0].iov_base = obj->bo.key;
+	iov[0].iov_len = key_len;
+	iov[1].iov_base = obj->csum_tbl;
+	iov[1].iov_len = csum_bytes;
+	total_rd_len = iov[0].iov_len + iov[1].iov_len;
+
+	/* read additional header segments (key, checksum table) */
+	rrc = readv(obj->in_fd, iov, ARRAY_SIZE(iov));
+	if ((rrc != total_rd_len) || (memcmp(key, obj->bo.key, key_len))) {
+		applog(LOG_ERR, "read addnl hdrs(%s) failed: %s",
 			obj->in_fn,
 			(rrc < 0) ? strerror(errno) : "<unknown reasons>");
 		goto err_out;
@@ -512,6 +567,7 @@ void fs_obj_free(struct backend_obj *bo)
 	if (obj->in_fd >= 0)
 		close(obj->in_fd);
 
+	free(obj->csum_tbl);
 	free(obj);
 }
 
@@ -528,19 +584,59 @@ ssize_t fs_obj_read(struct backend_obj *bo, void *ptr, size_t len)
 	return rc;
 }
 
+static void obj_flush_csum(struct backend_obj *bo)
+{
+	struct fs_obj *obj = bo->private;
+	unsigned char md[CHD_CSUM_SZ];
+
+	if (G_UNLIKELY(obj->csum_idx >= obj->n_blk)) {
+		applog(LOG_ERR, "BUG %s: cidx %u, n_blk %u",
+		       __func__, obj->csum_idx, obj->n_blk);
+		return;
+	}
+
+	SHA1_Final(md, &obj->checksum);
+
+	memcpy(obj->csum_tbl + ((obj->csum_idx++) * CHD_CSUM_SZ),
+	       md, CHD_CSUM_SZ);
+
+	obj->checked_bytes = 0;
+	SHA1_Init(&obj->checksum);
+}
+
 ssize_t fs_obj_write(struct backend_obj *bo, const void *ptr, size_t len)
 {
 	struct fs_obj *obj = bo->private;
-	ssize_t rc;
+	ssize_t total_written = 0;
 
-	rc = write(obj->out_fd, ptr, len);
-	if (rc < 0)
-		applog(LOG_ERR, "obj write(%s) failed: %s",
-		       obj->out_fn, strerror(errno));
-	else
-		obj->written_bytes += rc;
+	while (len > 0) {
+		size_t unchecked;
+		ssize_t wrc;
 
-	return rc;
+		unchecked = CHUNK_BLK_SZ - obj->checked_bytes;
+
+		wrc = write(obj->out_fd, ptr, MIN(unchecked, len));
+		if (wrc < 0) {
+			applog(LOG_ERR, "obj write(%s) failed: %s",
+			       obj->out_fn, strerror(errno));
+			return wrc;
+			break;
+		}
+
+		SHA1_Update(&obj->checksum, ptr, wrc);
+
+		total_written += wrc;
+		obj->written_bytes += wrc;
+		obj->checked_bytes += wrc;
+		ptr += wrc;
+		len -= wrc;
+
+		/* if at end of 64k block, update csum table with new csum */
+		if (obj->checked_bytes == CHUNK_BLK_SZ)
+			obj_flush_csum(bo);
+	}
+
+	return total_written;
 }
 
 bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
@@ -550,7 +646,7 @@ bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
 	struct be_fs_obj_hdr hdr;
 	ssize_t wrc;
 	size_t total_wr_len;
-	struct iovec iov[2];
+	struct iovec iov[3];
 
 	if (G_UNLIKELY(obj->bo.size != obj->written_bytes)) {
 		applog(LOG_ERR, "BUG(%s): size/written_bytes mismatch: %llu/%llu",
@@ -566,6 +662,19 @@ bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
 	strncpy(hdr.owner, user, sizeof(hdr.owner));
 	hdr.key_len = GUINT32_TO_LE(bo->key_len);
 	hdr.value_len = GUINT64_TO_LE(obj->written_bytes);
+	hdr.n_blk = GUINT32_TO_LE(obj->n_blk);
+
+	/* update checksum table with final csum, if necessary */
+	if (obj->checked_bytes > 0)
+		obj_flush_csum(bo);
+
+	if (G_UNLIKELY(obj->csum_idx != obj->n_blk)) {
+		applog(LOG_ERR, "BUG(%s): csum_idx/n_blk mismatch: %u/%u",
+		       obj->out_fn, obj->csum_idx, obj->n_blk);
+		return false;
+	}
+
+	obj->csum_idx = 0;
 
 	/* go back to beginning of file */
 	if (lseek(obj->out_fd, 0, SEEK_SET) < 0) {
@@ -579,7 +688,9 @@ bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
 	iov[0].iov_len = sizeof(hdr);
 	iov[1].iov_base = bo->key;
 	iov[1].iov_len = bo->key_len;
-	total_wr_len = iov[0].iov_len + iov[1].iov_len;
+	iov[2].iov_base = obj->csum_tbl;
+	iov[2].iov_len = obj->csum_tbl_sz;
+	total_wr_len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len;
 
 	/* write object header segments */
 	wrc = writev(obj->out_fd, iov, ARRAY_SIZE(iov));
diff --git a/chunkd/chunkd.h b/chunkd/chunkd.h
index 716704b..5c34967 100644
--- a/chunkd/chunkd.h
+++ b/chunkd/chunkd.h
@@ -36,6 +36,9 @@
 #endif
 
 enum {
+	CHUNK_BLK_ORDER		= 16,			/* 64k blocks */
+	CHUNK_BLK_SZ		= 1 << CHUNK_BLK_ORDER,
+
 	CLI_DATA_BUF_SZ		= 16 * 1024,
 
 	CHD_TRASH_MAX		= 1000,

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/3 v2] [chunkd] match network buffer size to checksum buffer size (64k)
  2010-07-17  5:46 [PATCH 0/3 v2] chunkd on-disk checksumming Jeff Garzik
  2010-07-17  5:46 ` [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support Jeff Garzik
  2010-07-17  5:47 ` [PATCH 2/3 v2] chunkd: Add checksum table to on-disk format, one sum per 64k of data Jeff Garzik
@ 2010-07-17  5:47 ` Jeff Garzik
  2010-07-18  7:09 ` [PATCH 4/4] chunkd checksums each block, as it is read from disk Jeff Garzik
  3 siblings, 0 replies; 8+ messages in thread
From: Jeff Garzik @ 2010-07-17  5:47 UTC (permalink / raw)
  To: hail-devel


 chunkd/chunkd.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
commit 937e3fa1eeee9dd68dfba7f65711607849814faa
Author: Jeff Garzik <jeff@garzik.org>
Date:   Sat Jul 17 01:16:24 2010 -0400

    [chunkd] match network buffer size to checksum buffer size (64k)
    
    Size increased from (2 * 16k) to (2 * 64k).
    
    This increases the likelihood that we will input and output nicely
    aligned and sized requests.
    
    Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/chunkd/chunkd.h b/chunkd/chunkd.h
index 5c34967..8ba4fb8 100644
--- a/chunkd/chunkd.h
+++ b/chunkd/chunkd.h
@@ -39,7 +39,7 @@ enum {
 	CHUNK_BLK_ORDER		= 16,			/* 64k blocks */
 	CHUNK_BLK_SZ		= 1 << CHUNK_BLK_ORDER,
 
-	CLI_DATA_BUF_SZ		= 16 * 1024,
+	CLI_DATA_BUF_SZ		= CHUNK_BLK_SZ,
 
 	CHD_TRASH_MAX		= 1000,
 };

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support
  2010-07-17  5:46 ` [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support Jeff Garzik
@ 2010-07-18  3:45   ` Steven Dake
  2010-07-18 23:10     ` Jeff Garzik
  0 siblings, 1 reply; 8+ messages in thread
From: Steven Dake @ 2010-07-18  3:45 UTC (permalink / raw)
  To: hail-devel

On 07/16/2010 10:46 PM, Jeff Garzik wrote:
>
>   chunkd/be-fs.c  |   60 --------------------------------------------------------
>   chunkd/chunkd.h |   14 -------------
>   chunkd/object.c |   31 ++++++++++++----------------
>   chunkd/server.c |   28 --------------------------
>   configure.ac    |    3 --
>   5 files changed, 15 insertions(+), 121 deletions(-)
> commit e87d06526fa2052a1cecbed65ec2fac263c5e7d8
> Author: Jeff Garzik<jeff@garzik.org>
> Date:   Sat Jul 17 00:26:41 2010 -0400
>
>      chunkd: remove sendfile(2) zero-copy support
>
>      chunkd will be soon checksumming data in main memory.  That removes
>      the utility of a zero-copy interface which bypasses the on-heap
>      data requirement.
>
>      Signed-off-by: Jeff Garzik<jgarzik@redhat.com>
>
> diff --git a/chunkd/be-fs.c b/chunkd/be-fs.c
> index 4b851a7..0a81134 100644
> --- a/chunkd/be-fs.c
> +++ b/chunkd/be-fs.c
> @@ -25,9 +25,6 @@
>   #include<sys/stat.h>
>   #include<sys/socket.h>
>   #include<sys/uio.h>
> -#if defined(HAVE_SYS_SENDFILE_H)
> -#include<sys/sendfile.h>
> -#endif
>   #include<stdlib.h>
>   #include<unistd.h>
>   #include<stdio.h>
> @@ -52,7 +49,6 @@ struct fs_obj {
>
>   	int			in_fd;
>   	char			*in_fn;
> -	off_t			sendfile_ofs;
>   };
>
>   struct be_fs_obj_hdr {
> @@ -547,62 +543,6 @@ ssize_t fs_obj_write(struct backend_obj *bo, const void *ptr, size_t len)
>   	return rc;
>   }
>
> -#if defined(HAVE_SENDFILE)&&  defined(__linux__)
> -
> -ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len)
> -{
> -	struct fs_obj *obj = bo->private;
> -	ssize_t rc;
> -
> -	if (obj->sendfile_ofs == 0) {
> -		obj->sendfile_ofs += sizeof(struct be_fs_obj_hdr);
> -		obj->sendfile_ofs += bo->key_len;
> -	}
> -
> -	rc = sendfile(out_fd, obj->in_fd,&obj->sendfile_ofs, len);
> -	if (rc<  0)
> -		applog(LOG_ERR, "obj sendfile(%s) failed: %s",
> -		       obj->in_fn, strerror(errno));
> -
> -	return rc;
> -}
> -
> -#elif defined(HAVE_SENDFILE)&&  defined(__FreeBSD__)
> -
> -ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len)
> -{
> -	struct fs_obj *obj = bo->private;
> -	ssize_t rc;
> -	off_t sbytes = 0;
> -
> -	if (obj->sendfile_ofs == 0) {
> -		obj->sendfile_ofs += sizeof(struct be_fs_obj_hdr);
> -		obj->sendfile_ofs += bo->key_len;
> -	}
> -
> -	rc = sendfile(obj->in_fd, out_fd, obj->sendfile_ofs, len,
> -		      NULL,&sbytes, 0);
> -	if (rc<  0) {
> -		applog(LOG_ERR, "obj sendfile(%s) failed: %s",
> -		       obj->in_fn, strerror(errno));
> -		return rc;
> -	}
> -
> -	obj->sendfile_ofs += sbytes;
> -
> -	return sbytes;
> -}
> -
> -#else
> -
> -ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len)
> -{
> -	applog(LOG_ERR, "BUG: sendfile used but not supported");
> -	return -EOPNOTSUPP;
> -}
> -
> -#endif /* HAVE_SENDFILE&&  HAVE_SYS_SENDFILE_H */
> -
>   bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
>   			 unsigned char *md, bool sync_data)
>   {
> diff --git a/chunkd/chunkd.h b/chunkd/chunkd.h
> index 72833f7..716704b 100644
> --- a/chunkd/chunkd.h
> +++ b/chunkd/chunkd.h
> @@ -39,8 +39,6 @@ enum {
>   	CLI_DATA_BUF_SZ		= 16 * 1024,
>
>   	CHD_TRASH_MAX		= 1000,
> -
> -	CLI_MAX_SENDFILE_SZ	= 512 * 1024,
>   };
>
>   struct client;
> @@ -54,7 +52,6 @@ struct client_write {
>   	uint64_t		len;		/* write buffer length */
>   	cli_write_func		cb;		/* callback */
>   	void			*cb_data;	/* data passed to cb */
> -	bool			sendfile;	/* using sendfile? */
>
>   	struct list_head	node;
>   };
> @@ -268,7 +265,6 @@ extern bool fs_obj_delete(uint32_t table_id, const char *user,
>   		          const void *kbuf, size_t klen,
>   			  enum chunk_errcode *err_code);
>   extern int fs_obj_disable(const char *fn);
> -extern ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len);
>   extern int fs_list_objs_open(struct fs_obj_lister *t,
>   			     const char *root_path, uint32_t table_id);
>   extern int fs_list_objs_next(struct fs_obj_lister *t, char **fnp);
> @@ -323,7 +319,6 @@ extern void applog(int prio, const char *fmt, ...);
>   extern bool cli_err(struct client *cli, enum chunk_errcode code, bool recycle_ok);
>   extern int cli_writeq(struct client *cli, const void *buf, unsigned int buflen,
>   		     cli_write_func cb, void *cb_data);
> -extern bool cli_wr_sendfile(struct client *, cli_write_func);
>   extern bool cli_rd_set_poll(struct client *cli, bool readable);
>   extern void cli_wr_set_poll(struct client *cli, bool writable);
>   extern bool cli_cb_free(struct client *cli, struct client_write *wr,
> @@ -342,15 +337,6 @@ extern void read_config(void);
>   /* selfcheck.c */
>   extern int chk_spawn(TCHDB *hdb);
>
> -static inline bool use_sendfile(struct client *cli)
> -{
> -#if defined(HAVE_SENDFILE)&&  defined(HAVE_SYS_SENDFILE_H)
> -	return cli->ssl ? false : true;
> -#else
> -	return false;
> -#endif
> -}
> -
>   #ifndef HAVE_STRNLEN
>   extern size_t strnlen(const char *s, size_t maxlen);
>   #endif
> diff --git a/chunkd/object.c b/chunkd/object.c
> index d7d3cb6..a517558 100644
> --- a/chunkd/object.c
> +++ b/chunkd/object.c
> @@ -253,28 +253,23 @@ void cli_in_end(struct client *cli)
>
>   static bool object_read_bytes(struct client *cli)
>   {
> -	if (use_sendfile(cli)) {
> -		if (!cli_wr_sendfile(cli, object_get_more))
> -			return false;
> -	} else {
> -		ssize_t bytes;
> +	ssize_t bytes;
>
> -		bytes = fs_obj_read(cli->in_obj, cli->netbuf_out,
> -				    MIN(cli->in_len, CLI_DATA_BUF_SZ));
> -		if (bytes<  0)
> -			return false;
> -		if (bytes == 0&&  cli->in_len != 0)
> -			return false;
> +	bytes = fs_obj_read(cli->in_obj, cli->netbuf_out,
> +			    MIN(cli->in_len, CLI_DATA_BUF_SZ));
> +	if (bytes<  0)
> +		return false;
> +	if (bytes == 0&&  cli->in_len != 0)
> +		return false;
>
> -		cli->in_len -= bytes;
> +	cli->in_len -= bytes;
>
> -		if (!cli->in_len)
> -			cli_in_end(cli);
> +	if (!cli->in_len)
> +		cli_in_end(cli);
>
> -		if (cli_writeq(cli, cli->netbuf_out, bytes,
> -			       cli->in_len ? object_get_more : NULL, NULL))
> -			return false;
> -	}
> +	if (cli_writeq(cli, cli->netbuf_out, bytes,
> +		       cli->in_len ? object_get_more : NULL, NULL))
> +		return false;
>
>   	return true;
>   }
> diff --git a/chunkd/server.c b/chunkd/server.c
> index 3115592..833dcf8 100644
> --- a/chunkd/server.c
> +++ b/chunkd/server.c
> @@ -504,14 +504,7 @@ restart:
>
>   	/* execute non-blocking write */
>   do_write:
> -	if (tmp->sendfile) {
> -		rc = fs_obj_sendfile(cli->in_obj, cli->fd,
> -				     MIN(cli->in_len, CLI_MAX_SENDFILE_SZ));
> -		if (rc<  0)
> -			goto err_out;
> -
> -		cli->in_len -= rc;
> -	} else if (cli->ssl) {
> +	if (cli->ssl) {
>   		rc = SSL_write(cli->ssl, tmp->buf, tmp->len);
>   		if (rc<= 0) {
>   			rc = SSL_get_error(cli->ssl, rc);
> @@ -612,31 +605,12 @@ int cli_writeq(struct client *cli, const void *buf, unsigned int buflen,
>   	wr->len = buflen;
>   	wr->cb = cb;
>   	wr->cb_data = cb_data;
> -	wr->sendfile = false;
>
>   	list_add_tail(&wr->node,&cli->write_q);
>
>   	return 0;
>   }
>
> -bool cli_wr_sendfile(struct client *cli, cli_write_func cb)
> -{
> -	struct client_write *wr;
> -
> -	wr = calloc(1, sizeof(struct client_write));
> -	if (!wr)
> -		return false;
> -
> -	wr->len = cli->in_len;
> -	wr->cb = cb;
> -	wr->sendfile = true;
> -	INIT_LIST_HEAD(&wr->node);
> -
> -	list_add_tail(&wr->node,&cli->write_q);
> -
> -	return true;
> -}
> -
>   static int cli_read_data(struct client *cli, void *buf, size_t buflen)
>   {
>   	ssize_t rc;
> diff --git a/configure.ac b/configure.ac
> index 6b48f1e..db10242 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -65,7 +65,6 @@ AM_PROG_LIBTOOL
>   dnl Checks for header files.
>   AC_HEADER_STDC
>   dnl AC_CHECK_HEADERS(sys/ioctl.h unistd.h)
> -AC_CHECK_HEADERS(sys/sendfile.h sys/filio.h)
>   AC_CHECK_HEADER(db.h,[],exit 1)
>
>   dnl Checks for typedefs, structures, and compiler characteristics.
> @@ -97,7 +96,7 @@ PKG_CHECK_MODULES(TOKYOCABINET, tokyocabinet)
>   dnl -------------------------------------
>   dnl Checks for optional library functions
>   dnl -------------------------------------
> -AC_CHECK_FUNCS(strnlen daemon memmem memrchr sendfile)
> +AC_CHECK_FUNCS(strnlen daemon memmem memrchr)
>   AC_CHECK_FUNC(xdr_sizeof,
>   	[AC_DEFINE([HAVE_XDR_SIZEOF], [1],
>   		[Define to 1 if you have xdr_sizeof.])],
> --
> To unsubscribe from this list: send the line "unsubscribe hail-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff,

May be able to use vmsplice with sendfile (if linux is only target 
platform).  Haven't tried it myself, but the operations look interesting 
at achieving zero copy with sockets from memory addresses.

Regards
-steve

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4/4] chunkd checksums each block, as it is read from disk
  2010-07-17  5:46 [PATCH 0/3 v2] chunkd on-disk checksumming Jeff Garzik
                   ` (2 preceding siblings ...)
  2010-07-17  5:47 ` [PATCH 3/3 v2] [chunkd] match network buffer size to checksum buffer size (64k) Jeff Garzik
@ 2010-07-18  7:09 ` Jeff Garzik
  3 siblings, 0 replies; 8+ messages in thread
From: Jeff Garzik @ 2010-07-18  7:09 UTC (permalink / raw)
  To: hail-devel


Note that we are checksumming "hot cache" data, so SHA1 isn't as
punishing as one might think.


 chunkd/be-fs.c |   51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

commit 2211e3b58620093866be4130397cb3b476620725
Author: Jeff Garzik <jeff@garzik.org>
Date:   Sun Jul 18 03:03:35 2010 -0400

    [chunkd] checksum data prior to returning via GET
    
    When reading a file off disk, checksum the data after reading from
    disk, prior to sending across network to client.  Fail read, if
    checksum fails.
    
    This guarantees we will never send corrupted data to a client.
    
    Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/chunkd/be-fs.c b/chunkd/be-fs.c
index 2120991..dce2561 100644
--- a/chunkd/be-fs.c
+++ b/chunkd/be-fs.c
@@ -49,6 +49,10 @@ struct fs_obj {
 
 	int			in_fd;
 	char			*in_fn;
+	off_t			in_pos;
+
+	off_t			tail_pos;
+	size_t			tail_len;
 
 	size_t			checked_bytes;
 	SHA_CTX			checksum;
@@ -364,6 +368,8 @@ struct backend_obj *fs_obj_new(uint32_t table_id,
 	if (!obj->csum_tbl)
 		goto err_out;
 	obj->csum_tbl_sz = csum_bytes;
+	obj->tail_pos = data_len & ~(CHUNK_BLK_SZ - 1);
+	obj->tail_len = data_len & (CHUNK_BLK_SZ - 1);
 
 	/* build local fs pathname */
 	fn = fs_obj_pathname(table_id, key, key_len);
@@ -488,6 +494,8 @@ struct backend_obj *fs_obj_open(uint32_t table_id, const char *user,
 	value_len = GUINT64_FROM_LE(hdr.value_len);
 	obj->n_blk = GUINT32_FROM_LE(hdr.n_blk);
 	csum_bytes = obj->n_blk * CHD_CSUM_SZ;
+	obj->tail_pos = value_len & ~(CHUNK_BLK_SZ - 1);
+	obj->tail_len = value_len & (CHUNK_BLK_SZ - 1);
 
 	/* verify file size large enough to contain value */
 	tmp64 = value_len + sizeof(hdr) + key_len + csum_bytes;
@@ -571,15 +579,56 @@ void fs_obj_free(struct backend_obj *bo)
 	free(obj);
 }
 
+static bool can_csum_blk(struct fs_obj *obj, size_t len)
+{
+	if (obj->in_pos & (CHUNK_BLK_SZ - 1))
+		return false;
+
+	if (obj->in_pos == obj->tail_pos && len == obj->tail_len)
+		return true;
+	if (len == CHUNK_BLK_SZ)
+		return true;
+
+	return false;
+}
+
 ssize_t fs_obj_read(struct backend_obj *bo, void *ptr, size_t len)
 {
 	struct fs_obj *obj = bo->private;
 	ssize_t rc;
 
 	rc = read(obj->in_fd, ptr, len);
-	if (rc < 0)
+	if (rc < 0) {
 		applog(LOG_ERR, "obj read(%s) failed: %s",
 		       obj->in_fn, strerror(errno));
+		return -errno;
+	}
+
+	if (can_csum_blk(obj, rc)) {
+		unsigned char md[CHD_CSUM_SZ];
+		unsigned int blk_pos;
+		int cmprc;
+
+		SHA1(ptr, rc, md);
+
+		blk_pos = (unsigned int) (obj->in_pos >> CHUNK_BLK_ORDER);
+		cmprc = memcmp(md, obj->csum_tbl + (blk_pos * CHD_CSUM_SZ),
+			       CHD_CSUM_SZ);
+
+		if (cmprc) {
+			applog(LOG_WARNING, "obj(%s) csum failed @ 0x%llx",
+			       obj->in_fn,
+			       (unsigned long long) obj->in_pos);
+			return -EIO;
+		}
+	} else {
+		applog(LOG_INFO, "obj(%s) unaligned read, 0x%x @ 0x%llx",
+		       obj->in_fn, len,
+		       (unsigned long long) obj->in_pos);
+		
+	}
+
+	obj->in_pos += rc;
 
 	return rc;
 }

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support
  2010-07-18  3:45   ` Steven Dake
@ 2010-07-18 23:10     ` Jeff Garzik
  2010-07-18 23:23       ` Jeff Garzik
  0 siblings, 1 reply; 8+ messages in thread
From: Jeff Garzik @ 2010-07-18 23:10 UTC (permalink / raw)
  To: Steven Dake; +Cc: hail-devel

On 07/17/2010 11:45 PM, Steven Dake wrote:
> On 07/16/2010 10:46 PM, Jeff Garzik wrote:
>> chunkd: remove sendfile(2) zero-copy support
>>
>> chunkd will be soon checksumming data in main memory. That removes
>> the utility of a zero-copy interface which bypasses the on-heap
>> data requirement.
>>
>> Signed-off-by: Jeff Garzik<jgarzik@redhat.com>

> May be able to use vmsplice with sendfile (if linux is only target
> platform). Haven't tried it myself, but the operations look interesting
> at achieving zero copy with sockets from memory addresses.

Even though the man pages say "only for pipes", this syscall definitely 
works with TCP.  The big question:  is it actually faster than 
read()+write() ?

Years ago, I experimented with using some fancy new Linux-specific 
syscalls in a from-scratch implementation of cp(1).  It turned out that 
read()+write() was faster than other methods.

That was file->file copying.  It's probably worth investigating 
vmsplice() for our file->checksum->TCP case, definitely.

	Jeff



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support
  2010-07-18 23:10     ` Jeff Garzik
@ 2010-07-18 23:23       ` Jeff Garzik
  0 siblings, 0 replies; 8+ messages in thread
From: Jeff Garzik @ 2010-07-18 23:23 UTC (permalink / raw)
  To: hail-devel; +Cc: Steven Dake

On 07/18/2010 07:10 PM, Jeff Garzik wrote:
> On 07/17/2010 11:45 PM, Steven Dake wrote:
>> On 07/16/2010 10:46 PM, Jeff Garzik wrote:
>>> chunkd: remove sendfile(2) zero-copy support
>>>
>>> chunkd will be soon checksumming data in main memory. That removes
>>> the utility of a zero-copy interface which bypasses the on-heap
>>> data requirement.
>>>
>>> Signed-off-by: Jeff Garzik<jgarzik@redhat.com>
>
>> May be able to use vmsplice with sendfile (if linux is only target
>> platform). Haven't tried it myself, but the operations look interesting
>> at achieving zero copy with sockets from memory addresses.

As an aside, Project Hail -is- intended to be portable to other 
operating systems.  That said, I happily use OS-specific features if 
they have a measurable impact on our core code paths.

Another OS-specific feature I plan on using, for example, is 
sync_file_range(2) for large objects.  We can make use of the technique 
used by MythTV for streaming, which Linus describes here:

http://marc.info/?l=linux-kernel&m=127429771726842&w=2
http://marc.info/?l=linux-kernel&m=127431438118461&w=2

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2010-07-18 23:23 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-07-17  5:46 [PATCH 0/3 v2] chunkd on-disk checksumming Jeff Garzik
2010-07-17  5:46 ` [PATCH 1/3 v2] chunkd: remove sendfile(2) zero-copy support Jeff Garzik
2010-07-18  3:45   ` Steven Dake
2010-07-18 23:10     ` Jeff Garzik
2010-07-18 23:23       ` Jeff Garzik
2010-07-17  5:47 ` [PATCH 2/3 v2] chunkd: Add checksum table to on-disk format, one sum per 64k of data Jeff Garzik
2010-07-17  5:47 ` [PATCH 3/3 v2] [chunkd] match network buffer size to checksum buffer size (64k) Jeff Garzik
2010-07-18  7:09 ` [PATCH 4/4] chunkd checksums each block, as it is read from disk Jeff Garzik

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.