* [PATCH v3 01/28] xfs: Add new name to attri/d
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
@ 2023-10-06 18:48 ` Andrey Albershteyn
2023-10-06 18:48 ` [PATCH v3 02/28] xfs: add parent pointer support to attribute code Andrey Albershteyn
` (26 subsequent siblings)
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:48 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Allison Henderson
From: Allison Henderson <allison.henderson@oracle.com>
This patch adds two new fields to the attri/d. They are nname and
nnamelen. These will be used for parent pointer updates since a
rename operation may cause the parent pointer to update both the
name and value. So we need to carry both the new name as well as
the target name in the attri/d.
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
fs/xfs/libxfs/xfs_attr.c | 12 ++-
fs/xfs/libxfs/xfs_attr.h | 4 +-
fs/xfs/libxfs/xfs_da_btree.h | 2 +
fs/xfs/libxfs/xfs_log_format.h | 6 +-
fs/xfs/xfs_attr_item.c | 135 +++++++++++++++++++++++++++------
fs/xfs/xfs_attr_item.h | 1 +
6 files changed, 133 insertions(+), 27 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e28d93d232de..b1dbed7655e8 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -423,6 +423,12 @@ xfs_attr_complete_op(
args->op_flags &= ~XFS_DA_OP_REPLACE;
if (do_replace) {
args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ if (args->new_namelen > 0) {
+ args->name = args->new_name;
+ args->namelen = args->new_namelen;
+ args->hashval = xfs_da_hashname(args->name,
+ args->namelen);
+ }
return replace_state;
}
return XFS_DAS_DONE;
@@ -922,9 +928,13 @@ xfs_attr_defer_replace(
struct xfs_da_args *args)
{
struct xfs_attr_intent *new;
+ int op_flag;
int error = 0;
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
+ op_flag = args->new_namelen == 0 ? XFS_ATTRI_OP_FLAGS_REPLACE :
+ XFS_ATTRI_OP_FLAGS_NVREPLACE;
+
+ error = xfs_attr_intent_init(args, op_flag, &new);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 81be9b3e4004..3e81f3f48560 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -510,8 +510,8 @@ struct xfs_attr_intent {
struct xfs_da_args *xattri_da_args;
/*
- * Shared buffer containing the attr name and value so that the logging
- * code can share large memory buffers between log items.
+ * Shared buffer containing the attr name, new name, and value so that
+ * the logging code can share large memory buffers between log items.
*/
struct xfs_attri_log_nameval *xattri_nameval;
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ffa3df5b2893..a4b29827603f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -55,7 +55,9 @@ enum xfs_dacmp {
typedef struct xfs_da_args {
struct xfs_da_geometry *geo; /* da block geometry */
const uint8_t *name; /* string (maybe not NULL terminated) */
+ const uint8_t *new_name; /* new attr name */
int namelen; /* length of string (maybe no NULL) */
+ int new_namelen; /* new attr name len */
uint8_t filetype; /* filetype of inode for directories */
void *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 269573c82808..82f910b857b7 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -117,7 +117,8 @@ struct xfs_unmount_log_format {
#define XLOG_REG_TYPE_ATTRD_FORMAT 28
#define XLOG_REG_TYPE_ATTR_NAME 29
#define XLOG_REG_TYPE_ATTR_VALUE 30
-#define XLOG_REG_TYPE_MAX 30
+#define XLOG_REG_TYPE_ATTR_NNAME 31
+#define XLOG_REG_TYPE_MAX 31
/*
@@ -964,6 +965,7 @@ struct xfs_icreate_log {
#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */
#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */
#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_NVREPLACE 4 /* Replace attr name and val */
#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
/*
@@ -981,7 +983,7 @@ struct xfs_icreate_log {
struct xfs_attri_log_format {
uint16_t alfi_type; /* attri log item type */
uint16_t alfi_size; /* size of this item */
- uint32_t __pad; /* pad to 64 bit aligned */
+ uint32_t alfi_nname_len; /* attr new name length */
uint64_t alfi_id; /* attri identifier */
uint64_t alfi_ino; /* the inode for this attr operation */
uint32_t alfi_op_flags; /* marks the op as a set or remove */
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 36fe2abb16e6..97ee9d89b5b8 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -75,6 +75,8 @@ static inline struct xfs_attri_log_nameval *
xfs_attri_log_nameval_alloc(
const void *name,
unsigned int name_len,
+ const void *nname,
+ unsigned int nname_len,
const void *value,
unsigned int value_len)
{
@@ -85,15 +87,25 @@ xfs_attri_log_nameval_alloc(
* this. But kvmalloc() utterly sucks, so we use our own version.
*/
nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
- name_len + value_len);
+ name_len + nname_len + value_len);
nv->name.i_addr = nv + 1;
nv->name.i_len = name_len;
nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
memcpy(nv->name.i_addr, name, name_len);
+ if (nname_len) {
+ nv->nname.i_addr = nv->name.i_addr + name_len;
+ nv->nname.i_len = nname_len;
+ memcpy(nv->nname.i_addr, nname, nname_len);
+ } else {
+ nv->nname.i_addr = NULL;
+ nv->nname.i_len = 0;
+ }
+ nv->nname.i_type = XLOG_REG_TYPE_ATTR_NNAME;
+
if (value_len) {
- nv->value.i_addr = nv->name.i_addr + name_len;
+ nv->value.i_addr = nv->name.i_addr + nname_len + name_len;
nv->value.i_len = value_len;
memcpy(nv->value.i_addr, value, value_len);
} else {
@@ -147,11 +159,15 @@ xfs_attri_item_size(
*nbytes += sizeof(struct xfs_attri_log_format) +
xlog_calc_iovec_len(nv->name.i_len);
- if (!nv->value.i_len)
- return;
+ if (nv->nname.i_len) {
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->nname.i_len);
+ }
- *nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ if (nv->value.i_len) {
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ }
}
/*
@@ -181,6 +197,9 @@ xfs_attri_item_format(
ASSERT(nv->name.i_len > 0);
attrip->attri_format.alfi_size++;
+ if (nv->nname.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
if (nv->value.i_len > 0)
attrip->attri_format.alfi_size++;
@@ -188,6 +207,10 @@ xfs_attri_item_format(
&attrip->attri_format,
sizeof(struct xfs_attri_log_format));
xlog_copy_from_iovec(lv, &vecp, &nv->name);
+
+ if (nv->nname.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->nname);
+
if (nv->value.i_len > 0)
xlog_copy_from_iovec(lv, &vecp, &nv->value);
}
@@ -374,6 +397,7 @@ xfs_attr_log_item(
attrp->alfi_op_flags = attr->xattri_op_flags;
attrp->alfi_value_len = attr->xattri_nameval->value.i_len;
attrp->alfi_name_len = attr->xattri_nameval->name.i_len;
+ attrp->alfi_nname_len = attr->xattri_nameval->nname.i_len;
ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter;
}
@@ -415,7 +439,8 @@ xfs_attr_create_intent(
* deferred work state structure.
*/
attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
- args->namelen, args->value, args->valuelen);
+ args->namelen, args->new_name,
+ args->new_namelen, args->value, args->valuelen);
}
attrip = xfs_attri_init(mp, attr->xattri_nameval);
@@ -503,7 +528,8 @@ xfs_attri_validate(
unsigned int op = attrp->alfi_op_flags &
XFS_ATTRI_OP_FLAGS_TYPE_MASK;
- if (attrp->__pad != 0)
+ if (attrp->alfi_op_flags != XFS_ATTRI_OP_FLAGS_NVREPLACE &&
+ attrp->alfi_nname_len != 0)
return false;
if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)
@@ -517,6 +543,7 @@ xfs_attri_validate(
case XFS_ATTRI_OP_FLAGS_SET:
case XFS_ATTRI_OP_FLAGS_REPLACE:
case XFS_ATTRI_OP_FLAGS_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_NVREPLACE:
break;
default:
return false;
@@ -526,9 +553,14 @@ xfs_attri_validate(
return false;
if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
+ (attrp->alfi_nname_len > XATTR_NAME_MAX) ||
(attrp->alfi_name_len == 0))
return false;
+ if (op == XFS_ATTRI_OP_FLAGS_REMOVE &&
+ attrp->alfi_value_len != 0)
+ return false;
+
return xfs_verify_ino(mp, attrp->alfi_ino);
}
@@ -589,6 +621,8 @@ xfs_attri_item_recover(
args->whichfork = XFS_ATTR_FORK;
args->name = nv->name.i_addr;
args->namelen = nv->name.i_len;
+ args->new_name = nv->nname.i_addr;
+ args->new_namelen = nv->nname.i_len;
args->hashval = xfs_da_hashname(args->name, args->namelen);
args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
@@ -599,6 +633,7 @@ xfs_attri_item_recover(
switch (attr->xattri_op_flags) {
case XFS_ATTRI_OP_FLAGS_SET:
case XFS_ATTRI_OP_FLAGS_REPLACE:
+ case XFS_ATTRI_OP_FLAGS_NVREPLACE:
args->value = nv->value.i_addr;
args->valuelen = nv->value.i_len;
args->total = xfs_attr_calc_size(args, &local);
@@ -689,6 +724,7 @@ xfs_attri_item_relog(
new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
new_attrp->alfi_value_len = old_attrp->alfi_value_len;
new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ new_attrp->alfi_nname_len = old_attrp->alfi_nname_len;
new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
xfs_trans_add_item(tp, &new_attrip->attri_item);
@@ -711,48 +747,102 @@ xlog_recover_attri_commit_pass2(
const void *attr_value = NULL;
const void *attr_name;
size_t len;
-
- attri_formatp = item->ri_buf[0].i_addr;
- attr_name = item->ri_buf[1].i_addr;
+ const void *attr_nname = NULL;
+ int op, i = 0;
/* Validate xfs_attri_log_format before the large memory allocation */
len = sizeof(struct xfs_attri_log_format);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[i].i_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[i].i_addr, item->ri_buf[i].i_len);
return -EFSCORRUPTED;
}
+ attri_formatp = item->ri_buf[i].i_addr;
if (!xfs_attri_validate(mp, attri_formatp)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[i].i_addr, item->ri_buf[i].i_len);
return -EFSCORRUPTED;
}
+ op = attri_formatp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ if (item->ri_total != 3) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ if (item->ri_total != 2) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
+ case XFS_ATTRI_OP_FLAGS_NVREPLACE:
+ if (item->ri_total != 4) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+
+ i++;
/* Validate the attr name */
- if (item->ri_buf[1].i_len !=
+ if (item->ri_buf[i].i_len !=
xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
+ attr_name = item->ri_buf[i].i_addr;
if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
+ item->ri_buf[i].i_addr, item->ri_buf[i].i_len);
return -EFSCORRUPTED;
}
+ i++;
+ if (attri_formatp->alfi_nname_len) {
+ /* Validate the attr nname */
+ if (item->ri_buf[i].i_len !=
+ xlog_calc_iovec_len(attri_formatp->alfi_nname_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[i].i_addr,
+ item->ri_buf[i].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ attr_nname = item->ri_buf[i].i_addr;
+ if (!xfs_attr_namecheck(attr_nname,
+ attri_formatp->alfi_nname_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[i].i_addr,
+ item->ri_buf[i].i_len);
+ return -EFSCORRUPTED;
+ }
+ i++;
+ }
+
+
/* Validate the attr value, if present */
if (attri_formatp->alfi_value_len != 0) {
- if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+ if (item->ri_buf[i].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr,
- item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
- attr_value = item->ri_buf[2].i_addr;
+ attr_value = item->ri_buf[i].i_addr;
}
/*
@@ -761,7 +851,8 @@ xlog_recover_attri_commit_pass2(
* reference.
*/
nv = xfs_attri_log_nameval_alloc(attr_name,
- attri_formatp->alfi_name_len, attr_value,
+ attri_formatp->alfi_name_len, attr_nname,
+ attri_formatp->alfi_nname_len, attr_value,
attri_formatp->alfi_value_len);
attrip = xfs_attri_init(mp, nv);
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
index 3280a7930287..24d4968dd6cc 100644
--- a/fs/xfs/xfs_attr_item.h
+++ b/fs/xfs/xfs_attr_item.h
@@ -13,6 +13,7 @@ struct kmem_zone;
struct xfs_attri_log_nameval {
struct xfs_log_iovec name;
+ struct xfs_log_iovec nname;
struct xfs_log_iovec value;
refcount_t refcount;
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 02/28] xfs: add parent pointer support to attribute code
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
2023-10-06 18:48 ` [PATCH v3 01/28] xfs: Add new name to attri/d Andrey Albershteyn
@ 2023-10-06 18:48 ` Andrey Albershteyn
2023-10-06 18:48 ` [PATCH v3 03/28] xfs: define parent pointer xattr format Andrey Albershteyn
` (25 subsequent siblings)
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:48 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Allison Henderson, Mark Tinguely
From: Allison Henderson <allison.henderson@oracle.com>
Add the new parent attribute type. XFS_ATTR_PARENT is used only for parent pointer
entries; it uses reserved blocks like XFS_ATTR_ROOT.
Signed-off-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
fs/xfs/libxfs/xfs_attr.c | 4 +++-
fs/xfs/libxfs/xfs_da_format.h | 5 ++++-
fs/xfs/libxfs/xfs_log_format.h | 1 +
fs/xfs/scrub/attr.c | 2 +-
4 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index b1dbed7655e8..101823772bf9 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -976,11 +976,13 @@ xfs_attr_set(
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
struct xfs_trans_res tres;
- bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
+ bool rsvd;
int error, local;
int rmt_blks = 0;
unsigned int total;
+ rsvd = (args->attr_filter & (XFS_ATTR_ROOT | XFS_ATTR_PARENT)) != 0;
+
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index f9015f88eca7..fca622d43a38 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -698,12 +698,15 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT)
#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
-#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK \
+ (XFS_ATTR_ROOT | XFS_ATTR_SECURE | XFS_ATTR_PARENT)
/*
* Alignment for namelist and valuelist entries (since they are mixed
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 82f910b857b7..0bc1749fb7bb 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -974,6 +974,7 @@ struct xfs_icreate_log {
*/
#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
XFS_ATTR_SECURE | \
+ XFS_ATTR_PARENT | \
XFS_ATTR_INCOMPLETE)
/*
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 6c16d9530cca..8bc6aa274fa6 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -494,7 +494,7 @@ xchk_xattr_rec(
/* Retrieve the entry and check it. */
hash = be32_to_cpu(ent->hashval);
badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
- XFS_ATTR_INCOMPLETE);
+ XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT);
if ((ent->flags & badflags) != 0)
xchk_da_set_corrupt(ds, level);
if (ent->flags & XFS_ATTR_LOCAL) {
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 03/28] xfs: define parent pointer xattr format
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
2023-10-06 18:48 ` [PATCH v3 01/28] xfs: Add new name to attri/d Andrey Albershteyn
2023-10-06 18:48 ` [PATCH v3 02/28] xfs: add parent pointer support to attribute code Andrey Albershteyn
@ 2023-10-06 18:48 ` Andrey Albershteyn
2023-10-06 18:48 ` [PATCH v3 04/28] xfs: Add xfs_verify_pptr Andrey Albershteyn
` (24 subsequent siblings)
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:48 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Allison Henderson
From: Allison Henderson <allison.henderson@oracle.com>
We need to define the parent pointer attribute format before we start
adding support for it into all the code that needs to use it. The EA
format we will use encodes the following information:
name={parent inode #, parent inode generation, dirent offset}
value={dirent filename}
The inode/gen gives all the information we need to reliably identify the
parent without requiring child->parent lock ordering, and allows
userspace to do pathname component level reconstruction without the
kernel ever needing to verify the parent itself as part of ioctl calls.
By using the dirent offset in the EA name, we have a method of knowing
the exact parent pointer EA we need to modify/remove in rename/unlink
without an unbounded EA name search.
By keeping the dirent name in the value, we have enough information to
be able to validate and reconstruct damaged directory trees. While the
diroffset of a filename alone is not unique enough to identify the
child, the {diroffset,filename,child_inode} tuple is sufficient. That
is, if the diroffset gets reused and points to a different filename, we
can detect that from the contents of EA. If a link of the same name is
created, then we can check whether it points at the same inode as the
parent EA we currently have.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
fs/xfs/libxfs/xfs_da_format.h | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index fca622d43a38..307c8cdb6f10 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -862,4 +862,29 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
struct xfs_da3_blkinfo *hdr3);
+/*
+ * Parent pointer attribute format definition
+ *
+ * EA name encodes the parent inode number, generation and the offset of
+ * the dirent that points to the child inode. The EA value contains the
+ * same name as the dirent in the parent directory.
+ */
+struct xfs_parent_name_rec {
+ __be64 p_ino;
+ __be32 p_gen;
+ __be32 p_diroffset;
+};
+
+/*
+ * incore version of the above, also contains name pointers so callers
+ * can pass/obtain all the parent pointer information in a single structure
+ */
+struct xfs_parent_name_irec {
+ xfs_ino_t p_ino;
+ uint32_t p_gen;
+ xfs_dir2_dataptr_t p_diroffset;
+ const char *p_name;
+ uint8_t p_namelen;
+};
+
#endif /* __XFS_DA_FORMAT_H__ */
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 04/28] xfs: Add xfs_verify_pptr
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (2 preceding siblings ...)
2023-10-06 18:48 ` [PATCH v3 03/28] xfs: define parent pointer xattr format Andrey Albershteyn
@ 2023-10-06 18:48 ` Andrey Albershteyn
2023-10-11 1:01 ` Darrick J. Wong
2023-10-06 18:48 ` [PATCH v3 05/28] fs: add FS_XFLAG_VERITY for fs-verity sealed inodes Andrey Albershteyn
` (23 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:48 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Allison Henderson
From: Allison Henderson <allison.henderson@oracle.com>
Attribute names of parent pointers are not strings. So we need to modify
attr_namecheck to verify parent pointer records when the XFS_ATTR_PARENT flag is
set.
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
fs/xfs/libxfs/xfs_attr.c | 47 ++++++++++++++++++++++++++++++++---
fs/xfs/libxfs/xfs_attr.h | 3 ++-
fs/xfs/libxfs/xfs_da_format.h | 8 ++++++
fs/xfs/scrub/attr.c | 2 +-
fs/xfs/xfs_attr_item.c | 11 +++++---
fs/xfs/xfs_attr_list.c | 17 +++++++++----
6 files changed, 74 insertions(+), 14 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 101823772bf9..711022742e34 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1577,9 +1577,33 @@ xfs_attr_node_get(
return error;
}
-/* Returns true if the attribute entry name is valid. */
-bool
-xfs_attr_namecheck(
+/*
+ * Verify parent pointer attribute is valid.
+ * Return true on success or false on failure
+ */
+STATIC bool
+xfs_verify_pptr(
+ struct xfs_mount *mp,
+ const struct xfs_parent_name_rec *rec)
+{
+ xfs_ino_t p_ino;
+ xfs_dir2_dataptr_t p_diroffset;
+
+ p_ino = be64_to_cpu(rec->p_ino);
+ p_diroffset = be32_to_cpu(rec->p_diroffset);
+
+ if (!xfs_verify_ino(mp, p_ino))
+ return false;
+
+ if (p_diroffset > XFS_DIR2_MAX_DATAPTR)
+ return false;
+
+ return true;
+}
+
+/* Returns true if the string attribute entry name is valid. */
+static bool
+xfs_str_attr_namecheck(
const void *name,
size_t length)
{
@@ -1594,6 +1618,23 @@ xfs_attr_namecheck(
return !memchr(name, 0, length);
}
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+ struct xfs_mount *mp,
+ const void *name,
+ size_t length,
+ int flags)
+{
+ if (flags & XFS_ATTR_PARENT) {
+ if (length != sizeof(struct xfs_parent_name_rec))
+ return false;
+ return xfs_verify_pptr(mp, (struct xfs_parent_name_rec *)name);
+ }
+
+ return xfs_str_attr_namecheck(name, length);
+}
+
int __init
xfs_attr_intent_init_cache(void)
{
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 3e81f3f48560..b79dae788cfb 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -547,7 +547,8 @@ int xfs_attr_get(struct xfs_da_args *args);
int xfs_attr_set(struct xfs_da_args *args);
int xfs_attr_set_iter(struct xfs_attr_intent *attr);
int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
-bool xfs_attr_namecheck(const void *name, size_t length);
+bool xfs_attr_namecheck(struct xfs_mount *mp, const void *name, size_t length,
+ int flags);
int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
unsigned int *total);
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 307c8cdb6f10..6deefe03207f 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -741,6 +741,14 @@ xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
}
+static inline int
+xfs_attr3_leaf_flags(xfs_attr_leafblock_t *leafp, int idx)
+{
+ struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+ return entries[idx].flags;
+}
+
static inline xfs_attr_leaf_name_remote_t *
xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
{
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 8bc6aa274fa6..f35144704395 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -195,7 +195,7 @@ xchk_xattr_listent(
}
/* Does this name make sense? */
- if (!xfs_attr_namecheck(name, namelen)) {
+ if (!xfs_attr_namecheck(sx->sc->mp, name, namelen, flags)) {
xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
goto fail_xref;
}
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 97ee9d89b5b8..63393216159f 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -593,7 +593,8 @@ xfs_attri_item_recover(
*/
attrp = &attrip->attri_format;
if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ !xfs_attr_namecheck(mp, nv->name.i_addr, nv->name.i_len,
+ attrp->alfi_attr_filter))
return -EFSCORRUPTED;
error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
@@ -805,7 +806,8 @@ xlog_recover_attri_commit_pass2(
}
attr_name = item->ri_buf[i].i_addr;
- if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
+ if (!xfs_attr_namecheck(mp, attr_name, attri_formatp->alfi_name_len,
+ attri_formatp->alfi_attr_filter)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
item->ri_buf[i].i_addr, item->ri_buf[i].i_len);
return -EFSCORRUPTED;
@@ -823,8 +825,9 @@ xlog_recover_attri_commit_pass2(
}
attr_nname = item->ri_buf[i].i_addr;
- if (!xfs_attr_namecheck(attr_nname,
- attri_formatp->alfi_nname_len)) {
+ if (!xfs_attr_namecheck(mp, attr_nname,
+ attri_formatp->alfi_nname_len,
+ attri_formatp->alfi_attr_filter)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
item->ri_buf[i].i_addr,
item->ri_buf[i].i_len);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 99bbbe1a0e44..a51f7f13a352 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -58,9 +58,13 @@ xfs_attr_shortform_list(
struct xfs_attr_sf_sort *sbuf, *sbp;
struct xfs_attr_shortform *sf;
struct xfs_attr_sf_entry *sfe;
+ struct xfs_mount *mp;
int sbsize, nsbuf, count, i;
int error = 0;
+ ASSERT(context != NULL);
+ ASSERT(dp != NULL);
+ mp = dp->i_mount;
sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
@@ -82,8 +86,9 @@ xfs_attr_shortform_list(
(dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(sfe->nameval,
- sfe->namelen)))
+ !xfs_attr_namecheck(mp, sfe->nameval,
+ sfe->namelen,
+ sfe->flags)))
return -EFSCORRUPTED;
context->put_listent(context,
sfe->flags,
@@ -174,8 +179,9 @@ xfs_attr_shortform_list(
cursor->offset = 0;
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(sbp->name,
- sbp->namelen))) {
+ !xfs_attr_namecheck(mp, sbp->name,
+ sbp->namelen,
+ sbp->flags))) {
error = -EFSCORRUPTED;
goto out;
}
@@ -465,7 +471,8 @@ xfs_attr3_leaf_list_int(
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(name, namelen)))
+ !xfs_attr_namecheck(mp, name, namelen,
+ entry->flags)))
return -EFSCORRUPTED;
context->put_listent(context, entry->flags,
name, namelen, valuelen);
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 04/28] xfs: Add xfs_verify_pptr
2023-10-06 18:48 ` [PATCH v3 04/28] xfs: Add xfs_verify_pptr Andrey Albershteyn
@ 2023-10-11 1:01 ` Darrick J. Wong
2023-10-11 11:09 ` Andrey Albershteyn
0 siblings, 1 reply; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 1:01 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner,
Allison Henderson
On Fri, Oct 06, 2023 at 08:48:58PM +0200, Andrey Albershteyn wrote:
> From: Allison Henderson <allison.henderson@oracle.com>
>
> Attribute names of parent pointers are not strings. So we need to modify
> attr_namecheck to verify parent pointer records when the XFS_ATTR_PARENT flag is
> set.
>
> Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Hi Andrey,
Could you take a look at the latest revision of the parent pointers
patchset, please? Your version and mine have drifted very far apart
over the summer...
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=pptrs-fsck
--D
> ---
> fs/xfs/libxfs/xfs_attr.c | 47 ++++++++++++++++++++++++++++++++---
> fs/xfs/libxfs/xfs_attr.h | 3 ++-
> fs/xfs/libxfs/xfs_da_format.h | 8 ++++++
> fs/xfs/scrub/attr.c | 2 +-
> fs/xfs/xfs_attr_item.c | 11 +++++---
> fs/xfs/xfs_attr_list.c | 17 +++++++++----
> 6 files changed, 74 insertions(+), 14 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> index 101823772bf9..711022742e34 100644
> --- a/fs/xfs/libxfs/xfs_attr.c
> +++ b/fs/xfs/libxfs/xfs_attr.c
> @@ -1577,9 +1577,33 @@ xfs_attr_node_get(
> return error;
> }
>
> -/* Returns true if the attribute entry name is valid. */
> -bool
> -xfs_attr_namecheck(
> +/*
> + * Verify parent pointer attribute is valid.
> + * Return true on success or false on failure
> + */
> +STATIC bool
> +xfs_verify_pptr(
> + struct xfs_mount *mp,
> + const struct xfs_parent_name_rec *rec)
> +{
> + xfs_ino_t p_ino;
> + xfs_dir2_dataptr_t p_diroffset;
> +
> + p_ino = be64_to_cpu(rec->p_ino);
> + p_diroffset = be32_to_cpu(rec->p_diroffset);
> +
> + if (!xfs_verify_ino(mp, p_ino))
> + return false;
> +
> + if (p_diroffset > XFS_DIR2_MAX_DATAPTR)
> + return false;
> +
> + return true;
> +}
> +
> +/* Returns true if the string attribute entry name is valid. */
> +static bool
> +xfs_str_attr_namecheck(
> const void *name,
> size_t length)
> {
> @@ -1594,6 +1618,23 @@ xfs_attr_namecheck(
> return !memchr(name, 0, length);
> }
>
> +/* Returns true if the attribute entry name is valid. */
> +bool
> +xfs_attr_namecheck(
> + struct xfs_mount *mp,
> + const void *name,
> + size_t length,
> + int flags)
> +{
> + if (flags & XFS_ATTR_PARENT) {
> + if (length != sizeof(struct xfs_parent_name_rec))
> + return false;
> + return xfs_verify_pptr(mp, (struct xfs_parent_name_rec *)name);
> + }
> +
> + return xfs_str_attr_namecheck(name, length);
> +}
> +
> int __init
> xfs_attr_intent_init_cache(void)
> {
> diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
> index 3e81f3f48560..b79dae788cfb 100644
> --- a/fs/xfs/libxfs/xfs_attr.h
> +++ b/fs/xfs/libxfs/xfs_attr.h
> @@ -547,7 +547,8 @@ int xfs_attr_get(struct xfs_da_args *args);
> int xfs_attr_set(struct xfs_da_args *args);
> int xfs_attr_set_iter(struct xfs_attr_intent *attr);
> int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
> -bool xfs_attr_namecheck(const void *name, size_t length);
> +bool xfs_attr_namecheck(struct xfs_mount *mp, const void *name, size_t length,
> + int flags);
> int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
> void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
> unsigned int *total);
> diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
> index 307c8cdb6f10..6deefe03207f 100644
> --- a/fs/xfs/libxfs/xfs_da_format.h
> +++ b/fs/xfs/libxfs/xfs_da_format.h
> @@ -741,6 +741,14 @@ xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
> return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
> }
>
> +static inline int
> +xfs_attr3_leaf_flags(xfs_attr_leafblock_t *leafp, int idx)
> +{
> + struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
> +
> + return entries[idx].flags;
> +}
> +
> static inline xfs_attr_leaf_name_remote_t *
> xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
> {
> diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
> index 8bc6aa274fa6..f35144704395 100644
> --- a/fs/xfs/scrub/attr.c
> +++ b/fs/xfs/scrub/attr.c
> @@ -195,7 +195,7 @@ xchk_xattr_listent(
> }
>
> /* Does this name make sense? */
> - if (!xfs_attr_namecheck(name, namelen)) {
> + if (!xfs_attr_namecheck(sx->sc->mp, name, namelen, flags)) {
> xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
> goto fail_xref;
> }
> diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
> index 97ee9d89b5b8..63393216159f 100644
> --- a/fs/xfs/xfs_attr_item.c
> +++ b/fs/xfs/xfs_attr_item.c
> @@ -593,7 +593,8 @@ xfs_attri_item_recover(
> */
> attrp = &attrip->attri_format;
> if (!xfs_attri_validate(mp, attrp) ||
> - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
> + !xfs_attr_namecheck(mp, nv->name.i_addr, nv->name.i_len,
> + attrp->alfi_attr_filter))
> return -EFSCORRUPTED;
>
> error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
> @@ -805,7 +806,8 @@ xlog_recover_attri_commit_pass2(
> }
>
> attr_name = item->ri_buf[i].i_addr;
> - if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
> + if (!xfs_attr_namecheck(mp, attr_name, attri_formatp->alfi_name_len,
> + attri_formatp->alfi_attr_filter)) {
> XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
> item->ri_buf[i].i_addr, item->ri_buf[i].i_len);
> return -EFSCORRUPTED;
> @@ -823,8 +825,9 @@ xlog_recover_attri_commit_pass2(
> }
>
> attr_nname = item->ri_buf[i].i_addr;
> - if (!xfs_attr_namecheck(attr_nname,
> - attri_formatp->alfi_nname_len)) {
> + if (!xfs_attr_namecheck(mp, attr_nname,
> + attri_formatp->alfi_nname_len,
> + attri_formatp->alfi_attr_filter)) {
> XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
> item->ri_buf[i].i_addr,
> item->ri_buf[i].i_len);
> diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
> index 99bbbe1a0e44..a51f7f13a352 100644
> --- a/fs/xfs/xfs_attr_list.c
> +++ b/fs/xfs/xfs_attr_list.c
> @@ -58,9 +58,13 @@ xfs_attr_shortform_list(
> struct xfs_attr_sf_sort *sbuf, *sbp;
> struct xfs_attr_shortform *sf;
> struct xfs_attr_sf_entry *sfe;
> + struct xfs_mount *mp;
> int sbsize, nsbuf, count, i;
> int error = 0;
>
> + ASSERT(context != NULL);
> + ASSERT(dp != NULL);
> + mp = dp->i_mount;
> sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
> ASSERT(sf != NULL);
> if (!sf->hdr.count)
> @@ -82,8 +86,9 @@ xfs_attr_shortform_list(
> (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
> for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
> if (XFS_IS_CORRUPT(context->dp->i_mount,
> - !xfs_attr_namecheck(sfe->nameval,
> - sfe->namelen)))
> + !xfs_attr_namecheck(mp, sfe->nameval,
> + sfe->namelen,
> + sfe->flags)))
> return -EFSCORRUPTED;
> context->put_listent(context,
> sfe->flags,
> @@ -174,8 +179,9 @@ xfs_attr_shortform_list(
> cursor->offset = 0;
> }
> if (XFS_IS_CORRUPT(context->dp->i_mount,
> - !xfs_attr_namecheck(sbp->name,
> - sbp->namelen))) {
> + !xfs_attr_namecheck(mp, sbp->name,
> + sbp->namelen,
> + sbp->flags))) {
> error = -EFSCORRUPTED;
> goto out;
> }
> @@ -465,7 +471,8 @@ xfs_attr3_leaf_list_int(
> }
>
> if (XFS_IS_CORRUPT(context->dp->i_mount,
> - !xfs_attr_namecheck(name, namelen)))
> + !xfs_attr_namecheck(mp, name, namelen,
> + entry->flags)))
> return -EFSCORRUPTED;
> context->put_listent(context, entry->flags,
> name, namelen, valuelen);
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 04/28] xfs: Add xfs_verify_pptr
2023-10-11 1:01 ` Darrick J. Wong
@ 2023-10-11 11:09 ` Andrey Albershteyn
0 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-11 11:09 UTC (permalink / raw)
To: Darrick J. Wong
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner,
Allison Henderson
On 2023-10-10 18:01:43, Darrick J. Wong wrote:
> Could you take a look at the latest revision of the parent pointers
> patchset, please? Your version and mine have drifted very far apart
> over the summer...
>
> https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=pptrs-fsck
>
Sure thing, sorry, haven't checked if they changed.
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 05/28] fs: add FS_XFLAG_VERITY for fs-verity sealed inodes
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (3 preceding siblings ...)
2023-10-06 18:48 ` [PATCH v3 04/28] xfs: Add xfs_verify_pptr Andrey Albershteyn
@ 2023-10-06 18:48 ` Andrey Albershteyn
2023-10-11 4:05 ` Eric Biggers
2023-10-06 18:49 ` [PATCH v3 06/28] fsverity: add drop_page() callout Andrey Albershteyn
` (22 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:48 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Add extended file attribute FS_XFLAG_VERITY for inodes sealed with
fs-verity.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
Documentation/filesystems/fsverity.rst | 9 +++++++++
include/uapi/linux/fs.h | 1 +
2 files changed, 10 insertions(+)
diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst
index 13e4b18e5dbb..af889512c6ac 100644
--- a/Documentation/filesystems/fsverity.rst
+++ b/Documentation/filesystems/fsverity.rst
@@ -326,6 +326,15 @@ the file has fs-verity enabled. This can perform better than
FS_IOC_GETFLAGS and FS_IOC_MEASURE_VERITY because it doesn't require
opening the file, and opening verity files can be expensive.
+Extended file attributes
+------------------------
+
+For fs-verity sealed files the FS_XFLAG_VERITY extended file
+attribute is set. The attribute can be observed via lsattr.
+
+ [root@vm:~]# lsattr /mnt/test/foo
+ --------------------V- /mnt/test/foo
+
.. _accessing_verity_files:
Accessing verity files
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..5172a2eb902c 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -140,6 +140,7 @@ struct fsxattr {
#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
#define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */
#define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */
+#define FS_XFLAG_VERITY 0x00020000 /* fs-verity sealed inode */
#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
/* the read-only stuff doesn't really belong here, but any other place is
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 05/28] fs: add FS_XFLAG_VERITY for fs-verity sealed inodes
2023-10-06 18:48 ` [PATCH v3 05/28] fs: add FS_XFLAG_VERITY for fs-verity sealed inodes Andrey Albershteyn
@ 2023-10-11 4:05 ` Eric Biggers
2023-10-11 11:06 ` Andrey Albershteyn
0 siblings, 1 reply; 74+ messages in thread
From: Eric Biggers @ 2023-10-11 4:05 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On Fri, Oct 06, 2023 at 08:48:59PM +0200, Andrey Albershteyn wrote:
> Add extended file attribute FS_XFLAG_VERITY for inodes sealed with
> fs-verity.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> Documentation/filesystems/fsverity.rst | 9 +++++++++
> include/uapi/linux/fs.h | 1 +
> 2 files changed, 10 insertions(+)
>
> diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst
> index 13e4b18e5dbb..af889512c6ac 100644
> --- a/Documentation/filesystems/fsverity.rst
> +++ b/Documentation/filesystems/fsverity.rst
> @@ -326,6 +326,15 @@ the file has fs-verity enabled. This can perform better than
> FS_IOC_GETFLAGS and FS_IOC_MEASURE_VERITY because it doesn't require
> opening the file, and opening verity files can be expensive.
>
> +Extended file attributes
> +------------------------
> +
> +For fs-verity sealed files the FS_XFLAG_VERITY extended file
> +attribute is set. The attribute can be observed via lsattr.
> +
> + [root@vm:~]# lsattr /mnt/test/foo
> + --------------------V- /mnt/test/foo
> +
There's currently nowhere in the documentation or code that uses the phrase
"fs-verity sealed file". It's instead called a verity file, or a file that has
fs-verity enabled. I suggest we try to avoid inconsistent terminology.
Also, it should be mentioned which kernel versions this works on.
See for example what the statx section of the documentation says just above the
new section that you're adding:
Since Linux v5.5, the statx() system call sets STATX_ATTR_VERITY if
the file has fs-verity enabled.
Also, is FS_XFLAG_VERITY going to work on all filesystems? The existing ways to
query the verity flag work on all filesystems. Hopefully any new API will too.
Also, "Extended file attributes" is easily confused with, well, extended file
attributes (xattrs). It should be made clear that this is talking about the
FS_IOC_FSGETXATTR ioctl, not real xattrs.
Also, it should be made clear that FS_XFLAG_VERITY cannot be set using
FS_IOC_FSSETXATTR. See e.g. how the existing documentation says that
FS_IOC_GETFLAGS can get FS_VERITY_FL but FS_IOC_SETFLAGS cannot set it.
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 05/28] fs: add FS_XFLAG_VERITY for fs-verity sealed inodes
2023-10-11 4:05 ` Eric Biggers
@ 2023-10-11 11:06 ` Andrey Albershteyn
0 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-11 11:06 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On 2023-10-10 21:05:44, Eric Biggers wrote:
> There's currently nowhere in the documentation or code that uses the phrase
> "fs-verity sealed file". It's instead called a verity file, or a file that has
> fs-verity enabled. I suggest we try to avoid inconsistent terminology.
>
> Also, it should be mentioned which kernel versions this works on.
>
> See for example what the statx section of the documentation says just above the
> new section that you're adding:
>
> Since Linux v5.5, the statx() system call sets STATX_ATTR_VERITY if
> the file has fs-verity enabled.
Sure, I will change the terminology. Would it be fine to add the kernel
version in an additional patch once the patchset is merged?
>
> Also, is FS_XFLAG_VERITY going to work on all filesystems? The existing ways to
> query the verity flag work on all filesystems. Hopefully any new API will too.
>
Yes, if FS_VERITY_FL is set on the verity file. I will probably move
hunks in fs/ioctl.c from [1] to this patch so it makes more sense.
> Also, "Extended file attributes" is easily confused with, well, extended file
> attributes (xattrs). It should be made clear that this is talking about the
> FS_IOC_FSGETXATTR ioctl, not real xattrs.
>
> Also, it should be made clear that FS_XFLAG_VERITY cannot be set using
> FS_IOC_FSSETXATTR. See e.g. how the existing documentation says that
> FS_IOC_GETFLAGS can get FS_VERITY_FL but FS_IOC_SETFLAGS cannot set it.
Thanks, will add it.
[1]: https://lore.kernel.org/all/20231011013940.GJ21298@frogsfrogsfrogs/T/#m75e77f585b9b7437556d108c325126865c1f6ce7
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 06/28] fsverity: add drop_page() callout
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (4 preceding siblings ...)
2023-10-06 18:48 ` [PATCH v3 05/28] fs: add FS_XFLAG_VERITY for fs-verity sealed inodes Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 3:06 ` Eric Biggers
2023-10-06 18:49 ` [PATCH v3 07/28] fsverity: always use bitmap to track verified status Andrey Albershteyn
` (21 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Allow a filesystem to do additional processing on verified pages
instead of just dropping a reference. This will be used by XFS for
internal buffer cache manipulation in later patches. btrfs, ext4,
and f2fs do not set the callout, so they still just drop the reference.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/verity/read_metadata.c | 4 ++--
fs/verity/verify.c | 6 +++---
include/linux/fsverity.h | 33 +++++++++++++++++++++++++++++++++
3 files changed, 38 insertions(+), 5 deletions(-)
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index f58432772d9e..8bd4b29a9a95 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -56,12 +56,12 @@ static int fsverity_read_merkle_tree(struct inode *inode,
virt = kmap_local_page(page);
if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
kunmap_local(virt);
- put_page(page);
+ fsverity_drop_page(inode, page);
err = -EFAULT;
break;
}
kunmap_local(virt);
- put_page(page);
+ fsverity_drop_page(inode, page);
retval += bytes_to_copy;
buf += bytes_to_copy;
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 904ccd7e8e16..2fe7bd57b16e 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -183,7 +183,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
memcpy(_want_hash, haddr + hoffset, hsize);
want_hash = _want_hash;
kunmap_local(haddr);
- put_page(hpage);
+ fsverity_drop_page(inode, hpage);
goto descend;
}
hblocks[level].page = hpage;
@@ -218,7 +218,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
memcpy(_want_hash, haddr + hoffset, hsize);
want_hash = _want_hash;
kunmap_local(haddr);
- put_page(hpage);
+ fsverity_drop_page(inode, hpage);
}
/* Finally, verify the data block. */
@@ -237,7 +237,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
error:
for (; level > 0; level--) {
kunmap_local(hblocks[level - 1].addr);
- put_page(hblocks[level - 1].page);
+ fsverity_drop_page(inode, hblocks[level - 1].page);
}
return false;
}
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 1eb7eae580be..6514ed6b09b4 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -120,6 +120,16 @@ struct fsverity_operations {
*/
int (*write_merkle_tree_block)(struct inode *inode, const void *buf,
u64 pos, unsigned int size);
+
+ /**
+ * Release the reference to a Merkle tree page
+ *
+ * @page: the page to release
+ *
+ * This is called when fs-verity is done with a page obtained with
+ * ->read_merkle_tree_page().
+ */
+ void (*drop_page)(struct page *page);
};
#ifdef CONFIG_FS_VERITY
@@ -174,6 +184,24 @@ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset);
void fsverity_verify_bio(struct bio *bio);
void fsverity_enqueue_verify_work(struct work_struct *work);
+/**
+ * fsverity_drop_page() - drop page obtained with ->read_merkle_tree_page()
+ * @inode: inode in use for verification or metadata reading
+ * @page: page to be dropped
+ *
+ * Generic put_page() method. Calls out back to filesystem if ->drop_page() is
+ * set, otherwise just drops the reference to a page.
+ *
+ */
+static inline void fsverity_drop_page(struct inode *inode, struct page *page)
+{
+ if (inode->i_sb->s_vop->drop_page)
+ inode->i_sb->s_vop->drop_page(page);
+ else
+ put_page(page);
+}
+
+
#else /* !CONFIG_FS_VERITY */
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
@@ -251,6 +279,11 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work)
WARN_ON_ONCE(1);
}
+static inline void fsverity_drop_page(struct inode *inode, struct page *page)
+{
+ WARN_ON_ONCE(1);
+}
+
#endif /* !CONFIG_FS_VERITY */
static inline bool fsverity_verify_folio(struct folio *folio)
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 06/28] fsverity: add drop_page() callout
2023-10-06 18:49 ` [PATCH v3 06/28] fsverity: add drop_page() callout Andrey Albershteyn
@ 2023-10-11 3:06 ` Eric Biggers
2023-10-11 11:11 ` Andrey Albershteyn
0 siblings, 1 reply; 74+ messages in thread
From: Eric Biggers @ 2023-10-11 3:06 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On Fri, Oct 06, 2023 at 08:49:00PM +0200, Andrey Albershteyn wrote:
> Allow filesystem to make additional processing on verified pages
> instead of just dropping a reference. This will be used by XFS for
> internal buffer cache manipulation in further patches. The btrfs,
> ext4, and f2fs just drop the reference.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/verity/read_metadata.c | 4 ++--
> fs/verity/verify.c | 6 +++---
> include/linux/fsverity.h | 33 +++++++++++++++++++++++++++++++++
> 3 files changed, 38 insertions(+), 5 deletions(-)
The changes from this patch all get superseded by the changes in patch 10 that
make it drop_block instead. So that puts this patch in a weird position where
there's no real point in reviewing it alone. Maybe fold it into patch 10? I
suppose you're trying not to make that patch too large, but perhaps there's a
better way to split it up.
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 06/28] fsverity: add drop_page() callout
2023-10-11 3:06 ` Eric Biggers
@ 2023-10-11 11:11 ` Andrey Albershteyn
0 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-11 11:11 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On 2023-10-10 20:06:46, Eric Biggers wrote:
> The changes from this patch all get superseded by the changes in patch 10 that
> make it drop_block instead. So that puts this patch in a weird position where
> there's no real point in reviewing it alone. Maybe fold it into patch 10? I
> suppose you're trying not to make that patch too large, but perhaps there's a
> better way to split it up.
Yes I was trying to make it easier to review. I will try to split it
differently :)
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (5 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 06/28] fsverity: add drop_page() callout Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 3:15 ` Eric Biggers
2023-10-06 18:49 ` [PATCH v3 08/28] fsverity: pass Merkle tree block size to ->read_merkle_tree_page() Andrey Albershteyn
` (20 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
The bitmap is used to track the verified status of Merkle tree
blocks which are smaller than a page. Blocks which fit exactly in a
page use PageChecked() to track the "verified" status.
This patch switches to always use bitmap to track verified status.
This is needed to move fs-verity away from page management and work
only with Merkle tree blocks.
Also, this patch removes the spinlock. The lock was used when resetting
the bitmap bits of all blocks belonging to one page. With this patch
only the bit of the single Merkle tree block being checked is reset;
the status of the other blocks is left untouched.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/verity/fsverity_private.h | 1 -
fs/verity/open.c | 49 ++++++++++++-------------
fs/verity/verify.c | 71 +++++-------------------------------
3 files changed, 33 insertions(+), 88 deletions(-)
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index d071a6e32581..9611eeae3527 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -69,7 +69,6 @@ struct fsverity_info {
u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
const struct inode *inode;
unsigned long *hash_block_verified;
- spinlock_t hash_page_init_lock;
};
#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 6c31a871b84b..dfb9fe6aaae9 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -182,6 +182,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
{
struct fsverity_info *vi;
int err;
+ unsigned long num_bits;
vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
if (!vi)
@@ -213,33 +214,29 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
if (err)
goto fail;
- if (vi->tree_params.block_size != PAGE_SIZE) {
- /*
- * When the Merkle tree block size and page size differ, we use
- * a bitmap to keep track of which hash blocks have been
- * verified. This bitmap must contain one bit per hash block,
- * including alignment to a page boundary at the end.
- *
- * Eventually, to support extremely large files in an efficient
- * way, it might be necessary to make pages of this bitmap
- * reclaimable. But for now, simply allocating the whole bitmap
- * is a simple solution that works well on the files on which
- * fsverity is realistically used. E.g., with SHA-256 and 4K
- * blocks, a 100MB file only needs a 24-byte bitmap, and the
- * bitmap for any file under 17GB fits in a 4K page.
- */
- unsigned long num_bits =
- vi->tree_params.tree_pages <<
- vi->tree_params.log_blocks_per_page;
+ /*
+ * We use a bitmap to keep track of which hash blocks have been
+ * verified. This bitmap must contain one bit per hash block,
+ * including alignment to a page boundary at the end.
+ *
+ * Eventually, to support extremely large files in an efficient
+ * way, it might be necessary to make pages of this bitmap
+ * reclaimable. But for now, simply allocating the whole bitmap
+ * is a simple solution that works well on the files on which
+ * fsverity is realistically used. E.g., with SHA-256 and 4K
+ * blocks, a 100MB file only needs a 24-byte bitmap, and the
+ * bitmap for any file under 17GB fits in a 4K page.
+ */
+ num_bits =
+ vi->tree_params.tree_pages <<
+ vi->tree_params.log_blocks_per_page;
- vi->hash_block_verified = kvcalloc(BITS_TO_LONGS(num_bits),
- sizeof(unsigned long),
- GFP_KERNEL);
- if (!vi->hash_block_verified) {
- err = -ENOMEM;
- goto fail;
- }
- spin_lock_init(&vi->hash_page_init_lock);
+ vi->hash_block_verified = kvcalloc(BITS_TO_LONGS(num_bits),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!vi->hash_block_verified) {
+ err = -ENOMEM;
+ goto fail;
}
return vi;
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 2fe7bd57b16e..e7b13d143ae9 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -13,69 +13,18 @@
static struct workqueue_struct *fsverity_read_workqueue;
/*
- * Returns true if the hash block with index @hblock_idx in the tree, located in
- * @hpage, has already been verified.
+ * Returns true if the hash block with index @hblock_idx in the tree has
+ * already been verified.
*/
-static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
- unsigned long hblock_idx)
+static bool is_hash_block_verified(struct fsverity_info *vi,
+ unsigned long hblock_idx,
+ bool block_cached)
{
- bool verified;
- unsigned int blocks_per_page;
- unsigned int i;
-
- /*
- * When the Merkle tree block size and page size are the same, then the
- * ->hash_block_verified bitmap isn't allocated, and we use PG_checked
- * to directly indicate whether the page's block has been verified.
- *
- * Using PG_checked also guarantees that we re-verify hash pages that
- * get evicted and re-instantiated from the backing storage, as new
- * pages always start out with PG_checked cleared.
- */
- if (!vi->hash_block_verified)
- return PageChecked(hpage);
-
- /*
- * When the Merkle tree block size and page size differ, we use a bitmap
- * to indicate whether each hash block has been verified.
- *
- * However, we still need to ensure that hash pages that get evicted and
- * re-instantiated from the backing storage are re-verified. To do
- * this, we use PG_checked again, but now it doesn't really mean
- * "checked". Instead, now it just serves as an indicator for whether
- * the hash page is newly instantiated or not.
- *
- * The first thread that sees PG_checked=0 must clear the corresponding
- * bitmap bits, then set PG_checked=1. This requires a spinlock. To
- * avoid having to take this spinlock in the common case of
- * PG_checked=1, we start with an opportunistic lockless read.
- */
- if (PageChecked(hpage)) {
- /*
- * A read memory barrier is needed here to give ACQUIRE
- * semantics to the above PageChecked() test.
- */
- smp_rmb();
+ if (block_cached)
return test_bit(hblock_idx, vi->hash_block_verified);
- }
- spin_lock(&vi->hash_page_init_lock);
- if (PageChecked(hpage)) {
- verified = test_bit(hblock_idx, vi->hash_block_verified);
- } else {
- blocks_per_page = vi->tree_params.blocks_per_page;
- hblock_idx = round_down(hblock_idx, blocks_per_page);
- for (i = 0; i < blocks_per_page; i++)
- clear_bit(hblock_idx + i, vi->hash_block_verified);
- /*
- * A write memory barrier is needed here to give RELEASE
- * semantics to the below SetPageChecked() operation.
- */
- smp_wmb();
- SetPageChecked(hpage);
- verified = false;
- }
- spin_unlock(&vi->hash_page_init_lock);
- return verified;
+
+ clear_bit(hblock_idx, vi->hash_block_verified);
+ return false;
}
/*
@@ -179,7 +128,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
goto error;
}
haddr = kmap_local_page(hpage) + hblock_offset_in_page;
- if (is_hash_block_verified(vi, hpage, hblock_idx)) {
+ if (is_hash_block_verified(vi, hblock_idx, PageChecked(hpage))) {
memcpy(_want_hash, haddr + hoffset, hsize);
want_hash = _want_hash;
kunmap_local(haddr);
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-06 18:49 ` [PATCH v3 07/28] fsverity: always use bitmap to track verified status Andrey Albershteyn
@ 2023-10-11 3:15 ` Eric Biggers
2023-10-11 13:03 ` Andrey Albershteyn
0 siblings, 1 reply; 74+ messages in thread
From: Eric Biggers @ 2023-10-11 3:15 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On Fri, Oct 06, 2023 at 08:49:01PM +0200, Andrey Albershteyn wrote:
> The bitmap is used to track verified status of the Merkle tree
> blocks which are smaller than a PAGE. Blocks which fits exactly in a
> page - use PageChecked() for tracking "verified" status.
>
> This patch switches to always use bitmap to track verified status.
> This is needed to move fs-verity away from page management and work
> only with Merkle tree blocks.
How complicated would it be to keep supporting using the page bit when
merkle_tree_block_size == page_size and the filesystem supports it? It's an
efficient solution, so it would be a shame to lose it. Also it doesn't have the
max file size limit that the bitmap has.
> Also, this patch removes spinlock. The lock was used to reset bits
> in bitmap belonging to one page. This patch works only with one
> Merkle tree block and won't reset other blocks status.
The spinlock is needed when there are multiple Merkle tree blocks per page and
the filesystem is using the page-based caching. So I don't think you can remove
it. Can you elaborate on why you feel it can be removed?
> /*
> - * Returns true if the hash block with index @hblock_idx in the tree, located in
> - * @hpage, has already been verified.
> + * Returns true if the hash block with index @hblock_idx in the tree has
> + * already been verified.
> */
> -static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
> - unsigned long hblock_idx)
> +static bool is_hash_block_verified(struct fsverity_info *vi,
> + unsigned long hblock_idx,
> + bool block_cached)
> {
> - bool verified;
> - unsigned int blocks_per_page;
> - unsigned int i;
> -
> - /*
> - * When the Merkle tree block size and page size are the same, then the
> - * ->hash_block_verified bitmap isn't allocated, and we use PG_checked
> - * to directly indicate whether the page's block has been verified.
> - *
> - * Using PG_checked also guarantees that we re-verify hash pages that
> - * get evicted and re-instantiated from the backing storage, as new
> - * pages always start out with PG_checked cleared.
> - */
> - if (!vi->hash_block_verified)
> - return PageChecked(hpage);
> -
> - /*
> - * When the Merkle tree block size and page size differ, we use a bitmap
> - * to indicate whether each hash block has been verified.
> - *
> - * However, we still need to ensure that hash pages that get evicted and
> - * re-instantiated from the backing storage are re-verified. To do
> - * this, we use PG_checked again, but now it doesn't really mean
> - * "checked". Instead, now it just serves as an indicator for whether
> - * the hash page is newly instantiated or not.
> - *
> - * The first thread that sees PG_checked=0 must clear the corresponding
> - * bitmap bits, then set PG_checked=1. This requires a spinlock. To
> - * avoid having to take this spinlock in the common case of
> - * PG_checked=1, we start with an opportunistic lockless read.
> - */
Note that the above comment explains why the spinlock is needed.
> - if (PageChecked(hpage)) {
> - /*
> - * A read memory barrier is needed here to give ACQUIRE
> - * semantics to the above PageChecked() test.
> - */
> - smp_rmb();
> + if (block_cached)
> return test_bit(hblock_idx, vi->hash_block_verified);
It's not clear what block_cached is supposed to mean here.
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-11 3:15 ` Eric Biggers
@ 2023-10-11 13:03 ` Andrey Albershteyn
2023-10-12 7:27 ` Eric Biggers
0 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-11 13:03 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On 2023-10-10 20:15:43, Eric Biggers wrote:
> On Fri, Oct 06, 2023 at 08:49:01PM +0200, Andrey Albershteyn wrote:
> > The bitmap is used to track verified status of the Merkle tree
> > blocks which are smaller than a PAGE. Blocks which fits exactly in a
> > page - use PageChecked() for tracking "verified" status.
> >
> > This patch switches to always use bitmap to track verified status.
> > This is needed to move fs-verity away from page management and work
> > only with Merkle tree blocks.
>
> How complicated would it be to keep supporting using the page bit when
> merkle_tree_block_size == page_size and the filesystem supports it? It's an
> efficient solution, so it would be a shame to lose it. Also it doesn't have the
> max file size limit that the bitmap has.
Well, I think it's possible but my motivation was to step away from
page manipulation as much as possible with intent to not affect other
filesystems too much. I can probably add handling of this case to
fsverity_read_merkle_tree_block() but fs/verity still will create
bitmap and have a limit. The other way is basically revert changes
done in patch 09, then, it probably will be quite a mix of page/block
handling in fs/verity/verify.c
> > Also, this patch removes spinlock. The lock was used to reset bits
> > in bitmap belonging to one page. This patch works only with one
> > Merkle tree block and won't reset other blocks status.
>
> The spinlock is needed when there are multiple Merkle tree blocks per page and
> the filesystem is using the page-based caching. So I don't think you can remove
> it. Can you elaborate on why you feel it can be removed?
With this patch, is_hash_block_verified() no longer resets the bits of
the other blocks belonging to the same page. Even if the page is
re-instantiated, only one block is checked in this case. The other
blocks are reset when they are checked in turn.
if (block_cached)
return test_bit(hblock_idx, vi->hash_block_verified);
> > - if (PageChecked(hpage)) {
> > - /*
> > - * A read memory barrier is needed here to give ACQUIRE
> > - * semantics to the above PageChecked() test.
> > - */
> > - smp_rmb();
> > + if (block_cached)
> > return test_bit(hblock_idx, vi->hash_block_verified);
>
> It's not clear what block_cached is supposed to mean here.
It tells whether the block was already in the cache or was
re-instantiated. I can try to come up with a better name.
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-11 13:03 ` Andrey Albershteyn
@ 2023-10-12 7:27 ` Eric Biggers
2023-10-13 3:12 ` Darrick J. Wong
` (2 more replies)
0 siblings, 3 replies; 74+ messages in thread
From: Eric Biggers @ 2023-10-12 7:27 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On Wed, Oct 11, 2023 at 03:03:55PM +0200, Andrey Albershteyn wrote:
> > How complicated would it be to keep supporting using the page bit when
> > merkle_tree_block_size == page_size and the filesystem supports it? It's an
> > efficient solution, so it would be a shame to lose it. Also it doesn't have the
> > max file size limit that the bitmap has.
>
> Well, I think it's possible but my motivation was to step away from
> page manipulation as much as possible with intent to not affect other
> filesystems too much. I can probably add handling of this case to
> fsverity_read_merkle_tree_block() but fs/verity still will create
> bitmap and have a limit. The other way is basically revert changes
> done in patch 09, then, it probably will be quite a mix of page/block
> handling in fs/verity/verify.c
The page-based caching still has to be supported anyway, since that's what the
other filesystems that support fsverity use, and it seems you don't plan to
change that. The question is where the "block verified" flags should be stored.
Currently there are two options: PG_checked and the separate bitmap. I'm not
yet convinced that removing the support for the PG_checked method is a good
change. PG_checked is a nice solution for the cases where it can be used; it
requires no extra memory, no locking, and has no max file size. Also, this
change seems mostly orthogonal to what you're actually trying to accomplish.
> > > Also, this patch removes spinlock. The lock was used to reset bits
> > > in bitmap belonging to one page. This patch works only with one
> > > Merkle tree block and won't reset other blocks status.
> >
> > The spinlock is needed when there are multiple Merkle tree blocks per page and
> > the filesystem is using the page-based caching. So I don't think you can remove
> > it. Can you elaborate on why you feel it can be removed?
>
> With this patch is_hash_block_verified() doesn't reset bits for
> blocks belonging to the same page. Even if page is re-instantiated
> only one block is checked in this case. So, when other blocks are
> checked they are reset.
>
> if (block_cached)
> return test_bit(hblock_idx, vi->hash_block_verified);
When part of the Merkle tree cache is evicted and re-instantiated, whether that
part is a "page" or something else, the verified status of all the blocks
contained in that part need to be invalidated so that they get re-verified. The
existing code does that. I don't see how your proposed code does that.
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-12 7:27 ` Eric Biggers
@ 2023-10-13 3:12 ` Darrick J. Wong
2023-10-17 4:58 ` Eric Biggers
2023-10-17 6:01 ` Christoph Hellwig
2023-10-16 11:52 ` Andrey Albershteyn
2023-10-17 5:57 ` Christoph Hellwig
2 siblings, 2 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-13 3:12 UTC (permalink / raw)
To: Eric Biggers
Cc: Andrey Albershteyn, linux-xfs, linux-fsdevel, fsverity, david, dchinner
Hi Eric,
[Please excuse my ignorance, this is only the third time I've dived
into fsverity.]
On Thu, Oct 12, 2023 at 12:27:46AM -0700, Eric Biggers wrote:
> On Wed, Oct 11, 2023 at 03:03:55PM +0200, Andrey Albershteyn wrote:
> > > How complicated would it be to keep supporting using the page bit when
> > > merkle_tree_block_size == page_size and the filesystem supports it? It's an
> > > efficient solution, so it would be a shame to lose it. Also it doesn't have the
> > > max file size limit that the bitmap has.
How complex would it be to get rid of the bitmap entirely, and validate
all the verity tree blocks within a page all at once instead of
individual blocks within a page?
Assuming willy isn't grinding his axe to get rid of PG_checked,
obviously. ;)
> > Well, I think it's possible but my motivation was to step away from
> > page manipulation as much as possible with intent to not affect other
> > filesystems too much. I can probably add handling of this case to
> > fsverity_read_merkle_tree_block() but fs/verity still will create
> > bitmap and have a limit. The other way is basically revert changes
> > done in patch 09, then, it probably will be quite a mix of page/block
> > handling in fs/verity/verify.c
>
> The page-based caching still has to be supported anyway, since that's what the
> other filesystems that support fsverity use, and it seems you don't plan to
> change that.
I frankly have been asking myself why /this/ patchset adds so much extra
code and flags and whatnot to XFS and fs/verity. From what I can tell,
the xfs buffer cache has been extended to allocate double the memory so
that xattr contents can be shadowed. getxattr for merkle tree contents
then pins the buffer, shadows the contents, and hands both back to the
caller (aka xfs_read_merkle_tree_block). The shadow memory is then
handed to fs/verity to do its magic; following that, fsverity releases
the reference and we can eventually drop the xfs_buf reference.
But this seems way overcomplicated to me. ->read_merkle_tree_page hands
us a pgoff_t and a suggestion for page readahead, and wants us to return
an uptodate locked page, right?
Why can't xfs allocate a page, walk the requested range to fill the page
with merkle tree blocks that were written to the xattr structure (or
zero the page contents if there is no xattr), and hand that page to
fsverity? (It helps to provide the merkle tree block size to
xfs_read_merkle_tree_page, thanks for adding that).
Assuming fsverity also wants some caching, we could augment the
xfs_inode to point to a separate address_space for cached merkle tree
pages, and then xfs_read_merkle_tree_page can use __filemap_get_folio to
find uptodate cached pages, or instantiate one and make it uptodate.
Even better, we can pretty easily use multipage folios for this, though
AFAICT the fs/verity code isn't yet up to handling that.
The only thing I can't quite figure out is how to get memory reclaim to
scan the extra address_space when it wants to try to reclaim pages.
That part ext4 and f2fs got for free because they stuffed the merkle
tree in the posteof space.
But wouldn't that solve /all/ the plumbing problems without scattering
bits of new code and flags into the xfs buffer cache, the extended
attributes code, and elsewhere? And then xfs would not need to burn up
vmap space to allocate 8K memory blocks just to provide 4k Merkle tree
blocks to fs/verity.
That's just my 2 cents from spending a couple of hours hypothesizing how
I would fill out the fsverity_operations.
> The question is where the "block verified" flags should be stored.
> Currently there are two options: PG_checked and the separate bitmap. I'm not
> yet convinced that removing the support for the PG_checked method is a good
> change. PG_checked is a nice solution for the cases where it can be used; it
> requires no extra memory, no locking, and has no max file size. Also, this
> change seems mostly orthogonal to what you're actually trying to accomplish.
That scheme sounds way better to me than the bitmap. ;)
--D
> > > > Also, this patch removes spinlock. The lock was used to reset bits
> > > > in bitmap belonging to one page. This patch works only with one
> > > > Merkle tree block and won't reset other blocks status.
> > >
> > > The spinlock is needed when there are multiple Merkle tree blocks per page and
> > > the filesystem is using the page-based caching. So I don't think you can remove
> > > it. Can you elaborate on why you feel it can be removed?
> >
> > With this patch is_hash_block_verified() doesn't reset bits for
> > blocks belonging to the same page. Even if page is re-instantiated
> > only one block is checked in this case. So, when other blocks are
> > checked they are reset.
> >
> > if (block_cached)
> > return test_bit(hblock_idx, vi->hash_block_verified);
>
> When part of the Merkle tree cache is evicted and re-instantiated, whether that
> part is a "page" or something else, the verified status of all the blocks
> contained in that part need to be invalidated so that they get re-verified. The
> existing code does that. I don't see how your proposed code does that.
>
> - Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-13 3:12 ` Darrick J. Wong
@ 2023-10-17 4:58 ` Eric Biggers
2023-10-18 2:35 ` Darrick J. Wong
2023-10-17 6:01 ` Christoph Hellwig
1 sibling, 1 reply; 74+ messages in thread
From: Eric Biggers @ 2023-10-17 4:58 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Andrey Albershteyn, linux-xfs, linux-fsdevel, fsverity, david, dchinner
On Thu, Oct 12, 2023 at 08:12:09PM -0700, Darrick J. Wong wrote:
> Hi Eric,
>
> [Please excuse my ignorance, this is only the third time I've dived
> into fsverity.]
>
> On Thu, Oct 12, 2023 at 12:27:46AM -0700, Eric Biggers wrote:
> > On Wed, Oct 11, 2023 at 03:03:55PM +0200, Andrey Albershteyn wrote:
> > > > How complicated would it be to keep supporting using the page bit when
> > > > merkle_tree_block_size == page_size and the filesystem supports it? It's an
> > > > efficient solution, so it would be a shame to lose it. Also it doesn't have the
> > > > max file size limit that the bitmap has.
>
> How complex would it be to get rid of the bitmap entirely, and validate
> all the verity tree blocks within a page all at once instead of
> individual blocks within a page?
>
> Assuming willy isn't grinding his axe to get rid of PGchecked,
> obviously. ;)
See what I wrote earlier at
https://lore.kernel.org/linux-xfs/Y5ltzp6yeMo1oDSk@sol.localdomain.
Basically it would increase the worst-case latency by a lot.
>
> > > Well, I think it's possible but my motivation was to step away from
> > > page manipulation as much as possible with intent to not affect other
> > > filesystems too much. I can probably add handling of this case to
> > > fsverity_read_merkle_tree_block() but fs/verity still will create
> > > bitmap and have a limit. The other way is basically revert changes
> > > done in patch 09, then, it probably will be quite a mix of page/block
> > > handling in fs/verity/verify.c
> >
> > The page-based caching still has to be supported anyway, since that's what the
> > other filesystems that support fsverity use, and it seems you don't plan to
> > change that.
>
> I frankly have been asking myself why /this/ patchset adds so much extra
> code and flags and whatnot to XFS and fs/verity. From what I can tell,
> the xfs buffer cache has been extended to allocate double the memory so
> that xattr contents can be shadowed. getxattr for merkle tree contents
> then pins the buffer, shadows the contents, and hands both back to the
> caller (aka xfs_read_merkle_tree_block). The shadow memory is then
> handed to fs/verity to do its magic; following that, fsverity releases
> the reference and we can eventually drop the xfs_buf reference.
>
> But this seems way overcomplicated to me. ->read_merkle_tree_page hands
> us a pgoff_t and a suggestion for page readahead, and wants us to return
> an uptodate locked page, right?
>
> Why can't xfs allocate a page, walk the requested range to fill the page
> with merkle tree blocks that were written to the xattr structure (or
> zero the page contents if there is no xattr), and hand that page to
> fsverity? (It helps to provide the merkle tree block size to
> xfs_read_merkle_tree_page, thanks for adding that).
Earlier versions of this patchset did that. But, it's only really feasible if
the pages are actually cached. Otherwise it's very inefficient and can result
in random ENOMEM.
> Assuming fsverity also wants some caching, we could augment the
> xfs_inode to point to a separate address_space for cached merkle tree
> pages, and then xfs_read_merkle_tree_page can use __filemap_get_folio to
> find uptodate cached pages, or instantiate one and make it uptodate.
> Even better, we can pretty easily use multipage folios for this, though
> AFAICT the fs/verity code isn't yet up to handling that.
>
> The only thing I can't quite figure out is how to get memory reclaim to
> scan the extra address_space when it wants to try to reclaim pages.
> That part ext4 and f2fs got for free because they stuffed the merkle
> tree in the posteof space.
>
> But wouldn't that solve /all/ the plumbing problems without scattering
> bits of new code and flags into the xfs buffer cache, the extended
> attributes code, and elsewhere? And then xfs would not need to burn up
> vmap space to allocate 8K memory blocks just to provide 4k merkel tree
> blocks to fs/verity.
>
> That's just my 2 cents from spending a couple of hours hypothesizing how
> I would fill out the fsverity_operations.
That might work. I'm not sure about the details though, e.g. can mapping->host
point to the file's inode or would it need to be a fake one.
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-17 4:58 ` Eric Biggers
@ 2023-10-18 2:35 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-18 2:35 UTC (permalink / raw)
To: Eric Biggers
Cc: Andrey Albershteyn, linux-xfs, linux-fsdevel, fsverity, david, dchinner
On Mon, Oct 16, 2023 at 09:58:34PM -0700, Eric Biggers wrote:
> On Thu, Oct 12, 2023 at 08:12:09PM -0700, Darrick J. Wong wrote:
> > Hi Eric,
> >
> > [Please excuse my ignorance, this is only the third time I've dived
> > into fsverity.]
> >
> > On Thu, Oct 12, 2023 at 12:27:46AM -0700, Eric Biggers wrote:
> > > On Wed, Oct 11, 2023 at 03:03:55PM +0200, Andrey Albershteyn wrote:
> > > > > How complicated would it be to keep supporting using the page bit when
> > > > > merkle_tree_block_size == page_size and the filesystem supports it? It's an
> > > > > efficient solution, so it would be a shame to lose it. Also it doesn't have the
> > > > > max file size limit that the bitmap has.
> >
> > How complex would it be to get rid of the bitmap entirely, and validate
> > all the verity tree blocks within a page all at once instead of
> > individual blocks within a page?
> >
> > Assuming willy isn't grinding his axe to get rid of PGchecked,
> > obviously. ;)
>
> See what I wrote earlier at
> https://lore.kernel.org/linux-xfs/Y5ltzp6yeMo1oDSk@sol.localdomain.
> Basically it would increase the worst-case latency by a lot.
Ahh. Yeah, ok, got it, you don't necessarily want to prefault a bunch
more merkle tree blocks all at once just to read a single byte. :)
> >
> > > > Well, I think it's possible but my motivation was to step away from
> > > > page manipulation as much as possible with intent to not affect other
> > > > filesystems too much. I can probably add handling of this case to
> > > > fsverity_read_merkle_tree_block() but fs/verity still will create
> > > > bitmap and have a limit. The other way is basically revert changes
> > > > done in patch 09, then, it probably will be quite a mix of page/block
> > > > handling in fs/verity/verify.c
> > >
> > > The page-based caching still has to be supported anyway, since that's what the
> > > other filesystems that support fsverity use, and it seems you don't plan to
> > > change that.
> >
> > I frankly have been asking myself why /this/ patchset adds so much extra
> > code and flags and whatnot to XFS and fs/verity. From what I can tell,
> > the xfs buffer cache has been extended to allocate double the memory so
> > that xattr contents can be shadowed. getxattr for merkle tree contents
> > then pins the buffer, shadows the contents, and hands both back to the
> > caller (aka xfs_read_merkle_tree_block). The shadow memory is then
> > handed to fs/verity to do its magic; following that, fsverity releases
> > the reference and we can eventually drop the xfs_buf reference.
> >
> > But this seems way overcomplicated to me. ->read_merkle_tree_page hands
> > us a pgoff_t and a suggestion for page readahead, and wants us to return
> > an uptodate locked page, right?
> >
> > Why can't xfs allocate a page, walk the requested range to fill the page
> > with merkle tree blocks that were written to the xattr structure (or
> > zero the page contents if there is no xattr), and hand that page to
> > fsverity? (It helps to provide the merkle tree block size to
> > xfs_read_merkle_tree_page, thanks for adding that).
>
> Earlier versions of this patchset did that. But, it's only really feasible if
> the pages are actually cached. Otherwise it's very inefficient and can result
> in random ENOMEM.
>
> > Assuming fsverity also wants some caching, we could augment the
> > xfs_inode to point to a separate address_space for cached merkle tree
> > pages, and then xfs_read_merkle_tree_page can use __filemap_get_folio to
> > find uptodate cached pages, or instantiate one and make it uptodate.
> > Even better, we can pretty easily use multipage folios for this, though
> > AFAICT the fs/verity code isn't yet up to handling that.
> >
> > The only thing I can't quite figure out is how to get memory reclaim to
> > scan the extra address_space when it wants to try to reclaim pages.
> > That part ext4 and f2fs got for free because they stuffed the merkle
> > tree in the posteof space.
> >
> > But wouldn't that solve /all/ the plumbing problems without scattering
> > bits of new code and flags into the xfs buffer cache, the extended
> > attributes code, and elsewhere? And then xfs would not need to burn up
> > vmap space to allocate 8K memory blocks just to provide 4k merkel tree
> > blocks to fs/verity.
> >
> > That's just my 2 cents from spending a couple of hours hypothesizing how
> > I would fill out the fsverity_operations.
>
> That might work. I'm not sure about the details though, e.g. can mapping->host
> point to the file's inode or would it need to be a fake one.
Me neither. I /think/ invalidate_mapping_pages would do the job, though
one would probably want to have a custom shrinker so we could find out
just how many pages are desired by reclaim, and then try to invalidate
only that many pages. Not sure what happens if there are multiple
mappings whose ->host pointers point to the same inode.
Alternately I wonder if we could make a tmpfs file that would evict live
contents instead of writing them to the paging file... ;)
--D
>
> - Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-13 3:12 ` Darrick J. Wong
2023-10-17 4:58 ` Eric Biggers
@ 2023-10-17 6:01 ` Christoph Hellwig
1 sibling, 0 replies; 74+ messages in thread
From: Christoph Hellwig @ 2023-10-17 6:01 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Eric Biggers, Andrey Albershteyn, linux-xfs, linux-fsdevel,
fsverity, david, dchinner
On Thu, Oct 12, 2023 at 08:12:09PM -0700, Darrick J. Wong wrote:
> I frankly have been asking myself why /this/ patchset adds so much extra
> code and flags and whatnot to XFS and fs/verity. From what I can tell,
> the xfs buffer cache has been extended to allocate double the memory so
> that xattr contents can be shadowed. getxattr for merkle tree contents
> then pins the buffer, shadows the contents, and hands both back to the
> caller (aka xfs_read_merkle_tree_block). The shadow memory is then
> handed to fs/verity to do its magic; following that, fsverity releases
> the reference and we can eventually drop the xfs_buf reference.
>
> But this seems way overcomplicated to me. ->read_merkle_tree_page hands
> us a pgoff_t and a suggestion for page readahead, and wants us to return
> an uptodate locked page, right?
It does. That being said, I very much prefer the block-based
interface from Andrey. It is a lot cleaner and free of weird page
cache internals, although it can still be implemented very nicely
by file systems that store the tree in the page cache.
> The only thing I can't quite figure out is how to get memory reclaim to
> scan the extra address_space when it wants to try to reclaim pages.
> That part ext4 and f2fs got for free because they stuffed the merkle
> tree in the posteof space.
Except for the magic swapper_spaces, an address_space without an
inode is not a thing, so you need to allocate an extra inode anyway,
which is what reclaim works on.
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-12 7:27 ` Eric Biggers
2023-10-13 3:12 ` Darrick J. Wong
@ 2023-10-16 11:52 ` Andrey Albershteyn
2023-10-17 5:57 ` Christoph Hellwig
2 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-16 11:52 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On 2023-10-12 00:27:46, Eric Biggers wrote:
> On Wed, Oct 11, 2023 at 03:03:55PM +0200, Andrey Albershteyn wrote:
> > > How complicated would it be to keep supporting using the page bit when
> > > merkle_tree_block_size == page_size and the filesystem supports it? It's an
> > > efficient solution, so it would be a shame to lose it. Also it doesn't have the
> > > max file size limit that the bitmap has.
> >
> > Well, I think it's possible but my motivation was to step away from
> > page manipulation as much as possible with intent to not affect other
> > filesystems too much. I can probably add handling of this case to
> > fsverity_read_merkle_tree_block() but fs/verity still will create
> > bitmap and have a limit. The other way is basically revert changes
> > done in patch 09, then, it probably will be quite a mix of page/block
> > handling in fs/verity/verify.c
>
> The page-based caching still has to be supported anyway, since that's what the
> other filesystems that support fsverity use, and it seems you don't plan to
> change that. The question is where the "block verified" flags should be stored.
> Currently there are two options: PG_checked and the separate bitmap. I'm not
> yet convinced that removing the support for the PG_checked method is a good
> change. PG_checked is a nice solution for the cases where it can be used; it
> requires no extra memory, no locking, and has no max file size. Also, this
> change seems mostly orthogonal to what you're actually trying to accomplish.
Yes, I was trying to combine PG_checked and the bitmap in a way that
can also be used by XFS. I will try to change this patch so that it
does not drop the merkle_tree_block_size == page_size case.
> > > > Also, this patch removes spinlock. The lock was used to reset bits
> > > > in bitmap belonging to one page. This patch works only with one
> > > > Merkle tree block and won't reset other blocks status.
> > >
> > > The spinlock is needed when there are multiple Merkle tree blocks per page and
> > > the filesystem is using the page-based caching. So I don't think you can remove
> > > it. Can you elaborate on why you feel it can be removed?
> >
> > With this patch is_hash_block_verified() doesn't reset bits for
> > blocks belonging to the same page. Even if page is re-instantiated
> > only one block is checked in this case. So, when other blocks are
> > checked they are reset.
> >
> > if (block_cached)
> > return test_bit(hblock_idx, vi->hash_block_verified);
>
> When part of the Merkle tree cache is evicted and re-instantiated, whether that
> part is a "page" or something else, the verified status of all the blocks
> contained in that part need to be invalidated so that they get re-verified. The
> existing code does that. I don't see how your proposed code does that.
Oh, I see the problem now, I will revert it back.
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-12 7:27 ` Eric Biggers
2023-10-13 3:12 ` Darrick J. Wong
2023-10-16 11:52 ` Andrey Albershteyn
@ 2023-10-17 5:57 ` Christoph Hellwig
2023-10-17 17:49 ` Eric Biggers
2 siblings, 1 reply; 74+ messages in thread
From: Christoph Hellwig @ 2023-10-17 5:57 UTC (permalink / raw)
To: Eric Biggers
Cc: Andrey Albershteyn, linux-xfs, linux-fsdevel, fsverity, djwong,
david, dchinner, willy
On Thu, Oct 12, 2023 at 12:27:46AM -0700, Eric Biggers wrote:
> Currently there are two options: PG_checked and the separate bitmap. I'm not
> yet convinced that removing the support for the PG_checked method is a good
> change. PG_checked is a nice solution for the cases where it can be used; it
> requires no extra memory, no locking, and has no max file size. Also, this
> change seems mostly orthogonal to what you're actually trying to accomplish.
Given that willy has been on a (IMHO reasonable) quest to kill off
as many page flags as possible, I'd really like to seize the opportunity
and kill PageChecked in fsverity. How big are the downsides of the bitmap
vs using the page flags, and do they matter in practice?
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 07/28] fsverity: always use bitmap to track verified status
2023-10-17 5:57 ` Christoph Hellwig
@ 2023-10-17 17:49 ` Eric Biggers
0 siblings, 0 replies; 74+ messages in thread
From: Eric Biggers @ 2023-10-17 17:49 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Andrey Albershteyn, linux-xfs, linux-fsdevel, fsverity, djwong,
david, dchinner, willy
On Mon, Oct 16, 2023 at 10:57:45PM -0700, Christoph Hellwig wrote:
> On Thu, Oct 12, 2023 at 12:27:46AM -0700, Eric Biggers wrote:
> > Currently there are two options: PG_checked and the separate bitmap. I'm not
> > yet convinced that removing the support for the PG_checked method is a good
> > change. PG_checked is a nice solution for the cases where it can be used; it
> > requires no extra memory, no locking, and has no max file size. Also, this
> > change seems mostly orthogonal to what you're actually trying to accomplish.
>
> Given that willy has been on a (IMHO reasonable) quest to kill off
> as many as possible page flags I'd really like to seize the opportunity
> and kill PageCheck in fsverity. How big are the downsides of the bitmap
> vs using the page flags, and do they matter in practice?
>
Currently PG_checked is used even with the bitmap-based approach, as a way to
ensure that hash pages that get evicted and re-instantiated from the backing
storage are re-verified. See is_hash_block_verified() in fs/verity/verify.c.
That would need to be replaced with something else. I'm not sure what else
would work and still be efficient.
No one has actually deployed the bitmap-based approach in production yet; it was
added in v6.3 and is only used for merkle_tree_block_size != PAGE_SIZE. But,
the performance and memory overhead is probably not significant in practice.
I'm more worried about the file size limit of the bitmap-based approach, which
is currently ~4 TB. *Probably* no one is using fsverity on files that large,
but introducing a limit for a case that wasn't supported before
(merkle_tree_block_size != PAGE_SIZE) isn't as bad as retroactively introducing
a limit for existing files that worked before and refusing to open them. Huge
files would need something other than a simple bitmap, which would add
complexity and overhead.
Note that many filesystems use PG_checked for other purposes such as keeping
track of when directory blocks have been validated. I'm not sure how feasible
it will be to free up that flag entirely.
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 08/28] fsverity: pass Merkle tree block size to ->read_merkle_tree_page()
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (6 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 07/28] fsverity: always use bitmap to track verified status Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 3:17 ` Eric Biggers
2023-10-06 18:49 ` [PATCH v3 09/28] fsverity: pass log_blocksize to end_enable_verity() Andrey Albershteyn
` (19 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
XFS will need to know the size of a Merkle tree block, as these blocks
will not be stored consecutively in fs blocks. Therefore, they cannot
be obtained in pages like in ext4. Rather, they are stored in extended
attributes, with the block's offset used as the attribute name. The
size is needed to calculate the offset.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/btrfs/verity.c | 3 ++-
fs/ext4/verity.c | 3 ++-
fs/f2fs/verity.c | 3 ++-
fs/verity/read_metadata.c | 3 ++-
fs/verity/verify.c | 3 ++-
include/linux/fsverity.h | 3 ++-
6 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 744f4f4d4c68..b39199b57a69 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -713,7 +713,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
*/
static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
- unsigned long num_ra_pages)
+ unsigned long num_ra_pages,
+ u8 log_blocksize)
{
struct folio *folio;
u64 off = (u64)index << PAGE_SHIFT;
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 2f37e1ea3955..4eb77cefdbe1 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -359,7 +359,8 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
- unsigned long num_ra_pages)
+ unsigned long num_ra_pages,
+ u8 log_blocksize)
{
struct folio *folio;
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 4fc95f353a7a..bb354ab8ca5a 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -256,7 +256,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
- unsigned long num_ra_pages)
+ unsigned long num_ra_pages,
+ u8 log_blocksize)
{
struct page *page;
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index 8bd4b29a9a95..197624cab43e 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -44,7 +44,8 @@ static int fsverity_read_merkle_tree(struct inode *inode,
struct page *page;
const void *virt;
- page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
+ page = vops->read_merkle_tree_page(inode, index, num_ra_pages,
+ vi->tree_params.log_blocksize);
if (IS_ERR(page)) {
err = PTR_ERR(page);
fsverity_err(inode,
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index e7b13d143ae9..f556336ebd8d 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -120,7 +120,8 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
hpage_idx, level == 0 ? min(max_ra_pages,
- params->tree_pages - hpage_idx) : 0);
+ params->tree_pages - hpage_idx) : 0,
+ params->log_blocksize);
if (IS_ERR(hpage)) {
fsverity_err(inode,
"Error %ld reading Merkle tree page %lu",
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 6514ed6b09b4..252b2668894c 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -103,7 +103,8 @@ struct fsverity_operations {
*/
struct page *(*read_merkle_tree_page)(struct inode *inode,
pgoff_t index,
- unsigned long num_ra_pages);
+ unsigned long num_ra_pages,
+ u8 log_blocksize);
/**
* Write a Merkle tree block to the given inode.
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 08/28] fsverity: pass Merkle tree block size to ->read_merkle_tree_page()
2023-10-06 18:49 ` [PATCH v3 08/28] fsverity: pass Merkle tree block size to ->read_merkle_tree_page() Andrey Albershteyn
@ 2023-10-11 3:17 ` Eric Biggers
2023-10-11 11:13 ` Andrey Albershteyn
0 siblings, 1 reply; 74+ messages in thread
From: Eric Biggers @ 2023-10-11 3:17 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On Fri, Oct 06, 2023 at 08:49:02PM +0200, Andrey Albershteyn wrote:
> diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
> index 6514ed6b09b4..252b2668894c 100644
> --- a/include/linux/fsverity.h
> +++ b/include/linux/fsverity.h
> @@ -103,7 +103,8 @@ struct fsverity_operations {
> */
> struct page *(*read_merkle_tree_page)(struct inode *inode,
> pgoff_t index,
> - unsigned long num_ra_pages);
> + unsigned long num_ra_pages,
> + u8 log_blocksize);
XFS doesn't actually use this, though. In patch 10 you add
read_merkle_tree_block, and that is used instead.
So this patch seems unnecessary.
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 08/28] fsverity: pass Merkle tree block size to ->read_merkle_tree_page()
2023-10-11 3:17 ` Eric Biggers
@ 2023-10-11 11:13 ` Andrey Albershteyn
0 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-11 11:13 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On 2023-10-10 20:17:12, Eric Biggers wrote:
> XFS doesn't actually use this, though. In patch 10 you add
> read_merkle_tree_block, and that is used instead.
>
> So this patch seems unnecessary.
True, will drop this one.
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 09/28] fsverity: pass log_blocksize to end_enable_verity()
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (7 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 08/28] fsverity: pass Merkle tree block size to ->read_merkle_tree_page() Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 3:19 ` Eric Biggers
2023-10-06 18:49 ` [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages Andrey Albershteyn
` (18 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
XFS will need to know log_blocksize to remove the tree in case of an
error. The size is needed to calculate offsets of particular Merkle
tree blocks.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/btrfs/verity.c | 4 +++-
fs/ext4/verity.c | 3 ++-
fs/f2fs/verity.c | 3 ++-
fs/verity/enable.c | 6 ++++--
include/linux/fsverity.h | 4 +++-
5 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index b39199b57a69..2b34796f68d3 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -621,6 +621,7 @@ static int btrfs_begin_enable_verity(struct file *filp)
* @desc: verity descriptor to write out (NULL in error conditions)
* @desc_size: size of the verity descriptor (variable with signatures)
* @merkle_tree_size: size of the merkle tree in bytes
+ * @log_blocksize: log2 of the Merkle tree block size
*
* If desc is null, then VFS is signaling an error occurred during verity
* enable, and we should try to rollback. Otherwise, attempt to finish verity.
@@ -628,7 +629,8 @@ static int btrfs_begin_enable_verity(struct file *filp)
* Returns 0 on success, negative error code on error.
*/
static int btrfs_end_enable_verity(struct file *filp, const void *desc,
- size_t desc_size, u64 merkle_tree_size)
+ size_t desc_size, u64 merkle_tree_size,
+ u8 log_blocksize)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
int ret = 0;
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 4eb77cefdbe1..4e2f01f048c0 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -189,7 +189,8 @@ static int ext4_write_verity_descriptor(struct inode *inode, const void *desc,
}
static int ext4_end_enable_verity(struct file *filp, const void *desc,
- size_t desc_size, u64 merkle_tree_size)
+ size_t desc_size, u64 merkle_tree_size,
+ u8 log_blocksize)
{
struct inode *inode = file_inode(filp);
const int credits = 2; /* superblock and inode for ext4_orphan_del() */
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index bb354ab8ca5a..601ab9f0c024 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -144,7 +144,8 @@ static int f2fs_begin_enable_verity(struct file *filp)
}
static int f2fs_end_enable_verity(struct file *filp, const void *desc,
- size_t desc_size, u64 merkle_tree_size)
+ size_t desc_size, u64 merkle_tree_size,
+ u8 log_blocksize)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index c284f46d1b53..c87cab796f0b 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -274,7 +274,8 @@ static int enable_verity(struct file *filp,
* Serialized with ->begin_enable_verity() by the inode lock.
*/
inode_lock(inode);
- err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size);
+ err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size,
+ desc->log_blocksize);
inode_unlock(inode);
if (err) {
fsverity_err(inode, "%ps() failed with err %d",
@@ -300,7 +301,8 @@ static int enable_verity(struct file *filp,
rollback:
inode_lock(inode);
- (void)vops->end_enable_verity(filp, NULL, 0, params.tree_size);
+ (void)vops->end_enable_verity(filp, NULL, 0, params.tree_size,
+ desc->log_blocksize);
inode_unlock(inode);
goto out;
}
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 252b2668894c..cac012d4c86a 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -51,6 +51,7 @@ struct fsverity_operations {
* @desc: the verity descriptor to write, or NULL on failure
* @desc_size: size of verity descriptor, or 0 on failure
* @merkle_tree_size: total bytes the Merkle tree took up
+ * @log_blocksize: log2 of the Merkle tree block size
*
* If desc == NULL, then enabling verity failed and the filesystem only
* must do any necessary cleanups. Else, it must also store the given
@@ -65,7 +66,8 @@ struct fsverity_operations {
* Return: 0 on success, -errno on failure
*/
int (*end_enable_verity)(struct file *filp, const void *desc,
- size_t desc_size, u64 merkle_tree_size);
+ size_t desc_size, u64 merkle_tree_size,
+ u8 log_blocksize);
/**
* Get the verity descriptor of the given inode.
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 09/28] fsverity: pass log_blocksize to end_enable_verity()
2023-10-06 18:49 ` [PATCH v3 09/28] fsverity: pass log_blocksize to end_enable_verity() Andrey Albershteyn
@ 2023-10-11 3:19 ` Eric Biggers
2023-10-11 11:17 ` Andrey Albershteyn
0 siblings, 1 reply; 74+ messages in thread
From: Eric Biggers @ 2023-10-11 3:19 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On Fri, Oct 06, 2023 at 08:49:03PM +0200, Andrey Albershteyn wrote:
> diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
> index 252b2668894c..cac012d4c86a 100644
> --- a/include/linux/fsverity.h
> +++ b/include/linux/fsverity.h
> @@ -51,6 +51,7 @@ struct fsverity_operations {
> * @desc: the verity descriptor to write, or NULL on failure
> * @desc_size: size of verity descriptor, or 0 on failure
> * @merkle_tree_size: total bytes the Merkle tree took up
> + * @log_blocksize: log size of the Merkle tree block
> *
> * If desc == NULL, then enabling verity failed and the filesystem only
> * must do any necessary cleanups. Else, it must also store the given
> @@ -65,7 +66,8 @@ struct fsverity_operations {
> * Return: 0 on success, -errno on failure
> */
> int (*end_enable_verity)(struct file *filp, const void *desc,
> - size_t desc_size, u64 merkle_tree_size);
> + size_t desc_size, u64 merkle_tree_size,
> + u8 log_blocksize);
Maybe just pass the block_size itself instead of log2(block_size)?
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 09/28] fsverity: pass log_blocksize to end_enable_verity()
2023-10-11 3:19 ` Eric Biggers
@ 2023-10-11 11:17 ` Andrey Albershteyn
2023-10-12 7:34 ` Eric Biggers
0 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-11 11:17 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On 2023-10-10 20:19:06, Eric Biggers wrote:
> On Fri, Oct 06, 2023 at 08:49:03PM +0200, Andrey Albershteyn wrote:
> > diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
> > index 252b2668894c..cac012d4c86a 100644
> > --- a/include/linux/fsverity.h
> > +++ b/include/linux/fsverity.h
> > @@ -51,6 +51,7 @@ struct fsverity_operations {
> > * @desc: the verity descriptor to write, or NULL on failure
> > * @desc_size: size of verity descriptor, or 0 on failure
> > * @merkle_tree_size: total bytes the Merkle tree took up
> > + * @log_blocksize: log size of the Merkle tree block
> > *
> > * If desc == NULL, then enabling verity failed and the filesystem only
> > * must do any necessary cleanups. Else, it must also store the given
> > @@ -65,7 +66,8 @@ struct fsverity_operations {
> > * Return: 0 on success, -errno on failure
> > */
> > int (*end_enable_verity)(struct file *filp, const void *desc,
> > - size_t desc_size, u64 merkle_tree_size);
> > + size_t desc_size, u64 merkle_tree_size,
> > + u8 log_blocksize);
>
> Maybe just pass the block_size itself instead of log2(block_size)?
XFS will still do `index << log2(block_size)` to get block's offset.
So, not sure if there's any difference.
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 09/28] fsverity: pass log_blocksize to end_enable_verity()
2023-10-11 11:17 ` Andrey Albershteyn
@ 2023-10-12 7:34 ` Eric Biggers
0 siblings, 0 replies; 74+ messages in thread
From: Eric Biggers @ 2023-10-12 7:34 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On Wed, Oct 11, 2023 at 01:17:36PM +0200, Andrey Albershteyn wrote:
> On 2023-10-10 20:19:06, Eric Biggers wrote:
> > On Fri, Oct 06, 2023 at 08:49:03PM +0200, Andrey Albershteyn wrote:
> > > diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
> > > index 252b2668894c..cac012d4c86a 100644
> > > --- a/include/linux/fsverity.h
> > > +++ b/include/linux/fsverity.h
> > > @@ -51,6 +51,7 @@ struct fsverity_operations {
> > > * @desc: the verity descriptor to write, or NULL on failure
> > > * @desc_size: size of verity descriptor, or 0 on failure
> > > * @merkle_tree_size: total bytes the Merkle tree took up
> > > + * @log_blocksize: log size of the Merkle tree block
> > > *
> > > * If desc == NULL, then enabling verity failed and the filesystem only
> > > * must do any necessary cleanups. Else, it must also store the given
> > > @@ -65,7 +66,8 @@ struct fsverity_operations {
> > > * Return: 0 on success, -errno on failure
> > > */
> > > int (*end_enable_verity)(struct file *filp, const void *desc,
> > > - size_t desc_size, u64 merkle_tree_size);
> > > + size_t desc_size, u64 merkle_tree_size,
> > > + u8 log_blocksize);
> >
> > Maybe just pass the block_size itself instead of log2(block_size)?
>
> XFS will still do `index << log2(block_size)` to get block's offset.
> So, not sure if there's any difference.
It's only used in the following:
offset = 0;
for (index = 1; offset < merkle_tree_size; index++) {
xfs_fsverity_merkle_key_to_disk(&name, offset);
args.name = (const uint8_t *)&name.merkleoff;
args.attr_filter = XFS_ATTR_VERITY;
error = xfs_attr_set(&args);
offset = index << log_blocksize;
}
... which can be the following instead:
for (offset = 0; offset < merkle_tree_size; offset += block_size) {
xfs_fsverity_merkle_key_to_disk(&name, offset);
args.name = (const uint8_t *)&name.merkleoff;
args.attr_filter = XFS_ATTR_VERITY;
error = xfs_attr_set(&args);
}
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (8 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 09/28] fsverity: pass log_blocksize to end_enable_verity() Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-07 4:02 ` kernel test robot
` (2 more replies)
2023-10-06 18:49 ` [PATCH v3 11/28] iomap: pass readpage operation to read path Andrey Albershteyn
` (17 subsequent siblings)
27 siblings, 3 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
fsverity expects the filesystem to provide pages with Merkle tree
blocks in them. Then, when fsverity is done processing the
blocks, the reference to the page is dropped. This doesn't fit well
with the way XFS manages its memory.
This patch moves page reference management out of fsverity to
filesystem. This way fsverity expects a kaddr to the Merkle tree
block and filesystem can handle all caching and reference counting.
As btrfs, ext4 and f2fs return pages containing Merkle tree blocks,
this patch also adds fsverity_read_merkle_tree_block(), which wraps the
addressing of blocks within a page (so each fs need not implement it).
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/verity/open.c | 15 ++---
fs/verity/read_metadata.c | 41 +++++++-------
fs/verity/verify.c | 58 ++++++++-----------
include/linux/fsverity.h | 115 +++++++++++++++++++++++++++++++++-----
4 files changed, 150 insertions(+), 79 deletions(-)
diff --git a/fs/verity/open.c b/fs/verity/open.c
index dfb9fe6aaae9..8665d8b40081 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -126,19 +126,16 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
}
/*
- * With block_size != PAGE_SIZE, an in-memory bitmap will need to be
- * allocated to track the "verified" status of hash blocks. Don't allow
- * this bitmap to get too large. For now, limit it to 1 MiB, which
- * limits the file size to about 4.4 TB with SHA-256 and 4K blocks.
+ * An in-memory bitmap will need to be allocated to track the "verified"
+ * status of hash blocks. Don't allow this bitmap to get too large.
+ * For now, limit it to 1 MiB, which limits the file size to
+ * about 4.4 TB with SHA-256 and 4K blocks.
*
* Together with the fact that the data, and thus also the Merkle tree,
* cannot have more than ULONG_MAX pages, this implies that hash block
- * indices can always fit in an 'unsigned long'. But to be safe, we
- * explicitly check for that too. Note, this is only for hash block
- * indices; data block indices might not fit in an 'unsigned long'.
+ * indices can always fit in an 'unsigned long'.
*/
- if ((params->block_size != PAGE_SIZE && offset > 1 << 23) ||
- offset > ULONG_MAX) {
+ if (offset > (1 << 23)) {
fsverity_err(inode, "Too many blocks in Merkle tree");
err = -EFBIG;
goto out_err;
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index 197624cab43e..182bddf5dec5 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -16,9 +16,9 @@ static int fsverity_read_merkle_tree(struct inode *inode,
const struct fsverity_info *vi,
void __user *buf, u64 offset, int length)
{
- const struct fsverity_operations *vops = inode->i_sb->s_vop;
u64 end_offset;
- unsigned int offs_in_page;
+ unsigned int offs_in_block;
+ unsigned int block_size = vi->tree_params.block_size;
pgoff_t index, last_index;
int retval = 0;
int err = 0;
@@ -26,8 +26,8 @@ static int fsverity_read_merkle_tree(struct inode *inode,
end_offset = min(offset + length, vi->tree_params.tree_size);
if (offset >= end_offset)
return 0;
- offs_in_page = offset_in_page(offset);
- last_index = (end_offset - 1) >> PAGE_SHIFT;
+ offs_in_block = offset % block_size;
+ last_index = (end_offset - 1) >> vi->tree_params.log_blocksize;
/*
* Iterate through each Merkle tree page in the requested range and copy
@@ -35,34 +35,31 @@ static int fsverity_read_merkle_tree(struct inode *inode,
* size isn't important here, as we are returning a byte stream; i.e.,
* we can just work with pages even if the tree block size != PAGE_SIZE.
*/
- for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
+ for (index = offset >> vi->tree_params.log_blocksize;
+ index <= last_index; index++) {
unsigned long num_ra_pages =
min_t(unsigned long, last_index - index + 1,
inode->i_sb->s_bdi->io_pages);
unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
- PAGE_SIZE - offs_in_page);
- struct page *page;
- const void *virt;
+ block_size - offs_in_block);
+ struct fsverity_block block;
- page = vops->read_merkle_tree_page(inode, index, num_ra_pages,
- vi->tree_params.log_blocksize);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- fsverity_err(inode,
- "Error %d reading Merkle tree page %lu",
- err, index);
+ block.len = block_size;
+ if (fsverity_read_merkle_tree_block(inode,
+ index << vi->tree_params.log_blocksize,
+ &block, num_ra_pages)) {
+ fsverity_drop_block(inode, &block);
+ err = -EFAULT;
break;
}
- virt = kmap_local_page(page);
- if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
- kunmap_local(virt);
- fsverity_drop_page(inode, page);
+ if (copy_to_user(buf, block.kaddr + offs_in_block, bytes_to_copy)) {
+ fsverity_drop_block(inode, &block);
err = -EFAULT;
break;
}
- kunmap_local(virt);
- fsverity_drop_page(inode, page);
+ fsverity_drop_block(inode, &block);
+ block.kaddr = NULL;
retval += bytes_to_copy;
buf += bytes_to_copy;
@@ -73,7 +70,7 @@ static int fsverity_read_merkle_tree(struct inode *inode,
break;
}
cond_resched();
- offs_in_page = 0;
+ offs_in_block = 0;
}
return retval ? retval : err;
}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index f556336ebd8d..dfe01f121843 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -44,15 +44,15 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int hsize = params->digest_size;
int level;
+ int err;
+ int num_ra_pages;
u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE];
const u8 *want_hash;
u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
/* The hash blocks that are traversed, indexed by level */
struct {
- /* Page containing the hash block */
- struct page *page;
- /* Mapped address of the hash block (will be within @page) */
- const void *addr;
+ /* Block containing the hash block */
+ struct fsverity_block block;
/* Index of the hash block in the tree overall */
unsigned long index;
/* Byte offset of the wanted hash relative to @addr */
@@ -93,10 +93,8 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
unsigned long next_hidx;
unsigned long hblock_idx;
pgoff_t hpage_idx;
- unsigned int hblock_offset_in_page;
unsigned int hoffset;
- struct page *hpage;
- const void *haddr;
+ struct fsverity_block *block = &hblocks[level].block;
/*
* The index of the block in the current level; also the index
@@ -110,34 +108,28 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
/* Index of the hash page in the tree overall */
hpage_idx = hblock_idx >> params->log_blocks_per_page;
- /* Byte offset of the hash block within the page */
- hblock_offset_in_page =
- (hblock_idx << params->log_blocksize) & ~PAGE_MASK;
-
/* Byte offset of the hash within the block */
hoffset = (hidx << params->log_digestsize) &
(params->block_size - 1);
- hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
- hpage_idx, level == 0 ? min(max_ra_pages,
- params->tree_pages - hpage_idx) : 0,
- params->log_blocksize);
- if (IS_ERR(hpage)) {
+ block->len = params->block_size;
+ num_ra_pages = level == 0 ?
+ min(max_ra_pages, params->tree_pages - hpage_idx) : 0;
+ err = fsverity_read_merkle_tree_block(
+ inode, hblock_idx << params->log_blocksize, block,
+ num_ra_pages);
+ if (err) {
fsverity_err(inode,
- "Error %ld reading Merkle tree page %lu",
- PTR_ERR(hpage), hpage_idx);
+ "Error %d reading Merkle tree block %lu",
+ err, hblock_idx);
goto error;
}
- haddr = kmap_local_page(hpage) + hblock_offset_in_page;
- if (is_hash_block_verified(vi, hblock_idx, PageChecked(hpage))) {
- memcpy(_want_hash, haddr + hoffset, hsize);
+ if (is_hash_block_verified(vi, hblock_idx, block->cached)) {
+ memcpy(_want_hash, block->kaddr + hoffset, hsize);
want_hash = _want_hash;
- kunmap_local(haddr);
- fsverity_drop_page(inode, hpage);
+ fsverity_drop_block(inode, block);
goto descend;
}
- hblocks[level].page = hpage;
- hblocks[level].addr = haddr;
hblocks[level].index = hblock_idx;
hblocks[level].hoffset = hoffset;
hidx = next_hidx;
@@ -147,8 +139,8 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
descend:
/* Descend the tree verifying hash blocks. */
for (; level > 0; level--) {
- struct page *hpage = hblocks[level - 1].page;
- const void *haddr = hblocks[level - 1].addr;
+ struct fsverity_block *block = &hblocks[level - 1].block;
+ const void *haddr = block->kaddr;
unsigned long hblock_idx = hblocks[level - 1].index;
unsigned int hoffset = hblocks[level - 1].hoffset;
@@ -161,14 +153,11 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
* idempotent, as the same hash block might be verified by
* multiple threads concurrently.
*/
- if (vi->hash_block_verified)
- set_bit(hblock_idx, vi->hash_block_verified);
- else
- SetPageChecked(hpage);
+ set_bit(hblock_idx, vi->hash_block_verified);
+ block->verified = true;
memcpy(_want_hash, haddr + hoffset, hsize);
want_hash = _want_hash;
- kunmap_local(haddr);
- fsverity_drop_page(inode, hpage);
+ fsverity_drop_block(inode, block);
}
/* Finally, verify the data block. */
@@ -186,8 +175,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
params->hash_alg->name, hsize, real_hash);
error:
for (; level > 0; level--) {
- kunmap_local(hblocks[level - 1].addr);
- fsverity_drop_page(inode, hblocks[level - 1].page);
+ fsverity_drop_block(inode, &hblocks[level - 1].block);
}
return false;
}
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index cac012d4c86a..ce37a430bc97 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -26,6 +26,24 @@
/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
+/**
+ * struct fsverity_block - Merkle Tree block
+ * @kaddr: virtual address of the block's data
+ * @len: length of the data
+ * @cached: true if block was already in cache, false otherwise
+ * @verified: true if block is verified against Merkle tree
+ * @context: filesystem private context
+ *
+ * Merkle Tree blocks passed and requested from filesystem
+ */
+struct fsverity_block {
+ void *kaddr;
+ unsigned int len;
+ bool cached;
+ bool verified;
+ void *context;
+};
+
/* Verity operations for filesystems */
struct fsverity_operations {
@@ -107,6 +125,24 @@ struct fsverity_operations {
pgoff_t index,
unsigned long num_ra_pages,
u8 log_blocksize);
+ /**
+ * Read a Merkle tree block of the given inode.
+ * @inode: the inode
+ * @index: 0-based index of the block within the Merkle tree
+ * @num_ra_pages: The number of pages with blocks that should be
+ * prefetched starting at @index if the page at @index
+ * isn't already cached. Implementations may ignore this
+ * argument; it's only a performance optimization.
+ *
+ * This can be called at any time on an open verity file. It may be
+ * called by multiple processes concurrently.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+ int (*read_merkle_tree_block)(struct inode *inode,
+ unsigned int index,
+ struct fsverity_block *block,
+ unsigned long num_ra_pages);
/**
* Write a Merkle tree block to the given inode.
@@ -125,14 +161,14 @@ struct fsverity_operations {
u64 pos, unsigned int size);
/**
- * Release the reference to a Merkle tree page
+ * Release the reference to a Merkle tree block
*
- * @page: the page to release
+ * @block: the block to release
*
- * This is called when fs-verity is done with a page obtained with
- * ->read_merkle_tree_page().
+ * This is called when fs-verity is done with a block obtained with
+ * ->read_merkle_tree_block().
*/
- void (*drop_page)(struct page *page);
+ void (*drop_block)(struct fsverity_block *block);
};
#ifdef CONFIG_FS_VERITY
@@ -188,22 +224,66 @@ void fsverity_verify_bio(struct bio *bio);
void fsverity_enqueue_verify_work(struct work_struct *work);
/**
- * fsverity_drop_page() - drop page obtained with ->read_merkle_tree_page()
+ * fsverity_drop_block() - drop block obtained with ->read_merkle_tree_block()
* @inode: inode in use for verification or metadata reading
- * @page: page to be dropped
+ * @block: block to be dropped
*
- * Generic put_page() method. Calls out back to filesystem if ->drop_page() is
- * set, otherwise just drops the reference to a page.
+ * Generic drop method. Calls back into the filesystem if ->drop_block() is set;
+ * otherwise sets PageChecked on a verified block's page and puts the page.
*
*/
-static inline void fsverity_drop_page(struct inode *inode, struct page *page)
+static inline void fsverity_drop_block(struct inode *inode,
+ struct fsverity_block *block)
{
- if (inode->i_sb->s_vop->drop_page)
- inode->i_sb->s_vop->drop_page(page);
- else
+ if (inode->i_sb->s_vop->drop_block)
+ inode->i_sb->s_vop->drop_block(block);
+ else {
+ struct page *page = (struct page *)block->context;
+
+ if (block->verified)
+ SetPageChecked(page);
+
put_page(page);
+ }
}
+/**
+ * fsverity_read_merkle_tree_block() - layer between fs using read page
+ * and read block
+ * @inode: inode in use for verification or metadata reading
+ * @index: index of the block in the tree (offset into the tree)
+ * @block: block to be read
+ * @num_ra_pages: number of pages to readahead, may be ignored
+ *
+ * Depending on fs implementation use read_merkle_tree_block or
+ * read_merkle_tree_page.
+ */
+static inline int fsverity_read_merkle_tree_block(struct inode *inode,
+ unsigned int index,
+ struct fsverity_block *block,
+ unsigned long num_ra_pages)
+{
+ struct page *page;
+
+ if (inode->i_sb->s_vop->read_merkle_tree_block)
+ return inode->i_sb->s_vop->read_merkle_tree_block(
+ inode, index, block, num_ra_pages);
+
+ page = inode->i_sb->s_vop->read_merkle_tree_page(
+ inode, index >> PAGE_SHIFT, num_ra_pages,
+ block->len);
+
+ block->kaddr = page_address(page) + (index % PAGE_SIZE);
+ block->cached = PageChecked(page);
+ block->context = page;
+
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ else
+ return 0;
+}
+
+
#else /* !CONFIG_FS_VERITY */
@@ -287,6 +367,15 @@ static inline void fsverity_drop_page(struct inode *inode, struct page *page)
WARN_ON_ONCE(1);
}
+static inline int fsverity_read_merkle_tree_block(struct inode *inode,
+ unsigned int index,
+ struct fsverity_block *block,
+ unsigned long num_ra_pages)
+{
+ WARN_ON_ONCE(1);
+ return -EOPNOTSUPP;
+}
+
#endif /* !CONFIG_FS_VERITY */
static inline bool fsverity_verify_folio(struct folio *folio)
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages
2023-10-06 18:49 ` [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages Andrey Albershteyn
@ 2023-10-07 4:02 ` kernel test robot
2023-10-11 3:56 ` Eric Biggers
2023-10-16 13:00 ` Christoph Hellwig
2 siblings, 0 replies; 74+ messages in thread
From: kernel test robot @ 2023-10-07 4:02 UTC (permalink / raw)
To: Andrey Albershteyn, linux-xfs, linux-fsdevel, fsverity
Cc: oe-kbuild-all, djwong, ebiggers, david, dchinner, Andrey Albershteyn
Hi Andrey,
kernel test robot noticed the following build errors:
[auto build test ERROR on xfs-linux/for-next]
[also build test ERROR on kdave/for-next tytso-ext4/dev jaegeuk-f2fs/dev-test jaegeuk-f2fs/dev linus/master v6.6-rc4 next-20231006]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Andrey-Albershteyn/xfs-Add-new-name-to-attri-d/20231007-025742
base: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next
patch link: https://lore.kernel.org/r/20231006184922.252188-11-aalbersh%40redhat.com
patch subject: [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages
config: powerpc-randconfig-003-20231007 (https://download.01.org/0day-ci/archive/20231007/202310071152.hVd0qEeD-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231007/202310071152.hVd0qEeD-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310071152.hVd0qEeD-lkp@intel.com/
All errors (new ones prefixed by >>):
powerpc-linux-ld: fs/verity/read_metadata.o: in function `fsverity_read_merkle_tree':
>> read_metadata.c:(.text.fsverity_read_merkle_tree+0xa0): undefined reference to `__umoddi3'
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages
2023-10-06 18:49 ` [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages Andrey Albershteyn
2023-10-07 4:02 ` kernel test robot
@ 2023-10-11 3:56 ` Eric Biggers
2023-10-16 13:00 ` Christoph Hellwig
2 siblings, 0 replies; 74+ messages in thread
From: Eric Biggers @ 2023-10-11 3:56 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, david, dchinner
On Fri, Oct 06, 2023 at 08:49:04PM +0200, Andrey Albershteyn wrote:
> fsverity: operate with Merkle tree blocks instead of pages
Well, it already does, just not for the Merkle tree caching. A better title
might be something like "fsverity: support block-based Merkle tree caching".
> fsverity expects filesystem to provide PAGEs with Merkle tree
> blocks in it. Then, when fsverity is done with processing the
> blocks, reference to PAGE is freed. This doesn't fit well with the
> way XFS manages its memory.
BTW, I encourage using "fs/verity/" when referring specifically to the fsverity
support code in the kernel, which is located in that directory, as opposed to
the whole fsverity feature.
> This patch moves page reference management out of fsverity to
> filesystem. This way fsverity expects a kaddr to the Merkle tree
> block and filesystem can handle all caching and reference counting.
That's not done for the existing filesystems, though. Which is probably the
right choice, but it isn't what this commit message says.
> diff --git a/fs/verity/open.c b/fs/verity/open.c
> index dfb9fe6aaae9..8665d8b40081 100644
> --- a/fs/verity/open.c
> +++ b/fs/verity/open.c
> @@ -126,19 +126,16 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
> }
>
> /*
> - * With block_size != PAGE_SIZE, an in-memory bitmap will need to be
> - * allocated to track the "verified" status of hash blocks. Don't allow
> - * this bitmap to get too large. For now, limit it to 1 MiB, which
> - * limits the file size to about 4.4 TB with SHA-256 and 4K blocks.
> + * An in-memory bitmap will need to be allocated to track the "verified"
> + * status of hash blocks. Don't allow this bitmap to get too large.
> + * For now, limit it to 1 MiB, which limits the file size to
> + * about 4.4 TB with SHA-256 and 4K blocks.
> *
> * Together with the fact that the data, and thus also the Merkle tree,
> * cannot have more than ULONG_MAX pages, this implies that hash block
> - * indices can always fit in an 'unsigned long'. But to be safe, we
> - * explicitly check for that too. Note, this is only for hash block
> - * indices; data block indices might not fit in an 'unsigned long'.
> + * indices can always fit in an 'unsigned long'.
> */
> - if ((params->block_size != PAGE_SIZE && offset > 1 << 23) ||
> - offset > ULONG_MAX) {
> + if (offset > (1 << 23)) {
> fsverity_err(inode, "Too many blocks in Merkle tree");
> err = -EFBIG;
> goto out_err;
This hunk should have been in the patch "fsverity: always use bitmap to track
verified status".
> diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
> index 197624cab43e..182bddf5dec5 100644
> --- a/fs/verity/read_metadata.c
> +++ b/fs/verity/read_metadata.c
> @@ -16,9 +16,9 @@ static int fsverity_read_merkle_tree(struct inode *inode,
> const struct fsverity_info *vi,
> void __user *buf, u64 offset, int length)
> {
> - const struct fsverity_operations *vops = inode->i_sb->s_vop;
> u64 end_offset;
> - unsigned int offs_in_page;
> + unsigned int offs_in_block;
> + unsigned int block_size = vi->tree_params.block_size;
Maybe do here:
const unsigned int block_size = vi->tree_params.block_size;
const u8 log_blocksize = vi->tree_params.log_blocksize;
Then both are easily accessible to the rest of the function.
> pgoff_t index, last_index;
> int retval = 0;
> int err = 0;
> @@ -26,8 +26,8 @@ static int fsverity_read_merkle_tree(struct inode *inode,
> end_offset = min(offset + length, vi->tree_params.tree_size);
> if (offset >= end_offset)
> return 0;
> - offs_in_page = offset_in_page(offset);
> - last_index = (end_offset - 1) >> PAGE_SHIFT;
> + offs_in_block = offset % block_size;
No modulo by non-constant values, please. The block size always is a power of
2, so use 'offset & (block_size - 1)'.
> + last_index = (end_offset - 1) >> vi->tree_params.log_blocksize;
>
> /*
> * Iterate through each Merkle tree page in the requested range and copy
> * the requested portion to userspace. Note that the Merkle tree block
> * size isn't important here, as we are returning a byte stream; i.e.,
> * we can just work with pages even if the tree block size != PAGE_SIZE.
> */
The above comment needs to be updated.
> - for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
> + for (index = offset >> vi->tree_params.log_blocksize;
> + index <= last_index; index++) {
> unsigned long num_ra_pages =
> min_t(unsigned long, last_index - index + 1,
> inode->i_sb->s_bdi->io_pages);
> unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
> - PAGE_SIZE - offs_in_page);
> - struct page *page;
> - const void *virt;
> + block_size - offs_in_block);
> + struct fsverity_block block;
>
> - page = vops->read_merkle_tree_page(inode, index, num_ra_pages,
> - vi->tree_params.log_blocksize);
> - if (IS_ERR(page)) {
> - err = PTR_ERR(page);
> - fsverity_err(inode,
> - "Error %d reading Merkle tree page %lu",
> - err, index);
> + block.len = block_size;
> + if (fsverity_read_merkle_tree_block(inode,
> + index << vi->tree_params.log_blocksize,
> + &block, num_ra_pages)) {
> + fsverity_drop_block(inode, &block);
> + err = -EFAULT;
> break;
> }
EFAULT is the wrong error code for an I/O error.
> diff --git a/fs/verity/verify.c b/fs/verity/verify.c
> index f556336ebd8d..dfe01f121843 100644
> --- a/fs/verity/verify.c
> +++ b/fs/verity/verify.c
> @@ -44,15 +44,15 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
> const struct merkle_tree_params *params = &vi->tree_params;
> const unsigned int hsize = params->digest_size;
> int level;
> + int err;
> + int num_ra_pages;
> u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE];
> const u8 *want_hash;
> u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
> /* The hash blocks that are traversed, indexed by level */
> struct {
> - /* Page containing the hash block */
> - struct page *page;
> - /* Mapped address of the hash block (will be within @page) */
> - const void *addr;
> + /* Block containing the hash block */
> + struct fsverity_block block;
"Block containing the hash block" is nonsensical. I think this indicates that
fsverity_block is misnamed. It should be called something like
fsverity_blockbuf. Then the above comment would be something like "Buffer
containing the hash block".
> @@ -93,10 +93,8 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
> unsigned long next_hidx;
> unsigned long hblock_idx;
> pgoff_t hpage_idx;
> - unsigned int hblock_offset_in_page;
> unsigned int hoffset;
> - struct page *hpage;
> - const void *haddr;
> + struct fsverity_block *block = &hblocks[level].block;
>
> /*
> * The index of the block in the current level; also the index
> @@ -110,34 +108,28 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
> /* Index of the hash page in the tree overall */
> hpage_idx = hblock_idx >> params->log_blocks_per_page;
>
> - /* Byte offset of the hash block within the page */
> - hblock_offset_in_page =
> - (hblock_idx << params->log_blocksize) & ~PAGE_MASK;
> -
> /* Byte offset of the hash within the block */
> hoffset = (hidx << params->log_digestsize) &
> (params->block_size - 1);
>
> - hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
> - hpage_idx, level == 0 ? min(max_ra_pages,
> - params->tree_pages - hpage_idx) : 0,
> - params->log_blocksize);
> - if (IS_ERR(hpage)) {
> + block->len = params->block_size;
> + num_ra_pages = level == 0 ?
> + min(max_ra_pages, params->tree_pages - hpage_idx) : 0;
The fact that the readahead amount is still calculated in pages seems out of
place. Maybe it should be done in bytes?
> - if (vi->hash_block_verified)
> - set_bit(hblock_idx, vi->hash_block_verified);
> - else
> - SetPageChecked(hpage);
> + set_bit(hblock_idx, vi->hash_block_verified);
This hunk also should have been in "fsverity: always use bitmap to track
verified status". But as I mentioned on that patch, I'm not sure we should
always use the bitmap.
> diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
> index cac012d4c86a..ce37a430bc97 100644
> --- a/include/linux/fsverity.h
> +++ b/include/linux/fsverity.h
> @@ -26,6 +26,24 @@
> /* Arbitrary limit to bound the kmalloc() size. Can be changed. */
> #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
>
> +/**
> + * struct fsverity_block - Merkle Tree block
> + * @kaddr: virtual address of the block's data
> + * @len: length of the data
Maybe use size instead of len? Currently it's pretty consistently called the
"block size", not "block length".
> + * @cached: true if block was already in cache, false otherwise
cached => already_cached?
> + * @verified: true if block is verified against Merkle tree
> + * @context: filesystem private context
> + *
> + * Merkle Tree blocks passed and requested from filesystem
> + */
It needs to be clearly documented how these fields flow between the filesystem
and fs/verity/ when ->read_merkle_tree_block() and ->drop_block() are used. It
seems to be different for different fields. Which are inputs and which are
outputs to which functions?
> + /**
> + * Read a Merkle tree block of the given inode.
> + * @inode: the inode
> + * @index: 0-based index of the block within the Merkle tree
> + * @num_ra_pages: The number of pages with blocks that should be
> + * prefetched starting at @index if the page at @index
> + * isn't already cached. Implementations may ignore this
> + * argument; it's only a performance optimization.
> + *
> + * This can be called at any time on an open verity file. It may be
> + * called by multiple processes concurrently.
> + *
> + * Return: 0 on success, -errno on failure
> + */
> + int (*read_merkle_tree_block)(struct inode *inode,
> + unsigned int index,
> + struct fsverity_block *block,
> + unsigned long num_ra_pages);
For the second parameter your code actually passes the block position (in
bytes), not the block index. Perhaps you meant for it to be 'u64 pos'? Either
way, 'unsigned int' is wrong. Merkle tree block indices are unsigned long;
positions are u64.
> -static inline void fsverity_drop_page(struct inode *inode, struct page *page)
> +static inline void fsverity_drop_block(struct inode *inode,
> + struct fsverity_block *block)
> {
> - if (inode->i_sb->s_vop->drop_page)
> - inode->i_sb->s_vop->drop_page(page);
> - else
> + if (inode->i_sb->s_vop->drop_block)
> + inode->i_sb->s_vop->drop_block(block);
> + else {
> + struct page *page = (struct page *)block->context;
> +
> + if (block->verified)
> + SetPageChecked(page);
> +
Why is PG_checked being set here?
> +/**
> + * fsverity_read_block_from_page() - layer between fs using read page
> + * and read block
> + * @inode: inode in use for verification or metadata reading
> + * @index: index of the block in the tree (offset into the tree)
> + * @block: block to be read
> + * @num_ra_pages: number of pages to readahead, may be ignored
> + *
> + * Depending on fs implementation use read_merkle_tree_block or
> + * read_merkle_tree_page.
> + */
> +static inline int fsverity_read_merkle_tree_block(struct inode *inode,
> + unsigned int index,
> + struct fsverity_block *block,
> + unsigned long num_ra_pages)
> +{
> + struct page *page;
> +
> + if (inode->i_sb->s_vop->read_merkle_tree_block)
> + return inode->i_sb->s_vop->read_merkle_tree_block(
> + inode, index, block, num_ra_pages);
> +
> + page = inode->i_sb->s_vop->read_merkle_tree_page(
> + inode, index >> PAGE_SHIFT, num_ra_pages,
> + block->len);
> +
> + block->kaddr = page_address(page) + (index % PAGE_SIZE);
> + block->cached = PageChecked(page);
> + block->context = page;
> +
> + if (IS_ERR(page))
> + return PTR_ERR(page);
> + else
> + return 0;
> +}
This isn't handling errors from ->read_merkle_tree_page() correctly. Also,
page_address() can't be used for pagecache pages unless the file has opted out
of highmem. You need to use kmap_local_page(), as the code was doing before.
Also, since fsverity_read_merkle_tree_block() is a private function for
fs/verity/, not for filesystems to use, it should be put in
fs/verity/fsverity_private.h, not in include/linux/fsverity.h.
- Eric
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages
2023-10-06 18:49 ` [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages Andrey Albershteyn
2023-10-07 4:02 ` kernel test robot
2023-10-11 3:56 ` Eric Biggers
@ 2023-10-16 13:00 ` Christoph Hellwig
2 siblings, 0 replies; 74+ messages in thread
From: Christoph Hellwig @ 2023-10-16 13:00 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, ebiggers, david, dchinner
Can we just switch everyone over to the block interface and just
provide helpers for the existing users? The patch below (untested)
should do that and the diffstat looks quite nice.
Btw, why do we pass the block offset as unsigned int? Is there
anything that guarantees it fits in 32 bits? I would have
expected a loff_t there.
---
diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst
index af889512c6ac99..c616d530a89086 100644
--- a/Documentation/filesystems/fsverity.rst
+++ b/Documentation/filesystems/fsverity.rst
@@ -648,7 +648,7 @@ which verifies data that has been read into the pagecache of a verity
inode. The containing folio must still be locked and not Uptodate, so
it's not yet readable by userspace. As needed to do the verification,
fsverity_verify_blocks() will call back into the filesystem to read
-hash blocks via fsverity_operations::read_merkle_tree_page().
+hash blocks via fsverity_operations::read_merkle_tree_block().
fsverity_verify_blocks() returns false if verification failed; in this
case, the filesystem must not set the folio Uptodate. Following this,
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 2b34796f68d349..4b6134923232e7 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -713,20 +713,20 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
*
* Returns the page we read, or an ERR_PTR on error.
*/
-static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
- pgoff_t index,
- unsigned long num_ra_pages,
- u8 log_blocksize)
+static int btrfs_read_merkle_tree_block(struct inode *inode,
+ unsigned int offset, struct fsverity_block *block,
+ unsigned long num_ra_pages)
{
struct folio *folio;
+ pgoff_t index = offset >> PAGE_SHIFT;
u64 off = (u64)index << PAGE_SHIFT;
loff_t merkle_pos = merkle_file_pos(inode);
int ret;
if (merkle_pos < 0)
- return ERR_PTR(merkle_pos);
+ return merkle_pos;
if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
- return ERR_PTR(-EFBIG);
+ return -EFBIG;
index += merkle_pos >> PAGE_SHIFT;
again:
folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
@@ -739,7 +739,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
if (!folio_test_uptodate(folio)) {
folio_unlock(folio);
folio_put(folio);
- return ERR_PTR(-EIO);
+ return -EIO;
}
folio_unlock(folio);
goto out;
@@ -748,7 +748,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS),
0);
if (!folio)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS);
if (ret) {
@@ -756,7 +756,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
/* Did someone else insert a folio here? */
if (ret == -EEXIST)
goto again;
- return ERR_PTR(ret);
+ return ret;
}
/*
@@ -769,7 +769,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
folio_address(folio), PAGE_SIZE, &folio->page);
if (ret < 0) {
folio_put(folio);
- return ERR_PTR(ret);
+ return ret;
}
if (ret < PAGE_SIZE)
folio_zero_segment(folio, ret, PAGE_SIZE);
@@ -778,7 +778,8 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
folio_unlock(folio);
out:
- return folio_file_page(folio, index);
+ return fsverity_set_block_page(block, folio_file_page(folio, index),
+ offset);
}
/*
@@ -809,6 +810,7 @@ const struct fsverity_operations btrfs_verityops = {
.begin_enable_verity = btrfs_begin_enable_verity,
.end_enable_verity = btrfs_end_enable_verity,
.get_verity_descriptor = btrfs_get_verity_descriptor,
- .read_merkle_tree_page = btrfs_read_merkle_tree_page,
+ .read_merkle_tree_block = btrfs_read_merkle_tree_block,
.write_merkle_tree_block = btrfs_write_merkle_tree_block,
+ .drop_merkle_tree_block = fsverity_drop_page_merke_tree_block,
};
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 4e2f01f048c09b..5623e2c1c302e8 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -358,15 +358,13 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
return desc_size;
}
-static struct page *ext4_read_merkle_tree_page(struct inode *inode,
- pgoff_t index,
- unsigned long num_ra_pages,
- u8 log_blocksize)
+static int ext4_read_merkle_tree_block(struct inode *inode, unsigned int offset,
+ struct fsverity_block *block, unsigned long num_ra_pages)
{
struct folio *folio;
+ pgoff_t index;
- index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
-
+ index = (ext4_verity_metadata_pos(inode) + offset) >> PAGE_SHIFT;
folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
@@ -377,9 +375,10 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
folio = read_mapping_folio(inode->i_mapping, index, NULL);
if (IS_ERR(folio))
- return ERR_CAST(folio);
+ return PTR_ERR(folio);
}
- return folio_file_page(folio, index);
+ return fsverity_set_block_page(block, folio_file_page(folio, index),
+ offset);
}
static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
@@ -394,6 +393,7 @@ const struct fsverity_operations ext4_verityops = {
.begin_enable_verity = ext4_begin_enable_verity,
.end_enable_verity = ext4_end_enable_verity,
.get_verity_descriptor = ext4_get_verity_descriptor,
- .read_merkle_tree_page = ext4_read_merkle_tree_page,
+ .read_merkle_tree_block = ext4_read_merkle_tree_block,
.write_merkle_tree_block = ext4_write_merkle_tree_block,
+ .drop_merkle_tree_block = fsverity_drop_page_merke_tree_block,
};
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 601ab9f0c02492..aac9281e9c4565 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -255,15 +255,13 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
return size;
}
-static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
- pgoff_t index,
- unsigned long num_ra_pages,
- u8 log_blocksize)
+static int f2fs_read_merkle_tree_block(struct inode *inode, unsigned int offset,
+ struct fsverity_block *block, unsigned long num_ra_pages)
{
struct page *page;
+ pgoff_t index;
- index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
-
+ index = (f2fs_verity_metadata_pos(inode) + offset) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
@@ -274,7 +272,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
page = read_mapping_page(inode->i_mapping, index, NULL);
}
- return page;
+ return fsverity_set_block_page(block, page, offset);
}
static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
@@ -289,6 +287,7 @@ const struct fsverity_operations f2fs_verityops = {
.begin_enable_verity = f2fs_begin_enable_verity,
.end_enable_verity = f2fs_end_enable_verity,
.get_verity_descriptor = f2fs_get_verity_descriptor,
- .read_merkle_tree_page = f2fs_read_merkle_tree_page,
+ .read_merkle_tree_block = f2fs_read_merkle_tree_block,
.write_merkle_tree_block = f2fs_write_merkle_tree_block,
+ .drop_merkle_tree_block = fsverity_drop_page_merke_tree_block,
};
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index 182bddf5dec54c..5e362f8562bd5d 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -12,10 +12,33 @@
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
+int fsverity_set_block_page(struct fsverity_block *block,
+ struct page *page, unsigned int index)
+{
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ block->kaddr = page_address(page) + (index % PAGE_SIZE);
+ block->cached = PageChecked(page);
+ block->context = page;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fsverity_set_block_page);
+
+void fsverity_drop_page_merke_tree_block(struct fsverity_block *block)
+{
+ struct page *page = block->context;
+
+ if (block->verified)
+ SetPageChecked(page);
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(fsverity_drop_page_merke_tree_block);
+
static int fsverity_read_merkle_tree(struct inode *inode,
const struct fsverity_info *vi,
void __user *buf, u64 offset, int length)
{
+ const struct fsverity_operations *vop = inode->i_sb->s_vop;
u64 end_offset;
unsigned int offs_in_block;
unsigned int block_size = vi->tree_params.block_size;
@@ -45,20 +68,19 @@ static int fsverity_read_merkle_tree(struct inode *inode,
struct fsverity_block block;
block.len = block_size;
- if (fsverity_read_merkle_tree_block(inode,
- index << vi->tree_params.log_blocksize,
- &block, num_ra_pages)) {
- fsverity_drop_block(inode, &block);
+ if (vop->read_merkle_tree_block(inode,
+ index << vi->tree_params.log_blocksize,
+ &block, num_ra_pages)) {
err = -EFAULT;
break;
}
if (copy_to_user(buf, block.kaddr + offs_in_block, bytes_to_copy)) {
- fsverity_drop_block(inode, &block);
+ vop->drop_merkle_tree_block(&block);
err = -EFAULT;
break;
}
- fsverity_drop_block(inode, &block);
+ vop->drop_merkle_tree_block(&block);
block.kaddr = NULL;
retval += bytes_to_copy;
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index dfe01f12184341..9b84262a6fa413 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -42,6 +42,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
const void *data, u64 data_pos, unsigned long max_ra_pages)
{
const struct merkle_tree_params *params = &vi->tree_params;
+ const struct fsverity_operations *vop = inode->i_sb->s_vop;
const unsigned int hsize = params->digest_size;
int level;
int err;
@@ -115,9 +116,9 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
block->len = params->block_size;
num_ra_pages = level == 0 ?
min(max_ra_pages, params->tree_pages - hpage_idx) : 0;
- err = fsverity_read_merkle_tree_block(
- inode, hblock_idx << params->log_blocksize, block,
- num_ra_pages);
+ err = vop->read_merkle_tree_block(inode,
+ hblock_idx << params->log_blocksize, block,
+ num_ra_pages);
if (err) {
fsverity_err(inode,
"Error %d reading Merkle tree block %lu",
@@ -127,7 +128,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
if (is_hash_block_verified(vi, hblock_idx, block->cached)) {
memcpy(_want_hash, block->kaddr + hoffset, hsize);
want_hash = _want_hash;
- fsverity_drop_block(inode, block);
+ vop->drop_merkle_tree_block(block);
goto descend;
}
hblocks[level].index = hblock_idx;
@@ -157,7 +158,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
block->verified = true;
memcpy(_want_hash, haddr + hoffset, hsize);
want_hash = _want_hash;
- fsverity_drop_block(inode, block);
+ vop->drop_merkle_tree_block(block);
}
/* Finally, verify the data block. */
@@ -174,9 +175,8 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
params->hash_alg->name, hsize, want_hash,
params->hash_alg->name, hsize, real_hash);
error:
- for (; level > 0; level--) {
- fsverity_drop_block(inode, &hblocks[level - 1].block);
- }
+ for (; level > 0; level--)
+ vop->drop_merkle_tree_block(&hblocks[level - 1].block);
return false;
}
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index ce37a430bc97f2..ae9ae7719af558 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -104,27 +104,6 @@ struct fsverity_operations {
int (*get_verity_descriptor)(struct inode *inode, void *buf,
size_t bufsize);
- /**
- * Read a Merkle tree page of the given inode.
- *
- * @inode: the inode
- * @index: 0-based index of the page within the Merkle tree
- * @num_ra_pages: The number of Merkle tree pages that should be
- * prefetched starting at @index if the page at @index
- * isn't already cached. Implementations may ignore this
- * argument; it's only a performance optimization.
- *
- * This can be called at any time on an open verity file. It may be
- * called by multiple processes concurrently, even with the same page.
- *
- * Note that this must retrieve a *page*, not necessarily a *block*.
- *
- * Return: the page on success, ERR_PTR() on failure
- */
- struct page *(*read_merkle_tree_page)(struct inode *inode,
- pgoff_t index,
- unsigned long num_ra_pages,
- u8 log_blocksize);
/**
* Read a Merkle tree block of the given inode.
* @inode: the inode
@@ -162,13 +141,12 @@ struct fsverity_operations {
/**
* Release the reference to a Merkle tree block
- *
- * @page: the block to release
+ * @block: the block to release
*
* This is called when fs-verity is done with a block obtained with
* ->read_merkle_tree_block().
*/
- void (*drop_block)(struct fsverity_block *block);
+ void (*drop_merkle_tree_block)(struct fsverity_block *block);
};
#ifdef CONFIG_FS_VERITY
@@ -217,74 +195,16 @@ static inline void fsverity_cleanup_inode(struct inode *inode)
int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg);
+int fsverity_set_block_page(struct fsverity_block *block,
+ struct page *page, unsigned int index);
+void fsverity_drop_page_merke_tree_block(struct fsverity_block *block);
+
/* verify.c */
bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset);
void fsverity_verify_bio(struct bio *bio);
void fsverity_enqueue_verify_work(struct work_struct *work);
-/**
- * fsverity_drop_block() - drop block obtained with ->read_merkle_tree_block()
- * @inode: inode in use for verification or metadata reading
- * @block: block to be dropped
- *
- * Generic put_page() method. Calls out back to filesystem if ->drop_block() is
- * set, otherwise do nothing.
- *
- */
-static inline void fsverity_drop_block(struct inode *inode,
- struct fsverity_block *block)
-{
- if (inode->i_sb->s_vop->drop_block)
- inode->i_sb->s_vop->drop_block(block);
- else {
- struct page *page = (struct page *)block->context;
-
- if (block->verified)
- SetPageChecked(page);
-
- put_page(page);
- }
-}
-
-/**
- * fsverity_read_block_from_page() - layer between fs using read page
- * and read block
- * @inode: inode in use for verification or metadata reading
- * @index: index of the block in the tree (offset into the tree)
- * @block: block to be read
- * @num_ra_pages: number of pages to readahead, may be ignored
- *
- * Depending on fs implementation use read_merkle_tree_block or
- * read_merkle_tree_page.
- */
-static inline int fsverity_read_merkle_tree_block(struct inode *inode,
- unsigned int index,
- struct fsverity_block *block,
- unsigned long num_ra_pages)
-{
- struct page *page;
-
- if (inode->i_sb->s_vop->read_merkle_tree_block)
- return inode->i_sb->s_vop->read_merkle_tree_block(
- inode, index, block, num_ra_pages);
-
- page = inode->i_sb->s_vop->read_merkle_tree_page(
- inode, index >> PAGE_SHIFT, num_ra_pages,
- block->len);
-
- block->kaddr = page_address(page) + (index % PAGE_SIZE);
- block->cached = PageChecked(page);
- block->context = page;
-
- if (IS_ERR(page))
- return PTR_ERR(page);
- else
- return 0;
-}
-
-
-
#else /* !CONFIG_FS_VERITY */
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
@@ -362,20 +282,6 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work)
WARN_ON_ONCE(1);
}
-static inline void fsverity_drop_page(struct inode *inode, struct page *page)
-{
- WARN_ON_ONCE(1);
-}
-
-static inline int fsverity_read_merkle_tree_block(struct inode *inode,
- unsigned int index,
- struct fsverity_block *block,
- unsigned long num_ra_pages)
-{
- WARN_ON_ONCE(1);
- return -EOPNOTSUPP;
-}
-
#endif /* !CONFIG_FS_VERITY */
static inline bool fsverity_verify_folio(struct folio *folio)
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 11/28] iomap: pass readpage operation to read path
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (9 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 10/28] fsverity: operate with Merkle tree blocks instead of pages Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 18:31 ` Darrick J. Wong
2023-10-16 9:15 ` Christoph Hellwig
2023-10-06 18:49 ` [PATCH v3 12/28] iomap: allow filesystem to implement read path verification Andrey Albershteyn
` (16 subsequent siblings)
27 siblings, 2 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Preparation for allowing filesystems to provide a bio_set and
->submit_io() for the read path. This will allow the filesystem to do
additional processing of the ioend when it completes.
Export iomap_read_end_io() so that it can be called back from the
filesystem's callout after verification is done.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/erofs/data.c | 4 ++--
fs/gfs2/aops.c | 4 ++--
fs/iomap/buffered-io.c | 13 ++++++++++---
fs/xfs/xfs_aops.c | 4 ++--
fs/zonefs/file.c | 4 ++--
include/linux/iomap.h | 21 +++++++++++++++++++--
6 files changed, 37 insertions(+), 13 deletions(-)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0c2c99c58b5e..3f5482d6cedb 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -357,12 +357,12 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ return iomap_read_folio(folio, &erofs_iomap_ops, NULL);
}
static void erofs_readahead(struct readahead_control *rac)
{
- return iomap_readahead(rac, &erofs_iomap_ops);
+ return iomap_readahead(rac, &erofs_iomap_ops, NULL);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index c26d48355cc2..9c09ff75e586 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -456,7 +456,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
if (!gfs2_is_jdata(ip) ||
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
- error = iomap_read_folio(folio, &gfs2_iomap_ops);
+ error = iomap_read_folio(folio, &gfs2_iomap_ops, NULL);
} else if (gfs2_is_stuffed(ip)) {
error = stuffed_readpage(ip, &folio->page);
folio_unlock(folio);
@@ -534,7 +534,7 @@ static void gfs2_readahead(struct readahead_control *rac)
else if (gfs2_is_jdata(ip))
mpage_readahead(rac, gfs2_block_map);
else
- iomap_readahead(rac, &gfs2_iomap_ops);
+ iomap_readahead(rac, &gfs2_iomap_ops, NULL);
}
/**
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 644479ccefbd..ca78c7f62527 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -264,7 +264,7 @@ static void iomap_finish_folio_read(struct folio *folio, size_t offset,
folio_unlock(folio);
}
-static void iomap_read_end_io(struct bio *bio)
+void iomap_read_end_io(struct bio *bio)
{
int error = blk_status_to_errno(bio->bi_status);
struct folio_iter fi;
@@ -273,12 +273,14 @@ static void iomap_read_end_io(struct bio *bio)
iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
bio_put(bio);
}
+EXPORT_SYMBOL_GPL(iomap_read_end_io);
struct iomap_readpage_ctx {
struct folio *cur_folio;
bool cur_folio_in_bio;
struct bio *bio;
struct readahead_control *rac;
+ const struct iomap_readpage_ops *ops;
};
/**
@@ -402,7 +404,8 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return pos - orig_pos + plen;
}
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
+int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
+ const struct iomap_readpage_ops *readpage_ops)
{
struct iomap_iter iter = {
.inode = folio->mapping->host,
@@ -411,6 +414,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
};
struct iomap_readpage_ctx ctx = {
.cur_folio = folio,
+ .ops = readpage_ops,
};
int ret;
@@ -468,6 +472,7 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
* iomap_readahead - Attempt to read pages from a file.
* @rac: Describes the pages to be read.
* @ops: The operations vector for the filesystem.
+ * @readpage_ops: Filesystem supplied folio processing operation
*
* This function is for filesystems to call to implement their readahead
* address_space operation.
@@ -479,7 +484,8 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
* function is called with memalloc_nofs set, so allocations will not cause
* the filesystem to be reentered.
*/
-void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
+void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops,
+ const struct iomap_readpage_ops *readpage_ops)
{
struct iomap_iter iter = {
.inode = rac->mapping->host,
@@ -488,6 +494,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
};
struct iomap_readpage_ctx ctx = {
.rac = rac,
+ .ops = readpage_ops,
};
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 465d7630bb21..b413a2dbcc18 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -553,14 +553,14 @@ xfs_vm_read_folio(
struct file *unused,
struct folio *folio)
{
- return iomap_read_folio(folio, &xfs_read_iomap_ops);
+ return iomap_read_folio(folio, &xfs_read_iomap_ops, NULL);
}
STATIC void
xfs_vm_readahead(
struct readahead_control *rac)
{
- iomap_readahead(rac, &xfs_read_iomap_ops);
+ iomap_readahead(rac, &xfs_read_iomap_ops, NULL);
}
static int
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index b2c9b35df8f7..29428c012150 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -112,12 +112,12 @@ static const struct iomap_ops zonefs_write_iomap_ops = {
static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
- return iomap_read_folio(folio, &zonefs_read_iomap_ops);
+ return iomap_read_folio(folio, &zonefs_read_iomap_ops, NULL);
}
static void zonefs_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &zonefs_read_iomap_ops);
+ iomap_readahead(rac, &zonefs_read_iomap_ops, NULL);
}
/*
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..3565c449f3c9 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -262,8 +262,25 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
int (*punch)(struct inode *inode, loff_t pos, loff_t length));
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
-void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
+struct iomap_readpage_ops {
+ /*
+ * Filesystems wishing to attach private information to a direct io bio
+ * must provide a ->submit_io method that attaches the additional
+ * information to the bio and changes the ->bi_end_io callback to a
+ * custom function. This function should, at a minimum, perform any
+ * relevant post-processing of the bio and end with a call to
+ * iomap_read_end_io.
+ */
+ void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
+ loff_t file_offset);
+ struct bio_set *bio_set;
+};
+
+void iomap_read_end_io(struct bio *bio);
+int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
+ const struct iomap_readpage_ops *readpage_ops);
+void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops,
+ const struct iomap_readpage_ops *readpage_ops);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 11/28] iomap: pass readpage operation to read path
2023-10-06 18:49 ` [PATCH v3 11/28] iomap: pass readpage operation to read path Andrey Albershteyn
@ 2023-10-11 18:31 ` Darrick J. Wong
2023-10-16 12:35 ` Andrey Albershteyn
2023-10-16 9:15 ` Christoph Hellwig
1 sibling, 1 reply; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 18:31 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:05PM +0200, Andrey Albershteyn wrote:
> Preparation for allowing filesystems to provide bio_set and
> ->submit_io() for read path. This will allow fs to do an additional
> processing of ioend on ioend completion.
>
> Make iomap_read_end_io() exportable, so, it can be called back from
> filesystem callout after verification is done.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/erofs/data.c | 4 ++--
> fs/gfs2/aops.c | 4 ++--
> fs/iomap/buffered-io.c | 13 ++++++++++---
> fs/xfs/xfs_aops.c | 4 ++--
> fs/zonefs/file.c | 4 ++--
> include/linux/iomap.h | 21 +++++++++++++++++++--
> 6 files changed, 37 insertions(+), 13 deletions(-)
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0c2c99c58b5e..3f5482d6cedb 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -357,12 +357,12 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> */
> static int erofs_read_folio(struct file *file, struct folio *folio)
> {
> - return iomap_read_folio(folio, &erofs_iomap_ops);
> + return iomap_read_folio(folio, &erofs_iomap_ops, NULL);
> }
>
> static void erofs_readahead(struct readahead_control *rac)
> {
> - return iomap_readahead(rac, &erofs_iomap_ops);
> + return iomap_readahead(rac, &erofs_iomap_ops, NULL);
> }
>
> static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index c26d48355cc2..9c09ff75e586 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -456,7 +456,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
>
> if (!gfs2_is_jdata(ip) ||
> (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
> - error = iomap_read_folio(folio, &gfs2_iomap_ops);
> + error = iomap_read_folio(folio, &gfs2_iomap_ops, NULL);
> } else if (gfs2_is_stuffed(ip)) {
> error = stuffed_readpage(ip, &folio->page);
> folio_unlock(folio);
> @@ -534,7 +534,7 @@ static void gfs2_readahead(struct readahead_control *rac)
> else if (gfs2_is_jdata(ip))
> mpage_readahead(rac, gfs2_block_map);
> else
> - iomap_readahead(rac, &gfs2_iomap_ops);
> + iomap_readahead(rac, &gfs2_iomap_ops, NULL);
> }
>
> /**
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 644479ccefbd..ca78c7f62527 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -264,7 +264,7 @@ static void iomap_finish_folio_read(struct folio *folio, size_t offset,
> folio_unlock(folio);
> }
>
> -static void iomap_read_end_io(struct bio *bio)
> +void iomap_read_end_io(struct bio *bio)
> {
> int error = blk_status_to_errno(bio->bi_status);
> struct folio_iter fi;
> @@ -273,12 +273,14 @@ static void iomap_read_end_io(struct bio *bio)
> iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
> bio_put(bio);
> }
> +EXPORT_SYMBOL_GPL(iomap_read_end_io);
>
> struct iomap_readpage_ctx {
> struct folio *cur_folio;
> bool cur_folio_in_bio;
> struct bio *bio;
> struct readahead_control *rac;
> + const struct iomap_readpage_ops *ops;
> };
>
> /**
> @@ -402,7 +404,8 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
> return pos - orig_pos + plen;
> }
>
> -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
> +int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
> + const struct iomap_readpage_ops *readpage_ops)
> {
> struct iomap_iter iter = {
> .inode = folio->mapping->host,
> @@ -411,6 +414,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
> };
> struct iomap_readpage_ctx ctx = {
> .cur_folio = folio,
> + .ops = readpage_ops,
> };
> int ret;
>
> @@ -468,6 +472,7 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
> * iomap_readahead - Attempt to read pages from a file.
> * @rac: Describes the pages to be read.
> * @ops: The operations vector for the filesystem.
> + * @readpage_ops: Filesystem supplied folio processiong operation
> *
> * This function is for filesystems to call to implement their readahead
> * address_space operation.
> @@ -479,7 +484,8 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
> * function is called with memalloc_nofs set, so allocations will not cause
> * the filesystem to be reentered.
> */
> -void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
> +void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops,
> + const struct iomap_readpage_ops *readpage_ops)
> {
> struct iomap_iter iter = {
> .inode = rac->mapping->host,
> @@ -488,6 +494,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
> };
> struct iomap_readpage_ctx ctx = {
> .rac = rac,
> + .ops = readpage_ops,
> };
>
> trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 465d7630bb21..b413a2dbcc18 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -553,14 +553,14 @@ xfs_vm_read_folio(
> struct file *unused,
> struct folio *folio)
> {
> - return iomap_read_folio(folio, &xfs_read_iomap_ops);
> + return iomap_read_folio(folio, &xfs_read_iomap_ops, NULL);
> }
>
> STATIC void
> xfs_vm_readahead(
> struct readahead_control *rac)
> {
> - iomap_readahead(rac, &xfs_read_iomap_ops);
> + iomap_readahead(rac, &xfs_read_iomap_ops, NULL);
> }
>
> static int
> diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
> index b2c9b35df8f7..29428c012150 100644
> --- a/fs/zonefs/file.c
> +++ b/fs/zonefs/file.c
> @@ -112,12 +112,12 @@ static const struct iomap_ops zonefs_write_iomap_ops = {
>
> static int zonefs_read_folio(struct file *unused, struct folio *folio)
> {
> - return iomap_read_folio(folio, &zonefs_read_iomap_ops);
> + return iomap_read_folio(folio, &zonefs_read_iomap_ops, NULL);
> }
>
> static void zonefs_readahead(struct readahead_control *rac)
> {
> - iomap_readahead(rac, &zonefs_read_iomap_ops);
> + iomap_readahead(rac, &zonefs_read_iomap_ops, NULL);
> }
>
> /*
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 96dd0acbba44..3565c449f3c9 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -262,8 +262,25 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
> struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
> int (*punch)(struct inode *inode, loff_t pos, loff_t length));
>
> -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
> -void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
> +struct iomap_readpage_ops {
> + /*
> + * Filesystems wishing to attach private information to a direct io bio
> + * must provide a ->submit_io method that attaches the additional
> + * information to the bio and changes the ->bi_end_io callback to a
> + * custom function. This function should, at a minimum, perform any
> + * relevant post-processing of the bio and end with a call to
> + * iomap_read_end_io.
> + */
> + void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
> + loff_t file_offset);
> + struct bio_set *bio_set;
Needs a comment to mention that iomap will allocate bios from @bio_set
if non-null; or its own internal bioset if null.
> +};
It's odd that this patch adds this ops structure but doesn't actually
start using it until the next patch.
--D
> +
> +void iomap_read_end_io(struct bio *bio);
> +int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
> + const struct iomap_readpage_ops *readpage_ops);
> +void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops,
> + const struct iomap_readpage_ops *readpage_ops);
> bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
> struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
> bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 11/28] iomap: pass readpage operation to read path
2023-10-11 18:31 ` Darrick J. Wong
@ 2023-10-16 12:35 ` Andrey Albershteyn
0 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-16 12:35 UTC (permalink / raw)
To: Darrick J. Wong
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On 2023-10-11 11:31:17, Darrick J. Wong wrote:
> On Fri, Oct 06, 2023 at 08:49:05PM +0200, Andrey Albershteyn wrote:
> > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > index 96dd0acbba44..3565c449f3c9 100644
> > --- a/include/linux/iomap.h
> > +++ b/include/linux/iomap.h
> > @@ -262,8 +262,25 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
> > struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
> > int (*punch)(struct inode *inode, loff_t pos, loff_t length));
> >
> > -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
> > -void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
> > +struct iomap_readpage_ops {
> > + /*
> > + * Filesystems wishing to attach private information to a direct io bio
> > + * must provide a ->submit_io method that attaches the additional
> > + * information to the bio and changes the ->bi_end_io callback to a
> > + * custom function. This function should, at a minimum, perform any
> > + * relevant post-processing of the bio and end with a call to
> > + * iomap_read_end_io.
> > + */
> > + void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
> > + loff_t file_offset);
> > + struct bio_set *bio_set;
>
> Needs a comment to mention that iomap will allocate bios from @bio_set
> if non-null; or its own internal bioset if null.
Sure.
> > +};
>
> It's odd that this patch adds this ops structure but doesn't actually
> start using it until the next patch.
I wanted to separate the iomap changes from the xfs changes so it's easier
to go through, but I'm fine with merging these two.
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 11/28] iomap: pass readpage operation to read path
2023-10-06 18:49 ` [PATCH v3 11/28] iomap: pass readpage operation to read path Andrey Albershteyn
2023-10-11 18:31 ` Darrick J. Wong
@ 2023-10-16 9:15 ` Christoph Hellwig
2023-10-16 12:32 ` Andrey Albershteyn
1 sibling, 1 reply; 74+ messages in thread
From: Christoph Hellwig @ 2023-10-16 9:15 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, ebiggers, david, dchinner
Looking at the entire series, it seems like the only XFS-specific
part of the fsverity processing in iomap is the per-sb workqueue
now that the fsverity interfaces were cleaned up.
Based on that it seems like we'd be much better just doing all the
work inside iomap, and just allow XFS to pass the workqueue to
iomap_read_folio and iomap_readahead. The patch below does that
as an untested WIP on top of your branch.
If we go down that way I suspect the better interface would be
to allocate the iomap_readpage_ctx in the callers of these functions
instead of passing an extra argument, but I'm not entirely sure
about that yet.
---
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0a1bec91fdf678..95077676b714cf 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -6,6 +6,7 @@
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
+#include <linux/fsverity.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
@@ -264,7 +265,7 @@ static void iomap_finish_folio_read(struct folio *folio, size_t offset,
folio_unlock(folio);
}
-void iomap_read_end_io(struct bio *bio)
+static void iomap_read_end_io(struct bio *bio)
{
int error = blk_status_to_errno(bio->bi_status);
struct folio_iter fi;
@@ -273,14 +274,13 @@ void iomap_read_end_io(struct bio *bio)
iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
bio_put(bio);
}
-EXPORT_SYMBOL_GPL(iomap_read_end_io);
struct iomap_readpage_ctx {
struct folio *cur_folio;
bool cur_folio_in_bio;
struct bio *bio;
struct readahead_control *rac;
- const struct iomap_readpage_ops *ops;
+ struct workqueue_struct *wq;
};
/**
@@ -332,17 +332,55 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
pos >= i_size_read(iter->inode);
}
+#ifdef CONFIG_FS_VERITY
+struct iomap_fsverity_bio {
+ struct work_struct work;
+ struct bio bio;
+};
+static struct bio_set iomap_fsverity_bioset;
+
static void
-iomap_submit_read_io(const struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx)
+iomap_read_fsverify_end_io_work(struct work_struct *work)
{
- if (!ctx->bio)
- return;
+ struct iomap_fsverity_bio *fbio =
+ container_of(work, struct iomap_fsverity_bio, work);
- if (ctx->ops && ctx->ops->submit_io)
- ctx->ops->submit_io(iter, ctx->bio, iter->pos);
- else
- submit_bio(ctx->bio);
+ fsverity_verify_bio(&fbio->bio);
+ iomap_read_end_io(&fbio->bio);
+}
+
+static void
+iomap_read_fsverity_end_io(struct bio *bio)
+{
+ struct iomap_fsverity_bio *fbio =
+ container_of(bio, struct iomap_fsverity_bio, bio);
+
+ INIT_WORK(&fbio->work, iomap_read_fsverify_end_io_work);
+ queue_work(bio->bi_private, &fbio->work);
+}
+#endif /* CONFIG_FS_VERITY */
+
+static struct bio *iomap_read_bio_alloc(struct inode *inode,
+ struct block_device *bdev, int nr_vecs, gfp_t gfp,
+ struct workqueue_struct *wq)
+{
+ struct bio *bio;
+
+#ifdef CONFIG_FS_VERITY
+ if (fsverity_active(inode)) {
+ bio = bio_alloc_bioset(bdev, nr_vecs, REQ_OP_READ, gfp,
+ &iomap_fsverity_bioset);
+ if (bio) {
+ bio->bi_private = wq;
+ bio->bi_end_io = iomap_read_fsverity_end_io;
+ }
+ return bio;
+ }
+#endif
+ bio = bio_alloc(bdev, nr_vecs, REQ_OP_READ, gfp);
+ if (bio)
+ bio->bi_end_io = iomap_read_end_io;
+ return bio;
}
static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
@@ -368,11 +406,10 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
if (iomap_block_needs_zeroing(iter, pos)) {
folio_zero_range(folio, poff, plen);
- if (iomap->flags & IOMAP_F_READ_VERITY) {
- if (ctx->ops->verify_folio(folio, poff, plen)) {
- folio_set_error(folio);
- goto done;
- }
+ if (fsverity_active(iter->inode) &&
+ !fsverity_verify_blocks(folio, poff, plen)) {
+ folio_set_error(folio);
+ goto done;
}
iomap_set_range_uptodate(folio, poff, plen);
@@ -389,21 +426,16 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
!bio_add_folio(ctx->bio, folio, plen, poff)) {
gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
gfp_t orig_gfp = gfp;
- unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
- iomap_submit_read_io(iter, ctx);
+ if (ctx->bio)
+ submit_bio(ctx->bio);
if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
- if (ctx->ops && ctx->ops->bio_set)
- ctx->bio = bio_alloc_bioset(iomap->bdev,
- bio_max_segs(nr_vecs),
- REQ_OP_READ, GFP_NOFS,
- ctx->ops->bio_set);
- else
- ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
- REQ_OP_READ, gfp);
+ ctx->bio = iomap_read_bio_alloc(iter->inode, iomap->bdev,
+ bio_max_segs(DIV_ROUND_UP(length, PAGE_SIZE)),
+ gfp, ctx->wq);
/*
* If the bio_alloc fails, try it again for a single page to
@@ -411,13 +443,12 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
* what do_mpage_read_folio does.
*/
if (!ctx->bio) {
- ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
- orig_gfp);
+ ctx->bio = iomap_read_bio_alloc(iter->inode,
+ iomap->bdev, 1, orig_gfp, ctx->wq);
}
if (ctx->rac)
ctx->bio->bi_opf |= REQ_RAHEAD;
ctx->bio->bi_iter.bi_sector = sector;
- ctx->bio->bi_end_io = iomap_read_end_io;
bio_add_folio_nofail(ctx->bio, folio, plen, poff);
}
@@ -432,7 +463,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
}
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
- const struct iomap_readpage_ops *readpage_ops)
+ struct workqueue_struct *wq)
{
struct iomap_iter iter = {
.inode = folio->mapping->host,
@@ -441,7 +472,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
};
struct iomap_readpage_ctx ctx = {
.cur_folio = folio,
- .ops = readpage_ops,
+ .wq = wq,
};
int ret;
@@ -454,7 +485,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
folio_set_error(folio);
if (ctx.bio) {
- iomap_submit_read_io(&iter, &ctx);
+ submit_bio(ctx.bio);
WARN_ON_ONCE(!ctx.cur_folio_in_bio);
} else {
WARN_ON_ONCE(ctx.cur_folio_in_bio);
@@ -499,7 +530,7 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
* iomap_readahead - Attempt to read pages from a file.
* @rac: Describes the pages to be read.
* @ops: The operations vector for the filesystem.
- * @readpage_ops: Filesystem supplied folio processiong operation
+ * @wq: Workqueue for post-I/O processing (only need for fsverity)
*
* This function is for filesystems to call to implement their readahead
* address_space operation.
@@ -512,7 +543,7 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
* the filesystem to be reentered.
*/
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops,
- const struct iomap_readpage_ops *readpage_ops)
+ struct workqueue_struct *wq)
{
struct iomap_iter iter = {
.inode = rac->mapping->host,
@@ -521,7 +552,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops,
};
struct iomap_readpage_ctx ctx = {
.rac = rac,
- .ops = readpage_ops,
+ .wq = wq,
};
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
@@ -529,7 +560,8 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops,
while (iomap_iter(&iter, ops) > 0)
iter.processed = iomap_readahead_iter(&iter, &ctx);
- iomap_submit_read_io(&iter, &ctx);
+ if (ctx.bio)
+ submit_bio(ctx.bio);
if (ctx.cur_folio) {
if (!ctx.cur_folio_in_bio)
folio_unlock(ctx.cur_folio);
@@ -2022,10 +2054,25 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
}
EXPORT_SYMBOL_GPL(iomap_writepages);
+#define IOMAP_POOL_SIZE (4 * (PAGE_SIZE / SECTOR_SIZE))
+
static int __init iomap_init(void)
{
- return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_inline_bio),
- BIOSET_NEED_BVECS);
+ int error;
+
+ error = bioset_init(&iomap_ioend_bioset, IOMAP_POOL_SIZE,
+ offsetof(struct iomap_ioend, io_inline_bio),
+ BIOSET_NEED_BVECS);
+#ifdef CONFIG_FS_VERITY
+ if (error)
+ return error;
+
+ error = bioset_init(&iomap_fsverity_bioset, IOMAP_POOL_SIZE,
+ offsetof(struct iomap_fsverity_bio, bio),
+ BIOSET_NEED_BVECS);
+ if (error)
+ bioset_exit(&iomap_ioend_bioset);
+#endif
+ return error;
}
fs_initcall(iomap_init);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index fceb0c3de61ff3..1982bdb456d0ee 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -26,8 +26,6 @@ struct xfs_writepage_ctx {
unsigned int cow_seq;
};
-static struct bio_set xfs_read_ioend_bioset;
-
static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
@@ -550,97 +548,30 @@ xfs_vm_bmap(
return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}
-static void
-xfs_read_work_end_io(
- struct work_struct *work)
+static inline struct workqueue_struct *
+xfs_fsverity_wq(
+ struct address_space *mapping)
{
- struct iomap_read_ioend *ioend =
- container_of(work, struct iomap_read_ioend, work);
- struct bio *bio = &ioend->read_inline_bio;
-
- fsverity_verify_bio(bio);
- iomap_read_end_io(bio);
- /*
- * The iomap_read_ioend has been freed by bio_put() in
- * iomap_read_end_io()
- */
+ if (fsverity_active(mapping->host))
+ return XFS_I(mapping->host)->i_mount->m_postread_workqueue;
+ return NULL;
}
-static void
-xfs_read_end_io(
- struct bio *bio)
-{
- struct iomap_read_ioend *ioend =
- container_of(bio, struct iomap_read_ioend, read_inline_bio);
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
-
- WARN_ON_ONCE(!queue_work(ip->i_mount->m_postread_workqueue,
- &ioend->work));
-}
-
-static int
-xfs_verify_folio(
- struct folio *folio,
- loff_t pos,
- unsigned int len)
-{
- if (fsverity_verify_blocks(folio, len, pos))
- return 0;
- return -EFSCORRUPTED;
-}
-
-int
-xfs_init_iomap_bioset(void)
-{
- return bioset_init(&xfs_read_ioend_bioset,
- 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_read_ioend, read_inline_bio),
- BIOSET_NEED_BVECS);
-}
-
-void
-xfs_free_iomap_bioset(void)
-{
- bioset_exit(&xfs_read_ioend_bioset);
-}
-
-static void
-xfs_submit_read_bio(
- const struct iomap_iter *iter,
- struct bio *bio,
- loff_t file_offset)
-{
- struct iomap_read_ioend *ioend;
-
- ioend = container_of(bio, struct iomap_read_ioend, read_inline_bio);
- ioend->io_inode = iter->inode;
- if (fsverity_active(ioend->io_inode)) {
- INIT_WORK(&ioend->work, &xfs_read_work_end_io);
- ioend->read_inline_bio.bi_end_io = &xfs_read_end_io;
- }
-
- submit_bio(bio);
-}
-
-static const struct iomap_readpage_ops xfs_readpage_ops = {
- .verify_folio = &xfs_verify_folio,
- .submit_io = &xfs_submit_read_bio,
- .bio_set = &xfs_read_ioend_bioset,
-};
-
STATIC int
xfs_vm_read_folio(
struct file *unused,
struct folio *folio)
{
- return iomap_read_folio(folio, &xfs_read_iomap_ops, &xfs_readpage_ops);
+ return iomap_read_folio(folio, &xfs_read_iomap_ops,
+ xfs_fsverity_wq(folio->mapping));
}
STATIC void
xfs_vm_readahead(
struct readahead_control *rac)
{
- iomap_readahead(rac, &xfs_read_iomap_ops, &xfs_readpage_ops);
+ iomap_readahead(rac, &xfs_read_iomap_ops,
+ xfs_fsverity_wq(rac->mapping));
}
static int
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index fa7c512b27176e..e0bd684197643d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -10,7 +10,5 @@ extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
-int xfs_init_iomap_bioset(void);
-void xfs_free_iomap_bioset(void);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 80b249c420678a..18c8f168b1532d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -132,9 +132,6 @@ xfs_bmbt_to_iomap(
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
- if (fsverity_active(VFS_I(ip)))
- iomap->flags |= IOMAP_F_READ_VERITY;
-
iomap->validity_cookie = sequence_cookie;
iomap->folio_ops = &xfs_iomap_folio_ops;
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f32392add622f3..880d9039437eb1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2385,17 +2385,11 @@ init_xfs_fs(void)
if (error)
goto out_remove_dbg_kobj;
- error = xfs_init_iomap_bioset();
- if (error)
- goto out_qm_exit;
-
error = register_filesystem(&xfs_fs_type);
if (error)
- goto out_iomap_bioset;
+ goto out_qm_exit;
return 0;
- out_iomap_bioset:
- xfs_free_iomap_bioset();
out_qm_exit:
xfs_qm_exit();
out_remove_dbg_kobj:
@@ -2428,7 +2422,6 @@ init_xfs_fs(void)
STATIC void __exit
exit_xfs_fs(void)
{
- xfs_free_iomap_bioset();
xfs_qm_exit();
unregister_filesystem(&xfs_fs_type);
#ifdef DEBUG
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8d7206cd2f0f49..c7522eb3a8eafd 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -53,9 +53,6 @@ struct vm_fault;
*
* IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
* rather than a file data extent.
- *
- * IOMAP_F_READ_VERITY indicates that the iomap needs verification of read
- * folios
*/
#define IOMAP_F_NEW (1U << 0)
#define IOMAP_F_DIRTY (1U << 1)
@@ -67,7 +64,6 @@ struct vm_fault;
#define IOMAP_F_BUFFER_HEAD 0
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)
-#define IOMAP_F_READ_VERITY (1U << 6)
/*
* Flags set by the core iomap code during operations:
@@ -266,36 +262,10 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
int (*punch)(struct inode *inode, loff_t pos, loff_t length));
-struct iomap_read_ioend {
- struct inode *io_inode; /* file being read from */
- struct work_struct work; /* post read work (e.g. fs-verity) */
- struct bio read_inline_bio;/* MUST BE LAST! */
-};
-
-struct iomap_readpage_ops {
- /*
- * Optional, verify folio when successfully read
- */
- int (*verify_folio)(struct folio *folio, loff_t pos, unsigned int len);
-
- /*
- * Filesystems wishing to attach private information to a direct io bio
- * must provide a ->submit_io method that attaches the additional
- * information to the bio and changes the ->bi_end_io callback to a
- * custom function. This function should, at a minimum, perform any
- * relevant post-processing of the bio and end with a call to
- * iomap_read_end_io.
- */
- void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
- loff_t file_offset);
- struct bio_set *bio_set;
-};
-
-void iomap_read_end_io(struct bio *bio);
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
- const struct iomap_readpage_ops *readpage_ops);
+ struct workqueue_struct *wq);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops,
- const struct iomap_readpage_ops *readpage_ops);
+ struct workqueue_struct *wq);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 11/28] iomap: pass readpage operation to read path
2023-10-16 9:15 ` Christoph Hellwig
@ 2023-10-16 12:32 ` Andrey Albershteyn
2023-10-16 12:58 ` Christoph Hellwig
0 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-16 12:32 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-xfs, linux-fsdevel, fsverity, djwong, ebiggers, david, dchinner
On 2023-10-16 02:15:00, Christoph Hellwig wrote:
> Looking at the entire series, it seems like the only XFS-specific
> part of the fsverity processing in iomap is the per-sb workqueue
> now that the fsverity interfaces were cleaned up.
>
> Based on that it seems like we'd be much better just doing all the
> work inside iomap, and just allow XFS to pass the workqueue to
> iomap_read_folio and iomap_readahead. The patch below does that
> as an untested WIP on top of your branch.
>
> If we go down that way I suspect the better interface would be
> to allocate the iomap_readpage_ctx in the callers of these functions
> instead of passing an extra argument, but I'm not entirely sure
> about that yet.
From the discussion in v2 [1] I understood that btrfs would like to
have this bio_set/submit_io interface for iomap transition; and any
other filesystem deferrals would also be possible. Is that no longer the
case with btrfs? Would fs-verity verification in iomap_read_end_io
combine both solutions (fs-verity verification in iomap +
submit_io/bio_set interface)?
[1]: https://lore.kernel.org/linux-xfs/ZCxEHkWayQyGqnxL@infradead.org/#t
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 11/28] iomap: pass readpage operation to read path
2023-10-16 12:32 ` Andrey Albershteyn
@ 2023-10-16 12:58 ` Christoph Hellwig
0 siblings, 0 replies; 74+ messages in thread
From: Christoph Hellwig @ 2023-10-16 12:58 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: Christoph Hellwig, linux-xfs, linux-fsdevel, fsverity, djwong,
ebiggers, david, dchinner
On Mon, Oct 16, 2023 at 02:32:25PM +0200, Andrey Albershteyn wrote:
> From the discussion in v2 [1] I understood that btrfs would like to
> have this bio_set/submit_io interface for iomap transition;
For btrfs it would not be a transitional thing, but forever as it has
its own equivalent of a device mapper at this level. But now that
the fsverity interfaces work so that we don't have other file system
dependencies I would not want to design around it. If/when btrfs
migrates to iomap for the buffer read path we can easily hook this
into the existing code.
> and any
> other filesystem deferrals would also be possible. Is it no more the
> case with btrfs? Would fs-verity verification in iomap_read_end_io
> combine both solutions (fs-verity verification in iomap +
> submit_io/bio_set interface).
btrfs does also need to do I/O completion from a workqueue, but it
needs its own. fsverity OTOH is a pretty generic feature.
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 12/28] iomap: allow filesystem to implement read path verification
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (10 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 11/28] iomap: pass readpage operation to read path Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 18:39 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 13/28] xfs: add XBF_VERITY_CHECKED xfs_buf flag Andrey Albershteyn
` (15 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Currently, there is no interface to let a filesystem do
post-processing of a completed BIO (ioend) in the read path. This can be
very handy for fs-verity verification. This patch adds a callout to the
filesystem-provided ->submit_io to configure the BIO completion callout.
The read path ioend (iomap_read_ioend) is stored side by side with
BIOs allocated from the filesystem-provided bio_set.
Add IOMAP_F_READ_VERITY, which indicates that iomap needs to
verify the BIO (e.g. fs-verity) after I/O is completed.
Any verification itself happens on the filesystem side. The verification
is done when the BIO is processed by calling out ->bi_end_io().
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/iomap/buffered-io.c | 40 +++++++++++++++++++++++++++++++++-------
include/linux/iomap.h | 15 +++++++++++++++
2 files changed, 48 insertions(+), 7 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ca78c7f62527..0a1bec91fdf6 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -332,6 +332,19 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
pos >= i_size_read(iter->inode);
}
+static void
+iomap_submit_read_io(const struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx)
+{
+ if (!ctx->bio)
+ return;
+
+ if (ctx->ops && ctx->ops->submit_io)
+ ctx->ops->submit_io(iter, ctx->bio, iter->pos);
+ else
+ submit_bio(ctx->bio);
+}
+
static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx, loff_t offset)
{
@@ -355,6 +368,13 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
if (iomap_block_needs_zeroing(iter, pos)) {
folio_zero_range(folio, poff, plen);
+ if (iomap->flags & IOMAP_F_READ_VERITY) {
+ if (ctx->ops->verify_folio(folio, poff, plen)) {
+ folio_set_error(folio);
+ goto done;
+ }
+ }
+
iomap_set_range_uptodate(folio, poff, plen);
goto done;
}
@@ -371,13 +391,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
gfp_t orig_gfp = gfp;
unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
- if (ctx->bio)
- submit_bio(ctx->bio);
+ iomap_submit_read_io(iter, ctx);
if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
- REQ_OP_READ, gfp);
+
+ if (ctx->ops && ctx->ops->bio_set)
+ ctx->bio = bio_alloc_bioset(iomap->bdev,
+ bio_max_segs(nr_vecs),
+ REQ_OP_READ, GFP_NOFS,
+ ctx->ops->bio_set);
+ else
+ ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
+ REQ_OP_READ, gfp);
+
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
@@ -427,7 +454,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
folio_set_error(folio);
if (ctx.bio) {
- submit_bio(ctx.bio);
+ iomap_submit_read_io(&iter, &ctx);
WARN_ON_ONCE(!ctx.cur_folio_in_bio);
} else {
WARN_ON_ONCE(ctx.cur_folio_in_bio);
@@ -502,8 +529,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops,
while (iomap_iter(&iter, ops) > 0)
iter.processed = iomap_readahead_iter(&iter, &ctx);
- if (ctx.bio)
- submit_bio(ctx.bio);
+ iomap_submit_read_io(&iter, &ctx);
if (ctx.cur_folio) {
if (!ctx.cur_folio_in_bio)
folio_unlock(ctx.cur_folio);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3565c449f3c9..8d7206cd2f0f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -53,6 +53,9 @@ struct vm_fault;
*
* IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
* rather than a file data extent.
+ *
+ * IOMAP_F_READ_VERITY indicates that the iomap needs verification of read
+ * folios
*/
#define IOMAP_F_NEW (1U << 0)
#define IOMAP_F_DIRTY (1U << 1)
@@ -64,6 +67,7 @@ struct vm_fault;
#define IOMAP_F_BUFFER_HEAD 0
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)
+#define IOMAP_F_READ_VERITY (1U << 6)
/*
* Flags set by the core iomap code during operations:
@@ -262,7 +266,18 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
int (*punch)(struct inode *inode, loff_t pos, loff_t length));
+struct iomap_read_ioend {
+ struct inode *io_inode; /* file being read from */
+ struct work_struct work; /* post read work (e.g. fs-verity) */
+ struct bio read_inline_bio;/* MUST BE LAST! */
+};
+
struct iomap_readpage_ops {
+ /*
+ * Optional, verify folio when successfully read
+ */
+ int (*verify_folio)(struct folio *folio, loff_t pos, unsigned int len);
+
/*
* Filesystems wishing to attach private information to a direct io bio
* must provide a ->submit_io method that attaches the additional
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 12/28] iomap: allow filesystem to implement read path verification
2023-10-06 18:49 ` [PATCH v3 12/28] iomap: allow filesystem to implement read path verification Andrey Albershteyn
@ 2023-10-11 18:39 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 18:39 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:06PM +0200, Andrey Albershteyn wrote:
> Currently, there is no interface to let filesystem do
> post-processing of completed BIO (ioend) in read path. This can be
> very handy for fs-verity verification. This patch add a callout to
> filesystem provided ->submit_bio to configure BIO completion callout.
>
> The read path ioend iomap_read_ioend are stored side by side with
> BIOs allocated from filesystem provided bio_set.
>
> Add IOMAP_F_READ_VERITY which indicates that iomap need to
> verify BIO (e.g. fs-verity) after I/O is completed.
>
> Any verification itself happens on filesystem side. The verification
> is done when the BIO is processed by calling out ->bi_end_io().
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/iomap/buffered-io.c | 40 +++++++++++++++++++++++++++++++++-------
> include/linux/iomap.h | 15 +++++++++++++++
> 2 files changed, 48 insertions(+), 7 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index ca78c7f62527..0a1bec91fdf6 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -332,6 +332,19 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
> pos >= i_size_read(iter->inode);
> }
>
> +static void
> +iomap_submit_read_io(const struct iomap_iter *iter,
> + struct iomap_readpage_ctx *ctx)
> +{
> + if (!ctx->bio)
> + return;
> +
> + if (ctx->ops && ctx->ops->submit_io)
> + ctx->ops->submit_io(iter, ctx->bio, iter->pos);
> + else
> + submit_bio(ctx->bio);
> +}
> +
> static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
> struct iomap_readpage_ctx *ctx, loff_t offset)
> {
> @@ -355,6 +368,13 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>
> if (iomap_block_needs_zeroing(iter, pos)) {
> folio_zero_range(folio, poff, plen);
> + if (iomap->flags & IOMAP_F_READ_VERITY) {
> + if (ctx->ops->verify_folio(folio, poff, plen)) {
What validation does fsverity need to do for zeroed folios that aren't
read from disk? Does that imply that holes and unwritten extents are
allowed in fsverity files, and that the merkle tree will actually
contain an entry for them?
> + folio_set_error(folio);
> + goto done;
> + }
> + }
> +
> iomap_set_range_uptodate(folio, poff, plen);
> goto done;
> }
> @@ -371,13 +391,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
> gfp_t orig_gfp = gfp;
> unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
>
> - if (ctx->bio)
> - submit_bio(ctx->bio);
> + iomap_submit_read_io(iter, ctx);
>
> if (ctx->rac) /* same as readahead_gfp_mask */
> gfp |= __GFP_NORETRY | __GFP_NOWARN;
> - ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
> - REQ_OP_READ, gfp);
> +
> + if (ctx->ops && ctx->ops->bio_set)
> + ctx->bio = bio_alloc_bioset(iomap->bdev,
> + bio_max_segs(nr_vecs),
> + REQ_OP_READ, GFP_NOFS,
> + ctx->ops->bio_set);
> + else
> + ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
> + REQ_OP_READ, gfp);
/methinks this should be in the previous patch, and without the
indenting damage.
--D
> +
> /*
> * If the bio_alloc fails, try it again for a single page to
> * avoid having to deal with partial page reads. This emulates
> @@ -427,7 +454,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops,
> folio_set_error(folio);
>
> if (ctx.bio) {
> - submit_bio(ctx.bio);
> + iomap_submit_read_io(&iter, &ctx);
> WARN_ON_ONCE(!ctx.cur_folio_in_bio);
> } else {
> WARN_ON_ONCE(ctx.cur_folio_in_bio);
> @@ -502,8 +529,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops,
> while (iomap_iter(&iter, ops) > 0)
> iter.processed = iomap_readahead_iter(&iter, &ctx);
>
> - if (ctx.bio)
> - submit_bio(ctx.bio);
> + iomap_submit_read_io(&iter, &ctx);
> if (ctx.cur_folio) {
> if (!ctx.cur_folio_in_bio)
> folio_unlock(ctx.cur_folio);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 3565c449f3c9..8d7206cd2f0f 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -53,6 +53,9 @@ struct vm_fault;
> *
> * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
> * rather than a file data extent.
> + *
> + * IOMAP_F_READ_VERITY indicates that the iomap needs verification of read
> + * folios
> */
> #define IOMAP_F_NEW (1U << 0)
> #define IOMAP_F_DIRTY (1U << 1)
> @@ -64,6 +67,7 @@ struct vm_fault;
> #define IOMAP_F_BUFFER_HEAD 0
> #endif /* CONFIG_BUFFER_HEAD */
> #define IOMAP_F_XATTR (1U << 5)
> +#define IOMAP_F_READ_VERITY (1U << 6)
>
> /*
> * Flags set by the core iomap code during operations:
> @@ -262,7 +266,18 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
> struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
> int (*punch)(struct inode *inode, loff_t pos, loff_t length));
>
> +struct iomap_read_ioend {
> + struct inode *io_inode; /* file being read from */
> + struct work_struct work; /* post read work (e.g. fs-verity) */
> + struct bio read_inline_bio;/* MUST BE LAST! */
> +};
> +
> struct iomap_readpage_ops {
> + /*
> + * Optional, verify folio when successfully read
> + */
> + int (*verify_folio)(struct folio *folio, loff_t pos, unsigned int len);
> +
> /*
> * Filesystems wishing to attach private information to a direct io bio
> * must provide a ->submit_io method that attaches the additional
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 13/28] xfs: add XBF_VERITY_CHECKED xfs_buf flag
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (11 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 12/28] iomap: allow filesystem to implement read path verification Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 18:54 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 14/28] xfs: add XFS_DA_OP_BUFFER to make xfs_attr_get() return buffer Andrey Albershteyn
` (14 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
One of the essential ideas of fs-verity is that pages which are already
verified won't need to be re-verified if they are still in the page cache.
XFS will store Merkle tree blocks in extended attributes. Each
attribute has one Merkle tree block. When read, the extended attribute
data is put into an xfs_buf.
The data in the buffer is not aligned with xfs_buf pages and we
don't have a reference to these pages. Moreover, these pages are
released when value is copied out in xfs_attr code. In other words,
we can not directly mark underlying xfs_buf's pages as verified.
One way to track that these pages were verified is to mark the xattr's
buffer as verified instead. If the buffer is evicted, the incore
XBF_VERITY_CHECKED flag is lost. When the xattr is read again,
xfs_attr_get() returns a new buffer without the flag. The xfs_buf's
flag is then used to tell fs-verity whether it's a new page or a cached one.
The meaning of the flag is that value of the extended attribute in
the buffer is verified.
Note that, the underlying pages have PageChecked() == false (the way
fs-verity identifies verified pages).
The flag is being used later to SetPageChecked() on pages handed to
the fs-verity.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/xfs_buf.h | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index df8f47953bb4..d0fadb6d4b59 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -24,14 +24,15 @@ struct xfs_buf;
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-#define XBF_READ (1u << 0) /* buffer intended for reading from device */
-#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
-#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
-#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
+#define XBF_READ (1u << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
+#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
+#define XBF_VERITY_CHECKED (1u << 8) /* buffer was verified by fs-verity*/
/* buffer type flags for write callbacks */
#define _XBF_INODES (1u << 16)/* inode buffer */
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 13/28] xfs: add XBF_VERITY_CHECKED xfs_buf flag
2023-10-06 18:49 ` [PATCH v3 13/28] xfs: add XBF_VERITY_CHECKED xfs_buf flag Andrey Albershteyn
@ 2023-10-11 18:54 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 18:54 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:07PM +0200, Andrey Albershteyn wrote:
> One of essential ideas of fs-verity is that pages which are already
> verified won't need to be re-verified if they still in page cache.
>
> XFS will store Merkle tree blocks in extended attributes. Each
> attribute has one Merkle tree block. When read extended attribute
> data is put into xfs_buf.
>
> The data in the buffer is not aligned with xfs_buf pages and we
> don't have a reference to these pages. Moreover, these pages are
> released when value is copied out in xfs_attr code. In other words,
> we can not directly mark underlying xfs_buf's pages as verified.
/me wonders why the fs/verity code itself doesn't track which parts of
the merkle tree have been verified.
> One way to track that these pages were verified is to mark xattr's
> buffer as verified instead. If buffer is evicted the incore
> XBF_VERITY_CHECKED flag is lost. When the xattr is read again
> xfs_attr_get() returns new buffer without the flag. The xfs_buf's
> flag is then used to tell fs-verity if it's new page or cached one.
>
> The meaning of the flag is that value of the extended attribute in
> the buffer is verified.
Can there be multiple blocks from distant parts of the merkle tree
stored in a single xattr leaf block? I'm imagining the case where
merkle tree blocks are 4K each, but the fs block size is 64k.
(Or: what is the relationship between merkle tree blocks and fs
blocksize? Are they always the same, or can they differ?)
Or, is there some guarantee that merkle tree blocks will always be
stored as remote xattrs?
I'm worrying about the case where an xfs_buf might contain 2 merkle tree
blocks, we set XBF_VERITY_CHECKED having checked *one* of them but then
forget to check any other verity blobs that might be in the same buffer.
--D
> Note that, the underlying pages have PageChecked() == false (the way
> fs-verity identifies verified pages).
>
> The flag is being used later to SetPageChecked() on pages handed to
> the fs-verity.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/xfs/xfs_buf.h | 17 +++++++++--------
> 1 file changed, 9 insertions(+), 8 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index df8f47953bb4..d0fadb6d4b59 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -24,14 +24,15 @@ struct xfs_buf;
>
> #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
>
> -#define XBF_READ (1u << 0) /* buffer intended for reading from device */
> -#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
> -#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
> -#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
> -#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
> -#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
> -#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
> -#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
> +#define XBF_READ (1u << 0) /* buffer intended for reading from device */
> +#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
> +#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
> +#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
> +#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
> +#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
> +#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
> +#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
> +#define XBF_VERITY_CHECKED (1u << 8) /* buffer was verified by fs-verity*/
>
> /* buffer type flags for write callbacks */
> #define _XBF_INODES (1u << 16)/* inode buffer */
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 14/28] xfs: add XFS_DA_OP_BUFFER to make xfs_attr_get() return buffer
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (12 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 13/28] xfs: add XBF_VERITY_CHECKED xfs_buf flag Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-06 18:49 ` [PATCH v3 15/28] xfs: introduce workqueue for post read IO work Andrey Albershteyn
` (13 subsequent siblings)
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
With XBF_VERITY_CHECKED flag on xfs_buf XFS can track which buffers
contain verified Merkle tree blocks. However, we also need to expose
the buffer to pass a reference of underlying page to fs-verity.
This patch adds XFS_DA_OP_BUFFER to tell xfs_attr_get() to
xfs_buf_hold() underlying buffer and return it as xfs_da_args->bp.
The caller must then xfs_buf_rele() the buffer. Therefore, XFS will
hold a reference to the xfs_buf while fs-verity is verifying the
xfs_buf's content.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/libxfs/xfs_attr.c | 5 ++++-
fs/xfs/libxfs/xfs_attr_leaf.c | 7 +++++++
fs/xfs/libxfs/xfs_attr_remote.c | 13 +++++++++++--
fs/xfs/libxfs/xfs_da_btree.h | 5 ++++-
4 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 711022742e34..298b74245267 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -251,6 +251,8 @@ xfs_attr_get_ilocked(
* If the attribute is found, but exceeds the size limit set by the caller in
* args->valuelen, return -ERANGE with the size of the attribute that was found
* in args->valuelen.
+ *
+ * Using XFS_DA_OP_BUFFER the caller have to release the buffer args->bp.
*/
int
xfs_attr_get(
@@ -269,7 +271,8 @@ xfs_attr_get(
args->hashval = xfs_da_hashname(args->name, args->namelen);
/* Entirely possible to look up a name which doesn't exist */
- args->op_flags = XFS_DA_OP_OKNOENT;
+ args->op_flags = XFS_DA_OP_OKNOENT |
+ (args->op_flags & XFS_DA_OP_BUFFER);
lock_mode = xfs_ilock_attr_map_shared(args->dp);
error = xfs_attr_get_ilocked(args);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2580ae47209a..a84795d70de1 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -2531,6 +2531,13 @@ xfs_attr3_leaf_getvalue(
name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
ASSERT(name_loc->namelen == args->namelen);
ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
+
+ /* must be released by the caller */
+ if (args->op_flags & XFS_DA_OP_BUFFER) {
+ xfs_buf_hold(bp);
+ args->bp = bp;
+ }
+
return xfs_attr_copy_value(args,
&name_loc->nameval[args->namelen],
be16_to_cpu(name_loc->valuelen));
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..72908e0e1c86 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -424,9 +424,18 @@ xfs_attr_rmtval_get(
error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
&offset, &valuelen,
&dst);
- xfs_buf_relse(bp);
- if (error)
+ xfs_buf_unlock(bp);
+ /* must be released by the caller */
+ if (args->op_flags & XFS_DA_OP_BUFFER)
+ args->bp = bp;
+ else
+ xfs_buf_rele(bp);
+
+ if (error) {
+ if (args->op_flags & XFS_DA_OP_BUFFER)
+ xfs_buf_rele(args->bp);
return error;
+ }
/* roll attribute extent map forwards */
lblkno += map[i].br_blockcount;
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index a4b29827603f..269d26730bca 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -61,6 +61,7 @@ typedef struct xfs_da_args {
uint8_t filetype; /* filetype of inode for directories */
void *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
+ struct xfs_buf *bp; /* OUT: xfs_buf which contains the attr */
unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
xfs_dahash_t hashval; /* hash value of name */
@@ -95,6 +96,7 @@ typedef struct xfs_da_args {
#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
+#define XFS_DA_OP_BUFFER (1u << 9) /* Return underlying buffer */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
@@ -105,7 +107,8 @@ typedef struct xfs_da_args {
{ XFS_DA_OP_NOTIME, "NOTIME" }, \
{ XFS_DA_OP_REMOVE, "REMOVE" }, \
{ XFS_DA_OP_RECOVERY, "RECOVERY" }, \
- { XFS_DA_OP_LOGGED, "LOGGED" }
+ { XFS_DA_OP_LOGGED, "LOGGED" }, \
+ { XFS_DA_OP_BUFFER, "BUFFER" }
/*
* Storage for holding state during Btree searches and split/join ops.
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 15/28] xfs: introduce workqueue for post read IO work
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (13 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 14/28] xfs: add XFS_DA_OP_BUFFER to make xfs_attr_get() return buffer Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 18:55 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 16/28] xfs: add bio_set and submit_io for ioend post-processing Andrey Albershteyn
` (12 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
As noted by Dave there are two problems with using fs-verity's
workqueue in XFS:
1. High priority workqueues are used within XFS to ensure that data
IO completion cannot stall processing of journal IO completions.
Hence using a WQ_HIGHPRI workqueue directly in the user data IO
path is a potential filesystem livelock/deadlock vector.
2. The fsverity workqueue is global - it creates a cross-filesystem
contention point.
This patch adds a per-filesystem, per-cpu workqueue for fsverity
work.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_super.c | 9 +++++++++
2 files changed, 10 insertions(+)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d19cca099bc3..3d77844b255e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -109,6 +109,7 @@ typedef struct xfs_mount {
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
+ struct workqueue_struct *m_postread_workqueue;
struct workqueue_struct *m_reclaim_workqueue;
struct workqueue_struct *m_sync_workqueue;
struct workqueue_struct *m_blockgc_wq;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 819a3568b28f..5e1ec5978176 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -554,6 +554,12 @@ xfs_init_mount_workqueues(
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
+ mp->m_postread_workqueue = alloc_workqueue("xfs-pread/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
+ if (!mp->m_postread_workqueue)
+ goto out_destroy_postread;
+
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
0, mp->m_super->s_id);
@@ -587,6 +593,8 @@ xfs_init_mount_workqueues(
destroy_workqueue(mp->m_reclaim_workqueue);
out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
+out_destroy_postread:
+ destroy_workqueue(mp->m_postread_workqueue);
out_destroy_buf:
destroy_workqueue(mp->m_buf_workqueue);
out:
@@ -602,6 +610,7 @@ xfs_destroy_mount_workqueues(
destroy_workqueue(mp->m_inodegc_wq);
destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
+ destroy_workqueue(mp->m_postread_workqueue);
destroy_workqueue(mp->m_buf_workqueue);
}
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 15/28] xfs: introduce workqueue for post read IO work
2023-10-06 18:49 ` [PATCH v3 15/28] xfs: introduce workqueue for post read IO work Andrey Albershteyn
@ 2023-10-11 18:55 ` Darrick J. Wong
2023-10-16 12:37 ` Andrey Albershteyn
0 siblings, 1 reply; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 18:55 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:09PM +0200, Andrey Albershteyn wrote:
> As noted by Dave there are two problems with using fs-verity's
> workqueue in XFS:
>
> 1. High priority workqueues are used within XFS to ensure that data
> IO completion cannot stall processing of journal IO completions.
> Hence using a WQ_HIGHPRI workqueue directly in the user data IO
> path is a potential filesystem livelock/deadlock vector.
>
> 2. The fsverity workqueue is global - it creates a cross-filesystem
> contention point.
>
> This patch adds per-filesystem, per-cpu workqueue for fsverity
> work.
If we ever want to implement compression and/or fscrypt, can we use this
pread workqueue for that too?
Sounds good to me...
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
--D
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/xfs/xfs_mount.h | 1 +
> fs/xfs/xfs_super.c | 9 +++++++++
> 2 files changed, 10 insertions(+)
>
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index d19cca099bc3..3d77844b255e 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -109,6 +109,7 @@ typedef struct xfs_mount {
> struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
> struct workqueue_struct *m_buf_workqueue;
> struct workqueue_struct *m_unwritten_workqueue;
> + struct workqueue_struct *m_postread_workqueue;
> struct workqueue_struct *m_reclaim_workqueue;
> struct workqueue_struct *m_sync_workqueue;
> struct workqueue_struct *m_blockgc_wq;
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 819a3568b28f..5e1ec5978176 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -554,6 +554,12 @@ xfs_init_mount_workqueues(
> if (!mp->m_unwritten_workqueue)
> goto out_destroy_buf;
>
> + mp->m_postread_workqueue = alloc_workqueue("xfs-pread/%s",
> + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
> + 0, mp->m_super->s_id);
> + if (!mp->m_postread_workqueue)
> + goto out_destroy_postread;
> +
> mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
> XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
> 0, mp->m_super->s_id);
> @@ -587,6 +593,8 @@ xfs_init_mount_workqueues(
> destroy_workqueue(mp->m_reclaim_workqueue);
> out_destroy_unwritten:
> destroy_workqueue(mp->m_unwritten_workqueue);
> +out_destroy_postread:
> + destroy_workqueue(mp->m_postread_workqueue);
> out_destroy_buf:
> destroy_workqueue(mp->m_buf_workqueue);
> out:
> @@ -602,6 +610,7 @@ xfs_destroy_mount_workqueues(
> destroy_workqueue(mp->m_inodegc_wq);
> destroy_workqueue(mp->m_reclaim_workqueue);
> destroy_workqueue(mp->m_unwritten_workqueue);
> + destroy_workqueue(mp->m_postread_workqueue);
> destroy_workqueue(mp->m_buf_workqueue);
> }
>
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 15/28] xfs: introduce workqueue for post read IO work
2023-10-11 18:55 ` Darrick J. Wong
@ 2023-10-16 12:37 ` Andrey Albershteyn
0 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-16 12:37 UTC (permalink / raw)
To: Darrick J. Wong
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On 2023-10-11 11:55:58, Darrick J. Wong wrote:
> On Fri, Oct 06, 2023 at 08:49:09PM +0200, Andrey Albershteyn wrote:
> > As noted by Dave there are two problems with using fs-verity's
> > workqueue in XFS:
> >
> > 1. High priority workqueues are used within XFS to ensure that data
> > IO completion cannot stall processing of journal IO completions.
> > Hence using a WQ_HIGHPRI workqueue directly in the user data IO
> > path is a potential filesystem livelock/deadlock vector.
> >
> > 2. The fsverity workqueue is global - it creates a cross-filesystem
> > contention point.
> >
> > This patch adds per-filesystem, per-cpu workqueue for fsverity
> > work.
>
> If we ever want to implement compression and/or fscrypt, can we use this
> pread workqueue for that too?
I think yes.
> Sounds good to me...
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Thanks!
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 16/28] xfs: add bio_set and submit_io for ioend post-processing
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (14 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 15/28] xfs: introduce workqueue for post read IO work Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 18:47 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 17/28] xfs: add attribute type for fs-verity Andrey Albershteyn
` (11 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
The read IO path provides a callout for configuring the ioend. This allows
a filesystem to add verification of completed BIOs. One such task
is verification against the fs-verity tree when pages are read. iomap
allows using a custom bio_set with submit_bio() to add ioend
processing. The xfs_prepare_read_ioend() configures bio->bi_end_io,
which places the verification task in the workqueue. The task does
fs-verity verification and then calls back to iomap to finish the IO.
This patch adds the callout implementations to verify pages with
fs-verity. It also implements the folio operation .verify_folio for
direct folio verification by fs-verity.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/xfs_aops.c | 84 ++++++++++++++++++++++++++++++++++++++++++++--
fs/xfs/xfs_aops.h | 2 ++
fs/xfs/xfs_linux.h | 1 +
fs/xfs/xfs_super.c | 9 ++++-
4 files changed, 93 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b413a2dbcc18..fceb0c3de61f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -26,6 +26,8 @@ struct xfs_writepage_ctx {
unsigned int cow_seq;
};
+static struct bio_set xfs_read_ioend_bioset;
+
static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
@@ -548,19 +550,97 @@ xfs_vm_bmap(
return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}
+static void
+xfs_read_work_end_io(
+ struct work_struct *work)
+{
+ struct iomap_read_ioend *ioend =
+ container_of(work, struct iomap_read_ioend, work);
+ struct bio *bio = &ioend->read_inline_bio;
+
+ fsverity_verify_bio(bio);
+ iomap_read_end_io(bio);
+ /*
+ * The iomap_read_ioend has been freed by bio_put() in
+ * iomap_read_end_io()
+ */
+}
+
+static void
+xfs_read_end_io(
+ struct bio *bio)
+{
+ struct iomap_read_ioend *ioend =
+ container_of(bio, struct iomap_read_ioend, read_inline_bio);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+
+ WARN_ON_ONCE(!queue_work(ip->i_mount->m_postread_workqueue,
+ &ioend->work));
+}
+
+static int
+xfs_verify_folio(
+ struct folio *folio,
+ loff_t pos,
+ unsigned int len)
+{
+ if (fsverity_verify_blocks(folio, len, pos))
+ return 0;
+ return -EFSCORRUPTED;
+}
+
+int
+xfs_init_iomap_bioset(void)
+{
+ return bioset_init(&xfs_read_ioend_bioset,
+ 4 * (PAGE_SIZE / SECTOR_SIZE),
+ offsetof(struct iomap_read_ioend, read_inline_bio),
+ BIOSET_NEED_BVECS);
+}
+
+void
+xfs_free_iomap_bioset(void)
+{
+ bioset_exit(&xfs_read_ioend_bioset);
+}
+
+static void
+xfs_submit_read_bio(
+ const struct iomap_iter *iter,
+ struct bio *bio,
+ loff_t file_offset)
+{
+ struct iomap_read_ioend *ioend;
+
+ ioend = container_of(bio, struct iomap_read_ioend, read_inline_bio);
+ ioend->io_inode = iter->inode;
+ if (fsverity_active(ioend->io_inode)) {
+ INIT_WORK(&ioend->work, &xfs_read_work_end_io);
+ ioend->read_inline_bio.bi_end_io = &xfs_read_end_io;
+ }
+
+ submit_bio(bio);
+}
+
+static const struct iomap_readpage_ops xfs_readpage_ops = {
+ .verify_folio = &xfs_verify_folio,
+ .submit_io = &xfs_submit_read_bio,
+ .bio_set = &xfs_read_ioend_bioset,
+};
+
STATIC int
xfs_vm_read_folio(
struct file *unused,
struct folio *folio)
{
- return iomap_read_folio(folio, &xfs_read_iomap_ops, NULL);
+ return iomap_read_folio(folio, &xfs_read_iomap_ops, &xfs_readpage_ops);
}
STATIC void
xfs_vm_readahead(
struct readahead_control *rac)
{
- iomap_readahead(rac, &xfs_read_iomap_ops, NULL);
+ iomap_readahead(rac, &xfs_read_iomap_ops, &xfs_readpage_ops);
}
static int
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e0bd68419764..fa7c512b2717 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -10,5 +10,7 @@ extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+int xfs_init_iomap_bioset(void);
+void xfs_free_iomap_bioset(void);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e9d317a3dafe..ee213c6dfcaf 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -64,6 +64,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/xattr.h>
#include <linux/mnt_idmapping.h>
#include <linux/debugfs.h>
+#include <linux/fsverity.h>
#include <asm/page.h>
#include <asm/div64.h>
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 5e1ec5978176..3cdb642961f4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2375,11 +2375,17 @@ init_xfs_fs(void)
if (error)
goto out_remove_dbg_kobj;
- error = register_filesystem(&xfs_fs_type);
+ error = xfs_init_iomap_bioset();
if (error)
goto out_qm_exit;
+
+ error = register_filesystem(&xfs_fs_type);
+ if (error)
+ goto out_iomap_bioset;
return 0;
+ out_iomap_bioset:
+ xfs_free_iomap_bioset();
out_qm_exit:
xfs_qm_exit();
out_remove_dbg_kobj:
@@ -2412,6 +2418,7 @@ init_xfs_fs(void)
STATIC void __exit
exit_xfs_fs(void)
{
+ xfs_free_iomap_bioset();
xfs_qm_exit();
unregister_filesystem(&xfs_fs_type);
#ifdef DEBUG
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 16/28] xfs: add bio_set and submit_io for ioend post-processing
2023-10-06 18:49 ` [PATCH v3 16/28] xfs: add bio_set and submit_io for ioend post-processing Andrey Albershteyn
@ 2023-10-11 18:47 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 18:47 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:10PM +0200, Andrey Albershteyn wrote:
> The read IO path provides a callout for configuring the ioend. This
> allows a filesystem to add verification of completed BIOs. One such
> task is verification against the fs-verity tree when pages are read.
> iomap allows using a custom bio_set with submit_bio() to add ioend
> processing. xfs_submit_read_bio() configures bio->bi_end_io, which
> places the verification task on the workqueue. The task does the
> fs-verity verification and then calls back into iomap to finish the IO.
>
> This patch adds callouts implementation to verify pages with
> fs-verity. Also implements folio operation .verify_folio for direct
> folio verification by fs-verity.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/xfs/xfs_aops.c | 84 ++++++++++++++++++++++++++++++++++++++++++++--
> fs/xfs/xfs_aops.h | 2 ++
> fs/xfs/xfs_linux.h | 1 +
> fs/xfs/xfs_super.c | 9 ++++-
> 4 files changed, 93 insertions(+), 3 deletions(-)
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index b413a2dbcc18..fceb0c3de61f 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -26,6 +26,8 @@ struct xfs_writepage_ctx {
> unsigned int cow_seq;
> };
>
> +static struct bio_set xfs_read_ioend_bioset;
> +
> static inline struct xfs_writepage_ctx *
> XFS_WPC(struct iomap_writepage_ctx *ctx)
> {
> @@ -548,19 +550,97 @@ xfs_vm_bmap(
> return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
> }
>
> +static void
> +xfs_read_work_end_io(
> + struct work_struct *work)
> +{
> + struct iomap_read_ioend *ioend =
> + container_of(work, struct iomap_read_ioend, work);
> + struct bio *bio = &ioend->read_inline_bio;
> +
> + fsverity_verify_bio(bio);
> + iomap_read_end_io(bio);
> + /*
> + * The iomap_read_ioend has been freed by bio_put() in
> + * iomap_read_end_io()
> + */
> +}
> +
> +static void
> +xfs_read_end_io(
> + struct bio *bio)
> +{
> + struct iomap_read_ioend *ioend =
> + container_of(bio, struct iomap_read_ioend, read_inline_bio);
> + struct xfs_inode *ip = XFS_I(ioend->io_inode);
> +
> + WARN_ON_ONCE(!queue_work(ip->i_mount->m_postread_workqueue,
> + &ioend->work));
If queue_work fails we should EIO the read.
> +}
> +
> +static int
> +xfs_verify_folio(
> + struct folio *folio,
> + loff_t pos,
> + unsigned int len)
> +{
> + if (fsverity_verify_blocks(folio, len, pos))
> + return 0;
> + return -EFSCORRUPTED;
> +}
> +
> +int
> +xfs_init_iomap_bioset(void)
Probably should be marked __init, right?
> +{
> + return bioset_init(&xfs_read_ioend_bioset,
> + 4 * (PAGE_SIZE / SECTOR_SIZE),
> + offsetof(struct iomap_read_ioend, read_inline_bio),
> + BIOSET_NEED_BVECS);
Also, there's nothing specific to XFS in this bioset, is there?
Shouldn't this be in fs/iomap/buffered-io.c and not XFS?
> +}
> +
> +void
> +xfs_free_iomap_bioset(void)
> +{
> + bioset_exit(&xfs_read_ioend_bioset);
> +}
> +
> +static void
> +xfs_submit_read_bio(
> + const struct iomap_iter *iter,
> + struct bio *bio,
> + loff_t file_offset)
> +{
> + struct iomap_read_ioend *ioend;
> +
> + ioend = container_of(bio, struct iomap_read_ioend, read_inline_bio);
> + ioend->io_inode = iter->inode;
> + if (fsverity_active(ioend->io_inode)) {
> + INIT_WORK(&ioend->work, &xfs_read_work_end_io);
> + ioend->read_inline_bio.bi_end_io = &xfs_read_end_io;
> + }
> +
> + submit_bio(bio);
> +}
> +
> +static const struct iomap_readpage_ops xfs_readpage_ops = {
> + .verify_folio = &xfs_verify_folio,
> + .submit_io = &xfs_submit_read_bio,
> + .bio_set = &xfs_read_ioend_bioset,
> +};
> +
> STATIC int
> xfs_vm_read_folio(
> struct file *unused,
> struct folio *folio)
> {
> - return iomap_read_folio(folio, &xfs_read_iomap_ops, NULL);
> + return iomap_read_folio(folio, &xfs_read_iomap_ops, &xfs_readpage_ops);
Leave the ops parameter as NULL for non-verity filesystems to avoid the
overhead of indirect calls. Work data partitions aren't going to enable
verity.
--D
> }
>
> STATIC void
> xfs_vm_readahead(
> struct readahead_control *rac)
> {
> - iomap_readahead(rac, &xfs_read_iomap_ops, NULL);
> + iomap_readahead(rac, &xfs_read_iomap_ops, &xfs_readpage_ops);
> }
>
> static int
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index e0bd68419764..fa7c512b2717 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -10,5 +10,7 @@ extern const struct address_space_operations xfs_address_space_operations;
> extern const struct address_space_operations xfs_dax_aops;
>
> int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
> +int xfs_init_iomap_bioset(void);
> +void xfs_free_iomap_bioset(void);
>
> #endif /* __XFS_AOPS_H__ */
> diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
> index e9d317a3dafe..ee213c6dfcaf 100644
> --- a/fs/xfs/xfs_linux.h
> +++ b/fs/xfs/xfs_linux.h
> @@ -64,6 +64,7 @@ typedef __u32 xfs_nlink_t;
> #include <linux/xattr.h>
> #include <linux/mnt_idmapping.h>
> #include <linux/debugfs.h>
> +#include <linux/fsverity.h>
>
> #include <asm/page.h>
> #include <asm/div64.h>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 5e1ec5978176..3cdb642961f4 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -2375,11 +2375,17 @@ init_xfs_fs(void)
> if (error)
> goto out_remove_dbg_kobj;
>
> - error = register_filesystem(&xfs_fs_type);
> + error = xfs_init_iomap_bioset();
> if (error)
> goto out_qm_exit;
> +
> + error = register_filesystem(&xfs_fs_type);
> + if (error)
> + goto out_iomap_bioset;
> return 0;
>
> + out_iomap_bioset:
> + xfs_free_iomap_bioset();
> out_qm_exit:
> xfs_qm_exit();
> out_remove_dbg_kobj:
> @@ -2412,6 +2418,7 @@ init_xfs_fs(void)
> STATIC void __exit
> exit_xfs_fs(void)
> {
> + xfs_free_iomap_bioset();
> xfs_qm_exit();
> unregister_filesystem(&xfs_fs_type);
> #ifdef DEBUG
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 17/28] xfs: add attribute type for fs-verity
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (15 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 16/28] xfs: add bio_set and submit_io for ioend post-processing Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 18:48 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 18/28] xfs: make xfs_buf_get() to take XBF_* flags Andrey Albershteyn
` (10 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
The Merkle tree blocks and descriptor are stored in the extended
attributes of the inode. Add a new attribute type for fs-verity
metadata. Add XFS_ATTR_INTERNAL_MASK to skip parent pointer and
fs-verity attributes, as those are only for internal use. While we're
at it, add a few comments in relevant places noting that internally
visible attributes are not supposed to be handled via the interface
defined in xfs_xattr.c.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/libxfs/xfs_da_format.h | 10 +++++++++-
fs/xfs/libxfs/xfs_log_format.h | 1 +
fs/xfs/xfs_ioctl.c | 5 +++++
fs/xfs/xfs_trace.h | 1 +
fs/xfs/xfs_xattr.c | 9 +++++++++
5 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 6deefe03207f..b56bdae83563 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -699,14 +699,22 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
#define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */
+#define XFS_ATTR_VERITY_BIT 4 /* verity merkle tree and descriptor */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
#define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT)
+#define XFS_ATTR_VERITY (1u << XFS_ATTR_VERITY_BIT)
#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
#define XFS_ATTR_NSP_ONDISK_MASK \
- (XFS_ATTR_ROOT | XFS_ATTR_SECURE | XFS_ATTR_PARENT)
+ (XFS_ATTR_ROOT | XFS_ATTR_SECURE | XFS_ATTR_PARENT | \
+ XFS_ATTR_VERITY)
+
+/*
+ * Internal attributes not exposed to the user
+ */
+#define XFS_ATTR_INTERNAL_MASK (XFS_ATTR_PARENT | XFS_ATTR_VERITY)
/*
* Alignment for namelist and valuelist entries (since they are mixed
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 0bc1749fb7bb..c42cc58cd152 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -975,6 +975,7 @@ struct xfs_icreate_log {
#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
XFS_ATTR_SECURE | \
XFS_ATTR_PARENT | \
+ XFS_ATTR_VERITY | \
XFS_ATTR_INCOMPLETE)
/*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 55bb01173cde..3d6d680b6cf3 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -351,6 +351,11 @@ static unsigned int
xfs_attr_filter(
u32 ioc_flags)
{
+ /*
+ * Only externally visible attributes should be specified here.
+ * Internally used attributes (such as parent pointers or fs-verity)
+ * should not be exposed to userspace.
+ */
if (ioc_flags & XFS_IOC_ATTR_ROOT)
return XFS_ATTR_ROOT;
if (ioc_flags & XFS_IOC_ATTR_SECURE)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3926cf7f2a6e..3696709907bf 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -82,6 +82,7 @@ struct xfs_perag;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
{ XFS_ATTR_SECURE, "SECURE" }, \
+ { XFS_ATTR_VERITY, "VERITY" }, \
{ XFS_ATTR_INCOMPLETE, "INCOMPLETE" }
DECLARE_EVENT_CLASS(xfs_attr_list_class,
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index a3975f325f4e..56f7f4122fcb 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -20,6 +20,12 @@
#include <linux/posix_acl_xattr.h>
+/*
+ * This file defines interface to work with externally visible extended
+ * attributes, such as those in system or security namespaces. This interface
+ * should not be used for internally used attributes (consider xfs_attr.c).
+ */
+
/*
* Get permission to use log-assisted atomic exchange of file extents.
*
@@ -241,6 +247,9 @@ xfs_xattr_put_listent(
ASSERT(context->count >= 0);
+ if (flags & XFS_ATTR_INTERNAL_MASK)
+ return;
+
if (flags & XFS_ATTR_ROOT) {
#ifdef CONFIG_XFS_POSIX_ACL
if (namelen == SGI_ACL_FILE_SIZE &&
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 17/28] xfs: add attribute type for fs-verity
2023-10-06 18:49 ` [PATCH v3 17/28] xfs: add attribute type for fs-verity Andrey Albershteyn
@ 2023-10-11 18:48 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 18:48 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:11PM +0200, Andrey Albershteyn wrote:
> The Merkle tree blocks and descriptor are stored in the extended
> attributes of the inode. Add a new attribute type for fs-verity
> metadata. Add XFS_ATTR_INTERNAL_MASK to skip parent pointer and
> fs-verity attributes, as those are only for internal use. While we're
> at it, add a few comments in relevant places noting that internally
> visible attributes are not supposed to be handled via the interface
> defined in xfs_xattr.c.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/xfs/libxfs/xfs_da_format.h | 10 +++++++++-
> fs/xfs/libxfs/xfs_log_format.h | 1 +
> fs/xfs/xfs_ioctl.c | 5 +++++
> fs/xfs/xfs_trace.h | 1 +
> fs/xfs/xfs_xattr.c | 9 +++++++++
> 5 files changed, 25 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
> index 6deefe03207f..b56bdae83563 100644
> --- a/fs/xfs/libxfs/xfs_da_format.h
> +++ b/fs/xfs/libxfs/xfs_da_format.h
> @@ -699,14 +699,22 @@ struct xfs_attr3_leafblock {
> #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
> #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
> #define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */
> +#define XFS_ATTR_VERITY_BIT 4 /* verity merkle tree and descriptor */
> #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
> #define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
> #define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
> #define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
> #define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT)
> +#define XFS_ATTR_VERITY (1u << XFS_ATTR_VERITY_BIT)
> #define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
> #define XFS_ATTR_NSP_ONDISK_MASK \
> - (XFS_ATTR_ROOT | XFS_ATTR_SECURE | XFS_ATTR_PARENT)
> + (XFS_ATTR_ROOT | XFS_ATTR_SECURE | XFS_ATTR_PARENT | \
> + XFS_ATTR_VERITY)
> +
> +/*
> + * Internal attributes not exposed to the user
> + */
> +#define XFS_ATTR_INTERNAL_MASK (XFS_ATTR_PARENT | XFS_ATTR_VERITY)
>
> /*
> * Alignment for namelist and valuelist entries (since they are mixed
> diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> index 0bc1749fb7bb..c42cc58cd152 100644
> --- a/fs/xfs/libxfs/xfs_log_format.h
> +++ b/fs/xfs/libxfs/xfs_log_format.h
> @@ -975,6 +975,7 @@ struct xfs_icreate_log {
> #define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
> XFS_ATTR_SECURE | \
> XFS_ATTR_PARENT | \
> + XFS_ATTR_VERITY | \
> XFS_ATTR_INCOMPLETE)
>
> /*
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 55bb01173cde..3d6d680b6cf3 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -351,6 +351,11 @@ static unsigned int
> xfs_attr_filter(
> u32 ioc_flags)
> {
> + /*
> + * Only externally visible attributes should be specified here.
> + * Internally used attributes (such as parent pointers or fs-verity)
> + * should not be exposed to userspace.
> + */
> if (ioc_flags & XFS_IOC_ATTR_ROOT)
> return XFS_ATTR_ROOT;
> if (ioc_flags & XFS_IOC_ATTR_SECURE)
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 3926cf7f2a6e..3696709907bf 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -82,6 +82,7 @@ struct xfs_perag;
> #define XFS_ATTR_FILTER_FLAGS \
> { XFS_ATTR_ROOT, "ROOT" }, \
> { XFS_ATTR_SECURE, "SECURE" }, \
> + { XFS_ATTR_VERITY, "VERITY" }, \
> { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }
>
> DECLARE_EVENT_CLASS(xfs_attr_list_class,
> diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
> index a3975f325f4e..56f7f4122fcb 100644
> --- a/fs/xfs/xfs_xattr.c
> +++ b/fs/xfs/xfs_xattr.c
> @@ -20,6 +20,12 @@
>
> #include <linux/posix_acl_xattr.h>
>
> +/*
> + * This file defines interface to work with externally visible extended
> + * attributes, such as those in system or security namespaces. This interface
"...such as those in user, system, or security namespaces."
With that fixed,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
--D
> + * should not be used for internally used attributes (consider xfs_attr.c).
> + */
> +
> /*
> * Get permission to use log-assisted atomic exchange of file extents.
> *
> @@ -241,6 +247,9 @@ xfs_xattr_put_listent(
>
> ASSERT(context->count >= 0);
>
> + if (flags & XFS_ATTR_INTERNAL_MASK)
> + return;
> +
> if (flags & XFS_ATTR_ROOT) {
> #ifdef CONFIG_XFS_POSIX_ACL
> if (namelen == SGI_ACL_FILE_SIZE &&
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 18/28] xfs: make xfs_buf_get() to take XBF_* flags
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (16 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 17/28] xfs: add attribute type for fs-verity Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-06 18:49 ` [PATCH v3 19/28] xfs: add XBF_DOUBLE_ALLOC to increase size of the buffer Andrey Albershteyn
` (9 subsequent siblings)
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Allow passing XBF_* buffer flags to xfs_buf_get(). This will allow
fs-verity to specify a flag for increased buffer size.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/libxfs/xfs_attr_remote.c | 2 +-
fs/xfs/libxfs/xfs_sb.c | 2 +-
fs/xfs/xfs_buf.h | 3 ++-
3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 72908e0e1c86..5762135dc2a6 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -521,7 +521,7 @@ xfs_attr_rmtval_set_value(
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, &bp);
+ error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0, &bp);
if (error)
return error;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6264daaab37b..4191da4fb669 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1096,7 +1096,7 @@ xfs_update_secondary_sbs(
error = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR),
- XFS_FSS_TO_BB(mp, 1), &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
/*
* If we get an error reading or writing alternate superblocks,
* continue. xfs_repair chooses the "best" superblock based
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index d0fadb6d4b59..e79bfe548952 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -243,11 +243,12 @@ xfs_buf_get(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
size_t numblks,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_get_map(target, &map, 1, 0, bpp);
+ return xfs_buf_get_map(target, &map, 1, flags, bpp);
}
static inline int
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 19/28] xfs: add XBF_DOUBLE_ALLOC to increase size of the buffer
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (17 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 18/28] xfs: make xfs_buf_get() to take XBF_* flags Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-06 18:49 ` [PATCH v3 20/28] xfs: add fs-verity ro-compat flag Andrey Albershteyn
` (8 subsequent siblings)
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
For fs-verity integration, XFS needs to supply the kernel addresses
(kaddrs) of Merkle tree blocks to the fs-verity core and track which
blocks are already verified. One way to track verified status is to
set an xfs_buf flag (the previously added XBF_VERITY_CHECKED). When
the xfs_buf is evicted from memory we lose the verified status.
Otherwise, fs-verity hits the xfs_buf which is still in cache and
contains already verified blocks.
However, the leaf blocks which are read into the xfs_buf contain leaf
headers. xfs_attr_get() allocates new pages and copies out the data
without the headers. Those newly allocated pages with extended
attribute data are not attached to the buffer anymore.
Add a new XBF_DOUBLE_ALLOC flag which makes xfs_buf allocate twice the
memory for the buffer. The additional memory will be used for a copy
of the attribute data without any headers. Also, make
xfs_attr_rmtval_get() copy the data into the buffer itself if XFS
asked for an fs-verity block.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/libxfs/xfs_attr_remote.c | 27 +++++++++++++++++++++++++--
fs/xfs/xfs_buf.c | 7 ++++++-
fs/xfs/xfs_buf.h | 1 +
3 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 5762135dc2a6..7657daf7cff3 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -392,12 +392,22 @@ xfs_attr_rmtval_get(
int blkcnt = args->rmtblkcnt;
int i;
int offset = 0;
+ int flags = 0;
+ void *addr;
trace_xfs_attr_rmtval_get(args);
ASSERT(args->valuelen != 0);
ASSERT(args->rmtvaluelen == args->valuelen);
+ /*
+ * We also check for _OP_BUFFER as we want to trigger on
+ * verity blocks only, not on verity_descriptor
+ */
+ if (args->attr_filter & XFS_ATTR_VERITY &&
+ args->op_flags & XFS_DA_OP_BUFFER)
+ flags = XBF_DOUBLE_ALLOC;
+
valuelen = args->rmtvaluelen;
while (valuelen > 0) {
nmap = ATTR_RMTVALUE_MAPSIZE;
@@ -417,13 +427,25 @@ xfs_attr_rmtval_get(
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
- 0, &bp, &xfs_attr3_rmt_buf_ops);
+ flags, &bp, &xfs_attr3_rmt_buf_ops);
if (error)
return error;
+ /*
+ * For fs-verity we allocated more space. That space is
+ * filled with the same xattr data but without leaf
+ * headers. Point args->value to that data
+ */
+ if (flags & XBF_DOUBLE_ALLOC) {
+ addr = xfs_buf_offset(bp, BBTOB(bp->b_length));
+ args->value = addr;
+ dst = addr;
+ }
+
error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
&offset, &valuelen,
&dst);
+
xfs_buf_unlock(bp);
/* must be released by the caller */
if (args->op_flags & XFS_DA_OP_BUFFER)
@@ -521,7 +543,8 @@ xfs_attr_rmtval_set_value(
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0, &bp);
+ error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt,
+ XBF_DOUBLE_ALLOC, &bp);
if (error)
return error;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c1ece4a08ff4..c5071a970596 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -328,6 +328,9 @@ xfs_buf_alloc_kmem(
xfs_km_flags_t kmflag_mask = KM_NOFS;
size_t size = BBTOB(bp->b_length);
+ if (flags & XBF_DOUBLE_ALLOC)
+ size *= 2;
+
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
kmflag_mask |= KM_ZERO;
@@ -358,6 +361,7 @@ xfs_buf_alloc_pages(
{
gfp_t gfp_mask = __GFP_NOWARN;
long filled = 0;
+ int mul = (flags & XBF_DOUBLE_ALLOC) ? 2 : 1;
if (flags & XBF_READ_AHEAD)
gfp_mask |= __GFP_NORETRY;
@@ -365,7 +369,8 @@ xfs_buf_alloc_pages(
gfp_mask |= GFP_NOFS;
/* Make sure that we have a page list */
- bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
+ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length*mul), PAGE_SIZE);
+
if (bp->b_page_count <= XB_PAGES) {
bp->b_pages = bp->b_page_array;
} else {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e79bfe548952..8e3272fb6e65 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -33,6 +33,7 @@ struct xfs_buf;
#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
#define XBF_VERITY_CHECKED (1u << 8) /* buffer was verified by fs-verity*/
+#define XBF_DOUBLE_ALLOC (1u << 9) /* double allocated space */
/* buffer type flags for write callbacks */
#define _XBF_INODES (1u << 16)/* inode buffer */
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 20/28] xfs: add fs-verity ro-compat flag
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (18 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 19/28] xfs: add XBF_DOUBLE_ALLOC to increase size of the buffer Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 18:56 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 21/28] xfs: add inode on-disk VERITY flag Andrey Albershteyn
` (7 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
To mark inodes sealed with fs-verity, the new XFS_DIFLAG2_VERITY flag
will be added in a later patch. This requires a ro-compat flag to let
older kernels know that a filesystem with fs-verity cannot be modified.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/libxfs/xfs_format.h | 1 +
fs/xfs/libxfs/xfs_sb.c | 2 ++
fs/xfs/xfs_mount.h | 2 ++
3 files changed, 5 insertions(+)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 371dc07233e0..ef617be2839c 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
+#define XFS_SB_FEAT_RO_COMPAT_VERITY (1 << 4) /* fs-verity */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4191da4fb669..236f3b833fa4 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -162,6 +162,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_REFLINK;
if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
features |= XFS_FEAT_INOBTCNT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_VERITY)
+ features |= XFS_FEAT_VERITY;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
features |= XFS_FEAT_FTYPE;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 3d77844b255e..95fba704f60e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -288,6 +288,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
+#define XFS_FEAT_VERITY (1ULL << 27) /* fs-verity */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -351,6 +352,7 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
+__XFS_HAS_FEAT(verity, VERITY)
/*
* Mount features
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 20/28] xfs: add fs-verity ro-compat flag
2023-10-06 18:49 ` [PATCH v3 20/28] xfs: add fs-verity ro-compat flag Andrey Albershteyn
@ 2023-10-11 18:56 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 18:56 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:14PM +0200, Andrey Albershteyn wrote:
> To mark inodes sealed with fs-verity, the new XFS_DIFLAG2_VERITY flag
> will be added in a later patch. This requires a ro-compat flag to let
> older kernels know that a filesystem with fs-verity cannot be modified.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
LGTM,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
--D
> ---
> fs/xfs/libxfs/xfs_format.h | 1 +
> fs/xfs/libxfs/xfs_sb.c | 2 ++
> fs/xfs/xfs_mount.h | 2 ++
> 3 files changed, 5 insertions(+)
>
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index 371dc07233e0..ef617be2839c 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
> #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
> #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
> #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
> +#define XFS_SB_FEAT_RO_COMPAT_VERITY (1 << 4) /* fs-verity */
> #define XFS_SB_FEAT_RO_COMPAT_ALL \
> (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
> XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index 4191da4fb669..236f3b833fa4 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -162,6 +162,8 @@ xfs_sb_version_to_features(
> features |= XFS_FEAT_REFLINK;
> if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
> features |= XFS_FEAT_INOBTCNT;
> + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_VERITY)
> + features |= XFS_FEAT_VERITY;
> if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
> features |= XFS_FEAT_FTYPE;
> if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 3d77844b255e..95fba704f60e 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -288,6 +288,7 @@ typedef struct xfs_mount {
> #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
> #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
> #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
> +#define XFS_FEAT_VERITY (1ULL << 27) /* fs-verity */
>
> /* Mount features */
> #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
> @@ -351,6 +352,7 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
> __XFS_HAS_FEAT(bigtime, BIGTIME)
> __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
> __XFS_HAS_FEAT(large_extent_counts, NREXT64)
> +__XFS_HAS_FEAT(verity, VERITY)
>
> /*
> * Mount features
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 21/28] xfs: add inode on-disk VERITY flag
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (19 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 20/28] xfs: add fs-verity ro-compat flag Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 18:57 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 22/28] xfs: initialize fs-verity on file open and cleanup on inode destruction Andrey Albershteyn
` (6 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Add a flag to mark inodes which have fs-verity enabled on them (i.e.
the descriptor exists and the tree is built).
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/ioctl.c | 4 ++++
fs/xfs/libxfs/xfs_format.h | 4 +++-
fs/xfs/xfs_inode.c | 2 ++
fs/xfs/xfs_iops.c | 2 ++
4 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f5fd99d6b0d4..81a69cb8016b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -481,6 +481,8 @@ void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
fa->flags |= FS_DAX_FL;
if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
fa->flags |= FS_PROJINHERIT_FL;
+ if (fa->fsx_xflags & FS_XFLAG_VERITY)
+ fa->flags |= FS_VERITY_FL;
}
EXPORT_SYMBOL(fileattr_fill_xflags);
@@ -511,6 +513,8 @@ void fileattr_fill_flags(struct fileattr *fa, u32 flags)
fa->fsx_xflags |= FS_XFLAG_DAX;
if (fa->flags & FS_PROJINHERIT_FL)
fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+ if (fa->flags & FS_VERITY_FL)
+ fa->fsx_xflags |= FS_XFLAG_VERITY;
}
EXPORT_SYMBOL(fileattr_fill_flags);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index ef617be2839c..ccb2ae5c2c93 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1070,16 +1070,18 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
+#define XFS_DIFLAG2_VERITY_BIT 5 /* inode sealed by fsverity */
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_VERITY (1 << XFS_DIFLAG2_VERITY_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_VERITY)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4d55f58d99b7..94eb33abcb8f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -634,6 +634,8 @@ xfs_ip2xflags(
flags |= FS_XFLAG_DAX;
if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
flags |= FS_XFLAG_COWEXTSIZE;
+ if (ip->i_diflags2 & XFS_DIFLAG2_VERITY)
+ flags |= FS_XFLAG_VERITY;
}
if (xfs_inode_has_attr_fork(ip))
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1c1e6171209d..9f2d5c2505ae 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1237,6 +1237,8 @@ xfs_diflags_to_iflags(
flags |= S_NOATIME;
if (init && xfs_inode_should_enable_dax(ip))
flags |= S_DAX;
+ if (xflags & FS_XFLAG_VERITY)
+ flags |= S_VERITY;
/*
* S_DAX can only be set during inode initialization and is never set by
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 21/28] xfs: add inode on-disk VERITY flag
2023-10-06 18:49 ` [PATCH v3 21/28] xfs: add inode on-disk VERITY flag Andrey Albershteyn
@ 2023-10-11 18:57 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 18:57 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:15PM +0200, Andrey Albershteyn wrote:
> Add flag to mark inodes which have fs-verity enabled on them (i.e.
> descriptor exist and tree is built).
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/ioctl.c | 4 ++++
> fs/xfs/libxfs/xfs_format.h | 4 +++-
> fs/xfs/xfs_inode.c | 2 ++
> fs/xfs/xfs_iops.c | 2 ++
> 4 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index f5fd99d6b0d4..81a69cb8016b 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -481,6 +481,8 @@ void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
> fa->flags |= FS_DAX_FL;
> if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
> fa->flags |= FS_PROJINHERIT_FL;
> + if (fa->fsx_xflags & FS_XFLAG_VERITY)
> + fa->flags |= FS_VERITY_FL;
> }
> EXPORT_SYMBOL(fileattr_fill_xflags);
>
> @@ -511,6 +513,8 @@ void fileattr_fill_flags(struct fileattr *fa, u32 flags)
> fa->fsx_xflags |= FS_XFLAG_DAX;
> if (fa->flags & FS_PROJINHERIT_FL)
> fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
> + if (fa->flags & FS_VERITY_FL)
> + fa->fsx_xflags |= FS_XFLAG_VERITY;
> }
> EXPORT_SYMBOL(fileattr_fill_flags);
>
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index ef617be2839c..ccb2ae5c2c93 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -1070,16 +1070,18 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
> #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
> #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
> #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
> +#define XFS_DIFLAG2_VERITY_BIT 5 /* inode sealed by fsverity */
>
> #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
> #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
> #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
> #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
> #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
> +#define XFS_DIFLAG2_VERITY (1 << XFS_DIFLAG2_VERITY_BIT)
>
> #define XFS_DIFLAG2_ANY \
> (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
> - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
> + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_VERITY)
>
> static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
> {
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 4d55f58d99b7..94eb33abcb8f 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -634,6 +634,8 @@ xfs_ip2xflags(
> flags |= FS_XFLAG_DAX;
> if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
> flags |= FS_XFLAG_COWEXTSIZE;
> + if (ip->i_diflags2 & XFS_DIFLAG2_VERITY)
> + flags |= FS_XFLAG_VERITY;
> }
>
> if (xfs_inode_has_attr_fork(ip))
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 1c1e6171209d..9f2d5c2505ae 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -1237,6 +1237,8 @@ xfs_diflags_to_iflags(
> flags |= S_NOATIME;
> if (init && xfs_inode_should_enable_dax(ip))
> flags |= S_DAX;
> + if (xflags & FS_XFLAG_VERITY)
> + flags |= S_VERITY;
>
> /*
> * S_DAX can only be set during inode initialization and is never set by
I think Eric Biggers already covered this, but I don't think you can let
the FSSETXATTR ioctl set FS_XFLAG_VERITY.
--D
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 22/28] xfs: initialize fs-verity on file open and cleanup on inode destruction
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (20 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 21/28] xfs: add inode on-disk VERITY flag Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-06 18:49 ` [PATCH v3 23/28] xfs: don't allow to enable DAX on fs-verity sealsed inode Andrey Albershteyn
` (5 subsequent siblings)
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
fs-verity will read and attach metadata (not the tree itself) from
disk for those inodes which already have fs-verity enabled.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/xfs_file.c | 8 ++++++++
fs/xfs/xfs_super.c | 2 ++
2 files changed, 10 insertions(+)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..a92c8197c26a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,6 +31,7 @@
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>
+#include <linux/fsverity.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -1191,10 +1192,17 @@ xfs_file_open(
struct inode *inode,
struct file *file)
{
+ int error = 0;
+
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
+
+ error = fsverity_file_open(inode, file);
+ if (error)
+ return error;
+
return generic_file_open(inode, file);
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3cdb642961f4..6a3b5285044a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,7 @@
#include <linux/magic.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include <linux/fsverity.h>
static const struct super_operations xfs_super_operations;
@@ -673,6 +674,7 @@ xfs_fs_destroy_inode(
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
XFS_STATS_INC(ip->i_mount, vn_rele);
XFS_STATS_INC(ip->i_mount, vn_remove);
+ fsverity_cleanup_inode(inode);
xfs_inode_mark_reclaimable(ip);
}
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 23/28] xfs: don't allow to enable DAX on fs-verity sealsed inode
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (21 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 22/28] xfs: initialize fs-verity on file open and cleanup on inode destruction Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 19:00 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 24/28] xfs: disable direct read path for fs-verity sealed files Andrey Albershteyn
` (4 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
fs-verity doesn't support DAX. Forbid the filesystem from enabling DAX
on inodes which already have fs-verity enabled. The opposite case is
checked when fs-verity is being enabled: it won't be enabled if DAX is.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/xfs_iops.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 9f2d5c2505ae..3153767f0d6f 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1209,6 +1209,8 @@ xfs_inode_should_enable_dax(
return false;
if (!xfs_inode_supports_dax(ip))
return false;
+ if (ip->i_diflags2 & XFS_DIFLAG2_VERITY)
+ return false;
if (xfs_has_dax_always(ip->i_mount))
return true;
if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 23/28] xfs: don't allow to enable DAX on fs-verity sealsed inode
2023-10-06 18:49 ` [PATCH v3 23/28] xfs: don't allow to enable DAX on fs-verity sealsed inode Andrey Albershteyn
@ 2023-10-11 19:00 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 19:00 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:17PM +0200, Andrey Albershteyn wrote:
> fs-verity doesn't support DAX. Forbid filesystem to enable DAX on
> inodes which already have fs-verity enabled. The opposite is checked
> when fs-verity is enabled, it won't be enabled if DAX is.
Why can't we allow S_DAX and S_VERITY at the same time? Is there a
design problem that prohibits checking the verity data when we go to
copy the pmem to userspace or take a read fault, or is S_DAX support
simply not implemented?
--D
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/xfs/xfs_iops.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 9f2d5c2505ae..3153767f0d6f 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -1209,6 +1209,8 @@ xfs_inode_should_enable_dax(
> return false;
> if (!xfs_inode_supports_dax(ip))
> return false;
> + if (ip->i_diflags2 & XFS_DIFLAG2_VERITY)
> + return false;
> if (xfs_has_dax_always(ip->i_mount))
> return true;
> if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 24/28] xfs: disable direct read path for fs-verity sealed files
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (22 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 23/28] xfs: don't allow to enable DAX on fs-verity sealsed inode Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 19:02 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 25/28] xfs: add fs-verity support Andrey Albershteyn
` (3 subsequent siblings)
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
The direct I/O path is not supported on verity files. Attempts to use the
direct I/O path on such files should fall back to the buffered I/O path.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/xfs_file.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a92c8197c26a..7363cbdff803 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -244,7 +244,8 @@ xfs_file_dax_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
ssize_t ret = 0;
trace_xfs_file_dax_read(iocb, to);
@@ -297,10 +298,17 @@ xfs_file_read_iter(
if (IS_DAX(inode))
ret = xfs_file_dax_read(iocb, to);
- else if (iocb->ki_flags & IOCB_DIRECT)
+ else if (iocb->ki_flags & IOCB_DIRECT && !fsverity_active(inode))
ret = xfs_file_dio_read(iocb, to);
- else
+ else {
+ /*
+ * In case fs-verity is enabled, we also fallback to the
+ * buffered read from the direct read path. Therefore,
+ * IOCB_DIRECT is set and need to be cleared
+ */
+ iocb->ki_flags &= ~IOCB_DIRECT;
ret = xfs_file_buffered_read(iocb, to);
+ }
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 24/28] xfs: disable direct read path for fs-verity sealed files
2023-10-06 18:49 ` [PATCH v3 24/28] xfs: disable direct read path for fs-verity sealed files Andrey Albershteyn
@ 2023-10-11 19:02 ` Darrick J. Wong
0 siblings, 0 replies; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 19:02 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:18PM +0200, Andrey Albershteyn wrote:
> The direct path is not supported on verity files. Attempts to use direct
> I/O path on such files should fall back to buffered I/O path.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/xfs/xfs_file.c | 14 +++++++++++---
> 1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index a92c8197c26a..7363cbdff803 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -244,7 +244,8 @@ xfs_file_dax_read(
> struct kiocb *iocb,
> struct iov_iter *to)
> {
> - struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
> + struct inode *inode = iocb->ki_filp->f_mapping->host;
> + struct xfs_inode *ip = XFS_I(inode);
> ssize_t ret = 0;
>
> trace_xfs_file_dax_read(iocb, to);
> @@ -297,10 +298,17 @@ xfs_file_read_iter(
>
> if (IS_DAX(inode))
> ret = xfs_file_dax_read(iocb, to);
> - else if (iocb->ki_flags & IOCB_DIRECT)
> + else if (iocb->ki_flags & IOCB_DIRECT && !fsverity_active(inode))
> ret = xfs_file_dio_read(iocb, to);
> - else
> + else {
> + /*
> + * In case fs-verity is enabled, we also fallback to the
> + * buffered read from the direct read path. Therefore,
> + * IOCB_DIRECT is set and need to be cleared
> + */
> + iocb->ki_flags &= ~IOCB_DIRECT;
We don't clear IOCB_DIRECT when directio writes fall back to buffered
writes; why is it necessary here?
--D
> ret = xfs_file_buffered_read(iocb, to);
> + }
>
> if (ret > 0)
> XFS_STATS_ADD(mp, xs_read_bytes, ret);
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 25/28] xfs: add fs-verity support
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (23 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 24/28] xfs: disable direct read path for fs-verity sealed files Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-06 23:40 ` kernel test robot
` (2 more replies)
2023-10-06 18:49 ` [PATCH v3 26/28] xfs: make scrub aware of verity dinode flag Andrey Albershteyn
` (2 subsequent siblings)
27 siblings, 3 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Add integration with fs-verity. XFS stores fs-verity metadata in
extended attributes. The metadata consists of the verity descriptor
and the Merkle tree blocks.
The descriptor is stored under the "verity_descriptor" extended
attribute. The Merkle tree blocks are stored under binary attribute
names, which are the blocks' offsets into the Merkle tree.
When fs-verity is enabled on an inode, the XFS_IVERITY_CONSTRUCTION
flag is set, meaning that the Merkle tree is being built. The
initialization ends with storing the verity descriptor and setting
the inode on-disk flag (XFS_DIFLAG2_VERITY).
The verification on read is done in iomap. Based on the inode verity
state, the IOMAP_F_READ_VERITY flag is set in xfs_bmbt_to_iomap() to let
iomap know that verification is needed.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/Makefile | 1 +
fs/xfs/libxfs/xfs_attr.c | 13 ++
fs/xfs/libxfs/xfs_attr_leaf.c | 17 ++-
fs/xfs/libxfs/xfs_attr_remote.c | 8 +-
fs/xfs/libxfs/xfs_da_format.h | 16 ++
fs/xfs/xfs_inode.h | 3 +-
fs/xfs/xfs_iomap.c | 3 +
fs/xfs/xfs_ondisk.h | 4 +
fs/xfs/xfs_super.c | 8 +
fs/xfs/xfs_verity.c | 257 ++++++++++++++++++++++++++++++++
fs/xfs/xfs_verity.h | 37 +++++
11 files changed, 360 insertions(+), 7 deletions(-)
create mode 100644 fs/xfs/xfs_verity.c
create mode 100644 fs/xfs/xfs_verity.h
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7762c01a85cf..c1a58ed8b419 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -130,6 +130,7 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
+xfs-$(CONFIG_FS_VERITY) += xfs_verity.o
# notify failure
ifeq ($(CONFIG_MEMORY_FAILURE),y)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 298b74245267..25e1f829e01e 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -26,6 +26,7 @@
#include "xfs_trace.h"
#include "xfs_attr_item.h"
#include "xfs_xattr.h"
+#include "xfs_verity.h"
struct kmem_cache *xfs_attr_intent_cache;
@@ -1635,6 +1636,18 @@ xfs_attr_namecheck(
return xfs_verify_pptr(mp, (struct xfs_parent_name_rec *)name);
}
+ if (flags & XFS_ATTR_VERITY) {
+ /* Merkle tree pages are stored under u64 indexes */
+ if (length == sizeof(struct xfs_fsverity_merkle_key))
+ return true;
+
+ /* Verity descriptor blocks are held in a named attribute. */
+ if (length == XFS_VERITY_DESCRIPTOR_NAME_LEN)
+ return true;
+
+ return false;
+ }
+
return xfs_str_attr_namecheck(name, length);
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index a84795d70de1..36d1f88d972f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -29,6 +29,7 @@
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_errortag.h"
+#include "xfs_verity.h"
/*
@@ -518,7 +519,12 @@ xfs_attr_copy_value(
return -ERANGE;
}
- if (!args->value) {
+ /*
+ * We don't want to allocate memory for fs-verity Merkle tree blocks
+ * (fs-verity descriptor is fine though). They will be stored in
+ * underlying xfs_buf
+ */
+ if (!args->value && !xfs_verity_merkle_block(args)) {
args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP);
if (!args->value)
return -ENOMEM;
@@ -537,7 +543,14 @@ xfs_attr_copy_value(
*/
if (!value)
return -EINVAL;
- memcpy(args->value, value, valuelen);
+ /*
+ * We won't copy Merkle tree block to the args->value as we want it be
+ * in the xfs_buf. And we didn't allocate any memory in args->value.
+ */
+ if (xfs_verity_merkle_block(args))
+ args->value = value;
+ else
+ memcpy(args->value, value, valuelen);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 7657daf7cff3..7b4424e3454b 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -22,6 +22,7 @@
#include "xfs_attr_remote.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_verity.h"
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
@@ -401,11 +402,10 @@ xfs_attr_rmtval_get(
ASSERT(args->rmtvaluelen == args->valuelen);
/*
- * We also check for _OP_BUFFER as we want to trigger on
- * verity blocks only, not on verity_descriptor
+ * For fs-verity we want additional space in the xfs_buf. This space is
+ * used to copy xattr value without leaf headers (crc header).
*/
- if (args->attr_filter & XFS_ATTR_VERITY &&
- args->op_flags & XFS_DA_OP_BUFFER)
+ if (xfs_verity_merkle_block(args))
flags = XBF_DOUBLE_ALLOC;
valuelen = args->rmtvaluelen;
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index b56bdae83563..a678ad5e4a08 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -903,4 +903,20 @@ struct xfs_parent_name_irec {
uint8_t p_namelen;
};
+/*
+ * fs-verity attribute name format
+ *
+ * Merkle tree blocks are stored under extended attributes of the inode. The
+ * name of the attributes are offsets into merkle tree.
+ */
+struct xfs_fsverity_merkle_key {
+ __be64 merkleoff;
+};
+
+static inline void
+xfs_fsverity_merkle_key_to_disk(struct xfs_fsverity_merkle_key *key, loff_t pos)
+{
+ key->merkleoff = cpu_to_be64(pos);
+}
+
#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..e6c30a69e8d1 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -342,7 +342,8 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
* inactivation completes, both flags will be cleared and the inode is a
* plain old IRECLAIMABLE inode.
*/
-#define XFS_INACTIVATING (1 << 13)
+#define XFS_INACTIVATING (1 << 13)
+#define XFS_IVERITY_CONSTRUCTION (1 << 14) /* merkle tree construction */
/* Quotacheck is running but inode has not been added to quota counts. */
#define XFS_IQUOTAUNCHECKED (1 << 14)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18c8f168b153..80b249c42067 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -132,6 +132,9 @@ xfs_bmbt_to_iomap(
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
+ if (fsverity_active(VFS_I(ip)))
+ iomap->flags |= IOMAP_F_READ_VERITY;
+
iomap->validity_cookie = sequence_cookie;
iomap->folio_ops = &xfs_iomap_folio_ops;
return 0;
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index c4cc99b70dd3..accbbdeb7624 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -190,6 +190,10 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
16299260424LL);
+
+ /* fs-verity descriptor xattr name */
+ XFS_CHECK_VALUE(strlen(XFS_VERITY_DESCRIPTOR_NAME),
+ XFS_VERITY_DESCRIPTOR_NAME_LEN);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 6a3b5285044a..f32392add622 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -30,6 +30,7 @@
#include "xfs_filestream.h"
#include "xfs_quota.h"
#include "xfs_sysfs.h"
+#include "xfs_verity.h"
#include "xfs_ondisk.h"
#include "xfs_rmap_item.h"
#include "xfs_refcount_item.h"
@@ -1526,6 +1527,9 @@ xfs_fs_fill_super(
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
sb->s_op = &xfs_super_operations;
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &xfs_verity_ops;
+#endif
/*
* Delay mount work if the debug hook is set. This is debug
@@ -1735,6 +1739,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_verity(mp))
+ xfs_alert(mp,
+ "EXPERIMENTAL fs-verity feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
new file mode 100644
index 000000000000..a2db56974122
--- /dev/null
+++ b/fs/xfs/xfs_verity.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Red Hat, Inc.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_attr.h"
+#include "xfs_verity.h"
+#include "xfs_bmap_util.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+
+static int
+xfs_get_verity_descriptor(
+ struct inode *inode,
+ void *buf,
+ size_t buf_size)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int error = 0;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_VERITY,
+ .name = (const uint8_t *)XFS_VERITY_DESCRIPTOR_NAME,
+ .namelen = XFS_VERITY_DESCRIPTOR_NAME_LEN,
+ .value = buf,
+ .valuelen = buf_size,
+ };
+
+ /*
+ * The fact that (returned attribute size) == (provided buf_size) is
+ * checked by xfs_attr_copy_value() (returns -ERANGE)
+ */
+ error = xfs_attr_get(&args);
+ if (error)
+ return error;
+
+ return args.valuelen;
+}
+
+static int
+xfs_begin_enable_verity(
+ struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+ if (IS_DAX(inode))
+ return -EINVAL;
+
+ if (xfs_iflags_test_and_set(ip, XFS_IVERITY_CONSTRUCTION))
+ return -EBUSY;
+
+ return error;
+}
+
+static int
+xfs_drop_merkle_tree(
+ struct xfs_inode *ip,
+ u64 merkle_tree_size,
+ u8 log_blocksize)
+{
+ struct xfs_fsverity_merkle_key name;
+ int error = 0, index;
+ u64 offset = 0;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .whichfork = XFS_ATTR_FORK,
+ .attr_filter = XFS_ATTR_VERITY,
+ .namelen = sizeof(struct xfs_fsverity_merkle_key),
+ /* NULL value make xfs_attr_set remove the attr */
+ .value = NULL,
+ };
+
+ for (index = 1; offset < merkle_tree_size; index++) {
+ xfs_fsverity_merkle_key_to_disk(&name, offset);
+ args.name = (const uint8_t *)&name.merkleoff;
+ args.attr_filter = XFS_ATTR_VERITY;
+ error = xfs_attr_set(&args);
+ offset = index << log_blocksize;
+ }
+
+ args.name = (const uint8_t *)XFS_VERITY_DESCRIPTOR_NAME;
+ args.namelen = XFS_VERITY_DESCRIPTOR_NAME_LEN;
+ args.attr_filter = XFS_ATTR_VERITY;
+ error = xfs_attr_set(&args);
+
+ return error;
+}
+
+static int
+xfs_end_enable_verity(
+ struct file *filp,
+ const void *desc,
+ size_t desc_size,
+ u64 merkle_tree_size,
+ u8 log_blocksize)
+{
+ struct inode *inode = file_inode(filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .whichfork = XFS_ATTR_FORK,
+ .attr_filter = XFS_ATTR_VERITY,
+ .attr_flags = XATTR_CREATE,
+ .name = (const uint8_t *)XFS_VERITY_DESCRIPTOR_NAME,
+ .namelen = XFS_VERITY_DESCRIPTOR_NAME_LEN,
+ .value = (void *)desc,
+ .valuelen = desc_size,
+ };
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+ /* fs-verity failed, just cleanup */
+ if (desc == NULL)
+ goto out;
+
+ error = xfs_attr_set(&args);
+ if (error)
+ goto out;
+
+ /* Set fsverity inode flag */
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange,
+ 0, 0, false, &tp);
+ if (error)
+ goto out;
+
+ /*
+ * Ensure that we've persisted the verity information before we enable
+ * it on the inode and tell the caller we have sealed the inode.
+ */
+ ip->i_diflags2 |= XFS_DIFLAG2_VERITY;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ if (!error)
+ inode->i_flags |= S_VERITY;
+
+out:
+ if (error)
+ WARN_ON_ONCE(xfs_drop_merkle_tree(ip, merkle_tree_size,
+ log_blocksize));
+
+ xfs_iflags_clear(ip, XFS_IVERITY_CONSTRUCTION);
+ return error;
+}
+
+int
+xfs_read_merkle_tree_block(
+ struct inode *inode,
+ unsigned int pos,
+ struct fsverity_block *block,
+ unsigned long num_ra_pages)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_fsverity_merkle_key name;
+ int error = 0;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_VERITY,
+ .namelen = sizeof(struct xfs_fsverity_merkle_key),
+ };
+ xfs_fsverity_merkle_key_to_disk(&name, pos);
+ args.name = (const uint8_t *)&name.merkleoff;
+
+ error = xfs_attr_get(&args);
+ if (error)
+ goto out;
+
+ WARN_ON_ONCE(!args.valuelen);
+
+ /* now we also want to get underlying xfs_buf */
+ args.op_flags = XFS_DA_OP_BUFFER;
+ error = xfs_attr_get(&args);
+ if (error)
+ goto out;
+
+ block->kaddr = args.value;
+ block->len = args.valuelen;
+ block->cached = args.bp->b_flags & XBF_VERITY_CHECKED;
+ block->context = args.bp;
+
+ return error;
+
+out:
+ kmem_free(args.value);
+ if (args.bp)
+ xfs_buf_rele(args.bp);
+ return error;
+}
+
+static int
+xfs_write_merkle_tree_block(
+ struct inode *inode,
+ const void *buf,
+ u64 pos,
+ unsigned int size)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_fsverity_merkle_key name;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .whichfork = XFS_ATTR_FORK,
+ .attr_filter = XFS_ATTR_VERITY,
+ .attr_flags = XATTR_CREATE,
+ .namelen = sizeof(struct xfs_fsverity_merkle_key),
+ .value = (void *)buf,
+ .valuelen = size,
+ };
+
+ xfs_fsverity_merkle_key_to_disk(&name, pos);
+ args.name = (const uint8_t *)&name.merkleoff;
+
+ return xfs_attr_set(&args);
+}
+
+static void
+xfs_drop_block(
+ struct fsverity_block *block)
+{
+ struct xfs_buf *buf;
+
+ ASSERT(block != NULL);
+
+ buf = (struct xfs_buf *)block->context;
+
+ if (block->cached)
+ buf->b_flags |= XBF_VERITY_CHECKED;
+ xfs_buf_rele(buf);
+
+ kunmap_local(block->kaddr);
+}
+
+const struct fsverity_operations xfs_verity_ops = {
+ .begin_enable_verity = &xfs_begin_enable_verity,
+ .end_enable_verity = &xfs_end_enable_verity,
+ .get_verity_descriptor = &xfs_get_verity_descriptor,
+ .read_merkle_tree_block = &xfs_read_merkle_tree_block,
+ .write_merkle_tree_block = &xfs_write_merkle_tree_block,
+ .drop_block = &xfs_drop_block,
+};
diff --git a/fs/xfs/xfs_verity.h b/fs/xfs/xfs_verity.h
new file mode 100644
index 000000000000..0f32fd212091
--- /dev/null
+++ b/fs/xfs/xfs_verity.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Red Hat, Inc.
+ */
+#ifndef __XFS_VERITY_H__
+#define __XFS_VERITY_H__
+
+#include "xfs.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include <linux/fsverity.h>
+
+#define XFS_VERITY_DESCRIPTOR_NAME "verity_descriptor"
+#define XFS_VERITY_DESCRIPTOR_NAME_LEN 17
+
+static inline bool
+xfs_verity_merkle_block(
+ struct xfs_da_args *args)
+{
+ if (!(args->attr_filter & XFS_ATTR_VERITY))
+ return false;
+
+ if (!(args->op_flags & XFS_DA_OP_BUFFER))
+ return false;
+
+ if (args->valuelen < 1024 || args->valuelen > PAGE_SIZE ||
+ !is_power_of_2(args->valuelen))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_FS_VERITY
+extern const struct fsverity_operations xfs_verity_ops;
+#endif /* CONFIG_FS_VERITY */
+
+#endif /* __XFS_VERITY_H__ */
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 25/28] xfs: add fs-verity support
2023-10-06 18:49 ` [PATCH v3 25/28] xfs: add fs-verity support Andrey Albershteyn
@ 2023-10-06 23:40 ` kernel test robot
2023-10-11 1:39 ` Darrick J. Wong
2023-10-18 19:18 ` kernel test robot
2 siblings, 0 replies; 74+ messages in thread
From: kernel test robot @ 2023-10-06 23:40 UTC (permalink / raw)
To: Andrey Albershteyn, linux-xfs, linux-fsdevel, fsverity
Cc: oe-kbuild-all, djwong, ebiggers, david, dchinner, Andrey Albershteyn
Hi Andrey,
kernel test robot noticed the following build warnings:
[auto build test WARNING on xfs-linux/for-next]
[also build test WARNING on kdave/for-next tytso-ext4/dev jaegeuk-f2fs/dev-test jaegeuk-f2fs/dev linus/master v6.6-rc4 next-20231006]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Andrey-Albershteyn/xfs-Add-new-name-to-attri-d/20231007-025742
base: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next
patch link: https://lore.kernel.org/r/20231006184922.252188-26-aalbersh%40redhat.com
patch subject: [PATCH v3 25/28] xfs: add fs-verity support
config: powerpc64-randconfig-002-20231007 (https://download.01.org/0day-ci/archive/20231007/202310070701.WiXRnofP-lkp@intel.com/config)
compiler: powerpc64-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231007/202310070701.WiXRnofP-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310070701.WiXRnofP-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> fs/xfs/xfs_verity.c:165:1: warning: no previous prototype for 'xfs_read_merkle_tree_block' [-Wmissing-prototypes]
165 | xfs_read_merkle_tree_block(
| ^~~~~~~~~~~~~~~~~~~~~~~~~~
vim +/xfs_read_merkle_tree_block +165 fs/xfs/xfs_verity.c
163
164 int
> 165 xfs_read_merkle_tree_block(
166 struct inode *inode,
167 unsigned int pos,
168 struct fsverity_block *block,
169 unsigned long num_ra_pages)
170 {
171 struct xfs_inode *ip = XFS_I(inode);
172 struct xfs_fsverity_merkle_key name;
173 int error = 0;
174 struct xfs_da_args args = {
175 .dp = ip,
176 .attr_filter = XFS_ATTR_VERITY,
177 .namelen = sizeof(struct xfs_fsverity_merkle_key),
178 };
179 xfs_fsverity_merkle_key_to_disk(&name, pos);
180 args.name = (const uint8_t *)&name.merkleoff;
181
182 error = xfs_attr_get(&args);
183 if (error)
184 goto out;
185
186 WARN_ON_ONCE(!args.valuelen);
187
188 /* now we also want to get underlying xfs_buf */
189 args.op_flags = XFS_DA_OP_BUFFER;
190 error = xfs_attr_get(&args);
191 if (error)
192 goto out;
193
194 block->kaddr = args.value;
195 block->len = args.valuelen;
196 block->cached = args.bp->b_flags & XBF_VERITY_CHECKED;
197 block->context = args.bp;
198
199 return error;
200
201 out:
202 kmem_free(args.value);
203 if (args.bp)
204 xfs_buf_rele(args.bp);
205 return error;
206 }
207
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 25/28] xfs: add fs-verity support
2023-10-06 18:49 ` [PATCH v3 25/28] xfs: add fs-verity support Andrey Albershteyn
2023-10-06 23:40 ` kernel test robot
@ 2023-10-11 1:39 ` Darrick J. Wong
2023-10-11 14:36 ` Andrey Albershteyn
2023-10-18 19:18 ` kernel test robot
2 siblings, 1 reply; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 1:39 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:19PM +0200, Andrey Albershteyn wrote:
> Add integration with fs-verity. The XFS store fs-verity metadata in
> the extended attributes. The metadata consist of verity descriptor
> and Merkle tree blocks.
>
> The descriptor is stored under "verity_descriptor" extended
> attribute. The Merkle tree blocks are stored under binary indexes.
>
> When fs-verity is enabled on an inode, the XFS_IVERITY_CONSTRUCTION
> flag is set meaning that the Merkle tree is being build. The
> initialization ends with storing of verity descriptor and setting
> inode on-disk flag (XFS_DIFLAG2_VERITY).
>
> The verification on read is done in iomap. Based on the inode verity
> flag the IOMAP_F_READ_VERITY is set in xfs_read_iomap_begin() to let
> iomap know that verification is needed.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
> fs/xfs/Makefile | 1 +
> fs/xfs/libxfs/xfs_attr.c | 13 ++
> fs/xfs/libxfs/xfs_attr_leaf.c | 17 ++-
> fs/xfs/libxfs/xfs_attr_remote.c | 8 +-
> fs/xfs/libxfs/xfs_da_format.h | 16 ++
> fs/xfs/xfs_inode.h | 3 +-
> fs/xfs/xfs_iomap.c | 3 +
> fs/xfs/xfs_ondisk.h | 4 +
> fs/xfs/xfs_super.c | 8 +
> fs/xfs/xfs_verity.c | 257 ++++++++++++++++++++++++++++++++
> fs/xfs/xfs_verity.h | 37 +++++
> 11 files changed, 360 insertions(+), 7 deletions(-)
> create mode 100644 fs/xfs/xfs_verity.c
> create mode 100644 fs/xfs/xfs_verity.h
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 7762c01a85cf..c1a58ed8b419 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -130,6 +130,7 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
> xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
> xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
> xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
> +xfs-$(CONFIG_FS_VERITY) += xfs_verity.o
>
> # notify failure
> ifeq ($(CONFIG_MEMORY_FAILURE),y)
> diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> index 298b74245267..25e1f829e01e 100644
> --- a/fs/xfs/libxfs/xfs_attr.c
> +++ b/fs/xfs/libxfs/xfs_attr.c
> @@ -26,6 +26,7 @@
> #include "xfs_trace.h"
> #include "xfs_attr_item.h"
> #include "xfs_xattr.h"
> +#include "xfs_verity.h"
>
> struct kmem_cache *xfs_attr_intent_cache;
>
> @@ -1635,6 +1636,18 @@ xfs_attr_namecheck(
> return xfs_verify_pptr(mp, (struct xfs_parent_name_rec *)name);
> }
>
> + if (flags & XFS_ATTR_VERITY) {
> + /* Merkle tree pages are stored under u64 indexes */
> + if (length == sizeof(struct xfs_fsverity_merkle_key))
> + return true;
> +
> + /* Verity descriptor blocks are held in a named attribute. */
> + if (length == XFS_VERITY_DESCRIPTOR_NAME_LEN)
> + return true;
> +
> + return false;
> + }
> +
> return xfs_str_attr_namecheck(name, length);
> }
>
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
> index a84795d70de1..36d1f88d972f 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> @@ -29,6 +29,7 @@
> #include "xfs_log.h"
> #include "xfs_ag.h"
> #include "xfs_errortag.h"
> +#include "xfs_verity.h"
>
>
> /*
> @@ -518,7 +519,12 @@ xfs_attr_copy_value(
> return -ERANGE;
> }
>
> - if (!args->value) {
> + /*
> + * We don't want to allocate memory for fs-verity Merkle tree blocks
> + * (fs-verity descriptor is fine though). They will be stored in
> + * underlying xfs_buf
> + */
> + if (!args->value && !xfs_verity_merkle_block(args)) {
Hmm, why isn't this simply !(args->op_flags & XFS_DA_OP_BUFFER) ?
> args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP);
> if (!args->value)
> return -ENOMEM;
> @@ -537,7 +543,14 @@ xfs_attr_copy_value(
> */
> if (!value)
> return -EINVAL;
> - memcpy(args->value, value, valuelen);
> + /*
> + * We won't copy Merkle tree block to the args->value as we want it be
> + * in the xfs_buf. And we didn't allocate any memory in args->value.
> + */
> + if (xfs_verity_merkle_block(args))
> + args->value = value;
> + else
> + memcpy(args->value, value, valuelen);
> return 0;
> }
>
> diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
> index 7657daf7cff3..7b4424e3454b 100644
> --- a/fs/xfs/libxfs/xfs_attr_remote.c
> +++ b/fs/xfs/libxfs/xfs_attr_remote.c
> @@ -22,6 +22,7 @@
> #include "xfs_attr_remote.h"
> #include "xfs_trace.h"
> #include "xfs_error.h"
> +#include "xfs_verity.h"
>
> #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
>
> @@ -401,11 +402,10 @@ xfs_attr_rmtval_get(
> ASSERT(args->rmtvaluelen == args->valuelen);
>
> /*
> - * We also check for _OP_BUFFER as we want to trigger on
> - * verity blocks only, not on verity_descriptor
> + * For fs-verity we want additional space in the xfs_buf. This space is
> + * used to copy xattr value without leaf headers (crc header).
> */
> - if (args->attr_filter & XFS_ATTR_VERITY &&
> - args->op_flags & XFS_DA_OP_BUFFER)
> + if (xfs_verity_merkle_block(args))
> flags = XBF_DOUBLE_ALLOC;
Hmm, not sure what DOUBLE_ALLOC does, but I'll get there as I go
backwards through the XFS patches...
>
> valuelen = args->rmtvaluelen;
> diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
> index b56bdae83563..a678ad5e4a08 100644
> --- a/fs/xfs/libxfs/xfs_da_format.h
> +++ b/fs/xfs/libxfs/xfs_da_format.h
> @@ -903,4 +903,20 @@ struct xfs_parent_name_irec {
> uint8_t p_namelen;
> };
>
> +/*
> + * fs-verity attribute name format
> + *
> + * Merkle tree blocks are stored under extended attributes of the inode. The
> + * name of the attributes are offsets into merkle tree.
Are these offsets byte offsets?
> + */
> +struct xfs_fsverity_merkle_key {
> + __be64 merkleoff;
> +};
> +
> +static inline void
> +xfs_fsverity_merkle_key_to_disk(struct xfs_fsverity_merkle_key *key, loff_t pos)
> +{
> + key->merkleoff = cpu_to_be64(pos);
> +}
> +
> #endif /* __XFS_DA_FORMAT_H__ */
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 0c5bdb91152e..e6c30a69e8d1 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -342,7 +342,8 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
> * inactivation completes, both flags will be cleared and the inode is a
> * plain old IRECLAIMABLE inode.
> */
> -#define XFS_INACTIVATING (1 << 13)
> +#define XFS_INACTIVATING (1 << 13)
> +#define XFS_IVERITY_CONSTRUCTION (1 << 14) /* merkle tree construction */
Under construction? Or already built?
>
> /* Quotacheck is running but inode has not been added to quota counts. */
> #define XFS_IQUOTAUNCHECKED (1 << 14)
These two iflags have the same definition.
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 18c8f168b153..80b249c42067 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -132,6 +132,9 @@ xfs_bmbt_to_iomap(
> (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
> iomap->flags |= IOMAP_F_DIRTY;
>
> + if (fsverity_active(VFS_I(ip)))
> + iomap->flags |= IOMAP_F_READ_VERITY;
> +
> iomap->validity_cookie = sequence_cookie;
> iomap->folio_ops = &xfs_iomap_folio_ops;
> return 0;
> diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
> index c4cc99b70dd3..accbbdeb7624 100644
> --- a/fs/xfs/xfs_ondisk.h
> +++ b/fs/xfs/xfs_ondisk.h
> @@ -190,6 +190,10 @@ xfs_check_ondisk_structs(void)
> XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
> XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
> 16299260424LL);
> +
> + /* fs-verity descriptor xattr name */
> + XFS_CHECK_VALUE(strlen(XFS_VERITY_DESCRIPTOR_NAME),
> + XFS_VERITY_DESCRIPTOR_NAME_LEN);
> }
>
> #endif /* __XFS_ONDISK_H */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 6a3b5285044a..f32392add622 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -30,6 +30,7 @@
> #include "xfs_filestream.h"
> #include "xfs_quota.h"
> #include "xfs_sysfs.h"
> +#include "xfs_verity.h"
> #include "xfs_ondisk.h"
> #include "xfs_rmap_item.h"
> #include "xfs_refcount_item.h"
> @@ -1526,6 +1527,9 @@ xfs_fs_fill_super(
> sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
> #endif
> sb->s_op = &xfs_super_operations;
> +#ifdef CONFIG_FS_VERITY
> + sb->s_vop = &xfs_verity_ops;
> +#endif
>
> /*
> * Delay mount work if the debug hook is set. This is debug
> @@ -1735,6 +1739,10 @@ xfs_fs_fill_super(
> goto out_filestream_unmount;
> }
>
> + if (xfs_has_verity(mp))
> + xfs_alert(mp,
> + "EXPERIMENTAL fs-verity feature in use. Use at your own risk!");
> +
> error = xfs_mountfs(mp);
> if (error)
> goto out_filestream_unmount;
> diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
> new file mode 100644
> index 000000000000..a2db56974122
> --- /dev/null
> +++ b/fs/xfs/xfs_verity.c
> @@ -0,0 +1,257 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2023 Red Hat, Inc.
> + */
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_da_format.h"
> +#include "xfs_da_btree.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_inode.h"
> +#include "xfs_attr.h"
> +#include "xfs_verity.h"
> +#include "xfs_bmap_util.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans.h"
> +
> +static int
Hum, why isn't this ssize_t? That's what other IO functions return when
returning the number of bytes acted upon.
> +xfs_get_verity_descriptor(
> + struct inode *inode,
> + void *buf,
> + size_t buf_size)
> +{
> + struct xfs_inode *ip = XFS_I(inode);
> + int error = 0;
> + struct xfs_da_args args = {
> + .dp = ip,
> + .attr_filter = XFS_ATTR_VERITY,
> + .name = (const uint8_t *)XFS_VERITY_DESCRIPTOR_NAME,
> + .namelen = XFS_VERITY_DESCRIPTOR_NAME_LEN,
> + .value = buf,
> + .valuelen = buf_size,
> + };
> +
> + /*
> + * The fact that (returned attribute size) == (provided buf_size) is
> + * checked by xfs_attr_copy_value() (returns -ERANGE)
> + */
> + error = xfs_attr_get(&args);
> + if (error)
> + return error;
> +
> + return args.valuelen;
> +}
> +
> +static int
> +xfs_begin_enable_verity(
> + struct file *filp)
> +{
> + struct inode *inode = file_inode(filp);
> + struct xfs_inode *ip = XFS_I(inode);
> + int error = 0;
> +
> + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
> +
> + if (IS_DAX(inode))
> + return -EINVAL;
> +
> + if (xfs_iflags_test_and_set(ip, XFS_IVERITY_CONSTRUCTION))
> + return -EBUSY;
> +
> + return error;
> +}
> +
> +static int
> +xfs_drop_merkle_tree(
> + struct xfs_inode *ip,
> + u64 merkle_tree_size,
> + u8 log_blocksize)
> +{
> + struct xfs_fsverity_merkle_key name;
> + int error = 0, index;
> + u64 offset = 0;
> + struct xfs_da_args args = {
> + .dp = ip,
> + .whichfork = XFS_ATTR_FORK,
> + .attr_filter = XFS_ATTR_VERITY,
> + .namelen = sizeof(struct xfs_fsverity_merkle_key),
> + /* NULL value make xfs_attr_set remove the attr */
> + .value = NULL,
> + };
> +
> + for (index = 1; offset < merkle_tree_size; index++) {
> + xfs_fsverity_merkle_key_to_disk(&name, offset);
> + args.name = (const uint8_t *)&name.merkleoff;
> + args.attr_filter = XFS_ATTR_VERITY;
Why do these two args. fields need to be reset every time through the
loop?
> + error = xfs_attr_set(&args);
Why is it ok to drop the error here?
> + offset = index << log_blocksize;
Hm, ok, the merkle key offsets /are/ byte offsets.
Isn't this whole loop just:
args.name = ...;
args.attr_filter = ...;
for (offset = 0; offset < merkle_tree_size; offset++) {
xfs_fsverity_merkle_key_to_disk(&name, pos << log_blocksize);
error = xfs_attr_set(&args);
if (error)
return error;
}
> + }
> +
> + args.name = (const uint8_t *)XFS_VERITY_DESCRIPTOR_NAME;
> + args.namelen = XFS_VERITY_DESCRIPTOR_NAME_LEN;
> + args.attr_filter = XFS_ATTR_VERITY;
> + error = xfs_attr_set(&args);
> +
> + return error;
> +}
> +
> +static int
> +xfs_end_enable_verity(
> + struct file *filp,
> + const void *desc,
> + size_t desc_size,
> + u64 merkle_tree_size,
> + u8 log_blocksize)
> +{
> + struct inode *inode = file_inode(filp);
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + struct xfs_trans *tp;
> + struct xfs_da_args args = {
> + .dp = ip,
> + .whichfork = XFS_ATTR_FORK,
> + .attr_filter = XFS_ATTR_VERITY,
> + .attr_flags = XATTR_CREATE,
> + .name = (const uint8_t *)XFS_VERITY_DESCRIPTOR_NAME,
> + .namelen = XFS_VERITY_DESCRIPTOR_NAME_LEN,
> + .value = (void *)desc,
> + .valuelen = desc_size,
> + };
> + int error = 0;
> +
> + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
> +
> + /* fs-verity failed, just cleanup */
> + if (desc == NULL)
> + goto out;
> +
> + error = xfs_attr_set(&args);
> + if (error)
> + goto out;
> +
> + /* Set fsverity inode flag */
> + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange,
> + 0, 0, false, &tp);
> + if (error)
> + goto out;
> +
> + /*
> + * Ensure that we've persisted the verity information before we enable
> + * it on the inode and tell the caller we have sealed the inode.
> + */
> + ip->i_diflags2 |= XFS_DIFLAG2_VERITY;
> +
> + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> + xfs_trans_set_sync(tp);
> +
> + error = xfs_trans_commit(tp);
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> + if (!error)
> + inode->i_flags |= S_VERITY;
> +
> +out:
> + if (error)
> + WARN_ON_ONCE(xfs_drop_merkle_tree(ip, merkle_tree_size,
> + log_blocksize));
(Don't WARNings panic some kernels?)
> +
> + xfs_iflags_clear(ip, XFS_IVERITY_CONSTRUCTION);
> + return error;
> +}
> +
> +int
> +xfs_read_merkle_tree_block(
> + struct inode *inode,
> + unsigned int pos,
> + struct fsverity_block *block,
> + unsigned long num_ra_pages)
> +{
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_fsverity_merkle_key name;
> + int error = 0;
> + struct xfs_da_args args = {
> + .dp = ip,
> + .attr_filter = XFS_ATTR_VERITY,
> + .namelen = sizeof(struct xfs_fsverity_merkle_key),
> + };
> + xfs_fsverity_merkle_key_to_disk(&name, pos);
> + args.name = (const uint8_t *)&name.merkleoff;
> +
> + error = xfs_attr_get(&args);
> + if (error)
> + goto out;
> +
> + WARN_ON_ONCE(!args.valuelen);
> +
> + /* now we also want to get underlying xfs_buf */
> + args.op_flags = XFS_DA_OP_BUFFER;
If XFS_DA_OP_BUFFER returns the (bhold'd) xfs_buf containing the value,
then ... can't we call xfs_attr_get once?
Can the xfs_buf contents change after we drop the I{,O}LOCK?
Can users (or the kernel) change or add xattrs on a verity file?
Are they allowed to move the file, and hence update the parent pointers?
Or is the point of XBF_DOUBLE_ALLOC that we'll snapshot the attr data
into the second half of the buffer, and that's what gets passed to
fsverity core code?
> + error = xfs_attr_get(&args);
> + if (error)
> + goto out;
> +
> + block->kaddr = args.value;
> + block->len = args.valuelen;
> + block->cached = args.bp->b_flags & XBF_VERITY_CHECKED;
> + block->context = args.bp;
> +
> + return error;
> +
> +out:
> + kmem_free(args.value);
> + if (args.bp)
> + xfs_buf_rele(args.bp);
> + return error;
> +}
> +
> +static int
> +xfs_write_merkle_tree_block(
> + struct inode *inode,
> + const void *buf,
> + u64 pos,
> + unsigned int size)
> +{
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_fsverity_merkle_key name;
> + struct xfs_da_args args = {
> + .dp = ip,
> + .whichfork = XFS_ATTR_FORK,
> + .attr_filter = XFS_ATTR_VERITY,
> + .attr_flags = XATTR_CREATE,
What happens if merkle tree fails midway through writing the blobs to
the xattr tree? If they're not removed, then won't XATTR_CREATE here
cause a second attempt to fail because there's a half-built tree?
> + .namelen = sizeof(struct xfs_fsverity_merkle_key),
> + .value = (void *)buf,
> + .valuelen = size,
> + };
> +
> + xfs_fsverity_merkle_key_to_disk(&name, pos);
> + args.name = (const uint8_t *)&name.merkleoff;
> +
> + return xfs_attr_set(&args);
> +}
> +
> +static void
> +xfs_drop_block(
> + struct fsverity_block *block)
> +{
> + struct xfs_buf *buf;
> +
> + ASSERT(block != NULL);
> +
> + buf = (struct xfs_buf *)block->context;
> +
> + if (block->cached)
> + buf->b_flags |= XBF_VERITY_CHECKED;
> + xfs_buf_rele(buf);
> +
> + kunmap_local(block->kaddr);
> +}
> +
> +const struct fsverity_operations xfs_verity_ops = {
> + .begin_enable_verity = &xfs_begin_enable_verity,
> + .end_enable_verity = &xfs_end_enable_verity,
> + .get_verity_descriptor = &xfs_get_verity_descriptor,
> + .read_merkle_tree_block = &xfs_read_merkle_tree_block,
> + .write_merkle_tree_block = &xfs_write_merkle_tree_block,
> + .drop_block = &xfs_drop_block,
> +};
> diff --git a/fs/xfs/xfs_verity.h b/fs/xfs/xfs_verity.h
> new file mode 100644
> index 000000000000..0f32fd212091
> --- /dev/null
> +++ b/fs/xfs/xfs_verity.h
> @@ -0,0 +1,37 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Red Hat, Inc.
> + */
> +#ifndef __XFS_VERITY_H__
> +#define __XFS_VERITY_H__
> +
> +#include "xfs.h"
> +#include "xfs_da_format.h"
> +#include "xfs_da_btree.h"
> +#include <linux/fsverity.h>
> +
> +#define XFS_VERITY_DESCRIPTOR_NAME "verity_descriptor"
> +#define XFS_VERITY_DESCRIPTOR_NAME_LEN 17
Want to shorten this by one for u64 alignment? ;)
> +
> +static inline bool
> +xfs_verity_merkle_block(
> + struct xfs_da_args *args)
> +{
> + if (!(args->attr_filter & XFS_ATTR_VERITY))
> + return false;
> +
> + if (!(args->op_flags & XFS_DA_OP_BUFFER))
> + return false;
> +
> + if (args->valuelen < 1024 || args->valuelen > PAGE_SIZE ||
> + !is_power_of_2(args->valuelen))
> + return false;
Why do we check the valuelen here?
Also, if you're passing the xfs_buf out, I thought the buffer cache
could handle weird sizes?
> +
> + return true;
> +}
> +
> +#ifdef CONFIG_FS_VERITY
> +extern const struct fsverity_operations xfs_verity_ops;
> +#endif /* CONFIG_FS_VERITY */
> +
> +#endif /* __XFS_VERITY_H__ */
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 25/28] xfs: add fs-verity support
2023-10-11 1:39 ` Darrick J. Wong
@ 2023-10-11 14:36 ` Andrey Albershteyn
0 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-11 14:36 UTC (permalink / raw)
To: Darrick J. Wong
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On 2023-10-10 18:39:40, Darrick J. Wong wrote:
> On Fri, Oct 06, 2023 at 08:49:19PM +0200, Andrey Albershteyn wrote:
> > - if (!args->value) {
> > + /*
> > + * We don't want to allocate memory for fs-verity Merkle tree blocks
> > + * (fs-verity descriptor is fine though). They will be stored in
> > + * underlying xfs_buf
> > + */
> > + if (!args->value && !xfs_verity_merkle_block(args)) {
>
> Hmm, why isn't this simply !(args->op_flags & XFS_DA_OP_BUFFER) ?
I check whether args contains an fs-verity block in multiple places,
so I wrapped it into a function. Also, XFS_DA_OP_BUFFER only sets
args->bp. It doesn't affect where xfs_attr_copy_value copies the
value (this is changed with XBF_DOUBLE_ALLOC).
> > - if (args->attr_filter & XFS_ATTR_VERITY &&
> > - args->op_flags & XFS_DA_OP_BUFFER)
> > + if (xfs_verity_merkle_block(args))
> > flags = XBF_DOUBLE_ALLOC;
>
> Hmm, not sure what DOUBLE_ALLOC does, but I'll get there as I go
> backwards through the XFS patches...
>
> > +/*
> > + * fs-verity attribute name format
> > + *
> > + * Merkle tree blocks are stored under extended attributes of the inode. The
> > + * name of the attributes are offsets into merkle tree.
>
> Are these offsets byte offsets?
>
Byte offsets.
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -342,7 +342,8 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
> > * inactivation completes, both flags will be cleared and the inode is a
> > * plain old IRECLAIMABLE inode.
> > */
> > -#define XFS_INACTIVATING (1 << 13)
> > +#define XFS_INACTIVATING (1 << 13)
> > +#define XFS_IVERITY_CONSTRUCTION (1 << 14) /* merkle tree construction */
>
> Under construction? Or already built?
Under construction.
> >
> > /* Quotacheck is running but inode has not been added to quota counts. */
> > #define XFS_IQUOTAUNCHECKED (1 << 14)
>
> These two iflags have the same definition.
Oops! Don't know how I missed that, thanks!
> > diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
> > new file mode 100644
> > index 000000000000..a2db56974122
> > --- /dev/null
> > +++ b/fs/xfs/xfs_verity.c
> > @@ -0,0 +1,257 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright (C) 2023 Red Hat, Inc.
> > + */
> > +#include "xfs.h"
> > +#include "xfs_shared.h"
> > +#include "xfs_format.h"
> > +#include "xfs_da_format.h"
> > +#include "xfs_da_btree.h"
> > +#include "xfs_trans_resv.h"
> > +#include "xfs_mount.h"
> > +#include "xfs_inode.h"
> > +#include "xfs_attr.h"
> > +#include "xfs_verity.h"
> > +#include "xfs_bmap_util.h"
> > +#include "xfs_log_format.h"
> > +#include "xfs_trans.h"
> > +
> > +static int
>
> Hum, why isn't this ssize_t? That's what other IO functions return when
> returning the number of bytes acted upon.
The fs/verity prototype is with int, so I left it as int.
> > +static int
> > +xfs_drop_merkle_tree(
> > + struct xfs_inode *ip,
> > + u64 merkle_tree_size,
> > + u8 log_blocksize)
> > +{
> > + struct xfs_fsverity_merkle_key name;
> > + int error = 0, index;
> > + u64 offset = 0;
> > + struct xfs_da_args args = {
> > + .dp = ip,
> > + .whichfork = XFS_ATTR_FORK,
> > + .attr_filter = XFS_ATTR_VERITY,
> > + .namelen = sizeof(struct xfs_fsverity_merkle_key),
> > + /* NULL value make xfs_attr_set remove the attr */
> > + .value = NULL,
> > + };
> > +
> > + for (index = 1; offset < merkle_tree_size; index++) {
> > + xfs_fsverity_merkle_key_to_disk(&name, offset);
> > + args.name = (const uint8_t *)&name.merkleoff;
> > + args.attr_filter = XFS_ATTR_VERITY;
>
> Why do these two args. fields need to be reset every time through the
> loop?
Oh, yeah this should be ok. I will check it.
> > + error = xfs_attr_set(&args);
>
> Why is it ok to drop the error here?
>
My bad, will handle it.
> > + offset = index << log_blocksize;
>
> Hm, ok, the merkle key offsets /are/ byte offsets.
>
> Isn't this whole loop just:
>
> args.name = ...;
> args.attr_filter = ...;
> for (offset = 0; offset < merkle_tree_size; offset++) {
> xfs_fsverity_merkle_key_to_disk(&name, pos << log_blocksize);
> error = xfs_attr_set(&args);
> if (error)
> return error;
> }
>
> > +static int
> > +xfs_end_enable_verity(
> > + struct file *filp,
> > + const void *desc,
> > + size_t desc_size,
> > + u64 merkle_tree_size,
> > + u8 log_blocksize)
> > +{
> > + struct inode *inode = file_inode(filp);
> > + struct xfs_inode *ip = XFS_I(inode);
> > + struct xfs_mount *mp = ip->i_mount;
> > + struct xfs_trans *tp;
> > + struct xfs_da_args args = {
> > + .dp = ip,
> > + .whichfork = XFS_ATTR_FORK,
> > + .attr_filter = XFS_ATTR_VERITY,
> > + .attr_flags = XATTR_CREATE,
> > + .name = (const uint8_t *)XFS_VERITY_DESCRIPTOR_NAME,
> > + .namelen = XFS_VERITY_DESCRIPTOR_NAME_LEN,
> > + .value = (void *)desc,
> > + .valuelen = desc_size,
> > + };
> > + int error = 0;
> > +
> > + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
> > +
> > + /* fs-verity failed, just cleanup */
> > + if (desc == NULL)
> > + goto out;
> > +
> > + error = xfs_attr_set(&args);
> > + if (error)
> > + goto out;
> > +
> > + /* Set fsverity inode flag */
> > + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange,
> > + 0, 0, false, &tp);
> > + if (error)
> > + goto out;
> > +
> > + /*
> > + * Ensure that we've persisted the verity information before we enable
> > + * it on the inode and tell the caller we have sealed the inode.
> > + */
> > + ip->i_diflags2 |= XFS_DIFLAG2_VERITY;
> > +
> > + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> > + xfs_trans_set_sync(tp);
> > +
> > + error = xfs_trans_commit(tp);
> > + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > +
> > + if (!error)
> > + inode->i_flags |= S_VERITY;
> > +
> > +out:
> > + if (error)
> > + WARN_ON_ONCE(xfs_drop_merkle_tree(ip, merkle_tree_size,
> > + log_blocksize));
>
> (Don't WARNings panic some kernels?)
Not sure, but in case we fail to drop a tree (an error has already
happened) we probably want to know that something is going on and the
tree cannot be removed. But I'm fine with removing this; I'm not sure
what the guidelines on WARNings are.
> > +int
> > +xfs_read_merkle_tree_block(
> > + struct inode *inode,
> > + unsigned int pos,
> > + struct fsverity_block *block,
> > + unsigned long num_ra_pages)
> > +{
> > + struct xfs_inode *ip = XFS_I(inode);
> > + struct xfs_fsverity_merkle_key name;
> > + int error = 0;
> > + struct xfs_da_args args = {
> > + .dp = ip,
> > + .attr_filter = XFS_ATTR_VERITY,
> > + .namelen = sizeof(struct xfs_fsverity_merkle_key),
> > + };
> > + xfs_fsverity_merkle_key_to_disk(&name, pos);
> > + args.name = (const uint8_t *)&name.merkleoff;
> > +
> > + error = xfs_attr_get(&args);
> > + if (error)
> > + goto out;
> > +
> > + WARN_ON_ONCE(!args.valuelen);
> > +
> > + /* now we also want to get underlying xfs_buf */
> > + args.op_flags = XFS_DA_OP_BUFFER;
>
> If XFS_DA_OP_BUFFER returns the (bhold'd) xfs_buf containing the value,
> then ... can't we call xfs_attr_get once?
Hmm, yeah, one call would probably be enough. The initial two calls
were here because fs/verity expected an error if the attribute doesn't
exist. But with the change to fs/verity/verify.c in patch 10 it's
probably fine to do everything in just one call. Thanks!
> Can the xfs_buf contents change after we drop the I{,O}LOCK?
By fs-verity? No
> Can users (or the kernel) change or add xattrs on a verity file?
>
> Are they allowed to move the file, and hence update the parent pointers?
Moving is probably fine but I haven't tested if it works fine with
parent pointers. But as they are xattrs I think it shouldn't be a
problem. I will check it.
> Or is the point of XBF_DOUBLE_ALLOC that we'll snapshot the attr data
> into the second half of the buffer, and that's what gets passed to
> fsverity core code?
Yes, this
> > +static int
> > +xfs_write_merkle_tree_block(
> > + struct inode *inode,
> > + const void *buf,
> > + u64 pos,
> > + unsigned int size)
> > +{
> > + struct xfs_inode *ip = XFS_I(inode);
> > + struct xfs_fsverity_merkle_key name;
> > + struct xfs_da_args args = {
> > + .dp = ip,
> > + .whichfork = XFS_ATTR_FORK,
> > + .attr_filter = XFS_ATTR_VERITY,
> > + .attr_flags = XATTR_CREATE,
>
> What happens if merkle tree fails midway through writing the blobs to
> the xattr tree? If they're not removed, then won't XATTR_CREATE here
> cause a second attempt to fail because there's a half-built tree?
If a write fails during tree building, fs-verity calls
xfs_end_enable_verity(), which then goes into xfs_drop_merkle_tree()
(the "desc == NULL" case).
> > diff --git a/fs/xfs/xfs_verity.h b/fs/xfs/xfs_verity.h
> > new file mode 100644
> > index 000000000000..0f32fd212091
> > --- /dev/null
> > +++ b/fs/xfs/xfs_verity.h
> > @@ -0,0 +1,37 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright (C) 2022 Red Hat, Inc.
> > + */
> > +#ifndef __XFS_VERITY_H__
> > +#define __XFS_VERITY_H__
> > +
> > +#include "xfs.h"
> > +#include "xfs_da_format.h"
> > +#include "xfs_da_btree.h"
> > +#include <linux/fsverity.h>
> > +
> > +#define XFS_VERITY_DESCRIPTOR_NAME "verity_descriptor"
> > +#define XFS_VERITY_DESCRIPTOR_NAME_LEN 17
>
> Want to shorten this by one for u64 alignment? ;)
sure :)
> > +static inline bool
> > +xfs_verity_merkle_block(
> > + struct xfs_da_args *args)
> > +{
> > + if (!(args->attr_filter & XFS_ATTR_VERITY))
> > + return false;
> > +
> > + if (!(args->op_flags & XFS_DA_OP_BUFFER))
> > + return false;
> > +
> > + if (args->valuelen < 1024 || args->valuelen > PAGE_SIZE ||
> > + !is_power_of_2(args->valuelen))
> > + return false;
>
> Why do we check the valuelen here?
I use this function in multiple places to distinguish between
xfs_da_args which contain:
- an fs-verity block
- the fs-verity descriptor/any other attribute
The check for size is not necessary.
But this is one of the requirements on fs-verity blocks, so I thought
it could be a good candidate for one additional check. I'm fine with
removing this.
> Also, if you're passing the xfs_buf out, I thought the buffer cache
> could handle weird sizes?
Not sure what you mean here
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 25/28] xfs: add fs-verity support
2023-10-06 18:49 ` [PATCH v3 25/28] xfs: add fs-verity support Andrey Albershteyn
2023-10-06 23:40 ` kernel test robot
2023-10-11 1:39 ` Darrick J. Wong
@ 2023-10-18 19:18 ` kernel test robot
2 siblings, 0 replies; 74+ messages in thread
From: kernel test robot @ 2023-10-18 19:18 UTC (permalink / raw)
To: Andrey Albershteyn, linux-xfs, linux-fsdevel, fsverity
Cc: oe-kbuild-all, djwong, ebiggers, david, dchinner, Andrey Albershteyn
Hi Andrey,
kernel test robot noticed the following build warnings:
[auto build test WARNING on xfs-linux/for-next]
[also build test WARNING on kdave/for-next tytso-ext4/dev jaegeuk-f2fs/dev-test jaegeuk-f2fs/dev linus/master v6.6-rc6 next-20231018]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Andrey-Albershteyn/xfs-Add-new-name-to-attri-d/20231007-025742
base: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next
patch link: https://lore.kernel.org/r/20231006184922.252188-26-aalbersh%40redhat.com
patch subject: [PATCH v3 25/28] xfs: add fs-verity support
config: i386-randconfig-063-20231018 (https://download.01.org/0day-ci/archive/20231019/202310190201.brxlwE23-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231019/202310190201.brxlwE23-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310190201.brxlwE23-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
>> fs/xfs/xfs_verity.c:165:1: sparse: sparse: symbol 'xfs_read_merkle_tree_block' was not declared. Should it be static?
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 26/28] xfs: make scrub aware of verity dinode flag
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (24 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 25/28] xfs: add fs-verity support Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-11 1:06 ` Darrick J. Wong
2023-10-06 18:49 ` [PATCH v3 27/28] xfs: add fs-verity ioctls Andrey Albershteyn
2023-10-06 18:49 ` [PATCH v3 28/28] xfs: enable ro-compat fs-verity flag Andrey Albershteyn
27 siblings, 1 reply; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
fs-verity adds a new inode flag, which causes scrub to fail because the
flag is not yet known to it.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/scrub/attr.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index f35144704395..b4f0ba45a092 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -494,7 +494,7 @@ xchk_xattr_rec(
/* Retrieve the entry and check it. */
hash = be32_to_cpu(ent->hashval);
badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
- XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT);
+ XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT | XFS_ATTR_VERITY);
if ((ent->flags & badflags) != 0)
xchk_da_set_corrupt(ds, level);
if (ent->flags & XFS_ATTR_LOCAL) {
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH v3 26/28] xfs: make scrub aware of verity dinode flag
2023-10-06 18:49 ` [PATCH v3 26/28] xfs: make scrub aware of verity dinode flag Andrey Albershteyn
@ 2023-10-11 1:06 ` Darrick J. Wong
2023-10-11 14:37 ` Andrey Albershteyn
0 siblings, 1 reply; 74+ messages in thread
From: Darrick J. Wong @ 2023-10-11 1:06 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On Fri, Oct 06, 2023 at 08:49:20PM +0200, Andrey Albershteyn wrote:
> fs-verity adds new inode flag which causes scrub to fail as it is
> not yet known.
>
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
--D
> ---
> fs/xfs/scrub/attr.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
> index f35144704395..b4f0ba45a092 100644
> --- a/fs/xfs/scrub/attr.c
> +++ b/fs/xfs/scrub/attr.c
> @@ -494,7 +494,7 @@ xchk_xattr_rec(
> /* Retrieve the entry and check it. */
> hash = be32_to_cpu(ent->hashval);
> badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
> - XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT);
> + XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT | XFS_ATTR_VERITY);
> if ((ent->flags & badflags) != 0)
> xchk_da_set_corrupt(ds, level);
> if (ent->flags & XFS_ATTR_LOCAL) {
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH v3 26/28] xfs: make scrub aware of verity dinode flag
2023-10-11 1:06 ` Darrick J. Wong
@ 2023-10-11 14:37 ` Andrey Albershteyn
0 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-11 14:37 UTC (permalink / raw)
To: Darrick J. Wong
Cc: linux-xfs, linux-fsdevel, fsverity, ebiggers, david, dchinner
On 2023-10-10 18:06:41, Darrick J. Wong wrote:
> On Fri, Oct 06, 2023 at 08:49:20PM +0200, Andrey Albershteyn wrote:
> > fs-verity adds new inode flag which causes scrub to fail as it is
> > not yet known.
> >
> > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Thanks!
--
- Andrey
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH v3 27/28] xfs: add fs-verity ioctls
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (25 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 26/28] xfs: make scrub aware of verity dinode flag Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
2023-10-06 18:49 ` [PATCH v3 28/28] xfs: enable ro-compat fs-verity flag Andrey Albershteyn
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Add fs-verity ioctls to enable verity, dump metadata (the descriptor and
Merkle tree pages), and obtain a file's digest.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/xfs_ioctl.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3d6d680b6cf3..ffa04f0aed4a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/fileattr.h>
+#include <linux/fsverity.h>
/*
* xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -2154,6 +2155,22 @@ xfs_file_ioctl(
return error;
}
+ case FS_IOC_ENABLE_VERITY:
+ if (!xfs_has_verity(mp))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_enable(filp, (const void __user *)arg);
+
+ case FS_IOC_MEASURE_VERITY:
+ if (!xfs_has_verity(mp))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_measure(filp, (void __user *)arg);
+
+ case FS_IOC_READ_VERITY_METADATA:
+ if (!xfs_has_verity(mp))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_read_metadata(filp,
+ (const void __user *)arg);
+
default:
return -ENOTTY;
}
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH v3 28/28] xfs: enable ro-compat fs-verity flag
2023-10-06 18:48 [PATCH v3 00/28] fs-verity support for XFS Andrey Albershteyn
` (26 preceding siblings ...)
2023-10-06 18:49 ` [PATCH v3 27/28] xfs: add fs-verity ioctls Andrey Albershteyn
@ 2023-10-06 18:49 ` Andrey Albershteyn
27 siblings, 0 replies; 74+ messages in thread
From: Andrey Albershteyn @ 2023-10-06 18:49 UTC (permalink / raw)
To: linux-xfs, linux-fsdevel, fsverity
Cc: djwong, ebiggers, david, dchinner, Andrey Albershteyn
Finalize fs-verity integration in XFS by making the kernel aware of
fs-verity via the ro-compat flag.
Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
fs/xfs/libxfs/xfs_format.h | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index ccb2ae5c2c93..a21612319765 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -355,10 +355,11 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
#define XFS_SB_FEAT_RO_COMPAT_VERITY (1 << 4) /* fs-verity */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
- (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
- XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
- XFS_SB_FEAT_RO_COMPAT_REFLINK| \
- XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
+ XFS_SB_FEAT_RO_COMPAT_REFLINK | \
+ XFS_SB_FEAT_RO_COMPAT_INOBTCNT| \
+ XFS_SB_FEAT_RO_COMPAT_VERITY)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
--
2.40.1
^ permalink raw reply related [flat|nested] 74+ messages in thread