All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Howells <dhowells@redhat.com>
To: Marc Dionne <marc.dionne@auristor.com>
Cc: David Howells <dhowells@redhat.com>,
	linux-afs@lists.infradead.org, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH v2 35/40] afs: Don't leave DONTUSE/NEWREPSITE servers out of server list
Date: Wed, 13 Dec 2023 13:49:57 +0000	[thread overview]
Message-ID: <20231213135003.367397-36-dhowells@redhat.com> (raw)
In-Reply-To: <20231213135003.367397-1-dhowells@redhat.com>

Don't leave servers that are marked VLSF_DONTUSE or VLSF_NEWREPSITE out of
the server list for a volume; rather, mark DONTUSE ones excluded and mark
either NEWREPSITE excluded if the number of updated servers is <50% of the
usable servers or mark !NEWREPSITE excluded otherwise.

Mark the server list as a whole with a 3-state flag to indicate whether we
think the RW volume is being replicated to the RO volume, and, if so,
whether we should switch to using updated replication sites
(VLSF_NEWREPSITE) or stick with the old for now.

This processing is pushed up from the VLDB RPC reply parser to the code
that generates the server list from that information.

Doing this allows the old list to be kept with just the exclusion flags
replaced and to keep the server records pinned and maintained.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
---
 fs/afs/internal.h    | 10 ++++++++
 fs/afs/rotate.c      |  4 +++-
 fs/afs/server_list.c | 54 ++++++++++++++++++++++++++++++++++++--------
 fs/afs/vlclient.c    | 19 +++-------------
 4 files changed, 61 insertions(+), 26 deletions(-)

diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5ae4ca999d65..3d90415c2527 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -515,6 +515,7 @@ struct afs_vldb_entry {
 #define AFS_VOL_VTM_RW	0x01 /* R/W version of the volume is available (on this server) */
 #define AFS_VOL_VTM_RO	0x02 /* R/O version of the volume is available (on this server) */
 #define AFS_VOL_VTM_BAK	0x04 /* backup version of the volume is available (on this server) */
+	u8			vlsf_flags[AFS_NMAXNSERVERS];
 	short			error;
 	u8			nr_servers;	/* Number of server records */
 	u8			name_len;
@@ -601,6 +602,12 @@ struct afs_server {
 	spinlock_t		probe_lock;
 };
 
+enum afs_ro_replicating {
+	AFS_RO_NOT_REPLICATING,			/* Not doing replication */
+	AFS_RO_REPLICATING_USE_OLD,		/* Replicating; use old version */
+	AFS_RO_REPLICATING_USE_NEW,		/* Replicating; switch to new version */
+} __mode(byte);
+
 /*
  * Replaceable volume server list.
  */
@@ -608,12 +615,15 @@ struct afs_server_entry {
 	struct afs_server	*server;
 	struct afs_volume	*volume;
 	struct list_head	slink;		/* Link in server->volumes */
+	unsigned long		flags;
+#define AFS_SE_EXCLUDED		0		/* Set if server is to be excluded in rotation */
 };
 
 struct afs_server_list {
 	struct rcu_head		rcu;
 	refcount_t		usage;
 	bool			attached;	/* T if attached to servers */
+	enum afs_ro_replicating	ro_replicating;	/* RW->RO update (probably) in progress */
 	unsigned char		nr_servers;
 	unsigned char		preferred;	/* Preferred server */
 	unsigned short		vnovol_mask;	/* Servers to be skipped due to VNOVOL */
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index e8635f60b97d..3ab85a907a1d 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -448,9 +448,11 @@ bool afs_select_fileserver(struct afs_operation *op)
 	op->server_index = -1;
 	rtt = UINT_MAX;
 	for (i = 0; i < op->server_list->nr_servers; i++) {
-		struct afs_server *s = op->server_list->servers[i].server;
+		struct afs_server_entry *se = &op->server_list->servers[i];
+		struct afs_server *s = se->server;
 
 		if (!test_bit(i, &op->untried_servers) ||
+		    test_bit(AFS_SE_EXCLUDED, &se->flags) ||
 		    !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
 			continue;
 		if (s->rtt <= rtt) {
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index cfd900eb09ed..fb0f4afcb304 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -31,23 +31,53 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
 	struct afs_server_list *slist;
 	struct afs_server *server;
 	unsigned int type_mask = 1 << volume->type;
-	int ret = -ENOMEM, nr_servers = 0, i, j;
-
-	for (i = 0; i < vldb->nr_servers; i++)
-		if (vldb->fs_mask[i] & type_mask)
-			nr_servers++;
+	bool use_newrepsites = false;
+	int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0;
+
+	/* Work out if we're going to restrict to NEWREPSITE-marked servers or
+	 * not.  If at least one site is marked as NEWREPSITE, then it's likely
+	 * that "vos release" is busy updating RO sites.  We cut over from one
+	 * to the other when >=50% of the sites have been updated.  Sites that
+	 * are in the process of being updated are marked DONTUSE.
+	 */
+	for (i = 0; i < vldb->nr_servers; i++) {
+		if (!(vldb->fs_mask[i] & type_mask))
+			continue;
+		nr_servers++;
+		if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+			continue;
+		usable++;
+		if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE)
+			newrep++;
+	}
 
 	slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
 	if (!slist)
 		goto error;
 
+	if (newrep) {
+		if (newrep < usable / 2) {
+			slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD;
+		} else {
+			slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW;
+			use_newrepsites = true;
+		}
+	}
+
 	refcount_set(&slist->usage, 1);
 	rwlock_init(&slist->lock);
 
 	/* Make sure a records exists for each server in the list. */
 	for (i = 0; i < vldb->nr_servers; i++) {
+		unsigned long se_flags = 0;
+		bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE;
+
 		if (!(vldb->fs_mask[i] & type_mask))
 			continue;
+		if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+			__set_bit(AFS_SE_EXCLUDED, &se_flags);
+		if (newrep && (newrepsite ^ use_newrepsites))
+			__set_bit(AFS_SE_EXCLUDED, &se_flags);
 
 		server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i],
 					   vldb->addr_version[i]);
@@ -79,6 +109,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
 
 		slist->servers[j].server = server;
 		slist->servers[j].volume = volume;
+		slist->servers[j].flags = se_flags;
 		slist->nr_servers++;
 	}
 
@@ -101,16 +132,20 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
 bool afs_annotate_server_list(struct afs_server_list *new,
 			      struct afs_server_list *old)
 {
+	unsigned long mask = 1UL << AFS_SE_EXCLUDED;
 	struct afs_server *cur;
 	int i, j;
 
-	if (old->nr_servers != new->nr_servers)
+	if (old->nr_servers != new->nr_servers ||
+	    old->ro_replicating != new->ro_replicating)
 		goto changed;
 
-	for (i = 0; i < old->nr_servers; i++)
+	for (i = 0; i < old->nr_servers; i++) {
 		if (old->servers[i].server != new->servers[i].server)
 			goto changed;
-
+		if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask))
+			goto changed;
+	}
 	return false;
 
 changed:
@@ -118,7 +153,8 @@ bool afs_annotate_server_list(struct afs_server_list *new,
 	cur = old->servers[old->preferred].server;
 	for (j = 0; j < new->nr_servers; j++) {
 		if (new->servers[j].server == cur) {
-			new->preferred = j;
+			if (!test_bit(AFS_SE_EXCLUDED, &new->servers[j].flags))
+				new->preferred = j;
 			break;
 		}
 	}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index cef02a265edc..cac75f89b64a 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -18,8 +18,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
 {
 	struct afs_uvldbentry__xdr *uvldb;
 	struct afs_vldb_entry *entry;
-	bool new_only = false;
-	u32 tmp, nr_servers, vlflags;
+	u32 nr_servers, vlflags;
 	int i, ret;
 
 	_enter("");
@@ -41,27 +40,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
 	entry->name[i] = 0;
 	entry->name_len = strlen(entry->name);
 
-	/* If there is a new replication site that we can use, ignore all the
-	 * sites that aren't marked as new.
-	 */
-	for (i = 0; i < nr_servers; i++) {
-		tmp = ntohl(uvldb->serverFlags[i]);
-		if (!(tmp & AFS_VLSF_DONTUSE) &&
-		    (tmp & AFS_VLSF_NEWREPSITE))
-			new_only = true;
-	}
-
 	vlflags = ntohl(uvldb->flags);
 	for (i = 0; i < nr_servers; i++) {
 		struct afs_uuid__xdr *xdr;
 		struct afs_uuid *uuid;
+		u32 tmp = ntohl(uvldb->serverFlags[i]);
 		int j;
 		int n = entry->nr_servers;
 
-		tmp = ntohl(uvldb->serverFlags[i]);
-		if (tmp & AFS_VLSF_DONTUSE ||
-		    (new_only && !(tmp & AFS_VLSF_NEWREPSITE)))
-			continue;
 		if (tmp & AFS_VLSF_RWVOL) {
 			entry->fs_mask[n] |= AFS_VOL_VTM_RW;
 			if (vlflags & AFS_VLF_BACKEXISTS)
@@ -82,6 +68,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
 		for (j = 0; j < 6; j++)
 			uuid->node[j] = (u8)ntohl(xdr->node[j]);
 
+		entry->vlsf_flags[n] = tmp;
 		entry->addr_version[n] = ntohl(uvldb->serverUnique[i]);
 		entry->nr_servers++;
 	}


  parent reply	other threads:[~2023-12-13 13:52 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-13 13:49 [PATCH v2 00/40] afs: Fix probe handling, server rotation and RO volume callback handling David Howells
2023-12-13 13:49 ` [PATCH v2 01/40] afs: fix the usage of read_seqbegin_or_lock() in afs_lookup_volume_rcu() David Howells
2023-12-13 13:49 ` [PATCH v2 02/40] afs: fix the usage of read_seqbegin_or_lock() in afs_find_server*() David Howells
2023-12-13 13:49 ` [PATCH v2 03/40] afs: use read_seqbegin() in afs_check_validity() and afs_getattr() David Howells
2023-12-13 13:49 ` [PATCH v2 04/40] rxrpc_find_service_conn_rcu: fix the usage of read_seqbegin_or_lock() David Howells
2023-12-13 13:49 ` [PATCH v2 05/40] afs: Remove whitespace before most ')' from the trace header David Howells
2023-12-13 13:49 ` [PATCH v2 06/40] afs: Automatically generate trace tag enums David Howells
2023-12-13 13:49 ` [PATCH v2 07/40] afs: Add comments on abort handling David Howells
2023-12-13 13:49 ` [PATCH v2 08/40] afs: Turn the afs_addr_list address array into an array of structs David Howells
2023-12-13 13:49 ` [PATCH v2 09/40] rxrpc, afs: Allow afs to pin rxrpc_peer objects David Howells
2023-12-13 13:49 ` [PATCH v2 10/40] afs: Don't skip server addresses for which we didn't get an RTT reading David Howells
2023-12-13 13:49 ` [PATCH v2 11/40] afs: Rename addr_list::failed to probe_failed David Howells
2023-12-13 13:49 ` [PATCH v2 12/40] afs: Handle the VIO and UAEIO aborts explicitly David Howells
2023-12-13 13:49 ` [PATCH v2 13/40] afs: Use op->nr_iterations=-1 to indicate to begin fileserver iteration David Howells
2023-12-13 13:49 ` [PATCH v2 14/40] afs: Wrap most op->error accesses with inline funcs David Howells
2023-12-13 13:49 ` [PATCH v2 15/40] afs: Don't put afs_call in afs_wait_for_call_to_complete() David Howells
2023-12-13 13:49 ` [PATCH v2 16/40] afs: Simplify error handling David Howells
2023-12-13 13:49 ` [PATCH v2 17/40] afs: Add a tracepoint for struct afs_addr_list David Howells
2023-12-13 13:49 ` [PATCH v2 18/40] afs: Rename some fields David Howells
2023-12-13 13:49 ` [PATCH v2 19/40] afs: Use peer + service_id as call address David Howells
2023-12-13 13:49 ` [PATCH v2 20/40] afs: Fold the afs_addr_cursor struct in David Howells
2023-12-13 13:49 ` [PATCH v2 21/40] rxrpc: Create a procfile to display outstanding client conn bundles David Howells
2023-12-13 13:49 ` [PATCH v2 22/40] afs: Add some more info to /proc/net/afs/servers David Howells
2023-12-13 13:49 ` [PATCH v2 23/40] afs: Remove the unimplemented afs_cmp_addr_list() David Howells
2023-12-13 13:49 ` [PATCH v2 24/40] afs: Provide a way to configure address priorities David Howells
2023-12-13 13:49 ` [PATCH v2 25/40] afs: Mark address lists with configured priorities David Howells
2023-12-13 13:49 ` [PATCH v2 26/40] afs: Dispatch fileserver probes in priority order David Howells
2023-12-13 13:49 ` [PATCH v2 27/40] afs: Dispatch vlserver " David Howells
2023-12-13 13:49 ` [PATCH v2 28/40] afs: Keep a record of the current fileserver endpoint state David Howells
2023-12-13 13:49 ` [PATCH v2 29/40] afs: Combine the endpoint state bools into a bitmask David Howells
2023-12-13 13:49 ` [PATCH v2 30/40] afs: Make it possible to find the volumes that are using a server David Howells
2023-12-13 13:49 ` [PATCH v2 31/40] afs: Defer volume record destruction to a workqueue David Howells
2023-12-13 13:49 ` [PATCH v2 32/40] afs: Move the vnode/volume validity checking code into its own file David Howells
2023-12-13 13:49 ` [PATCH v2 33/40] afs: Apply server breaks to mmap'd files in the call processor David Howells
2023-12-13 13:49 ` [PATCH v2 34/40] afs: Fix comment in afs_do_lookup() David Howells
2023-12-13 13:49 ` David Howells [this message]
2023-12-13 13:49 ` [PATCH v2 36/40] afs: Parse the VolSync record in the reply of a number of RPC ops David Howells
2023-12-13 13:49 ` [PATCH v2 37/40] afs: Overhaul invalidation handling to better support RO volumes David Howells
2023-12-13 13:50 ` [PATCH v2 38/40] afs: Fix fileserver rotation David Howells
2023-12-13 13:50 ` [PATCH v2 39/40] afs: Fix offline and busy message emission David Howells
2023-12-13 13:50 ` [PATCH v2 40/40] afs: trace: Log afs_make_call(), including server address David Howells

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231213135003.367397-36-dhowells@redhat.com \
    --to=dhowells@redhat.com \
    --cc=linux-afs@lists.infradead.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=marc.dionne@auristor.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.