All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Yan, Zheng" <zheng.z.yan@intel.com>
To: Gregory Farnum <greg@inktank.com>
Cc: Sage Weil <sage@inktank.com>,
	"ceph-devel@vger.kernel.org" <ceph-devel@vger.kernel.org>
Subject: Re: [PATCH 04/39] mds: make sure table request id unique
Date: Mon, 25 Mar 2013 19:30:37 +0800	[thread overview]
Message-ID: <5150355D.1050501@intel.com> (raw)
In-Reply-To: <CAPYLRzhrH-=9itgY95XuE+uP54b4GxqVstRtmmW9_Qn1C3dHXw@mail.gmail.com>

On 03/22/2013 06:03 AM, Gregory Farnum wrote:
> Right. I'd like to somehow mark those reqid's so that we can tell when
> they come from a different incarnation of the MDS TableClient daemon.
> One way is via some piece of random data that will probably
> distinguish them, although if we have something which we can know is
> different that would be preferable. I think we can work something out
> of the startup session data each MDS does with the monitors, but I'm
> not sure I can check any time soon; I have a number of other things to
> get to now that I've gotten through (the first round on) this series.
> 

How about the attached patch?

Thanks

----
commit d460b766e16ec2cacac239a74af0e226108ab95a
Author: Yan, Zheng <zheng.z.yan@intel.com>
Date:   Sat Mar 16 08:02:18 2013 +0800

    mds: make sure table request id unique
    
    When an MDS becomes active, the table server re-sends 'agree' messages
    for old prepared requests. If the recovered MDS starts a new table
    request at the same time, the new request's ID can happen to be the
    same as an old prepared request's ID, because the current table client
    code assigns request IDs starting from zero after the MDS restarts.
    
    This patch makes the table server send 'ready' messages when table
    clients become active or when the server itself becomes active. The
    'ready' message updates the table client's last_reqid to avoid request
    ID collisions. The message also replaces the roles of the
    finish_recovery() and handle_mds_recovery() callbacks for the table
    client.
    
    Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>

diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index bb1c833..834a7aa 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1508,14 +1508,13 @@ void MDS::recovery_done()
   
   // kick anchortable (resent AGREEs)
   if (mdsmap->get_tableserver() == whoami) {
-    anchorserver->finish_recovery();
-    snapserver->finish_recovery();
+    set<int> active;
+    mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
+    mdsmap->get_mds_set(active, MDSMap::STATE_STOPPING);
+    anchorserver->finish_recovery(active);
+    snapserver->finish_recovery(active);
   }
-  
-  // kick anchorclient (resent COMMITs)
-  anchorclient->finish_recovery();
-  snapclient->finish_recovery();
-  
+
   mdcache->start_recovered_truncates();
   mdcache->do_file_recover();
 
@@ -1537,8 +1536,6 @@ void MDS::handle_mds_recovery(int who)
     anchorserver->handle_mds_recovery(who);
     snapserver->handle_mds_recovery(who);
   }
-  anchorclient->handle_mds_recovery(who);
-  snapclient->handle_mds_recovery(who);
   
   queue_waiters(waiting_for_active_peer[who]);
   waiting_for_active_peer.erase(who);
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index ea021f5..29e172b 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -101,6 +101,16 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
     }
     break;
 
+  case TABLESERVER_OP_SERVER_READY:
+    if (last_reqid == 0) {
+      assert(reqid > 0);
+      last_reqid = reqid;
+    }
+
+    resend_queries();
+    resend_prepares();
+    resend_commits();
+    break;
   default:
     assert(0);
   }
@@ -126,19 +136,23 @@ void MDSTableClient::_logged_ack(version_t tid)
 void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl,
 			      Context *onfinish)
 {
-  uint64_t reqid = ++last_reqid;
-  dout(10) << "_prepare " << reqid << dendl;
-
-  // send message
-  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
-  req->bl = mutation;
-
-  pending_prepare[reqid].mutation = mutation;
-  pending_prepare[reqid].ptid = ptid;
-  pending_prepare[reqid].pbl = pbl;
-  pending_prepare[reqid].onfinish = onfinish;
-
-  send_to_tableserver(req);
+  if (last_reqid > 0) {
+    uint64_t reqid = ++last_reqid;
+    dout(10) << "_prepare " << reqid << dendl;
+    // send message
+    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
+    req->bl = mutation;
+
+    pending_prepare[reqid].mutation = mutation;
+    pending_prepare[reqid].ptid = ptid;
+    pending_prepare[reqid].pbl = pbl;
+    pending_prepare[reqid].onfinish = onfinish;
+
+    send_to_tableserver(req);
+  } else {
+    dout(10) << "table server is not ready yet, waiting" << dendl;
+    waiting_for_server.push_back(_pending_prepare(onfinish, ptid, pbl, mutation));
+  }
 }
 
 void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
@@ -176,6 +190,7 @@ void MDSTableClient::got_journaled_agree(version_t tid, LogSegment *ls)
   ls->pending_commit_tids[table].insert(tid);
   pending_commit[tid] = ls;
 }
+
 void MDSTableClient::got_journaled_ack(version_t tid)
 {
   dout(10) << "got_journaled_ack " << tid << dendl;
@@ -185,12 +200,6 @@ void MDSTableClient::got_journaled_ack(version_t tid)
   }
 }
 
-void MDSTableClient::finish_recovery()
-{
-  dout(7) << "finish_recovery" << dendl;
-  resend_commits();
-}
-
 void MDSTableClient::resend_commits()
 {
   for (map<version_t,LogSegment*>::iterator p = pending_commit.begin();
@@ -202,24 +211,18 @@ void MDSTableClient::resend_commits()
   }
 }
 
-void MDSTableClient::handle_mds_recovery(int who)
+void MDSTableClient::resend_prepares()
 {
-  dout(7) << "handle_mds_recovery mds." << who << dendl;
-
-  if (who != mds->mdsmap->get_tableserver()) 
-    return; // do nothing.
-
-  resend_queries();
-  
-  // prepares.
+  while (!waiting_for_server.empty()) {
+    pending_prepare[++last_reqid] = waiting_for_server.front();
+    waiting_for_server.pop_front();
+  }
   for (map<uint64_t, _pending_prepare>::iterator p = pending_prepare.begin();
        p != pending_prepare.end();
        ++p) {
-    dout(10) << "resending " << p->first << dendl;
+    dout(10) << "resending prepare on " << p->first << dendl;
     MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, p->first);
     req->bl = p->second.mutation;
     mds->send_message_mds(req, mds->mdsmap->get_tableserver());
-  } 
-
-  resend_commits();
+  }
 }
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
index e15837f..7638260 100644
--- a/src/mds/MDSTableClient.h
+++ b/src/mds/MDSTableClient.h
@@ -38,9 +38,12 @@ protected:
     bufferlist mutation;
 
     _pending_prepare() : onfinish(0), ptid(0), pbl(0) {}
+    _pending_prepare(Context *c, version_t *pt, bufferlist *pb, bufferlist& m) :
+      onfinish(c), ptid(pt), pbl(pb), mutation(m) {}
   };
 
   map<uint64_t, _pending_prepare> pending_prepare;
+  list<_pending_prepare> waiting_for_server;
 
   // pending commits
   map<version_t, LogSegment*> pending_commit;
@@ -68,9 +71,8 @@ public:
   void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
   void commit(version_t tid, LogSegment *ls);
 
-  // for recovery (by other nodes)
-  void handle_mds_recovery(int mds); // called when someone else recovers
   void resend_commits();
+  void resend_prepares();
 
   // for recovery (by me)
   void got_journaled_agree(version_t tid, LogSegment *ls);
@@ -82,7 +84,6 @@ public:
   void wait_for_ack(version_t tid, Context *c) {
     ack_waiters[tid].push_back(c);
   }
-  void finish_recovery();                // called when i recover and go active
 
   void send_to_tableserver(MMDSTableRequest *req);
 
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
index 4f86ff1..e56e2b4 100644
--- a/src/mds/MDSTableServer.cc
+++ b/src/mds/MDSTableServer.cc
@@ -144,24 +144,30 @@ void MDSTableServer::do_server_update(bufferlist& bl)
 
 // recovery
 
-void MDSTableServer::finish_recovery()
+void MDSTableServer::finish_recovery(set<int>& active)
 {
   dout(7) << "finish_recovery" << dendl;
-  handle_mds_recovery(-1);  // resend agrees for everyone.
+  for (set<int>::iterator p = active.begin(); p != active.end(); ++p)
+    handle_mds_recovery(*p);  // resend agrees for everyone.
 }
 
 void MDSTableServer::handle_mds_recovery(int who)
 {
-  if (who >= 0)
-    dout(7) << "handle_mds_recovery mds." << who << dendl;
-  
+  dout(7) << "handle_mds_recovery mds." << who << dendl;
+
+  uint64_t next_reqid = 1;
   // resend agrees for recovered mds
   for (map<version_t,mds_table_pending_t>::iterator p = pending_for_mds.begin();
        p != pending_for_mds.end();
        ++p) {
-    if (who >= 0 && p->second.mds != who)
+    if (p->second.mds != who)
       continue;
+    if (p->second.reqid >= next_reqid)
+      next_reqid = p->second.reqid + 1;
     MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_AGREE, p->second.reqid, p->second.tid);
-    mds->send_message_mds(reply, p->second.mds);
+    mds->send_message_mds(reply, who);
   }
+
+  MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_SERVER_READY, next_reqid);
+  mds->send_message_mds(reply, who);
 }
diff --git a/src/mds/MDSTableServer.h b/src/mds/MDSTableServer.h
index 26cd594..55827e7 100644
--- a/src/mds/MDSTableServer.h
+++ b/src/mds/MDSTableServer.h
@@ -90,7 +90,7 @@ private:
   }
 
   // recovery
-  void finish_recovery();
+  void finish_recovery(set<int>& active);
   void handle_mds_recovery(int who);
 };
 
diff --git a/src/mds/mds_table_types.h b/src/mds/mds_table_types.h
index b094c75..c08519a 100644
--- a/src/mds/mds_table_types.h
+++ b/src/mds/mds_table_types.h
@@ -39,6 +39,7 @@ enum {
   TABLESERVER_OP_ACK          = -6,
   TABLESERVER_OP_ROLLBACK     =  7,
   TABLESERVER_OP_SERVER_UPDATE = 8,
+  TABLESERVER_OP_SERVER_READY = -9,
 };
 
 inline const char *get_mdstableserver_opname(int op) {
@@ -51,6 +52,7 @@ inline const char *get_mdstableserver_opname(int op) {
   case TABLESERVER_OP_ACK: return "ack";
   case TABLESERVER_OP_ROLLBACK: return "rollback";
   case TABLESERVER_OP_SERVER_UPDATE: return "server_update";
+  case TABLESERVER_OP_SERVER_READY: return "server_ready";
   default: assert(0); return 0;
   }
 };


  reply	other threads:[~2013-03-25 11:30 UTC|newest]

Thread overview: 117+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
2013-03-17 14:51 ` [PATCH 01/39] mds: preserve subtree bounds until slave commit Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 02/39] mds: process finished contexts in batch Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 03/39] mds: fix MDCache::adjust_bounded_subtree_auth() Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 04/39] mds: make sure table request id unique Yan, Zheng
2013-03-19 23:09   ` Greg Farnum
2013-03-20  5:53     ` Yan, Zheng
2013-03-20  6:15       ` Sage Weil
2013-03-20  6:24         ` Yan, Zheng
2013-03-20  6:49         ` Yan, Zheng
2013-03-20 18:31           ` Greg Farnum
2013-03-21  8:07             ` Yan, Zheng
2013-03-21 22:03               ` Gregory Farnum
2013-03-25 11:30                 ` Yan, Zheng [this message]
2013-03-29 22:12                   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 05/39] mds: send table request when peer is in proper state Yan, Zheng
2013-03-20 18:34   ` Greg Farnum
2013-03-29 21:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 06/39] mds: make table client/server tolerate duplicated message Yan, Zheng
2013-03-29 22:00   ` Gregory Farnum
2013-03-31 13:21     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 07/39] mds: mark connection down when MDS fails Yan, Zheng
2013-03-20 18:37   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state Yan, Zheng
2013-03-20 18:40   ` Greg Farnum
2013-03-21  2:22     ` Yan, Zheng
2013-03-21 21:43       ` Gregory Farnum
2013-03-20 19:09   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 09/39] mds: defer eval gather locks when removing replica Yan, Zheng
2013-03-20 19:36   ` Greg Farnum
2013-03-21  2:29     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 10/39] mds: unify slave request waiting Yan, Zheng
2013-03-20 22:52   ` Sage Weil
2013-03-17 14:51 ` [PATCH 11/39] mds: don't delay processing replica buffer in slave request Yan, Zheng
2013-03-20 21:19   ` Greg Farnum
2013-03-21  2:38     ` Yan, Zheng
2013-03-21  4:15       ` Sage Weil
2013-03-21 21:48         ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 12/39] mds: compose and send resolve messages in batch Yan, Zheng
2013-03-20 21:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 13/39] mds: don't send resolve message between active MDS Yan, Zheng
2013-03-20 21:56   ` Gregory Farnum
2013-03-21  2:55     ` Yan, Zheng
2013-03-21 21:55       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance Yan, Zheng
2013-03-20 22:09   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin Yan, Zheng
2013-03-20 22:17   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves Yan, Zheng
2013-03-20 22:57   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 17/39] mds: send resolve acks after master updates are safely logged Yan, Zheng
2013-03-20 22:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 18/39] mds: fix MDS recovery involving cross authority rename Yan, Zheng
2013-03-21 17:59   ` Gregory Farnum
2013-03-22  3:04     ` Yan, Zheng
2013-03-29 22:02       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags() Yan, Zheng
2013-03-20 22:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong Yan, Zheng
2013-03-20 23:26   ` Gregory Farnum
2013-03-20 23:36     ` Sage Weil
2013-03-17 14:51 ` [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack Yan, Zheng
2013-03-20 23:33   ` Gregory Farnum
2013-03-20 23:40     ` Gregory Farnum
2013-03-21  6:41     ` Yan, Zheng
2013-03-21 21:58       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 22/39] mds: handle linkage mismatch during cache rejoin Yan, Zheng
2013-03-21 21:23   ` Gregory Farnum
2013-03-22  3:05     ` Yan, Zheng
2013-03-25 16:14       ` Gregory Farnum
2013-03-26  7:21     ` Yan, Zheng
2013-03-29 22:09       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list Yan, Zheng
2013-03-20 23:59   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 24/39] mds: take object's versionlock when rejoinning xlock Yan, Zheng
2013-03-21  0:37   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 25/39] mds: share inode max size after MDS recovers Yan, Zheng
2013-03-21  0:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 26/39] mds: issue caps when lock state in replica become SYNC Yan, Zheng
2013-03-21  0:52   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 27/39] mds: send lock action message when auth MDS is in proper state Yan, Zheng
2013-03-21  3:12   ` Gregory Farnum
2013-03-21  3:20     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment Yan, Zheng
2013-03-21  3:14   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 29/39] mds: avoid double auth pin for file recovery Yan, Zheng
2013-03-21  3:20   ` Gregory Farnum
2013-03-21  3:33     ` Yan, Zheng
2013-03-21  4:20       ` Sage Weil
2013-03-21 21:58     ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 30/39] mds: check MDS peer's state through mdsmap Yan, Zheng
2013-03-21  3:24   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state Yan, Zheng
2013-03-21  3:27   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 32/39] mds: fix export cancel notification Yan, Zheng
2013-03-21  3:31   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 33/39] mds: notify bystanders if export aborts Yan, Zheng
2013-03-21  3:34   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 34/39] mds: don't open dirfrag while subtree is frozen Yan, Zheng
2013-03-21  3:38   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 35/39] mds: clear dirty inode rstat if import fails Yan, Zheng
2013-03-21  3:40   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND Yan, Zheng
2013-03-21  3:44   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message Yan, Zheng
2013-03-21  3:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 38/39] mds: don't replicate purging dentry Yan, Zheng
2013-03-21  3:46   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree Yan, Zheng
2013-03-21  3:49   ` Gregory Farnum
2013-04-01  8:46 ` [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
2013-04-01 17:00   ` Gregory Farnum
2013-04-01  8:51 ` [PATCH] mds: avoid sending duplicated table prepare/commit Yan, Zheng
2013-04-01  8:51   ` [PATCH] mds: don't roll back prepared table updates Yan, Zheng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5150355D.1050501@intel.com \
    --to=zheng.z.yan@intel.com \
    --cc=ceph-devel@vger.kernel.org \
    --cc=greg@inktank.com \
    --cc=sage@inktank.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.