From mboxrd@z Thu Jan 1 00:00:00 1970 From: Alexander Aring Date: Fri, 3 Mar 2023 09:40:36 -0500 Subject: [Cluster-devel] [PATCH dlm-tool 14/14] python: add posix lockdb plot tool In-Reply-To: <20230302171441.1509914-14-aahringo@redhat.com> References: <20230302171441.1509914-1-aahringo@redhat.com> <20230302171441.1509914-14-aahringo@redhat.com> Message-ID: List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi, On Thu, Mar 2, 2023 at 12:14?PM Alexander Aring wrote: > > This patch adds a python tool to plot with use of python plotly [0] the > dlm_controld cluster-wide lockdb. It scans dlm_controld plock log files > for the required information about possible contention (waiters) and the > acquired lock modes over time. > > Example: > > ./lockdb_plot /var/log/dlm_controld/plock.log > > and plotly will start a webserver and you can see a generated, interactive > plot about the lockranges in y axis and time in x axis. Depending on > usage it's limited and does not scale. Ther exists arguments like --info > to get plock.log timing information and with --ts_start/ts_end you can > filter a specific time range to only show a small part of the plock log > to reduce the amount of rectangles need to be drawn by the browser. > > It shows only lock modes and contention, no lock requests. Means A trylock > is only visible when it was successful. A non trylock will be drawn and > has possible contention. > > With this plot it should be easy to figure out on which node a specific > lock range was acquired and resulted in contention. > > [0] https://plotly.com/python/ > --- > python/tools/lockdb_plot | 429 +++++++++++++++++++++++++++++++++++++++ > 1 file changed, 429 insertions(+) > create mode 100755 python/tools/lockdb_plot > > diff --git a/python/tools/lockdb_plot b/python/tools/lockdb_plot > new file mode 100755 > index 00000000..761bdea2 > --- /dev/null > +++ b/python/tools/lockdb_plot > @@ -0,0 +1,429 @@ > +#!/usr/bin/env python3 > + > +import plotly.graph_objects as go > +from enum import Enum > +import argparse > +import sys > + > +NSECS_PER_SECS = 1000000000 > + > +def nsecs_to_secs(nsecs): > + return nsecs/NSECS_PER_SECS > + > +def secs_to_nsecs(secs): > + return secs * NSECS_PER_SECS > + > +# entry point of collecting data > +nodes = dict() > +# will parse log to get our_nodeid > +our_nodeid = None > +# legend groups, tmp set to show legend entry only once > +legend_groups = set() > + > +class EType(Enum): > + LOCK = 1 > + WAITER = 2 > + > +def lookup_lockdb(nodeid, ls, number): > + if nodeid not in nodes: > + nodes[nodeid] = dict() > + > + lockspaces = nodes[nodeid] > + if ls not in lockspaces: > + lockspaces[ls] = dict() > + > + if number not in lockspaces[ls]: > + lockspaces[ls][number] = LockDb(nodeid, ls, number) > + > + return lockspaces[ls][number] > + > +def plotly_rect_shape(x0, y0, x1, y1, color, opacity, text, group): > + # we show it in secs > + x0 = nsecs_to_secs(x0) > + x1 = nsecs_to_secs(x1) > + > + rv = go.Scatter(x=[x0,x0,x1,x1,x0], > + y=[y0,y1,y1,y0,y0], > + fill="toself", > + fillcolor=color, > + mode='lines', > + name=group, > + legendgroup=group, > + showlegend=group not in legend_groups, > + visible=True, > + text=text, > + line=dict(color=color, > + width=1), > + opacity=opacity) > + > + legend_groups.add(group) > + return rv > + > + > +class LockDbRect(): > + def __init__(self, **kwargs): > + self.ls = kwargs["ls"] > + self.number = kwargs["number"] > + self.pid = kwargs["pid"] > + self.nodeid = kwargs["nodeid"] > + self.owner = kwargs["owner"] > + self.ts_start = kwargs["ts_start"] > + self.ts_end = kwargs["ts_end"] > + # +1 because lock range 1000-1000 will then be 999-1000 > + # exclusive vs inclusive we need to decide how to display > + # it but we don't set a zero height > + self.lk_start = kwargs["lk_start"] - 1 > + self.lk_end = kwargs["lk_end"] > + > + def create_shape(self): > + return plotly_rect_shape(self.ts_start, > + self.lk_start, > + self.ts_end, > + self.lk_end, > + self.get_color(), > + self.get_opacity(), > + self.get_text(), > + self.get_group()) > + > + def get_duration(self): > + return self.ts_end - self.ts_start > + > + def get_range(self): > + return self.lk_end - self.lk_start > + > + def __str__(self): > + return str(self.__dict__) > + > + def __repr__(self): > + return str(self) > + > +class LockDbRectWaiter(LockDbRect): > + def __init__(self, **kwargs): > + super(LockDbRectWaiter, self).__init__(**kwargs) > + > + def get_color(self): > + return "green" > + > + def get_opacity(self): > + return 0.2 > + > + def get_text(self): > + # TODO make it human readable for high numbers? > + ts_start = nsecs_to_secs(self.ts_start); > + ts_end = nsecs_to_secs(self.ts_end); > + dt = nsecs_to_secs(self.get_duration()); > + > + text = "nodeid: {}, pid: {}, owner: {}
".format(self.nodeid, self.pid, hex(self.owner)) + \ > + "ts_start: {0:.3f},".format(ts_start) + \ > + "ts_end: {0:.3f},".format(ts_end) + \ > + "dt: {0:.3f}
".format(dt) + \ > + "lk_start: {}, lk_end: {}, dy: {}
".format(hex(self.lk_start), hex(self.lk_end), hex(self.get_range())) > + > + return text > + > + def get_text_pl(self): > + text = "contention:
" + \ > + "start: {0:.3f},".format(nsecs_to_secs(self.ts_start)) + \ > + "end: {0:.3f},".format(nsecs_to_secs(self.ts_end)) + \ > + "dt: {0:.3f}".format(nsecs_to_secs(self.get_duration())) > + there are some missing spaces after each comma. I will try to add a function and use this function for the same code below. > + return text > + > + def get_group(self): > + return "contention" > + > +class LockDbRectPLock(LockDbRect): > + def __init__(self, **kwargs): > + super(LockDbRectPLock, self).__init__(**kwargs) > + self.mode = kwargs["mode"] > + if self.mode == "WR": > + self.color = "red" > + self.group = "write lock" > + elif self.mode == "RD": > + self.color = "blue" > + self.group = "read lock" > + else: > + print("{}: received unknown mode".format(self.ts_start)) > + return > + > + self.waiter = None > + > + def __str__(self): > + return str(self.__dict__) > + > + def __repr__(self): > + return str(self) > + > + def has_waiter(self): > + return self.waiter > + > + def set_waiter(self, wa): > + if self.has_waiter(): > + print("{}: has already a waiter set".format(self.ts_start)) > + return > + > + self.waiter = wa > + > + def get_waiter(self): > + return self.waiter > + > + def get_color(self): > + return self.color > + > + def get_opacity(self): > + return 0.4 > + > + def get_text(self): > + # TODO make it human readable for high numbers? > + ts_start = nsecs_to_secs(self.ts_start); > + ts_end = nsecs_to_secs(self.ts_end); > + dt = nsecs_to_secs(self.get_duration()); > + > + text = "nodeid: {}, pid: {}, owner: {}
".format(self.nodeid, self.pid, hex(self.owner)) + \ > + "ts_start: {0:.3f},".format(ts_start) + \ > + "ts_end: {0:.3f},".format(ts_end) + \ > + "dt: {0:.3f}
".format(dt) + \ here... > + "lk_start: {}, lk_end: {}, dy: {}
".format(hex(self.lk_start), hex(self.lk_end), hex(self.get_range())) > + > + if self.has_waiter(): > + text = text + "
{}".format(self.get_waiter().get_text_pl()) > + > + return text; > + > + def get_group(self): > + return self.group > + > +class LockDb: > + def __init__(self, nodeid, ls, number): > + self.nodeid = nodeid > + self.ls = ls > + self.number = number > + self.__ts = dict() > + self.plocks = list() > + self.__last_plock = dict() > + self.waiters = list() > + self.__last_waiter = dict() > + self.__ts_start_min = None > + self.__ts_end_max = 0 > + > + def event_start(self, vals): > + self.__ts[vals["ptr"]] = vals > + > + def __plock(self, vals_start, vals_end): > + pl = LockDbRectPLock(ls=vals_start["ls"], > + number=vals_start["number"], > + pid=vals_start["pid"], > + nodeid=vals_start["nodeid"], > + owner=vals_start["owner"], > + ts_start=vals_start["ts"], > + ts_end=vals_end["ts"], > + lk_start=vals_start["start"], > + lk_end=vals_start["end"], > + mode=vals_start["mode"]) > + self.plocks.append(pl) > + self.__last_plock[vals_end["ptr"]] = pl > + > + try: > + pl.set_waiter(self.__last_waiter[vals_start["owner"]]) > + del(self.__last_waiter[vals_start["owner"]]) > + except KeyError: > + pass > + > + def __waiter(self, vals_start, vals_end): > + wa = LockDbRectWaiter(ls=vals_start["ls"], > + number=vals_start["number"], > + pid=vals_start["pid"], > + nodeid=vals_start["nodeid"], > + owner=vals_start["owner"], > + ts_start=vals_start["ts"], > + ts_end=vals_end["ts"], > + lk_start=vals_start["start"], > + lk_end=vals_start["end"]) > + self.waiters.append(wa) > + self.__last_waiter[vals_start["owner"]] = wa > + > + def event_end(self, vals): > + try: > + vals_start = self.__ts[vals["ptr"]] > + del(self.__ts[vals["ptr"]]) > + except KeyError: > + print("{}: cannot find previous start, ignoring".format(vals["ts"])) > + return > + > + if not self.__ts_start_min: > + self.__ts_start_min = vals_start["ts"] > + else: > + self.__ts_start_min = min(self.__ts_start_min, vals_start["ts"]) > + > + if not self.__ts_start_min: > + self.__ts_end_max = vals["ts"] > + else: > + self.__ts_end_max = max(self.__ts_end_max, vals["ts"]) > + > + if vals["event"] == EType.LOCK: > + self.__plock(vals_start, vals) > + elif vals["event"] == EType.WAITER: > + self.__waiter(vals_start, vals) > + else: > + print("{}: unknown event type".format(vals["ts"])) > + > + def add_shapes(self, fig): > + global legend_groups > + legend_groups = set() > + > + # first waiters to be in background > + for wa in self.waiters: > + fig.add_trace(wa.create_shape()) > + > + # next layer to draw lock states > + for pl in self.plocks: > + fig.add_trace(pl.create_shape()) > + > + def get_ts_min(self): > + return self.__ts_start_min > + > + def get_ts_max(self): > + return self.__ts_end_max > + > + def __str__(self): > + return "ls {} inode {}".format(self.ls, self.number) > + > + def __repr__(self): > + return str(self) > + > +parser = argparse.ArgumentParser(description="Plots dlm_controld lock databases") > +parser.add_argument('-i', "--info", action="store_true") > +parser.add_argument('-s', "--ts_start", type=float) > +parser.add_argument('-e', "--ts_end", type=float) > + > +parser.add_argument('plock_logs', type=argparse.FileType('r'), nargs='+') > +args = parser.parse_args() > + > +for f in args.plock_logs: > + lines = f.readlines() > + > + # Current way to figure out our nodeid > + for line in lines: > + if "read plock" in line: > + lsplit = line.split() > + npo = lsplit[11].split('/') > + nodeid = int(npo[0]) > + our_nodeid = nodeid > + break > + > + if not our_nodeid: > + print("{}: could not determine our nodeid".format(f)) > + continue > + > + for line in lines: > + if "state plock" in line: > + pass > + elif "state waiter" in line: > + pass > + else: > + continue > + > + vals = dict() > + lsplit = line.split() > + vals["ts"] = int(lsplit[3]) > + > + if args.ts_start and secs_to_nsecs(args.ts_start) > int(vals["ts"]): > + continue > + > + if args.ts_end and secs_to_nsecs(args.ts_end) < int(vals["ts"]): > + break; > + > + vals["ls"] = lsplit[4] > + vals["op"] = lsplit[7] > + vals["number"] = int(lsplit[8], 16) > + vals["ptr"] = lsplit[9] > + > + db = lookup_lockdb(our_nodeid, vals["ls"], vals["number"]) > + if "state plock" in line: > + vals["event"] = EType.LOCK > + if vals["op"] == "start": > + vals["mode"] = lsplit[10] > + se = lsplit[11].split('-') > + vals["start"] = int(se[0], 16) > + vals["end"] = int(se[1], 16) > + npo = lsplit[12].split('/') > + vals["nodeid"] = int(npo[0]) > + vals["pid"] = int(npo[1]) > + vals["owner"] = int(npo[2], 16) > + db.event_start(vals) > + elif vals["op"] =="end": > + db.event_end(vals) > + else: > + print("{}: unknown state plock operation: {}".format(vals["ts"], vals["op"])) > + elif "state waiter" in line: > + vals["event"] = EType.WAITER > + if vals["op"] == "start": > + se = lsplit[10].split('-') > + vals["start"] = int(se[0], 16) > + vals["end"] = int(se[1], 16) > + npo = lsplit[11].split('/') > + vals["nodeid"] = int(npo[0]) > + vals["pid"] = int(npo[1]) > + vals["owner"] = int(npo[2], 16) > + db.event_start(vals) > + elif vals["op"] == "end": > + db.event_end(vals) > + else: > + print("{}: unknown state waiter operation: {}".format(vals["ts"], vals["op"])) > + else: > + print("received unknown line event: {}".format(line)) > + > +if args.info: > + ts_max = None > + ts_min = None > + > + for nodeid in nodes: > + for ls in nodes[nodeid]: > + for number in nodes[nodeid][ls]: > + db = lookup_lockdb(nodeid, ls, number) > + if not ts_min: > + ts_min = db.get_ts_min() > + else: > + ts_min = min(ts_min, db.get_ts_min()) > + > + if not ts_max: > + ts_max = db.get_ts_max() > + else: > + ts_max = min(ts_max, db.get_ts_max()) > + > + print("global info:") > + print("\tts_start: {0:.3f} secs".format(nsecs_to_secs(ts_min))) > + print("\tts_end: {0:.3f} secs".format(nsecs_to_secs(ts_max))) > + print("\tduration: {0:.3f} secs".format(nsecs_to_secs(ts_max) - nsecs_to_secs(ts_min))) > + > + print() > + print("per inode lock info:") > + for nodeid in nodes: > + print("nodeid: {}".format(nodeid)) > + for ls in nodes[nodeid]: > + print("\tls: {}".format(ls)) > + for number in nodes[nodeid][ls]: > + print("\t\tinode: {}".format(hex(number))) > + db = lookup_lockdb(nodeid, ls, number) > + print("\t\t\tts_start: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_min()))) > + print("\t\t\tts_end: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_max()))) > + print("\t\t\tduration: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_max()) - nsecs_to_secs(db.get_ts_min()))) > + I will add a "top 5" high score of the longest duration of contention state... So if somebody is looking for long contention states it is easy to find them with a fast parsing of the log files. - Alex