From mboxrd@z Thu Jan 1 00:00:00 1970 From: Alexander Aring Date: Thu, 2 Mar 2023 12:14:41 -0500 Subject: [Cluster-devel] [PATCH dlm-tool 14/14] python: add posix lockdb plot tool In-Reply-To: <20230302171441.1509914-1-aahringo@redhat.com> References: <20230302171441.1509914-1-aahringo@redhat.com> Message-ID: <20230302171441.1509914-14-aahringo@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit This patch adds a python tool to plot with use of python plotly [0] the dlm_controld cluster-wide lockdb. It scans dlm_controld plock log files for the required information about possible contention (waiters) and the acquired lock modes over time. Example: ./lockdb_plot /var/log/dlm_controld/plock.log and plotly will start a webserver and you can see a generated, interactive plot about the lockranges in y axis and time in x axis. Depending on usage it's limited and does not scale. Ther exists arguments like --info to get plock.log timing information and with --ts_start/ts_end you can filter a specific time range to only show a small part of the plock log to reduce the amount of rectangles need to be drawn by the browser. It shows only lock modes and contention, no lock requests. Means A trylock is only visible when it was successful. A non trylock will be drawn and has possible contention. With this plot it should be easy to figure out on which node a specific lock range was acquired and resulted in contention. [0] https://plotly.com/python/ --- python/tools/lockdb_plot | 429 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 429 insertions(+) create mode 100755 python/tools/lockdb_plot diff --git a/python/tools/lockdb_plot b/python/tools/lockdb_plot new file mode 100755 index 00000000..761bdea2 --- /dev/null +++ b/python/tools/lockdb_plot @@ -0,0 +1,429 @@ +#!/usr/bin/env python3 + +import plotly.graph_objects as go +from enum import Enum +import argparse +import sys + +NSECS_PER_SECS = 1000000000 + +def nsecs_to_secs(nsecs): + return nsecs/NSECS_PER_SECS + +def secs_to_nsecs(secs): + return secs * NSECS_PER_SECS + +# entry point of collecting data +nodes = dict() +# will parse log to get our_nodeid +our_nodeid = None +# legend groups, tmp set to show legend entry only once +legend_groups = set() + +class EType(Enum): + LOCK = 1 + WAITER = 2 + +def lookup_lockdb(nodeid, ls, number): + if nodeid not in nodes: + nodes[nodeid] = dict() + + lockspaces = nodes[nodeid] + if ls not in lockspaces: + lockspaces[ls] = dict() + + if number not in lockspaces[ls]: + lockspaces[ls][number] = LockDb(nodeid, ls, number) + + return lockspaces[ls][number] + +def plotly_rect_shape(x0, y0, x1, y1, color, opacity, text, group): + # we show it in secs + x0 = nsecs_to_secs(x0) + x1 = nsecs_to_secs(x1) + + rv = go.Scatter(x=[x0,x0,x1,x1,x0], + y=[y0,y1,y1,y0,y0], + fill="toself", + fillcolor=color, + mode='lines', + name=group, + legendgroup=group, + showlegend=group not in legend_groups, + visible=True, + text=text, + line=dict(color=color, + width=1), + opacity=opacity) + + legend_groups.add(group) + return rv + + +class LockDbRect(): + def __init__(self, **kwargs): + self.ls = kwargs["ls"] + self.number = kwargs["number"] + self.pid = kwargs["pid"] + self.nodeid = kwargs["nodeid"] + self.owner = kwargs["owner"] + self.ts_start = kwargs["ts_start"] + self.ts_end = kwargs["ts_end"] + # +1 because lock range 1000-1000 will then be 999-1000 + # exclusive vs inclusive we need to decide how to display + # it but we don't set a zero height + self.lk_start = kwargs["lk_start"] - 1 + self.lk_end = kwargs["lk_end"] + + def create_shape(self): + return plotly_rect_shape(self.ts_start, + self.lk_start, + self.ts_end, + self.lk_end, + self.get_color(), + self.get_opacity(), + self.get_text(), + self.get_group()) + + def get_duration(self): + return self.ts_end - self.ts_start + + def get_range(self): + return self.lk_end - self.lk_start + + def __str__(self): + return str(self.__dict__) + + def __repr__(self): + return str(self) + +class LockDbRectWaiter(LockDbRect): + def __init__(self, **kwargs): + super(LockDbRectWaiter, self).__init__(**kwargs) + + def get_color(self): + return "green" + + def get_opacity(self): + return 0.2 + + def get_text(self): + # TODO make it human readable for high numbers? + ts_start = nsecs_to_secs(self.ts_start); + ts_end = nsecs_to_secs(self.ts_end); + dt = nsecs_to_secs(self.get_duration()); + + text = "nodeid: {}, pid: {}, owner: {}
".format(self.nodeid, self.pid, hex(self.owner)) + \ + "ts_start: {0:.3f},".format(ts_start) + \ + "ts_end: {0:.3f},".format(ts_end) + \ + "dt: {0:.3f}
".format(dt) + \ + "lk_start: {}, lk_end: {}, dy: {}
".format(hex(self.lk_start), hex(self.lk_end), hex(self.get_range())) + + return text + + def get_text_pl(self): + text = "contention:
" + \ + "start: {0:.3f},".format(nsecs_to_secs(self.ts_start)) + \ + "end: {0:.3f},".format(nsecs_to_secs(self.ts_end)) + \ + "dt: {0:.3f}".format(nsecs_to_secs(self.get_duration())) + + return text + + def get_group(self): + return "contention" + +class LockDbRectPLock(LockDbRect): + def __init__(self, **kwargs): + super(LockDbRectPLock, self).__init__(**kwargs) + self.mode = kwargs["mode"] + if self.mode == "WR": + self.color = "red" + self.group = "write lock" + elif self.mode == "RD": + self.color = "blue" + self.group = "read lock" + else: + print("{}: received unknown mode".format(self.ts_start)) + return + + self.waiter = None + + def __str__(self): + return str(self.__dict__) + + def __repr__(self): + return str(self) + + def has_waiter(self): + return self.waiter + + def set_waiter(self, wa): + if self.has_waiter(): + print("{}: has already a waiter set".format(self.ts_start)) + return + + self.waiter = wa + + def get_waiter(self): + return self.waiter + + def get_color(self): + return self.color + + def get_opacity(self): + return 0.4 + + def get_text(self): + # TODO make it human readable for high numbers? + ts_start = nsecs_to_secs(self.ts_start); + ts_end = nsecs_to_secs(self.ts_end); + dt = nsecs_to_secs(self.get_duration()); + + text = "nodeid: {}, pid: {}, owner: {}
".format(self.nodeid, self.pid, hex(self.owner)) + \ + "ts_start: {0:.3f},".format(ts_start) + \ + "ts_end: {0:.3f},".format(ts_end) + \ + "dt: {0:.3f}
".format(dt) + \ + "lk_start: {}, lk_end: {}, dy: {}
".format(hex(self.lk_start), hex(self.lk_end), hex(self.get_range())) + + if self.has_waiter(): + text = text + "
{}".format(self.get_waiter().get_text_pl()) + + return text; + + def get_group(self): + return self.group + +class LockDb: + def __init__(self, nodeid, ls, number): + self.nodeid = nodeid + self.ls = ls + self.number = number + self.__ts = dict() + self.plocks = list() + self.__last_plock = dict() + self.waiters = list() + self.__last_waiter = dict() + self.__ts_start_min = None + self.__ts_end_max = 0 + + def event_start(self, vals): + self.__ts[vals["ptr"]] = vals + + def __plock(self, vals_start, vals_end): + pl = LockDbRectPLock(ls=vals_start["ls"], + number=vals_start["number"], + pid=vals_start["pid"], + nodeid=vals_start["nodeid"], + owner=vals_start["owner"], + ts_start=vals_start["ts"], + ts_end=vals_end["ts"], + lk_start=vals_start["start"], + lk_end=vals_start["end"], + mode=vals_start["mode"]) + self.plocks.append(pl) + self.__last_plock[vals_end["ptr"]] = pl + + try: + pl.set_waiter(self.__last_waiter[vals_start["owner"]]) + del(self.__last_waiter[vals_start["owner"]]) + except KeyError: + pass + + def __waiter(self, vals_start, vals_end): + wa = LockDbRectWaiter(ls=vals_start["ls"], + number=vals_start["number"], + pid=vals_start["pid"], + nodeid=vals_start["nodeid"], + owner=vals_start["owner"], + ts_start=vals_start["ts"], + ts_end=vals_end["ts"], + lk_start=vals_start["start"], + lk_end=vals_start["end"]) + self.waiters.append(wa) + self.__last_waiter[vals_start["owner"]] = wa + + def event_end(self, vals): + try: + vals_start = self.__ts[vals["ptr"]] + del(self.__ts[vals["ptr"]]) + except KeyError: + print("{}: cannot find previous start, ignoring".format(vals["ts"])) + return + + if not self.__ts_start_min: + self.__ts_start_min = vals_start["ts"] + else: + self.__ts_start_min = min(self.__ts_start_min, vals_start["ts"]) + + if not self.__ts_start_min: + self.__ts_end_max = vals["ts"] + else: + self.__ts_end_max = max(self.__ts_end_max, vals["ts"]) + + if vals["event"] == EType.LOCK: + self.__plock(vals_start, vals) + elif vals["event"] == EType.WAITER: + self.__waiter(vals_start, vals) + else: + print("{}: unknown event type".format(vals["ts"])) + + def add_shapes(self, fig): + global legend_groups + legend_groups = set() + + # first waiters to be in background + for wa in self.waiters: + fig.add_trace(wa.create_shape()) + + # next layer to draw lock states + for pl in self.plocks: + fig.add_trace(pl.create_shape()) + + def get_ts_min(self): + return self.__ts_start_min + + def get_ts_max(self): + return self.__ts_end_max + + def __str__(self): + return "ls {} inode {}".format(self.ls, self.number) + + def __repr__(self): + return str(self) + +parser = argparse.ArgumentParser(description="Plots dlm_controld lock databases") +parser.add_argument('-i', "--info", action="store_true") +parser.add_argument('-s', "--ts_start", type=float) +parser.add_argument('-e', "--ts_end", type=float) + +parser.add_argument('plock_logs', type=argparse.FileType('r'), nargs='+') +args = parser.parse_args() + +for f in args.plock_logs: + lines = f.readlines() + + # Current way to figure out our nodeid + for line in lines: + if "read plock" in line: + lsplit = line.split() + npo = lsplit[11].split('/') + nodeid = int(npo[0]) + our_nodeid = nodeid + break + + if not our_nodeid: + print("{}: could not determine our nodeid".format(f)) + continue + + for line in lines: + if "state plock" in line: + pass + elif "state waiter" in line: + pass + else: + continue + + vals = dict() + lsplit = line.split() + vals["ts"] = int(lsplit[3]) + + if args.ts_start and secs_to_nsecs(args.ts_start) > int(vals["ts"]): + continue + + if args.ts_end and secs_to_nsecs(args.ts_end) < int(vals["ts"]): + break; + + vals["ls"] = lsplit[4] + vals["op"] = lsplit[7] + vals["number"] = int(lsplit[8], 16) + vals["ptr"] = lsplit[9] + + db = lookup_lockdb(our_nodeid, vals["ls"], vals["number"]) + if "state plock" in line: + vals["event"] = EType.LOCK + if vals["op"] == "start": + vals["mode"] = lsplit[10] + se = lsplit[11].split('-') + vals["start"] = int(se[0], 16) + vals["end"] = int(se[1], 16) + npo = lsplit[12].split('/') + vals["nodeid"] = int(npo[0]) + vals["pid"] = int(npo[1]) + vals["owner"] = int(npo[2], 16) + db.event_start(vals) + elif vals["op"] =="end": + db.event_end(vals) + else: + print("{}: unknown state plock operation: {}".format(vals["ts"], vals["op"])) + elif "state waiter" in line: + vals["event"] = EType.WAITER + if vals["op"] == "start": + se = lsplit[10].split('-') + vals["start"] = int(se[0], 16) + vals["end"] = int(se[1], 16) + npo = lsplit[11].split('/') + vals["nodeid"] = int(npo[0]) + vals["pid"] = int(npo[1]) + vals["owner"] = int(npo[2], 16) + db.event_start(vals) + elif vals["op"] == "end": + db.event_end(vals) + else: + print("{}: unknown state waiter operation: {}".format(vals["ts"], vals["op"])) + else: + print("received unknown line event: {}".format(line)) + +if args.info: + ts_max = None + ts_min = None + + for nodeid in nodes: + for ls in nodes[nodeid]: + for number in nodes[nodeid][ls]: + db = lookup_lockdb(nodeid, ls, number) + if not ts_min: + ts_min = db.get_ts_min() + else: + ts_min = min(ts_min, db.get_ts_min()) + + if not ts_max: + ts_max = db.get_ts_max() + else: + ts_max = min(ts_max, db.get_ts_max()) + + print("global info:") + print("\tts_start: {0:.3f} secs".format(nsecs_to_secs(ts_min))) + print("\tts_end: {0:.3f} secs".format(nsecs_to_secs(ts_max))) + print("\tduration: {0:.3f} secs".format(nsecs_to_secs(ts_max) - nsecs_to_secs(ts_min))) + + print() + print("per inode lock info:") + for nodeid in nodes: + print("nodeid: {}".format(nodeid)) + for ls in nodes[nodeid]: + print("\tls: {}".format(ls)) + for number in nodes[nodeid][ls]: + print("\t\tinode: {}".format(hex(number))) + db = lookup_lockdb(nodeid, ls, number) + print("\t\t\tts_start: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_min()))) + print("\t\t\tts_end: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_max()))) + print("\t\t\tduration: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_max()) - nsecs_to_secs(db.get_ts_min()))) + + exit(0) + +for nodeid in nodes: + for ls in nodes[nodeid]: + for number in nodes[nodeid][ls]: + print("create plot: {} {} {}".format(nodeid, ls, number)) + + fig = go.Figure() + fig.update_layout(title_text="dlm_controld cluster-wide lock database
nodeid: {}, lockspace: {}, inode: {}".format(nodeid, ls, hex(number))) + + # fill plot with shapes + lookup_lockdb(nodeid, ls, number).add_shapes(fig) + + fig.update_xaxes(title_text='Time (secs)') + fig.update_yaxes(title_text='Lock Range (bytes decimal)') + + config = dict({'scrollZoom': True, 'displaylogo': False}) + fig.show(config=config) -- 2.31.1