From mboxrd@z Thu Jan  1 00:00:00 1970
From: Alexander Aring <aahringo@redhat.com>
Date: Fri, 3 Mar 2023 09:40:36 -0500
Subject: [Cluster-devel] [PATCH dlm-tool 14/14] python: add posix lockdb
 plot tool
In-Reply-To: <20230302171441.1509914-14-aahringo@redhat.com>
References: <20230302171441.1509914-1-aahringo@redhat.com>
 <20230302171441.1509914-14-aahringo@redhat.com>
Message-ID: <CAK-6q+jk5MQHsw_=H=Wjpp+dYE-X=f5rvtT+hgu7PN8_vDkdJA@mail.gmail.com>
List-Id: <cluster-devel.redhat.com>
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

Hi,

On Thu, Mar 2, 2023 at 12:14?PM Alexander Aring <aahringo@redhat.com> wrote:
>
> This patch adds a python tool to plot with use of python plotly [0] the
> dlm_controld cluster-wide lockdb. It scans dlm_controld plock log files
> for the required information about possible contention (waiters) and the
> acquired lock modes over time.
>
> Example:
>
> ./lockdb_plot /var/log/dlm_controld/plock.log
>
> and plotly will start a webserver and you can see a generated, interactive
> plot about the lockranges in y axis and time in x axis. Depending on
> usage it's limited and does not scale. Ther exists arguments like --info
> to get plock.log timing information and with --ts_start/ts_end you can
> filter a specific time range to only show a small part of the plock log
> to reduce the amount of rectangles need to be drawn by the browser.
>
> It shows only lock modes and contention, no lock requests. Means A trylock
> is only visible when it was successful. A non trylock will be drawn and
> has possible contention.
>
> With this plot it should be easy to figure out on which node a specific
> lock range was acquired and resulted in contention.
>
> [0] https://plotly.com/python/
> ---
>  python/tools/lockdb_plot | 429 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 429 insertions(+)
>  create mode 100755 python/tools/lockdb_plot
>
> diff --git a/python/tools/lockdb_plot b/python/tools/lockdb_plot
> new file mode 100755
> index 00000000..761bdea2
> --- /dev/null
> +++ b/python/tools/lockdb_plot
> @@ -0,0 +1,429 @@
> +#!/usr/bin/env python3
> +
> +import plotly.graph_objects as go
> +from enum import Enum
> +import argparse
> +import sys
> +
> +NSECS_PER_SECS = 1000000000
> +
> +def nsecs_to_secs(nsecs):
> +    return nsecs/NSECS_PER_SECS
> +
> +def secs_to_nsecs(secs):
> +    return secs * NSECS_PER_SECS
> +
> +# entry point of collecting data
> +nodes = dict()
> +# will parse log to get our_nodeid
> +our_nodeid = None
> +# legend groups, tmp set to show legend entry only once
> +legend_groups = set()
> +
> +class EType(Enum):
> +    LOCK = 1
> +    WAITER = 2
> +
> +def lookup_lockdb(nodeid, ls, number):
> +    if nodeid not in nodes:
> +        nodes[nodeid] = dict()
> +
> +    lockspaces = nodes[nodeid]
> +    if ls not in lockspaces:
> +        lockspaces[ls] = dict()
> +
> +    if number not in lockspaces[ls]:
> +        lockspaces[ls][number] = LockDb(nodeid, ls, number)
> +
> +    return lockspaces[ls][number]
> +
> +def plotly_rect_shape(x0, y0, x1, y1, color, opacity, text, group):
> +    # we show it in secs
> +    x0 = nsecs_to_secs(x0)
> +    x1 = nsecs_to_secs(x1)
> +
> +    rv = go.Scatter(x=[x0,x0,x1,x1,x0],
> +                    y=[y0,y1,y1,y0,y0],
> +                    fill="toself",
> +                    fillcolor=color,
> +                    mode='lines',
> +                    name=group,
> +                    legendgroup=group,
> +                    showlegend=group not in legend_groups,
> +                    visible=True,
> +                    text=text,
> +                    line=dict(color=color,
> +                              width=1),
> +                    opacity=opacity)
> +
> +    legend_groups.add(group)
> +    return rv
> +
> +
> +class LockDbRect():
> +    def __init__(self, **kwargs):
> +        self.ls = kwargs["ls"]
> +        self.number = kwargs["number"]
> +        self.pid = kwargs["pid"]
> +        self.nodeid = kwargs["nodeid"]
> +        self.owner = kwargs["owner"]
> +        self.ts_start = kwargs["ts_start"]
> +        self.ts_end = kwargs["ts_end"]
> +        # +1 because lock range 1000-1000 will then be 999-1000
> +        # exclusive vs inclusive we need to decide how to display
> +        # it but we don't set a zero height
> +        self.lk_start = kwargs["lk_start"] - 1
> +        self.lk_end = kwargs["lk_end"]
> +
> +    def create_shape(self):
> +        return plotly_rect_shape(self.ts_start,
> +                                 self.lk_start,
> +                                 self.ts_end,
> +                                 self.lk_end,
> +                                 self.get_color(),
> +                                 self.get_opacity(),
> +                                 self.get_text(),
> +                                 self.get_group())
> +
> +    def get_duration(self):
> +        return self.ts_end - self.ts_start
> +
> +    def get_range(self):
> +        return self.lk_end - self.lk_start
> +
> +    def __str__(self):
> +        return str(self.__dict__)
> +
> +    def __repr__(self):
> +        return str(self)
> +
> +class LockDbRectWaiter(LockDbRect):
> +    def __init__(self, **kwargs):
> +        super(LockDbRectWaiter, self).__init__(**kwargs)
> +
> +    def get_color(self):
> +        return "green"
> +
> +    def get_opacity(self):
> +        return 0.2
> +
> +    def get_text(self):
> +        # TODO make it human readable for high numbers?
> +        ts_start = nsecs_to_secs(self.ts_start);
> +        ts_end = nsecs_to_secs(self.ts_end);
> +        dt = nsecs_to_secs(self.get_duration());
> +
> +        text = "nodeid: {}, pid: {}, owner: {}<br>".format(self.nodeid, self.pid, hex(self.owner)) + \
> +               "ts_start: {0:.3f},".format(ts_start) + \
> +               "ts_end: {0:.3f},".format(ts_end) + \
> +               "dt: {0:.3f}<br>".format(dt) + \
> +               "lk_start: {}, lk_end: {}, dy: {}<br>".format(hex(self.lk_start), hex(self.lk_end), hex(self.get_range()))
> +
> +        return text
> +
> +    def get_text_pl(self):
> +        text = "contention:<br>" + \
> +                "start: {0:.3f},".format(nsecs_to_secs(self.ts_start)) + \
> +                "end: {0:.3f},".format(nsecs_to_secs(self.ts_end)) + \
> +                "dt: {0:.3f}".format(nsecs_to_secs(self.get_duration()))
> +
there are some missing spaces after each comma. I will try to add a
function and use this function for the same code below.

> +        return text
> +
> +    def get_group(self):
> +        return "contention"
> +
> +class LockDbRectPLock(LockDbRect):
> +    def __init__(self, **kwargs):
> +        super(LockDbRectPLock, self).__init__(**kwargs)
> +        self.mode = kwargs["mode"]
> +        if self.mode == "WR":
> +            self.color = "red"
> +            self.group = "write lock"
> +        elif self.mode == "RD":
> +            self.color = "blue"
> +            self.group = "read lock"
> +        else:
> +            print("{}: received unknown mode".format(self.ts_start))
> +            return
> +
> +        self.waiter = None
> +
> +    def __str__(self):
> +        return str(self.__dict__)
> +
> +    def __repr__(self):
> +        return str(self)
> +
> +    def has_waiter(self):
> +        return self.waiter
> +
> +    def set_waiter(self, wa):
> +        if self.has_waiter():
> +            print("{}: has already a waiter set".format(self.ts_start))
> +            return
> +
> +        self.waiter = wa
> +
> +    def get_waiter(self):
> +        return self.waiter
> +
> +    def get_color(self):
> +        return self.color
> +
> +    def get_opacity(self):
> +        return 0.4
> +
> +    def get_text(self):
> +        # TODO make it human readable for high numbers?
> +        ts_start = nsecs_to_secs(self.ts_start);
> +        ts_end = nsecs_to_secs(self.ts_end);
> +        dt = nsecs_to_secs(self.get_duration());
> +
> +        text = "nodeid: {}, pid: {}, owner: {}<br>".format(self.nodeid, self.pid, hex(self.owner)) + \
> +               "ts_start: {0:.3f},".format(ts_start) + \
> +               "ts_end: {0:.3f},".format(ts_end) + \
> +               "dt: {0:.3f}<br>".format(dt) + \

here...

> +               "lk_start: {}, lk_end: {}, dy: {}<br>".format(hex(self.lk_start), hex(self.lk_end), hex(self.get_range()))
> +
> +        if self.has_waiter():
> +            text = text + "<br>{}".format(self.get_waiter().get_text_pl())
> +
> +        return text;
> +
> +    def get_group(self):
> +        return self.group
> +
> +class LockDb:
> +    def __init__(self, nodeid, ls, number):
> +        self.nodeid = nodeid
> +        self.ls = ls
> +        self.number = number
> +        self.__ts = dict()
> +        self.plocks = list()
> +        self.__last_plock = dict()
> +        self.waiters = list()
> +        self.__last_waiter = dict()
> +        self.__ts_start_min = None
> +        self.__ts_end_max = 0
> +
> +    def event_start(self, vals):
> +        self.__ts[vals["ptr"]] = vals
> +
> +    def __plock(self, vals_start, vals_end):
> +        pl = LockDbRectPLock(ls=vals_start["ls"],
> +                             number=vals_start["number"],
> +                             pid=vals_start["pid"],
> +                             nodeid=vals_start["nodeid"],
> +                             owner=vals_start["owner"],
> +                             ts_start=vals_start["ts"],
> +                             ts_end=vals_end["ts"],
> +                             lk_start=vals_start["start"],
> +                             lk_end=vals_start["end"],
> +                             mode=vals_start["mode"])
> +        self.plocks.append(pl)
> +        self.__last_plock[vals_end["ptr"]] = pl
> +
> +        try:
> +            pl.set_waiter(self.__last_waiter[vals_start["owner"]])
> +            del(self.__last_waiter[vals_start["owner"]])
> +        except KeyError:
> +            pass
> +
> +    def __waiter(self, vals_start, vals_end):
> +        wa = LockDbRectWaiter(ls=vals_start["ls"],
> +                              number=vals_start["number"],
> +                              pid=vals_start["pid"],
> +                              nodeid=vals_start["nodeid"],
> +                              owner=vals_start["owner"],
> +                              ts_start=vals_start["ts"],
> +                              ts_end=vals_end["ts"],
> +                              lk_start=vals_start["start"],
> +                              lk_end=vals_start["end"])
> +        self.waiters.append(wa)
> +        self.__last_waiter[vals_start["owner"]] = wa
> +
> +    def event_end(self, vals):
> +        try:
> +            vals_start = self.__ts[vals["ptr"]]
> +            del(self.__ts[vals["ptr"]])
> +        except KeyError:
> +            print("{}: cannot find previous start, ignoring".format(vals["ts"]))
> +            return
> +
> +        if not self.__ts_start_min:
> +            self.__ts_start_min = vals_start["ts"]
> +        else:
> +            self.__ts_start_min = min(self.__ts_start_min, vals_start["ts"])
> +
> +        if not self.__ts_start_min:
> +            self.__ts_end_max = vals["ts"]
> +        else:
> +            self.__ts_end_max = max(self.__ts_end_max, vals["ts"])
> +
> +        if vals["event"] == EType.LOCK:
> +            self.__plock(vals_start, vals)
> +        elif vals["event"] == EType.WAITER:
> +            self.__waiter(vals_start, vals)
> +        else:
> +            print("{}: unknown event type".format(vals["ts"]))
> +
> +    def add_shapes(self, fig):
> +        global legend_groups
> +        legend_groups = set()
> +
> +        # first waiters to be in background
> +        for wa in self.waiters:
> +            fig.add_trace(wa.create_shape())
> +
> +        # next layer to draw lock states
> +        for pl in self.plocks:
> +            fig.add_trace(pl.create_shape())
> +
> +    def get_ts_min(self):
> +        return self.__ts_start_min
> +
> +    def get_ts_max(self):
> +        return self.__ts_end_max
> +
> +    def __str__(self):
> +        return "ls {} inode {}".format(self.ls, self.number)
> +
> +    def __repr__(self):
> +        return str(self)
> +
> +parser = argparse.ArgumentParser(description="Plots dlm_controld lock databases")
> +parser.add_argument('-i', "--info", action="store_true")
> +parser.add_argument('-s', "--ts_start", type=float)
> +parser.add_argument('-e', "--ts_end", type=float)
> +
> +parser.add_argument('plock_logs', type=argparse.FileType('r'), nargs='+')
> +args = parser.parse_args()
> +
> +for f in args.plock_logs:
> +    lines = f.readlines()
> +
> +    # Current way to figure out our nodeid
> +    for line in lines:
> +        if "read plock" in line:
> +            lsplit = line.split()
> +            npo = lsplit[11].split('/')
> +            nodeid = int(npo[0])
> +            our_nodeid = nodeid
> +            break
> +
> +    if not our_nodeid:
> +        print("{}: could not determine our nodeid".format(f))
> +        continue
> +
> +    for line in lines:
> +        if "state plock" in line:
> +            pass
> +        elif "state waiter" in line:
> +            pass
> +        else:
> +            continue
> +
> +        vals = dict()
> +        lsplit = line.split()
> +        vals["ts"] = int(lsplit[3])
> +
> +        if args.ts_start and secs_to_nsecs(args.ts_start) > int(vals["ts"]):
> +            continue
> +
> +        if args.ts_end and secs_to_nsecs(args.ts_end) < int(vals["ts"]):
> +            break;
> +
> +        vals["ls"] = lsplit[4]
> +        vals["op"] = lsplit[7]
> +        vals["number"] = int(lsplit[8], 16)
> +        vals["ptr"] = lsplit[9]
> +
> +        db = lookup_lockdb(our_nodeid, vals["ls"], vals["number"])
> +        if "state plock" in line:
> +            vals["event"] = EType.LOCK
> +            if vals["op"] == "start":
> +                vals["mode"] = lsplit[10]
> +                se = lsplit[11].split('-')
> +                vals["start"] = int(se[0], 16)
> +                vals["end"] = int(se[1], 16)
> +                npo = lsplit[12].split('/')
> +                vals["nodeid"] = int(npo[0])
> +                vals["pid"] = int(npo[1])
> +                vals["owner"] = int(npo[2], 16)
> +                db.event_start(vals)
> +            elif vals["op"] =="end":
> +                db.event_end(vals)
> +            else:
> +                print("{}: unknown state plock operation: {}".format(vals["ts"], vals["op"]))
> +        elif "state waiter" in line:
> +            vals["event"] = EType.WAITER
> +            if vals["op"] == "start":
> +                se = lsplit[10].split('-')
> +                vals["start"] = int(se[0], 16)
> +                vals["end"] = int(se[1], 16)
> +                npo = lsplit[11].split('/')
> +                vals["nodeid"] = int(npo[0])
> +                vals["pid"] = int(npo[1])
> +                vals["owner"] = int(npo[2], 16)
> +                db.event_start(vals)
> +            elif vals["op"] == "end":
> +                db.event_end(vals)
> +            else:
> +                print("{}: unknown state waiter operation: {}".format(vals["ts"], vals["op"]))
> +        else:
> +            print("received unknown line event: {}".format(line))
> +
> +if args.info:
> +    ts_max = None
> +    ts_min = None
> +
> +    for nodeid in nodes:
> +        for ls in nodes[nodeid]:
> +            for number in nodes[nodeid][ls]:
> +                db = lookup_lockdb(nodeid, ls, number)
> +                if not ts_min:
> +                    ts_min = db.get_ts_min()
> +                else:
> +                    ts_min = min(ts_min, db.get_ts_min())
> +
> +                if not ts_max:
> +                    ts_max = db.get_ts_max()
> +                else:
> +                    ts_max = min(ts_max, db.get_ts_max())
> +
> +    print("global info:")
> +    print("\tts_start: {0:.3f} secs".format(nsecs_to_secs(ts_min)))
> +    print("\tts_end: {0:.3f} secs".format(nsecs_to_secs(ts_max)))
> +    print("\tduration: {0:.3f} secs".format(nsecs_to_secs(ts_max) - nsecs_to_secs(ts_min)))
> +
> +    print()
> +    print("per inode lock info:")
> +    for nodeid in nodes:
> +        print("nodeid: {}".format(nodeid))
> +        for ls in nodes[nodeid]:
> +            print("\tls: {}".format(ls))
> +            for number in nodes[nodeid][ls]:
> +                print("\t\tinode: {}".format(hex(number)))
> +                db = lookup_lockdb(nodeid, ls, number)
> +                print("\t\t\tts_start: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_min())))
> +                print("\t\t\tts_end: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_max())))
> +                print("\t\t\tduration: {0:.3f} secs".format(nsecs_to_secs(db.get_ts_max()) - nsecs_to_secs(db.get_ts_min())))
> +

I will add a "top 5" high score of the longest duration of contention
state... So if somebody is looking for long contention states it is
easy to find them with a fast parsing of the log files.

- Alex