From: kaike.wan@intel.com
To: dledford@redhat.com, jgg@nvidia.com
Cc: linux-rdma@vger.kernel.org, todd.rimmer@intel.com,
	Kaike Wan <kaike.wan@intel.com>
Subject: [PATCH RFC 2/9] RDMA/rv: Add the internal header files
Date: Fri, 19 Mar 2021 08:56:28 -0400	[thread overview]
Message-ID: <20210319125635.34492-3-kaike.wan@intel.com> (raw)
In-Reply-To: <20210319125635.34492-1-kaike.wan@intel.com>

From: Kaike Wan <kaike.wan@intel.com>

The two header files contain the defines, structures, macros,
function prototypes, and inline functions used in the module.

Signed-off-by: Todd Rimmer <todd.rimmer@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
---
 drivers/infiniband/ulp/rv/rv.h          | 892 ++++++++++++++++++++++++
 drivers/infiniband/ulp/rv/rv_mr_cache.h | 152 ++++
 2 files changed, 1044 insertions(+)
 create mode 100644 drivers/infiniband/ulp/rv/rv.h
 create mode 100644 drivers/infiniband/ulp/rv/rv_mr_cache.h

diff --git a/drivers/infiniband/ulp/rv/rv.h b/drivers/infiniband/ulp/rv/rv.h
new file mode 100644
index 000000000000..20ca65387d22
--- /dev/null
+++ b/drivers/infiniband/ulp/rv/rv.h
@@ -0,0 +1,892 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2020 - 2021 Intel Corporation.
+ */
+
+#ifndef __RV_H__
+#define __RV_H__
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cm.h>
+#include <rdma/rdma_cm.h>
+#include <linux/socket.h>
+#include <linux/timer.h>
+#include <rdma/ib_addr.h>
+#include <linux/moduleparam.h>
+
+#include "rv_mr_cache.h"
+#include <rdma/rv_user_ioctls.h>
+
+/*
+ * Lock Hierarchy
+ * Listed in the order in which locks may be acquired:
+ * mm->mmap_lock - held during mmap call and mmu_notifier_register
+ * rv_user.mutex
+ * rv_job_dev_list_mutex
+ * rv_job_dev.conn_list_mutex
+ * rv_device.listener_mutex
+ * rv_sconn.mutex - we can never hold this while calling destroy_cm_id
+ *	because destroy_cm_id will wait for handlers via its mutex and deadlock
+ * rv_job_dev_list rcu
+ * rv_dev_list_lock
+ * rv_job_dev.conn_list rcu - no sub locks
+ * rv_device.listener_lock - no sub locks
+ * rv_user.umrs.cache.lock - no sub locks
+ * rv_job_dev.user_array_lock - no sub locks
+ * ring.lock - no sub locks
+ * mr_pd_uobject_lock - no sub locks
+ * rv_conn.next_lock - no sub locks
+ * rv_sconn.drain_lock - no sub locks
+ */
+
+/*
+ * Our goal is to allocate shared resources, but all processes are equal peers.
+ * So we use a get_alloc approach where processes attempt to get a resource
+ * reference (lookup and get) and if it's not on the list, it is allocated and
+ * added.  This is used for rv_job_dev, rv_listener and rv_conn.
+ * Each such list is protected by a mutex to prevent duplicate additions
+ * and an RCU for actual list access.  The mutex also protects
+ * get_alloc/list_del races.  All get_alloc calls are in preemptible
+ * app ioctl call context.
+ * rv_listener list is rarely searched, so it simply uses a mutex and spinlock.
+ */
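+
+/*
+ * Illustrative sketch of the get_alloc pattern described above.  The helper
+ * names (find_rcu(), alloc()) are hypothetical; the real helpers are added
+ * in later patches:
+ *
+ *	mutex_lock(&list_mutex);		// prevents a duplicate add
+ *	obj = find_rcu(list, key);		// RCU lookup + kref get
+ *	if (!obj) {
+ *		obj = alloc(key);		// kref starts at 1
+ *		if (obj)
+ *			list_add_rcu(&obj->entry, list);
+ *	}
+ *	mutex_unlock(&list_mutex);
+ */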
+
+/*
+ * When working with IB CM, async events, and timeouts, we can't predict if and
+ * when their callbacks will occur so they can't have a reference held in
+ * advance. This presents opportunities for the callback to race with rv_conn
+ * destruction. To solve this problem, we use rv_conn_get_check to get a
+ * reference to rv_conn only if rv_conn.kref != 0. rv_conn and rv_sconn
+ * destruction uses destroy_cm_id and del_timer_sync to stop all callbacks (and
+ * allow any outstanding handlers to finish) before it proceeds to free rv_conn.
+ * This ensures the cm_id, etc can remain valid enough for the callback to
+ * call rv_conn_get_check and decide whether to simply ignore the event.
+ * It also allows the rv_sconn to be the ib_cm, CQ, QP and timer context and
+ * avoid searches. Once the callback has the reference, it is protected
+ * from other threads calling rv_conn_release.
+ */
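+
+/*
+ * Typical callback shape under this scheme (illustrative only; the actual
+ * handlers are added in later patches):
+ *
+ *	static int rv_cm_handler(struct ib_cm_id *id,
+ *				 const struct ib_cm_event *evt)
+ *	{
+ *		struct rv_sconn *sconn = id->context;
+ *
+ *		if (!rv_conn_get_check(sconn->parent))
+ *			return 0;	// rv_conn going away, ignore event
+ *		// ... handle the event while holding the reference ...
+ *		rv_conn_put(sconn->parent);
+ *		return 0;
+ *	}
+ */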
+
+#define DRIVER_NAME "rv"
+
+#define RV_INVALID -1
+
+/* For errors to surface on the console */
+#define rv_err(idx, fmt, ...) \
+	pr_err("[%s:%s %d]: " fmt, DRIVER_NAME, __func__, idx, ##__VA_ARGS__)
+
+#define rv_ptr_err(prefix, ptr, fmt, ...) \
+	pr_err("[%s:%s %s 0x%p]: " fmt, DRIVER_NAME, __func__, prefix, ptr, \
+	       ##__VA_ARGS__)
+
+/* For debugging */
+#define rv_dbg(idx, fmt, ...) \
+	pr_debug("[%s:%s %d]: " fmt, DRIVER_NAME, __func__, idx, ##__VA_ARGS__)
+
+/* For general console info */
+#define rv_info(idx, fmt, ...) \
+	pr_info("[%s:%s %d]: " fmt, DRIVER_NAME, __func__, idx, ##__VA_ARGS__)
+
+/* For debugging with any pointer */
+#define rv_ptr_dbg(prefix, ptr, fmt, ...) \
+	pr_debug("[%s:%s %s 0x%p]: " fmt, DRIVER_NAME, __func__, prefix, ptr, \
+		 ##__VA_ARGS__)
+
+#define rv_ptr_info(prefix, ptr, fmt, ...) \
+	pr_info("[%s:%s %s 0x%p]: " fmt, DRIVER_NAME, __func__, prefix, ptr, \
+		##__VA_ARGS__)
+
+/* For debugging with rv_conn_info */
+#define rv_conn_err(ptr, fmt, ...) rv_ptr_err("sconn", ptr, fmt, ##__VA_ARGS__)
+#define rv_conn_info(ptr, fmt, ...) rv_ptr_info("sconn", ptr, fmt,\
+						##__VA_ARGS__)
+#define rv_conn_dbg(ptr, fmt, ...) rv_ptr_dbg("sconn", ptr, fmt, ##__VA_ARGS__)
+
+/* For debugging with rv_device */
+#define rv_dev_err(ptr, fmt, ...) rv_ptr_err("dev", ptr, fmt, ##__VA_ARGS__)
+#define rv_dev_info(ptr, fmt, ...) rv_ptr_info("dev", ptr, fmt, ##__VA_ARGS__)
+#define rv_dev_dbg(ptr, fmt, ...) rv_ptr_dbg("dev", ptr, fmt, ##__VA_ARGS__)
+
+/* For debugging with ib cm id */
+#define rv_cm_err(ptr, fmt, ...) rv_ptr_err("cm_id", ptr, fmt, ##__VA_ARGS__)
+#define rv_cm_info(ptr, fmt, ...) rv_ptr_info("cm_id", ptr, fmt, ##__VA_ARGS__)
+#define rv_cm_dbg(ptr, fmt, ...) rv_ptr_dbg("cm_id", ptr, fmt, ##__VA_ARGS__)
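+
+/*
+ * Example use (illustrative): rv_err(rv->inx, "connect failed: %d\n", ret)
+ * prints "[rv:<function> <inx>]: connect failed: <ret>" at pr_err level.
+ */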
+
+struct rv_device;
+
+/*
+ * A listener can handle more than 1 job on a given dev.
+ * There is a listener for each unique service_id on each dev and
+ * could be as few as 1 even for multiple jobs.
+ * For IB, a listener services all ports on a given HCA.
+ * listener_entry - entry on rv_device.listener_list
+ * dev - device being listened on
+ * cm_id - actual IB CM listener, stores service_id here
+ */
+struct rv_listener {
+	struct list_head listener_entry;
+	struct rv_device *dev;
+	struct ib_cm_id *cm_id;
+	struct kref kref;
+};
+
+/*
+ * For each physical device, RV establishes a single rv_device which
+ * is shared across all jobs, listeners, etc.
+ * ib_dev - underlying RDMA device
+ * dev_entry - entry on rv_dev_list
+ *
+ * IB cm listener management:
+ *	listener_mutex, listener_lock, listener_list
+ *
+ * user_list - list of rv_user (protected by rv_dev_list_lock)
+ */
+struct rv_device {
+	struct ib_device *ib_dev;
+	struct list_head dev_entry;
+	struct kref kref;
+	struct ib_event_handler event_handler;
+
+	struct mutex listener_mutex; /* avoid duplicate add in get_alloc */
+	spinlock_t listener_lock; /* protect list search, add, remove */
+	struct list_head listener_list;
+
+	struct list_head user_list;
+};
+
+#define RV_CONN_MAX_ACTIVE_WQ_ENTRIES 100 /* used for conn handling */
+
+#define RV_RESOLVER_RETRY 5		/* max retries (0 == try once) */
+#define RV_RESOLVER_TIMEOUT 10000	/* in milliseconds */
+
+/* duration client spends in RV_DELAY before re-attempt reconnect */
+#define RV_RECONNECT_DELAY msecs_to_jiffies(100)
+
+/* private data in REQ and REP is used to negotiate features and exchange
+ * additional information.
+ * REQ.ver is the version of the sender and defines the priv_data structure.
+ * Newer versions of struct should be a superset of older versions.
+ * REP.ver is the version the listener is accepting and defines REP priv_data.
+ * New client, old listener:
+ *	listener accepts REQ >= its version, responds with its version in REP
+ * Old client, new listener:
+ *	listener accepts REQ, responds with REP matching client's version
+ * Protocol ver is now 2.  For simplicity protocol 0 and 1 (pre-releases) are
+ * not accepted.
+ */
+
+#define RV_PRIVATE_DATA_MAGIC 0x00125550534d2121ULL
+#define RV_PRIVATE_DATA_VER 2
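+
+/*
+ * Listener-side acceptance sketch for the rules above (illustrative only;
+ * how a rejected REQ is answered is not shown):
+ *
+ *	if (req->magic != RV_PRIVATE_DATA_MAGIC ||
+ *	    req->ver < RV_PRIVATE_DATA_VER)
+ *		goto reject;		// pre-release protocols 0 and 1
+ *	rep.magic = RV_PRIVATE_DATA_MAGIC;
+ *	rep.ver = min(req->ver, (u32)RV_PRIVATE_DATA_VER); // never above our own
+ */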
+
+/*
+ * Private data used in CM REQ
+ * laid out for good alignment of fields without packing
+ * index indicates index of rv_sconn within it's rv_conn and differentiates
+ * the multiple connection REQs for num_conn > 1
+ * The job_key selects the rv_job_dev on receiver side
+ * uid is 32b
+ */
+struct rv_req_priv_data {
+	u64 magic;
+	u32 ver;
+	u16 resv1;
+	u8 index;
+	u8 job_key_len;
+	u8 job_key[RV_MAX_JOB_KEY_LEN];
+	uid_t uid;
+};
+
+/*
+ * Private data used in CM REP
+ * laid out for good alignment of fields without packing
+ */
+struct rv_rep_priv_data {
+	u64 magic;
+	u32 ver;
+};
+
+/*
+ * RV_INIT - client side create_conn done
+ * RV_WAITING - server side create_conn done, listening
+ * RV_RESOLVING - client side resolving ethernet dmac via ARP
+ * RV_CONNECTING - client or server side going through CM states
+ * RV_CONNECTED - connection established
+ * RV_DISCONNECTING - connection teardown
+ * RV_DELAY- RV delay before restarting client connection
+ * RV_ERROR - connection lost or failed
+ * destroying - ref count == 0
+ */
+enum rv_sconn_state {
+	RV_INIT = 1,
+	RV_WAITING,
+	RV_RESOLVING,
+	RV_CONNECTING,
+	RV_CONNECTED,
+	RV_DISCONNECTING,
+	RV_DELAY,
+	RV_ERROR,
+};
+
+/*
+ * Meaning of each state and status of key fields once in state and
+ * have unlocked mutex.
+ * In all states (except while destroying) immutable fields in rv_conn are valid.
+ *
+ * WAS_CONNECTED flag essentially creates a superstate indicating connection
+ * recovery for WAITING, RESOLVING, and CONNECTING.  During connection
+ * recovery, conn_timer is running for reconnect_timeout and once it fires,
+ * the connection recovery is aborted and the sconn moves to ERROR.
+ *
+ * If WAS_CONNECTED, post write in wrong state returns EAGAIN instead of EINVAL
+ *
+ * A complication is identifying when a connection is down for a receiving
+ * end of a traffic pattern.  The receiver may see no packets and can't tell
+ * a path down from an idle QP.  To address this, a periodic zero-length RDMA
+ * Write can be sent if no traffic has been sent or received for a while.  This
+ * situation is no worse than user space UD PSM, as the job failure may be
+ * handled over an out of band network to kill the job.  A gap is that the
+ * client side of a connection may desire recovery, which requires the
+ * heartbeat to recognize the failure.
+ *
+ * If the receiver happens to also be the server side of rv_sconn, we may
+ * get a REQ while in connected because the sender may get a QP timeout
+ * long before the receiver heartbeat notices.  We treat this as disconnect
+ * and if appropriate (likely) begin connection recovery.
+ *
+ * RV_INIT: initial state for client side connections (after 1st create_conn)
+ *	cm_id, primary_path are NULL
+ *	dev_addr, resolver_retry_left  uninitialized
+ *	qp in RESET state, no outstanding CQEs nor WQEs
+ *	conn_timer not running (no reconnect_timeout)
+ *	delay_timer not running
+ *	hb_timer not running
+ *	Next States:
+ *		RESOLVING - user calls cm_connect
+ *		destroying - user close
+ * RV_WAITING: initial state for server side connections (after 1st create_conn)
+ *	a listener exists at rv_job_dev level, rv_cm_server_handler
+ *	cm_id NULL
+ *	start_time is set when enter state
+ *	qp in RESET state, no outstanding CQEs nor WQEs
+ *	if WAS_CONNECTED, conn_timer is running for reconnect_timeout
+ *	delay_timer not running
+ *	hb_timer not running
+ *	DRAINING, SQ_DRAINED and RQ_DRAINED flags clear
+ *	Next States:
+ *		CONNECTING - inbound REQ
+ *		ERROR - reconnect_timeout expires
+ *		destroying - user close
+ * RV_RESOLVING: 1st step in establishing a client connection (ARP)
+ *	For non-ethernet, this is a brief transient state (only inside mutex)
+ *	cm_id established (client side), rv_cm_handler
+ *	resolver_retry_left valid
+ *	primary_path != NULL, but contents incomplete
+ *		dmac, route_resolved, hop_limit uninitialized
+ *	dev_addr undefined
+ *	start_time is set when enter state
+ *	a rdma_resolve_ip callback is scheduled
+ *	qp in RESET state, no outstanding CQEs nor WQEs
+ *	if WAS_CONNECTED, conn_timer is running for reconnect_timeout
+ *	delay_timer not running
+ *	hb_timer not running
+ *	DRAINING, SQ_DRAINED and RQ_DRAINED flags clear
+ *	Next States:
+ *		CONNECTING - resolution successfully complete
+ *		ERROR - resolving hard fail or retry exceeded or connect timeout
+ *		RESOLVING - cb error and < retry limit
+ *		DELAY - cb error > retry limit and WAS_CONNECTED and reconnect
+ *		destroying - user close
+ * RV_CONNECTING: client or server connection in hands of IB CM
+ *	cm_id established (either side), rv_cm_handler
+ *	primary_path NULL
+ *	dev_addr valid on client side
+ *	resolver_retry_left undefined
+ *	start_time is set when enter state
+ *	CM may have a rep or rtu outstanding
+ *	qp progressing through RESET->INIT->RTR->RTS via IB CM states/events
+ *	if WAS_CONNECTED, conn_timer is running for reconnect_timeout
+ *	delay_timer not running
+ *	hb_timer not running
+ *	DRAINING, SQ_DRAINED and RQ_DRAINED flags clear
+ *	Next States:
+ *		CONNECTED - client gets REP, server gets RTU or Established
+ *		ERROR - REJ, REQ err, REP err, DREQ, QP event or connect timeout
+ *		DISCONNECTING - WAS_CONNECTED and REQ err, REJ, REP err, QP evt
+ *				or REQ
+ *		destroying - user close
+ * RV_CONNECTED: client or server connection usable
+ *	cm_id established (either side), rv_cm_handler
+ *	primary_path NULL
+ *	dev_addr valid on client side
+ *	resolver_retry_left undefined
+ *	no outbound CM operations outstanding
+ *	start_time is set when enter state
+ *	qp in RTS, has recv WQEs posted
+ *	qp may have outstanding CQEs and WQEs
+ *	conn_timer not running (no reconnect_timeout)
+ *	delay_timer not running
+ *	hb_timer may be running (client side only)
+ *	WAS_CONNECTED is set on entry
+ *	DRAINING, SQ_DRAINED and RQ_DRAINED flags clear
+ *	heartbeat timer running
+ *		record post SQ and RQ CQE counts when the timer starts
+ *		when it fires, if the counts are unchanged, send a 0 len RDMA
+ *		Write (see the sketch after this comment block)
+ *		TBD - set larger than timeout*retry or simply track SQ so
+ *			don't overflow SQ
+ *		TBD - stagger heartbeat sends on different sconns
+ *	Next States:
+ *		ERROR - no reconnect and get DREQ, QP event, REQ
+ *		DISCONNECTING - reconnect and get DREQ, QP event (send DREQ),
+ *				get REQ (remote end reconnecting)
+ *		destroying - user close
+ * RV_DISCONNECTING: client or server connection cleanup in prep for reconnect
+ *	cm_id established (either side), rv_cm_handler
+ *	primary_path NULL
+ *	dev_addr valid on client side
+ *	resolver_retry_left undefined
+ *	DREP or DREQ outbound CM operations may be outstanding
+ *	start_time is set when enter state
+ *	qp in ERROR, SCONN_DRAINING flag set,
+ *		- waiting for "drain indicator" WQEs' CQEs on RQ and SQ
+ *		- drain indicator CQEs set SQ_DRAINED or RQ_DRAINED
+ *	qp may have other outstanding CQEs and WQEs
+ *	if WAS_CONNECTED, conn_timer may be running for reconnect_timeout
+ *	delay_timer not running
+ *	hb_timer not running
+ *	Note: we do not depend on DREP and ignore DREQ Err or DREP Err
+ *		path to remote node may be down
+ *	Next States:
+ *		ERROR - unable to queue CM_id destroy or reset QP
+ *			or connect timeout
+ *		DELAY - SQ & RQ drain finish for client
+ *		WAITING - SQ & RQ drain finish for server (reset QP,
+ *				cm_id NULL and queued for destroy)
+ *		destroying - user close
+ * RV_DELAY: client reconnection delay
+ *	cm_id established, rv_cm_handler
+ *	primary_path NULL
+ *	dev_addr valid on client side
+ *	resolver_retry_left undefined
+ *	DREP or DREQ outbound CM operations may be outstanding on prior cm_id
+ *	start_time is set when enter state
+ *	qp in RESET
+ *	prior qp has no outstanding CQEs and WQEs
+ *	if WAS_CONNECTED, conn_timer may be running for reconnect_timeout
+ *	delay_timer is running
+ *	hb_timer not running
+ *	DRAINING, SQ_DRAINED and RQ_DRAINED flags clear
+ *	while in this state all other errors are ignored
+ *	Next States:
+ *		RESOLVING - timer expires
+ *		ERROR - reconnect timeout
+ *		destroying - user close
+ * RV_ERROR: terminal state for lost/failed connection
+ *	may enter from any state
+ *	cm_id may be NULL or established with rv_cm_handler
+ *	CM may have an outstanding message
+ *		typically REJ, DREQ or DREP, but could be REQ, REP or RTU
+ *	primary_path NULL
+ *	dev_addr, resolver_retry_left undefined
+ *	no rdma_resolve_ip callback scheduled (cancel when enter RV_ERROR)
+ *	qp in ERROR, SCONN_DRAINING flag set or qp in RESET or no QP
+ *	SQ_DRAINED and RQ_DRAINED flags may be progressing on drain
+ *	qp may have outstanding CQEs and WQEs
+ *	conn_timer not running (no reconnect_timeout)
+ *	delay_timer not running
+ *	hb_timer not running
+ *	Next States:
+ *		destroying - user close
+ * destroying: rv_conn kref is 0
+ *	may enter from any state
+ *	on entry:
+ *		cm_id may be NULL or established with rv_cm_handler
+ *		a rdma_resolve_ip callback may be scheduled
+ *		qp may have outstanding CQEs and WQEs and callbacks active
+ *		conn_timer may be running (reconnect_timeout or reconnect delay)
+ *	all fields (including immutable) will be released via rv_sconn_deinit
+ *	no way out other than kfree of parent rv_conn
+ *	may preempt, does not require mutex (kref==0 protects races with cb)
+ *	Next States:
+ *		N/A - conn/sconn freed
+ *
+ * typical client side state sequence:
+ *	RV_INIT->RV_RESOLVING->RV_CONNECTING->RV_CONNECTED->destroying
+ * typical server side state sequence:
+ *	RV_WAITING->RV_CONNECTING->RV_CONNECTED->destroying
+ *
+ * typical client side recovery state sequence:
+ *	...->RV_CONNECTED->DISCONNECTING->DELAY->RV_RESOLVING->...
+ * typical server side recovery state sequence:
+ *	...->RV_CONNECTED->DISCONNECTING->WAITING->...
+ */
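+
+/*
+ * Heartbeat idle-check sketch for RV_CONNECTED (illustrative only; the
+ * counter choice and post_heartbeat_write() are assumptions, not rv API):
+ *
+ *	// in the hb_work handler, under sconn->mutex
+ *	u64 now = <sum of SQ and RQ CQE counters>;
+ *
+ *	if (now == sconn->act_count)
+ *		// idle since last check: post a 0 length RDMA Write (hb_cqe)
+ *		post_heartbeat_write(sconn);
+ *	sconn->act_count = now;
+ *	mod_timer(&sconn->hb_timer,
+ *		  jiffies + msecs_to_jiffies(jdev->hb_interval));
+ */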
+
+/*
+ * rv_sconn.flags bit numbers
+ *	RV_SCONN_SERVER	- server vs client side (immutable)
+ *	RV_SCONN_WAS_CONNECTED - was RV_CONNECTED at least once
+ *	RV_SCONN_DRAINING - started draining in DISCONNECTING
+ *	RV_SCONN_SQ_DRAINED - SQ drained in DISCONNECTING
+ *	RV_SCONN_RQ_DRAINED - RQ drained in DISCONNECTING
+ *	RV_SCONN_ASYNC - got async event
+ */
+#define RV_SCONN_SERVER		0
+#define RV_SCONN_WAS_CONNECTED	1
+#define RV_SCONN_DRAINING	2
+#define RV_SCONN_SQ_DRAINED	3
+#define RV_SCONN_RQ_DRAINED	4
+
+/*
+ * rv_sconn.stats.cm_evt_cnt[] classify events > CM_APR_RECEIVED as unexpected.
+ * Beyond that are only SIDR events
+ */
+#define RV_CM_EVENT_MAX IB_CM_APR_RECEIVED
+#define RV_CM_EVENT_UNEXP ((enum ib_cm_event_type)(RV_CM_EVENT_MAX + 1))
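+
+/*
+ * Event counting sketch for the classification above (illustrative only):
+ *
+ *	enum ib_cm_event_type ev = event->event;
+ *
+ *	if (ev > RV_CM_EVENT_MAX)
+ *		ev = RV_CM_EVENT_UNEXP;		// SIDR and anything newer
+ *	sconn->stats.cm_evt_cnt[ev]++;
+ */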
+
+/*
+ * a single QP/connection
+ * mutex - protects state driven fields
+ *
+ * These are set once at create and can be read without lock
+ *	index - unique index for connection within rv_conn
+ *	qp, send_cq, recv_cq, max_send_wr (from ib_qp_cap)
+ *	cqe - for recv completions
+ *
+ * Fields below require the mutex and their validity depends on the current
+ * value of rv_sconn.state and the RV_SCONN_SERVER flag.
+ *
+ * Basic fields:
+ *	state, flags
+ *	start_time - when started waiting, resolving or connecting
+ *	cm_id - our actual connection
+ *	path - from PSM.  Consistency checked on listener, for client connect
+ *		we use path.dlid != 0 to identify if path has been initialized
+ *
+ * Client only fields:
+ *	resolver_retry_left, primary_path, dev_addr
+ *	For RoCE our resolver step fills in dev_addr with resolved RoCE HW
+ *	addresses (aka MAC address)
+ *
+ * Async QP Draining,
+ *	drain_lock - protects these and enter;test of RV_SCONN_*DRAIN* flags
+ *	drain_lock, rdrain_cqe, sdrain_cqe, drain_work (for drain done handler)
+ *	drain_timer - for drain timeout
+ *	done_wr_list - most recent completed pend_wr's
+ *	done_wr_count - number of entries on list
+ *
+ * reconnect_timeout timer: conn_timer, timer_work
+ * RV_DELAY timer: delay_timer, delay_work
+ * Heartbeat: hb_timer, act_count (activity since last hb), hb_cqe, hb_work
+ *
+ * Stats:
+ *	all but atomic CQE stats protected by rv_sconn.mutex
+ * connection:
+ *	*_time values are in microseconds
+ *	max_* is the largest observed for all reconnects on this sconn
+ *	cm_evt_cnt - extra +1 to include EVENT_MAX and +1 for UNEXP
+ *	stats for each CM packet explicitly send (req, rep, rtu, dreq, drep)
+ *	initial connect: wait_time, resolve_time, connect_time
+ *		connect time does not include wait_time nor resolve_time
+ *		resolve - attempts, resolve_fail - unexpected local issues
+ *	connected_time - total time connected (after initial + after recovery)
+ *	conn_recovery - # times recovered connection
+ *	connect recovery: rewait_time, reresolve_time, reconnect_time
+ *		reconnect time does not include wait_time nor resolve_time
+ *		reresolve - attempts, reresolve_fail - unexpected local issues
+ * data movement:
+ *	post_* is SQ posts (success, fail, payload byte count)
+ *	outstand_send_write - current send writes waiting for CQE
+ *	send_write_cqe_* is SQ cqes (RDMA Write w/Immediate)
+ *	recv_write_cqe_* is RQ cqes (RDMA Write w/Immediate)
+ *	recv_cqe_fail - RQ CQEs with bad status (opcode undefined)
+ *	*_hb_* - heartbeat
+ */
+
+struct rv_sconn {
+	struct mutex mutex; /* lock for state driven fields */
+	u8 index;
+	struct ib_qp *qp;
+	struct ib_cq *send_cq;
+	struct ib_cq *recv_cq;
+	struct ib_cqe cqe;
+	struct rv_conn *parent;
+
+	unsigned long flags;
+	enum rv_sconn_state state;
+	ktime_t start_time;
+	struct ib_cm_id *cm_id;
+	struct ib_user_path_rec path;
+
+	u32 resolver_retry_left;
+	struct sa_path_rec *primary_path;
+	struct rdma_dev_addr dev_addr;
+
+	/* protects these & enter;test RV_SCONN_*DRAIN* flags */
+	spinlock_t drain_lock;
+	struct ib_cqe rdrain_cqe;
+	struct ib_cqe sdrain_cqe;
+	struct work_struct drain_work;
+	struct timer_list drain_timer;
+
+	struct timer_list conn_timer;
+	struct work_struct timer_work;
+
+	struct timer_list delay_timer;
+	struct work_struct delay_work;
+
+	struct timer_list hb_timer;
+	u64 act_count;
+	struct ib_cqe hb_cqe;
+	struct work_struct hb_work;
+
+	struct {
+		u32 cm_evt_cnt[RV_CM_EVENT_MAX + 2];
+		u32 req_sent;
+		u32 rep_sent;
+		u32 rtu_sent;
+		u32 rej_sent;
+		u32 dreq_sent;
+		u32 drep_sent;
+		u64 wait_time;
+		u64 resolve_time;
+		u64 connect_time;
+		u64 connected_time;
+		u32 resolve;
+		u32 resolve_fail;
+		u32 conn_recovery;
+		u64 rewait_time;
+		u64 reresolve_time;
+		u64 reconnect_time;
+		u64 max_rewait_time;
+		u64 max_reresolve_time;
+		u64 max_reconnect_time;
+		u32 reresolve;
+		u32 reresolve_fail;
+		u64 post_write;
+		u64 post_write_fail;
+		u64 post_write_bytes;
+		u64 post_hb;
+		u64 post_hb_fail;
+
+		atomic_t outstand_send_write;
+		atomic64_t send_write_cqe;
+		atomic64_t send_write_cqe_fail;
+		atomic64_t recv_write_cqe;
+		atomic64_t recv_write_bytes;
+		atomic64_t recv_cqe_fail;
+		atomic64_t send_hb_cqe;
+		atomic64_t send_hb_cqe_fail;
+		atomic64_t recv_hb_cqe;
+	} stats;
+};
+
+/*
+ * A load-balanced multi-QP connection that uses multiple underlying
+ * connections and is shared by multiple rv_user's.
+ * num_conn, jdev, ah and rem_addr are immutable (set once at create)
+ * Entry in rv_job_dev.conn_list: conn_entry, rcu
+ * sconn round robin IO: next_lock, next
+ *	next_lock also protects read;inc of sconn->stats.outstand_send_write
+ */
+struct rv_conn {
+	u8 num_conn;
+	struct rv_job_dev *jdev;
+	struct ib_uverbs_ah_attr ah;
+	u32 rem_addr;
+
+	struct list_head conn_entry;
+	struct rcu_head rcu;
+
+	struct kref kref;
+	struct work_struct put_work;
+
+	spinlock_t next_lock; /* protect rv_conn.next & read;inc outstand_wr */
+	u8 next;
+	struct work_struct free_work;
+
+	struct rv_sconn sconn_arr[];
+};
+
+/*
+ * from IBTA 1.4 Vol1 section A3.2.3.4.  Externally Administered Service IDs
+ * 1h:OUI:OUI:OUI:hh:hh:hh:hh
+ * using Intel 00:12:55 OUI
+ */
+#define RV_DFLT_SERVICE_ID 0x1000125500000001ULL
+
+/*
+ * the set of MRs registered by a single rv_user (on a single NIC)
+ * These are cached for efficiency.
+ * When using kernel rendezvous QPs (e.g. rv_conn) these MRs are
+ * registered with the rv_job_dev.pd.
+ * When using user space QPs, we register with the user supplied pd.
+ * The cache has its own lock.
+ * jdev, rv_inx, CQs, QP set once at alloc
+ * need parent rv_user.mutex for: cqe, post REG_MR to QP (doit_reg_mem), stats
+ */
+struct rv_user_mrs {
+	struct rv_mr_cache cache;
+	struct rv_job_dev *jdev;
+	int rv_inx; /* our creator, for logging */
+
+	struct kref kref;
+	struct work_struct put_work;
+
+	struct {
+		u64 failed;	/* cache miss and failed to register */
+	} stats;
+};
+
+/*
+ * Resources shared among all rv_user's using a given NIC port within a job
+ * for a given kuid.  On the wire we use uid to limit jobs to a single user.
+ * This approach has similarities to the hfi1 and Omni-Path jkey concept
+ * but we are not restricted to the 16b HW jkey field here, so we use all
+ * 32b of uid on the wire.
+ *
+ * There will typically be 1-8 NICs per node and 1-2 concurrent jobs
+ * with one rv_user per CPU core per NIC.
+ *
+ * kref tracks total jdev references while user_kref is the subset of
+ * references representing attached rv_user objects. user_kref <= kref.
+ * To get_alloc a jdev we must successfully get both. Once user_kref
+ * is zero, the jdev is removed from rv_job_dev_list and future attach
+ * or inbound REQ processing will not find it anymore.  At this point it is
+ * destined for destruction as soon as the remaining callbacks with a reference
+ * occur/finish. jdev get_alloc searches which see (and skip) jdevs with
+ * user_kref==0 but kref>0 only happen when a new job starts without a unique
+ * job key, as the previous job is cleaning up. Such cases are rare.
+ * By decoupling the jobs into different jdev's the new job may also have
+ * different attach and connection params.
+ *
+ * These fields are immutable and can be accessed without a lock:
+ *	kuid, uid, dev, pd, dev_name, port_num,
+ *	num_conn - number of rv_sconn per rv_conn
+ *	index_bits - number of high bits of RDMA immed data to hold rv index
+ *	loc_gid_index - SGID - only used on client rv_sconn
+ *	loc_addr - abstraction of address compared with rem_addr to select
+ *		client/server mode for each rv_sconn at conn_create time.
+ *	loc_gid - SGID - to double check loc_gid_index is still the same GID
+ *	service_id, q_depth
+ *	qp_depth - max send and recv WQEs to use (N/A space for drain)
+ *	reconnect_timeout (seconds), hb_interval (milliseconds),
+ *	sgid_attr - local NIC gid & address for use by resolver
+ *	max_users - 1<<index_bits
+ *
+ * job_dev_entry, rcu - entry on rv_job_dev_list
+ * conn_list - list of shared rv_conn, protected by RCU and conn_list_mutex
+ *	conn_list_mutex prevents duplicate add in get_alloc
+ * listener - created and shared when 1st server rv_conn is added
+ * user_array[max_users] - rv_users sharing this jdev. user_array_lock protects
+ *	rv_user.index - subscript to this array for given rv_user so
+ *		RDMA recv immediate data can simply index to find which
+ *		rv_user to deliver the completion to (see the sketch below
+ *		the struct)
+ *	user_array_next - where to start next search for free slot
+ */
+struct rv_job_dev {
+	uid_t uid;
+	kuid_t kuid;
+	struct rv_device *dev;
+	struct ib_pd *pd;
+	char dev_name[RV_MAX_DEV_NAME_LEN];
+	u8 port_num;
+	u8 num_conn;
+	u8 index_bits;
+	u16 loc_gid_index;
+	u32 loc_addr;
+	u8 loc_gid[16];
+	u8 job_key[RV_MAX_JOB_KEY_LEN];
+	u8 job_key_len;
+	u64 service_id;
+	u32 q_depth;
+	u32 qp_depth;
+	u32 reconnect_timeout;
+	u32 hb_interval;
+	const struct ib_gid_attr *sgid_attr;
+	int max_users;
+
+	struct kref kref;
+	struct list_head job_dev_entry;
+	struct rcu_head rcu;
+
+	struct mutex conn_list_mutex; /* prevent duplicate add in get_alloc */
+	struct list_head conn_list;
+	struct rv_listener *listener;
+
+	spinlock_t user_array_lock;/* protects add/remove from user_array */
+	u32 user_array_next;
+	struct kref user_kref;
+	struct rv_user *user_array[];
+};
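+
+/*
+ * Sketch of the immediate-data demultiplexing implied by index_bits and
+ * user_array above (illustrative only; the exact bit layout is an
+ * assumption, not defined by this header):
+ *
+ *	u32 index = imm_data >> (32 - jdev->index_bits);
+ *	struct rv_user *rv = jdev->user_array[index];	// index < max_users
+ */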
+
+/*
+ * rv_user represents a single open fd from a user
+ * In multi-rail a process may have multiple rv_user (separate open/close)
+ *
+ * mutex - prevents concurrent ATTACH ioctl and protects conn_list
+ *	also protects doit_reg vs self and vs doit_dreg races
+ * attached - set last during ATTACH after dev/jdev, rdma_mode and cq_entries
+ *	have been set.  We have no detach.
+ * inx - immutable ID assigned to rv_user strictly for logging
+ * rdma_mode - indicates USER (MRs only) or KERNEL (jdev, conns, etc) attach
+ *	For rdma_mode KERNEL these are also valid:
+ *		context, cq_entries
+ *		index - rv_user index within rv_job_dev.user_array[]
+ *		cqr - event ring to deliver send and recv RDMA completion
+ *			events to PSM (only if ATTACH.cq_entries!=0)
+ * rv_user_mrs  - MRs registered by this rv_user
+ * conn_xa - ID indexed list of rv_conn (entries assigned on create_conn)
+ *	ID 0 is reserved, PSM uses this value to indicate uninitialized rv intf
+ *	rv_user.mutex protects, so no need to use xas_lock.
+ * user_entry - entry in rv_device.user_list
+ * compl - completion for detach
+ */
+struct rv_user {
+	struct mutex mutex; /* single thread most actions for a user */
+
+	int inx;
+
+	u8 rdma_mode;
+	u8 attached;
+	u8 was_attached;
+	union {
+		struct rv_device *dev;
+		struct rv_job_dev *jdev;
+	};
+
+	u64 context;
+	u32 cq_entries;
+	u16 index;
+	struct rv_user_ring *cqr;
+	struct rv_user_mrs *umrs;
+
+	struct xarray conn_xa;
+
+	struct list_head user_entry;
+	struct completion compl;
+};
+
+/*
+ * an event ring for use by a single rv_user
+ * allows events to be efficiently passed from rv to PSM for PSM polling
+ * Immutable fields set on alloc:
+ *	rv_inx - our creator's rv_user.inx (for logging)
+ *	num_entries, page, order
+ * lock - protects kernel posting to ring and stats
+ * hdr - mmapped into PSM
+ * stats - index for each is rv_event_type excluding RV_TEST_EVENT
+ */
+struct rv_user_ring {
+	int rv_inx;
+	u32 num_entries;
+	unsigned long page;
+	unsigned int order;
+
+	spinlock_t lock; /* protects posting to ring and stats */
+	struct rv_ring_header *hdr;
+	struct {
+		u64 cqe[2];
+		u64 cqe_fail[2];
+		u64 bytes[2];
+	} stats;
+};
+
+/*
+ * an inflight RDMA write on behalf of an rv_user
+ *
+ * user_index - rv_user index within rv_job_dev.user_array[]
+ * umrs - MR cache holding the local MR
+ * mrc, loc_addr - local MR (with lkey), source memory address for RDMA Write
+ * rkey, rem_addr - dest memory address for RDMA Write
+ * wr_id - for completion event to PSM (supplied by PSM)
+ * done_wr_entry - Entry in rv_sconn.done_wr_list
+ */
+struct rv_pend_write {
+	struct ib_cqe cqe;
+	u16 user_index;
+	struct rv_user_mrs *umrs;
+	struct rv_sconn *sconn;
+
+	struct rv_mr_cached *mrc;
+	u64 loc_addr;
+	u32 rkey;
+	u64 rem_addr;
+	size_t length;
+	u32 immed;
+	u64 wr_id;
+};
+
+extern unsigned int enable_user_mr;
+
+/* Prototypes */
+struct rv_device *rv_device_get_add_user(char *dev_name, struct rv_user *rv);
+void rv_device_get(struct rv_device *dev);
+void rv_device_put(struct rv_device *dev);
+int rv_device_del_user(struct rv_user *rv);
+
+void rv_listener_get(struct rv_listener *listener);
+void rv_listener_put(struct rv_listener *listener);
+struct rv_listener *rv_listener_get_alloc(struct rv_device *dev,
+					  u64 service_id,
+					  ib_cm_handler handler);
+
+int rv_file_init(void);
+void rv_file_uninit(void);
+void rv_queue_work(struct work_struct *work);
+void rv_queue_work2(struct work_struct *work);
+void rv_flush_work2(void);
+void rv_queue_work3(struct work_struct *work);
+
+int doit_reg_mem(struct rv_user *rv, unsigned long arg);
+int doit_dereg_mem(struct rv_user *rv, unsigned long arg);
+void rv_user_mrs_get(struct rv_user_mrs *umrs);
+void rv_user_mrs_put(struct rv_user_mrs *umrs);
+struct rv_user_mrs *rv_user_mrs_alloc(struct rv_user *rv, u32 cache_size);
+void rv_user_mrs_attach(struct rv_user *rv);
+int rv_drv_api_dereg_mem(struct mr_info *mr);
+
+int rv_drv_prepost_recv(struct rv_sconn *sconn);
+void rv_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+void rv_report_cqe_error(struct ib_cq *cq, struct ib_wc *wc,
+			 struct rv_sconn *sconn, const char *opname);
+int doit_post_rdma_write(struct rv_user *rv, unsigned long arg);
+
+static inline struct rv_conn *user_conn_find(struct rv_user *rv, u64 handle)
+{
+	return xa_load(&rv->conn_xa, handle);
+}
+
+void rv_conn_put(struct rv_conn *conn);
+int rv_conn_get_check(struct rv_conn *conn);
+void rv_conn_get(struct rv_conn *conn);
+int doit_conn_create(struct rv_user *rv, unsigned long arg);
+int doit_conn_connect(struct rv_user *rv, unsigned long arg);
+int doit_conn_connected(struct rv_user *rv, unsigned long arg);
+int doit_conn_get_conn_count(struct rv_user *rv, unsigned long arg);
+int doit_conn_get_stats(struct rv_user *rv, unsigned long arg);
+int cmp_gid(const void *gid1, const void *gid2);
+
+void rv_job_dev_get(struct rv_job_dev *jdev);
+void rv_job_dev_put(struct rv_job_dev *jdev);
+
+static inline bool rv_job_dev_has_users(struct rv_job_dev *jdev)
+{
+	return kref_read(&jdev->user_kref);
+}
+
+static inline bool rv_jdev_protocol_roce(const struct rv_job_dev *jdev)
+{
+	return rdma_protocol_roce(jdev->dev->ib_dev, jdev->port_num);
+}
+
+struct rv_sconn *
+rv_find_sconn_from_req(struct ib_cm_id *id,
+		       const struct ib_cm_req_event_param *param,
+		       struct rv_req_priv_data *priv_data);
+
+void rv_detach_user(struct rv_user *rv);
+
+#endif /* __RV_H__ */
diff --git a/drivers/infiniband/ulp/rv/rv_mr_cache.h b/drivers/infiniband/ulp/rv/rv_mr_cache.h
new file mode 100644
index 000000000000..6d36ff11b5b6
--- /dev/null
+++ b/drivers/infiniband/ulp/rv/rv_mr_cache.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2020 - 2021 Intel Corporation.
+ */
+
+#ifndef __RV_MR_CACHE_H__
+#define __RV_MR_CACHE_H__
+
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/interval_tree_generic.h>
+
+#define MAX_RB_SIZE 256 /* in MB */
+#define RV_RB_MAX_ACTIVE_WQ_ENTRIES 5
+
+/*
+ * The MR cache holds registered MRs and tracks reference counts for each.
+ * Entries with a refcount==0 may remain in the cache and on an lru_list.
+ * If the MMU notifier indicates pages would like to be freed, the entry
+ * will be removed from the cache if its refcount==0.  Otherwise there
+ * are IOs in flight (app should not free memory for buffers with IOs in flight)
+ * and the MMU notifier is not allowed to free the pages.
+ * If a new cache entry is needed (cache miss), entries will be evicted, oldest
+ * to newest based on the lru_list, until there is space for the new entry.
+ *
+ * max_size - limit allowed for total_size in bytes, immutable
+ * ops_arg - owner context for all ops calls, immutable
+ * mn - MMU notifier
+ * lock - protects the RB-tree, lru_list, del_list, total_size, and stats
+ * root - an RB-tree with an interval based lookup
+ * total_size - current bytes in the cache
+ * ops - owner callbacks for major cache events
+ * mm - for MMU notifier
+ * lru_list - ordered list, most recently used to least recently used
+ * del_work, del_list, wq - handle deletes on a work queue
+ *
+ * Statistics:
+ *	max_cache_size - max bytes in the cache
+ *	count - Current number of MRs in the cache
+ *	max_count - Maximum of count
+ *	inuse - Current number of MRs with refcount > 0
+ *	max_inuse - Maximum of inuse
+ *	inuse_bytes - number of bytes with refcount > 0
+ *	max_inuse_bytes - Maximum of inuse_bytes
+ *	max_refcount - Maximum of refcount for any MR
+ *	hit - Cache hit
+ *	miss - Cache miss and added
+ *	full - Cache miss and can't add since full
+ *	evict - Removed due to lack of cache space
+ *	remove - Refcount==0 & removed by MMU notifier event
+ */
+struct rv_mr_cache {
+	u64 max_size;
+	void *ops_arg;
+	struct mmu_notifier mn;
+	spinlock_t lock; /* See above */
+	struct rb_root_cached root;
+	u64 total_size;
+	const struct rv_mr_cache_ops *ops;
+	struct mm_struct *mm;
+	struct list_head lru_list;
+	struct work_struct del_work;
+	struct list_head del_list;
+	struct workqueue_struct *wq;
+
+	struct {
+		u64 max_cache_size;
+		u32 count;
+		u32 max_count;
+		u32 inuse;
+		u32 max_inuse;
+		u64 inuse_bytes;
+		u64 max_inuse_bytes;
+		u32 max_refcount;
+		u64 hit;
+		u64 miss;
+		u64 full;
+		u64 evict;
+		u64 remove;
+	} stats;
+};
+
+/*
+ * basic info about an MR
+ * ib_pd - converted from user version
+ * fd - converted from user provided cmd_fd
+ */
+struct mr_info {
+	struct ib_mr *ib_mr;
+	struct ib_pd *ib_pd;
+	struct fd fd;
+};
+
+/* an MR entry in the MR cache RB-tree */
+struct rv_mr_cached {
+	struct mr_info mr;
+	u64 addr;
+	u64 len;
+	u32 access;
+	u64 __last;
+	atomic_t refcount;
+	struct rb_node node;
+	struct list_head list;
+};
+
+/* callbacks for each major cache event */
+struct rv_mr_cache_ops {
+	bool (*filter)(struct rv_mr_cached *mrc, u64 addr, u64 len, u32 acc);
+	void (*get)(struct rv_mr_cache *cache,
+		    void *ops_arg, struct rv_mr_cached *mrc);
+	int (*put)(struct rv_mr_cache *cache,
+		   void *ops_arg, struct rv_mr_cached *mrc);
+	int (*invalidate)(struct rv_mr_cache *cache,
+			  void *ops_arg, struct rv_mr_cached *mrc);
+	int (*evict)(struct rv_mr_cache *cache,
+		     void *ops_arg, struct rv_mr_cached *mrc,
+		     void *evict_arg, bool *stop);
+};
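+
+/*
+ * Example filter (illustrative; one plausible policy, not necessarily the
+ * one rv implements): count an entry as a hit only if it fully covers the
+ * request with compatible access flags:
+ *
+ *	static bool example_filter(struct rv_mr_cached *mrc, u64 addr,
+ *				   u64 len, u32 acc)
+ *	{
+ *		return mrc->addr <= addr &&
+ *		       addr + len <= mrc->addr + mrc->len &&
+ *		       (mrc->access & acc) == acc;
+ *	}
+ */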
+
+void rv_mr_cache_update_stats_max(struct rv_mr_cache *cache,
+				  int refcount);
+
+int rv_mr_cache_insert(struct rv_mr_cache *cache, struct rv_mr_cached *mrc);
+
+void rv_mr_cache_evict(struct rv_mr_cache *cache, void *evict_arg);
+
+struct rv_mr_cached *rv_mr_cache_search_get(struct rv_mr_cache *cache,
+					    u64 addr, u64 len, u32 acc,
+					    bool update_hit);
+struct rv_mr_cached *rv_mr_cache_search_put(struct rv_mr_cache *cache,
+					    u64 addr, u64 len, u32 acc);
+void rv_mr_cache_put(struct rv_mr_cache *cache, struct rv_mr_cached *mrc);
+
+int rv_mr_cache_init(int rv_inx, struct rv_mr_cache *cache,
+		     const struct rv_mr_cache_ops *ops, void *priv,
+		     struct mm_struct *mm, u32 cache_size);
+void rv_mr_cache_deinit(int rv_inx, struct rv_mr_cache *cache);
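+
+/*
+ * Typical caller flow (illustrative only; the registration and error
+ * handling steps are sketched, not actual rv calls):
+ *
+ *	mrc = rv_mr_cache_search_get(cache, addr, len, acc, true);
+ *	if (!mrc) {				// miss: register and insert
+ *		mrc = <allocate, register MR, fill in addr/len/access>;
+ *		if (rv_mr_cache_insert(cache, mrc))
+ *			<deregister and fail>;	// e.g. cache full
+ *	}
+ *	// ... use mrc->mr for RDMA ...
+ *	rv_mr_cache_put(cache, mrc);	// entry may stay cached at refcount 0
+ */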
+
+/*
+ * evict operation argument
+ * cleared - count evicted so far in bytes
+ * target - target count to evict in bytes
+ */
+struct evict_data {
+	u64 cleared;
+	u64 target;
+};
+
+#endif /* __RV_MR_CACHE_H__ */
-- 
2.18.1

