Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
 "A quiet cycle after the larger 5.8 effort. Substantially cleanup and
  driver work with a few smaller features this time.

   - Driver updates for hfi1, rxe, mlx5, hns, qedr, usnic, bnxt_re

   - Removal of dead or redundant code across the drivers

   - RAW resource tracker dumps now include a device-specific data blob
     for device objects, to aid device debugging

   - Further advance the IOCTL interface, remove the ability to turn it
     off. Add QUERY_CONTEXT, QUERY_MR, and QUERY_PD commands

   - Remove stubs related to devices with no pkey table

   - A shared CQ scheme to allow multiple ULPs to share the CQ rings of
     a device for higher performance (a short usage sketch follows this
     message)

   - Several more static-checker, syzkaller, and rare-crasher bugs fixed"
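
The shared CQ scheme mentioned above is exposed to kernel ULPs as a per-device
CQ pool. As a minimal sketch only, assuming the ib_cq_pool_get()/ib_cq_pool_put()
helpers exported by the core this cycle, a ULP borrows completion-queue space
from the pool instead of creating a private CQ per queue pair (the
ulp_get_shared_cq/ulp_put_shared_cq names are illustrative, not part of the
series):

	#include <rdma/ib_verbs.h>

	/*
	 * Borrow nr_cqe entries worth of CQ from the device-wide pool so
	 * several ULPs can share the same CQ ring and interrupt vector.
	 * Returns an ERR_PTR() on failure.
	 */
	static struct ib_cq *ulp_get_shared_cq(struct ib_device *dev,
					       unsigned int nr_cqe)
	{
		/* the vector hint spreads users across completion vectors */
		return ib_cq_pool_get(dev, nr_cqe, 0, IB_POLL_SOFTIRQ);
	}

	static void ulp_put_shared_cq(struct ib_cq *cq, unsigned int nr_cqe)
	{
		/* hand the entries back to the pool; the CQ itself lives on */
		ib_cq_pool_put(cq, nr_cqe);
	}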

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (121 commits)
  RDMA/mlx5: Fix flow destination setting for RDMA TX flow table
  RDMA/rxe: Remove pkey table
  RDMA/umem: Add a schedule point in ib_umem_get()
  RDMA/hns: Fix the unneeded process when getting a general type of CQE error
  RDMA/hns: Fix error during modify qp RTS2RTS
  RDMA/hns: Delete unnecessary memset when allocating VF resource
  RDMA/hns: Remove redundant parameters in set_rc_wqe()
  RDMA/hns: Remove support for HIP08_A
  RDMA/hns: Refactor hns_roce_v2_set_hem()
  RDMA/hns: Remove redundant hardware opcode definitions
  RDMA/netlink: Remove CAP_NET_RAW check when dump a raw QP
  RDMA/include: Replace license text with SPDX tags
  RDMA/rtrs: remove WQ_MEM_RECLAIM for rtrs_wq
  RDMA/rtrs-clt: add an additional random 8 seconds before reconnecting
  RDMA/cma: Execute rdma_cm destruction from a handler properly
  RDMA/cma: Remove unneeded locking for req paths
  RDMA/cma: Using the standard locking pattern when delivering the removal event
  RDMA/cma: Simplify DEVICE_REMOVAL for internal_id
  RDMA/efa: Add EFA 0xefa1 PCI ID
  RDMA/efa: User/kernel compatibility handshake mechanism
  ...
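
Several of the commits above add the RDMA_NLDEV_CMD_RES_*_GET_RAW netlink
operations and the matching fill_res_*_entry_raw device ops (see the nldev.c
and device.c hunks below). As a rough, hypothetical illustration of the driver
side (the struct layout and driver name are invented here), a raw QP dump
callback simply emits an opaque blob that the core wraps in
RDMA_NLDEV_ATTR_RES_RAW:

	#include <net/netlink.h>
	#include <rdma/ib_verbs.h>
	#include <rdma/rdma_netlink.h>

	/* Hypothetical hardware QP context; the layout is driver defined. */
	struct exampledrv_qp_hw_ctx {
		u32 hw_state;
		u32 hw_sq_head;
	};

	static int exampledrv_fill_res_qp_entry_raw(struct sk_buff *msg,
						    struct ib_qp *ibqp)
	{
		struct exampledrv_qp_hw_ctx ctx = {};

		/* a real driver would read this from firmware or registers */
		ctx.hw_state = 0;
		ctx.hw_sq_head = 0;

		/* emitted verbatim; userspace decodes the blob offline */
		return nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(ctx), &ctx);
	}

Such a callback would be wired up through the driver's ib_device_ops
(.fill_res_qp_entry_raw), and the new commands are only reachable with
CAP_NET_ADMIN, per the RDMA_NL_ADMIN_PERM flag in the nldev.c table below.
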
diff --git a/MAINTAINERS b/MAINTAINERS
index 4e497d1..60a1bf7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3621,6 +3621,7 @@
 M:	Devesh Sharma <devesh.sharma@broadcom.com>
 M:	Somnath Kotur <somnath.kotur@broadcom.com>
 M:	Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
+M:	Naresh Kumar PBS <nareshkumar.pbs@broadcom.com>
 L:	linux-rdma@vger.kernel.org
 S:	Supported
 W:	http://www.broadcom.com
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index a83f9eb..91b0233 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -37,14 +37,6 @@
 	  libibverbs, libibcm and a hardware driver library from
 	  rdma-core <https://github.com/linux-rdma/rdma-core>.
 
-config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI
-	bool "Allow experimental legacy verbs in new ioctl uAPI  (EXPERIMENTAL)"
-	depends on INFINIBAND_USER_ACCESS
-	help
-	  IOCTL based uAPI support for Infiniband is enabled by default for
-	  new verbs only. This allows userspace to invoke the IOCTL based uAPI
-	  for current legacy verbs too.
-
 config INFINIBAND_USER_MEM
 	bool
 	depends on INFINIBAND_USER_ACCESS != n
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index a670209..ffad73b 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -1054,7 +1054,7 @@
 
 	cache = device->port_data[port_num].cache.pkey;
 
-	if (index < 0 || index >= cache->table_len)
+	if (!cache || index < 0 || index >= cache->table_len)
 		ret = -EINVAL;
 	else
 		*pkey = cache->table[index];
@@ -1099,6 +1099,10 @@
 	read_lock_irqsave(&device->cache_lock, flags);
 
 	cache = device->port_data[port_num].cache.pkey;
+	if (!cache) {
+		ret = -EINVAL;
+		goto err;
+	}
 
 	*index = -1;
 
@@ -1117,6 +1121,7 @@
 		ret = 0;
 	}
 
+err:
 	read_unlock_irqrestore(&device->cache_lock, flags);
 
 	return ret;
@@ -1139,6 +1144,10 @@
 	read_lock_irqsave(&device->cache_lock, flags);
 
 	cache = device->port_data[port_num].cache.pkey;
+	if (!cache) {
+		ret = -EINVAL;
+		goto err;
+	}
 
 	*index = -1;
 
@@ -1149,6 +1158,7 @@
 			break;
 		}
 
+err:
 	read_unlock_irqrestore(&device->cache_lock, flags);
 
 	return ret;
@@ -1425,24 +1435,27 @@
 			goto err;
 	}
 
-	pkey_cache = kmalloc(struct_size(pkey_cache, table,
-					 tprops->pkey_tbl_len),
-			     GFP_KERNEL);
-	if (!pkey_cache) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	pkey_cache->table_len = tprops->pkey_tbl_len;
-
-	for (i = 0; i < pkey_cache->table_len; ++i) {
-		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
-		if (ret) {
-			dev_warn(&device->dev,
-				 "ib_query_pkey failed (%d) for index %d\n",
-				 ret, i);
+	if (tprops->pkey_tbl_len) {
+		pkey_cache = kmalloc(struct_size(pkey_cache, table,
+						 tprops->pkey_tbl_len),
+				     GFP_KERNEL);
+		if (!pkey_cache) {
+			ret = -ENOMEM;
 			goto err;
 		}
+
+		pkey_cache->table_len = tprops->pkey_tbl_len;
+
+		for (i = 0; i < pkey_cache->table_len; ++i) {
+			ret = ib_query_pkey(device, port, i,
+					    pkey_cache->table + i);
+			if (ret) {
+				dev_warn(&device->dev,
+					 "ib_query_pkey failed (%d) for index %d\n",
+					 ret, i);
+				goto err;
+			}
+		}
 	}
 
 	write_lock_irq(&device->cache_lock);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index c30cf53..26de0da 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -428,19 +428,6 @@
 	return ret;
 }
 
-static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
-				   enum rdma_cm_state exch)
-{
-	unsigned long flags;
-	enum rdma_cm_state old;
-
-	spin_lock_irqsave(&id_priv->lock, flags);
-	old = id_priv->state;
-	id_priv->state = exch;
-	spin_unlock_irqrestore(&id_priv->lock, flags);
-	return old;
-}
-
 static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
 {
 	return hdr->ip_version >> 4;
@@ -1829,23 +1816,11 @@
 	}
 }
 
-void rdma_destroy_id(struct rdma_cm_id *id)
+static void _destroy_id(struct rdma_id_private *id_priv,
+			enum rdma_cm_state state)
 {
-	struct rdma_id_private *id_priv;
-	enum rdma_cm_state state;
-
-	id_priv = container_of(id, struct rdma_id_private, id);
-	trace_cm_id_destroy(id_priv);
-	state = cma_exch(id_priv, RDMA_CM_DESTROYING);
 	cma_cancel_operation(id_priv, state);
 
-	/*
-	 * Wait for any active callback to finish.  New callbacks will find
-	 * the id_priv state set to destroying and abort.
-	 */
-	mutex_lock(&id_priv->handler_mutex);
-	mutex_unlock(&id_priv->handler_mutex);
-
 	rdma_restrack_del(&id_priv->res);
 	if (id_priv->cma_dev) {
 		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
@@ -1874,6 +1849,42 @@
 	put_net(id_priv->id.route.addr.dev_addr.net);
 	kfree(id_priv);
 }
+
+/*
+ * destroy an ID from within the handler_mutex. This ensures that no other
+ * handlers can start running concurrently.
+ */
+static void destroy_id_handler_unlock(struct rdma_id_private *id_priv)
+	__releases(&id_priv->handler_mutex)
+{
+	enum rdma_cm_state state;
+	unsigned long flags;
+
+	trace_cm_id_destroy(id_priv);
+
+	/*
+	 * Setting the state to destroyed under the handler mutex provides a
+	 * fence against calling handler callbacks. If this is invoked due to
+	 * the failure of a handler callback then it guarantees that no future
+	 * handlers will be called.
+	 */
+	lockdep_assert_held(&id_priv->handler_mutex);
+	spin_lock_irqsave(&id_priv->lock, flags);
+	state = id_priv->state;
+	id_priv->state = RDMA_CM_DESTROYING;
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	mutex_unlock(&id_priv->handler_mutex);
+	_destroy_id(id_priv, state);
+}
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+
+	mutex_lock(&id_priv->handler_mutex);
+	destroy_id_handler_unlock(id_priv);
+}
 EXPORT_SYMBOL(rdma_destroy_id);
 
 static int cma_rep_recv(struct rdma_id_private *id_priv)
@@ -1925,6 +1936,8 @@
 {
 	int ret;
 
+	lockdep_assert_held(&id_priv->handler_mutex);
+
 	trace_cm_event_handler(id_priv, event);
 	ret = id_priv->id.event_handler(&id_priv->id, event);
 	trace_cm_event_done(id_priv, event, ret);
@@ -1936,7 +1949,7 @@
 {
 	struct rdma_id_private *id_priv = cm_id->context;
 	struct rdma_cm_event event = {};
-	int ret = 0;
+	int ret;
 
 	mutex_lock(&id_priv->handler_mutex);
 	if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
@@ -2005,14 +2018,12 @@
 	if (ret) {
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.ib = NULL;
-		cma_exch(id_priv, RDMA_CM_DESTROYING);
-		mutex_unlock(&id_priv->handler_mutex);
-		rdma_destroy_id(&id_priv->id);
+		destroy_id_handler_unlock(id_priv);
 		return ret;
 	}
 out:
 	mutex_unlock(&id_priv->handler_mutex);
-	return ret;
+	return 0;
 }
 
 static struct rdma_id_private *
@@ -2174,7 +2185,7 @@
 	mutex_lock(&listen_id->handler_mutex);
 	if (listen_id->state != RDMA_CM_LISTEN) {
 		ret = -ECONNABORTED;
-		goto err1;
+		goto err_unlock;
 	}
 
 	offset = cma_user_data_offset(listen_id);
@@ -2191,55 +2202,38 @@
 	}
 	if (!conn_id) {
 		ret = -ENOMEM;
-		goto err1;
+		goto err_unlock;
 	}
 
 	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
 	ret = cma_ib_acquire_dev(conn_id, listen_id, &req);
-	if (ret)
-		goto err2;
+	if (ret) {
+		destroy_id_handler_unlock(conn_id);
+		goto err_unlock;
+	}
 
 	conn_id->cm_id.ib = cm_id;
 	cm_id->context = conn_id;
 	cm_id->cm_handler = cma_ib_handler;
 
-	/*
-	 * Protect against the user destroying conn_id from another thread
-	 * until we're done accessing it.
-	 */
-	cma_id_get(conn_id);
 	ret = cma_cm_event_handler(conn_id, &event);
-	if (ret)
-		goto err3;
-	/*
-	 * Acquire mutex to prevent user executing rdma_destroy_id()
-	 * while we're accessing the cm_id.
-	 */
-	mutex_lock(&lock);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		conn_id->cm_id.ib = NULL;
+		mutex_unlock(&listen_id->handler_mutex);
+		destroy_id_handler_unlock(conn_id);
+		goto net_dev_put;
+	}
+
 	if (cma_comp(conn_id, RDMA_CM_CONNECT) &&
 	    (conn_id->id.qp_type != IB_QPT_UD)) {
 		trace_cm_send_mra(cm_id->context);
 		ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
 	}
-	mutex_unlock(&lock);
 	mutex_unlock(&conn_id->handler_mutex);
-	mutex_unlock(&listen_id->handler_mutex);
-	cma_id_put(conn_id);
-	if (net_dev)
-		dev_put(net_dev);
-	return 0;
 
-err3:
-	cma_id_put(conn_id);
-	/* Destroy the CM ID by returning a non-zero value. */
-	conn_id->cm_id.ib = NULL;
-err2:
-	cma_exch(conn_id, RDMA_CM_DESTROYING);
-	mutex_unlock(&conn_id->handler_mutex);
-err1:
+err_unlock:
 	mutex_unlock(&listen_id->handler_mutex);
-	if (conn_id)
-		rdma_destroy_id(&conn_id->id);
 
 net_dev_put:
 	if (net_dev)
@@ -2339,9 +2333,7 @@
 	if (ret) {
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.iw = NULL;
-		cma_exch(id_priv, RDMA_CM_DESTROYING);
-		mutex_unlock(&id_priv->handler_mutex);
-		rdma_destroy_id(&id_priv->id);
+		destroy_id_handler_unlock(id_priv);
 		return ret;
 	}
 
@@ -2388,16 +2380,16 @@
 
 	ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr);
 	if (ret) {
-		mutex_unlock(&conn_id->handler_mutex);
-		rdma_destroy_id(new_cm_id);
-		goto out;
+		mutex_unlock(&listen_id->handler_mutex);
+		destroy_id_handler_unlock(conn_id);
+		return ret;
 	}
 
 	ret = cma_iw_acquire_dev(conn_id, listen_id);
 	if (ret) {
-		mutex_unlock(&conn_id->handler_mutex);
-		rdma_destroy_id(new_cm_id);
-		goto out;
+		mutex_unlock(&listen_id->handler_mutex);
+		destroy_id_handler_unlock(conn_id);
+		return ret;
 	}
 
 	conn_id->cm_id.iw = cm_id;
@@ -2407,25 +2399,16 @@
 	memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
 	memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
 
-	/*
-	 * Protect against the user destroying conn_id from another thread
-	 * until we're done accessing it.
-	 */
-	cma_id_get(conn_id);
 	ret = cma_cm_event_handler(conn_id, &event);
 	if (ret) {
 		/* User wants to destroy the CM ID */
 		conn_id->cm_id.iw = NULL;
-		cma_exch(conn_id, RDMA_CM_DESTROYING);
-		mutex_unlock(&conn_id->handler_mutex);
 		mutex_unlock(&listen_id->handler_mutex);
-		cma_id_put(conn_id);
-		rdma_destroy_id(&conn_id->id);
+		destroy_id_handler_unlock(conn_id);
 		return ret;
 	}
 
 	mutex_unlock(&conn_id->handler_mutex);
-	cma_id_put(conn_id);
 
 out:
 	mutex_unlock(&listen_id->handler_mutex);
@@ -2482,6 +2465,10 @@
 {
 	struct rdma_id_private *id_priv = id->context;
 
+	/* Listening IDs are always destroyed on removal */
+	if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+		return -1;
+
 	id->context = id_priv->id.context;
 	id->event_handler = id_priv->id.event_handler;
 	trace_cm_event_handler(id_priv, event);
@@ -2657,21 +2644,21 @@
 {
 	struct cma_work *work = container_of(_work, struct cma_work, work);
 	struct rdma_id_private *id_priv = work->id;
-	int destroy = 0;
 
 	mutex_lock(&id_priv->handler_mutex);
 	if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
-		goto out;
+		goto out_unlock;
 
 	if (cma_cm_event_handler(id_priv, &work->event)) {
-		cma_exch(id_priv, RDMA_CM_DESTROYING);
-		destroy = 1;
+		cma_id_put(id_priv);
+		destroy_id_handler_unlock(id_priv);
+		goto out_free;
 	}
-out:
+
+out_unlock:
 	mutex_unlock(&id_priv->handler_mutex);
 	cma_id_put(id_priv);
-	if (destroy)
-		rdma_destroy_id(&id_priv->id);
+out_free:
 	kfree(work);
 }
 
@@ -2679,23 +2666,22 @@
 {
 	struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
 	struct rdma_id_private *id_priv = work->id;
-	int destroy = 0;
 
 	mutex_lock(&id_priv->handler_mutex);
 	if (id_priv->state == RDMA_CM_DESTROYING ||
 	    id_priv->state == RDMA_CM_DEVICE_REMOVAL)
-		goto out;
+		goto out_unlock;
 
 	if (cma_cm_event_handler(id_priv, &work->event)) {
-		cma_exch(id_priv, RDMA_CM_DESTROYING);
-		destroy = 1;
+		cma_id_put(id_priv);
+		destroy_id_handler_unlock(id_priv);
+		goto out_free;
 	}
 
-out:
+out_unlock:
 	mutex_unlock(&id_priv->handler_mutex);
 	cma_id_put(id_priv);
-	if (destroy)
-		rdma_destroy_id(&id_priv->id);
+out_free:
 	kfree(work);
 }
 
@@ -3171,9 +3157,7 @@
 		event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
 
 	if (cma_cm_event_handler(id_priv, &event)) {
-		cma_exch(id_priv, RDMA_CM_DESTROYING);
-		mutex_unlock(&id_priv->handler_mutex);
-		rdma_destroy_id(&id_priv->id);
+		destroy_id_handler_unlock(id_priv);
 		return;
 	}
 out:
@@ -3790,7 +3774,7 @@
 	struct rdma_cm_event event = {};
 	const struct ib_cm_sidr_rep_event_param *rep =
 				&ib_event->param.sidr_rep_rcvd;
-	int ret = 0;
+	int ret;
 
 	mutex_lock(&id_priv->handler_mutex);
 	if (id_priv->state != RDMA_CM_CONNECT)
@@ -3840,14 +3824,12 @@
 	if (ret) {
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.ib = NULL;
-		cma_exch(id_priv, RDMA_CM_DESTROYING);
-		mutex_unlock(&id_priv->handler_mutex);
-		rdma_destroy_id(&id_priv->id);
+		destroy_id_handler_unlock(id_priv);
 		return ret;
 	}
 out:
 	mutex_unlock(&id_priv->handler_mutex);
-	return ret;
+	return 0;
 }
 
 static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
@@ -4372,9 +4354,7 @@
 
 	rdma_destroy_ah_attr(&event.param.ud.ah_attr);
 	if (ret) {
-		cma_exch(id_priv, RDMA_CM_DESTROYING);
-		mutex_unlock(&id_priv->handler_mutex);
-		rdma_destroy_id(&id_priv->id);
+		destroy_id_handler_unlock(id_priv);
 		return 0;
 	}
 
@@ -4789,50 +4769,59 @@
 	return ret;
 }
 
-static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+static void cma_send_device_removal_put(struct rdma_id_private *id_priv)
 {
-	struct rdma_cm_event event = {};
+	struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL };
 	enum rdma_cm_state state;
-	int ret = 0;
+	unsigned long flags;
 
-	/* Record that we want to remove the device */
-	state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL);
-	if (state == RDMA_CM_DESTROYING)
-		return 0;
-
-	cma_cancel_operation(id_priv, state);
 	mutex_lock(&id_priv->handler_mutex);
+	/* Record that we want to remove the device */
+	spin_lock_irqsave(&id_priv->lock, flags);
+	state = id_priv->state;
+	if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) {
+		spin_unlock_irqrestore(&id_priv->lock, flags);
+		mutex_unlock(&id_priv->handler_mutex);
+		cma_id_put(id_priv);
+		return;
+	}
+	id_priv->state = RDMA_CM_DEVICE_REMOVAL;
+	spin_unlock_irqrestore(&id_priv->lock, flags);
 
-	/* Check for destruction from another callback. */
-	if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL))
-		goto out;
-
-	event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
-	ret = cma_cm_event_handler(id_priv, &event);
-out:
+	if (cma_cm_event_handler(id_priv, &event)) {
+		/*
+		 * At this point the ULP promises it won't call
+		 * rdma_destroy_id() concurrently
+		 */
+		cma_id_put(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
+		trace_cm_id_destroy(id_priv);
+		_destroy_id(id_priv, state);
+		return;
+	}
 	mutex_unlock(&id_priv->handler_mutex);
-	return ret;
+
+	/*
+	 * If this races with destroy then the thread that first moves the
+	 * state to destroying does the cancel.
+	 */
+	cma_cancel_operation(id_priv, state);
+	cma_id_put(id_priv);
 }
 
 static void cma_process_remove(struct cma_device *cma_dev)
 {
-	struct rdma_id_private *id_priv;
-	int ret;
-
 	mutex_lock(&lock);
 	while (!list_empty(&cma_dev->id_list)) {
-		id_priv = list_entry(cma_dev->id_list.next,
-				     struct rdma_id_private, list);
+		struct rdma_id_private *id_priv = list_first_entry(
+			&cma_dev->id_list, struct rdma_id_private, list);
 
 		list_del(&id_priv->listen_list);
 		list_del_init(&id_priv->list);
 		cma_id_get(id_priv);
 		mutex_unlock(&lock);
 
-		ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
-		cma_id_put(id_priv);
-		if (ret)
-			rdma_destroy_id(&id_priv->id);
+		cma_send_device_removal_put(id_priv);
 
 		mutex_lock(&lock);
 	}
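
The cma.c rework above keeps the long-standing rdma_cm contract that an event
handler may request destruction of its id by returning non-zero; what changes
is that the core now performs that destruction from within the handler_mutex
via destroy_id_handler_unlock() instead of the old cma_exch()/rdma_destroy_id()
sequence. A minimal, hypothetical ULP handler relying on that contract:

	#include <rdma/rdma_cm.h>

	/*
	 * Returning non-zero asks the cma core to destroy the rdma_cm_id on
	 * the handler's behalf, so the ULP must not also call
	 * rdma_destroy_id() for it.
	 */
	static int ulp_cm_handler(struct rdma_cm_id *id,
				  struct rdma_cm_event *event)
	{
		switch (event->event) {
		case RDMA_CM_EVENT_ESTABLISHED:
			/* connection is ready for I/O */
			return 0;
		case RDMA_CM_EVENT_REJECTED:
		case RDMA_CM_EVENT_CONNECT_ERROR:
		case RDMA_CM_EVENT_DEVICE_REMOVAL:
			/* let the core tear the id down for us */
			return 1;
		default:
			return 0;
		}
	}
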
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 738d1fa..6361668 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -8,7 +8,7 @@
 #include "core_priv.h"
 #include "restrack.h"
 
-#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE)
+#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID)
 
 static int __counter_set_mode(struct rdma_counter_mode *curr,
 			      enum rdma_nl_counter_mode new_mode,
@@ -149,23 +149,13 @@
 	struct auto_mode_param *param = &counter->mode.param;
 	bool match = true;
 
-	/*
-	 * Ensure that counter belongs to the right PID.  This operation can
-	 * race with user space which kills the process and leaves QP and
-	 * counters orphans.
-	 *
-	 * It is not a big deal because exitted task will leave both QP and
-	 * counter in the same bucket of zombie process. Just ensure that
-	 * process is still alive before procedding.
-	 *
-	 */
-	if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task) ||
-	    !task_pid_nr(qp->res.task))
-		return false;
-
 	if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE)
 		match &= (param->qp_type == qp->qp_type);
 
+	if (auto_mask & RDMA_COUNTER_MASK_PID)
+		match &= (task_pid_nr(counter->res.task) ==
+			  task_pid_nr(qp->res.task));
+
 	return match;
 }
 
@@ -288,7 +278,7 @@
 	struct rdma_counter *counter;
 	int ret;
 
-	if (!qp->res.valid)
+	if (!qp->res.valid || rdma_is_kernel_res(&qp->res))
 		return 0;
 
 	if (!rdma_is_port_valid(dev, port))
@@ -483,7 +473,7 @@
 		goto err;
 	}
 
-	if (counter->res.task != qp->res.task) {
+	if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) {
 		ret = -EINVAL;
 		goto err_task;
 	}
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 2927a9d..ef0cd29 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -272,7 +272,6 @@
 	} mandatory_table[] = {
 		IB_MANDATORY_FUNC(query_device),
 		IB_MANDATORY_FUNC(query_port),
-		IB_MANDATORY_FUNC(query_pkey),
 		IB_MANDATORY_FUNC(alloc_pd),
 		IB_MANDATORY_FUNC(dealloc_pd),
 		IB_MANDATORY_FUNC(create_qp),
@@ -1343,6 +1342,10 @@
 	return ret;
 }
 
+static void prevent_dealloc_device(struct ib_device *ib_dev)
+{
+}
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device: Device to register
@@ -1413,11 +1416,11 @@
 		 * possibility for a parallel unregistration along with this
 		 * error flow. Since we have a refcount here we know any
 		 * parallel flow is stopped in disable_device and will see the
-		 * NULL pointers, causing the responsibility to
+		 * special dealloc_driver pointer, causing the responsibility to
 		 * ib_dealloc_device() to revert back to this thread.
 		 */
 		dealloc_fn = device->ops.dealloc_driver;
-		device->ops.dealloc_driver = NULL;
+		device->ops.dealloc_driver = prevent_dealloc_device;
 		ib_device_put(device);
 		__ib_unregister_device(device);
 		device->ops.dealloc_driver = dealloc_fn;
@@ -1466,7 +1469,8 @@
 	 * Drivers using the new flow may not call ib_dealloc_device except
 	 * in error unwind prior to registration success.
 	 */
-	if (ib_dev->ops.dealloc_driver) {
+	if (ib_dev->ops.dealloc_driver &&
+	    ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
 		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
 		ib_dealloc_device(ib_dev);
 	}
@@ -2361,6 +2365,9 @@
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
+	if (!device->ops.query_pkey)
+		return -EOPNOTSUPP;
+
 	return device->ops.query_pkey(device, port_num, index, pkey);
 }
 EXPORT_SYMBOL(ib_query_pkey);
@@ -2621,8 +2628,14 @@
 	SET_DEVICE_OP(dev_ops, drain_rq);
 	SET_DEVICE_OP(dev_ops, drain_sq);
 	SET_DEVICE_OP(dev_ops, enable_driver);
-	SET_DEVICE_OP(dev_ops, fill_res_entry);
-	SET_DEVICE_OP(dev_ops, fill_stat_entry);
+	SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry);
+	SET_DEVICE_OP(dev_ops, fill_res_cq_entry);
+	SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw);
+	SET_DEVICE_OP(dev_ops, fill_res_mr_entry);
+	SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw);
+	SET_DEVICE_OP(dev_ops, fill_res_qp_entry);
+	SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw);
+	SET_DEVICE_OP(dev_ops, fill_stat_mr_entry);
 	SET_DEVICE_OP(dev_ops, get_dev_fw_str);
 	SET_DEVICE_OP(dev_ops, get_dma_mr);
 	SET_DEVICE_OP(dev_ops, get_hw_stats);
@@ -2667,6 +2680,7 @@
 	SET_DEVICE_OP(dev_ops, query_port);
 	SET_DEVICE_OP(dev_ops, query_qp);
 	SET_DEVICE_OP(dev_ops, query_srq);
+	SET_DEVICE_OP(dev_ops, query_ucontext);
 	SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
 	SET_DEVICE_OP(dev_ops, read_counters);
 	SET_DEVICE_OP(dev_ops, reg_dm_mr);
@@ -2679,10 +2693,12 @@
 	SET_DEVICE_OP(dev_ops, set_vf_link_state);
 
 	SET_OBJ_SIZE(dev_ops, ib_ah);
+	SET_OBJ_SIZE(dev_ops, ib_counters);
 	SET_OBJ_SIZE(dev_ops, ib_cq);
 	SET_OBJ_SIZE(dev_ops, ib_pd);
 	SET_OBJ_SIZE(dev_ops, ib_srq);
 	SET_OBJ_SIZE(dev_ops, ib_ucontext);
+	SET_OBJ_SIZE(dev_ops, ib_xrcd);
 }
 EXPORT_SYMBOL(ib_set_device_ops);
 
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index a09f8e3..9355e52 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -402,7 +402,7 @@
 	INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
 	INIT_LIST_HEAD(&mad_agent_priv->local_list);
 	INIT_WORK(&mad_agent_priv->local_work, local_completions);
-	atomic_set(&mad_agent_priv->refcount, 1);
+	refcount_set(&mad_agent_priv->refcount, 1);
 	init_completion(&mad_agent_priv->comp);
 
 	ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type);
@@ -484,7 +484,7 @@
 
 static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
 {
-	if (atomic_dec_and_test(&mad_agent_priv->refcount))
+	if (refcount_dec_and_test(&mad_agent_priv->refcount))
 		complete(&mad_agent_priv->comp);
 }
 
@@ -718,7 +718,7 @@
 			 * Reference MAD agent until receive
 			 * side of local completion handled
 			 */
-			atomic_inc(&mad_agent_priv->refcount);
+			refcount_inc(&mad_agent_priv->refcount);
 		} else
 			kfree(mad_priv);
 		break;
@@ -758,7 +758,7 @@
 		local->return_wc_byte_len = mad_size;
 	}
 	/* Reference MAD agent until send side of local completion handled */
-	atomic_inc(&mad_agent_priv->refcount);
+	refcount_inc(&mad_agent_priv->refcount);
 	/* Queue local completion to local list */
 	spin_lock_irqsave(&mad_agent_priv->lock, flags);
 	list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
@@ -916,7 +916,7 @@
 	}
 
 	mad_send_wr->send_buf.mad_agent = mad_agent;
-	atomic_inc(&mad_agent_priv->refcount);
+	refcount_inc(&mad_agent_priv->refcount);
 	return &mad_send_wr->send_buf;
 }
 EXPORT_SYMBOL(ib_create_send_mad);
@@ -1131,7 +1131,7 @@
 		mad_send_wr->status = IB_WC_SUCCESS;
 
 		/* Reference MAD agent until send completes */
-		atomic_inc(&mad_agent_priv->refcount);
+		refcount_inc(&mad_agent_priv->refcount);
 		spin_lock_irqsave(&mad_agent_priv->lock, flags);
 		list_add_tail(&mad_send_wr->agent_list,
 			      &mad_agent_priv->send_list);
@@ -1148,7 +1148,7 @@
 			spin_lock_irqsave(&mad_agent_priv->lock, flags);
 			list_del(&mad_send_wr->agent_list);
 			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
-			atomic_dec(&mad_agent_priv->refcount);
+			deref_mad_agent(mad_agent_priv);
 			goto error;
 		}
 	}
@@ -1554,7 +1554,7 @@
 		hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;
 		rcu_read_lock();
 		mad_agent = xa_load(&ib_mad_clients, hi_tid);
-		if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount))
+		if (mad_agent && !refcount_inc_not_zero(&mad_agent->refcount))
 			mad_agent = NULL;
 		rcu_read_unlock();
 	} else {
@@ -1606,7 +1606,7 @@
 			}
 		}
 		if (mad_agent)
-			atomic_inc(&mad_agent->refcount);
+			refcount_inc(&mad_agent->refcount);
 out:
 		spin_unlock_irqrestore(&port_priv->reg_lock, flags);
 	}
@@ -1831,7 +1831,7 @@
 				mad_agent_priv->agent.recv_handler(
 						&mad_agent_priv->agent, NULL,
 						mad_recv_wc);
-				atomic_dec(&mad_agent_priv->refcount);
+				deref_mad_agent(mad_agent_priv);
 			} else {
 				/* not user rmpp, revert to normal behavior and
 				 * drop the mad */
@@ -1848,7 +1848,7 @@
 					&mad_agent_priv->agent,
 					&mad_send_wr->send_buf,
 					mad_recv_wc);
-			atomic_dec(&mad_agent_priv->refcount);
+			deref_mad_agent(mad_agent_priv);
 
 			mad_send_wc.status = IB_WC_SUCCESS;
 			mad_send_wc.vendor_err = 0;
@@ -2438,7 +2438,7 @@
 		list_del(&mad_send_wr->agent_list);
 		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
 						   &mad_send_wc);
-		atomic_dec(&mad_agent_priv->refcount);
+		deref_mad_agent(mad_agent_priv);
 	}
 }
 
@@ -2572,7 +2572,7 @@
 						&local->mad_send_wr->send_buf,
 						&local->mad_priv->header.recv_wc);
 			spin_lock_irqsave(&recv_mad_agent->lock, flags);
-			atomic_dec(&recv_mad_agent->refcount);
+			deref_mad_agent(recv_mad_agent);
 			spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
 		}
 
@@ -2585,7 +2585,7 @@
 						   &mad_send_wc);
 
 		spin_lock_irqsave(&mad_agent_priv->lock, flags);
-		atomic_dec(&mad_agent_priv->refcount);
+		deref_mad_agent(mad_agent_priv);
 		if (free_mad)
 			kfree(local->mad_priv);
 		kfree(local);
@@ -2671,7 +2671,7 @@
 		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
 						   &mad_send_wc);
 
-		atomic_dec(&mad_agent_priv->refcount);
+		deref_mad_agent(mad_agent_priv);
 		spin_lock_irqsave(&mad_agent_priv->lock, flags);
 	}
 	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
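
The mad.c changes above convert the agent reference count from atomic_t to
refcount_t, which saturates and warns on overflow/underflow instead of
silently wrapping, and they route every drop through deref_mad_agent(). The
generic release-on-last-reference pattern being adopted looks roughly like
this (illustrative struct and function names, not taken from the patch):

	#include <linux/refcount.h>
	#include <linux/completion.h>

	struct example_obj {
		refcount_t refcount;
		struct completion comp;
	};

	static void example_obj_init(struct example_obj *obj)
	{
		refcount_set(&obj->refcount, 1);	/* creator's reference */
		init_completion(&obj->comp);
	}

	static void example_obj_put(struct example_obj *obj)
	{
		/* the final put wakes whoever is waiting to free the object */
		if (refcount_dec_and_test(&obj->refcount))
			complete(&obj->comp);
	}

	static void example_obj_teardown(struct example_obj *obj)
	{
		example_obj_put(obj);			/* drop creator's ref */
		wait_for_completion(&obj->comp);	/* wait for users */
	}
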
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 403d867..4aa16b3 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -103,7 +103,7 @@
 	struct work_struct local_work;
 	struct list_head rmpp_list;
 
-	atomic_t refcount;
+	refcount_t refcount;
 	union {
 		struct completion comp;
 		struct rcu_head rcu;
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index 5ec57ab..e0573e4 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -40,8 +40,7 @@
 enum rmpp_state {
 	RMPP_STATE_ACTIVE,
 	RMPP_STATE_TIMEOUT,
-	RMPP_STATE_COMPLETE,
-	RMPP_STATE_CANCELING
+	RMPP_STATE_COMPLETE
 };
 
 struct mad_rmpp_recv {
@@ -52,7 +51,7 @@
 	struct completion comp;
 	enum rmpp_state state;
 	spinlock_t lock;
-	atomic_t refcount;
+	refcount_t refcount;
 
 	struct ib_ah *ah;
 	struct ib_mad_recv_wc *rmpp_wc;
@@ -73,7 +72,7 @@
 
 static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
 {
-	if (atomic_dec_and_test(&rmpp_recv->refcount))
+	if (refcount_dec_and_test(&rmpp_recv->refcount))
 		complete(&rmpp_recv->comp);
 }
 
@@ -92,22 +91,18 @@
 
 	spin_lock_irqsave(&agent->lock, flags);
 	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
-		if (rmpp_recv->state != RMPP_STATE_COMPLETE)
-			ib_free_recv_mad(rmpp_recv->rmpp_wc);
-		rmpp_recv->state = RMPP_STATE_CANCELING;
-	}
-	spin_unlock_irqrestore(&agent->lock, flags);
-
-	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
 		cancel_delayed_work(&rmpp_recv->timeout_work);
 		cancel_delayed_work(&rmpp_recv->cleanup_work);
 	}
+	spin_unlock_irqrestore(&agent->lock, flags);
 
 	flush_workqueue(agent->qp_info->port_priv->wq);
 
 	list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
 				 &agent->rmpp_list, list) {
 		list_del(&rmpp_recv->list);
+		if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+			ib_free_recv_mad(rmpp_recv->rmpp_wc);
 		destroy_rmpp_recv(rmpp_recv);
 	}
 }
@@ -272,10 +267,6 @@
 	unsigned long flags;
 
 	spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
-	if (rmpp_recv->state == RMPP_STATE_CANCELING) {
-		spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
-		return;
-	}
 	list_del(&rmpp_recv->list);
 	spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
 	destroy_rmpp_recv(rmpp_recv);
@@ -305,7 +296,7 @@
 	INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler);
 	spin_lock_init(&rmpp_recv->lock);
 	rmpp_recv->state = RMPP_STATE_ACTIVE;
-	atomic_set(&rmpp_recv->refcount, 1);
+	refcount_set(&rmpp_recv->refcount, 1);
 
 	rmpp_recv->rmpp_wc = mad_recv_wc;
 	rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf;
@@ -357,7 +348,7 @@
 	spin_lock_irqsave(&agent->lock, flags);
 	rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
 	if (rmpp_recv)
-		atomic_inc(&rmpp_recv->refcount);
+		refcount_inc(&rmpp_recv->refcount);
 	spin_unlock_irqrestore(&agent->lock, flags);
 	return rmpp_recv;
 }
@@ -553,7 +544,7 @@
 		destroy_rmpp_recv(rmpp_recv);
 		return continue_rmpp(agent, mad_recv_wc);
 	}
-	atomic_inc(&rmpp_recv->refcount);
+	refcount_inc(&rmpp_recv->refcount);
 
 	if (get_last_flag(&mad_recv_wc->recv_buf)) {
 		rmpp_recv->state = RMPP_STATE_COMPLETE;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index e16105b..12d29d5 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -114,6 +114,7 @@
 	[RDMA_NLDEV_ATTR_RES_PS]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_QP]		= { .type = NLA_NESTED },
 	[RDMA_NLDEV_ATTR_RES_QP_ENTRY]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_RAW]		= { .type = NLA_BINARY },
 	[RDMA_NLDEV_ATTR_RES_RKEY]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_RQPN]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_RQ_PSN]		= { .type = NLA_U32 },
@@ -446,27 +447,11 @@
 	return err ? -EMSGSIZE : 0;
 }
 
-static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg,
-			   struct rdma_restrack_entry *res)
+static int fill_res_qp_entry_query(struct sk_buff *msg,
+				   struct rdma_restrack_entry *res,
+				   struct ib_device *dev,
+				   struct ib_qp *qp)
 {
-	if (!dev->ops.fill_res_entry)
-		return false;
-	return dev->ops.fill_res_entry(msg, res);
-}
-
-static bool fill_stat_entry(struct ib_device *dev, struct sk_buff *msg,
-			    struct rdma_restrack_entry *res)
-{
-	if (!dev->ops.fill_stat_entry)
-		return false;
-	return dev->ops.fill_stat_entry(msg, res);
-}
-
-static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
-			     struct rdma_restrack_entry *res, uint32_t port)
-{
-	struct ib_qp *qp = container_of(res, struct ib_qp, res);
-	struct ib_device *dev = qp->device;
 	struct ib_qp_init_attr qp_init_attr;
 	struct ib_qp_attr qp_attr;
 	int ret;
@@ -475,16 +460,6 @@
 	if (ret)
 		return ret;
 
-	if (port && port != qp_attr.port_num)
-		return -EAGAIN;
-
-	/* In create_qp() port is not set yet */
-	if (qp_attr.port_num &&
-	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num))
-		goto err;
-
-	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num))
-		goto err;
 	if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) {
 		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN,
 				qp_attr.dest_qp_num))
@@ -508,21 +483,55 @@
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
 		goto err;
 
-	if (!rdma_is_kernel_res(res) &&
-	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id))
-		goto err;
-
-	if (fill_res_name_pid(msg, res))
-		goto err;
-
-	if (fill_res_entry(dev, msg, res))
-		goto err;
-
+	if (dev->ops.fill_res_qp_entry)
+		return dev->ops.fill_res_qp_entry(msg, qp);
 	return 0;
 
 err:	return -EMSGSIZE;
 }
 
+static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
+			     struct rdma_restrack_entry *res, uint32_t port)
+{
+	struct ib_qp *qp = container_of(res, struct ib_qp, res);
+	struct ib_device *dev = qp->device;
+	int ret;
+
+	if (port && port != qp->port)
+		return -EAGAIN;
+
+	/* In create_qp() port is not set yet */
+	if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port))
+		return -EINVAL;
+
+	ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num);
+	if (ret)
+		return -EMSGSIZE;
+
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id))
+		return -EMSGSIZE;
+
+	ret = fill_res_name_pid(msg, res);
+	if (ret)
+		return -EMSGSIZE;
+
+	return fill_res_qp_entry_query(msg, res, dev, qp);
+}
+
+static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin,
+				 struct rdma_restrack_entry *res, uint32_t port)
+{
+	struct ib_qp *qp = container_of(res, struct ib_qp, res);
+	struct ib_device *dev = qp->device;
+
+	if (port && port != qp->port)
+		return -EAGAIN;
+	if (!dev->ops.fill_res_qp_entry_raw)
+		return -EINVAL;
+	return dev->ops.fill_res_qp_entry_raw(msg, qp);
+}
+
 static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
 				struct rdma_restrack_entry *res, uint32_t port)
 {
@@ -568,9 +577,8 @@
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (fill_res_entry(dev, msg, res))
-		goto err;
-
+	if (dev->ops.fill_res_cm_id_entry)
+		return dev->ops.fill_res_cm_id_entry(msg, cm_id);
 	return 0;
 
 err: return -EMSGSIZE;
@@ -583,35 +591,42 @@
 	struct ib_device *dev = cq->device;
 
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe))
-		goto err;
+		return -EMSGSIZE;
 	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT,
 			      atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD))
-		goto err;
+		return -EMSGSIZE;
 
 	/* Poll context is only valid for kernel CQs */
 	if (rdma_is_kernel_res(res) &&
 	    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
-		goto err;
+		return -EMSGSIZE;
 
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL)))
-		goto err;
+		return -EMSGSIZE;
 
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
-		goto err;
+		return -EMSGSIZE;
 	if (!rdma_is_kernel_res(res) &&
 	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
 			cq->uobject->uevent.uobject.context->res.id))
-		goto err;
+		return -EMSGSIZE;
 
 	if (fill_res_name_pid(msg, res))
-		goto err;
+		return -EMSGSIZE;
 
-	if (fill_res_entry(dev, msg, res))
-		goto err;
+	return (dev->ops.fill_res_cq_entry) ?
+		dev->ops.fill_res_cq_entry(msg, cq) : 0;
+}
 
-	return 0;
+static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin,
+				 struct rdma_restrack_entry *res, uint32_t port)
+{
+	struct ib_cq *cq = container_of(res, struct ib_cq, res);
+	struct ib_device *dev = cq->device;
 
-err:	return -EMSGSIZE;
+	if (!dev->ops.fill_res_cq_entry_raw)
+		return -EINVAL;
+	return dev->ops.fill_res_cq_entry_raw(msg, cq);
 }
 
 static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
@@ -622,38 +637,45 @@
 
 	if (has_cap_net_admin) {
 		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey))
-			goto err;
+			return -EMSGSIZE;
 		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey))
-			goto err;
+			return -EMSGSIZE;
 	}
 
 	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length,
 			      RDMA_NLDEV_ATTR_PAD))
-		goto err;
+		return -EMSGSIZE;
 
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
-		goto err;
+		return -EMSGSIZE;
 
 	if (!rdma_is_kernel_res(res) &&
 	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id))
-		goto err;
+		return -EMSGSIZE;
 
 	if (fill_res_name_pid(msg, res))
-		goto err;
+		return -EMSGSIZE;
 
-	if (fill_res_entry(dev, msg, res))
-		goto err;
+	return (dev->ops.fill_res_mr_entry) ?
+		       dev->ops.fill_res_mr_entry(msg, mr) :
+		       0;
+}
 
-	return 0;
+static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin,
+				 struct rdma_restrack_entry *res, uint32_t port)
+{
+	struct ib_mr *mr = container_of(res, struct ib_mr, res);
+	struct ib_device *dev = mr->pd->device;
 
-err:	return -EMSGSIZE;
+	if (!dev->ops.fill_res_mr_entry_raw)
+		return -EINVAL;
+	return dev->ops.fill_res_mr_entry_raw(msg, mr);
 }
 
 static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_pd *pd = container_of(res, struct ib_pd, res);
-	struct ib_device *dev = pd->device;
 
 	if (has_cap_net_admin) {
 		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,
@@ -676,13 +698,7 @@
 			pd->uobject->context->res.id))
 		goto err;
 
-	if (fill_res_name_pid(msg, res))
-		goto err;
-
-	if (fill_res_entry(dev, msg, res))
-		goto err;
-
-	return 0;
+	return fill_res_name_pid(msg, res);
 
 err:	return -EMSGSIZE;
 }
@@ -695,11 +711,16 @@
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode))
 		return -EMSGSIZE;
 
-	if (m->mode == RDMA_COUNTER_MODE_AUTO)
+	if (m->mode == RDMA_COUNTER_MODE_AUTO) {
 		if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) &&
 		    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type))
 			return -EMSGSIZE;
 
+		if ((m->mask & RDMA_COUNTER_MASK_PID) &&
+		    fill_res_name_pid(msg, &counter->res))
+			return -EMSGSIZE;
+	}
+
 	return 0;
 }
 
@@ -738,9 +759,6 @@
 	xa_lock(&rt->xa);
 	xa_for_each(&rt->xa, id, res) {
 		qp = container_of(res, struct ib_qp, res);
-		if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
-			continue;
-
 		if (!qp->counter || (qp->counter->id != counter->id))
 			continue;
 
@@ -793,9 +811,8 @@
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
 		goto err;
 
-	if (fill_stat_entry(dev, msg, res))
-		goto err;
-
+	if (dev->ops.fill_stat_mr_entry)
+		return dev->ops.fill_stat_mr_entry(msg, mr);
 	return 0;
 
 err:
@@ -840,7 +857,6 @@
 
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) ||
 	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) ||
-	    fill_res_name_pid(msg, &counter->res) ||
 	    fill_stat_counter_mode(msg, counter) ||
 	    fill_stat_counter_qps(msg, counter) ||
 	    fill_stat_counter_hwcounters(msg, counter))
@@ -1177,7 +1193,6 @@
 
 struct nldev_fill_res_entry {
 	enum rdma_nldev_attr nldev_attr;
-	enum rdma_nldev_command nldev_cmd;
 	u8 flags;
 	u32 entry;
 	u32 id;
@@ -1189,40 +1204,34 @@
 
 static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 	[RDMA_RESTRACK_QP] = {
-		.nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_QP,
 		.entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY,
 		.id = RDMA_NLDEV_ATTR_RES_LQPN,
 	},
 	[RDMA_RESTRACK_CM_ID] = {
-		.nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
 		.entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,
 		.id = RDMA_NLDEV_ATTR_RES_CM_IDN,
 	},
 	[RDMA_RESTRACK_CQ] = {
-		.nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY,
 		.id = RDMA_NLDEV_ATTR_RES_CQN,
 	},
 	[RDMA_RESTRACK_MR] = {
-		.nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY,
 		.id = RDMA_NLDEV_ATTR_RES_MRN,
 	},
 	[RDMA_RESTRACK_PD] = {
-		.nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
 		.id = RDMA_NLDEV_ATTR_RES_PDN,
 	},
 	[RDMA_RESTRACK_COUNTER] = {
-		.nldev_cmd = RDMA_NLDEV_CMD_STAT_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER,
 		.entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
 		.id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
@@ -1281,7 +1290,8 @@
 	}
 
 	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
-			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd),
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NL_GET_OP(nlh->nlmsg_type)),
 			0, 0);
 
 	if (fill_nldev_handle(msg, device)) {
@@ -1359,7 +1369,8 @@
 	}
 
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
-			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd),
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NL_GET_OP(cb->nlh->nlmsg_type)),
 			0, NLM_F_MULTI);
 
 	if (fill_nldev_handle(skb, device)) {
@@ -1441,26 +1452,29 @@
 	return ret;
 }
 
-#define RES_GET_FUNCS(name, type)					       \
-	static int nldev_res_get_##name##_dumpit(struct sk_buff *skb,	       \
+#define RES_GET_FUNCS(name, type)                                              \
+	static int nldev_res_get_##name##_dumpit(struct sk_buff *skb,          \
 						 struct netlink_callback *cb)  \
-	{								       \
-		return res_get_common_dumpit(skb, cb, type,		       \
-					     fill_res_##name##_entry);	       \
-	}								       \
-	static int nldev_res_get_##name##_doit(struct sk_buff *skb,	       \
-					       struct nlmsghdr *nlh,	       \
+	{                                                                      \
+		return res_get_common_dumpit(skb, cb, type,                    \
+					     fill_res_##name##_entry);         \
+	}                                                                      \
+	static int nldev_res_get_##name##_doit(struct sk_buff *skb,            \
+					       struct nlmsghdr *nlh,           \
 					       struct netlink_ext_ack *extack) \
-	{								       \
-		return res_get_common_doit(skb, nlh, extack, type,	       \
-					   fill_res_##name##_entry);	       \
+	{                                                                      \
+		return res_get_common_doit(skb, nlh, extack, type,             \
+					   fill_res_##name##_entry);           \
 	}
 
 RES_GET_FUNCS(qp, RDMA_RESTRACK_QP);
+RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP);
 RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
 RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
+RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ);
 RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
 RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR);
 RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
 
 static LIST_HEAD(link_ops);
@@ -2145,6 +2159,21 @@
 		.doit = nldev_stat_del_doit,
 		.flags = RDMA_NL_ADMIN_PERM,
 	},
+	[RDMA_NLDEV_CMD_RES_QP_GET_RAW] = {
+		.doit = nldev_res_get_qp_raw_doit,
+		.dump = nldev_res_get_qp_raw_dumpit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
+	[RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = {
+		.doit = nldev_res_get_cq_raw_doit,
+		.dump = nldev_res_get_cq_raw_dumpit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
+	[RDMA_NLDEV_CMD_RES_MR_GET_RAW] = {
+		.doit = nldev_res_get_mr_raw_doit,
+		.dump = nldev_res_get_mr_raw_dumpit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 };
 
 void __init nldev_init(void)
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index defe9cd..c11e505 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -58,7 +58,7 @@
 	struct ib_device      *ibdev;
 	struct gid_attr_group *gid_attr_group;
 	struct attribute_group gid_group;
-	struct attribute_group pkey_group;
+	struct attribute_group *pkey_group;
 	struct attribute_group *pma_table;
 	struct attribute_group *hw_stats_ag;
 	struct rdma_hw_stats   *hw_stats;
@@ -681,11 +681,16 @@
 		kfree(p->gid_group.attrs);
 	}
 
-	if (p->pkey_group.attrs) {
-		for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
-			kfree(a);
+	if (p->pkey_group) {
+		if (p->pkey_group->attrs) {
+			for (i = 0; (a = p->pkey_group->attrs[i]); ++i)
+				kfree(a);
 
-		kfree(p->pkey_group.attrs);
+			kfree(p->pkey_group->attrs);
+		}
+
+		kfree(p->pkey_group);
+		p->pkey_group = NULL;
 	}
 
 	kfree(p);
@@ -1118,17 +1123,26 @@
 	if (ret)
 		goto err_free_gid_type;
 
-	p->pkey_group.name  = "pkeys";
-	p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
-						attr.pkey_tbl_len);
-	if (!p->pkey_group.attrs) {
-		ret = -ENOMEM;
-		goto err_remove_gid_type;
+	if (attr.pkey_tbl_len) {
+		p->pkey_group = kzalloc(sizeof(*p->pkey_group), GFP_KERNEL);
+		if (!p->pkey_group) {
+			ret = -ENOMEM;
+			goto err_remove_gid_type;
+		}
+
+		p->pkey_group->name  = "pkeys";
+		p->pkey_group->attrs = alloc_group_attrs(show_port_pkey,
+							 attr.pkey_tbl_len);
+		if (!p->pkey_group->attrs) {
+			ret = -ENOMEM;
+			goto err_free_pkey_group;
+		}
+
+		ret = sysfs_create_group(&p->kobj, p->pkey_group);
+		if (ret)
+			goto err_free_pkey;
 	}
 
-	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
-	if (ret)
-		goto err_free_pkey;
 
 	if (device->ops.init_port && is_full_dev) {
 		ret = device->ops.init_port(device, port_num, &p->kobj);
@@ -1150,14 +1164,20 @@
 	return 0;
 
 err_remove_pkey:
-	sysfs_remove_group(&p->kobj, &p->pkey_group);
+	if (p->pkey_group)
+		sysfs_remove_group(&p->kobj, p->pkey_group);
 
 err_free_pkey:
-	for (i = 0; i < attr.pkey_tbl_len; ++i)
-		kfree(p->pkey_group.attrs[i]);
+	if (p->pkey_group) {
+		for (i = 0; i < attr.pkey_tbl_len; ++i)
+			kfree(p->pkey_group->attrs[i]);
 
-	kfree(p->pkey_group.attrs);
-	p->pkey_group.attrs = NULL;
+		kfree(p->pkey_group->attrs);
+		p->pkey_group->attrs = NULL;
+	}
+
+err_free_pkey_group:
+	kfree(p->pkey_group);
 
 err_remove_gid_type:
 	sysfs_remove_group(&p->gid_attr_group->kobj,
@@ -1317,7 +1337,8 @@
 
 		if (port->pma_table)
 			sysfs_remove_group(p, port->pma_table);
-		sysfs_remove_group(p, &port->pkey_group);
+		if (port->pkey_group)
+			sysfs_remove_group(p, port->pkey_group);
 		sysfs_remove_group(p, &port->gid_group);
 		sysfs_remove_group(&port->gid_attr_group->kobj,
 				   &port->gid_attr_group->ndev);
diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
index 6c3514b..31e7860 100644
--- a/drivers/infiniband/core/trace.c
+++ b/drivers/infiniband/core/trace.c
@@ -9,6 +9,4 @@
 
 #define CREATE_TRACE_POINTS
 
-#include <rdma/ib_verbs.h>
-
 #include <trace/events/rdma_core.h>
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 82455a1..831bff8 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -261,6 +261,7 @@
 	sg = umem->sg_head.sgl;
 
 	while (npages) {
+		cond_resched();
 		ret = pin_user_pages_fast(cur_base,
 					  min_t(unsigned long, npages,
 						PAGE_SIZE /
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index ccd2840..5e32f61 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -152,6 +152,7 @@
  *        ib_alloc_implicit_odp_umem()
  * @addr: The starting userspace VA
  * @size: The length of the userspace VA
+ * @ops: MMU interval ops, currently only @invalidate
  */
 struct ib_umem_odp *
 ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
@@ -213,6 +214,7 @@
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @ops: MMU interval ops, currently only @invalidate
  *
  * The driver should use when the access flags indicate ODP memory. It avoids
  * pinning, instead, stores the mm for future page fault handling in
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 76e7ec0..2fbc583 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -415,8 +415,8 @@
 
 static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
 {
+	struct ib_uverbs_alloc_pd_resp resp = {};
 	struct ib_uverbs_alloc_pd      cmd;
-	struct ib_uverbs_alloc_pd_resp resp;
 	struct ib_uobject             *uobj;
 	struct ib_pd                  *pd;
 	int                            ret;
@@ -438,29 +438,20 @@
 
 	pd->device  = ib_dev;
 	pd->uobject = uobj;
-	pd->__internal_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
 	pd->res.type = RDMA_RESTRACK_PD;
 
 	ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata);
 	if (ret)
 		goto err_alloc;
-
-	uobj->object = pd;
-	memset(&resp, 0, sizeof resp);
-	resp.pd_handle = uobj->id;
 	rdma_restrack_uadd(&pd->res);
 
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_copy;
+	uobj->object = pd;
+	uobj_finalize_uobj_create(uobj, attrs);
 
-	rdma_alloc_commit_uobject(uobj, attrs);
-	return 0;
+	resp.pd_handle = uobj->id;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 
-err_copy:
-	ib_dealloc_pd_user(pd, uverbs_get_cleared_udata(attrs));
-	pd = NULL;
 err_alloc:
 	kfree(pd);
 err:
@@ -568,15 +559,15 @@
 static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_device *ibudev = attrs->ufile->device;
+	struct ib_uverbs_open_xrcd_resp	resp = {};
 	struct ib_uverbs_open_xrcd	cmd;
-	struct ib_uverbs_open_xrcd_resp	resp;
 	struct ib_uxrcd_object         *obj;
 	struct ib_xrcd                 *xrcd = NULL;
-	struct fd			f = {NULL, 0};
 	struct inode                   *inode = NULL;
-	int				ret = 0;
 	int				new_xrcd = 0;
 	struct ib_device *ib_dev;
+	struct fd f = {};
+	int ret;
 
 	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
@@ -614,24 +605,16 @@
 	}
 
 	if (!xrcd) {
-		xrcd = ib_dev->ops.alloc_xrcd(ib_dev, &attrs->driver_udata);
+		xrcd = ib_alloc_xrcd_user(ib_dev, inode, &attrs->driver_udata);
 		if (IS_ERR(xrcd)) {
 			ret = PTR_ERR(xrcd);
 			goto err;
 		}
-
-		xrcd->inode   = inode;
-		xrcd->device  = ib_dev;
-		atomic_set(&xrcd->usecnt, 0);
-		mutex_init(&xrcd->tgt_qp_mutex);
-		INIT_LIST_HEAD(&xrcd->tgt_qp_list);
 		new_xrcd = 1;
 	}
 
 	atomic_set(&obj->refcnt, 0);
 	obj->uobject.object = xrcd;
-	memset(&resp, 0, sizeof resp);
-	resp.xrcd_handle = obj->uobject.id;
 
 	if (inode) {
 		if (new_xrcd) {
@@ -643,27 +626,17 @@
 		atomic_inc(&xrcd->usecnt);
 	}
 
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_copy;
-
 	if (f.file)
 		fdput(f);
 
 	mutex_unlock(&ibudev->xrcd_tree_mutex);
+	uobj_finalize_uobj_create(&obj->uobject, attrs);
 
-	rdma_alloc_commit_uobject(&obj->uobject, attrs);
-	return 0;
-
-err_copy:
-	if (inode) {
-		if (new_xrcd)
-			xrcd_table_delete(ibudev, inode);
-		atomic_dec(&xrcd->usecnt);
-	}
+	resp.xrcd_handle = obj->uobject.id;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 
 err_dealloc_xrcd:
-	ib_dealloc_xrcd(xrcd, uverbs_get_cleared_udata(attrs));
+	ib_dealloc_xrcd_user(xrcd, uverbs_get_cleared_udata(attrs));
 
 err:
 	uobj_alloc_abort(&obj->uobject, attrs);
@@ -701,7 +674,7 @@
 	if (inode && !atomic_dec_and_test(&xrcd->usecnt))
 		return 0;
 
-	ret = ib_dealloc_xrcd(xrcd, &attrs->driver_udata);
+	ret = ib_dealloc_xrcd_user(xrcd, &attrs->driver_udata);
 
 	if (ib_is_destroy_retryable(ret, why, uobject)) {
 		atomic_inc(&xrcd->usecnt);
@@ -716,8 +689,8 @@
 
 static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 {
+	struct ib_uverbs_reg_mr_resp resp = {};
 	struct ib_uverbs_reg_mr      cmd;
-	struct ib_uverbs_reg_mr_resp resp;
 	struct ib_uobject           *uobj;
 	struct ib_pd                *pd;
 	struct ib_mr                *mr;
@@ -770,30 +743,20 @@
 	mr->uobject = uobj;
 	atomic_inc(&pd->usecnt);
 	mr->res.type = RDMA_RESTRACK_MR;
+	mr->iova = cmd.hca_va;
 	rdma_restrack_uadd(&mr->res);
 
 	uobj->object = mr;
-
-	memset(&resp, 0, sizeof resp);
-	resp.lkey      = mr->lkey;
-	resp.rkey      = mr->rkey;
-	resp.mr_handle = uobj->id;
-
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_copy;
-
 	uobj_put_obj_read(pd);
+	uobj_finalize_uobj_create(uobj, attrs);
 
-	rdma_alloc_commit_uobject(uobj, attrs);
-	return 0;
-
-err_copy:
-	ib_dereg_mr_user(mr, uverbs_get_cleared_udata(attrs));
+	resp.lkey = mr->lkey;
+	resp.rkey = mr->rkey;
+	resp.mr_handle = uobj->id;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 
 err_put:
 	uobj_put_obj_read(pd);
-
 err_free:
 	uobj_alloc_abort(uobj, attrs);
 	return ret;
@@ -861,6 +824,9 @@
 		atomic_dec(&old_pd->usecnt);
 	}
 
+	if (cmd.flags & IB_MR_REREG_TRANS)
+		mr->iova = cmd.hca_va;
+
 	memset(&resp, 0, sizeof(resp));
 	resp.lkey      = mr->lkey;
 	resp.rkey      = mr->rkey;
@@ -930,21 +896,13 @@
 	atomic_inc(&pd->usecnt);
 
 	uobj->object = mw;
-
-	memset(&resp, 0, sizeof(resp));
-	resp.rkey      = mw->rkey;
-	resp.mw_handle = uobj->id;
-
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_copy;
-
 	uobj_put_obj_read(pd);
-	rdma_alloc_commit_uobject(uobj, attrs);
-	return 0;
+	uobj_finalize_uobj_create(uobj, attrs);
 
-err_copy:
-	uverbs_dealloc_mw(mw);
+	resp.rkey = mw->rkey;
+	resp.mw_handle = uobj->id;
+	return uverbs_response(attrs, &resp, sizeof(resp));
+
 err_put:
 	uobj_put_obj_read(pd);
 err_free:
@@ -981,40 +939,33 @@
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
-	resp.fd = uobj->id;
-
 	ev_file = container_of(uobj, struct ib_uverbs_completion_event_file,
 			       uobj);
 	ib_uverbs_init_event_queue(&ev_file->ev_queue);
+	uobj_finalize_uobj_create(uobj, attrs);
 
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret) {
-		uobj_alloc_abort(uobj, attrs);
-		return ret;
-	}
-
-	rdma_alloc_commit_uobject(uobj, attrs);
-	return 0;
+	resp.fd = uobj->id;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
-				       struct ib_uverbs_ex_create_cq *cmd)
+static int create_cq(struct uverbs_attr_bundle *attrs,
+		     struct ib_uverbs_ex_create_cq *cmd)
 {
 	struct ib_ucq_object           *obj;
 	struct ib_uverbs_completion_event_file    *ev_file = NULL;
 	struct ib_cq                   *cq;
 	int                             ret;
-	struct ib_uverbs_ex_create_cq_resp resp;
+	struct ib_uverbs_ex_create_cq_resp resp = {};
 	struct ib_cq_init_attr attr = {};
 	struct ib_device *ib_dev;
 
 	if (cmd->comp_vector >= attrs->ufile->device->num_comp_vectors)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, attrs,
 						 &ib_dev);
 	if (IS_ERR(obj))
-		return obj;
+		return PTR_ERR(obj);
 
 	if (cmd->comp_channel >= 0) {
 		ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, attrs);
@@ -1043,53 +994,38 @@
 	cq->event_handler = ib_uverbs_cq_event_handler;
 	cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
 	atomic_set(&cq->usecnt, 0);
+	cq->res.type = RDMA_RESTRACK_CQ;
 
 	ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
 	if (ret)
 		goto err_free;
+	rdma_restrack_uadd(&cq->res);
 
 	obj->uevent.uobject.object = cq;
 	obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file);
 	if (obj->uevent.event_file)
 		uverbs_uobject_get(&obj->uevent.event_file->uobj);
+	uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
 
-	memset(&resp, 0, sizeof resp);
 	resp.base.cq_handle = obj->uevent.uobject.id;
-	resp.base.cqe       = cq->cqe;
+	resp.base.cqe = cq->cqe;
 	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+	return uverbs_response(attrs, &resp, sizeof(resp));
 
-	cq->res.type = RDMA_RESTRACK_CQ;
-	rdma_restrack_uadd(&cq->res);
-
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_cb;
-
-	rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs);
-	return obj;
-
-err_cb:
-	if (obj->uevent.event_file)
-		uverbs_uobject_put(&obj->uevent.event_file->uobj);
-	ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs));
-	cq = NULL;
 err_free:
 	kfree(cq);
 err_file:
 	if (ev_file)
 		ib_uverbs_release_ucq(ev_file, obj);
-
 err:
 	uobj_alloc_abort(&obj->uevent.uobject, attrs);
-
-	return ERR_PTR(ret);
+	return ret;
 }
 
 static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_cq      cmd;
 	struct ib_uverbs_ex_create_cq	cmd_ex;
-	struct ib_ucq_object           *obj;
 	int ret;
 
 	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
@@ -1102,14 +1038,12 @@
 	cmd_ex.comp_vector = cmd.comp_vector;
 	cmd_ex.comp_channel = cmd.comp_channel;
 
-	obj = create_cq(attrs, &cmd_ex);
-	return PTR_ERR_OR_ZERO(obj);
+	return create_cq(attrs, &cmd_ex);
 }
 
 static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_ex_create_cq  cmd;
-	struct ib_ucq_object           *obj;
 	int ret;
 
 	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
@@ -1122,8 +1056,7 @@
 	if (cmd.reserved)
 		return -EINVAL;
 
-	obj = create_cq(attrs, &cmd);
-	return PTR_ERR_OR_ZERO(obj);
+	return create_cq(attrs, &cmd);
 }
 
 static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs)
@@ -1131,7 +1064,7 @@
 	struct ib_uverbs_resize_cq	cmd;
 	struct ib_uverbs_resize_cq_resp	resp = {};
 	struct ib_cq			*cq;
-	int				ret = -EINVAL;
+	int ret;
 
 	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
@@ -1298,7 +1231,7 @@
 	struct ib_srq			*srq = NULL;
 	struct ib_qp			*qp;
 	struct ib_qp_init_attr		attr = {};
-	struct ib_uverbs_ex_create_qp_resp resp;
+	struct ib_uverbs_ex_create_qp_resp resp = {};
 	int				ret;
 	struct ib_rwq_ind_table *ind_tbl = NULL;
 	bool has_sq = true;
@@ -1468,20 +1401,6 @@
 	if (obj->uevent.event_file)
 		uverbs_uobject_get(&obj->uevent.event_file->uobj);
 
-	memset(&resp, 0, sizeof resp);
-	resp.base.qpn             = qp->qp_num;
-	resp.base.qp_handle       = obj->uevent.uobject.id;
-	resp.base.max_recv_sge    = attr.cap.max_recv_sge;
-	resp.base.max_send_sge    = attr.cap.max_send_sge;
-	resp.base.max_recv_wr     = attr.cap.max_recv_wr;
-	resp.base.max_send_wr     = attr.cap.max_send_wr;
-	resp.base.max_inline_data = attr.cap.max_inline_data;
-	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
-
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_uevent;
-
 	if (xrcd) {
 		obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
 					  uobject);
@@ -1502,12 +1421,18 @@
 					UVERBS_LOOKUP_READ);
 	if (ind_tbl)
 		uobj_put_obj_read(ind_tbl);
+	uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
 
-	rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs);
-	return 0;
-err_uevent:
-	if (obj->uevent.event_file)
-		uverbs_uobject_put(&obj->uevent.event_file->uobj);
+	resp.base.qpn             = qp->qp_num;
+	resp.base.qp_handle       = obj->uevent.uobject.id;
+	resp.base.max_recv_sge    = attr.cap.max_recv_sge;
+	resp.base.max_send_sge    = attr.cap.max_send_sge;
+	resp.base.max_recv_wr     = attr.cap.max_recv_wr;
+	resp.base.max_send_wr     = attr.cap.max_send_wr;
+	resp.base.max_inline_data = attr.cap.max_inline_data;
+	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+	return uverbs_response(attrs, &resp, sizeof(resp));
+
 err_cb:
 	ib_destroy_qp_user(qp, uverbs_get_cleared_udata(attrs));
 
@@ -1580,14 +1505,14 @@
 
 static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs)
 {
+	struct ib_uverbs_create_qp_resp resp = {};
 	struct ib_uverbs_open_qp        cmd;
-	struct ib_uverbs_create_qp_resp resp;
 	struct ib_uqp_object           *obj;
 	struct ib_xrcd		       *xrcd;
-	struct ib_uobject	       *xrcd_uobj;
 	struct ib_qp                   *qp;
 	struct ib_qp_open_attr          attr = {};
 	int ret;
+	struct ib_uobject *xrcd_uobj;
 	struct ib_device *ib_dev;
 
 	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
@@ -1627,24 +1552,16 @@
 	obj->uevent.uobject.object = qp;
 	obj->uevent.uobject.user_handle = cmd.user_handle;
 
-	memset(&resp, 0, sizeof resp);
-	resp.qpn       = qp->qp_num;
-	resp.qp_handle = obj->uevent.uobject.id;
-
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_destroy;
-
 	obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
 	atomic_inc(&obj->uxrcd->refcnt);
 	qp->uobject = obj;
 	uobj_put_read(xrcd_uobj);
+	uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
 
-	rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs);
-	return 0;
+	resp.qpn = qp->qp_num;
+	resp.qp_handle = obj->uevent.uobject.id;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 
-err_destroy:
-	ib_destroy_qp_user(qp, uverbs_get_cleared_udata(attrs));
 err_xrcd:
 	uobj_put_read(xrcd_uobj);
 err_put:
@@ -1980,7 +1897,7 @@
 	 * Last bit is reserved for extending the attr_mask by
 	 * using another field.
 	 */
-	BUILD_BUG_ON(IB_USER_LAST_QP_ATTR_MASK == (1 << 31));
+	BUILD_BUG_ON(IB_USER_LAST_QP_ATTR_MASK == (1ULL << 31));
 
 	if (cmd.base.attr_mask &
 	    ~((IB_USER_LAST_QP_ATTR_MASK << 1) - 1))
@@ -2480,24 +2397,14 @@
 	ah->uobject  = uobj;
 	uobj->user_handle = cmd.user_handle;
 	uobj->object = ah;
+	uobj_put_obj_read(pd);
+	uobj_finalize_uobj_create(uobj, attrs);
 
 	resp.ah_handle = uobj->id;
-
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_copy;
-
-	uobj_put_obj_read(pd);
-	rdma_alloc_commit_uobject(uobj, attrs);
-	return 0;
-
-err_copy:
-	rdma_destroy_ah_user(ah, RDMA_DESTROY_AH_SLEEPABLE,
-			     uverbs_get_cleared_udata(attrs));
+	return uverbs_response(attrs, &resp, sizeof(resp));
 
 err_put:
 	uobj_put_obj_read(pd);
-
 err:
 	uobj_alloc_abort(uobj, attrs);
 	return ret;
@@ -2989,26 +2896,18 @@
 	if (obj->uevent.event_file)
 		uverbs_uobject_get(&obj->uevent.event_file->uobj);
 
-	memset(&resp, 0, sizeof(resp));
+	uobj_put_obj_read(pd);
+	rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
+				UVERBS_LOOKUP_READ);
+	uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
+
 	resp.wq_handle = obj->uevent.uobject.id;
 	resp.max_sge = wq_init_attr.max_sge;
 	resp.max_wr = wq_init_attr.max_wr;
 	resp.wqn = wq->wq_num;
 	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
-	err = uverbs_response(attrs, &resp, sizeof(resp));
-	if (err)
-		goto err_copy;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 
-	uobj_put_obj_read(pd);
-	rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
-				UVERBS_LOOKUP_READ);
-	rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs);
-	return 0;
-
-err_copy:
-	if (obj->uevent.event_file)
-		uverbs_uobject_put(&obj->uevent.event_file->uobj);
-	ib_destroy_wq(wq, uverbs_get_cleared_udata(attrs));
 err_put_cq:
 	rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
 				UVERBS_LOOKUP_READ);
@@ -3093,7 +2992,7 @@
 	struct ib_wq	**wqs = NULL;
 	u32 *wqs_handles = NULL;
 	struct ib_wq	*wq = NULL;
-	int i, j, num_read_wqs;
+	int i, num_read_wqs;
 	u32 num_wq_handles;
 	struct uverbs_req_iter iter;
 	struct ib_device *ib_dev;
@@ -3139,6 +3038,7 @@
 		}
 
 		wqs[num_read_wqs] = wq;
+		atomic_inc(&wqs[num_read_wqs]->usecnt);
 	}
 
 	uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, attrs, &ib_dev);
@@ -3166,33 +3066,24 @@
 	atomic_set(&rwq_ind_tbl->usecnt, 0);
 
 	for (i = 0; i < num_wq_handles; i++)
-		atomic_inc(&wqs[i]->usecnt);
+		rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject,
+					UVERBS_LOOKUP_READ);
+	kfree(wqs_handles);
+	uobj_finalize_uobj_create(uobj, attrs);
 
 	resp.ind_tbl_handle = uobj->id;
 	resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num;
 	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+	return uverbs_response(attrs, &resp, sizeof(resp));
 
-	err = uverbs_response(attrs, &resp, sizeof(resp));
-	if (err)
-		goto err_copy;
-
-	kfree(wqs_handles);
-
-	for (j = 0; j < num_read_wqs; j++)
-		rdma_lookup_put_uobject(&wqs[j]->uobject->uevent.uobject,
-					UVERBS_LOOKUP_READ);
-
-	rdma_alloc_commit_uobject(uobj, attrs);
-	return 0;
-
-err_copy:
-	ib_destroy_rwq_ind_table(rwq_ind_tbl);
 err_uobj:
 	uobj_alloc_abort(uobj, attrs);
 put_wqs:
-	for (j = 0; j < num_read_wqs; j++)
-		rdma_lookup_put_uobject(&wqs[j]->uobject->uevent.uobject,
+	for (i = 0; i < num_read_wqs; i++) {
+		rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject,
 					UVERBS_LOOKUP_READ);
+		atomic_dec(&wqs[i]->usecnt);
+	}
 err_free:
 	kfree(wqs_handles);
 	kfree(wqs);
@@ -3218,7 +3109,7 @@
 static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_flow	  cmd;
-	struct ib_uverbs_create_flow_resp resp;
+	struct ib_uverbs_create_flow_resp resp = {};
 	struct ib_uobject		  *uobj;
 	struct ib_flow			  *flow_id;
 	struct ib_uverbs_flow_attr	  *kern_flow_attr;
@@ -3351,23 +3242,17 @@
 
 	ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res);
 
-	memset(&resp, 0, sizeof(resp));
-	resp.flow_handle = uobj->id;
-
-	err = uverbs_response(attrs, &resp, sizeof(resp));
-	if (err)
-		goto err_copy;
-
 	rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
 				UVERBS_LOOKUP_READ);
 	kfree(flow_attr);
+
 	if (cmd.flow_attr.num_of_specs)
 		kfree(kern_flow_attr);
-	rdma_alloc_commit_uobject(uobj, attrs);
-	return 0;
-err_copy:
-	if (!qp->device->ops.destroy_flow(flow_id))
-		atomic_dec(&qp->usecnt);
+	uobj_finalize_uobj_create(uobj, attrs);
+
+	resp.flow_handle = uobj->id;
+	return uverbs_response(attrs, &resp, sizeof(resp));
+
 err_free:
 	ib_uverbs_flow_resources_free(uflow_res);
 err_free_flow_attr:
@@ -3402,13 +3287,13 @@
 				struct ib_uverbs_create_xsrq *cmd,
 				struct ib_udata *udata)
 {
-	struct ib_uverbs_create_srq_resp resp;
+	struct ib_uverbs_create_srq_resp resp = {};
 	struct ib_usrq_object           *obj;
 	struct ib_pd                    *pd;
 	struct ib_srq                   *srq;
-	struct ib_uobject               *xrcd_uobj;
 	struct ib_srq_init_attr          attr;
 	int ret;
+	struct ib_uobject *xrcd_uobj;
 	struct ib_device *ib_dev;
 
 	obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, attrs,
@@ -3473,17 +3358,9 @@
 	if (obj->uevent.event_file)
 		uverbs_uobject_get(&obj->uevent.event_file->uobj);
 
-	memset(&resp, 0, sizeof resp);
-	resp.srq_handle = obj->uevent.uobject.id;
-	resp.max_wr     = attr.attr.max_wr;
-	resp.max_sge    = attr.attr.max_sge;
 	if (cmd->srq_type == IB_SRQT_XRC)
 		resp.srqn = srq->ext.xrc.srq_num;
 
-	ret = uverbs_response(attrs, &resp, sizeof(resp));
-	if (ret)
-		goto err_copy;
-
 	if (cmd->srq_type == IB_SRQT_XRC)
 		uobj_put_read(xrcd_uobj);
 
@@ -3492,13 +3369,13 @@
 					UVERBS_LOOKUP_READ);
 
 	uobj_put_obj_read(pd);
-	rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs);
-	return 0;
+	uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
 
-err_copy:
-	if (obj->uevent.event_file)
-		uverbs_uobject_put(&obj->uevent.event_file->uobj);
-	ib_destroy_srq_user(srq, uverbs_get_cleared_udata(attrs));
+	resp.srq_handle = obj->uevent.uobject.id;
+	resp.max_wr = attr.attr.max_wr;
+	resp.max_sge = attr.attr.max_sge;
+	return uverbs_response(attrs, &resp, sizeof(resp));
+
 err_put_pd:
 	uobj_put_obj_read(pd);
 err_put_cq:
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 2d882c0..ef04a26 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -790,6 +790,7 @@
 	}
 	return uverbs_copy_to(bundle, idx, from, size);
 }
+EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero);
 
 /* Once called an abort will call through to the type's destroy_hw() */
 void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle,
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 69e4755..37794d8 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -601,6 +601,7 @@
 	memset(bundle.attr_present, 0, sizeof(bundle.attr_present));
 	bundle.ufile = file;
 	bundle.context = NULL; /* only valid if bundle has uobject */
+	bundle.uobject = NULL;
 	if (!method_elm->is_ex) {
 		size_t in_len = hdr.in_words * 4 - sizeof(hdr);
 		size_t out_len = hdr.out_words * 4;
@@ -664,6 +665,9 @@
 	}
 
 	ret = method_elm->handler(&bundle);
+	if (bundle.uobject)
+		uverbs_finalize_object(bundle.uobject, UVERBS_ACCESS_NEW, true,
+				       !ret, &bundle);
 out_unlock:
 	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 	return (ret) ? : count;
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
index 9f01330..c7e7438 100644
--- a/drivers/infiniband/core/uverbs_std_types_counters.c
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -46,7 +46,9 @@
 	if (ret)
 		return ret;
 
-	return counters->device->ops.destroy_counters(counters);
+	counters->device->ops.destroy_counters(counters);
+	kfree(counters);
+	return 0;
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(
@@ -66,20 +68,19 @@
 	if (!ib_dev->ops.create_counters)
 		return -EOPNOTSUPP;
 
-	counters = ib_dev->ops.create_counters(ib_dev, attrs);
-	if (IS_ERR(counters)) {
-		ret = PTR_ERR(counters);
-		goto err_create_counters;
-	}
+	counters = rdma_zalloc_drv_obj(ib_dev, ib_counters);
+	if (!counters)
+		return -ENOMEM;
 
 	counters->device = ib_dev;
 	counters->uobject = uobj;
 	uobj->object = counters;
 	atomic_set(&counters->usecnt, 0);
 
-	return 0;
+	ret = ib_dev->ops.create_counters(counters, attrs);
+	if (ret)
+		kfree(counters);
 
-err_create_counters:
 	return ret;
 }
 
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 5dce2c7..b1c7dac 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -207,11 +207,8 @@
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_CQ,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq),
-
-#if IS_ENABLED(CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI)
 	&UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE),
 	&UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY)
-#endif
 );
 
 const struct uapi_definition uverbs_def_obj_cq[] = {
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
index ae4a59d..75df2094 100644
--- a/drivers/infiniband/core/uverbs_std_types_device.c
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -38,7 +38,12 @@
 	    attrs->ucore.outlen < method_elm->resp_size)
 		return -ENOSPC;
 
-	return method_elm->handler(attrs);
+	attrs->uobject = NULL;
+	rc = method_elm->handler(attrs);
+	if (attrs->uobject)
+		uverbs_finalize_object(attrs->uobject, UVERBS_ACCESS_NEW, true,
+				       !rc, attrs);
+	return rc;
 }
 
 DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE,
@@ -229,6 +234,37 @@
 	return 0;
 }
 
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_CONTEXT)(
+	struct uverbs_attr_bundle *attrs)
+{
+	u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS;
+	struct ib_ucontext *ucontext;
+	struct ib_device *ib_dev;
+	u32 num_comp;
+	int ret;
+
+	ucontext = ib_uverbs_get_ucontext(attrs);
+	if (IS_ERR(ucontext))
+		return PTR_ERR(ucontext);
+	ib_dev = ucontext->device;
+
+	if (!ib_dev->ops.query_ucontext)
+		return -EOPNOTSUPP;
+
+	num_comp = attrs->ufile->device->num_comp_vectors;
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS,
+			     &num_comp, sizeof(num_comp));
+	if (IS_UVERBS_COPY_ERR(ret))
+		return ret;
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT,
+			     &core_support, sizeof(core_support));
+	if (IS_UVERBS_COPY_ERR(ret))
+		return ret;
+
+	return ucontext->device->ops.query_ucontext(ucontext, attrs);
+}
+
 DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_METHOD_GET_CONTEXT,
 	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS,
@@ -238,6 +274,13 @@
 	UVERBS_ATTR_UHW());
 
 DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_QUERY_CONTEXT,
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS,
+			    UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT,
+			    UVERBS_ATTR_TYPE(u64), UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_METHOD_INFO_HANDLES,
 	/* Also includes any device specific object ids */
 	UVERBS_ATTR_CONST_IN(UVERBS_ATTR_INFO_OBJECT_ID,
@@ -260,7 +303,8 @@
 			      &UVERBS_METHOD(UVERBS_METHOD_GET_CONTEXT),
 			      &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE),
 			      &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES),
-			      &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT));
+			      &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT),
+			      &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT));
 
 const struct uapi_definition uverbs_def_obj_device[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE),
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index a2722ef..9b22bb5 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -69,7 +69,7 @@
 
 	num_sge = uverbs_attr_ptr_get_array_size(
 		attrs, UVERBS_ATTR_ADVISE_MR_SGE_LIST, sizeof(struct ib_sge));
-	if (num_sge < 0)
+	if (num_sge <= 0)
 		return num_sge;
 
 	sg_list = uverbs_attr_get_alloced_ptr(attrs,
@@ -148,6 +148,36 @@
 	return ret;
 }
 
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_mr *mr =
+		uverbs_attr_get_obj(attrs, UVERBS_ATTR_QUERY_MR_HANDLE);
+	int ret;
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LKEY, &mr->lkey,
+			     sizeof(mr->lkey));
+	if (ret)
+		return ret;
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_RKEY,
+			     &mr->rkey, sizeof(mr->rkey));
+
+	if (ret)
+		return ret;
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LENGTH,
+			     &mr->length, sizeof(mr->length));
+
+	if (ret)
+		return ret;
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_IOVA,
+			     &mr->iova, sizeof(mr->iova));
+
+	return IS_UVERBS_COPY_ERR(ret) ? ret : 0;
+}
+
 DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_METHOD_ADVISE_MR,
 	UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
@@ -166,6 +196,25 @@
 			   UA_ALLOC_AND_COPY));
 
 DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_QUERY_MR,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_QUERY_MR_HANDLE,
+			UVERBS_OBJECT_MR,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_RKEY,
+			    UVERBS_ATTR_TYPE(u32),
+			    UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LKEY,
+			    UVERBS_ATTR_TYPE(u32),
+			    UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LENGTH,
+			    UVERBS_ATTR_TYPE(u64),
+			    UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_IOVA,
+			    UVERBS_ATTR_TYPE(u64),
+			    UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_METHOD_DM_MR_REG,
 	UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE,
 			UVERBS_OBJECT_MR,
@@ -206,7 +255,8 @@
 	UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
 	&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
 	&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
-	&UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR));
+	&UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR),
+	&UVERBS_METHOD(UVERBS_METHOD_QUERY_MR));
 
 const struct uapi_definition uverbs_def_obj_mr[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 53d6505c..3096e73 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -988,8 +988,8 @@
  * @srq_init_attr: A list of initial attributes required to create the
  *   SRQ.  If SRQ creation succeeds, then the attributes are updated to
  *   the actual capabilities of the created SRQ.
- * @uobject - uobject pointer if this is not a kernel SRQ
- * @udata - udata pointer if this is not a kernel SRQ
+ * @uobject: uobject pointer if this is not a kernel SRQ
+ * @udata: udata pointer if this is not a kernel SRQ
  *
  * srq_attr->max_wr and srq_attr->max_sge are read to determine the
  * requested size of the SRQ, and set to the actual values allocated
@@ -1090,13 +1090,6 @@
 	spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags);
 }
 
-static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
-{
-	mutex_lock(&xrcd->tgt_qp_mutex);
-	list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
-	mutex_unlock(&xrcd->tgt_qp_mutex);
-}
-
 static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
 				  void (*event_handler)(struct ib_event *, void *),
 				  void *qp_context)
@@ -1139,16 +1132,15 @@
 	if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
 		return ERR_PTR(-EINVAL);
 
-	qp = ERR_PTR(-EINVAL);
-	mutex_lock(&xrcd->tgt_qp_mutex);
-	list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
-		if (real_qp->qp_num == qp_open_attr->qp_num) {
-			qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
-					  qp_open_attr->qp_context);
-			break;
-		}
+	down_read(&xrcd->tgt_qps_rwsem);
+	real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num);
+	if (!real_qp) {
+		up_read(&xrcd->tgt_qps_rwsem);
+		return ERR_PTR(-EINVAL);
 	}
-	mutex_unlock(&xrcd->tgt_qp_mutex);
+	qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+			  qp_open_attr->qp_context);
+	up_read(&xrcd->tgt_qps_rwsem);
 	return qp;
 }
 EXPORT_SYMBOL(ib_open_qp);
@@ -1157,6 +1149,7 @@
 					struct ib_qp_init_attr *qp_init_attr)
 {
 	struct ib_qp *real_qp = qp;
+	int err;
 
 	qp->event_handler = __ib_shared_qp_event_handler;
 	qp->qp_context = qp;
@@ -1172,7 +1165,12 @@
 	if (IS_ERR(qp))
 		return qp;
 
-	__ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+	err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num,
+			      real_qp, GFP_KERNEL));
+	if (err) {
+		ib_close_qp(qp);
+		return ERR_PTR(err);
+	}
 	return qp;
 }
 
@@ -1712,7 +1710,7 @@
 		if (!(rdma_protocol_ib(qp->device,
 				       attr->alt_ah_attr.port_num) &&
 		      rdma_protocol_ib(qp->device, port))) {
-			ret = EINVAL;
+			ret = -EINVAL;
 			goto out;
 		}
 	}
@@ -1887,21 +1885,18 @@
 
 	real_qp = qp->real_qp;
 	xrcd = real_qp->xrcd;
-
-	mutex_lock(&xrcd->tgt_qp_mutex);
+	down_write(&xrcd->tgt_qps_rwsem);
 	ib_close_qp(qp);
 	if (atomic_read(&real_qp->usecnt) == 0)
-		list_del(&real_qp->xrcd_list);
+		xa_erase(&xrcd->tgt_qps, real_qp->qp_num);
 	else
 		real_qp = NULL;
-	mutex_unlock(&xrcd->tgt_qp_mutex);
+	up_write(&xrcd->tgt_qps_rwsem);
 
 	if (real_qp) {
 		ret = ib_destroy_qp(real_qp);
 		if (!ret)
 			atomic_dec(&xrcd->usecnt);
-		else
-			__ib_insert_xrcd_qp(xrcd, real_qp);
 	}
 
 	return 0;
@@ -2077,6 +2072,9 @@
 	if (!pd->device->ops.advise_mr)
 		return -EOPNOTSUPP;
 
+	if (!num_sge)
+		return 0;
+
 	return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
 					 NULL);
 }
@@ -2104,11 +2102,10 @@
 EXPORT_SYMBOL(ib_dereg_mr_user);
 
 /**
- * ib_alloc_mr_user() - Allocates a memory region
+ * ib_alloc_mr() - Allocates a memory region
  * @pd:            protection domain associated with the region
  * @mr_type:       memory region type
  * @max_num_sg:    maximum sg entries available for registration.
- * @udata:	   user data or null for kernel objects
  *
  * Notes:
  * Memory registration page/sg lists must not exceed max_num_sg.
@@ -2116,8 +2113,8 @@
  * max_num_sg * used_page_size.
  *
  */
-struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata)
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			  u32 max_num_sg)
 {
 	struct ib_mr *mr;
 
@@ -2132,25 +2129,26 @@
 		goto out;
 	}
 
-	mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata);
-	if (!IS_ERR(mr)) {
-		mr->device  = pd->device;
-		mr->pd      = pd;
-		mr->dm      = NULL;
-		mr->uobject = NULL;
-		atomic_inc(&pd->usecnt);
-		mr->need_inval = false;
-		mr->res.type = RDMA_RESTRACK_MR;
-		rdma_restrack_kadd(&mr->res);
-		mr->type = mr_type;
-		mr->sig_attrs = NULL;
-	}
+	mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg);
+	if (IS_ERR(mr))
+		goto out;
+
+	mr->device = pd->device;
+	mr->pd = pd;
+	mr->dm = NULL;
+	mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	mr->need_inval = false;
+	mr->res.type = RDMA_RESTRACK_MR;
+	rdma_restrack_kadd(&mr->res);
+	mr->type = mr_type;
+	mr->sig_attrs = NULL;
 
 out:
 	trace_mr_alloc(pd, mr_type, max_num_sg, mr);
 	return mr;
 }
-EXPORT_SYMBOL(ib_alloc_mr_user);
+EXPORT_SYMBOL(ib_alloc_mr);
 
 /**
  * ib_alloc_mr_integrity() - Allocates an integrity memory region
@@ -2288,45 +2286,57 @@
 }
 EXPORT_SYMBOL(ib_detach_mcast);
 
-struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)
+/**
+ * ib_alloc_xrcd_user - Allocates an XRC domain.
+ * @device: The device on which to allocate the XRC domain.
+ * @inode: inode to connect XRCD
+ * @udata: Valid user data or NULL for kernel object
+ */
+struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device,
+				   struct inode *inode, struct ib_udata *udata)
 {
 	struct ib_xrcd *xrcd;
+	int ret;
 
 	if (!device->ops.alloc_xrcd)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	xrcd = device->ops.alloc_xrcd(device, NULL);
-	if (!IS_ERR(xrcd)) {
-		xrcd->device = device;
-		xrcd->inode = NULL;
-		atomic_set(&xrcd->usecnt, 0);
-		mutex_init(&xrcd->tgt_qp_mutex);
-		INIT_LIST_HEAD(&xrcd->tgt_qp_list);
-	}
+	xrcd = rdma_zalloc_drv_obj(device, ib_xrcd);
+	if (!xrcd)
+		return ERR_PTR(-ENOMEM);
 
+	xrcd->device = device;
+	xrcd->inode = inode;
+	atomic_set(&xrcd->usecnt, 0);
+	init_rwsem(&xrcd->tgt_qps_rwsem);
+	xa_init(&xrcd->tgt_qps);
+
+	ret = device->ops.alloc_xrcd(xrcd, udata);
+	if (ret)
+		goto err;
 	return xrcd;
+err:
+	kfree(xrcd);
+	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL(__ib_alloc_xrcd);
+EXPORT_SYMBOL(ib_alloc_xrcd_user);
 
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
+/**
+ * ib_dealloc_xrcd_user - Deallocates an XRC domain.
+ * @xrcd: The XRC domain to deallocate.
+ * @udata: Valid user data or NULL for kernel object
+ */
+int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
-	struct ib_qp *qp;
-	int ret;
-
 	if (atomic_read(&xrcd->usecnt))
 		return -EBUSY;
 
-	while (!list_empty(&xrcd->tgt_qp_list)) {
-		qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
-		ret = ib_destroy_qp(qp);
-		if (ret)
-			return ret;
-	}
-	mutex_destroy(&xrcd->tgt_qp_mutex);
-
-	return xrcd->device->ops.dealloc_xrcd(xrcd, udata);
+	WARN_ON(!xa_empty(&xrcd->tgt_qps));
+	xrcd->device->ops.dealloc_xrcd(xrcd, udata);
+	kfree(xrcd);
+	return 0;
 }
-EXPORT_SYMBOL(ib_dealloc_xrcd);
+EXPORT_SYMBOL(ib_dealloc_xrcd_user);
 
 /**
  * ib_create_wq - Creates a WQ associated with the specified protection
@@ -2410,45 +2420,6 @@
 EXPORT_SYMBOL(ib_modify_wq);
 
 /*
- * ib_create_rwq_ind_table - Creates a RQ Indirection Table.
- * @device: The device on which to create the rwq indirection table.
- * @ib_rwq_ind_table_init_attr: A list of initial attributes required to
- * create the Indirection Table.
- *
- * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less
- *	than the created ib_rwq_ind_table object and the caller is responsible
- *	for its memory allocation/free.
- */
-struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
-						 struct ib_rwq_ind_table_init_attr *init_attr)
-{
-	struct ib_rwq_ind_table *rwq_ind_table;
-	int i;
-	u32 table_size;
-
-	if (!device->ops.create_rwq_ind_table)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	table_size = (1 << init_attr->log_ind_tbl_size);
-	rwq_ind_table = device->ops.create_rwq_ind_table(device,
-							 init_attr, NULL);
-	if (IS_ERR(rwq_ind_table))
-		return rwq_ind_table;
-
-	rwq_ind_table->ind_tbl = init_attr->ind_tbl;
-	rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size;
-	rwq_ind_table->device = device;
-	rwq_ind_table->uobject = NULL;
-	atomic_set(&rwq_ind_table->usecnt, 0);
-
-	for (i = 0; i < table_size; i++)
-		atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt);
-
-	return rwq_ind_table;
-}
-EXPORT_SYMBOL(ib_create_rwq_ind_table);
-
-/*
  * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table.
  * @wq_ind_table: The Indirection Table to destroy.
 */
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 8b6ad5c..3f18efc 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -842,16 +842,79 @@
 	}
 }
 
+static u16 bnxt_re_setup_rwqe_size(struct bnxt_qplib_qp *qplqp,
+				   int rsge, int max)
+{
+	if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC)
+		rsge = max;
+	return bnxt_re_get_rwqe_size(rsge);
+}
+
+static u16 bnxt_re_get_wqe_size(int ilsize, int nsge)
+{
+	u16 wqe_size, calc_ils;
+
+	wqe_size = bnxt_re_get_swqe_size(nsge);
+	if (ilsize) {
+		calc_ils = sizeof(struct sq_send_hdr) + ilsize;
+		wqe_size = max_t(u16, calc_ils, wqe_size);
+		wqe_size = ALIGN(wqe_size, sizeof(struct sq_send_hdr));
+	}
+	return wqe_size;
+}
+
+static int bnxt_re_setup_swqe_size(struct bnxt_re_qp *qp,
+				   struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_qplib_q *sq;
+	int align, ilsize;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	sq = &qplqp->sq;
+	dev_attr = &rdev->dev_attr;
+
+	align = sizeof(struct sq_send_hdr);
+	ilsize = ALIGN(init_attr->cap.max_inline_data, align);
+
+	sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge);
+	if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges))
+		return -EINVAL;
+	/* For gen p4 and gen p5 in backward compatibility mode, the
+	 * wqe size is fixed at 128 bytes.
+	 */
+	if (sq->wqe_size < bnxt_re_get_swqe_size(dev_attr->max_qp_sges) &&
+			qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC)
+		sq->wqe_size = bnxt_re_get_swqe_size(dev_attr->max_qp_sges);
+
+	if (init_attr->cap.max_inline_data) {
+		qplqp->max_inline_data = sq->wqe_size -
+			sizeof(struct sq_send_hdr);
+		init_attr->cap.max_inline_data = qplqp->max_inline_data;
+		if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC)
+			sq->max_sge = qplqp->max_inline_data /
+				sizeof(struct sq_sge);
+	}
+
+	return 0;
+}
+
 static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 				struct bnxt_re_qp *qp, struct ib_udata *udata)
 {
+	struct bnxt_qplib_qp *qplib_qp;
+	struct bnxt_re_ucontext *cntx;
 	struct bnxt_re_qp_req ureq;
-	struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp;
-	struct ib_umem *umem;
 	int bytes = 0, psn_sz;
-	struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
-		udata, struct bnxt_re_ucontext, ib_uctx);
+	struct ib_umem *umem;
+	int psn_nume;
 
+	qplib_qp = &qp->qplib_qp;
+	cntx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext,
+					 ib_uctx);
 	if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
 		return -EFAULT;
 
@@ -859,10 +922,15 @@
 	/* Consider mapping PSN search memory only for RC QPs. */
 	if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) {
 		psn_sz = bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
-					sizeof(struct sq_psn_search_ext) :
-					sizeof(struct sq_psn_search);
-		bytes += (qplib_qp->sq.max_wqe * psn_sz);
+						   sizeof(struct sq_psn_search_ext) :
+						   sizeof(struct sq_psn_search);
+		psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ?
+			    qplib_qp->sq.max_wqe :
+			    ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) /
+			      sizeof(struct bnxt_qplib_sge));
+		bytes += (psn_nume * psn_sz);
 	}
+
 	bytes = PAGE_ALIGN(bytes);
 	umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes,
 			   IB_ACCESS_LOCAL_WRITE);
@@ -975,7 +1043,7 @@
 	qp->qplib_qp.sig_type = true;
 
 	/* Shadow QP SQ depth should be same as QP1 RQ depth */
-	qp->qplib_qp.sq.wqe_size = bnxt_re_get_swqe_size();
+	qp->qplib_qp.sq.wqe_size = bnxt_re_get_wqe_size(0, 6);
 	qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe;
 	qp->qplib_qp.sq.max_sge = 2;
 	/* Q full delta can be 1 since it is internal QP */
@@ -986,7 +1054,7 @@
 	qp->qplib_qp.scq = qp1_qp->scq;
 	qp->qplib_qp.rcq = qp1_qp->rcq;
 
-	qp->qplib_qp.rq.wqe_size = bnxt_re_get_rwqe_size();
+	qp->qplib_qp.rq.wqe_size = bnxt_re_get_rwqe_size(6);
 	qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe;
 	qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge;
 	/* Q full delta can be 1 since it is internal QP */
@@ -1041,19 +1109,21 @@
 		qplqp->srq = &srq->qplib_srq;
 		rq->max_wqe = 0;
 	} else {
-		rq->wqe_size = bnxt_re_get_rwqe_size();
+		rq->max_sge = init_attr->cap.max_recv_sge;
+		if (rq->max_sge > dev_attr->max_qp_sges)
+			rq->max_sge = dev_attr->max_qp_sges;
+		init_attr->cap.max_recv_sge = rq->max_sge;
+		rq->wqe_size = bnxt_re_setup_rwqe_size(qplqp, rq->max_sge,
+						       dev_attr->max_qp_sges);
 		/* Allocate 1 more than what's provided so posting max doesn't
 		 * mean empty.
 		 */
 		entries = roundup_pow_of_two(init_attr->cap.max_recv_wr + 1);
 		rq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1);
-		rq->q_full_delta = rq->max_wqe - init_attr->cap.max_recv_wr;
-		rq->max_sge = init_attr->cap.max_recv_sge;
-		if (rq->max_sge > dev_attr->max_qp_sges)
-			rq->max_sge = dev_attr->max_qp_sges;
+		rq->q_full_delta = 0;
+		rq->sg_info.pgsize = PAGE_SIZE;
+		rq->sg_info.pgshft = PAGE_SHIFT;
 	}
-	rq->sg_info.pgsize = PAGE_SIZE;
-	rq->sg_info.pgshft = PAGE_SHIFT;
 
 	return 0;
 }
@@ -1068,41 +1138,48 @@
 	qplqp = &qp->qplib_qp;
 	dev_attr = &rdev->dev_attr;
 
-	qplqp->rq.max_sge = dev_attr->max_qp_sges;
-	if (qplqp->rq.max_sge > dev_attr->max_qp_sges)
+	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
 		qplqp->rq.max_sge = dev_attr->max_qp_sges;
-	qplqp->rq.max_sge = 6;
+		if (qplqp->rq.max_sge > dev_attr->max_qp_sges)
+			qplqp->rq.max_sge = dev_attr->max_qp_sges;
+		qplqp->rq.max_sge = 6;
+	}
 }
 
-static void bnxt_re_init_sq_attr(struct bnxt_re_qp *qp,
-				 struct ib_qp_init_attr *init_attr,
-				 struct ib_udata *udata)
+static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
 {
 	struct bnxt_qplib_dev_attr *dev_attr;
 	struct bnxt_qplib_qp *qplqp;
 	struct bnxt_re_dev *rdev;
 	struct bnxt_qplib_q *sq;
 	int entries;
+	int diff;
+	int rc;
 
 	rdev = qp->rdev;
 	qplqp = &qp->qplib_qp;
 	sq = &qplqp->sq;
 	dev_attr = &rdev->dev_attr;
 
-	sq->wqe_size = bnxt_re_get_swqe_size();
 	sq->max_sge = init_attr->cap.max_send_sge;
-	if (sq->max_sge > dev_attr->max_qp_sges)
+	if (sq->max_sge > dev_attr->max_qp_sges) {
 		sq->max_sge = dev_attr->max_qp_sges;
-	/*
-	 * Change the SQ depth if user has requested minimum using
-	 * configfs. Only supported for kernel consumers
-	 */
+		init_attr->cap.max_send_sge = sq->max_sge;
+	}
+
+	rc = bnxt_re_setup_swqe_size(qp, init_attr);
+	if (rc)
+		return rc;
+
 	entries = init_attr->cap.max_send_wr;
 	/* Allocate 128 + 1 more than what's provided */
-	entries = roundup_pow_of_two(entries + BNXT_QPLIB_RESERVED_QP_WRS + 1);
-	sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes +
-			    BNXT_QPLIB_RESERVED_QP_WRS + 1);
-	sq->q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1;
+	diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ?
+		0 : BNXT_QPLIB_RESERVED_QP_WRS;
+	entries = roundup_pow_of_two(entries + diff + 1);
+	sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1);
+	sq->q_full_delta = diff + 1;
 	/*
 	 * Reserving one slot for Phantom WQE. Application can
 	 * post one extra entry in this case. But allowing this to avoid
@@ -1111,6 +1188,8 @@
 	qplqp->sq.q_full_delta -= 1;
 	qplqp->sq.sg_info.pgsize = PAGE_SIZE;
 	qplqp->sq.sg_info.pgshft = PAGE_SHIFT;
+
+	return 0;
 }
 
 static void bnxt_re_adjust_gsi_sq_attr(struct bnxt_re_qp *qp,
@@ -1125,13 +1204,16 @@
 	qplqp = &qp->qplib_qp;
 	dev_attr = &rdev->dev_attr;
 
-	entries = roundup_pow_of_two(init_attr->cap.max_send_wr + 1);
-	qplqp->sq.max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1);
-	qplqp->sq.q_full_delta = qplqp->sq.max_wqe -
-				 init_attr->cap.max_send_wr;
-	qplqp->sq.max_sge++; /* Need one extra sge to put UD header */
-	if (qplqp->sq.max_sge > dev_attr->max_qp_sges)
-		qplqp->sq.max_sge = dev_attr->max_qp_sges;
+	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
+		entries = roundup_pow_of_two(init_attr->cap.max_send_wr + 1);
+		qplqp->sq.max_wqe = min_t(u32, entries,
+					  dev_attr->max_qp_wqes + 1);
+		qplqp->sq.q_full_delta = qplqp->sq.max_wqe -
+			init_attr->cap.max_send_wr;
+		qplqp->sq.max_sge++; /* Need one extra sge to put UD header */
+		if (qplqp->sq.max_sge > dev_attr->max_qp_sges)
+			qplqp->sq.max_sge = dev_attr->max_qp_sges;
+	}
 }
 
 static int bnxt_re_init_qp_type(struct bnxt_re_dev *rdev,
@@ -1183,6 +1265,7 @@
 		goto out;
 	}
 	qplqp->type = (u8)qptype;
+	qplqp->wqe_mode = rdev->chip_ctx->modes.wqe_mode;
 
 	if (init_attr->qp_type == IB_QPT_RC) {
 		qplqp->max_rd_atomic = dev_attr->max_qp_rd_atom;
@@ -1226,7 +1309,9 @@
 		bnxt_re_adjust_gsi_rq_attr(qp);
 
 	/* Setup SQ */
-	bnxt_re_init_sq_attr(qp, init_attr, udata);
+	rc = bnxt_re_init_sq_attr(qp, init_attr, udata);
+	if (rc)
+		goto out;
 	if (init_attr->qp_type == IB_QPT_GSI)
 		bnxt_re_adjust_gsi_sq_attr(qp, init_attr);
 
@@ -1574,8 +1659,9 @@
 		entries = dev_attr->max_srq_wqes + 1;
 	srq->qplib_srq.max_wqe = entries;
 
-	srq->qplib_srq.wqe_size = bnxt_re_get_rwqe_size();
 	srq->qplib_srq.max_sge = srq_init_attr->attr.max_sge;
+	srq->qplib_srq.wqe_size =
+			bnxt_re_get_rwqe_size(srq->qplib_srq.max_sge);
 	srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit;
 	srq->srq_limit = srq_init_attr->attr.srq_limit;
 	srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id;
@@ -3569,7 +3655,7 @@
 }
 
 struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
-			       u32 max_num_sg, struct ib_udata *udata)
+			       u32 max_num_sg)
 {
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
 	struct bnxt_re_dev *rdev = pd->rdev;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index e5fbbeb..1daeb30 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -136,14 +136,14 @@
 	spinlock_t		sh_lock;	/* protect shpg */
 };
 
-static inline u16 bnxt_re_get_swqe_size(void)
+static inline u16 bnxt_re_get_swqe_size(int nsge)
 {
-	return sizeof(struct sq_send);
+	return sizeof(struct sq_send_hdr) + nsge * sizeof(struct sq_sge);
 }
 
-static inline u16 bnxt_re_get_rwqe_size(void)
+static inline u16 bnxt_re_get_rwqe_size(int nsge)
 {
-	return sizeof(struct rq_wqe);
+	return sizeof(struct rq_wqe_hdr) + (nsge * sizeof(struct sq_sge));
 }
 
 int bnxt_re_query_device(struct ib_device *ibdev,
@@ -201,7 +201,7 @@
 int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
 struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata);
+			       u32 max_num_sg);
 int bnxt_re_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
 			       struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index b12fbc8..dad0df8 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -82,6 +82,15 @@
 static void bnxt_re_dealloc_driver(struct ib_device *ib_dev);
 static void bnxt_re_stop_irq(void *handle);
 
+static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode)
+{
+	struct bnxt_qplib_chip_ctx *cctx;
+
+	cctx = rdev->chip_ctx;
+	cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
+			       mode : BNXT_QPLIB_WQE_MODE_STATIC;
+}
+
 static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
 {
 	struct bnxt_qplib_chip_ctx *chip_ctx;
@@ -97,7 +106,7 @@
 	kfree(chip_ctx);
 }
 
-static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev)
+static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode)
 {
 	struct bnxt_qplib_chip_ctx *chip_ctx;
 	struct bnxt_en_dev *en_dev;
@@ -117,6 +126,7 @@
 	rdev->qplib_res.cctx = rdev->chip_ctx;
 	rdev->rcfw.res = &rdev->qplib_res;
 
+	bnxt_re_set_drv_mode(rdev, wqe_mode);
 	return 0;
 }
 
@@ -1386,7 +1396,7 @@
 	schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
 }
 
-static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
+static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode)
 {
 	struct bnxt_qplib_creq_ctx *creq;
 	struct bnxt_re_ring_attr rattr;
@@ -1406,7 +1416,7 @@
 	}
 	set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
 
-	rc = bnxt_re_setup_chip_ctx(rdev);
+	rc = bnxt_re_setup_chip_ctx(rdev, wqe_mode);
 	if (rc) {
 		ibdev_err(&rdev->ibdev, "Failed to get chip context\n");
 		return -EINVAL;
@@ -1585,7 +1595,7 @@
 }
 
 static int bnxt_re_add_device(struct bnxt_re_dev **rdev,
-			      struct net_device *netdev)
+			      struct net_device *netdev, u8 wqe_mode)
 {
 	int rc;
 
@@ -1599,7 +1609,7 @@
 	}
 
 	pci_dev_get((*rdev)->en_dev->pdev);
-	rc = bnxt_re_dev_init(*rdev);
+	rc = bnxt_re_dev_init(*rdev, wqe_mode);
 	if (rc) {
 		pci_dev_put((*rdev)->en_dev->pdev);
 		bnxt_re_dev_unreg(*rdev);
@@ -1711,7 +1721,8 @@
 	case NETDEV_REGISTER:
 		if (rdev)
 			break;
-		rc = bnxt_re_add_device(&rdev, real_dev);
+		rc = bnxt_re_add_device(&rdev, real_dev,
+					BNXT_QPLIB_WQE_MODE_STATIC);
 		if (!rc)
 			sch_work = true;
 		release = false;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index c5e2957..117b423 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -178,11 +178,11 @@
 
 	if (qp->rq_hdr_buf)
 		dma_free_coherent(&res->pdev->dev,
-				  rq->hwq.max_elements * qp->rq_hdr_buf_size,
+				  rq->max_wqe * qp->rq_hdr_buf_size,
 				  qp->rq_hdr_buf, qp->rq_hdr_buf_map);
 	if (qp->sq_hdr_buf)
 		dma_free_coherent(&res->pdev->dev,
-				  sq->hwq.max_elements * qp->sq_hdr_buf_size,
+				  sq->max_wqe * qp->sq_hdr_buf_size,
 				  qp->sq_hdr_buf, qp->sq_hdr_buf_map);
 	qp->rq_hdr_buf = NULL;
 	qp->sq_hdr_buf = NULL;
@@ -199,10 +199,9 @@
 	struct bnxt_qplib_q *sq = &qp->sq;
 	int rc = 0;
 
-	if (qp->sq_hdr_buf_size && sq->hwq.max_elements) {
+	if (qp->sq_hdr_buf_size && sq->max_wqe) {
 		qp->sq_hdr_buf = dma_alloc_coherent(&res->pdev->dev,
-					sq->hwq.max_elements *
-					qp->sq_hdr_buf_size,
+					sq->max_wqe * qp->sq_hdr_buf_size,
 					&qp->sq_hdr_buf_map, GFP_KERNEL);
 		if (!qp->sq_hdr_buf) {
 			rc = -ENOMEM;
@@ -212,9 +211,9 @@
 		}
 	}
 
-	if (qp->rq_hdr_buf_size && rq->hwq.max_elements) {
+	if (qp->rq_hdr_buf_size && rq->max_wqe) {
 		qp->rq_hdr_buf = dma_alloc_coherent(&res->pdev->dev,
-						    rq->hwq.max_elements *
+						    rq->max_wqe *
 						    qp->rq_hdr_buf_size,
 						    &qp->rq_hdr_buf_map,
 						    GFP_KERNEL);
@@ -661,6 +660,7 @@
 	srq->dbinfo.hwq = &srq->hwq;
 	srq->dbinfo.xid = srq->id;
 	srq->dbinfo.db = srq->dpi->dbr;
+	srq->dbinfo.max_slot = 1;
 	srq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem;
 	if (srq->threshold)
 		bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA);
@@ -784,6 +784,28 @@
 }
 
 /* QP */
+
+static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que)
+{
+	int rc = 0;
+	int indx;
+
+	que->swq = kcalloc(que->max_wqe, sizeof(*que->swq), GFP_KERNEL);
+	if (!que->swq) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	que->swq_start = 0;
+	que->swq_last = que->max_wqe - 1;
+	for (indx = 0; indx < que->max_wqe; indx++)
+		que->swq[indx].next_idx = indx + 1;
+	que->swq[que->swq_last].next_idx = 0; /* Make it circular */
+	que->swq_last = 0;
+out:
+	return rc;
+}
+
 int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_hwq_attr hwq_attr = {};
@@ -808,71 +830,63 @@
 	/* SQ */
 	hwq_attr.res = res;
 	hwq_attr.sginfo = &sq->sg_info;
-	hwq_attr.depth = sq->max_wqe;
-	hwq_attr.stride = sq->wqe_size;
+	hwq_attr.stride = sizeof(struct sq_sge);
+	hwq_attr.depth = bnxt_qplib_get_depth(sq);
 	hwq_attr.type = HWQ_TYPE_QUEUE;
 	rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
-	sq->swq = kcalloc(sq->hwq.max_elements, sizeof(*sq->swq), GFP_KERNEL);
-	if (!sq->swq) {
-		rc = -ENOMEM;
+	rc = bnxt_qplib_alloc_init_swq(sq);
+	if (rc)
 		goto fail_sq;
-	}
+
+	req.sq_size = cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode));
 	pbl = &sq->hwq.pbl[PBL_LVL_0];
 	req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
 	pg_sz_lvl = (bnxt_qplib_base_pg_size(&sq->hwq) <<
 		     CMDQ_CREATE_QP1_SQ_PG_SIZE_SFT);
 	pg_sz_lvl |= (sq->hwq.level & CMDQ_CREATE_QP1_SQ_LVL_MASK);
 	req.sq_pg_size_sq_lvl = pg_sz_lvl;
+	req.sq_fwo_sq_sge =
+		cpu_to_le16((sq->max_sge & CMDQ_CREATE_QP1_SQ_SGE_MASK) <<
+			     CMDQ_CREATE_QP1_SQ_SGE_SFT);
+	req.scq_cid = cpu_to_le32(qp->scq->id);
 
-	if (qp->scq)
-		req.scq_cid = cpu_to_le32(qp->scq->id);
 	/* RQ */
 	if (rq->max_wqe) {
 		hwq_attr.res = res;
 		hwq_attr.sginfo = &rq->sg_info;
-		hwq_attr.stride = rq->wqe_size;
-		hwq_attr.depth = qp->rq.max_wqe;
+		hwq_attr.stride = sizeof(struct sq_sge);
+		hwq_attr.depth = bnxt_qplib_get_depth(rq);
 		hwq_attr.type = HWQ_TYPE_QUEUE;
 		rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr);
 		if (rc)
-			goto fail_sq;
-
-		rq->swq = kcalloc(rq->hwq.max_elements, sizeof(*rq->swq),
-				  GFP_KERNEL);
-		if (!rq->swq) {
-			rc = -ENOMEM;
+			goto sq_swq;
+		rc = bnxt_qplib_alloc_init_swq(rq);
+		if (rc)
 			goto fail_rq;
-		}
+		req.rq_size = cpu_to_le32(rq->max_wqe);
 		pbl = &rq->hwq.pbl[PBL_LVL_0];
 		req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
 		pg_sz_lvl = (bnxt_qplib_base_pg_size(&rq->hwq) <<
 			     CMDQ_CREATE_QP1_RQ_PG_SIZE_SFT);
 		pg_sz_lvl |= (rq->hwq.level & CMDQ_CREATE_QP1_RQ_LVL_MASK);
 		req.rq_pg_size_rq_lvl = pg_sz_lvl;
-		if (qp->rcq)
-			req.rcq_cid = cpu_to_le32(qp->rcq->id);
+		req.rq_fwo_rq_sge =
+			cpu_to_le16((rq->max_sge &
+				     CMDQ_CREATE_QP1_RQ_SGE_MASK) <<
+				    CMDQ_CREATE_QP1_RQ_SGE_SFT);
 	}
+	req.rcq_cid = cpu_to_le32(qp->rcq->id);
 	/* Header buffer - allow hdr_buf pass in */
 	rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp);
 	if (rc) {
 		rc = -ENOMEM;
-		goto fail;
+		goto rq_rwq;
 	}
 	qp_flags |= CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE;
 	req.qp_flags = cpu_to_le32(qp_flags);
-	req.sq_size = cpu_to_le32(sq->hwq.max_elements);
-	req.rq_size = cpu_to_le32(rq->hwq.max_elements);
-
-	req.sq_fwo_sq_sge =
-		cpu_to_le16((sq->max_sge & CMDQ_CREATE_QP1_SQ_SGE_MASK) <<
-			    CMDQ_CREATE_QP1_SQ_SGE_SFT);
-	req.rq_fwo_rq_sge =
-		cpu_to_le16((rq->max_sge & CMDQ_CREATE_QP1_RQ_SGE_MASK) <<
-			    CMDQ_CREATE_QP1_RQ_SGE_SFT);
-
 	req.pd_id = cpu_to_le32(qp->pd->id);
 
 	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
@@ -886,10 +900,12 @@
 	sq->dbinfo.hwq = &sq->hwq;
 	sq->dbinfo.xid = qp->id;
 	sq->dbinfo.db = qp->dpi->dbr;
+	sq->dbinfo.max_slot = bnxt_qplib_set_sq_max_slot(qp->wqe_mode);
 	if (rq->max_wqe) {
 		rq->dbinfo.hwq = &rq->hwq;
 		rq->dbinfo.xid = qp->id;
 		rq->dbinfo.db = qp->dpi->dbr;
+		rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size);
 	}
 	rcfw->qp_tbl[qp->id].qp_id = qp->id;
 	rcfw->qp_tbl[qp->id].qp_handle = (void *)qp;
@@ -898,12 +914,14 @@
 
 fail:
 	bnxt_qplib_free_qp_hdr_buf(res, qp);
+rq_rwq:
+	kfree(rq->swq);
 fail_rq:
 	bnxt_qplib_free_hwq(res, &rq->hwq);
-	kfree(rq->swq);
+sq_swq:
+	kfree(sq->swq);
 fail_sq:
 	bnxt_qplib_free_hwq(res, &sq->hwq);
-	kfree(sq->swq);
 exit:
 	return rc;
 }
@@ -912,26 +930,18 @@
 {
 	struct bnxt_qplib_hwq *hwq;
 	struct bnxt_qplib_q *sq;
-	u64 fpsne, psne, psn_pg;
-	u16 indx_pad = 0, indx;
-	u16 pg_num, pg_indx;
-	u64 *page;
+	u64 fpsne, psn_pg;
+	u16 indx_pad = 0;
 
 	sq = &qp->sq;
 	hwq = &sq->hwq;
-
-	fpsne = (u64)bnxt_qplib_get_qe(hwq, hwq->max_elements, &psn_pg);
+	fpsne = (u64)bnxt_qplib_get_qe(hwq, hwq->depth, &psn_pg);
 	if (!IS_ALIGNED(fpsne, PAGE_SIZE))
 		indx_pad = ALIGN(fpsne, PAGE_SIZE) / size;
 
-	page = (u64 *)psn_pg;
-	for (indx = 0; indx < hwq->max_elements; indx++) {
-		pg_num = (indx + indx_pad) / (PAGE_SIZE / size);
-		pg_indx = (indx + indx_pad) % (PAGE_SIZE / size);
-		psne = page[pg_num] + pg_indx * size;
-		sq->swq[indx].psn_ext = (struct sq_psn_search_ext *)psne;
-		sq->swq[indx].psn_search = (struct sq_psn_search *)psne;
-	}
+	hwq->pad_pgofft = indx_pad;
+	hwq->pad_pg = (u64 *)psn_pg;
+	hwq->pad_stride = size;
 }
 
 int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
@@ -944,12 +954,12 @@
 	struct creq_create_qp_resp resp;
 	int rc, req_size, psn_sz = 0;
 	struct bnxt_qplib_hwq *xrrq;
-	u16 cmd_flags = 0, max_ssge;
 	struct bnxt_qplib_pbl *pbl;
 	struct cmdq_create_qp req;
+	u16 cmd_flags = 0;
 	u32 qp_flags = 0;
 	u8 pg_sz_lvl;
-	u16 max_rsge;
+	u16 nsge;
 
 	RCFW_CMD_PREP(req, CREATE_QP, cmd_flags);
 
@@ -967,97 +977,78 @@
 
 	hwq_attr.res = res;
 	hwq_attr.sginfo = &sq->sg_info;
-	hwq_attr.stride = sq->wqe_size;
-	hwq_attr.depth = sq->max_wqe;
+	hwq_attr.stride = sizeof(struct sq_sge);
+	hwq_attr.depth = bnxt_qplib_get_depth(sq);
 	hwq_attr.aux_stride = psn_sz;
-	hwq_attr.aux_depth = hwq_attr.depth;
+	hwq_attr.aux_depth = bnxt_qplib_set_sq_size(sq, qp->wqe_mode);
 	hwq_attr.type = HWQ_TYPE_QUEUE;
 	rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
-	sq->swq = kcalloc(sq->hwq.max_elements, sizeof(*sq->swq), GFP_KERNEL);
-	if (!sq->swq) {
-		rc = -ENOMEM;
+	rc = bnxt_qplib_alloc_init_swq(sq);
+	if (rc)
 		goto fail_sq;
-	}
 
 	if (psn_sz)
 		bnxt_qplib_init_psn_ptr(qp, psn_sz);
 
+	req.sq_size = cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode));
 	pbl = &sq->hwq.pbl[PBL_LVL_0];
 	req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
 	pg_sz_lvl = (bnxt_qplib_base_pg_size(&sq->hwq) <<
 		     CMDQ_CREATE_QP_SQ_PG_SIZE_SFT);
 	pg_sz_lvl |= (sq->hwq.level & CMDQ_CREATE_QP_SQ_LVL_MASK);
 	req.sq_pg_size_sq_lvl = pg_sz_lvl;
-
-	if (qp->scq)
-		req.scq_cid = cpu_to_le32(qp->scq->id);
+	req.sq_fwo_sq_sge =
+		cpu_to_le16(((sq->max_sge & CMDQ_CREATE_QP_SQ_SGE_MASK) <<
+			     CMDQ_CREATE_QP_SQ_SGE_SFT) | 0);
+	req.scq_cid = cpu_to_le32(qp->scq->id);
 
 	/* RQ */
-	if (rq->max_wqe) {
+	if (!qp->srq) {
 		hwq_attr.res = res;
 		hwq_attr.sginfo = &rq->sg_info;
-		hwq_attr.stride = rq->wqe_size;
-		hwq_attr.depth = rq->max_wqe;
+		hwq_attr.stride = sizeof(struct sq_sge);
+		hwq_attr.depth = bnxt_qplib_get_depth(rq);
 		hwq_attr.aux_stride = 0;
 		hwq_attr.aux_depth = 0;
 		hwq_attr.type = HWQ_TYPE_QUEUE;
 		rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr);
 		if (rc)
-			goto fail_sq;
-
-		rq->swq = kcalloc(rq->hwq.max_elements, sizeof(*rq->swq),
-				  GFP_KERNEL);
-		if (!rq->swq) {
-			rc = -ENOMEM;
+			goto sq_swq;
+		rc = bnxt_qplib_alloc_init_swq(rq);
+		if (rc)
 			goto fail_rq;
-		}
+
+		req.rq_size = cpu_to_le32(rq->max_wqe);
 		pbl = &rq->hwq.pbl[PBL_LVL_0];
 		req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
 		pg_sz_lvl = (bnxt_qplib_base_pg_size(&rq->hwq) <<
 			     CMDQ_CREATE_QP_RQ_PG_SIZE_SFT);
 		pg_sz_lvl |= (rq->hwq.level & CMDQ_CREATE_QP_RQ_LVL_MASK);
 		req.rq_pg_size_rq_lvl = pg_sz_lvl;
+		nsge = (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ?
+			6 : rq->max_sge;
+		req.rq_fwo_rq_sge =
+			cpu_to_le16(((nsge &
+				      CMDQ_CREATE_QP_RQ_SGE_MASK) <<
+				     CMDQ_CREATE_QP_RQ_SGE_SFT) | 0);
 	} else {
 		/* SRQ */
-		if (qp->srq) {
-			qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED;
-			req.srq_cid = cpu_to_le32(qp->srq->id);
-		}
+		qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED;
+		req.srq_cid = cpu_to_le32(qp->srq->id);
 	}
-
-	if (qp->rcq)
-		req.rcq_cid = cpu_to_le32(qp->rcq->id);
+	req.rcq_cid = cpu_to_le32(qp->rcq->id);
 
 	qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE;
 	qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED;
 	if (qp->sig_type)
 		qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION;
+	if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE)
+		qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_VARIABLE_SIZED_WQE_ENABLED;
 	req.qp_flags = cpu_to_le32(qp_flags);
 
-	req.sq_size = cpu_to_le32(sq->hwq.max_elements);
-	req.rq_size = cpu_to_le32(rq->hwq.max_elements);
-	qp->sq_hdr_buf = NULL;
-	qp->rq_hdr_buf = NULL;
-
-	rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp);
-	if (rc)
-		goto fail_rq;
-
-	/* CTRL-22434: Irrespective of the requested SGE count on the SQ
-	 * always create the QP with max send sges possible if the requested
-	 * inline size is greater than 0.
-	 */
-	max_ssge = qp->max_inline_data ? 6 : sq->max_sge;
-	req.sq_fwo_sq_sge = cpu_to_le16(
-				((max_ssge & CMDQ_CREATE_QP_SQ_SGE_MASK)
-				 << CMDQ_CREATE_QP_SQ_SGE_SFT) | 0);
-	max_rsge = bnxt_qplib_is_chip_gen_p5(res->cctx) ? 6 : rq->max_sge;
-	req.rq_fwo_rq_sge = cpu_to_le16(
-				((max_rsge & CMDQ_CREATE_QP_RQ_SGE_MASK)
-				 << CMDQ_CREATE_QP_RQ_SGE_SFT) | 0);
 	/* ORRQ and IRRQ */
 	if (psn_sz) {
 		xrrq = &qp->orrq;
@@ -1078,7 +1069,7 @@
 		hwq_attr.type = HWQ_TYPE_CTX;
 		rc = bnxt_qplib_alloc_init_hwq(xrrq, &hwq_attr);
 		if (rc)
-			goto fail_buf_free;
+			goto rq_swq;
 		pbl = &xrrq->pbl[PBL_LVL_0];
 		req.orrq_addr = cpu_to_le64(pbl->pg_map_arr[0]);
 
@@ -1113,30 +1104,29 @@
 	sq->dbinfo.hwq = &sq->hwq;
 	sq->dbinfo.xid = qp->id;
 	sq->dbinfo.db = qp->dpi->dbr;
+	sq->dbinfo.max_slot = bnxt_qplib_set_sq_max_slot(qp->wqe_mode);
 	if (rq->max_wqe) {
 		rq->dbinfo.hwq = &rq->hwq;
 		rq->dbinfo.xid = qp->id;
 		rq->dbinfo.db = qp->dpi->dbr;
+		rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size);
 	}
 	rcfw->qp_tbl[qp->id].qp_id = qp->id;
 	rcfw->qp_tbl[qp->id].qp_handle = (void *)qp;
 
 	return 0;
-
 fail:
-	if (qp->irrq.max_elements)
-		bnxt_qplib_free_hwq(res, &qp->irrq);
+	bnxt_qplib_free_hwq(res, &qp->irrq);
 fail_orrq:
-	if (qp->orrq.max_elements)
-		bnxt_qplib_free_hwq(res, &qp->orrq);
-fail_buf_free:
-	bnxt_qplib_free_qp_hdr_buf(res, qp);
+	bnxt_qplib_free_hwq(res, &qp->orrq);
+rq_swq:
+	kfree(rq->swq);
 fail_rq:
 	bnxt_qplib_free_hwq(res, &rq->hwq);
-	kfree(rq->swq);
+sq_swq:
+	kfree(sq->swq);
 fail_sq:
 	bnxt_qplib_free_hwq(res, &sq->hwq);
-	kfree(sq->swq);
 exit:
 	return rc;
 }
@@ -1512,7 +1502,7 @@
 	memset(sge, 0, sizeof(*sge));
 
 	if (qp->sq_hdr_buf) {
-		sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+		sw_prod = sq->swq_start;
 		sge->addr = (dma_addr_t)(qp->sq_hdr_buf_map +
 					 sw_prod * qp->sq_hdr_buf_size);
 		sge->lkey = 0xFFFFFFFF;
@@ -1526,7 +1516,7 @@
 {
 	struct bnxt_qplib_q *rq = &qp->rq;
 
-	return HWQ_CMP(rq->hwq.prod, &rq->hwq);
+	return rq->swq_start;
 }
 
 dma_addr_t bnxt_qplib_get_qp_buf_from_index(struct bnxt_qplib_qp *qp, u32 index)
@@ -1543,7 +1533,7 @@
 	memset(sge, 0, sizeof(*sge));
 
 	if (qp->rq_hdr_buf) {
-		sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+		sw_prod = rq->swq_start;
 		sge->addr = (dma_addr_t)(qp->rq_hdr_buf_map +
 					 sw_prod * qp->rq_hdr_buf_size);
 		sge->lkey = 0xFFFFFFFF;
@@ -1562,6 +1552,8 @@
 	u32 flg_npsn;
 	u32 op_spsn;
 
+	if (!swq->psn_search)
+		return;
 	psns = swq->psn_search;
 	psns_ext = swq->psn_ext;
 
@@ -1575,12 +1567,122 @@
 	if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) {
 		psns_ext->opcode_start_psn = cpu_to_le32(op_spsn);
 		psns_ext->flags_next_psn = cpu_to_le32(flg_npsn);
+		psns_ext->start_slot_idx = cpu_to_le16(swq->slot_idx);
 	} else {
 		psns->opcode_start_psn = cpu_to_le32(op_spsn);
 		psns->flags_next_psn = cpu_to_le32(flg_npsn);
 	}
 }
 
+static int bnxt_qplib_put_inline(struct bnxt_qplib_qp *qp,
+				 struct bnxt_qplib_swqe *wqe,
+				 u16 *idx)
+{
+	struct bnxt_qplib_hwq *hwq;
+	int len, t_len, offt;
+	bool pull_dst = true;
+	void *il_dst = NULL;
+	void *il_src = NULL;
+	int t_cplen, cplen;
+	int indx;
+
+	hwq = &qp->sq.hwq;
+	t_len = 0;
+	for (indx = 0; indx < wqe->num_sge; indx++) {
+		len = wqe->sg_list[indx].size;
+		il_src = (void *)wqe->sg_list[indx].addr;
+		t_len += len;
+		if (t_len > qp->max_inline_data)
+			goto bad;
+		while (len) {
+			if (pull_dst) {
+				pull_dst = false;
+				il_dst = bnxt_qplib_get_prod_qe(hwq, *idx);
+				(*idx)++;
+				t_cplen = 0;
+				offt = 0;
+			}
+			cplen = min_t(int, len, sizeof(struct sq_sge));
+			cplen = min_t(int, cplen,
+					(sizeof(struct sq_sge) - offt));
+			memcpy(il_dst, il_src, cplen);
+			t_cplen += cplen;
+			il_src += cplen;
+			il_dst += cplen;
+			offt += cplen;
+			len -= cplen;
+			if (t_cplen == sizeof(struct sq_sge))
+				pull_dst = true;
+		}
+	}
+
+	return t_len;
+bad:
+	return -ENOMEM;
+}
+
+static u32 bnxt_qplib_put_sges(struct bnxt_qplib_hwq *hwq,
+			       struct bnxt_qplib_sge *ssge,
+			       u16 nsge, u16 *idx)
+{
+	struct sq_sge *dsge;
+	int indx, len = 0;
+
+	for (indx = 0; indx < nsge; indx++, (*idx)++) {
+		dsge = bnxt_qplib_get_prod_qe(hwq, *idx);
+		dsge->va_or_pa = cpu_to_le64(ssge[indx].addr);
+		dsge->l_key = cpu_to_le32(ssge[indx].lkey);
+		dsge->size = cpu_to_le32(ssge[indx].size);
+		len += ssge[indx].size;
+	}
+
+	return len;
+}
+
+static u16 bnxt_qplib_required_slots(struct bnxt_qplib_qp *qp,
+				     struct bnxt_qplib_swqe *wqe,
+				     u16 *wqe_sz, u16 *qdf, u8 mode)
+{
+	u32 ilsize, bytes;
+	u16 nsge;
+	u16 slot;
+
+	nsge = wqe->num_sge;
+	/* Adding sq_send_hdr is a misnomer; the rq header has the same size. */
+	bytes = sizeof(struct sq_send_hdr) + nsge * sizeof(struct sq_sge);
+	if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE) {
+		ilsize = bnxt_qplib_calc_ilsize(wqe, qp->max_inline_data);
+		bytes = ALIGN(ilsize, sizeof(struct sq_sge));
+		bytes += sizeof(struct sq_send_hdr);
+	}
+
+	*qdf =  __xlate_qfd(qp->sq.q_full_delta, bytes);
+	slot = bytes >> 4;
+	*wqe_sz = slot;
+	if (mode == BNXT_QPLIB_WQE_MODE_STATIC)
+		slot = 8;
+	return slot;
+}
+
+static void bnxt_qplib_pull_psn_buff(struct bnxt_qplib_q *sq,
+				     struct bnxt_qplib_swq *swq)
+{
+	struct bnxt_qplib_hwq *hwq;
+	u32 pg_num, pg_indx;
+	void *buff;
+	u32 tail;
+
+	hwq = &sq->hwq;
+	if (!hwq->pad_pg)
+		return;
+	tail = swq->slot_idx / sq->dbinfo.max_slot;
+	pg_num = (tail + hwq->pad_pgofft) / (PAGE_SIZE / hwq->pad_stride);
+	pg_indx = (tail + hwq->pad_pgofft) % (PAGE_SIZE / hwq->pad_stride);
+	buff = (void *)(hwq->pad_pg[pg_num] + pg_indx * hwq->pad_stride);
+	swq->psn_ext = buff;
+	swq->psn_search = buff;
+}
+
 void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_q *sq = &qp->sq;
@@ -1594,88 +1696,84 @@
 	struct bnxt_qplib_nq_work *nq_work = NULL;
 	int i, rc = 0, data_len = 0, pkt_num = 0;
 	struct bnxt_qplib_q *sq = &qp->sq;
-	struct sq_send *hw_sq_send_hdr;
+	struct bnxt_qplib_hwq *hwq;
 	struct bnxt_qplib_swq *swq;
 	bool sch_handler = false;
-	struct sq_sge *hw_sge;
-	u8 wqe_size16;
+	u16 wqe_sz, qdf = 0;
+	void *base_hdr;
+	void *ext_hdr;
 	__le32 temp32;
-	u32 sw_prod;
+	u32 wqe_idx;
+	u32 slots;
+	u16 idx;
 
-	if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) {
-		if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
-			sch_handler = true;
-			dev_dbg(&sq->hwq.pdev->dev,
-				"%s Error QP. Scheduling for poll_cq\n",
-				__func__);
-			goto queue_err;
-		}
+	hwq = &sq->hwq;
+	if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS &&
+	    qp->state != CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+		dev_err(&hwq->pdev->dev,
+			"QPLIB: FP: QP (0x%x) is in the 0x%x state",
+			qp->id, qp->state);
+		rc = -EINVAL;
+		goto done;
 	}
 
-	if (bnxt_qplib_queue_full(sq)) {
-		dev_err(&sq->hwq.pdev->dev,
+	slots = bnxt_qplib_required_slots(qp, wqe, &wqe_sz, &qdf, qp->wqe_mode);
+	if (bnxt_qplib_queue_full(sq, slots + qdf)) {
+		dev_err(&hwq->pdev->dev,
 			"prod = %#x cons = %#x qdepth = %#x delta = %#x\n",
-			sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements,
-			sq->q_full_delta);
+			hwq->prod, hwq->cons, hwq->depth, sq->q_full_delta);
 		rc = -ENOMEM;
 		goto done;
 	}
-	sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-	swq = &sq->swq[sw_prod];
+
+	swq = bnxt_qplib_get_swqe(sq, &wqe_idx);
+	bnxt_qplib_pull_psn_buff(sq, swq);
+
+	idx = 0;
+	swq->slot_idx = hwq->prod;
+	swq->slots = slots;
 	swq->wr_id = wqe->wr_id;
 	swq->type = wqe->type;
 	swq->flags = wqe->flags;
+	swq->start_psn = sq->psn & BTH_PSN_MASK;
 	if (qp->sig_type)
 		swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP;
-	swq->start_psn = sq->psn & BTH_PSN_MASK;
 
-	hw_sq_send_hdr = bnxt_qplib_get_qe(&sq->hwq, sw_prod, NULL);
-	memset(hw_sq_send_hdr, 0, sq->wqe_size);
-
-	if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE) {
-		/* Copy the inline data */
-		if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) {
-			dev_warn(&sq->hwq.pdev->dev,
-				 "Inline data length > 96 detected\n");
-			data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH;
-		} else {
-			data_len = wqe->inline_len;
-		}
-		memcpy(hw_sq_send_hdr->data, wqe->inline_data, data_len);
-		wqe_size16 = (data_len + 15) >> 4;
-	} else {
-		for (i = 0, hw_sge = (struct sq_sge *)hw_sq_send_hdr->data;
-		     i < wqe->num_sge; i++, hw_sge++) {
-			hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr);
-			hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey);
-			hw_sge->size = cpu_to_le32(wqe->sg_list[i].size);
-			data_len += wqe->sg_list[i].size;
-		}
-		/* Each SGE entry = 1 WQE size16 */
-		wqe_size16 = wqe->num_sge;
-		/* HW requires wqe size has room for atleast one SGE even if
-		 * none was supplied by ULP
-		 */
-		if (!wqe->num_sge)
-			wqe_size16++;
+	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+		sch_handler = true;
+		dev_dbg(&hwq->pdev->dev,
+			"%s Error QP. Scheduling for poll_cq\n", __func__);
+		goto queue_err;
 	}
 
+	base_hdr = bnxt_qplib_get_prod_qe(hwq, idx++);
+	ext_hdr = bnxt_qplib_get_prod_qe(hwq, idx++);
+	memset(base_hdr, 0, sizeof(struct sq_sge));
+	memset(ext_hdr, 0, sizeof(struct sq_sge));
+
+	if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE)
+		/* Copy the inline data */
+		data_len = bnxt_qplib_put_inline(qp, wqe, &idx);
+	else
+		data_len = bnxt_qplib_put_sges(hwq, wqe->sg_list, wqe->num_sge,
+					       &idx);
+	if (data_len < 0)
+		goto queue_err;
 	/* Specifics */
 	switch (wqe->type) {
 	case BNXT_QPLIB_SWQE_TYPE_SEND:
 		if (qp->type == CMDQ_CREATE_QP1_TYPE_GSI) {
+			struct sq_send_raweth_qp1_hdr *sqe = base_hdr;
+			struct sq_raw_ext_hdr *ext_sqe = ext_hdr;
 			/* Assemble info for Raw Ethertype QPs */
-			struct sq_send_raweth_qp1 *sqe =
-				(struct sq_send_raweth_qp1 *)hw_sq_send_hdr;
 
 			sqe->wqe_type = wqe->type;
 			sqe->flags = wqe->flags;
-			sqe->wqe_size = wqe_size16 +
-				((offsetof(typeof(*sqe), data) + 15) >> 4);
+			sqe->wqe_size = wqe_sz;
 			sqe->cfa_action = cpu_to_le16(wqe->rawqp1.cfa_action);
 			sqe->lflags = cpu_to_le16(wqe->rawqp1.lflags);
 			sqe->length = cpu_to_le32(data_len);
-			sqe->cfa_meta = cpu_to_le32((wqe->rawqp1.cfa_meta &
+			ext_sqe->cfa_meta = cpu_to_le32((wqe->rawqp1.cfa_meta &
 				SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_MASK) <<
 				SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_SFT);
 
@@ -1685,27 +1783,24 @@
 	case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM:
 	case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV:
 	{
-		struct sq_send *sqe = (struct sq_send *)hw_sq_send_hdr;
+		struct sq_ud_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_send_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
-		sqe->wqe_size = wqe_size16 +
-				((offsetof(typeof(*sqe), data) + 15) >> 4);
-		sqe->inv_key_or_imm_data = cpu_to_le32(
-						wqe->send.inv_key);
+		sqe->wqe_size = wqe_sz;
+		sqe->inv_key_or_imm_data = cpu_to_le32(wqe->send.inv_key);
 		if (qp->type == CMDQ_CREATE_QP_TYPE_UD ||
 		    qp->type == CMDQ_CREATE_QP_TYPE_GSI) {
 			sqe->q_key = cpu_to_le32(wqe->send.q_key);
-			sqe->dst_qp = cpu_to_le32(
-					wqe->send.dst_qp & SQ_SEND_DST_QP_MASK);
 			sqe->length = cpu_to_le32(data_len);
-			sqe->avid = cpu_to_le32(wqe->send.avid &
-						SQ_SEND_AVID_MASK);
 			sq->psn = (sq->psn + 1) & BTH_PSN_MASK;
+			ext_sqe->dst_qp = cpu_to_le32(wqe->send.dst_qp &
+						      SQ_SEND_DST_QP_MASK);
+			ext_sqe->avid = cpu_to_le32(wqe->send.avid &
+						    SQ_SEND_AVID_MASK);
 		} else {
 			sqe->length = cpu_to_le32(data_len);
-			sqe->dst_qp = 0;
-			sqe->avid = 0;
 			if (qp->mtu)
 				pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
 			if (!pkt_num)
@@ -1718,16 +1813,16 @@
 	case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM:
 	case BNXT_QPLIB_SWQE_TYPE_RDMA_READ:
 	{
-		struct sq_rdma *sqe = (struct sq_rdma *)hw_sq_send_hdr;
+		struct sq_rdma_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_rdma_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
-		sqe->wqe_size = wqe_size16 +
-				((offsetof(typeof(*sqe), data) + 15) >> 4);
+		sqe->wqe_size = wqe_sz;
 		sqe->imm_data = cpu_to_le32(wqe->rdma.inv_key);
 		sqe->length = cpu_to_le32((u32)data_len);
-		sqe->remote_va = cpu_to_le64(wqe->rdma.remote_va);
-		sqe->remote_key = cpu_to_le32(wqe->rdma.r_key);
+		ext_sqe->remote_va = cpu_to_le64(wqe->rdma.remote_va);
+		ext_sqe->remote_key = cpu_to_le32(wqe->rdma.r_key);
 		if (qp->mtu)
 			pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
 		if (!pkt_num)
@@ -1738,14 +1833,15 @@
 	case BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP:
 	case BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD:
 	{
-		struct sq_atomic *sqe = (struct sq_atomic *)hw_sq_send_hdr;
+		struct sq_atomic_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_atomic_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
 		sqe->remote_key = cpu_to_le32(wqe->atomic.r_key);
 		sqe->remote_va = cpu_to_le64(wqe->atomic.remote_va);
-		sqe->swap_data = cpu_to_le64(wqe->atomic.swap_data);
-		sqe->cmp_data = cpu_to_le64(wqe->atomic.cmp_data);
+		ext_sqe->swap_data = cpu_to_le64(wqe->atomic.swap_data);
+		ext_sqe->cmp_data = cpu_to_le64(wqe->atomic.cmp_data);
 		if (qp->mtu)
 			pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
 		if (!pkt_num)
@@ -1755,8 +1851,7 @@
 	}
 	case BNXT_QPLIB_SWQE_TYPE_LOCAL_INV:
 	{
-		struct sq_localinvalidate *sqe =
-				(struct sq_localinvalidate *)hw_sq_send_hdr;
+		struct sq_localinvalidate *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
@@ -1766,7 +1861,8 @@
 	}
 	case BNXT_QPLIB_SWQE_TYPE_FAST_REG_MR:
 	{
-		struct sq_fr_pmr *sqe = (struct sq_fr_pmr *)hw_sq_send_hdr;
+		struct sq_fr_pmr_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_fr_pmr_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
@@ -1790,14 +1886,15 @@
 			wqe->frmr.pbl_ptr[i] = cpu_to_le64(
 						wqe->frmr.page_list[i] |
 						PTU_PTE_VALID);
-		sqe->pblptr = cpu_to_le64(wqe->frmr.pbl_dma_ptr);
-		sqe->va = cpu_to_le64(wqe->frmr.va);
+		ext_sqe->pblptr = cpu_to_le64(wqe->frmr.pbl_dma_ptr);
+		ext_sqe->va = cpu_to_le64(wqe->frmr.va);
 
 		break;
 	}
 	case BNXT_QPLIB_SWQE_TYPE_BIND_MW:
 	{
-		struct sq_bind *sqe = (struct sq_bind *)hw_sq_send_hdr;
+		struct sq_bind_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_bind_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
@@ -1806,9 +1903,8 @@
 			(wqe->bind.zero_based ? SQ_BIND_ZERO_BASED : 0);
 		sqe->parent_l_key = cpu_to_le32(wqe->bind.parent_l_key);
 		sqe->l_key = cpu_to_le32(wqe->bind.r_key);
-		sqe->va = cpu_to_le64(wqe->bind.va);
-		temp32 = cpu_to_le32(wqe->bind.length);
-		memcpy(&sqe->length, &temp32, sizeof(wqe->bind.length));
+		ext_sqe->va = cpu_to_le64(wqe->bind.va);
+		ext_sqe->length_lo = cpu_to_le32(wqe->bind.length);
 		break;
 	}
 	default:
@@ -1817,23 +1913,11 @@
 		goto done;
 	}
 	swq->next_psn = sq->psn & BTH_PSN_MASK;
-	if (qp->type == CMDQ_CREATE_QP_TYPE_RC)
-		bnxt_qplib_fill_psn_search(qp, wqe, swq);
+	bnxt_qplib_fill_psn_search(qp, wqe, swq);
 queue_err:
-	if (sch_handler) {
-		/* Store the ULP info in the software structures */
-		sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-		swq = &sq->swq[sw_prod];
-		swq->wr_id = wqe->wr_id;
-		swq->type = wqe->type;
-		swq->flags = wqe->flags;
-		if (qp->sig_type)
-			swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP;
-		swq->start_psn = sq->psn & BTH_PSN_MASK;
-	}
-	sq->hwq.prod++;
+	bnxt_qplib_swq_mod_start(sq, wqe_idx);
+	bnxt_qplib_hwq_incr_prod(hwq, swq->slots);
 	qp->wqe_cnt++;
-
 done:
 	if (sch_handler) {
 		nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC);
@@ -1843,7 +1927,7 @@
 			INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task);
 			queue_work(qp->scq->nq->cqn_wq, &nq_work->work);
 		} else {
-			dev_err(&sq->hwq.pdev->dev,
+			dev_err(&hwq->pdev->dev,
 				"FP: Failed to allocate SQ nq_work!\n");
 			rc = -ENOMEM;
 		}
@@ -1863,58 +1947,65 @@
 {
 	struct bnxt_qplib_nq_work *nq_work = NULL;
 	struct bnxt_qplib_q *rq = &qp->rq;
+	struct rq_wqe_hdr *base_hdr;
+	struct rq_ext_hdr *ext_hdr;
+	struct bnxt_qplib_hwq *hwq;
+	struct bnxt_qplib_swq *swq;
 	bool sch_handler = false;
-	struct sq_sge *hw_sge;
-	struct rq_wqe *rqe;
-	int i, rc = 0;
-	u32 sw_prod;
+	u16 wqe_sz, idx;
+	u32 wqe_idx;
+	int rc = 0;
 
-	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
-		sch_handler = true;
-		dev_dbg(&rq->hwq.pdev->dev,
-			"%s: Error QP. Scheduling for poll_cq\n", __func__);
-		goto queue_err;
+	hwq = &rq->hwq;
+	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_RESET) {
+		dev_err(&hwq->pdev->dev,
+			"QPLIB: FP: QP (0x%x) is in the 0x%x state",
+			qp->id, qp->state);
+		rc = -EINVAL;
+		goto done;
 	}
-	if (bnxt_qplib_queue_full(rq)) {
-		dev_err(&rq->hwq.pdev->dev,
+
+	if (bnxt_qplib_queue_full(rq, rq->dbinfo.max_slot)) {
+		dev_err(&hwq->pdev->dev,
 			"FP: QP (0x%x) RQ is full!\n", qp->id);
 		rc = -EINVAL;
 		goto done;
 	}
-	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
-	rq->swq[sw_prod].wr_id = wqe->wr_id;
 
-	rqe = bnxt_qplib_get_qe(&rq->hwq, sw_prod, NULL);
-	memset(rqe, 0, rq->wqe_size);
+	swq = bnxt_qplib_get_swqe(rq, &wqe_idx);
+	swq->wr_id = wqe->wr_id;
+	swq->slots = rq->dbinfo.max_slot;
 
-	/* Calculate wqe_size16 and data_len */
-	for (i = 0, hw_sge = (struct sq_sge *)rqe->data;
-	     i < wqe->num_sge; i++, hw_sge++) {
-		hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr);
-		hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey);
-		hw_sge->size = cpu_to_le32(wqe->sg_list[i].size);
+	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+		sch_handler = true;
+		dev_dbg(&hwq->pdev->dev,
+			"%s: Error QP. Scheduling for poll_cq\n", __func__);
+		goto queue_err;
 	}
-	rqe->wqe_type = wqe->type;
-	rqe->flags = wqe->flags;
-	rqe->wqe_size = wqe->num_sge +
-			((offsetof(typeof(*rqe), data) + 15) >> 4);
-	/* HW requires wqe size has room for atleast one SGE even if none
-	 * was supplied by ULP
-	 */
-	if (!wqe->num_sge)
-		rqe->wqe_size++;
 
-	/* Supply the rqe->wr_id index to the wr_id_tbl for now */
-	rqe->wr_id[0] = cpu_to_le32(sw_prod);
+	idx = 0;
+	base_hdr = bnxt_qplib_get_prod_qe(hwq, idx++);
+	ext_hdr = bnxt_qplib_get_prod_qe(hwq, idx++);
+	memset(base_hdr, 0, sizeof(struct sq_sge));
+	memset(ext_hdr, 0, sizeof(struct sq_sge));
+	wqe_sz = (sizeof(struct rq_wqe_hdr) +
+		  wqe->num_sge * sizeof(struct sq_sge)) >> 4;
+	bnxt_qplib_put_sges(hwq, wqe->sg_list, wqe->num_sge, &idx);
+	if (!wqe->num_sge) {
+		struct sq_sge *sge;
 
+		sge = bnxt_qplib_get_prod_qe(hwq, idx++);
+		sge->size = 0;
+		wqe_sz++;
+	}
+	base_hdr->wqe_type = wqe->type;
+	base_hdr->flags = wqe->flags;
+	base_hdr->wqe_size = wqe_sz;
+	base_hdr->wr_id[0] = cpu_to_le32(wqe_idx);
 queue_err:
-	if (sch_handler) {
-		/* Store the ULP info in the software structures */
-		sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
-		rq->swq[sw_prod].wr_id = wqe->wr_id;
-	}
-
-	rq->hwq.prod++;
+	bnxt_qplib_swq_mod_start(rq, wqe_idx);
+	bnxt_qplib_hwq_incr_prod(hwq, swq->slots);
+done:
 	if (sch_handler) {
 		nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC);
 		if (nq_work) {
@@ -1923,12 +2014,12 @@
 			INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task);
 			queue_work(qp->rcq->nq->cqn_wq, &nq_work->work);
 		} else {
-			dev_err(&rq->hwq.pdev->dev,
+			dev_err(&hwq->pdev->dev,
 				"FP: Failed to allocate RQ nq_work!\n");
 			rc = -ENOMEM;
 		}
 	}
-done:
+
 	return rc;
 }
 
@@ -2026,20 +2117,19 @@
 static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp,
 		      struct bnxt_qplib_cqe **pcqe, int *budget)
 {
-	u32 sw_prod, sw_cons;
 	struct bnxt_qplib_cqe *cqe;
+	u32 start, last;
 	int rc = 0;
 
 	/* Now complete all outstanding SQEs with FLUSHED_ERR */
-	sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+	start = sq->swq_start;
 	cqe = *pcqe;
 	while (*budget) {
-		sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
-		if (sw_cons == sw_prod) {
+		last = sq->swq_last;
+		if (start == last)
 			break;
-		}
 		/* Skip the FENCE WQE completions */
-		if (sq->swq[sw_cons].wr_id == BNXT_QPLIB_FENCE_WRID) {
+		if (sq->swq[last].wr_id == BNXT_QPLIB_FENCE_WRID) {
 			bnxt_qplib_cancel_phantom_processing(qp);
 			goto skip_compl;
 		}
@@ -2047,16 +2137,17 @@
 		cqe->status = CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR;
 		cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
 		cqe->qp_handle = (u64)(unsigned long)qp;
-		cqe->wr_id = sq->swq[sw_cons].wr_id;
+		cqe->wr_id = sq->swq[last].wr_id;
 		cqe->src_qp = qp->id;
-		cqe->type = sq->swq[sw_cons].type;
+		cqe->type = sq->swq[last].type;
 		cqe++;
 		(*budget)--;
 skip_compl:
-		sq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&sq->hwq, sq->swq[last].slots);
+		sq->swq_last = sq->swq[last].next_idx;
 	}
 	*pcqe = cqe;
-	if (!(*budget) && HWQ_CMP(sq->hwq.cons, &sq->hwq) != sw_prod)
+	if (!(*budget) && sq->swq_last != start)
 		/* Out of budget */
 		rc = -EAGAIN;
 
@@ -2067,9 +2158,9 @@
 		      struct bnxt_qplib_cqe **pcqe, int *budget)
 {
 	struct bnxt_qplib_cqe *cqe;
-	u32 sw_prod, sw_cons;
-	int rc = 0;
+	u32 start, last;
 	int opcode = 0;
+	int rc = 0;
 
 	switch (qp->type) {
 	case CMDQ_CREATE_QP1_TYPE_GSI:
@@ -2085,24 +2176,25 @@
 	}
 
 	/* Flush the rest of the RQ */
-	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+	start = rq->swq_start;
 	cqe = *pcqe;
 	while (*budget) {
-		sw_cons = HWQ_CMP(rq->hwq.cons, &rq->hwq);
-		if (sw_cons == sw_prod)
+		last = rq->swq_last;
+		if (last == start)
 			break;
 		memset(cqe, 0, sizeof(*cqe));
 		cqe->status =
 		    CQ_RES_RC_STATUS_WORK_REQUEST_FLUSHED_ERR;
 		cqe->opcode = opcode;
 		cqe->qp_handle = (unsigned long)qp;
-		cqe->wr_id = rq->swq[sw_cons].wr_id;
+		cqe->wr_id = rq->swq[last].wr_id;
 		cqe++;
 		(*budget)--;
-		rq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&rq->hwq, rq->swq[last].slots);
+		rq->swq_last = rq->swq[last].next_idx;
 	}
 	*pcqe = cqe;
-	if (!*budget && HWQ_CMP(rq->hwq.cons, &rq->hwq) != sw_prod)
+	if (!*budget && rq->swq_last != start)
 		/* Out of budget */
 		rc = -EAGAIN;
 
@@ -2125,7 +2217,7 @@
  *       CQE is track from sw_cq_cons to max_element but valid only if VALID=1
  */
 static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
-		     u32 cq_cons, u32 sw_sq_cons, u32 cqe_sq_cons)
+		     u32 cq_cons, u32 swq_last, u32 cqe_sq_cons)
 {
 	u32 peek_sw_cq_cons, peek_raw_cq_cons, peek_sq_cons_idx;
 	struct bnxt_qplib_q *sq = &qp->sq;
@@ -2138,7 +2230,7 @@
 
 	/* Normal mode */
 	/* Check for the psn_search marking before completing */
-	swq = &sq->swq[sw_sq_cons];
+	swq = &sq->swq[swq_last];
 	if (swq->psn_search &&
 	    le32_to_cpu(swq->psn_search->flags_next_psn) & 0x80000000) {
 		/* Unmark */
@@ -2147,7 +2239,7 @@
 				     & ~0x80000000);
 		dev_dbg(&cq->hwq.pdev->dev,
 			"FP: Process Req cq_cons=0x%x qp=0x%x sq cons sw=0x%x cqe=0x%x marked!\n",
-			cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
+			cq_cons, qp->id, swq_last, cqe_sq_cons);
 		sq->condition = true;
 		sq->send_phantom = true;
 
@@ -2184,9 +2276,10 @@
 						 le64_to_cpu
 						 (peek_req_hwcqe->qp_handle));
 					peek_sq = &peek_qp->sq;
-					peek_sq_cons_idx = HWQ_CMP(le16_to_cpu(
-						peek_req_hwcqe->sq_cons_idx) - 1
-						, &sq->hwq);
+					peek_sq_cons_idx =
+						((le16_to_cpu(
+						  peek_req_hwcqe->sq_cons_idx)
+						  - 1) % sq->max_wqe);
 					/* If the hwcqe's sq's wr_id matches */
 					if (peek_sq == sq &&
 					    sq->swq[peek_sq_cons_idx].wr_id ==
@@ -2214,7 +2307,7 @@
 		}
 		dev_err(&cq->hwq.pdev->dev,
 			"Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x\n",
-			cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
+			cq_cons, qp->id, swq_last, cqe_sq_cons);
 		rc = -EINVAL;
 	}
 out:
@@ -2226,11 +2319,11 @@
 				     struct bnxt_qplib_cqe **pcqe, int *budget,
 				     u32 cq_cons, struct bnxt_qplib_qp **lib_qp)
 {
-	u32 sw_sq_cons, cqe_sq_cons;
 	struct bnxt_qplib_swq *swq;
 	struct bnxt_qplib_cqe *cqe;
 	struct bnxt_qplib_qp *qp;
 	struct bnxt_qplib_q *sq;
+	u32 cqe_sq_cons;
 	int rc = 0;
 
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
@@ -2242,14 +2335,7 @@
 	}
 	sq = &qp->sq;
 
-	cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq);
-	if (cqe_sq_cons > sq->hwq.max_elements) {
-		dev_err(&cq->hwq.pdev->dev,
-			"FP: CQ Process req reported sq_cons_idx 0x%x which exceeded max 0x%x\n",
-			cqe_sq_cons, sq->hwq.max_elements);
-		return -EINVAL;
-	}
-
+	cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_wqe;
 	if (qp->sq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
 			"%s: QP in Flush QP = %p\n", __func__, qp);
@@ -2261,12 +2347,11 @@
 	 */
 	cqe = *pcqe;
 	while (*budget) {
-		sw_sq_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
-		if (sw_sq_cons == cqe_sq_cons)
+		if (sq->swq_last == cqe_sq_cons)
 			/* Done */
 			break;
 
-		swq = &sq->swq[sw_sq_cons];
+		swq = &sq->swq[sq->swq_last];
 		memset(cqe, 0, sizeof(*cqe));
 		cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
 		cqe->qp_handle = (u64)(unsigned long)qp;
@@ -2280,12 +2365,12 @@
 		 * of the request being signaled or not, it must complete with
 		 * the hwcqe error status
 		 */
-		if (HWQ_CMP((sw_sq_cons + 1), &sq->hwq) == cqe_sq_cons &&
+		if (swq->next_idx == cqe_sq_cons &&
 		    hwcqe->status != CQ_REQ_STATUS_OK) {
 			cqe->status = hwcqe->status;
 			dev_err(&cq->hwq.pdev->dev,
 				"FP: CQ Processed Req wr_id[%d] = 0x%llx with status 0x%x\n",
-				sw_sq_cons, cqe->wr_id, cqe->status);
+				sq->swq_last, cqe->wr_id, cqe->status);
 			cqe++;
 			(*budget)--;
 			bnxt_qplib_mark_qp_error(qp);
@@ -2293,7 +2378,7 @@
 			bnxt_qplib_add_flush_qp(qp);
 		} else {
 			/* Before we complete, do WA 9060 */
-			if (do_wa9060(qp, cq, cq_cons, sw_sq_cons,
+			if (do_wa9060(qp, cq, cq_cons, sq->swq_last,
 				      cqe_sq_cons)) {
 				*lib_qp = qp;
 				goto out;
@@ -2305,13 +2390,14 @@
 			}
 		}
 skip:
-		sq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&sq->hwq, swq->slots);
+		sq->swq_last = swq->next_idx;
 		if (sq->single)
 			break;
 	}
 out:
 	*pcqe = cqe;
-	if (HWQ_CMP(sq->hwq.cons, &sq->hwq) != cqe_sq_cons) {
+	if (sq->swq_last != cqe_sq_cons) {
 		/* Out of budget */
 		rc = -EAGAIN;
 		goto done;
@@ -2386,17 +2472,23 @@
 		(*budget)--;
 		*pcqe = cqe;
 	} else {
+		struct bnxt_qplib_swq *swq;
+
 		rq = &qp->rq;
-		if (wr_id_idx >= rq->hwq.max_elements) {
+		if (wr_id_idx > (rq->max_wqe - 1)) {
 			dev_err(&cq->hwq.pdev->dev,
 				"FP: CQ Process RC wr_id idx 0x%x exceeded RQ max 0x%x\n",
-				wr_id_idx, rq->hwq.max_elements);
+				wr_id_idx, rq->max_wqe);
 			return -EINVAL;
 		}
-		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		if (wr_id_idx != rq->swq_last)
+			return -EINVAL;
+		swq = &rq->swq[rq->swq_last];
+		cqe->wr_id = swq->wr_id;
 		cqe++;
 		(*budget)--;
-		rq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&rq->hwq, swq->slots);
+		rq->swq_last = swq->next_idx;
 		*pcqe = cqe;
 
 		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
@@ -2467,18 +2559,24 @@
 		(*budget)--;
 		*pcqe = cqe;
 	} else {
+		struct bnxt_qplib_swq *swq;
+
 		rq = &qp->rq;
-		if (wr_id_idx >= rq->hwq.max_elements) {
+		if (wr_id_idx > (rq->max_wqe - 1)) {
 			dev_err(&cq->hwq.pdev->dev,
 				"FP: CQ Process UD wr_id idx 0x%x exceeded RQ max 0x%x\n",
-				wr_id_idx, rq->hwq.max_elements);
+				wr_id_idx, rq->max_wqe);
 			return -EINVAL;
 		}
 
-		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		if (rq->swq_last != wr_id_idx)
+			return -EINVAL;
+		swq = &rq->swq[rq->swq_last];
+		cqe->wr_id = swq->wr_id;
 		cqe++;
 		(*budget)--;
-		rq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&rq->hwq, swq->slots);
+		rq->swq_last = swq->next_idx;
 		*pcqe = cqe;
 
 		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
@@ -2569,17 +2667,23 @@
 		(*budget)--;
 		*pcqe = cqe;
 	} else {
+		struct bnxt_qplib_swq *swq;
+
 		rq = &qp->rq;
-		if (wr_id_idx >= rq->hwq.max_elements) {
+		if (wr_id_idx > (rq->max_wqe - 1)) {
 			dev_err(&cq->hwq.pdev->dev,
 				"FP: CQ Process Raw/QP1 RQ wr_id idx 0x%x exceeded RQ max 0x%x\n",
-				wr_id_idx, rq->hwq.max_elements);
+				wr_id_idx, rq->max_wqe);
 			return -EINVAL;
 		}
-		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		if (rq->swq_last != wr_id_idx)
+			return -EINVAL;
+		swq = &rq->swq[rq->swq_last];
+		cqe->wr_id = swq->wr_id;
 		cqe++;
 		(*budget)--;
-		rq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&rq->hwq, swq->slots);
+		rq->swq_last = swq->next_idx;
 		*pcqe = cqe;
 
 		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
@@ -2601,7 +2705,7 @@
 	struct bnxt_qplib_qp *qp;
 	struct bnxt_qplib_q *sq, *rq;
 	struct bnxt_qplib_cqe *cqe;
-	u32 sw_cons = 0, cqe_cons;
+	u32 swq_last = 0, cqe_cons;
 	int rc = 0;
 
 	/* Check the Status */
@@ -2627,13 +2731,7 @@
 	cqe_cons = le16_to_cpu(hwcqe->sq_cons_idx);
 	if (cqe_cons == 0xFFFF)
 		goto do_rq;
-
-	if (cqe_cons > sq->hwq.max_elements) {
-		dev_err(&cq->hwq.pdev->dev,
-			"FP: CQ Process terminal reported sq_cons_idx 0x%x which exceeded max 0x%x\n",
-			cqe_cons, sq->hwq.max_elements);
-		goto do_rq;
-	}
+	cqe_cons %= sq->max_wqe;
 
 	if (qp->sq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
@@ -2647,24 +2745,25 @@
 	 */
 	cqe = *pcqe;
 	while (*budget) {
-		sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
-		if (sw_cons == cqe_cons)
+		swq_last = sq->swq_last;
+		if (swq_last == cqe_cons)
 			break;
-		if (sq->swq[sw_cons].flags & SQ_SEND_FLAGS_SIGNAL_COMP) {
+		if (sq->swq[swq_last].flags & SQ_SEND_FLAGS_SIGNAL_COMP) {
 			memset(cqe, 0, sizeof(*cqe));
 			cqe->status = CQ_REQ_STATUS_OK;
 			cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
 			cqe->qp_handle = (u64)(unsigned long)qp;
 			cqe->src_qp = qp->id;
-			cqe->wr_id = sq->swq[sw_cons].wr_id;
-			cqe->type = sq->swq[sw_cons].type;
+			cqe->wr_id = sq->swq[swq_last].wr_id;
+			cqe->type = sq->swq[swq_last].type;
 			cqe++;
 			(*budget)--;
 		}
-		sq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&sq->hwq, sq->swq[swq_last].slots);
+		sq->swq_last = sq->swq[swq_last].next_idx;
 	}
 	*pcqe = cqe;
-	if (!(*budget) && sw_cons != cqe_cons) {
+	if (!(*budget) && swq_last != cqe_cons) {
 		/* Out of budget */
 		rc = -EAGAIN;
 		goto sq_done;
@@ -2676,10 +2775,10 @@
 	cqe_cons = le16_to_cpu(hwcqe->rq_cons_idx);
 	if (cqe_cons == 0xFFFF) {
 		goto done;
-	} else if (cqe_cons > rq->hwq.max_elements) {
+	} else if (cqe_cons > rq->max_wqe - 1) {
 		dev_err(&cq->hwq.pdev->dev,
 			"FP: CQ Processed terminal reported rq_cons_idx 0x%x exceeds max 0x%x\n",
-			cqe_cons, rq->hwq.max_elements);
+			cqe_cons, rq->max_wqe);
 		goto done;
 	}
 
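Illustrative sketch (not part of the patch): the reworked bnxt_re post-send and
post-recv paths above size every work request in 16-byte slots rather than in
fixed-size WQEs. A standalone C model of that slot arithmetic, assuming the
base and extended headers occupy one 16-byte slot each and inline payloads are
rounded up to whole slots:

#include <stdio.h>

#define SLOT_SZ	16U	/* matches sizeof(struct sq_sge) in the driver */

/* Non-inline request: two header slots plus one slot per SGE */
static unsigned int slots_for_sges(unsigned int num_sge)
{
	return 2 + num_sge;
}

/* Inline request: two header slots plus the payload rounded up to slots */
static unsigned int slots_for_inline(unsigned int payload_bytes)
{
	return 2 + (payload_bytes + SLOT_SZ - 1) / SLOT_SZ;
}

int main(void)
{
	printf("2 SGEs      -> %u slots\n", slots_for_sges(2));	/* 4 */
	printf("100B inline -> %u slots\n", slots_for_inline(100));	/* 9 */
	return 0;
}

In the fixed (static) WQE mode the driver still reserves a full 128-byte WQE,
i.e. 8 slots, regardless of the computed size.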
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 568ca39..f507844 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -39,6 +39,51 @@
 #ifndef __BNXT_QPLIB_FP_H__
 #define __BNXT_QPLIB_FP_H__
 
+/* A few helper structures are defined here temporarily;
+ * remove them once roce_hsi.h is updated in the original
+ * code base.
+ */
+struct sq_ud_ext_hdr {
+	__le32 dst_qp;
+	__le32 avid;
+	__le64 rsvd;
+};
+
+struct sq_raw_ext_hdr {
+	__le32 cfa_meta;
+	__le32 rsvd0;
+	__le64 rsvd1;
+};
+
+struct sq_rdma_ext_hdr {
+	__le64 remote_va;
+	__le32 remote_key;
+	__le32 rsvd;
+};
+
+struct sq_atomic_ext_hdr {
+	__le64 swap_data;
+	__le64 cmp_data;
+};
+
+struct sq_fr_pmr_ext_hdr {
+	__le64 pblptr;
+	__le64 va;
+};
+
+struct sq_bind_ext_hdr {
+	__le64 va;
+	__le32 length_lo;
+	__le32 length_hi;
+};
+
+struct rq_ext_hdr {
+	__le64 rsvd1;
+	__le64 rsvd2;
+};
+
+/* Helper structures end */
+
 struct bnxt_qplib_srq {
 	struct bnxt_qplib_pd		*pd;
 	struct bnxt_qplib_dpi		*dpi;
@@ -74,6 +119,8 @@
 	u8				flags;
 	u32				start_psn;
 	u32				next_psn;
+	u32				slot_idx;
+	u8				slots;
 	struct sq_psn_search		*psn_search;
 	struct sq_psn_search_ext	*psn_ext;
 };
@@ -213,6 +260,8 @@
 	u32				phantom_cqe_cnt;
 	u32				next_cq_cons;
 	bool				flushed;
+	u32				swq_start;
+	u32				swq_last;
 };
 
 struct bnxt_qplib_qp {
@@ -224,9 +273,10 @@
 	u32				id;
 	u8				type;
 	u8				sig_type;
-	u32				modify_flags;
+	u8				wqe_mode;
 	u8				state;
 	u8				cur_qp_state;
+	u64				modify_flags;
 	u32				max_inline_data;
 	u32				mtu;
 	u8				path_mtu;
@@ -300,11 +350,18 @@
 	(!!((hdr)->cqe_type_toggle & CQ_BASE_TOGGLE) ==		\
 	   !((raw_cons) & (cp_bit)))
 
-static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *qplib_q)
+static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *que,
+					 u8 slots)
 {
-	return HWQ_CMP((qplib_q->hwq.prod + qplib_q->q_full_delta),
-		       &qplib_q->hwq) == HWQ_CMP(qplib_q->hwq.cons,
-						 &qplib_q->hwq);
+	struct bnxt_qplib_hwq *hwq;
+	int avail;
+
+	hwq = &que->hwq;
+	/* A false "queue full" is possible; retrying the post-send is safe */
+	avail = hwq->cons - hwq->prod;
+	if (hwq->cons <= hwq->prod)
+		avail += hwq->depth;
+	return avail <= slots;
 }
 
 struct bnxt_qplib_cqe {
@@ -489,4 +546,64 @@
 				  struct bnxt_qplib_cqe *cqe,
 				  int num_cqes);
 void bnxt_qplib_flush_cqn_wq(struct bnxt_qplib_qp *qp);
+
+static inline void *bnxt_qplib_get_swqe(struct bnxt_qplib_q *que, u32 *swq_idx)
+{
+	u32 idx;
+
+	idx = que->swq_start;
+	if (swq_idx)
+		*swq_idx = idx;
+	return &que->swq[idx];
+}
+
+static inline void bnxt_qplib_swq_mod_start(struct bnxt_qplib_q *que, u32 idx)
+{
+	que->swq_start = que->swq[idx].next_idx;
+}
+
+static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que)
+{
+	return (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge);
+}
+
+static inline u32 bnxt_qplib_set_sq_size(struct bnxt_qplib_q *que, u8 wqe_mode)
+{
+	return (wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ?
+		que->max_wqe : bnxt_qplib_get_depth(que);
+}
+
+static inline u32 bnxt_qplib_set_sq_max_slot(u8 wqe_mode)
+{
+	return (wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ?
+		sizeof(struct sq_send) / sizeof(struct sq_sge) : 1;
+}
+
+static inline u32 bnxt_qplib_set_rq_max_slot(u32 wqe_size)
+{
+	return (wqe_size / sizeof(struct sq_sge));
+}
+
+static inline u16 __xlate_qfd(u16 delta, u16 wqe_bytes)
+{
+	/* For Cu/Wh delta = 128, stride = 16, wqe_bytes = 128
+	 * For Gen-p5 B/C mode delta = 0, stride = 16, wqe_bytes = 128.
+	 * For Gen-p5 delta = 0, stride = 16, 32 <= wqe_bytes <= 512
+	 * when 8916 is disabled.
+	 */
+	return (delta * wqe_bytes) / sizeof(struct sq_sge);
+}
+
+static inline u16 bnxt_qplib_calc_ilsize(struct bnxt_qplib_swqe *wqe, u16 max)
+{
+	u16 size = 0;
+	int indx;
+
+	for (indx = 0; indx < wqe->num_sge; indx++)
+		size += wqe->sg_list[indx].size;
+	if (size > max)
+		size = max;
+
+	return size;
+}
 #endif /* __BNXT_QPLIB_FP_H__ */
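Illustrative sketch (not part of the patch): the occupancy check introduced in
bnxt_qplib_queue_full() above, modelled as standalone C with a worked
wrap-around case:

#include <assert.h>

/* Free slots between producer and consumer on a ring of 'depth' slots */
static int ring_avail(unsigned int prod, unsigned int cons, unsigned int depth)
{
	int avail = (int)cons - (int)prod;

	if (cons <= prod)
		avail += depth;		/* consumer is behind the producer */
	return avail;
}

static int ring_full(unsigned int prod, unsigned int cons, unsigned int depth,
		     unsigned int slots_needed)
{
	return ring_avail(prod, cons, depth) <= (int)slots_needed;
}

int main(void)
{
	/* 64-slot ring, producer at 60, consumer at 2: 6 slots remain free,
	 * so an 8-slot request is rejected while a 4-slot request fits.
	 */
	assert(ring_avail(60, 2, 64) == 6);
	assert(ring_full(60, 2, 64, 8));
	assert(!ring_full(60, 2, 64, 4));
	return 0;
}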
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index c29cbd3..9da470d 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -41,6 +41,28 @@
 
 extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero;
 
+#define CHIP_NUM_57508		0x1750
+#define CHIP_NUM_57504		0x1751
+#define CHIP_NUM_57502		0x1752
+
+enum bnxt_qplib_wqe_mode {
+	BNXT_QPLIB_WQE_MODE_STATIC	= 0x00,
+	BNXT_QPLIB_WQE_MODE_VARIABLE	= 0x01,
+	BNXT_QPLIB_WQE_MODE_INVALID	= 0x02
+};
+
+struct bnxt_qplib_drv_modes {
+	u8	wqe_mode;
+	/* Other modes to follow here */
+};
+
+struct bnxt_qplib_chip_ctx {
+	u16	chip_num;
+	u8	chip_rev;
+	u8	chip_metal;
+	struct bnxt_qplib_drv_modes modes;
+};
+
 #define PTR_CNT_PER_PG		(PAGE_SIZE / sizeof(void *))
 #define PTR_MAX_IDX_PER_PG	(PTR_CNT_PER_PG - 1)
 #define PTR_PG(x)		(((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG)
@@ -141,6 +163,9 @@
 	u32				cons;		/* raw */
 	u8				cp_bit;
 	u8				is_user;
+	u64				*pad_pg;
+	u32				pad_stride;
+	u32				pad_pgofft;
 };
 
 struct bnxt_qplib_db_info {
@@ -148,6 +173,7 @@
 	void __iomem		*priv_db;
 	struct bnxt_qplib_hwq	*hwq;
 	u32			xid;
+	u32			max_slot;
 };
 
 /* Tables */
@@ -230,16 +256,6 @@
 	u64				hwrm_intf_ver;
 };
 
-struct bnxt_qplib_chip_ctx {
-	u16	chip_num;
-	u8	chip_rev;
-	u8	chip_metal;
-};
-
-#define CHIP_NUM_57508		0x1750
-#define CHIP_NUM_57504		0x1751
-#define CHIP_NUM_57502		0x1752
-
 struct bnxt_qplib_res {
 	struct pci_dev			*pdev;
 	struct bnxt_qplib_chip_ctx	*cctx;
@@ -317,6 +333,14 @@
 	return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx);
 }
 
+static inline void *bnxt_qplib_get_prod_qe(struct bnxt_qplib_hwq *hwq, u32 idx)
+{
+	idx += hwq->prod;
+	if (idx >= hwq->depth)
+		idx -= hwq->depth;
+	return bnxt_qplib_get_qe(hwq, idx, NULL);
+}
+
 #define to_bnxt_qplib(ptr, type, member)	\
 	container_of(ptr, type, member)
 
@@ -351,6 +375,17 @@
 			 struct bnxt_qplib_ctx *ctx,
 			 bool virt_fn, bool is_p5);
 
+static inline void bnxt_qplib_hwq_incr_prod(struct bnxt_qplib_hwq *hwq, u32 cnt)
+{
+	hwq->prod = (hwq->prod + cnt) % hwq->depth;
+}
+
+static inline void bnxt_qplib_hwq_incr_cons(struct bnxt_qplib_hwq *hwq,
+					    u32 cnt)
+{
+	hwq->cons = (hwq->cons + cnt) % hwq->depth;
+}
+
 static inline void bnxt_qplib_ring_db32(struct bnxt_qplib_db_info *info,
 					bool arm)
 {
@@ -383,8 +418,7 @@
 
 	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | type;
 	key <<= 32;
-	key |= (info->hwq->prod & (info->hwq->max_elements - 1)) &
-		DBC_DBC_INDEX_MASK;
+	key |= (info->hwq->prod / info->max_slot) & DBC_DBC_INDEX_MASK;
 	writeq(key, info->db);
 }
 
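Illustrative sketch (not part of the patch): with the queue now addressed in
16-byte slots, producer and consumer indices advance modulo the slot depth
(bnxt_qplib_hwq_incr_prod/_cons above) and the doorbell index is the slot
producer divided by the per-WQE slot count. A minimal standalone model:

#include <stdio.h>

static unsigned int incr_index(unsigned int idx, unsigned int cnt,
			       unsigned int depth)
{
	return (idx + cnt) % depth;	/* wrap within the slot ring */
}

static unsigned int db_index(unsigned int prod, unsigned int max_slot)
{
	return prod / max_slot;		/* doorbell counts WQEs, not slots */
}

int main(void)
{
	unsigned int prod = 252, depth = 256;

	prod = incr_index(prod, 6, depth);		/* wraps to 2 */
	printf("prod=%u db_index=%u\n", prod, db_index(prod, 2));
	return 0;
}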
diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
index 6f00f07..3e40e0d 100644
--- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h
+++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
@@ -1126,6 +1126,7 @@
 	#define CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION	   0x2UL
 	#define CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE      0x4UL
 	#define CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED		   0x8UL
+	#define CMDQ_CREATE_QP_QP_FLAGS_VARIABLE_SIZED_WQE_ENABLED 0x10UL
 	u8 type;
 	#define CMDQ_CREATE_QP_TYPE_RC				   0x2UL
 	#define CMDQ_CREATE_QP_TYPE_UD				   0x4UL
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index e8e11bd..2b2b009 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -980,7 +980,7 @@
 void c4iw_qp_add_ref(struct ib_qp *qp);
 void c4iw_qp_rem_ref(struct ib_qp *qp);
 struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			    u32 max_num_sg, struct ib_udata *udata);
+			    u32 max_num_sg);
 int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		   unsigned int *sg_offset);
 int c4iw_dealloc_mw(struct ib_mw *mw);
@@ -1053,8 +1053,9 @@
 		       const struct ib_recv_wr **bad_wr);
 struct c4iw_wr_wait *c4iw_alloc_wr_wait(gfp_t gfp);
 
-typedef int c4iw_restrack_func(struct sk_buff *msg,
-			       struct rdma_restrack_entry *res);
-extern c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX];
+int c4iw_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr);
+int c4iw_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ibcq);
+int c4iw_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ibqp);
+int c4iw_fill_res_cm_id_entry(struct sk_buff *msg, struct rdma_cm_id *cm_id);
 
 #endif
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 962dc97..73936c3 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -399,7 +399,6 @@
 	mmid = stag >> 8;
 	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
 	mhp->ibmr.length = mhp->attr.len;
-	mhp->ibmr.iova = mhp->attr.va_fbo;
 	mhp->ibmr.page_size = 1U << (mhp->attr.page_size + 12);
 	pr_debug("mmid 0x%x mhp %p\n", mmid, mhp);
 	return xa_insert_irq(&mhp->rhp->mrs, mmid, mhp, GFP_KERNEL);
@@ -691,7 +690,7 @@
 }
 
 struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			    u32 max_num_sg, struct ib_udata *udata)
+			    u32 max_num_sg)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index ba83d94..6c579d2 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -236,14 +236,6 @@
 	return 0;
 }
 
-static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-			   u16 *pkey)
-{
-	pr_debug("ibdev %p\n", ibdev);
-	*pkey = 0;
-	return 0;
-}
-
 static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index,
 			  union ib_gid *gid)
 {
@@ -317,7 +309,6 @@
 	    IB_PORT_DEVICE_MGMT_SUP |
 	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
 	props->gid_tbl_len = 1;
-	props->pkey_tbl_len = 1;
 	props->max_msg_sz = -1;
 
 	return ret;
@@ -439,7 +430,6 @@
 	if (err)
 		return err;
 
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
 	immutable->gid_tbl_len = attr.gid_tbl_len;
 
 	return 0;
@@ -458,13 +448,6 @@
 		 FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
 }
 
-static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
-{
-	return (res->type < ARRAY_SIZE(c4iw_restrack_funcs) &&
-		c4iw_restrack_funcs[res->type]) ?
-		c4iw_restrack_funcs[res->type](msg, res) : 0;
-}
-
 static const struct ib_device_ops c4iw_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_CXGB4,
@@ -485,7 +468,9 @@
 	.destroy_cq = c4iw_destroy_cq,
 	.destroy_qp = c4iw_destroy_qp,
 	.destroy_srq = c4iw_destroy_srq,
-	.fill_res_entry = fill_res_entry,
+	.fill_res_cq_entry = c4iw_fill_res_cq_entry,
+	.fill_res_cm_id_entry = c4iw_fill_res_cm_id_entry,
+	.fill_res_mr_entry = c4iw_fill_res_mr_entry,
 	.get_dev_fw_str = get_dev_fw_str,
 	.get_dma_mr = c4iw_get_dma_mr,
 	.get_hw_stats = c4iw_get_mib,
@@ -508,7 +493,6 @@
 	.post_srq_recv = c4iw_post_srq_recv,
 	.query_device = c4iw_query_device,
 	.query_gid = c4iw_query_gid,
-	.query_pkey = c4iw_query_pkey,
 	.query_port = c4iw_query_port,
 	.query_qp = c4iw_ib_query_qp,
 	.reg_user_mr = c4iw_reg_user_mr,
diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c
index f82d46e..b32e651 100644
--- a/drivers/infiniband/hw/cxgb4/restrack.c
+++ b/drivers/infiniband/hw/cxgb4/restrack.c
@@ -134,10 +134,8 @@
 	return -EMSGSIZE;
 }
 
-static int fill_res_qp_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+int c4iw_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ibqp)
 {
-	struct ib_qp *ibqp = container_of(res, struct ib_qp, res);
 	struct t4_swsqe *fsp = NULL, *lsp = NULL;
 	struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
 	u16 first_sq_idx = 0, last_sq_idx = 0;
@@ -195,10 +193,9 @@
 	struct c4iw_ep ep;
 };
 
-static int fill_res_ep_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+int c4iw_fill_res_cm_id_entry(struct sk_buff *msg,
+			      struct rdma_cm_id *cm_id)
 {
-	struct rdma_cm_id *cm_id = rdma_res_to_id(res);
 	struct nlattr *table_attr;
 	struct c4iw_ep_common *epcp;
 	struct c4iw_listen_ep *listen_ep = NULL;
@@ -372,10 +369,8 @@
 	return -EMSGSIZE;
 }
 
-static int fill_res_cq_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+int c4iw_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ibcq)
 {
-	struct ib_cq *ibcq = container_of(res, struct ib_cq, res);
 	struct c4iw_cq *chp = to_c4iw_cq(ibcq);
 	struct nlattr *table_attr;
 	struct t4_cqe hwcqes[2];
@@ -433,10 +428,8 @@
 	return -EMSGSIZE;
 }
 
-static int fill_res_mr_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+int c4iw_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
 {
-	struct ib_mr *ibmr = container_of(res, struct ib_mr, res);
 	struct c4iw_mr *mhp = to_c4iw_mr(ibmr);
 	struct c4iw_dev *dev = mhp->rhp;
 	u32 stag = mhp->attr.stag;
@@ -492,10 +485,3 @@
 err:
 	return -EMSGSIZE;
 }
-
-c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = {
-	[RDMA_RESTRACK_QP]	= fill_res_qp_entry,
-	[RDMA_RESTRACK_CM_ID]	= fill_res_ep_entry,
-	[RDMA_RESTRACK_CQ]	= fill_res_cq_entry,
-	[RDMA_RESTRACK_MR]	= fill_res_mr_entry,
-};
diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
index bef2bd2..5484b08 100644
--- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
@@ -606,8 +606,8 @@
 	/* Number of sub-CQs to be created for each CQ */
 	u16 sub_cqs_per_cq;
 
-	/* MBZ */
-	u16 reserved;
+	/* Minimum number of WQEs per SQ */
+	u16 min_sq_depth;
 
 	/* Maximum number of SGEs (buffers) allowed for a single send WQE */
 	u16 max_wr_send_sges;
@@ -632,6 +632,17 @@
 
 	/* Maximum number of SGEs for a single RDMA read WQE */
 	u16 max_wr_rdma_sges;
+
+	/*
+	 * Maximum number of bytes that can be written to the SQ between two
+	 * consecutive doorbells (in units of 64B). The driver must ensure that
+	 * only complete WQEs are written to the queue before issuing a
+	 * doorbell. Examples: with max_tx_batch=16 and a 64B WQE size, up to
+	 * 16 WQEs can be written to the SQ between two consecutive doorbells;
+	 * with max_tx_batch=11 and a 128B WQE size, up to 5 WQEs can be
+	 * written between two consecutive doorbells. Zero means unlimited.
+	 */
+	u16 max_tx_batch;
 };
 
 struct efa_admin_feature_aenq_desc {
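Illustrative sketch (not part of the patch): the max_tx_batch limit documented
above is expressed in 64-byte units, so the number of complete WQEs a driver
may write between two doorbells follows directly. A standalone check of the
two examples given in the comment:

#include <stdio.h>

static unsigned int wqes_per_batch(unsigned int max_tx_batch,
				   unsigned int wqe_size)
{
	if (!max_tx_batch)
		return 0;	/* zero means unlimited, no batching limit */
	return (max_tx_batch * 64) / wqe_size;
}

int main(void)
{
	printf("%u\n", wqes_per_batch(16, 64));		/* 16 WQEs per batch */
	printf("%u\n", wqes_per_batch(11, 128));	/* 5 WQEs per batch  */
	return 0;
}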
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c
index fabd8df..6ac2362 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.c
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.c
@@ -480,6 +480,8 @@
 	result->max_llq_size = resp.u.queue_attr.max_llq_size;
 	result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq;
 	result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges;
+	result->max_tx_batch = resp.u.queue_attr.max_tx_batch;
+	result->min_sq_depth = resp.u.queue_attr.min_sq_depth;
 
 	err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR);
 	if (err) {
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h
index 41ce4a4..190bac2 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.h
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.h
@@ -127,6 +127,8 @@
 	u16 max_sq_sge;
 	u16 max_rq_sge;
 	u16 max_wr_rdma_sge;
+	u16 max_tx_batch;
+	u16 min_sq_depth;
 	u8 db_bar;
 };
 
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 82145574..92d7011 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -12,10 +12,12 @@
 
 #include "efa.h"
 
-#define PCI_DEV_ID_EFA_VF 0xefa0
+#define PCI_DEV_ID_EFA0_VF 0xefa0
+#define PCI_DEV_ID_EFA1_VF 0xefa1
 
 static const struct pci_device_id efa_pci_tbl[] = {
-	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA_VF) },
+	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) },
+	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) },
 	{ }
 };
 
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 7dd0824..9e201f169 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1502,11 +1502,39 @@
 	return efa_com_dealloc_uar(&dev->edev, &params);
 }
 
+#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \
+	(_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \
+		     NULL : #_attr)
+
+static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext,
+				   const struct efa_ibv_alloc_ucontext_cmd *cmd)
+{
+	struct efa_dev *dev = to_edev(ibucontext->device);
+	char *attr_str;
+
+	if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch,
+				EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str))
+		goto err;
+
+	if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth,
+				EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR,
+				attr_str))
+		goto err;
+
+	return 0;
+
+err:
+	ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n",
+		  attr_str);
+	return -EOPNOTSUPP;
+}
+
 int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
 {
 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
 	struct efa_dev *dev = to_edev(ibucontext->device);
 	struct efa_ibv_alloc_ucontext_resp resp = {};
+	struct efa_ibv_alloc_ucontext_cmd cmd = {};
 	struct efa_com_alloc_uar_result result;
 	int err;
 
@@ -1515,6 +1543,18 @@
 	 * we will ack input fields in our response.
 	 */
 
+	err = ib_copy_from_udata(&cmd, udata,
+				 min(sizeof(cmd), udata->inlen));
+	if (err) {
+		ibdev_dbg(&dev->ibdev,
+			  "Cannot copy udata for alloc_ucontext\n");
+		goto err_out;
+	}
+
+	err = efa_user_comp_handshake(ibucontext, &cmd);
+	if (err)
+		goto err_out;
+
 	err = efa_com_alloc_uar(&dev->edev, &result);
 	if (err)
 		goto err_out;
@@ -1526,6 +1566,8 @@
 	resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq;
 	resp.inline_buf_size = dev->dev_attr.inline_buf_size;
 	resp.max_llq_size = dev->dev_attr.max_llq_size;
+	resp.max_tx_batch = dev->dev_attr.max_tx_batch;
+	resp.min_sq_wr = dev->dev_attr.min_sq_depth;
 
 	if (udata && udata->outlen) {
 		err = ib_copy_to_udata(udata, &resp,
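Illustrative sketch (not part of the patch): the EFA_CHECK_USER_COMP handshake
above only fails when the device exposes an attribute that userspace did not
acknowledge via its comp_mask. A standalone model of that rule, using a
hypothetical mask bit rather than the real EFA_ALLOC_UCONTEXT_CMD_COMP_*
values:

#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_COMP_MIN_SQ_WR	0x2UL	/* hypothetical comp_mask bit */

static bool attr_handshake_ok(unsigned int dev_attr_value,
			      unsigned long user_comp_mask,
			      unsigned long attr_bit)
{
	/* OK if the device does not enforce the attribute, or if
	 * userspace declared that it understands it.
	 */
	return !dev_attr_value || (user_comp_mask & attr_bit);
}

int main(void)
{
	/* Device enforces a minimum SQ depth but old userspace never set the
	 * bit: the handshake fails and alloc_ucontext returns -EOPNOTSUPP.
	 */
	printf("%d\n", attr_handshake_ok(64, 0, EXAMPLE_COMP_MIN_SQ_WR));  /* 0 */
	printf("%d\n", attr_handshake_ok(64, EXAMPLE_COMP_MIN_SQ_WR,
					 EXAMPLE_COMP_MIN_SQ_WR));	   /* 1 */
	printf("%d\n", attr_handshake_ok(0, 0, EXAMPLE_COMP_MIN_SQ_WR));   /* 1 */
	return 0;
}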
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 15f9c63..7eaf9953 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -7317,11 +7317,11 @@
 	case 1: return OPA_LINK_WIDTH_1X;
 	case 2: return OPA_LINK_WIDTH_2X;
 	case 3: return OPA_LINK_WIDTH_3X;
+	case 4: return OPA_LINK_WIDTH_4X;
 	default:
 		dd_dev_info(dd, "%s: invalid width %d, using 4\n",
 			    __func__, width);
-		/* fall through */
-	case 4: return OPA_LINK_WIDTH_4X;
+		return OPA_LINK_WIDTH_4X;
 	}
 }
 
@@ -7376,12 +7376,13 @@
 		case 0:
 			dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
 			break;
+		case 1:
+			dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+			break;
 		default:
 			dd_dev_err(dd,
 				   "%s: unexpected max rate %d, using 25Gb\n",
 				   __func__, (int)max_rate);
-			/* fall through */
-		case 1:
 			dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
 			break;
 		}
@@ -12878,11 +12879,6 @@
 static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
 {
 	switch (chip_lstate) {
-	default:
-		dd_dev_err(dd,
-			   "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
-			   chip_lstate);
-		/* fall through */
 	case LSTATE_DOWN:
 		return IB_PORT_DOWN;
 	case LSTATE_INIT:
@@ -12891,6 +12887,11 @@
 		return IB_PORT_ARMED;
 	case LSTATE_ACTIVE:
 		return IB_PORT_ACTIVE;
+	default:
+		dd_dev_err(dd,
+			   "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+			   chip_lstate);
+		return IB_PORT_DOWN;
 	}
 }
 
@@ -12898,10 +12899,6 @@
 {
 	/* look at the HFI meta-states only */
 	switch (chip_pstate & 0xf0) {
-	default:
-		dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
-			   chip_pstate);
-		/* fall through */
 	case PLS_DISABLED:
 		return IB_PORTPHYSSTATE_DISABLED;
 	case PLS_OFFLINE:
@@ -12914,6 +12911,10 @@
 		return IB_PORTPHYSSTATE_LINKUP;
 	case PLS_PHYTEST:
 		return IB_PORTPHYSSTATE_PHY_TEST;
+	default:
+		dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+			   chip_pstate);
+		return IB_PORTPHYSSTATE_DISABLED;
 	}
 }
 
diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c
index 2b57ba7..0e83d4b 100644
--- a/drivers/infiniband/hw/hfi1/firmware.c
+++ b/drivers/infiniband/hw/hfi1/firmware.c
@@ -1868,11 +1868,8 @@
 									2;
 				break;
 			case PLATFORM_CONFIG_RX_PRESET_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_TX_PRESET_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 				pcfgcache->config_tables[table_type].num_table =
 							table_length_dwords;
@@ -1890,15 +1887,10 @@
 			/* metadata table */
 			switch (table_type) {
 			case PLATFORM_CONFIG_SYSTEM_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_PORT_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_RX_PRESET_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_TX_PRESET_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 				break;
 			default:
@@ -2027,15 +2019,10 @@
 
 	switch (table) {
 	case PLATFORM_CONFIG_SYSTEM_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_PORT_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_RX_PRESET_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_TX_PRESET_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 		if (field && field < platform_config_table_limits[table])
 			src_ptr =
@@ -2138,11 +2125,8 @@
 			pcfgcache->config_tables[table_type].table;
 		break;
 	case PLATFORM_CONFIG_RX_PRESET_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_TX_PRESET_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 		src_ptr = pcfgcache->config_tables[table_type].table;
 
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
index 7073f23..3222e3a 100644
--- a/drivers/infiniband/hw/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -721,7 +721,7 @@
 			/* Bad mkey not a violation below level 2 */
 			if (ibp->rvp.mkeyprot < 2)
 				break;
-			/* fall through */
+			fallthrough;
 		case IB_MGMT_METHOD_SET:
 		case IB_MGMT_METHOD_TRAP_REPRESS:
 			if (ibp->rvp.mkey_violations != 0xFFFF)
@@ -1272,7 +1272,7 @@
 	case IB_PORT_NOP:
 		if (phys_state == IB_PORTPHYSSTATE_NOP)
 			break;
-		/* FALLTHROUGH */
+		fallthrough;
 	case IB_PORT_DOWN:
 		if (phys_state == IB_PORTPHYSSTATE_NOP) {
 			link_state = HLS_DN_DOWNDEF;
@@ -2300,7 +2300,6 @@
 	 * can be changed from the default values
 	 */
 	case OPA_VLARB_PREEMPT_ELEMENTS:
-		/* FALLTHROUGH */
 	case OPA_VLARB_PREEMPT_MATRIX:
 		smp->status |= IB_SMP_UNSUP_METH_ATTR;
 		break;
@@ -4170,7 +4169,7 @@
 			return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
 		if (ibp->rvp.port_cap_flags & IB_PORT_SM)
 			return IB_MAD_RESULT_SUCCESS;
-		/* FALLTHROUGH */
+		fallthrough;
 	default:
 		smp->status |= IB_SMP_UNSUP_METH_ATTR;
 		ret = reply((struct ib_mad_hdr *)smp);
@@ -4240,7 +4239,7 @@
 			return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
 		if (ibp->rvp.port_cap_flags & IB_PORT_SM)
 			return IB_MAD_RESULT_SUCCESS;
-		/* FALLTHROUGH */
+		fallthrough;
 	default:
 		smp->status |= IB_SMP_UNSUP_METH_ATTR;
 		ret = reply((struct ib_mad_hdr *)smp);
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index 1a6268d..18d32f0 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -306,7 +306,7 @@
 	ret = pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
 	if (ret) {
 		dd_dev_err(dd, "Unable to read from PCI config\n");
-		return ret;
+		return pcibios_err_to_errno(ret);
 	}
 
 	if ((linkcap & PCI_EXP_LNKCAP_SLS) != PCI_EXP_LNKCAP_SLS_8_0GB) {
@@ -334,10 +334,14 @@
 	return 0;
 }
 
-/* restore command and BARs after a reset has wiped them out */
+/**
+ * Restore command and BARs after a reset has wiped them out
+ *
+ * Returns 0 on success, otherwise a negative error value
+ */
 int restore_pci_variables(struct hfi1_devdata *dd)
 {
-	int ret = 0;
+	int ret;
 
 	ret = pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
 	if (ret)
@@ -386,13 +390,17 @@
 
 error:
 	dd_dev_err(dd, "Unable to write to PCI config\n");
-	return ret;
+	return pcibios_err_to_errno(ret);
 }
 
-/* Save BARs and command to rewrite after device reset */
+/**
+ * Save BARs and command to rewrite after device reset
+ *
+ * Returns 0 on success, otherwise a negative error value
+ */
 int save_pci_variables(struct hfi1_devdata *dd)
 {
-	int ret = 0;
+	int ret;
 
 	ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0,
 				    &dd->pcibar0);
@@ -441,7 +449,7 @@
 
 error:
 	dd_dev_err(dd, "Unable to read from PCI config\n");
-	return ret;
+	return pcibios_err_to_errno(ret);
 }
 
 /*
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 79126b2..ff864f6 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -86,7 +86,7 @@
 	switch (op) {
 	case PSC_GLOBAL_ENABLE:
 		reg |= SEND_CTRL_SEND_ENABLE_SMASK;
-	/* Fall through */
+		fallthrough;
 	case PSC_DATA_VL_ENABLE:
 		mask = 0;
 		for (i = 0; i < ARRAY_SIZE(dd->vld); i++)
diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c
index 03024ce..b12e466 100644
--- a/drivers/infiniband/hw/hfi1/pio_copy.c
+++ b/drivers/infiniband/hw/hfi1/pio_copy.c
@@ -191,22 +191,22 @@
 	switch (n) {
 	case 7:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 6:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 5:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 4:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 3:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 2:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 1:
 		*dest++ = *src++;
 		/* fall through */
diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c
index 36593f2..4642d6c 100644
--- a/drivers/infiniband/hw/hfi1/platform.c
+++ b/drivers/infiniband/hw/hfi1/platform.c
@@ -668,8 +668,8 @@
 
 	/* active optical cables only */
 	switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) {
-	case 0x0 ... 0x9: /* fallthrough */
-	case 0xC: /* fallthrough */
+	case 0x0 ... 0x9: fallthrough;
+	case 0xC: fallthrough;
 	case 0xE:
 		/* active AOC */
 		power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
@@ -899,8 +899,8 @@
 
 		*ptr_tuning_method = OPA_PASSIVE_TUNING;
 		break;
-	case 0x0 ... 0x9: /* fallthrough */
-	case 0xC: /* fallthrough */
+	case 0x0 ... 0x9: fallthrough;
+	case 0xC: fallthrough;
 	case 0xE:
 		ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset,
 				       ptr_total_atten);
@@ -909,7 +909,7 @@
 
 		*ptr_tuning_method = OPA_ACTIVE_TUNING;
 		break;
-	case 0xD: /* fallthrough */
+	case 0xD: fallthrough;
 	case 0xF:
 	default:
 		dd_dev_warn(ppd->dd, "%s: Unknown/unsupported cable\n",
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index be62284..356518e 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -312,7 +312,7 @@
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_RC:
 		hfi1_setup_tid_rdma_wqe(qp, wqe);
-		/* fall through */
+		fallthrough;
 	case IB_QPT_UC:
 		if (wqe->length > 0x80000000U)
 			return -EINVAL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index b670321..b0d053d 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -113,20 +113,6 @@
 }
 
 /**
- * hfi1_create_qp - create a queue pair for a device
- * @ibpd: the protection domain who's device we create the queue pair for
- * @init_attr: the attributes of the queue pair
- * @udata: user data for libibverbs.so
- *
- * Returns the queue pair on success, otherwise returns an errno.
- *
- * Called by the ib_create_qp() core verbs function.
- */
-struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
-			     struct ib_qp_init_attr *init_attr,
-			     struct ib_udata *udata);
-
-/**
  * hfi1_qp_wakeup - wake up on the indicated event
  * @qp: the QP
  * @flag: flag the qp on which the qp is stalled
diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c
index b596699..8386c84 100644
--- a/drivers/infiniband/hw/hfi1/qsfp.c
+++ b/drivers/infiniband/hw/hfi1/qsfp.c
@@ -231,7 +231,7 @@
 		break;
 	case 2:
 		offset_bytes[1] = (offset >> 8) & 0xff;
-		/* fall through */
+		fallthrough;
 	case 1:
 		num_msgs = 2;
 		offset_bytes[0] = offset & 0xff;
@@ -279,7 +279,7 @@
 		break;
 	case 2:
 		offset_bytes[1] = (offset >> 8) & 0xff;
-		/* fall through */
+		fallthrough;
 	case 1:
 		num_msgs = 2;
 		offset_bytes[0] = offset & 0xff;
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index f1734e5..1bb5f57 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -141,7 +141,7 @@
 	case OP(RDMA_READ_RESPONSE_ONLY):
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 		release_rdma_sge_mr(e);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(ATOMIC_ACKNOWLEDGE):
 		/*
 		 * We can increment the tail pointer now that the last
@@ -160,7 +160,7 @@
 			qp->s_acked_ack_queue = next;
 		qp->s_tail_ack_queue = next;
 		trace_hfi1_rsp_make_rc_ack(qp, e->psn);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_ONLY):
 	case OP(ACKNOWLEDGE):
 		/* Check for no next entry in the queue. */
@@ -267,7 +267,7 @@
 
 	case OP(RDMA_READ_RESPONSE_FIRST):
 		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_READ_RESPONSE_MIDDLE):
 		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
 		ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
@@ -881,8 +881,7 @@
 				goto bail;
 			}
 			qp->s_num_rd_atomic++;
-
-			/* FALLTHROUGH */
+			fallthrough;
 		case IB_WR_OPFN:
 			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
 				qp->s_lsn++;
@@ -946,10 +945,10 @@
 		 * See restart_rc().
 		 */
 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_FIRST):
 		qp->s_state = OP(SEND_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		bth2 = mask_psn(qp->s_psn++);
 		ss = &qp->s_sge;
@@ -991,10 +990,10 @@
 		 * See restart_rc().
 		 */
 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_FIRST):
 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		bth2 = mask_psn(qp->s_psn++);
 		ss = &qp->s_sge;
@@ -2901,7 +2900,7 @@
 		if (!ret)
 			goto rnr_nak;
 		qp->r_rcv_len = 0;
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 	case OP(RDMA_WRITE_MIDDLE):
 send_middle:
@@ -2941,7 +2940,7 @@
 			goto no_immediate_data;
 		if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
 			goto send_last_inv;
-		/* FALLTHROUGH -- for SEND_ONLY_WITH_IMMEDIATE */
+		fallthrough;	/* for SEND_ONLY_WITH_IMMEDIATE */
 	case OP(SEND_LAST_WITH_IMMEDIATE):
 send_last_imm:
 		wc.ex.imm_data = ohdr->u.imm_data;
@@ -2957,7 +2956,7 @@
 		goto send_last;
 	case OP(RDMA_WRITE_LAST):
 		copy_last = rvt_is_user_qp(qp);
-		/* fall through */
+		fallthrough;
 	case OP(SEND_LAST):
 no_immediate_data:
 		wc.wc_flags = 0;
@@ -3010,7 +3009,7 @@
 
 	case OP(RDMA_WRITE_ONLY):
 		copy_last = rvt_is_user_qp(qp);
-		/* fall through */
+		fallthrough;
 	case OP(RDMA_WRITE_FIRST):
 	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index c93ea02..04575c9 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -2584,7 +2584,7 @@
 			 * 7220, e.g.
 			 */
 			ss->go_s99_running = 1;
-			/* fall through -- and start dma engine */
+			fallthrough;	/* and start dma engine */
 		case sdma_event_e10_go_hw_start:
 			/* This reference means the state machine is started */
 			sdma_get(&sde->state);
@@ -2726,7 +2726,6 @@
 		case sdma_event_e70_go_idle:
 			break;
 		case sdma_event_e85_link_down:
-			/* fall through */
 		case sdma_event_e80_hw_freeze:
 			sdma_set_state(sde, sdma_state_s80_hw_freeze);
 			atomic_dec(&sde->dd->sdma_unfreeze_count);
@@ -3007,7 +3006,7 @@
 		case sdma_event_e60_hw_halted:
 			need_progress = 1;
 			sdma_err_progress_check_schedule(sde);
-			/* fall through */
+			fallthrough;
 		case sdma_event_e90_sw_halted:
 			/*
 			* SW initiated halt does not perform engines
@@ -3021,7 +3020,7 @@
 			break;
 		case sdma_event_e85_link_down:
 			ss->go_s99_running = 0;
-			/* fall through */
+			fallthrough;
 		case sdma_event_e80_hw_freeze:
 			sdma_set_state(sde, sdma_state_s80_hw_freeze);
 			atomic_dec(&sde->dd->sdma_unfreeze_count);
@@ -3252,7 +3251,7 @@
 		tx->num_desc++;
 		tx->descs[2].qw[0] = 0;
 		tx->descs[2].qw[1] = 0;
-		/* FALLTHROUGH */
+		fallthrough;
 	case SDMA_AHG_APPLY_UPDATE2:
 		tx->num_desc++;
 		tx->descs[1].qw[0] = 0;
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index facff13..9af82ff 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -3227,7 +3227,7 @@
 	case IB_WR_RDMA_READ:
 		if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
 			break;
-		/* fall through */
+		fallthrough;
 	case IB_WR_TID_RDMA_READ:
 		switch (prev->wr.opcode) {
 		case IB_WR_RDMA_READ:
@@ -5067,7 +5067,7 @@
 		if (priv->s_state == TID_OP(WRITE_REQ))
 			hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
 		priv->s_state = TID_OP(WRITE_DATA);
-		/* fall through */
+		fallthrough;
 
 	case TID_OP(WRITE_DATA):
 		/*
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
index 0c77f18..1fb9183 100644
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -216,7 +216,7 @@
 
 	case OP(SEND_FIRST):
 		qp->s_state = OP(SEND_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		len = qp->s_len;
 		if (len > pmtu) {
@@ -241,7 +241,7 @@
 
 	case OP(RDMA_WRITE_FIRST):
 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		len = qp->s_len;
 		if (len > pmtu) {
@@ -414,7 +414,7 @@
 			goto no_immediate_data;
 		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
 			goto send_last_imm;
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		/* Check for invalid length PMTU or posted rwqe len. */
 		/*
@@ -515,7 +515,7 @@
 			wc.ex.imm_data = ohdr->u.rc.imm_data;
 			goto rdma_last_imm;
 		}
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		/* Check for invalid length PMTU or posted rwqe len. */
 		if (unlikely(tlen != (hdrsize + pmtu + 4)))
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 479fa55..da9888d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -37,9 +37,8 @@
 
 #define DRV_NAME "hns_roce"
 
-/* hip08 is a pci device, it includes two version according pci version id */
-#define PCI_REVISION_ID_HIP08_A			0x20
-#define PCI_REVISION_ID_HIP08_B			0x21
+/* hip08 is a pci device */
+#define PCI_REVISION_ID_HIP08			0x21
 
 #define HNS_ROCE_HW_VER1	('h' << 24 | 'i' << 16 | '0' << 8 | '6')
 
@@ -348,20 +347,22 @@
 	bool mtt_only; /* only alloc buffer-required MTT memory */
 };
 
+struct hns_roce_hem_cfg {
+	dma_addr_t	root_ba; /* root BA table's address */
+	bool		is_direct; /* addressing without BA table */
+	unsigned int	ba_pg_shift; /* BA table page shift */
+	unsigned int	buf_pg_shift; /* buffer page shift */
+	unsigned int	buf_pg_count;  /* buffer page count */
+	struct hns_roce_buf_region region[HNS_ROCE_MAX_BT_REGION];
+	int		region_count;
+};
+
 /* memory translate region */
 struct hns_roce_mtr {
 	struct hns_roce_hem_list hem_list; /* multi-hop addressing resource */
 	struct ib_umem		*umem; /* user space buffer */
 	struct hns_roce_buf	*kmem; /* kernel space buffer */
-	struct {
-		dma_addr_t	root_ba; /* root BA table's address */
-		bool		is_direct; /* addressing without BA table */
-		unsigned int	ba_pg_shift; /* BA table page shift */
-		unsigned int	buf_pg_shift; /* buffer page shift */
-		int		buf_pg_count;  /* buffer page count */
-		struct hns_roce_buf_region region[HNS_ROCE_MAX_BT_REGION];
-		unsigned int	region_count;
-	} hem_cfg; /* config for hardware addressing */
+	struct hns_roce_hem_cfg  hem_cfg; /* config for hardware addressing */
 };
 
 struct hns_roce_mw {
@@ -1192,7 +1193,7 @@
 			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
 			   struct ib_udata *udata);
 struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-				u32 max_num_sg, struct ib_udata *udata);
+				u32 max_num_sg);
 int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		       unsigned int *sg_offset);
 int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
@@ -1267,6 +1268,6 @@
 int hns_roce_init(struct hns_roce_dev *hr_dev);
 void hns_roce_exit(struct hns_roce_dev *hr_dev);
 
-int hns_roce_fill_res_entry(struct sk_buff *msg,
-			    struct rdma_restrack_entry *res);
+int hns_roce_fill_res_cq_entry(struct sk_buff *msg,
+			       struct ib_cq *ib_cq);
 #endif /* _HNS_ROCE_DEVICE_H */
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index cf39f56..07b4c85 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -2483,7 +2483,6 @@
 			u64 *sq_ba, u64 *rq_ba, dma_addr_t *bt_ba)
 {
 	struct ib_device *ibdev = &hr_dev->ib_dev;
-	int rq_pa_start;
 	int count;
 
 	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, sq_ba, 1, bt_ba);
@@ -2491,9 +2490,9 @@
 		ibdev_err(ibdev, "Failed to find SQ ba\n");
 		return -ENOBUFS;
 	}
-	rq_pa_start = hr_qp->rq.offset >> hr_qp->mtr.hem_cfg.buf_pg_shift;
-	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, rq_pa_start, rq_ba, 1,
-				  NULL);
+
+	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, rq_ba,
+				  1, NULL);
 	if (!count) {
 		ibdev_err(ibdev, "Failed to find RQ ba\n");
 		return -ENOBUFS;
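
The hns_roce_hw_v1.c hunk above (like the matching hns_roce_hw_v2.c and hns_roce_mr.c changes further down) switches hns_roce_mtr_find() callers from passing a pre-computed page index to passing the byte offset of the region inside the buffer; the MTR layer now performs the shift itself. A tiny illustrative check of that arithmetic, using example values rather than the driver's constants:

	#include <assert.h>

	int main(void)
	{
		unsigned int buf_pg_shift = 12;		/* assume 4 KB buffer pages */
		unsigned int rq_offset = 3 * 4096;	/* RQ starts at the 4th page */

		/* the MTR layer derives the page index from the byte offset */
		unsigned int page_index = rq_offset >> buf_pg_shift;

		assert(page_index == 3);
		return 0;
	}
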
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 0618ced..d296859 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -91,10 +91,11 @@
 }
 
 static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
-			 void *wqe, const struct ib_reg_wr *wr)
+			 const struct ib_reg_wr *wr)
 {
+	struct hns_roce_wqe_frmr_seg *fseg =
+		(void *)rc_sq_wqe + sizeof(struct hns_roce_v2_rc_send_wqe);
 	struct hns_roce_mr *mr = to_hr_mr(wr->mr);
-	struct hns_roce_wqe_frmr_seg *fseg = wqe;
 	u64 pbl_ba;
 
 	/* use ib_access_flags */
@@ -128,14 +129,16 @@
 		     V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S, 0);
 }
 
-static void set_atomic_seg(const struct ib_send_wr *wr, void *wqe,
+static void set_atomic_seg(const struct ib_send_wr *wr,
 			   struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
 			   unsigned int valid_num_sge)
 {
-	struct hns_roce_wqe_atomic_seg *aseg;
+	struct hns_roce_v2_wqe_data_seg *dseg =
+		(void *)rc_sq_wqe + sizeof(struct hns_roce_v2_rc_send_wqe);
+	struct hns_roce_wqe_atomic_seg *aseg =
+		(void *)dseg + sizeof(struct hns_roce_v2_wqe_data_seg);
 
-	set_data_seg_v2(wqe, wr->sg_list);
-	aseg = wqe + sizeof(struct hns_roce_v2_wqe_data_seg);
+	set_data_seg_v2(dseg, wr->sg_list);
 
 	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
 		aseg->fetchadd_swap_data = cpu_to_le64(atomic_wr(wr)->swap);
@@ -143,7 +146,7 @@
 	} else {
 		aseg->fetchadd_swap_data =
 			cpu_to_le64(atomic_wr(wr)->compare_add);
-		aseg->cmp_data  = 0;
+		aseg->cmp_data = 0;
 	}
 
 	roce_set_field(rc_sq_wqe->byte_16, V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
@@ -176,13 +179,15 @@
 
 static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 			     struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
-			     void *wqe, unsigned int *sge_ind,
+			     unsigned int *sge_ind,
 			     unsigned int valid_num_sge)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
-	struct hns_roce_v2_wqe_data_seg *dseg = wqe;
+	struct hns_roce_v2_wqe_data_seg *dseg =
+		(void *)rc_sq_wqe + sizeof(struct hns_roce_v2_rc_send_wqe);
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
+	void *wqe = dseg;
 	int j = 0;
 	int i;
 
@@ -438,7 +443,6 @@
 	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S,
 		     owner_bit);
 
-	wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
 	switch (wr->opcode) {
 	case IB_WR_RDMA_READ:
 	case IB_WR_RDMA_WRITE:
@@ -451,7 +455,7 @@
 		rc_sq_wqe->inv_key = cpu_to_le32(wr->ex.invalidate_rkey);
 		break;
 	case IB_WR_REG_MR:
-		set_frmr_seg(rc_sq_wqe, wqe, reg_wr(wr));
+		set_frmr_seg(rc_sq_wqe, reg_wr(wr));
 		break;
 	case IB_WR_ATOMIC_CMP_AND_SWP:
 	case IB_WR_ATOMIC_FETCH_AND_ADD:
@@ -468,10 +472,10 @@
 
 	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 	    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-		set_atomic_seg(wr, wqe, rc_sq_wqe, valid_num_sge);
+		set_atomic_seg(wr, rc_sq_wqe, valid_num_sge);
 	else if (wr->opcode != IB_WR_REG_MR)
 		ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe,
-					wqe, &curr_idx, valid_num_sge);
+					&curr_idx, valid_num_sge);
 
 	*sge_idx = curr_idx;
 
@@ -1510,8 +1514,6 @@
 
 	req_a = (struct hns_roce_vf_res_a *)desc[0].data;
 	req_b = (struct hns_roce_vf_res_b *)desc[1].data;
-	memset(req_a, 0, sizeof(*req_a));
-	memset(req_b, 0, sizeof(*req_b));
 	for (i = 0; i < 2; i++) {
 		hns_roce_cmq_setup_basic_desc(&desc[i],
 					      HNS_ROCE_OPC_ALLOC_VF_RES, false);
@@ -1744,27 +1746,25 @@
 	caps->max_srq_wrs	= HNS_ROCE_V2_MAX_SRQ_WR;
 	caps->max_srq_sges	= HNS_ROCE_V2_MAX_SRQ_SGE;
 
-	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B) {
-		caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW |
-			       HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR |
-			       HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL;
+	caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW |
+		       HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR |
+		       HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL;
 
-		caps->num_qpc_timer	  = HNS_ROCE_V2_MAX_QPC_TIMER_NUM;
-		caps->qpc_timer_entry_sz  = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ;
-		caps->qpc_timer_ba_pg_sz  = 0;
-		caps->qpc_timer_buf_pg_sz = 0;
-		caps->qpc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
-		caps->num_cqc_timer	  = HNS_ROCE_V2_MAX_CQC_TIMER_NUM;
-		caps->cqc_timer_entry_sz  = HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ;
-		caps->cqc_timer_ba_pg_sz  = 0;
-		caps->cqc_timer_buf_pg_sz = 0;
-		caps->cqc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
+	caps->num_qpc_timer	  = HNS_ROCE_V2_MAX_QPC_TIMER_NUM;
+	caps->qpc_timer_entry_sz  = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ;
+	caps->qpc_timer_ba_pg_sz  = 0;
+	caps->qpc_timer_buf_pg_sz = 0;
+	caps->qpc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
+	caps->num_cqc_timer	  = HNS_ROCE_V2_MAX_CQC_TIMER_NUM;
+	caps->cqc_timer_entry_sz  = HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ;
+	caps->cqc_timer_ba_pg_sz  = 0;
+	caps->cqc_timer_buf_pg_sz = 0;
+	caps->cqc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
 
-		caps->sccc_entry_sz	  = HNS_ROCE_V2_SCCC_ENTRY_SZ;
-		caps->sccc_ba_pg_sz	  = 0;
-		caps->sccc_buf_pg_sz	  = 0;
-		caps->sccc_hop_num	  = HNS_ROCE_SCCC_HOP_NUM;
-	}
+	caps->sccc_entry_sz	  = HNS_ROCE_V2_SCCC_ENTRY_SZ;
+	caps->sccc_ba_pg_sz	  = 0;
+	caps->sccc_buf_pg_sz	  = 0;
+	caps->sccc_hop_num	  = HNS_ROCE_SCCC_HOP_NUM;
 }
 
 static void calc_pg_sz(int obj_num, int obj_size, int hop_num, int ctx_bt_num,
@@ -1995,20 +1995,18 @@
 		   caps->srqc_bt_num, &caps->srqc_buf_pg_sz,
 		   &caps->srqc_ba_pg_sz, HEM_TYPE_SRQC);
 
-	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B) {
-		caps->sccc_hop_num = ctx_hop_num;
-		caps->qpc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
-		caps->cqc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
+	caps->sccc_hop_num = ctx_hop_num;
+	caps->qpc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
+	caps->cqc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
 
-		calc_pg_sz(caps->num_qps, caps->sccc_entry_sz,
-			   caps->sccc_hop_num, caps->sccc_bt_num,
-			   &caps->sccc_buf_pg_sz, &caps->sccc_ba_pg_sz,
-			   HEM_TYPE_SCCC);
-		calc_pg_sz(caps->num_cqc_timer, caps->cqc_timer_entry_sz,
-			   caps->cqc_timer_hop_num, caps->cqc_timer_bt_num,
-			   &caps->cqc_timer_buf_pg_sz,
-			   &caps->cqc_timer_ba_pg_sz, HEM_TYPE_CQC_TIMER);
-	}
+	calc_pg_sz(caps->num_qps, caps->sccc_entry_sz,
+		   caps->sccc_hop_num, caps->sccc_bt_num,
+		   &caps->sccc_buf_pg_sz, &caps->sccc_ba_pg_sz,
+		   HEM_TYPE_SCCC);
+	calc_pg_sz(caps->num_cqc_timer, caps->cqc_timer_entry_sz,
+		   caps->cqc_timer_hop_num, caps->cqc_timer_bt_num,
+		   &caps->cqc_timer_buf_pg_sz,
+		   &caps->cqc_timer_ba_pg_sz, HEM_TYPE_CQC_TIMER);
 
 	calc_pg_sz(caps->num_cqe_segs, caps->mtt_entry_sz, caps->cqe_hop_num,
 		   1, &caps->cqe_buf_pg_sz, &caps->cqe_ba_pg_sz, HEM_TYPE_CQE);
@@ -2055,22 +2053,19 @@
 		return ret;
 	}
 
-	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B) {
-		ret = hns_roce_query_pf_timer_resource(hr_dev);
-		if (ret) {
-			dev_err(hr_dev->dev,
-				"Query pf timer resource fail, ret = %d.\n",
-				ret);
-			return ret;
-		}
+	ret = hns_roce_query_pf_timer_resource(hr_dev);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"failed to query pf timer resource, ret = %d.\n", ret);
+		return ret;
+	}
 
-		ret = hns_roce_set_vf_switch_param(hr_dev, 0);
-		if (ret) {
-			dev_err(hr_dev->dev,
-				"Set function switch param fail, ret = %d.\n",
-				ret);
-			return ret;
-		}
+	ret = hns_roce_set_vf_switch_param(hr_dev, 0);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"failed to set function switch param, ret = %d.\n",
+			ret);
+		return ret;
 	}
 
 	hr_dev->vendor_part_id = hr_dev->pci_dev->device;
@@ -2336,8 +2331,7 @@
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
-	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B)
-		hns_roce_function_clear(hr_dev);
+	hns_roce_function_clear(hr_dev);
 
 	hns_roce_free_link_table(hr_dev, &priv->tpq);
 	hns_roce_free_link_table(hr_dev, &priv->tsq);
@@ -3053,6 +3047,7 @@
 		  IB_WC_RETRY_EXC_ERR },
 		{ HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR },
 		{ HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR, IB_WC_REM_ABORT_ERR },
+		{ HNS_ROCE_CQE_V2_GENERAL_ERR, IB_WC_GENERAL_ERR}
 	};
 
 	u32 cqe_status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M,
@@ -3075,6 +3070,14 @@
 		       sizeof(*cqe), false);
 
 	/*
+	 * For hns ROCEE, GENERAL_ERR is an error type that is not defined in
+	 * the standard protocol; the driver must ignore it and need not set
+	 * the QP to an error state.
+	 */
+	if (cqe_status == HNS_ROCE_CQE_V2_GENERAL_ERR)
+		return;
+
+	/*
 	 * Hip08 hardware cannot flush the WQEs in SQ/RQ if the QP state gets
 	 * into errored mode. Hence, as a workaround to this hardware
 	 * limitation, driver needs to assist in flushing. But the flushing
@@ -3170,51 +3173,51 @@
 		/* SQ corresponding to CQE */
 		switch (roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_OPCODE_M,
 				       V2_CQE_BYTE_4_OPCODE_S) & 0x1f) {
-		case HNS_ROCE_SQ_OPCODE_SEND:
+		case HNS_ROCE_V2_WQE_OP_SEND:
 			wc->opcode = IB_WC_SEND;
 			break;
-		case HNS_ROCE_SQ_OPCODE_SEND_WITH_INV:
+		case HNS_ROCE_V2_WQE_OP_SEND_WITH_INV:
 			wc->opcode = IB_WC_SEND;
 			break;
-		case HNS_ROCE_SQ_OPCODE_SEND_WITH_IMM:
+		case HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM:
 			wc->opcode = IB_WC_SEND;
 			wc->wc_flags |= IB_WC_WITH_IMM;
 			break;
-		case HNS_ROCE_SQ_OPCODE_RDMA_READ:
+		case HNS_ROCE_V2_WQE_OP_RDMA_READ:
 			wc->opcode = IB_WC_RDMA_READ;
 			wc->byte_len = le32_to_cpu(cqe->byte_cnt);
 			break;
-		case HNS_ROCE_SQ_OPCODE_RDMA_WRITE:
+		case HNS_ROCE_V2_WQE_OP_RDMA_WRITE:
 			wc->opcode = IB_WC_RDMA_WRITE;
 			break;
-		case HNS_ROCE_SQ_OPCODE_RDMA_WRITE_WITH_IMM:
+		case HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM:
 			wc->opcode = IB_WC_RDMA_WRITE;
 			wc->wc_flags |= IB_WC_WITH_IMM;
 			break;
-		case HNS_ROCE_SQ_OPCODE_LOCAL_INV:
+		case HNS_ROCE_V2_WQE_OP_LOCAL_INV:
 			wc->opcode = IB_WC_LOCAL_INV;
 			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
 			break;
-		case HNS_ROCE_SQ_OPCODE_ATOMIC_COMP_AND_SWAP:
+		case HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP:
 			wc->opcode = IB_WC_COMP_SWAP;
 			wc->byte_len  = 8;
 			break;
-		case HNS_ROCE_SQ_OPCODE_ATOMIC_FETCH_AND_ADD:
+		case HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD:
 			wc->opcode = IB_WC_FETCH_ADD;
 			wc->byte_len  = 8;
 			break;
-		case HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_COMP_AND_SWAP:
+		case HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP:
 			wc->opcode = IB_WC_MASKED_COMP_SWAP;
 			wc->byte_len  = 8;
 			break;
-		case HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_FETCH_AND_ADD:
+		case HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD:
 			wc->opcode = IB_WC_MASKED_FETCH_ADD;
 			wc->byte_len  = 8;
 			break;
-		case HNS_ROCE_SQ_OPCODE_FAST_REG_WR:
+		case HNS_ROCE_V2_WQE_OP_FAST_REG_PMR:
 			wc->opcode = IB_WC_REG_MR;
 			break;
-		case HNS_ROCE_SQ_OPCODE_BIND_MW:
+		case HNS_ROCE_V2_WQE_OP_BIND_MW:
 			wc->opcode = IB_WC_REG_MR;
 			break;
 		default:
@@ -3374,11 +3377,33 @@
 	return op + step_idx;
 }
 
+static int set_hem_to_hw(struct hns_roce_dev *hr_dev, int obj, u64 bt_ba,
+			 u32 hem_type, int step_idx)
+{
+	struct hns_roce_cmd_mailbox *mailbox;
+	int ret;
+	int op;
+
+	op = get_op_for_set_hem(hr_dev, hem_type, step_idx);
+	if (op < 0)
+		return 0;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma, obj,
+				0, op, HNS_ROCE_CMD_TIMEOUT_MSECS);
+
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+	return ret;
+}
+
 static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_hem_table *table, int obj,
 			       int step_idx)
 {
-	struct hns_roce_cmd_mailbox *mailbox;
 	struct hns_roce_hem_iter iter;
 	struct hns_roce_hem_mhop mhop;
 	struct hns_roce_hem *hem;
@@ -3390,7 +3415,6 @@
 	u64 bt_ba = 0;
 	u32 chunk_ba_num;
 	u32 hop_num;
-	int op;
 
 	if (!hns_roce_check_whether_mhop(hr_dev, table->type))
 		return 0;
@@ -3412,14 +3436,6 @@
 		hem_idx = i;
 	}
 
-	op = get_op_for_set_hem(hr_dev, table->type, step_idx);
-	if (op == -EINVAL)
-		return 0;
-
-	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
-
 	if (table->type == HEM_TYPE_SCCC)
 		obj = mhop.l0_idx;
 
@@ -3428,11 +3444,8 @@
 		for (hns_roce_hem_first(hem, &iter);
 		     !hns_roce_hem_last(&iter); hns_roce_hem_next(&iter)) {
 			bt_ba = hns_roce_hem_addr(&iter);
-
-			/* configure the ba, tag, and op */
-			ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma,
-						obj, 0, op,
-						HNS_ROCE_CMD_TIMEOUT_MSECS);
+			ret = set_hem_to_hw(hr_dev, obj, bt_ba, table->type,
+					    step_idx);
 		}
 	} else {
 		if (step_idx == 0)
@@ -3440,12 +3453,9 @@
 		else if (step_idx == 1 && hop_num == 2)
 			bt_ba = table->bt_l1_dma_addr[l1_idx];
 
-		/* configure the ba, tag, and op */
-		ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma, obj,
-					0, op, HNS_ROCE_CMD_TIMEOUT_MSECS);
+		ret = set_hem_to_hw(hr_dev, obj, bt_ba, table->type, step_idx);
 	}
 
-	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 	return ret;
 }
 
@@ -3745,51 +3755,23 @@
 	}
 }
 
-static bool check_wqe_rq_mtt_count(struct hns_roce_dev *hr_dev,
-				   struct hns_roce_qp *hr_qp, int mtt_cnt,
-				   u32 page_size)
-{
-	struct ib_device *ibdev = &hr_dev->ib_dev;
-
-	if (hr_qp->rq.wqe_cnt < 1)
-		return true;
-
-	if (mtt_cnt < 1) {
-		ibdev_err(ibdev, "failed to find RQWQE buf ba of QP(0x%lx)\n",
-			  hr_qp->qpn);
-		return false;
-	}
-
-	if (mtt_cnt < MTT_MIN_COUNT &&
-		(hr_qp->rq.offset + page_size) < hr_qp->buff_size) {
-		ibdev_err(ibdev,
-			  "failed to find next RQWQE buf ba of QP(0x%lx)\n",
-			  hr_qp->qpn);
-		return false;
-	}
-
-	return true;
-}
-
 static int config_qp_rq_buf(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_qp *hr_qp,
 			    struct hns_roce_v2_qp_context *context,
 			    struct hns_roce_v2_qp_context *qpc_mask)
 {
-	struct ib_qp *ibqp = &hr_qp->ibqp;
 	u64 mtts[MTT_MIN_COUNT] = { 0 };
 	u64 wqe_sge_ba;
-	u32 page_size;
 	int count;
 
 	/* Search qp buf's mtts */
-	page_size = 1 << hr_qp->mtr.hem_cfg.buf_pg_shift;
-	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
-				  hr_qp->rq.offset / page_size, mtts,
+	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, mtts,
 				  MTT_MIN_COUNT, &wqe_sge_ba);
-	if (!ibqp->srq)
-		if (!check_wqe_rq_mtt_count(hr_dev, hr_qp, count, page_size))
-			return -EINVAL;
+	if (hr_qp->rq.wqe_cnt && count < 1) {
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to find RQ WQE, QPN = 0x%lx.\n", hr_qp->qpn);
+		return -EINVAL;
+	}
 
 	context->wqe_sge_ba = cpu_to_le32(wqe_sge_ba >> 3);
 	qpc_mask->wqe_sge_ba = 0;
@@ -3891,7 +3873,6 @@
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	u64 sge_cur_blk = 0;
 	u64 sq_cur_blk = 0;
-	u32 page_size;
 	int count;
 
 	/* search qp buf's mtts */
@@ -3902,9 +3883,8 @@
 		return -EINVAL;
 	}
 	if (hr_qp->sge.sge_cnt > 0) {
-		page_size = 1 << hr_qp->mtr.hem_cfg.buf_pg_shift;
 		count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
-					  hr_qp->sge.offset / page_size,
+					  hr_qp->sge.offset,
 					  &sge_cur_blk, 1, NULL);
 		if (count < 1) {
 			ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf.\n",
@@ -4265,12 +4245,13 @@
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
 		       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
 
-	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B && is_udp)
+	if (is_udp)
 		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
 			       V2_QPC_BYTE_24_TC_S, grh->traffic_class >> 2);
 	else
 		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
 			       V2_QPC_BYTE_24_TC_S, grh->traffic_class);
+
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
 		       V2_QPC_BYTE_24_TC_S, 0);
 	roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
@@ -4301,7 +4282,9 @@
 		[IB_QPS_RTR] = { [IB_QPS_RESET] = true,
 				 [IB_QPS_RTS] = true,
 				 [IB_QPS_ERR] = true },
-		[IB_QPS_RTS] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true },
+		[IB_QPS_RTS] = { [IB_QPS_RESET] = true,
+				 [IB_QPS_RTS] = true,
+				 [IB_QPS_ERR] = true },
 		[IB_QPS_SQD] = {},
 		[IB_QPS_SQE] = {},
 		[IB_QPS_ERR] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true }
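
The last hw_v2.c hunk extends the modify-QP transition table so an RTS->RTS transition is accepted. A hedged sketch of how such a designated-initializer table typically gates transitions (names are illustrative, not the driver's exact helper); unspecified entries default to false, i.e. the transition is rejected:

	#include <stdbool.h>

	enum qps { QPS_RESET, QPS_INIT, QPS_RTR, QPS_RTS, QPS_ERR, QPS_NUM };

	static const bool qp_trans_ok[QPS_NUM][QPS_NUM] = {
		[QPS_RTS] = { [QPS_RESET] = true, [QPS_RTS] = true, [QPS_ERR] = true },
		[QPS_ERR] = { [QPS_RESET] = true, [QPS_ERR] = true },
		/* other rows elided in this sketch */
	};

	static bool check_qp_transition(enum qps cur, enum qps next)
	{
		return qp_trans_ok[cur][next];
	}
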
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index e176b0a..1fb1c58 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -179,27 +179,11 @@
 	HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD	= 0x9,
 	HNS_ROCE_V2_WQE_OP_FAST_REG_PMR			= 0xa,
 	HNS_ROCE_V2_WQE_OP_LOCAL_INV			= 0xb,
-	HNS_ROCE_V2_WQE_OP_BIND_MW_TYPE			= 0xc,
+	HNS_ROCE_V2_WQE_OP_BIND_MW			= 0xc,
 	HNS_ROCE_V2_WQE_OP_MASK				= 0x1f,
 };
 
 enum {
-	HNS_ROCE_SQ_OPCODE_SEND = 0x0,
-	HNS_ROCE_SQ_OPCODE_SEND_WITH_INV = 0x1,
-	HNS_ROCE_SQ_OPCODE_SEND_WITH_IMM = 0x2,
-	HNS_ROCE_SQ_OPCODE_RDMA_WRITE = 0x3,
-	HNS_ROCE_SQ_OPCODE_RDMA_WRITE_WITH_IMM = 0x4,
-	HNS_ROCE_SQ_OPCODE_RDMA_READ = 0x5,
-	HNS_ROCE_SQ_OPCODE_ATOMIC_COMP_AND_SWAP = 0x6,
-	HNS_ROCE_SQ_OPCODE_ATOMIC_FETCH_AND_ADD = 0x7,
-	HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_COMP_AND_SWAP = 0x8,
-	HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_FETCH_AND_ADD = 0x9,
-	HNS_ROCE_SQ_OPCODE_FAST_REG_WR = 0xa,
-	HNS_ROCE_SQ_OPCODE_LOCAL_INV = 0xb,
-	HNS_ROCE_SQ_OPCODE_BIND_MW = 0xc,
-};
-
-enum {
 	/* rq operations */
 	HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM = 0x0,
 	HNS_ROCE_V2_OPCODE_SEND = 0x1,
@@ -230,6 +214,7 @@
 	HNS_ROCE_CQE_V2_TRANSPORT_RETRY_EXC_ERR		= 0x15,
 	HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR		= 0x16,
 	HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR		= 0x22,
+	HNS_ROCE_CQE_V2_GENERAL_ERR			= 0x23,
 
 	HNS_ROCE_V2_CQE_STATUS_MASK			= 0xff,
 };
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 50763cf..5907cfd 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -428,7 +428,7 @@
 	.destroy_ah = hns_roce_destroy_ah,
 	.destroy_cq = hns_roce_destroy_cq,
 	.disassociate_ucontext = hns_roce_disassociate_ucontext,
-	.fill_res_entry = hns_roce_fill_res_entry,
+	.fill_res_cq_entry = hns_roce_fill_res_cq_entry,
 	.get_dma_mr = hns_roce_get_dma_mr,
 	.get_link_layer = hns_roce_get_link_layer,
 	.get_port_immutable = hns_roce_port_immutable,
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 6b226a5..e5df388 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -415,7 +415,7 @@
 }
 
 struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-				u32 max_num_sg, struct ib_udata *udata)
+				u32 max_num_sg)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
 	struct device *dev = hr_dev->dev;
@@ -871,6 +871,15 @@
 	int err;
 	int i;
 
+	/*
+	 * Only use the first page address as root ba when hopnum is 0, since
+	 * the addresses of all pages are consecutive in this case.
+	 */
+	if (mtr->hem_cfg.is_direct) {
+		mtr->hem_cfg.root_ba = pages[0];
+		return 0;
+	}
+
 	for (i = 0; i < mtr->hem_cfg.region_count; i++) {
 		r = &mtr->hem_cfg.region[i];
 		if (r->offset + r->count > page_cnt) {
@@ -896,6 +905,8 @@
 int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
 {
+	struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
+	int start_index;
 	int mtt_count;
 	int total = 0;
 	__le64 *mtts;
@@ -907,26 +918,32 @@
 		goto done;
 
 	/* no mtt memory in direct mode, so just return the buffer address */
-	if (mtr->hem_cfg.is_direct) {
-		npage = offset;
-		for (total = 0; total < mtt_max; total++, npage++) {
-			addr = mtr->hem_cfg.root_ba +
-			       (npage << mtr->hem_cfg.buf_pg_shift);
+	if (cfg->is_direct) {
+		start_index = offset >> HNS_HW_PAGE_SHIFT;
+		for (mtt_count = 0; mtt_count < cfg->region_count &&
+		     total < mtt_max; mtt_count++) {
+			npage = cfg->region[mtt_count].offset;
+			if (npage < start_index)
+				continue;
 
+			addr = cfg->root_ba + (npage << HNS_HW_PAGE_SHIFT);
 			if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
 				mtt_buf[total] = to_hr_hw_page_addr(addr);
 			else
 				mtt_buf[total] = addr;
+
+			total++;
 		}
 
 		goto done;
 	}
 
+	start_index = offset >> cfg->buf_pg_shift;
 	left = mtt_max;
 	while (left > 0) {
 		mtt_count = 0;
 		mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
-						  offset + total,
+						  start_index + total,
 						  &mtt_count, NULL);
 		if (!mtts || !mtt_count)
 			goto done;
@@ -939,104 +956,136 @@
 
 done:
 	if (base_addr)
-		*base_addr = mtr->hem_cfg.root_ba;
+		*base_addr = cfg->root_ba;
 
 	return total;
 }
 
-/* convert buffer size to page index and page count */
-static unsigned int mtr_init_region(struct hns_roce_buf_attr *attr,
-				    int page_cnt,
-				    struct hns_roce_buf_region *regions,
-				    int region_cnt, unsigned int page_shift)
+static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_buf_attr *attr,
+			    struct hns_roce_hem_cfg *cfg,
+			    unsigned int *buf_page_shift)
 {
-	unsigned int page_size = 1 << page_shift;
-	int max_region = attr->region_count;
 	struct hns_roce_buf_region *r;
-	unsigned int i = 0;
-	int page_idx = 0;
+	unsigned int page_shift = 0;
+	int page_cnt = 0;
+	size_t buf_size;
+	int region_cnt;
 
-	for (; i < region_cnt && i < max_region && page_idx < page_cnt; i++) {
-		r = &regions[i];
-		r->hopnum = attr->region[i].hopnum == HNS_ROCE_HOP_NUM_0 ?
-			    0 : attr->region[i].hopnum;
-		r->offset = page_idx;
-		r->count = DIV_ROUND_UP(attr->region[i].size, page_size);
-		page_idx += r->count;
+	if (cfg->is_direct) {
+		buf_size = cfg->buf_pg_count << cfg->buf_pg_shift;
+		page_cnt = DIV_ROUND_UP(buf_size, HNS_HW_PAGE_SIZE);
+		/*
+		 * When the HEM buffer uses level-0 addressing, the page size
+		 * equals the buffer size, and the page size = 4K * 2^N.
+		 */
+		cfg->buf_pg_shift = HNS_HW_PAGE_SHIFT + order_base_2(page_cnt);
+		if (attr->region_count > 1) {
+			cfg->buf_pg_count = page_cnt;
+			page_shift = HNS_HW_PAGE_SHIFT;
+		} else {
+			cfg->buf_pg_count = 1;
+			page_shift = cfg->buf_pg_shift;
+			if (buf_size != 1 << page_shift) {
+				ibdev_err(&hr_dev->ib_dev,
+					  "failed to check direct size %zu shift %d.\n",
+					  buf_size, page_shift);
+				return -EINVAL;
+			}
+		}
+	} else {
+		page_shift = cfg->buf_pg_shift;
 	}
 
-	return i;
+	/* convert buffer size to page index and page count */
+	for (page_cnt = 0, region_cnt = 0; page_cnt < cfg->buf_pg_count &&
+	     region_cnt < attr->region_count &&
+	     region_cnt < ARRAY_SIZE(cfg->region); region_cnt++) {
+		r = &cfg->region[region_cnt];
+		r->offset = page_cnt;
+		buf_size = hr_hw_page_align(attr->region[region_cnt].size);
+		r->count = DIV_ROUND_UP(buf_size, 1 << page_shift);
+		page_cnt += r->count;
+		r->hopnum = to_hr_hem_hopnum(attr->region[region_cnt].hopnum,
+					     r->count);
+	}
+
+	if (region_cnt < 1) {
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to check mtr region count, pages = %d.\n",
+			  cfg->buf_pg_count);
+		return -ENOBUFS;
+	}
+
+	cfg->region_count = region_cnt;
+	*buf_page_shift = page_shift;
+
+	return page_cnt;
 }
 
 /**
  * hns_roce_mtr_create - Create hns memory translate region.
  *
  * @mtr: memory translate region
- * @init_attr: init attribute for creating mtr
- * @page_shift: page shift for multi-hop base address table
+ * @buf_attr: buffer attribute for creating mtr
+ * @ba_page_shift: page shift for multi-hop base address table
  * @udata: user space context, if it's NULL, means kernel space
  * @user_addr: userspace virtual address to start at
- * @buf_alloced: mtr has private buffer, true means need to alloc
  */
 int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 			struct hns_roce_buf_attr *buf_attr,
-			unsigned int page_shift, struct ib_udata *udata,
+			unsigned int ba_page_shift, struct ib_udata *udata,
 			unsigned long user_addr)
 {
+	struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
 	struct ib_device *ibdev = &hr_dev->ib_dev;
+	unsigned int buf_page_shift = 0;
 	dma_addr_t *pages = NULL;
-	int region_cnt = 0;
 	int all_pg_cnt;
 	int get_pg_cnt;
-	bool has_mtt;
-	int err = 0;
+	int ret = 0;
 
-	has_mtt = mtr_has_mtt(buf_attr);
+	/* if mtt is disabled, all pages must be in a contiguous address range */
+	cfg->is_direct = !mtr_has_mtt(buf_attr);
+
 	/* if buffer only need mtt, just init the hem cfg */
 	if (buf_attr->mtt_only) {
-		mtr->hem_cfg.buf_pg_shift = buf_attr->page_shift;
-		mtr->hem_cfg.buf_pg_count = mtr_bufs_size(buf_attr) >>
-					    buf_attr->page_shift;
+		cfg->buf_pg_shift = buf_attr->page_shift;
+		cfg->buf_pg_count = mtr_bufs_size(buf_attr) >>
+				    buf_attr->page_shift;
 		mtr->umem = NULL;
 		mtr->kmem = NULL;
 	} else {
-		err = mtr_alloc_bufs(hr_dev, mtr, buf_attr, !has_mtt, udata,
-				     user_addr);
-		if (err) {
-			ibdev_err(ibdev, "Failed to alloc mtr bufs, err %d\n",
-				  err);
-			return err;
+		ret = mtr_alloc_bufs(hr_dev, mtr, buf_attr, cfg->is_direct,
+				     udata, user_addr);
+		if (ret) {
+			ibdev_err(ibdev,
+				  "failed to alloc mtr bufs, ret = %d.\n", ret);
+			return ret;
 		}
 	}
 
-	/* alloc mtt memory */
-	all_pg_cnt = mtr->hem_cfg.buf_pg_count;
-	hns_roce_hem_list_init(&mtr->hem_list);
-	mtr->hem_cfg.is_direct = !has_mtt;
-	mtr->hem_cfg.ba_pg_shift = page_shift;
-	mtr->hem_cfg.region_count = 0;
-	region_cnt = mtr_init_region(buf_attr, all_pg_cnt,
-				     mtr->hem_cfg.region,
-				     ARRAY_SIZE(mtr->hem_cfg.region),
-				     mtr->hem_cfg.buf_pg_shift);
-	if (region_cnt < 1) {
-		err = -ENOBUFS;
-		ibdev_err(ibdev, "failed to init mtr region %d\n", region_cnt);
+	all_pg_cnt = mtr_init_buf_cfg(hr_dev, buf_attr, cfg, &buf_page_shift);
+	if (all_pg_cnt < 1) {
+		ret = -ENOBUFS;
+		ibdev_err(ibdev, "failed to init mtr buf cfg.\n");
 		goto err_alloc_bufs;
 	}
 
-	mtr->hem_cfg.region_count = region_cnt;
-
-	if (has_mtt) {
-		err = hns_roce_hem_list_request(hr_dev, &mtr->hem_list,
-						mtr->hem_cfg.region, region_cnt,
-						page_shift);
-		if (err) {
-			ibdev_err(ibdev, "Failed to request mtr hem, err %d\n",
-				  err);
+	hns_roce_hem_list_init(&mtr->hem_list);
+	if (!cfg->is_direct) {
+		ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list,
+						cfg->region, cfg->region_count,
+						ba_page_shift);
+		if (ret) {
+			ibdev_err(ibdev, "failed to request mtr hem, ret = %d.\n",
+				  ret);
 			goto err_alloc_bufs;
 		}
-		mtr->hem_cfg.root_ba = mtr->hem_list.root_ba;
+		cfg->root_ba = mtr->hem_list.root_ba;
+		cfg->ba_pg_shift = ba_page_shift;
+	} else {
+		cfg->ba_pg_shift = cfg->buf_pg_shift;
 	}
 
 	/* no buffer to map */
@@ -1046,31 +1095,26 @@
 	/* alloc a tmp array to store buffer's dma address */
 	pages = kvcalloc(all_pg_cnt, sizeof(dma_addr_t), GFP_KERNEL);
 	if (!pages) {
-		err = -ENOMEM;
-		ibdev_err(ibdev, "Failed to alloc mtr page list %d\n",
+		ret = -ENOMEM;
+		ibdev_err(ibdev, "failed to alloc mtr page list %d.\n",
 			  all_pg_cnt);
 		goto err_alloc_hem_list;
 	}
 
 	get_pg_cnt = mtr_get_pages(hr_dev, mtr, pages, all_pg_cnt,
-				   mtr->hem_cfg.buf_pg_shift);
+				   buf_page_shift);
 	if (get_pg_cnt != all_pg_cnt) {
-		ibdev_err(ibdev, "Failed to get mtr page %d != %d\n",
+		ibdev_err(ibdev, "failed to get mtr page %d != %d.\n",
 			  get_pg_cnt, all_pg_cnt);
-		err = -ENOBUFS;
+		ret = -ENOBUFS;
 		goto err_alloc_page_list;
 	}
 
-	if (!has_mtt) {
-		mtr->hem_cfg.root_ba = pages[0];
-	} else {
-		/* write buffer's dma address to BA table */
-		err = hns_roce_mtr_map(hr_dev, mtr, pages, all_pg_cnt);
-		if (err) {
-			ibdev_err(ibdev, "Failed to map mtr pages, err %d\n",
-				  err);
-			goto err_alloc_page_list;
-		}
+	/* write buffer's dma address to BA table */
+	ret = hns_roce_mtr_map(hr_dev, mtr, pages, all_pg_cnt);
+	if (ret) {
+		ibdev_err(ibdev, "failed to map mtr pages, ret = %d.\n", ret);
+		goto err_alloc_page_list;
 	}
 
 	/* drop tmp array */
@@ -1082,7 +1126,7 @@
 	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
 err_alloc_bufs:
 	mtr_free_bufs(hr_dev, mtr);
-	return err;
+	return ret;
 }
 
 void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
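
The mtr_init_buf_cfg() comment above notes that with level-0 (direct) addressing the hardware page size is grown to cover the whole buffer, i.e. 4K * 2^N. A small worked example of that shift computation, using local stand-ins for the driver's HNS_HW_PAGE_* constants (assumed here to be 4 KB pages):

	#include <stdio.h>

	#define HW_PAGE_SHIFT	12
	#define HW_PAGE_SIZE	(1UL << HW_PAGE_SHIFT)

	static unsigned int order_base_2(unsigned long n)
	{
		unsigned int order = 0;

		while ((1UL << order) < n)
			order++;
		return order;
	}

	int main(void)
	{
		unsigned long buf_size = 32 * 1024;	/* one contiguous 32 KB buffer */
		unsigned long page_cnt = (buf_size + HW_PAGE_SIZE - 1) / HW_PAGE_SIZE;
		unsigned int buf_pg_shift = HW_PAGE_SHIFT + order_base_2(page_cnt);

		/* 8 pages of 4 KB -> shift 12 + 3 = 15 -> one 32 KB "page" */
		printf("pages=%lu shift=%u page_size=%lu\n",
		       page_cnt, buf_pg_shift, 1UL << buf_pg_shift);
		return 0;
	}
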
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index a0a47bd..e94ca13 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -411,7 +411,6 @@
 				struct hns_roce_qp *hr_qp,
 				struct ib_qp_cap *cap)
 {
-	struct ib_device *ibdev = &hr_dev->ib_dev;
 	u32 cnt;
 
 	cnt = max(1U, cap->max_send_sge);
@@ -431,15 +430,6 @@
 	} else if (hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) {
 		cnt = roundup_pow_of_two(sq_wqe_cnt *
 				     (hr_qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE));
-
-		if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_A) {
-			if (cnt > hr_dev->caps.max_extend_sg) {
-				ibdev_err(ibdev,
-					  "failed to check exSGE num, exSGE num = %d.\n",
-					  cnt);
-				return -EINVAL;
-			}
-		}
 	} else {
 		cnt = 0;
 	}
diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c
index 0687173..259444c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_restrack.c
+++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c
@@ -76,10 +76,9 @@
 	return -EMSGSIZE;
 }
 
-static int hns_roce_fill_res_cq_entry(struct sk_buff *msg,
-				      struct rdma_restrack_entry *res)
+int hns_roce_fill_res_cq_entry(struct sk_buff *msg,
+			       struct ib_cq *ib_cq)
 {
-	struct ib_cq *ib_cq = container_of(res, struct ib_cq, res);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
 	struct hns_roce_v2_cq_context *context;
@@ -119,12 +118,3 @@
 	kfree(context);
 	return ret;
 }
-
-int hns_roce_fill_res_entry(struct sk_buff *msg,
-			    struct rdma_restrack_entry *res)
-{
-	if (res->type == RDMA_RESTRACK_CQ)
-		return hns_roce_fill_res_cq_entry(msg, res);
-
-	return 0;
-}
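
The restrack.c hunk drops the driver-side demux: the core's resource dump path now invokes a per-type hook (fill_res_cq_entry) directly with the ib_cq, instead of handing every driver a generic restrack entry to switch on. A hedged, self-contained sketch of that per-type-hook pattern, with simplified stand-in types rather than the rdma core's structs:

	#include <stdio.h>

	struct cq { int cqn; };
	struct msg_buf { char text[128]; };

	struct dev_ops {
		/* optional per-type hook: a driver adds device-specific CQ data */
		int (*fill_res_cq_entry)(struct msg_buf *msg, struct cq *cq);
	};

	static int core_fill_cq(const struct dev_ops *ops, struct msg_buf *msg,
				struct cq *cq)
	{
		if (!ops->fill_res_cq_entry)
			return 0;	/* no device-specific blob to add */
		return ops->fill_res_cq_entry(msg, cq);
	}

	static int demo_fill(struct msg_buf *msg, struct cq *cq)
	{
		return snprintf(msg->text, sizeof(msg->text), "cqn=%d", cq->cqn) < 0;
	}

	int main(void)
	{
		struct dev_ops ops = { .fill_res_cq_entry = demo_fill };
		struct msg_buf msg = { { 0 } };
		struct cq cq = { .cqn = 7 };

		core_fill_cq(&ops, &msg, &cq);
		printf("%s\n", msg.text);
		return 0;
	}
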
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 19af29a..6957e4f 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -101,7 +101,6 @@
 	props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
 		IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
 	props->gid_tbl_len = 1;
-	props->pkey_tbl_len = 1;
 	props->active_width = IB_WIDTH_4X;
 	props->active_speed = 1;
 	props->max_msg_sz = I40IW_MAX_OUTBOUND_MESSAGE_SIZE;
@@ -1543,10 +1542,9 @@
  * @pd: ibpd pointer
  * @mr_type: memory for stag registrion
  * @max_num_sg: man number of pages
- * @udata: user data or NULL for kernel objects
  */
 static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-				    u32 max_num_sg, struct ib_udata *udata)
+				    u32 max_num_sg)
 {
 	struct i40iw_pd *iwpd = to_iwpd(pd);
 	struct i40iw_device *iwdev = to_iwdev(pd->device);
@@ -2460,7 +2458,6 @@
 	if (err)
 		return err;
 
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
 	immutable->gid_tbl_len = attr.gid_tbl_len;
 
 	return 0;
@@ -2616,22 +2613,6 @@
 	return 0;
 }
 
-/**
- * i40iw_query_pkey - Query partition key
- * @ibdev: device pointer from stack
- * @port: port number
- * @index: index of pkey
- * @pkey: pointer to store the pkey
- */
-static int i40iw_query_pkey(struct ib_device *ibdev,
-			    u8 port,
-			    u16 index,
-			    u16 *pkey)
-{
-	*pkey = 0;
-	return 0;
-}
-
 static const struct ib_device_ops i40iw_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_I40IW,
@@ -2671,7 +2652,6 @@
 	.post_send = i40iw_post_send,
 	.query_device = i40iw_query_device,
 	.query_gid = i40iw_query_gid,
-	.query_pkey = i40iw_query_pkey,
 	.query_port = i40iw_query_port,
 	.query_qp = i40iw_query_qp,
 	.reg_user_mr = i40iw_reg_user_mr,
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 816d288..5e7910a 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1219,56 +1219,47 @@
 	mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
 }
 
-static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
-					  struct ib_udata *udata)
+static int mlx4_ib_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata)
 {
-	struct mlx4_ib_xrcd *xrcd;
+	struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
+	struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
 	struct ib_cq_init_attr cq_attr = {};
 	int err;
 
-	if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
-		return ERR_PTR(-ENOSYS);
+	if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+		return -EOPNOTSUPP;
 
-	xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL);
-	if (!xrcd)
-		return ERR_PTR(-ENOMEM);
-
-	err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn);
+	err = mlx4_xrcd_alloc(dev->dev, &xrcd->xrcdn);
 	if (err)
-		goto err1;
+		return err;
 
-	xrcd->pd = ib_alloc_pd(ibdev, 0);
+	xrcd->pd = ib_alloc_pd(ibxrcd->device, 0);
 	if (IS_ERR(xrcd->pd)) {
 		err = PTR_ERR(xrcd->pd);
 		goto err2;
 	}
 
 	cq_attr.cqe = 1;
-	xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, &cq_attr);
+	xrcd->cq = ib_create_cq(ibxrcd->device, NULL, NULL, xrcd, &cq_attr);
 	if (IS_ERR(xrcd->cq)) {
 		err = PTR_ERR(xrcd->cq);
 		goto err3;
 	}
 
-	return &xrcd->ibxrcd;
+	return 0;
 
 err3:
 	ib_dealloc_pd(xrcd->pd);
 err2:
-	mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn);
-err1:
-	kfree(xrcd);
-	return ERR_PTR(err);
+	mlx4_xrcd_free(dev->dev, xrcd->xrcdn);
+	return err;
 }
 
-static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
+static void mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
 	ib_destroy_cq(to_mxrcd(xrcd)->cq);
 	ib_dealloc_pd(to_mxrcd(xrcd)->pd);
 	mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn);
-	kfree(xrcd);
-
-	return 0;
 }
 
 static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
@@ -2607,6 +2598,8 @@
 static const struct ib_device_ops mlx4_ib_dev_xrc_ops = {
 	.alloc_xrcd = mlx4_ib_alloc_xrcd,
 	.dealloc_xrcd = mlx4_ib_dealloc_xrcd,
+
+	INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx4_ib_xrcd, ibxrcd),
 };
 
 static const struct ib_device_ops mlx4_ib_dev_fs_ops = {
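
The xrcd conversion above moves allocation ownership to the core: INIT_RDMA_OBJ_SIZE reports the driver container's size, so alloc_xrcd only initializes an already-allocated object and dealloc_xrcd returns void. A hedged sketch of the embedded-object pattern this relies on (simplified stand-in types, not the core's macros):

	#include <stddef.h>
	#include <stdlib.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct ib_xrcd { int dummy; };

	struct drv_xrcd {
		struct ib_xrcd ibxrcd;	/* embedded core object */
		int xrcdn;		/* driver-private state */
	};

	static int drv_alloc_xrcd(struct ib_xrcd *ibxrcd)
	{
		struct drv_xrcd *xrcd = container_of(ibxrcd, struct drv_xrcd, ibxrcd);

		xrcd->xrcdn = 42;	/* init only; no kmalloc()/kfree() in the driver */
		return 0;
	}

	int main(void)
	{
		/* the "core" owns the allocation of the whole container */
		struct drv_xrcd *xrcd = calloc(1, sizeof(*xrcd));

		if (!xrcd)
			return 1;
		drv_alloc_xrcd(&xrcd->ibxrcd);
		free(xrcd);
		return 0;
	}
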
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 6f4ea10..38e87a7 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -729,7 +729,7 @@
 			       struct ib_udata *udata);
 int mlx4_ib_dealloc_mw(struct ib_mw *mw);
 struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata);
+			       u32 max_num_sg);
 int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
 int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 7e0b205..1d5ef0d 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -439,7 +439,6 @@
 
 	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
 	mr->ibmr.length = length;
-	mr->ibmr.iova = virt_addr;
 	mr->ibmr.page_size = 1U << shift;
 
 	return &mr->ibmr;
@@ -655,7 +654,7 @@
 }
 
 struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata)
+			       u32 max_num_sg)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 	struct mlx4_ib_mr *mr;
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index 8cca61c..b4c009b 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -4,6 +4,7 @@
 mlx5_ib-y := ah.o \
 	     cmd.o \
 	     cong.o \
+	     counters.o \
 	     cq.o \
 	     doorbell.o \
 	     gsi.o \
@@ -22,5 +23,6 @@
 mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
 mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o
 mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \
-					    flow.o \
-					    qos.o
+					    fs.o \
+					    qos.o \
+					    std_types.o
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index cc24c71..ebb2f10 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -148,18 +148,6 @@
 	spin_unlock(&dm->lock);
 }
 
-int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out)
-{
-	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
-	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
-
-	MLX5_SET(ppcnt_reg, in, local_port, 1);
-
-	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
-	return  mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT,
-				     0, 0);
-}
-
 void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {};
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index f4d8558..1d192a8 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -41,7 +41,6 @@
 int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey);
 int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
 			       void *out);
-int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out);
 int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
 			 u64 length, u32 alignment);
 void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length);
diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c
new file mode 100644
index 0000000..145f3cb
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/counters.c
@@ -0,0 +1,709 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include "mlx5_ib.h"
+#include <linux/mlx5/eswitch.h>
+#include "counters.h"
+#include "ib_rep.h"
+#include "qp.h"
+
+struct mlx5_ib_counter {
+	const char *name;
+	size_t offset;
+};
+
+#define INIT_Q_COUNTER(_name)		\
+	{ .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
+
+static const struct mlx5_ib_counter basic_q_cnts[] = {
+	INIT_Q_COUNTER(rx_write_requests),
+	INIT_Q_COUNTER(rx_read_requests),
+	INIT_Q_COUNTER(rx_atomic_requests),
+	INIT_Q_COUNTER(out_of_buffer),
+};
+
+static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
+	INIT_Q_COUNTER(out_of_sequence),
+};
+
+static const struct mlx5_ib_counter retrans_q_cnts[] = {
+	INIT_Q_COUNTER(duplicate_request),
+	INIT_Q_COUNTER(rnr_nak_retry_err),
+	INIT_Q_COUNTER(packet_seq_err),
+	INIT_Q_COUNTER(implied_nak_seq_err),
+	INIT_Q_COUNTER(local_ack_timeout_err),
+};
+
+#define INIT_CONG_COUNTER(_name)		\
+	{ .name = #_name, .offset =	\
+		MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
+
+static const struct mlx5_ib_counter cong_cnts[] = {
+	INIT_CONG_COUNTER(rp_cnp_ignored),
+	INIT_CONG_COUNTER(rp_cnp_handled),
+	INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
+	INIT_CONG_COUNTER(np_cnp_sent),
+};
+
+static const struct mlx5_ib_counter extended_err_cnts[] = {
+	INIT_Q_COUNTER(resp_local_length_error),
+	INIT_Q_COUNTER(resp_cqe_error),
+	INIT_Q_COUNTER(req_cqe_error),
+	INIT_Q_COUNTER(req_remote_invalid_request),
+	INIT_Q_COUNTER(req_remote_access_errors),
+	INIT_Q_COUNTER(resp_remote_access_errors),
+	INIT_Q_COUNTER(resp_cqe_flush_error),
+	INIT_Q_COUNTER(req_cqe_flush_error),
+};
+
+static const struct mlx5_ib_counter roce_accl_cnts[] = {
+	INIT_Q_COUNTER(roce_adp_retrans),
+	INIT_Q_COUNTER(roce_adp_retrans_to),
+	INIT_Q_COUNTER(roce_slow_restart),
+	INIT_Q_COUNTER(roce_slow_restart_cnps),
+	INIT_Q_COUNTER(roce_slow_restart_trans),
+};
+
+#define INIT_EXT_PPCNT_COUNTER(_name)		\
+	{ .name = #_name, .offset =	\
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
+
+static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
+	INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
+};
+
+static int mlx5_ib_read_counters(struct ib_counters *counters,
+				 struct ib_counters_read_attr *read_attr,
+				 struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+	struct mlx5_read_counters_attr mread_attr = {};
+	struct mlx5_ib_flow_counters_desc *desc;
+	int ret, i;
+
+	mutex_lock(&mcounters->mcntrs_mutex);
+	if (mcounters->cntrs_max_index > read_attr->ncounters) {
+		ret = -EINVAL;
+		goto err_bound;
+	}
+
+	mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
+				 GFP_KERNEL);
+	if (!mread_attr.out) {
+		ret = -ENOMEM;
+		goto err_bound;
+	}
+
+	mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
+	mread_attr.flags = read_attr->flags;
+	ret = mcounters->read_counters(counters->device, &mread_attr);
+	if (ret)
+		goto err_read;
+
+	/* walk the counters data array and assign each value according to its
+	 * description and index pair
+	 */
+	desc = mcounters->counters_data;
+	for (i = 0; i < mcounters->ncounters; i++)
+		read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
+
+err_read:
+	kfree(mread_attr.out);
+err_bound:
+	mutex_unlock(&mcounters->mcntrs_mutex);
+	return ret;
+}
+
+static void mlx5_ib_destroy_counters(struct ib_counters *counters)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+
+	mlx5_ib_counters_clear_description(counters);
+	if (mcounters->hw_cntrs_hndl)
+		mlx5_fc_destroy(to_mdev(counters->device)->mdev,
+				mcounters->hw_cntrs_hndl);
+}
+
+static int mlx5_ib_create_counters(struct ib_counters *counters,
+				   struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+
+	mutex_init(&mcounters->mcntrs_mutex);
+	return 0;
+}
+
+
+static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
+{
+	return MLX5_ESWITCH_MANAGER(mdev) &&
+	       mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
+		       MLX5_ESWITCH_OFFLOADS;
+}
+
+static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
+						   u8 port_num)
+{
+	return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
+						   &dev->port[port_num].cnts;
+}
+
+/**
+ * mlx5_ib_get_counters_id - Returns counters id to use for device+port
+ * @dev:	Pointer to mlx5 IB device
+ * @port_num:	Zero based port number
+ *
+ * mlx5_ib_get_counters_id() returns the counters set id to use for the
+ * given device and port combination, in both switchdev and non-switchdev
+ * modes of the parent device.
+ */
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
+{
+	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
+
+	return cnts->set_id;
+}
+
+static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
+						    u8 port_num)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	const struct mlx5_ib_counters *cnts;
+	bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
+
+	if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
+		return NULL;
+
+	cnts = get_counters(dev, port_num - 1);
+
+	return rdma_alloc_hw_stats_struct(cnts->names,
+					  cnts->num_q_counters +
+					  cnts->num_cong_counters +
+					  cnts->num_ext_ppcnt_counters,
+					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
+				    const struct mlx5_ib_counters *cnts,
+				    struct rdma_hw_stats *stats,
+				    u16 set_id)
+{
+	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {};
+	__be32 val;
+	int ret, i;
+
+	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
+	MLX5_SET(query_q_counter_in, in, counter_set_id, set_id);
+	ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < cnts->num_q_counters; i++) {
+		val = *(__be32 *)((void *)out + cnts->offsets[i]);
+		stats->value[i] = (u64)be32_to_cpu(val);
+	}
+
+	return 0;
+}
+
+static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
+					    const struct mlx5_ib_counters *cnts,
+					    struct rdma_hw_stats *stats)
+{
+	int offset = cnts->num_q_counters + cnts->num_cong_counters;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	int ret, i;
+	void *out;
+
+	out = kvzalloc(sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
+	ret = mlx5_core_access_reg(dev->mdev, in, sz, out, sz, MLX5_REG_PPCNT,
+				   0, 0);
+	if (ret)
+		goto free;
+
+	for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
+		stats->value[i + offset] =
+			be64_to_cpup((__be64 *)(out +
+				    cnts->offsets[i + offset]));
+free:
+	kvfree(out);
+	return ret;
+}
+
+static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
+				struct rdma_hw_stats *stats,
+				u8 port_num, int index)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
+	struct mlx5_core_dev *mdev;
+	int ret, num_counters;
+	u8 mdev_port_num;
+
+	if (!stats)
+		return -EINVAL;
+
+	num_counters = cnts->num_q_counters +
+		       cnts->num_cong_counters +
+		       cnts->num_ext_ppcnt_counters;
+
+	/* q_counters are per IB device, query the master mdev */
+	ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
+	if (ret)
+		return ret;
+
+	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
+		ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
+		if (ret)
+			return ret;
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+		mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
+						    &mdev_port_num);
+		if (!mdev) {
+			/* If the port is not affiliated yet, it is in a down
+			 * state that has no counters yet, so they would all be
+			 * zero and there is no need to read from the HCA.
+			 */
+			goto done;
+		}
+		ret = mlx5_lag_query_cong_counters(dev->mdev,
+						   stats->value +
+						   cnts->num_q_counters,
+						   cnts->num_cong_counters,
+						   cnts->offsets +
+						   cnts->num_q_counters);
+
+		mlx5_ib_put_native_port_mdev(dev, port_num);
+		if (ret)
+			return ret;
+	}
+
+done:
+	return num_counters;
+}
+
+static struct rdma_hw_stats *
+mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+	const struct mlx5_ib_counters *cnts =
+		get_counters(dev, counter->port - 1);
+
+	return rdma_alloc_hw_stats_struct(cnts->names,
+					  cnts->num_q_counters +
+					  cnts->num_cong_counters +
+					  cnts->num_ext_ppcnt_counters,
+					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+	const struct mlx5_ib_counters *cnts =
+		get_counters(dev, counter->port - 1);
+
+	return mlx5_ib_query_q_counters(dev->mdev, cnts,
+					counter->stats, counter->id);
+}
+
+static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
+
+	if (!counter->id)
+		return 0;
+
+	MLX5_SET(dealloc_q_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
+	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id);
+	return mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
+}
+
+static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
+				   struct ib_qp *qp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	int err;
+
+	if (!counter->id) {
+		u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
+		u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
+
+		MLX5_SET(alloc_q_counter_in, in, opcode,
+			 MLX5_CMD_OP_ALLOC_Q_COUNTER);
+		MLX5_SET(alloc_q_counter_in, in, uid, MLX5_SHARED_RESOURCE_UID);
+		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
+		if (err)
+			return err;
+		counter->id =
+			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
+	}
+
+	err = mlx5_ib_qp_set_counter(qp, counter);
+	if (err)
+		goto fail_set_counter;
+
+	return 0;
+
+fail_set_counter:
+	mlx5_ib_counter_dealloc(counter);
+	counter->id = 0;
+
+	return err;
+}
+
+static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
+{
+	return mlx5_ib_qp_set_counter(qp, NULL);
+}
+
+
+static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
+				  const char **names,
+				  size_t *offsets)
+{
+	int i;
+	int j = 0;
+
+	for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
+		names[j] = basic_q_cnts[i].name;
+		offsets[j] = basic_q_cnts[i].offset;
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
+		for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
+			names[j] = out_of_seq_q_cnts[i].name;
+			offsets[j] = out_of_seq_q_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
+		for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
+			names[j] = retrans_q_cnts[i].name;
+			offsets[j] = retrans_q_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
+		for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
+			names[j] = extended_err_cnts[i].name;
+			offsets[j] = extended_err_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, roce_accl)) {
+		for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) {
+			names[j] = roce_accl_cnts[i].name;
+			offsets[j] = roce_accl_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+		for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
+			names[j] = cong_cnts[i].name;
+			offsets[j] = cong_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
+		for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
+			names[j] = ext_ppcnt_cnts[i].name;
+			offsets[j] = ext_ppcnt_cnts[i].offset;
+		}
+	}
+}
+
+
+static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
+				    struct mlx5_ib_counters *cnts)
+{
+	u32 num_counters;
+
+	num_counters = ARRAY_SIZE(basic_q_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
+		num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
+		num_counters += ARRAY_SIZE(retrans_q_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
+		num_counters += ARRAY_SIZE(extended_err_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, roce_accl))
+		num_counters += ARRAY_SIZE(roce_accl_cnts);
+
+	cnts->num_q_counters = num_counters;
+
+	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+		cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
+		num_counters += ARRAY_SIZE(cong_cnts);
+	}
+	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
+		cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
+		num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
+	}
+	cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL);
+	if (!cnts->names)
+		return -ENOMEM;
+
+	cnts->offsets = kcalloc(num_counters,
+				sizeof(cnts->offsets), GFP_KERNEL);
+	if (!cnts->offsets)
+		goto err_names;
+
+	return 0;
+
+err_names:
+	kfree(cnts->names);
+	cnts->names = NULL;
+	return -ENOMEM;
+}
+
+static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
+	int num_cnt_ports;
+	int i;
+
+	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
+
+	MLX5_SET(dealloc_q_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
+
+	for (i = 0; i < num_cnt_ports; i++) {
+		if (dev->port[i].cnts.set_id) {
+			MLX5_SET(dealloc_q_counter_in, in, counter_set_id,
+				 dev->port[i].cnts.set_id);
+			mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
+		}
+		kfree(dev->port[i].cnts.names);
+		kfree(dev->port[i].cnts.offsets);
+	}
+}
+
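+/*
+ * Allocate the counter arrays and a hardware queue counter set for each
+ * counter port; on failure everything allocated so far is torn down.
+ */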
+static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
+	int num_cnt_ports;
+	int err = 0;
+	int i;
+	bool is_shared;
+
+	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
+	is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
+	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
+
+	for (i = 0; i < num_cnt_ports; i++) {
+		err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
+		if (err)
+			goto err_alloc;
+
+		mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
+				      dev->port[i].cnts.offsets);
+
+		MLX5_SET(alloc_q_counter_in, in, uid,
+			 is_shared ? MLX5_SHARED_RESOURCE_UID : 0);
+
+		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
+		if (err) {
+			mlx5_ib_warn(dev,
+				     "couldn't allocate queue counter for port %d, err %d\n",
+				     i + 1, err);
+			goto err_alloc;
+		}
+
+		dev->port[i].cnts.set_id =
+			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
+	}
+	return 0;
+
+err_alloc:
+	mlx5_ib_dealloc_counters(dev);
+	return err;
+}
+
+static int read_flow_counters(struct ib_device *ibdev,
+			      struct mlx5_read_counters_attr *read_attr)
+{
+	struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+
+	return mlx5_fc_query(dev->mdev, fc,
+			     &read_attr->out[IB_COUNTER_PACKETS],
+			     &read_attr->out[IB_COUNTER_BYTES]);
+}
+
+/* flow counters currently expose two counters: packets and bytes */
+#define FLOW_COUNTERS_NUM 2
+static int counters_set_description(
+	struct ib_counters *counters, enum mlx5_ib_counters_type counters_type,
+	struct mlx5_ib_flow_counters_desc *desc_data, u32 ncounters)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+	u32 cntrs_max_index = 0;
+	int i;
+
+	if (counters_type != MLX5_IB_COUNTERS_FLOW)
+		return -EINVAL;
+
+	/* init the fields for the object */
+	mcounters->type = counters_type;
+	mcounters->read_counters = read_flow_counters;
+	mcounters->counters_num = FLOW_COUNTERS_NUM;
+	mcounters->ncounters = ncounters;
+	/* each counter entry has both a description and an index pair */
+	for (i = 0; i < ncounters; i++) {
+		if (desc_data[i].description > IB_COUNTER_BYTES)
+			return -EINVAL;
+
+		if (cntrs_max_index <= desc_data[i].index)
+			cntrs_max_index = desc_data[i].index + 1;
+	}
+
+	mutex_lock(&mcounters->mcntrs_mutex);
+	mcounters->counters_data = desc_data;
+	mcounters->cntrs_max_index = cntrs_max_index;
+	mutex_unlock(&mcounters->mcntrs_mutex);
+
+	return 0;
+}
+
+#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
+int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters,
+				   struct mlx5_ib_create_flow *ucmd)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
+	struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
+	struct mlx5_ib_flow_counters_desc *desc_data = NULL;
+	bool hw_hndl = false;
+	int ret = 0;
+
+	if (ucmd && ucmd->ncounters_data != 0) {
+		cntrs_data = ucmd->data;
+		if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
+			return -EINVAL;
+
+		desc_data = kcalloc(cntrs_data->ncounters,
+				    sizeof(*desc_data),
+				    GFP_KERNEL);
+		if (!desc_data)
+			return  -ENOMEM;
+
+		if (copy_from_user(desc_data,
+				   u64_to_user_ptr(cntrs_data->counters_data),
+				   sizeof(*desc_data) * cntrs_data->ncounters)) {
+			ret = -EFAULT;
+			goto free;
+		}
+	}
+
+	if (!mcounters->hw_cntrs_hndl) {
+		mcounters->hw_cntrs_hndl = mlx5_fc_create(
+			to_mdev(ibcounters->device)->mdev, false);
+		if (IS_ERR(mcounters->hw_cntrs_hndl)) {
+			ret = PTR_ERR(mcounters->hw_cntrs_hndl);
+			goto free;
+		}
+		hw_hndl = true;
+	}
+
+	if (desc_data) {
+		/* counters already bound to at least one flow */
+		if (mcounters->cntrs_max_index) {
+			ret = -EINVAL;
+			goto free_hndl;
+		}
+
+		ret = counters_set_description(ibcounters,
+					       MLX5_IB_COUNTERS_FLOW,
+					       desc_data,
+					       cntrs_data->ncounters);
+		if (ret)
+			goto free_hndl;
+
+	} else if (!mcounters->cntrs_max_index) {
+		/* counters not bound yet, must have udata passed */
+		ret = -EINVAL;
+		goto free_hndl;
+	}
+
+	return 0;
+
+free_hndl:
+	if (hw_hndl) {
+		mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
+				mcounters->hw_cntrs_hndl);
+		mcounters->hw_cntrs_hndl = NULL;
+	}
+free:
+	kfree(desc_data);
+	return ret;
+}
+
+void mlx5_ib_counters_clear_description(struct ib_counters *counters)
+{
+	struct mlx5_ib_mcounters *mcounters;
+
+	if (!counters || atomic_read(&counters->usecnt) != 1)
+		return;
+
+	mcounters = to_mcounters(counters);
+
+	mutex_lock(&mcounters->mcntrs_mutex);
+	kfree(mcounters->counters_data);
+	mcounters->counters_data = NULL;
+	mcounters->cntrs_max_index = 0;
+	mutex_unlock(&mcounters->mcntrs_mutex);
+}
+
+static const struct ib_device_ops hw_stats_ops = {
+	.alloc_hw_stats = mlx5_ib_alloc_hw_stats,
+	.get_hw_stats = mlx5_ib_get_hw_stats,
+	.counter_bind_qp = mlx5_ib_counter_bind_qp,
+	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
+	.counter_dealloc = mlx5_ib_counter_dealloc,
+	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
+	.counter_update_stats = mlx5_ib_counter_update_stats,
+};
+
+static const struct ib_device_ops counters_ops = {
+	.create_counters = mlx5_ib_create_counters,
+	.destroy_counters = mlx5_ib_destroy_counters,
+	.read_counters = mlx5_ib_read_counters,
+
+	INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
+};
+
+int mlx5_ib_counters_init(struct mlx5_ib_dev *dev)
+{
+	ib_set_device_ops(&dev->ib_dev, &counters_ops);
+
+	if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
+		return 0;
+
+	ib_set_device_ops(&dev->ib_dev, &hw_stats_ops);
+	return mlx5_ib_alloc_counters(dev);
+}
+
+void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev)
+{
+	if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
+		return;
+
+	mlx5_ib_dealloc_counters(dev);
+}
diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h
new file mode 100644
index 0000000..1aa30c2
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/counters.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_COUNTERS_H
+#define _MLX5_IB_COUNTERS_H
+
+#include "mlx5_ib.h"
+
+int mlx5_ib_counters_init(struct mlx5_ib_dev *dev);
+void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev);
+void mlx5_ib_counters_clear_description(struct ib_counters *counters);
+int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters,
+				   struct mlx5_ib_create_flow *ucmd);
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num);
+#endif /* _MLX5_IB_COUNTERS_H */
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 655ea9c..9e3d8b82 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -14,6 +14,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
+#include "devx.h"
 #include "qp.h"
 #include <linux/xarray.h>
 
@@ -89,22 +90,6 @@
 	u8 is_destroyed:1;
 };
 
-#define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
-struct devx_obj {
-	struct mlx5_ib_dev	*ib_dev;
-	u64			obj_id;
-	u32			dinlen; /* destroy inbox length */
-	u32			dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
-	u32			flags;
-	union {
-		struct mlx5_ib_devx_mr	devx_mr;
-		struct mlx5_core_dct	core_dct;
-		struct mlx5_core_cq	core_cq;
-		u32			flow_counter_bulk_size;
-	};
-	struct list_head event_sub; /* holds devx_event_subscription entries */
-};
-
 struct devx_umem {
 	struct mlx5_core_dev		*mdev;
 	struct ib_umem			*umem;
@@ -171,48 +156,6 @@
 	mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
 }
 
-bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type)
-{
-	struct devx_obj *devx_obj = obj;
-	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
-
-	switch (opcode) {
-	case MLX5_CMD_OP_DESTROY_TIR:
-		*dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
-		*dest_id = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox,
-				    obj_id);
-		return true;
-
-	case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
-		*dest_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
-		*dest_id = MLX5_GET(destroy_flow_table_in, devx_obj->dinbox,
-				    table_id);
-		return true;
-	default:
-		return false;
-	}
-}
-
-bool mlx5_ib_devx_is_flow_counter(void *obj, u32 offset, u32 *counter_id)
-{
-	struct devx_obj *devx_obj = obj;
-	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
-
-	if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) {
-
-		if (offset && offset >= devx_obj->flow_counter_bulk_size)
-			return false;
-
-		*counter_id = MLX5_GET(dealloc_flow_counter_in,
-				       devx_obj->dinbox,
-				       flow_counter_id);
-		*counter_id += offset;
-		return true;
-	}
-
-	return false;
-}
-
 static bool is_legacy_unaffiliated_event_num(u16 event_num)
 {
 	switch (event_num) {
@@ -2419,17 +2362,24 @@
 	return NOTIFY_OK;
 }
 
-void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
+int mlx5_ib_devx_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_devx_event_table *table = &dev->devx_event_table;
+	int uid;
 
-	xa_init(&table->event_xa);
-	mutex_init(&table->event_xa_lock);
-	MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY);
-	mlx5_eq_notifier_register(dev->mdev, &table->devx_nb);
+	uid = mlx5_ib_devx_create(dev, false);
+	if (uid > 0) {
+		dev->devx_whitelist_uid = uid;
+		xa_init(&table->event_xa);
+		mutex_init(&table->event_xa_lock);
+		MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY);
+		mlx5_eq_notifier_register(dev->mdev, &table->devx_nb);
+	}
+
+	return 0;
 }
 
-void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
+void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_devx_event_table *table = &dev->devx_event_table;
 	struct devx_event_subscription *sub, *tmp;
@@ -2437,17 +2387,21 @@
 	void *entry;
 	unsigned long id;
 
-	mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb);
-	mutex_lock(&dev->devx_event_table.event_xa_lock);
-	xa_for_each(&table->event_xa, id, entry) {
-		event = entry;
-		list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
-					 xa_list)
-			devx_cleanup_subscription(dev, sub);
-		kfree(entry);
+	if (dev->devx_whitelist_uid) {
+		mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb);
+		mutex_lock(&dev->devx_event_table.event_xa_lock);
+		xa_for_each(&table->event_xa, id, entry) {
+			event = entry;
+			list_for_each_entry_safe(
+				sub, tmp, &event->unaffiliated_list, xa_list)
+				devx_cleanup_subscription(dev, sub);
+			kfree(entry);
+		}
+		mutex_unlock(&dev->devx_event_table.event_xa_lock);
+		xa_destroy(&table->event_xa);
+
+		mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
 	}
-	mutex_unlock(&dev->devx_event_table.event_xa_lock);
-	xa_destroy(&table->event_xa);
 }
 
 static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h
new file mode 100644
index 0000000..1f69866
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/devx.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2019-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_DEVX_H
+#define _MLX5_IB_DEVX_H
+
+#include "mlx5_ib.h"
+
+#define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
+struct devx_obj {
+	struct mlx5_ib_dev	*ib_dev;
+	u64			obj_id;
+	u32			dinlen; /* destroy inbox length */
+	u32			dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
+	u32			flags;
+	union {
+		struct mlx5_ib_devx_mr	devx_mr;
+		struct mlx5_core_dct	core_dct;
+		struct mlx5_core_cq	core_cq;
+		u32			flow_counter_bulk_size;
+	};
+	struct list_head event_sub; /* holds devx_event_subscription entries */
+};
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
+void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
+int mlx5_ib_devx_init(struct mlx5_ib_dev *dev);
+void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev);
+#else
+static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
+{
+	return -EOPNOTSUPP;
+}
+static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {}
+static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev)
+{
+	return 0;
+}
+static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev)
+{
+}
+#endif
+#endif /* _MLX5_IB_DEVX_H */
diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
deleted file mode 100644
index 216a110..0000000
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ /dev/null
@@ -1,765 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
-/*
- * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
- */
-
-#include <rdma/ib_user_verbs.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/uverbs_types.h>
-#include <rdma/uverbs_ioctl.h>
-#include <rdma/uverbs_std_types.h>
-#include <rdma/mlx5_user_ioctl_cmds.h>
-#include <rdma/mlx5_user_ioctl_verbs.h>
-#include <rdma/ib_umem.h>
-#include <linux/mlx5/driver.h>
-#include <linux/mlx5/fs.h>
-#include "mlx5_ib.h"
-
-#define UVERBS_MODULE_NAME mlx5_ib
-#include <rdma/uverbs_named_ioctl.h>
-
-static int
-mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
-			     enum mlx5_flow_namespace_type *namespace)
-{
-	switch (table_type) {
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX:
-		*namespace = MLX5_FLOW_NAMESPACE_BYPASS;
-		break;
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX:
-		*namespace = MLX5_FLOW_NAMESPACE_EGRESS;
-		break;
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB:
-		*namespace = MLX5_FLOW_NAMESPACE_FDB;
-		break;
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX:
-		*namespace = MLX5_FLOW_NAMESPACE_RDMA_RX;
-		break;
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX:
-		*namespace = MLX5_FLOW_NAMESPACE_RDMA_TX;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static const struct uverbs_attr_spec mlx5_ib_flow_type[] = {
-	[MLX5_IB_FLOW_TYPE_NORMAL] = {
-		.type = UVERBS_ATTR_TYPE_PTR_IN,
-		.u.ptr = {
-			.len = sizeof(u16), /* data is priority */
-			.min_len = sizeof(u16),
-		}
-	},
-	[MLX5_IB_FLOW_TYPE_SNIFFER] = {
-		.type = UVERBS_ATTR_TYPE_PTR_IN,
-		UVERBS_ATTR_NO_DATA(),
-	},
-	[MLX5_IB_FLOW_TYPE_ALL_DEFAULT] = {
-		.type = UVERBS_ATTR_TYPE_PTR_IN,
-		UVERBS_ATTR_NO_DATA(),
-	},
-	[MLX5_IB_FLOW_TYPE_MC_DEFAULT] = {
-		.type = UVERBS_ATTR_TYPE_PTR_IN,
-		UVERBS_ATTR_NO_DATA(),
-	},
-};
-
-static int get_dests(struct uverbs_attr_bundle *attrs,
-		     struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id,
-		     int *dest_type, struct ib_qp **qp, u32 *flags)
-{
-	bool dest_devx, dest_qp;
-	void *devx_obj;
-	int err;
-
-	dest_devx = uverbs_attr_is_valid(attrs,
-					 MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
-	dest_qp = uverbs_attr_is_valid(attrs,
-				       MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
-
-	*flags = 0;
-	err = uverbs_get_flags32(flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_FLAGS,
-				 MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS |
-					 MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP);
-	if (err)
-		return err;
-
-	/* Both flags are not allowed */
-	if (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS &&
-	    *flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)
-		return -EINVAL;
-
-	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
-		if (dest_devx && (dest_qp || *flags))
-			return -EINVAL;
-		else if (dest_qp && *flags)
-			return -EINVAL;
-	}
-
-	/* Allow only DEVX object, drop as dest for FDB */
-	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !(dest_devx ||
-	     (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)))
-		return -EINVAL;
-
-	/* Allow only DEVX object or QP as dest when inserting to RDMA_RX */
-	if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
-	    ((!dest_devx && !dest_qp) || (dest_devx && dest_qp)))
-		return -EINVAL;
-
-	*qp = NULL;
-	if (dest_devx) {
-		devx_obj =
-			uverbs_attr_get_obj(attrs,
-					    MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
-
-		/* Verify that the given DEVX object is a flow
-		 * steering destination.
-		 */
-		if (!mlx5_ib_devx_is_flow_dest(devx_obj, dest_id, dest_type))
-			return -EINVAL;
-		/* Allow only flow table as dest when inserting to FDB or RDMA_RX */
-		if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB ||
-		     fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
-		    *dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
-			return -EINVAL;
-	} else if (dest_qp) {
-		struct mlx5_ib_qp *mqp;
-
-		*qp = uverbs_attr_get_obj(attrs,
-					  MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
-		if (IS_ERR(*qp))
-			return PTR_ERR(*qp);
-
-		if ((*qp)->qp_type != IB_QPT_RAW_PACKET)
-			return -EINVAL;
-
-		mqp = to_mqp(*qp);
-		if (mqp->is_rss)
-			*dest_id = mqp->rss_qp.tirn;
-		else
-			*dest_id = mqp->raw_packet_qp.rq.tirn;
-		*dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
-	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
-		*dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
-	}
-
-	if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
-	    fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
-		return -EINVAL;
-
-	return 0;
-}
-
-#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2
-static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
-	struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_flow_context flow_context = {.flow_tag =
-		MLX5_FS_DEFAULT_FLOW_TAG};
-	u32 *offset_attr, offset = 0, counter_id = 0;
-	int dest_id, dest_type, inlen, len, ret, i;
-	struct mlx5_ib_flow_handler *flow_handler;
-	struct mlx5_ib_flow_matcher *fs_matcher;
-	struct ib_uobject **arr_flow_actions;
-	struct ib_uflow_resources *uflow_res;
-	struct mlx5_flow_act flow_act = {};
-	struct ib_qp *qp = NULL;
-	void *devx_obj, *cmd_in;
-	struct ib_uobject *uobj;
-	struct mlx5_ib_dev *dev;
-	u32 flags;
-
-	if (!capable(CAP_NET_RAW))
-		return -EPERM;
-
-	fs_matcher = uverbs_attr_get_obj(attrs,
-					 MLX5_IB_ATTR_CREATE_FLOW_MATCHER);
-	uobj =  uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE);
-	dev = mlx5_udata_to_mdev(&attrs->driver_udata);
-
-	if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &flags))
-		return -EINVAL;
-
-	if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS)
-		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS;
-
-	if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)
-		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
-
-	len = uverbs_attr_get_uobjs_arr(attrs,
-		MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
-	if (len) {
-		devx_obj = arr_flow_actions[0]->object;
-
-		if (uverbs_attr_is_valid(attrs,
-					 MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET)) {
-
-			int num_offsets = uverbs_attr_ptr_get_array_size(
-				attrs,
-				MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET,
-				sizeof(u32));
-
-			if (num_offsets != 1)
-				return -EINVAL;
-
-			offset_attr = uverbs_attr_get_alloced_ptr(
-				attrs,
-				MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET);
-			offset = *offset_attr;
-		}
-
-		if (!mlx5_ib_devx_is_flow_counter(devx_obj, offset,
-						  &counter_id))
-			return -EINVAL;
-
-		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
-	}
-
-	cmd_in = uverbs_attr_get_alloced_ptr(
-		attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
-	inlen = uverbs_attr_get_len(attrs,
-				    MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
-
-	uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS);
-	if (!uflow_res)
-		return -ENOMEM;
-
-	len = uverbs_attr_get_uobjs_arr(attrs,
-		MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions);
-	for (i = 0; i < len; i++) {
-		struct mlx5_ib_flow_action *maction =
-			to_mflow_act(arr_flow_actions[i]->object);
-
-		ret = parse_flow_flow_action(maction, false, &flow_act);
-		if (ret)
-			goto err_out;
-		flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE,
-				   arr_flow_actions[i]->object);
-	}
-
-	ret = uverbs_copy_from(&flow_context.flow_tag, attrs,
-			       MLX5_IB_ATTR_CREATE_FLOW_TAG);
-	if (!ret) {
-		if (flow_context.flow_tag >= BIT(24)) {
-			ret = -EINVAL;
-			goto err_out;
-		}
-		flow_context.flags |= FLOW_CONTEXT_HAS_TAG;
-	}
-
-	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher,
-					       &flow_context,
-					       &flow_act,
-					       counter_id,
-					       cmd_in, inlen,
-					       dest_id, dest_type);
-	if (IS_ERR(flow_handler)) {
-		ret = PTR_ERR(flow_handler);
-		goto err_out;
-	}
-
-	ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res);
-
-	return 0;
-err_out:
-	ib_uverbs_flow_resources_free(uflow_res);
-	return ret;
-}
-
-static int flow_matcher_cleanup(struct ib_uobject *uobject,
-				enum rdma_remove_reason why,
-				struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_flow_matcher *obj = uobject->object;
-	int ret;
-
-	ret = ib_destroy_usecnt(&obj->usecnt, why, uobject);
-	if (ret)
-		return ret;
-
-	kfree(obj);
-	return 0;
-}
-
-static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
-			      struct mlx5_ib_flow_matcher *obj)
-{
-	enum mlx5_ib_uapi_flow_table_type ft_type =
-		MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX;
-	u32 flags;
-	int err;
-
-	/* New users should use MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE and older
-	 * users should switch to it. We leave this to not break userspace
-	 */
-	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE) &&
-	    uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS))
-		return -EINVAL;
-
-	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE)) {
-		err = uverbs_get_const(&ft_type, attrs,
-				       MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE);
-		if (err)
-			return err;
-
-		err = mlx5_ib_ft_type_to_namespace(ft_type, &obj->ns_type);
-		if (err)
-			return err;
-
-		return 0;
-	}
-
-	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS)) {
-		err = uverbs_get_flags32(&flags, attrs,
-					 MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
-					 IB_FLOW_ATTR_FLAGS_EGRESS);
-		if (err)
-			return err;
-
-		if (flags) {
-			mlx5_ib_ft_type_to_namespace(
-				MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX,
-				&obj->ns_type);
-			return 0;
-		}
-	}
-
-	obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
-
-	return 0;
-}
-
-static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
-	struct uverbs_attr_bundle *attrs)
-{
-	struct ib_uobject *uobj = uverbs_attr_get_uobject(
-		attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE);
-	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	struct mlx5_ib_flow_matcher *obj;
-	int err;
-
-	obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL);
-	if (!obj)
-		return -ENOMEM;
-
-	obj->mask_len = uverbs_attr_get_len(
-		attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
-	err = uverbs_copy_from(&obj->matcher_mask,
-			       attrs,
-			       MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
-	if (err)
-		goto end;
-
-	obj->flow_type = uverbs_attr_get_enum_id(
-		attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE);
-
-	if (obj->flow_type == MLX5_IB_FLOW_TYPE_NORMAL) {
-		err = uverbs_copy_from(&obj->priority,
-				       attrs,
-				       MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE);
-		if (err)
-			goto end;
-	}
-
-	err = uverbs_copy_from(&obj->match_criteria_enable,
-			       attrs,
-			       MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA);
-	if (err)
-		goto end;
-
-	err = mlx5_ib_matcher_ns(attrs, obj);
-	if (err)
-		goto end;
-
-	uobj->object = obj;
-	obj->mdev = dev->mdev;
-	atomic_set(&obj->usecnt, 0);
-	return 0;
-
-end:
-	kfree(obj);
-	return err;
-}
-
-void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
-{
-	switch (maction->flow_action_raw.sub_type) {
-	case MLX5_IB_FLOW_ACTION_MODIFY_HEADER:
-		mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev,
-					   maction->flow_action_raw.modify_hdr);
-		break;
-	case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT:
-		mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev,
-					     maction->flow_action_raw.pkt_reformat);
-		break;
-	case MLX5_IB_FLOW_ACTION_DECAP:
-		break;
-	default:
-		break;
-	}
-}
-
-static struct ib_flow_action *
-mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
-			     enum mlx5_ib_uapi_flow_table_type ft_type,
-			     u8 num_actions, void *in)
-{
-	enum mlx5_flow_namespace_type namespace;
-	struct mlx5_ib_flow_action *maction;
-	int ret;
-
-	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
-	if (ret)
-		return ERR_PTR(-EINVAL);
-
-	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
-	if (!maction)
-		return ERR_PTR(-ENOMEM);
-
-	maction->flow_action_raw.modify_hdr =
-		mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in);
-
-	if (IS_ERR(maction->flow_action_raw.modify_hdr)) {
-		ret = PTR_ERR(maction->flow_action_raw.modify_hdr);
-		kfree(maction);
-		return ERR_PTR(ret);
-	}
-	maction->flow_action_raw.sub_type =
-		MLX5_IB_FLOW_ACTION_MODIFY_HEADER;
-	maction->flow_action_raw.dev = dev;
-
-	return &maction->ib_action;
-}
-
-static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev)
-{
-	return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-					 max_modify_header_actions) ||
-	       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
-					 max_modify_header_actions) ||
-	       MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev,
-					 max_modify_header_actions);
-}
-
-static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
-	struct uverbs_attr_bundle *attrs)
-{
-	struct ib_uobject *uobj = uverbs_attr_get_uobject(
-		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE);
-	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	enum mlx5_ib_uapi_flow_table_type ft_type;
-	struct ib_flow_action *action;
-	int num_actions;
-	void *in;
-	int ret;
-
-	if (!mlx5_ib_modify_header_supported(mdev))
-		return -EOPNOTSUPP;
-
-	in = uverbs_attr_get_alloced_ptr(attrs,
-		MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
-
-	num_actions = uverbs_attr_ptr_get_array_size(
-		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
-		MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto));
-	if (num_actions < 0)
-		return num_actions;
-
-	ret = uverbs_get_const(&ft_type, attrs,
-			       MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE);
-	if (ret)
-		return ret;
-	action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in);
-	if (IS_ERR(action))
-		return PTR_ERR(action);
-
-	uverbs_flow_action_fill_action(action, uobj, &mdev->ib_dev,
-				       IB_FLOW_ACTION_UNSPECIFIED);
-
-	return 0;
-}
-
-static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev,
-						      u8 packet_reformat_type,
-						      u8 ft_type)
-{
-	switch (packet_reformat_type) {
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
-		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
-			return MLX5_CAP_FLOWTABLE(ibdev->mdev,
-						  encap_general_header);
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
-		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
-			return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev,
-				reformat_l2_to_l3_tunnel);
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
-		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
-			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev,
-				reformat_l3_tunnel_to_l2);
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2:
-		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
-			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap);
-		break;
-	default:
-		break;
-	}
-
-	return false;
-}
-
-static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt)
-{
-	switch (dv_prt) {
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
-		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL;
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
-		*prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
-		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int mlx5_ib_flow_action_create_packet_reformat_ctx(
-	struct mlx5_ib_dev *dev,
-	struct mlx5_ib_flow_action *maction,
-	u8 ft_type, u8 dv_prt,
-	void *in, size_t len)
-{
-	enum mlx5_flow_namespace_type namespace;
-	u8 prm_prt;
-	int ret;
-
-	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
-	if (ret)
-		return ret;
-
-	ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt);
-	if (ret)
-		return ret;
-
-	maction->flow_action_raw.pkt_reformat =
-		mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
-					   in, namespace);
-	if (IS_ERR(maction->flow_action_raw.pkt_reformat)) {
-		ret = PTR_ERR(maction->flow_action_raw.pkt_reformat);
-		return ret;
-	}
-
-	maction->flow_action_raw.sub_type =
-		MLX5_IB_FLOW_ACTION_PACKET_REFORMAT;
-	maction->flow_action_raw.dev = dev;
-
-	return 0;
-}
-
-static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)(
-	struct uverbs_attr_bundle *attrs)
-{
-	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
-		MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE);
-	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt;
-	enum mlx5_ib_uapi_flow_table_type ft_type;
-	struct mlx5_ib_flow_action *maction;
-	int ret;
-
-	ret = uverbs_get_const(&ft_type, attrs,
-			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE);
-	if (ret)
-		return ret;
-
-	ret = uverbs_get_const(&dv_prt, attrs,
-			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE);
-	if (ret)
-		return ret;
-
-	if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type))
-		return -EOPNOTSUPP;
-
-	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
-	if (!maction)
-		return -ENOMEM;
-
-	if (dv_prt ==
-	    MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) {
-		maction->flow_action_raw.sub_type =
-			MLX5_IB_FLOW_ACTION_DECAP;
-		maction->flow_action_raw.dev = mdev;
-	} else {
-		void *in;
-		int len;
-
-		in = uverbs_attr_get_alloced_ptr(attrs,
-			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
-		if (IS_ERR(in)) {
-			ret = PTR_ERR(in);
-			goto free_maction;
-		}
-
-		len = uverbs_attr_get_len(attrs,
-			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
-
-		ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev,
-			maction, ft_type, dv_prt, in, len);
-		if (ret)
-			goto free_maction;
-	}
-
-	uverbs_flow_action_fill_action(&maction->ib_action, uobj, &mdev->ib_dev,
-				       IB_FLOW_ACTION_UNSPECIFIED);
-	return 0;
-
-free_maction:
-	kfree(maction);
-	return ret;
-}
-
-DECLARE_UVERBS_NAMED_METHOD(
-	MLX5_IB_METHOD_CREATE_FLOW,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
-			UVERBS_OBJECT_FLOW,
-			UVERBS_ACCESS_NEW,
-			UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(
-		MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE,
-		UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)),
-		UA_MANDATORY,
-		UA_ALLOC_AND_COPY),
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_MATCHER,
-			MLX5_IB_OBJECT_FLOW_MATCHER,
-			UVERBS_ACCESS_READ,
-			UA_MANDATORY),
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_QP,
-			UVERBS_OBJECT_QP,
-			UVERBS_ACCESS_READ),
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX,
-			MLX5_IB_OBJECT_DEVX_OBJ,
-			UVERBS_ACCESS_READ),
-	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS,
-			     UVERBS_OBJECT_FLOW_ACTION,
-			     UVERBS_ACCESS_READ, 1,
-			     MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS,
-			     UA_OPTIONAL),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG,
-			   UVERBS_ATTR_TYPE(u32),
-			   UA_OPTIONAL),
-	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
-			     MLX5_IB_OBJECT_DEVX_OBJ,
-			     UVERBS_ACCESS_READ, 1, 1,
-			     UA_OPTIONAL),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET,
-			   UVERBS_ATTR_MIN_SIZE(sizeof(u32)),
-			   UA_OPTIONAL,
-			   UA_ALLOC_AND_COPY),
-	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_FLAGS,
-			     enum mlx5_ib_create_flow_flags,
-			     UA_OPTIONAL));
-
-DECLARE_UVERBS_NAMED_METHOD_DESTROY(
-	MLX5_IB_METHOD_DESTROY_FLOW,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
-			UVERBS_OBJECT_FLOW,
-			UVERBS_ACCESS_DESTROY,
-			UA_MANDATORY));
-
-ADD_UVERBS_METHODS(mlx5_ib_fs,
-		   UVERBS_OBJECT_FLOW,
-		   &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW),
-		   &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW));
-
-DECLARE_UVERBS_NAMED_METHOD(
-	MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE,
-			UVERBS_OBJECT_FLOW_ACTION,
-			UVERBS_ACCESS_NEW,
-			UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
-			   UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES(
-				   set_add_copy_action_in_auto)),
-			   UA_MANDATORY,
-			   UA_ALLOC_AND_COPY),
-	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE,
-			     enum mlx5_ib_uapi_flow_table_type,
-			     UA_MANDATORY));
-
-DECLARE_UVERBS_NAMED_METHOD(
-	MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE,
-			UVERBS_OBJECT_FLOW_ACTION,
-			UVERBS_ACCESS_NEW,
-			UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF,
-			   UVERBS_ATTR_MIN_SIZE(1),
-			   UA_ALLOC_AND_COPY,
-			   UA_OPTIONAL),
-	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE,
-			     enum mlx5_ib_uapi_flow_action_packet_reformat_type,
-			     UA_MANDATORY),
-	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE,
-			     enum mlx5_ib_uapi_flow_table_type,
-			     UA_MANDATORY));
-
-ADD_UVERBS_METHODS(
-	mlx5_ib_flow_actions,
-	UVERBS_OBJECT_FLOW_ACTION,
-	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER),
-	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT));
-
-DECLARE_UVERBS_NAMED_METHOD(
-	MLX5_IB_METHOD_FLOW_MATCHER_CREATE,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE,
-			MLX5_IB_OBJECT_FLOW_MATCHER,
-			UVERBS_ACCESS_NEW,
-			UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(
-		MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK,
-		UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)),
-		UA_MANDATORY),
-	UVERBS_ATTR_ENUM_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE,
-			    mlx5_ib_flow_type,
-			    UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA,
-			   UVERBS_ATTR_TYPE(u8),
-			   UA_MANDATORY),
-	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
-			     enum ib_flow_flags,
-			     UA_OPTIONAL),
-	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE,
-			     enum mlx5_ib_uapi_flow_table_type,
-			     UA_OPTIONAL));
-
-DECLARE_UVERBS_NAMED_METHOD_DESTROY(
-	MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE,
-			MLX5_IB_OBJECT_FLOW_MATCHER,
-			UVERBS_ACCESS_DESTROY,
-			UA_MANDATORY));
-
-DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
-			    UVERBS_TYPE_ALLOC_IDR(flow_matcher_cleanup),
-			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
-			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
-
-const struct uapi_definition mlx5_ib_flow_defs[] = {
-	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
-		MLX5_IB_OBJECT_FLOW_MATCHER),
-	UAPI_DEF_CHAIN_OBJ_TREE(
-		UVERBS_OBJECT_FLOW,
-		&mlx5_ib_fs),
-	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
-				&mlx5_ib_flow_actions),
-	{},
-};
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
new file mode 100644
index 0000000..e9cfb9a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -0,0 +1,2516 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/uverbs_std_types.h>
+#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
+#include <rdma/ib_umem.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/fs.h>
+#include <linux/mlx5/fs_helpers.h>
+#include <linux/mlx5/accel.h>
+#include <linux/mlx5/eswitch.h>
+#include "mlx5_ib.h"
+#include "counters.h"
+#include "devx.h"
+#include "fs.h"
+
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+enum {
+	MATCH_CRITERIA_ENABLE_OUTER_BIT,
+	MATCH_CRITERIA_ENABLE_MISC_BIT,
+	MATCH_CRITERIA_ENABLE_INNER_BIT,
+	MATCH_CRITERIA_ENABLE_MISC2_BIT
+};
+
+#define HEADER_IS_ZERO(match_criteria, headers)			           \
+	!(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
+		    0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
+
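+/* Build the match_criteria_enable bitmask from the non-zero header groups */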
+static u8 get_match_criteria_enable(u32 *match_criteria)
+{
+	u8 match_criteria_enable;
+
+	match_criteria_enable =
+		(!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
+		MATCH_CRITERIA_ENABLE_OUTER_BIT;
+	match_criteria_enable |=
+		(!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
+		MATCH_CRITERIA_ENABLE_MISC_BIT;
+	match_criteria_enable |=
+		(!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
+		MATCH_CRITERIA_ENABLE_INNER_BIT;
+	match_criteria_enable |=
+		(!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
+		MATCH_CRITERIA_ENABLE_MISC2_BIT;
+
+	return match_criteria_enable;
+}
+
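+/*
+ * Set the ip_protocol match field, failing if a conflicting protocol match
+ * has already been programmed.
+ */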
+static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
+{
+	u8 entry_mask;
+	u8 entry_val;
+	int err = 0;
+
+	if (!mask)
+		goto out;
+
+	entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
+			      ip_protocol);
+	entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
+			     ip_protocol);
+	if (!entry_mask) {
+		MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
+		MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
+		goto out;
+	}
+	/* Don't override existing ip protocol */
+	if (mask != entry_mask || val != entry_val)
+		err = -EINVAL;
+out:
+	return err;
+}
+
+static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
+			   bool inner)
+{
+	if (inner) {
+		MLX5_SET(fte_match_set_misc,
+			 misc_c, inner_ipv6_flow_label, mask);
+		MLX5_SET(fte_match_set_misc,
+			 misc_v, inner_ipv6_flow_label, val);
+	} else {
+		MLX5_SET(fte_match_set_misc,
+			 misc_c, outer_ipv6_flow_label, mask);
+		MLX5_SET(fte_match_set_misc,
+			 misc_v, outer_ipv6_flow_label, val);
+	}
+}
+
+static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
+{
+	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
+	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
+	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
+	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
+}
+
+static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask)
+{
+	if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) &&
+	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL))
+		return -EOPNOTSUPP;
+
+	if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) &&
+	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP))
+		return -EOPNOTSUPP;
+
+	if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) &&
+	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS))
+		return -EOPNOTSUPP;
+
+	if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) &&
+	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+#define LAST_ETH_FIELD vlan_tag
+#define LAST_IB_FIELD sl
+#define LAST_IPV4_FIELD tos
+#define LAST_IPV6_FIELD traffic_class
+#define LAST_TCP_UDP_FIELD src_port
+#define LAST_TUNNEL_FIELD tunnel_id
+#define LAST_FLOW_TAG_FIELD tag_id
+#define LAST_DROP_FIELD size
+#define LAST_COUNTERS_FIELD counters
+
+/* Field is the last supported field */
+#define FIELDS_NOT_SUPPORTED(filter, field)\
+	memchr_inv((void *)&filter.field  +\
+		   sizeof(filter.field), 0,\
+		   sizeof(filter) -\
+		   offsetof(typeof(filter), field) -\
+		   sizeof(filter.field))
+
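+/* Translate an ib_flow_action into the matching mlx5_flow_act action bits */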
+int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
+			   bool is_egress,
+			   struct mlx5_flow_act *action)
+{
+
+	switch (maction->ib_action.type) {
+	case IB_FLOW_ACTION_ESP:
+		if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+				      MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
+			return -EINVAL;
+		/* Currently only AES_GCM keymat is supported by the driver */
+		action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
+		action->action |= is_egress ?
+			MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
+			MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
+		return 0;
+	case IB_FLOW_ACTION_UNSPECIFIED:
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
+			if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+				return -EINVAL;
+			action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+			action->modify_hdr =
+				maction->flow_action_raw.modify_hdr;
+			return 0;
+		}
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_DECAP) {
+			if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
+				return -EINVAL;
+			action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
+			return 0;
+		}
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
+			if (action->action &
+			    MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
+				return -EINVAL;
+			action->action |=
+				MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+			action->pkt_reformat =
+				maction->flow_action_raw.pkt_reformat;
+			return 0;
+		}
+		fallthrough;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
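+/*
+ * Translate a single ib_flow_spec into mlx5 match criteria/values and flow
+ * actions.
+ */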
+static int parse_flow_attr(struct mlx5_core_dev *mdev,
+			   struct mlx5_flow_spec *spec,
+			   const union ib_flow_spec *ib_spec,
+			   const struct ib_flow_attr *flow_attr,
+			   struct mlx5_flow_act *action, u32 prev_type)
+{
+	struct mlx5_flow_context *flow_context = &spec->flow_context;
+	u32 *match_c = spec->match_criteria;
+	u32 *match_v = spec->match_value;
+	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					   misc_parameters);
+	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					   misc_parameters);
+	void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					    misc_parameters_2);
+	void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					    misc_parameters_2);
+	void *headers_c;
+	void *headers_v;
+	int match_ipv;
+	int ret;
+
+	if (ib_spec->type & IB_FLOW_SPEC_INNER) {
+		headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					 inner_headers);
+		headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					 inner_headers);
+		match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					ft_field_support.inner_ip_version);
+	} else {
+		headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					 outer_headers);
+		headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					 outer_headers);
+		match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					ft_field_support.outer_ip_version);
+	}
+
+	switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
+	case IB_FLOW_SPEC_ETH:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
+			return -EOPNOTSUPP;
+
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+					     dmac_47_16),
+				ib_spec->eth.mask.dst_mac);
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+					     dmac_47_16),
+				ib_spec->eth.val.dst_mac);
+
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+					     smac_47_16),
+				ib_spec->eth.mask.src_mac);
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+					     smac_47_16),
+				ib_spec->eth.val.src_mac);
+
+		if (ib_spec->eth.mask.vlan_tag) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 cvlan_tag, 1);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 cvlan_tag, 1);
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 first_vid, ntohs(ib_spec->eth.val.vlan_tag));
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 first_cfi,
+				 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 first_cfi,
+				 ntohs(ib_spec->eth.val.vlan_tag) >> 12);
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 first_prio,
+				 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 first_prio,
+				 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
+		}
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+			 ethertype, ntohs(ib_spec->eth.mask.ether_type));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+			 ethertype, ntohs(ib_spec->eth.val.ether_type));
+		break;
+	case IB_FLOW_SPEC_IPV4:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
+			return -EOPNOTSUPP;
+
+		if (match_ipv) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 ip_version, 0xf);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 ip_version, MLX5_FS_IPV4_VERSION);
+		} else {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 ethertype, 0xffff);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 ethertype, ETH_P_IP);
+		}
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       &ib_spec->ipv4.mask.src_ip,
+		       sizeof(ib_spec->ipv4.mask.src_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       &ib_spec->ipv4.val.src_ip,
+		       sizeof(ib_spec->ipv4.val.src_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       &ib_spec->ipv4.mask.dst_ip,
+		       sizeof(ib_spec->ipv4.mask.dst_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       &ib_spec->ipv4.val.dst_ip,
+		       sizeof(ib_spec->ipv4.val.dst_ip));
+
+		set_tos(headers_c, headers_v,
+			ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
+
+		if (set_proto(headers_c, headers_v,
+			      ib_spec->ipv4.mask.proto,
+			      ib_spec->ipv4.val.proto))
+			return -EINVAL;
+		break;
+	case IB_FLOW_SPEC_IPV6:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
+			return -EOPNOTSUPP;
+
+		if (match_ipv) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 ip_version, 0xf);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 ip_version, MLX5_FS_IPV6_VERSION);
+		} else {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 ethertype, 0xffff);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 ethertype, ETH_P_IPV6);
+		}
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &ib_spec->ipv6.mask.src_ip,
+		       sizeof(ib_spec->ipv6.mask.src_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &ib_spec->ipv6.val.src_ip,
+		       sizeof(ib_spec->ipv6.val.src_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &ib_spec->ipv6.mask.dst_ip,
+		       sizeof(ib_spec->ipv6.mask.dst_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &ib_spec->ipv6.val.dst_ip,
+		       sizeof(ib_spec->ipv6.val.dst_ip));
+
+		set_tos(headers_c, headers_v,
+			ib_spec->ipv6.mask.traffic_class,
+			ib_spec->ipv6.val.traffic_class);
+
+		if (set_proto(headers_c, headers_v,
+			      ib_spec->ipv6.mask.next_hdr,
+			      ib_spec->ipv6.val.next_hdr))
+			return -EINVAL;
+
+		set_flow_label(misc_params_c, misc_params_v,
+			       ntohl(ib_spec->ipv6.mask.flow_label),
+			       ntohl(ib_spec->ipv6.val.flow_label),
+			       ib_spec->type & IB_FLOW_SPEC_INNER);
+		break;
+	case IB_FLOW_SPEC_ESP:
+		if (ib_spec->esp.mask.seq)
+			return -EOPNOTSUPP;
+
+		MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi,
+			 ntohl(ib_spec->esp.mask.spi));
+		MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
+			 ntohl(ib_spec->esp.val.spi));
+		break;
+	case IB_FLOW_SPEC_TCP:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
+					 LAST_TCP_UDP_FIELD))
+			return -EOPNOTSUPP;
+
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
+			return -EINVAL;
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
+			 ntohs(ib_spec->tcp_udp.mask.src_port));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
+			 ntohs(ib_spec->tcp_udp.val.src_port));
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
+			 ntohs(ib_spec->tcp_udp.mask.dst_port));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
+			 ntohs(ib_spec->tcp_udp.val.dst_port));
+		break;
+	case IB_FLOW_SPEC_UDP:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
+					 LAST_TCP_UDP_FIELD))
+			return -EOPNOTSUPP;
+
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
+			return -EINVAL;
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
+			 ntohs(ib_spec->tcp_udp.mask.src_port));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
+			 ntohs(ib_spec->tcp_udp.val.src_port));
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
+			 ntohs(ib_spec->tcp_udp.mask.dst_port));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
+			 ntohs(ib_spec->tcp_udp.val.dst_port));
+		break;
+	case IB_FLOW_SPEC_GRE:
+		if (ib_spec->gre.mask.c_ks_res0_ver)
+			return -EOPNOTSUPP;
+
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
+			return -EINVAL;
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
+			 0xff);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
+			 IPPROTO_GRE);
+
+		MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol,
+			 ntohs(ib_spec->gre.mask.protocol));
+		MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol,
+			 ntohs(ib_spec->gre.val.protocol));
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
+				    gre_key.nvgre.hi),
+		       &ib_spec->gre.mask.key,
+		       sizeof(ib_spec->gre.mask.key));
+		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
+				    gre_key.nvgre.hi),
+		       &ib_spec->gre.val.key,
+		       sizeof(ib_spec->gre.val.key));
+		break;
+	case IB_FLOW_SPEC_MPLS:
+		switch (prev_type) {
+		case IB_FLOW_SPEC_UDP:
+			if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+						   ft_field_support.outer_first_mpls_over_udp),
+						   &ib_spec->mpls.mask.tag))
+				return -EOPNOTSUPP;
+
+			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
+					    outer_first_mpls_over_udp),
+			       &ib_spec->mpls.val.tag,
+			       sizeof(ib_spec->mpls.val.tag));
+			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
+					    outer_first_mpls_over_udp),
+			       &ib_spec->mpls.mask.tag,
+			       sizeof(ib_spec->mpls.mask.tag));
+			break;
+		case IB_FLOW_SPEC_GRE:
+			if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+						   ft_field_support.outer_first_mpls_over_gre),
+						   &ib_spec->mpls.mask.tag))
+				return -EOPNOTSUPP;
+
+			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
+					    outer_first_mpls_over_gre),
+			       &ib_spec->mpls.val.tag,
+			       sizeof(ib_spec->mpls.val.tag));
+			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
+					    outer_first_mpls_over_gre),
+			       &ib_spec->mpls.mask.tag,
+			       sizeof(ib_spec->mpls.mask.tag));
+			break;
+		default:
+			if (ib_spec->type & IB_FLOW_SPEC_INNER) {
+				if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+							   ft_field_support.inner_first_mpls),
+							   &ib_spec->mpls.mask.tag))
+					return -EOPNOTSUPP;
+
+				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
+						    inner_first_mpls),
+				       &ib_spec->mpls.val.tag,
+				       sizeof(ib_spec->mpls.val.tag));
+				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
+						    inner_first_mpls),
+				       &ib_spec->mpls.mask.tag,
+				       sizeof(ib_spec->mpls.mask.tag));
+			} else {
+				if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+							   ft_field_support.outer_first_mpls),
+							   &ib_spec->mpls.mask.tag))
+					return -EOPNOTSUPP;
+
+				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
+						    outer_first_mpls),
+				       &ib_spec->mpls.val.tag,
+				       sizeof(ib_spec->mpls.val.tag));
+				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
+						    outer_first_mpls),
+				       &ib_spec->mpls.mask.tag,
+				       sizeof(ib_spec->mpls.mask.tag));
+			}
+		}
+		break;
+	case IB_FLOW_SPEC_VXLAN_TUNNEL:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
+					 LAST_TUNNEL_FIELD))
+			return -EOPNOTSUPP;
+
+		MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
+			 ntohl(ib_spec->tunnel.mask.tunnel_id));
+		MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
+			 ntohl(ib_spec->tunnel.val.tunnel_id));
+		break;
+	case IB_FLOW_SPEC_ACTION_TAG:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
+					 LAST_FLOW_TAG_FIELD))
+			return -EOPNOTSUPP;
+		if (ib_spec->flow_tag.tag_id >= BIT(24))
+			return -EINVAL;
+
+		flow_context->flow_tag = ib_spec->flow_tag.tag_id;
+		flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
+		break;
+	case IB_FLOW_SPEC_ACTION_DROP:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
+					 LAST_DROP_FIELD))
+			return -EOPNOTSUPP;
+		action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
+		break;
+	case IB_FLOW_SPEC_ACTION_HANDLE:
+		ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
+			flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
+		if (ret)
+			return ret;
+		break;
+	case IB_FLOW_SPEC_ACTION_COUNT:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
+					 LAST_COUNTERS_FIELD))
+			return -EOPNOTSUPP;
+
+		/* for now support only one counters spec per flow */
+		if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
+			return -EINVAL;
+
+		action->counters = ib_spec->flow_count.counters;
+		action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* If a flow could catch both multicast and unicast packets,
+ * it won't fall into the multicast flow steering table and this rule
+ * could steal other multicast packets.
+ */
+static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr)
+{
+	union ib_flow_spec *flow_spec;
+
+	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
+	    ib_attr->num_of_specs < 1)
+		return false;
+
+	flow_spec = (union ib_flow_spec *)(ib_attr + 1);
+	if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
+		struct ib_flow_spec_ipv4 *ipv4_spec;
+
+		ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
+		if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
+			return true;
+
+		return false;
+	}
+
+	if (flow_spec->type == IB_FLOW_SPEC_ETH) {
+		struct ib_flow_spec_eth *eth_spec;
+
+		eth_spec = (struct ib_flow_spec_eth *)flow_spec;
+		return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
+		       is_multicast_ether_addr(eth_spec->val.dst_mac);
+	}
+
+	return false;
+}
+
+enum valid_spec {
+	VALID_SPEC_INVALID,
+	VALID_SPEC_VALID,
+	VALID_SPEC_NA,
+};
+
+static enum valid_spec
+is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
+		     const struct mlx5_flow_spec *spec,
+		     const struct mlx5_flow_act *flow_act,
+		     bool egress)
+{
+	const u32 *match_c = spec->match_criteria;
+	bool is_crypto =
+		(flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+				     MLX5_FLOW_CONTEXT_ACTION_DECRYPT));
+	bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c);
+	bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP;
+
+	/*
+	 * Currently only crypto is supported in egress. Once regular egress
+	 * rules are supported, always return VALID_SPEC_NA.
+	 */
+	if (!is_crypto)
+		return VALID_SPEC_NA;
+
+	return is_crypto && is_ipsec &&
+		(!egress || (!is_drop &&
+			     !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
+		VALID_SPEC_VALID : VALID_SPEC_INVALID;
+}
+
+static bool is_valid_spec(struct mlx5_core_dev *mdev,
+			  const struct mlx5_flow_spec *spec,
+			  const struct mlx5_flow_act *flow_act,
+			  bool egress)
+{
+	/* We currently only support IPsec egress flows */
+	return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID;
+}
+
+static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
+			       const struct ib_flow_attr *flow_attr,
+			       bool check_inner)
+{
+	union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
+	int match_ipv = check_inner ?
+			MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					ft_field_support.inner_ip_version) :
+			MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					ft_field_support.outer_ip_version);
+	int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
+	bool ipv4_spec_valid, ipv6_spec_valid;
+	unsigned int ip_spec_type = 0;
+	bool has_ethertype = false;
+	unsigned int spec_index;
+	bool mask_valid = true;
+	u16 eth_type = 0;
+	bool type_valid;
+
+	/* Validate that ethertype is correct */
+	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
+		if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
+		    ib_spec->eth.mask.ether_type) {
+			mask_valid = (ib_spec->eth.mask.ether_type ==
+				      htons(0xffff));
+			has_ethertype = true;
+			eth_type = ntohs(ib_spec->eth.val.ether_type);
+		} else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
+			   (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
+			ip_spec_type = ib_spec->type;
+		}
+		ib_spec = (void *)ib_spec + ib_spec->size;
+	}
+
+	type_valid = (!has_ethertype) || (!ip_spec_type);
+	if (!type_valid && mask_valid) {
+		ipv4_spec_valid = (eth_type == ETH_P_IP) &&
+			(ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
+		ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
+			(ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
+
+		type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
+			     (((eth_type == ETH_P_MPLS_UC) ||
+			       (eth_type == ETH_P_MPLS_MC)) && match_ipv);
+	}
+
+	return type_valid;
+}
+
+static bool is_valid_attr(struct mlx5_core_dev *mdev,
+			  const struct ib_flow_attr *flow_attr)
+{
+	return is_valid_ethertype(mdev, flow_attr, false) &&
+	       is_valid_ethertype(mdev, flow_attr, true);
+}
+
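+/*
+ * Drop the caller's reference on the priority (only if it actually took one)
+ * and destroy the flow table once no references remain.
+ */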
+static void put_flow_table(struct mlx5_ib_dev *dev,
+			   struct mlx5_ib_flow_prio *prio, bool ft_added)
+{
+	prio->refcount -= !!ft_added;
+	if (!prio->refcount) {
+		mlx5_destroy_flow_table(prio->flow_table);
+		prio->flow_table = NULL;
+	}
+}
+
+static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
+{
+	struct mlx5_ib_flow_handler *handler = container_of(flow_id,
+							  struct mlx5_ib_flow_handler,
+							  ibflow);
+	struct mlx5_ib_flow_handler *iter, *tmp;
+	struct mlx5_ib_dev *dev = handler->dev;
+
+	mutex_lock(&dev->flow_db->lock);
+
+	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
+		mlx5_del_flow_rules(iter->rule);
+		put_flow_table(dev, iter->prio, true);
+		list_del(&iter->list);
+		kfree(iter);
+	}
+
+	mlx5_del_flow_rules(handler->rule);
+	put_flow_table(dev, handler->prio, true);
+	mlx5_ib_counters_clear_description(handler->ibcounters);
+	mutex_unlock(&dev->flow_db->lock);
+	if (handler->flow_matcher)
+		atomic_dec(&handler->flow_matcher->usecnt);
+	kfree(handler);
+
+	return 0;
+}
+
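+/*
+ * Each IB priority maps to a pair of core priorities, keeping "don't trap"
+ * rules in a separate slot from regular rules of the same IB priority.
+ */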
+static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
+{
+	priority *= 2;
+	if (!dont_trap)
+		priority++;
+	return priority;
+}
+
+enum flow_table_type {
+	MLX5_IB_FT_RX,
+	MLX5_IB_FT_TX
+};
+
+#define MLX5_FS_MAX_TYPES	 6
+#define MLX5_FS_MAX_ENTRIES	 BIT(16)
+
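+/* Create an auto-grouped flow table and attach it to the given priority slot. */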
+static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
+					   struct mlx5_ib_flow_prio *prio,
+					   int priority,
+					   int num_entries, int num_groups,
+					   u32 flags)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_flow_table *ft;
+
+	ft_attr.prio = priority;
+	ft_attr.max_fte = num_entries;
+	ft_attr.flags = flags;
+	ft_attr.autogroup.max_num_groups = num_groups;
+	ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+	if (IS_ERR(ft))
+		return ERR_CAST(ft);
+
+	prio->flow_table = ft;
+	prio->refcount = 0;
+	return prio;
+}
+
+static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
+						struct ib_flow_attr *flow_attr,
+						enum flow_table_type ft_type)
+{
+	bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
+	struct mlx5_flow_namespace *ns = NULL;
+	struct mlx5_ib_flow_prio *prio;
+	struct mlx5_flow_table *ft;
+	int max_table_size;
+	int num_entries;
+	int num_groups;
+	bool esw_encap;
+	u32 flags = 0;
+	int priority;
+
+	max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+						       log_max_ft_size));
+	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
+		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
+		enum mlx5_flow_namespace_type fn_type;
+
+		if (flow_is_multicast_only(flow_attr) &&
+		    !dont_trap)
+			priority = MLX5_IB_FLOW_MCAST_PRIO;
+		else
+			priority = ib_prio_to_core_prio(flow_attr->priority,
+							dont_trap);
+		if (ft_type == MLX5_IB_FT_RX) {
+			fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
+			prio = &dev->flow_db->prios[priority];
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					reformat_l3_tunnel_to_l2))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		} else {
+			max_table_size =
+				BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
+							      log_max_ft_size));
+			fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
+			prio = &dev->flow_db->egress_prios[priority];
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		}
+		ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
+		num_entries = MLX5_FS_MAX_ENTRIES;
+		num_groups = MLX5_FS_MAX_TYPES;
+	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
+		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
+		ns = mlx5_get_flow_namespace(dev->mdev,
+					     MLX5_FLOW_NAMESPACE_LEFTOVERS);
+		build_leftovers_ft_param(&priority,
+					 &num_entries,
+					 &num_groups);
+		prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
+	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
+		if (!MLX5_CAP_FLOWTABLE(dev->mdev,
+					allow_sniffer_and_nic_rx_shared_tir))
+			return ERR_PTR(-EOPNOTSUPP);
+
+		ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
+					     MLX5_FLOW_NAMESPACE_SNIFFER_RX :
+					     MLX5_FLOW_NAMESPACE_SNIFFER_TX);
+
+		prio = &dev->flow_db->sniffer[ft_type];
+		priority = 0;
+		num_entries = 1;
+		num_groups = 1;
+	}
+
+	if (!ns)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	max_table_size = min_t(int, num_entries, max_table_size);
+
+	ft = prio->flow_table;
+	if (!ft)
+		return _get_prio(ns, prio, priority, max_table_size, num_groups,
+				 flags);
+
+	return prio;
+}
+
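+/*
+ * When the flow is created on top of an underlay QP (and the device can match
+ * on the BTH destination QP), restrict the rule to packets destined to that
+ * QPN.
+ */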
+static void set_underlay_qp(struct mlx5_ib_dev *dev,
+			    struct mlx5_flow_spec *spec,
+			    u32 underlay_qpn)
+{
+	void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
+					   spec->match_criteria,
+					   misc_parameters);
+	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+					   misc_parameters);
+
+	if (underlay_qpn &&
+	    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+				      ft_field_support.bth_dst_qp)) {
+		MLX5_SET(fte_match_set_misc,
+			 misc_params_v, bth_dst_qp, underlay_qpn);
+		MLX5_SET(fte_match_set_misc,
+			 misc_params_c, bth_dst_qp, 0xffffff);
+	}
+}
+
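+/*
+ * For representor devices, match on the rule's source vport: either via the
+ * metadata register when vport metadata matching is enabled on the eswitch,
+ * or directly on the source_port field otherwise.
+ */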
+static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev,
+					 struct mlx5_flow_spec *spec,
+					 struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+	void *misc;
+
+	if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters_2);
+
+		MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
+			 mlx5_eswitch_get_vport_metadata_for_match(esw,
+								   rep->vport));
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters_2);
+
+		MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
+			 mlx5_eswitch_get_vport_metadata_mask());
+	} else {
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters);
+
+		MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport);
+
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters);
+
+		MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+	}
+}
+
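+/*
+ * Translate the ib_flow specs into an mlx5 flow spec and action set, resolve
+ * the destinations (forward target and optional counter) and add the rule to
+ * the priority's flow table.
+ */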
+static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
+						      struct mlx5_ib_flow_prio *ft_prio,
+						      const struct ib_flow_attr *flow_attr,
+						      struct mlx5_flow_destination *dst,
+						      u32 underlay_qpn,
+						      struct mlx5_ib_create_flow *ucmd)
+{
+	struct mlx5_flow_table	*ft = ft_prio->flow_table;
+	struct mlx5_ib_flow_handler *handler;
+	struct mlx5_flow_act flow_act = {};
+	struct mlx5_flow_spec *spec;
+	struct mlx5_flow_destination dest_arr[2] = {};
+	struct mlx5_flow_destination *rule_dst = dest_arr;
+	const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
+	unsigned int spec_index;
+	u32 prev_type = 0;
+	int err = 0;
+	int dest_num = 0;
+	bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
+
+	if (!is_valid_attr(dev->mdev, flow_attr))
+		return ERR_PTR(-EINVAL);
+
+	if (dev->is_rep && is_egress)
+		return ERR_PTR(-EINVAL);
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
+	if (!handler || !spec) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	INIT_LIST_HEAD(&handler->list);
+
+	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
+		err = parse_flow_attr(dev->mdev, spec,
+				      ib_flow, flow_attr, &flow_act,
+				      prev_type);
+		if (err < 0)
+			goto free;
+
+		prev_type = ((union ib_flow_spec *)ib_flow)->type;
+		ib_flow += ((union ib_flow_spec *)ib_flow)->size;
+	}
+
+	if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) {
+		memcpy(&dest_arr[0], dst, sizeof(*dst));
+		dest_num++;
+	}
+
+	if (!flow_is_multicast_only(flow_attr))
+		set_underlay_qp(dev, spec, underlay_qpn);
+
+	if (dev->is_rep) {
+		struct mlx5_eswitch_rep *rep;
+
+		rep = dev->port[flow_attr->port - 1].rep;
+		if (!rep) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		mlx5_ib_set_rule_source_port(dev, spec, rep);
+	}
+
+	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
+
+	if (is_egress &&
+	    !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) {
+		err = -EINVAL;
+		goto free;
+	}
+
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		struct mlx5_ib_mcounters *mcounters;
+
+		err = mlx5_ib_flow_counters_set_data(flow_act.counters, ucmd);
+		if (err)
+			goto free;
+
+		mcounters = to_mcounters(flow_act.counters);
+		handler->ibcounters = flow_act.counters;
+		dest_arr[dest_num].type =
+			MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dest_arr[dest_num].counter_id =
+			mlx5_fc_id(mcounters->hw_cntrs_hndl);
+		dest_num++;
+	}
+
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
+		if (!dest_num)
+			rule_dst = NULL;
+	} else {
+		if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)
+			flow_act.action |=
+				MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
+		if (is_egress)
+			flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+		else if (dest_num)
+			flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	}
+
+	if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
+	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
+	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
+		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x aren't allowed in leftovers\n",
+			     spec->flow_context.flow_tag, flow_attr->type);
+		err = -EINVAL;
+		goto free;
+	}
+	handler->rule = mlx5_add_flow_rules(ft, spec,
+					    &flow_act,
+					    rule_dst, dest_num);
+
+	if (IS_ERR(handler->rule)) {
+		err = PTR_ERR(handler->rule);
+		goto free;
+	}
+
+	ft_prio->refcount++;
+	handler->prio = ft_prio;
+	handler->dev = dev;
+
+	ft_prio->flow_table = ft;
+free:
+	if (err && handler) {
+		mlx5_ib_counters_clear_description(handler->ibcounters);
+		kfree(handler);
+	}
+	kvfree(spec);
+	return err ? ERR_PTR(err) : handler;
+}
+
+static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
+						     struct mlx5_ib_flow_prio *ft_prio,
+						     const struct ib_flow_attr *flow_attr,
+						     struct mlx5_flow_destination *dst)
+{
+	return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
+}
+
+enum {
+	LEFTOVERS_MC,
+	LEFTOVERS_UC,
+};
+
+static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
+							  struct mlx5_ib_flow_prio *ft_prio,
+							  struct ib_flow_attr *flow_attr,
+							  struct mlx5_flow_destination *dst)
+{
+	struct mlx5_ib_flow_handler *handler_ucast = NULL;
+	struct mlx5_ib_flow_handler *handler = NULL;
+
+	static struct {
+		struct ib_flow_attr	flow_attr;
+		struct ib_flow_spec_eth eth_flow;
+	} leftovers_specs[] = {
+		[LEFTOVERS_MC] = {
+			.flow_attr = {
+				.num_of_specs = 1,
+				.size = sizeof(leftovers_specs[0])
+			},
+			.eth_flow = {
+				.type = IB_FLOW_SPEC_ETH,
+				.size = sizeof(struct ib_flow_spec_eth),
+				.mask = {.dst_mac = {0x1} },
+				.val =  {.dst_mac = {0x1} }
+			}
+		},
+		[LEFTOVERS_UC] = {
+			.flow_attr = {
+				.num_of_specs = 1,
+				.size = sizeof(leftovers_specs[0])
+			},
+			.eth_flow = {
+				.type = IB_FLOW_SPEC_ETH,
+				.size = sizeof(struct ib_flow_spec_eth),
+				.mask = {.dst_mac = {0x1} },
+				.val = {.dst_mac = {} }
+			}
+		}
+	};
+
+	handler = create_flow_rule(dev, ft_prio,
+				   &leftovers_specs[LEFTOVERS_MC].flow_attr,
+				   dst);
+	if (!IS_ERR(handler) &&
+	    flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
+		handler_ucast = create_flow_rule(dev, ft_prio,
+						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
+						 dst);
+		if (IS_ERR(handler_ucast)) {
+			mlx5_del_flow_rules(handler->rule);
+			ft_prio->refcount--;
+			kfree(handler);
+			handler = handler_ucast;
+		} else {
+			list_add(&handler_ucast->list, &handler->list);
+		}
+	}
+
+	return handler;
+}
+
+static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
+							struct mlx5_ib_flow_prio *ft_rx,
+							struct mlx5_ib_flow_prio *ft_tx,
+							struct mlx5_flow_destination *dst)
+{
+	struct mlx5_ib_flow_handler *handler_rx;
+	struct mlx5_ib_flow_handler *handler_tx;
+	int err;
+	static const struct ib_flow_attr flow_attr  = {
+		.num_of_specs = 0,
+		.size = sizeof(flow_attr)
+	};
+
+	handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
+	if (IS_ERR(handler_rx)) {
+		err = PTR_ERR(handler_rx);
+		goto err;
+	}
+
+	handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
+	if (IS_ERR(handler_tx)) {
+		err = PTR_ERR(handler_tx);
+		goto err_tx;
+	}
+
+	list_add(&handler_tx->list, &handler_rx->list);
+
+	return handler_rx;
+
+err_tx:
+	mlx5_del_flow_rules(handler_rx->rule);
+	ft_rx->refcount--;
+	kfree(handler_rx);
+err:
+	return ERR_PTR(err);
+}
+
+static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
+					   struct ib_flow_attr *flow_attr,
+					   int domain,
+					   struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_ib_flow_handler *handler = NULL;
+	struct mlx5_flow_destination *dst = NULL;
+	struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
+	struct mlx5_ib_flow_prio *ft_prio;
+	bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
+	struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
+	size_t min_ucmd_sz, required_ucmd_sz;
+	int err;
+	int underlay_qpn;
+
+	if (udata && udata->inlen) {
+		min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) +
+				sizeof(ucmd_hdr.reserved);
+		if (udata->inlen < min_ucmd_sz)
+			return ERR_PTR(-EOPNOTSUPP);
+
+		err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
+		if (err)
+			return ERR_PTR(err);
+
+		/* currently only one counters data entry is supported */
+		if (ucmd_hdr.ncounters_data > 1)
+			return ERR_PTR(-EINVAL);
+
+		required_ucmd_sz = min_ucmd_sz +
+			sizeof(struct mlx5_ib_flow_counters_data) *
+			ucmd_hdr.ncounters_data;
+		if (udata->inlen > required_ucmd_sz &&
+		    !ib_is_udata_cleared(udata, required_ucmd_sz,
+					 udata->inlen - required_ucmd_sz))
+			return ERR_PTR(-EOPNOTSUPP);
+
+		ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
+		if (!ucmd)
+			return ERR_PTR(-ENOMEM);
+
+		err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
+		if (err)
+			goto free_ucmd;
+	}
+
+	if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
+		err = -ENOMEM;
+		goto free_ucmd;
+	}
+
+	if (domain != IB_FLOW_DOMAIN_USER ||
+	    flow_attr->port > dev->num_ports ||
+	    (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP |
+				  IB_FLOW_ATTR_FLAGS_EGRESS))) {
+		err = -EINVAL;
+		goto free_ucmd;
+	}
+
+	if (is_egress &&
+	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
+	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
+		err = -EINVAL;
+		goto free_ucmd;
+	}
+
+	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+	if (!dst) {
+		err = -ENOMEM;
+		goto free_ucmd;
+	}
+
+	mutex_lock(&dev->flow_db->lock);
+
+	ft_prio = get_flow_table(dev, flow_attr,
+				 is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX);
+	if (IS_ERR(ft_prio)) {
+		err = PTR_ERR(ft_prio);
+		goto unlock;
+	}
+	if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
+		ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
+		if (IS_ERR(ft_prio_tx)) {
+			err = PTR_ERR(ft_prio_tx);
+			ft_prio_tx = NULL;
+			goto destroy_ft;
+		}
+	}
+
+	if (is_egress) {
+		dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+	} else {
+		dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+		if (mqp->is_rss)
+			dst->tir_num = mqp->rss_qp.tirn;
+		else
+			dst->tir_num = mqp->raw_packet_qp.rq.tirn;
+	}
+
+	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
+		underlay_qpn = (mqp->flags & IB_QP_CREATE_SOURCE_QPN) ?
+				       mqp->underlay_qpn :
+				       0;
+		handler = _create_flow_rule(dev, ft_prio, flow_attr, dst,
+					    underlay_qpn, ucmd);
+	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
+		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
+		handler = create_leftovers_rule(dev, ft_prio, flow_attr,
+						dst);
+	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
+		handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
+	} else {
+		err = -EINVAL;
+		goto destroy_ft;
+	}
+
+	if (IS_ERR(handler)) {
+		err = PTR_ERR(handler);
+		handler = NULL;
+		goto destroy_ft;
+	}
+
+	mutex_unlock(&dev->flow_db->lock);
+	kfree(dst);
+	kfree(ucmd);
+
+	return &handler->ibflow;
+
+destroy_ft:
+	put_flow_table(dev, ft_prio, false);
+	if (ft_prio_tx)
+		put_flow_table(dev, ft_prio_tx, false);
+unlock:
+	mutex_unlock(&dev->flow_db->lock);
+	kfree(dst);
+free_ucmd:
+	kfree(ucmd);
+	return ERR_PTR(err);
+}
+
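+/*
+ * Pick the flow priority slot for a raw matcher based on its namespace type
+ * and priority, creating the underlying flow table on first use.
+ */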
+static struct mlx5_ib_flow_prio *
+_get_flow_table(struct mlx5_ib_dev *dev,
+		struct mlx5_ib_flow_matcher *fs_matcher,
+		bool mcast)
+{
+	struct mlx5_flow_namespace *ns = NULL;
+	struct mlx5_ib_flow_prio *prio = NULL;
+	int max_table_size = 0;
+	bool esw_encap;
+	u32 flags = 0;
+	int priority;
+
+	if (mcast)
+		priority = MLX5_IB_FLOW_MCAST_PRIO;
+	else
+		priority = ib_prio_to_core_prio(fs_matcher->priority, false);
+
+	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
+		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
+		max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					log_max_ft_size));
+		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					      reformat_l3_tunnel_to_l2) &&
+		    !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
+		max_table_size = BIT(
+			MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
+		if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
+		max_table_size = BIT(
+			MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
+		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) &&
+		    esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		priority = FDB_BYPASS_PATH;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) {
+		max_table_size =
+			BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
+						       log_max_ft_size));
+		priority = fs_matcher->priority;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
+		max_table_size =
+			BIT(MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev,
+						       log_max_ft_size));
+		priority = fs_matcher->priority;
+	}
+
+	max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
+
+	ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
+	if (!ns)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
+		prio = &dev->flow_db->prios[priority];
+	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
+		prio = &dev->flow_db->egress_prios[priority];
+	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
+		prio = &dev->flow_db->fdb;
+	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
+		prio = &dev->flow_db->rdma_rx[priority];
+	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)
+		prio = &dev->flow_db->rdma_tx[priority];
+
+	if (!prio)
+		return ERR_PTR(-EINVAL);
+
+	if (prio->flow_table)
+		return prio;
+
+	return _get_prio(ns, prio, priority, max_table_size,
+			 MLX5_FS_MAX_TYPES, flags);
+}
+
+static struct mlx5_ib_flow_handler *
+_create_raw_flow_rule(struct mlx5_ib_dev *dev,
+		      struct mlx5_ib_flow_prio *ft_prio,
+		      struct mlx5_flow_destination *dst,
+		      struct mlx5_ib_flow_matcher  *fs_matcher,
+		      struct mlx5_flow_context *flow_context,
+		      struct mlx5_flow_act *flow_act,
+		      void *cmd_in, int inlen,
+		      int dst_num)
+{
+	struct mlx5_ib_flow_handler *handler;
+	struct mlx5_flow_spec *spec;
+	struct mlx5_flow_table *ft = ft_prio->flow_table;
+	int err = 0;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
+	if (!handler || !spec) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	INIT_LIST_HEAD(&handler->list);
+
+	memcpy(spec->match_value, cmd_in, inlen);
+	memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
+	       fs_matcher->mask_len);
+	spec->match_criteria_enable = fs_matcher->match_criteria_enable;
+	spec->flow_context = *flow_context;
+
+	handler->rule = mlx5_add_flow_rules(ft, spec,
+					    flow_act, dst, dst_num);
+
+	if (IS_ERR(handler->rule)) {
+		err = PTR_ERR(handler->rule);
+		goto free;
+	}
+
+	ft_prio->refcount++;
+	handler->prio = ft_prio;
+	handler->dev = dev;
+	ft_prio->flow_table = ft;
+
+free:
+	if (err)
+		kfree(handler);
+	kvfree(spec);
+	return err ? ERR_PTR(err) : handler;
+}
+
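+/*
+ * A raw rule is treated as multicast when its outer headers match on a
+ * multicast destination MAC or a multicast IPv4 destination address.
+ */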
+static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
+				void *match_v)
+{
+	void *match_c;
+	void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4;
+	void *dmac, *dmac_mask;
+	void *ipv4, *ipv4_mask;
+
+	if (!(fs_matcher->match_criteria_enable &
+	      (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT)))
+		return false;
+
+	match_c = fs_matcher->matcher_mask.match_params;
+	match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v,
+					   outer_headers);
+	match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c,
+					   outer_headers);
+
+	dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
+			    dmac_47_16);
+	dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
+				 dmac_47_16);
+
+	if (is_multicast_ether_addr(dmac) &&
+	    is_multicast_ether_addr(dmac_mask))
+		return true;
+
+	ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
+			    dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+
+	ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
+				 dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+
+	if (ipv4_is_multicast(*(__be32 *)(ipv4)) &&
+	    ipv4_is_multicast(*(__be32 *)(ipv4_mask)))
+		return true;
+
+	return false;
+}
+
+static struct mlx5_ib_flow_handler *raw_fs_rule_add(
+	struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
+	struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act,
+	u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type)
+{
+	struct mlx5_flow_destination *dst;
+	struct mlx5_ib_flow_prio *ft_prio;
+	struct mlx5_ib_flow_handler *handler;
+	int dst_num = 0;
+	bool mcast;
+	int err;
+
+	if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
+		return ERR_PTR(-ENOMEM);
+
+	dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
+	if (!dst)
+		return ERR_PTR(-ENOMEM);
+
+	mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
+	mutex_lock(&dev->flow_db->lock);
+
+	ft_prio = _get_flow_table(dev, fs_matcher, mcast);
+	if (IS_ERR(ft_prio)) {
+		err = PTR_ERR(ft_prio);
+		goto unlock;
+	}
+
+	if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
+		dst[dst_num].type = dest_type;
+		dst[dst_num++].tir_num = dest_id;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	} else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
+		dst[dst_num++].ft_num = dest_id;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	} else  if (dest_type == MLX5_FLOW_DESTINATION_TYPE_PORT) {
+		dst[dst_num++].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+	}
+
+	if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dst[dst_num].counter_id = counter_id;
+		dst_num++;
+	}
+
+	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
+					flow_context, flow_act,
+					cmd_in, inlen, dst_num);
+
+	if (IS_ERR(handler)) {
+		err = PTR_ERR(handler);
+		goto destroy_ft;
+	}
+
+	mutex_unlock(&dev->flow_db->lock);
+	atomic_inc(&fs_matcher->usecnt);
+	handler->flow_matcher = fs_matcher;
+
+	kfree(dst);
+
+	return handler;
+
+destroy_ft:
+	put_flow_table(dev, ft_prio, false);
+unlock:
+	mutex_unlock(&dev->flow_db->lock);
+	kfree(dst);
+
+	return ERR_PTR(err);
+}
+
+static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags)
+{
+	u32 flags = 0;
+
+	if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)
+		flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA;
+
+	return flags;
+}
+
+#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED                             \
+	MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
+static struct ib_flow_action *
+mlx5_ib_create_flow_action_esp(struct ib_device *device,
+			       const struct ib_flow_action_attrs_esp *attr,
+			       struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_dev *mdev = to_mdev(device);
+	struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm;
+	struct mlx5_accel_esp_xfrm_attrs accel_attrs = {};
+	struct mlx5_ib_flow_action *action;
+	u64 action_flags;
+	u64 flags;
+	int err = 0;
+
+	err = uverbs_get_flags64(
+		&action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
+		((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1));
+	if (err)
+		return ERR_PTR(err);
+
+	flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags);
+
+	/* We currently only support a subset of the standard features: only a
+	 * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and ESN
+	 * (with overlap). Full offload mode isn't supported.
+	 */
+	if (!attr->keymat || attr->replay || attr->encap ||
+	    attr->spi || attr->seq || attr->tfc_pad ||
+	    attr->hard_limit_pkts ||
+	    (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
+			     IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (attr->keymat->protocol !=
+	    IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	aes_gcm = &attr->keymat->keymat.aes_gcm;
+
+	if (aes_gcm->icv_len != 16 ||
+	    aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	action = kmalloc(sizeof(*action), GFP_KERNEL);
+	if (!action)
+		return ERR_PTR(-ENOMEM);
+
+	action->esp_aes_gcm.ib_flags = attr->flags;
+	memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key,
+	       sizeof(accel_attrs.keymat.aes_gcm.aes_key));
+	accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8;
+	memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt,
+	       sizeof(accel_attrs.keymat.aes_gcm.salt));
+	memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv,
+	       sizeof(accel_attrs.keymat.aes_gcm.seq_iv));
+	accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8;
+	accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ;
+	accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
+
+	accel_attrs.esn = attr->esn;
+	if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED)
+		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
+	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
+		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
+
+	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)
+		accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT;
+
+	action->esp_aes_gcm.ctx =
+		mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags);
+	if (IS_ERR(action->esp_aes_gcm.ctx)) {
+		err = PTR_ERR(action->esp_aes_gcm.ctx);
+		goto err_parse;
+	}
+
+	action->esp_aes_gcm.ib_flags = attr->flags;
+
+	return &action->ib_action;
+
+err_parse:
+	kfree(action);
+	return ERR_PTR(err);
+}
+
+static int
+mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action,
+			       const struct ib_flow_action_attrs_esp *attr,
+			       struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_flow_action *maction = to_mflow_act(action);
+	struct mlx5_accel_esp_xfrm_attrs accel_attrs;
+	int err = 0;
+
+	if (attr->keymat || attr->replay || attr->encap ||
+	    attr->spi || attr->seq || attr->tfc_pad ||
+	    attr->hard_limit_pkts ||
+	    (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
+			     IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS |
+			     IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)))
+		return -EOPNOTSUPP;
+
+	/* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can
+	 * be modified.
+	 */
+	if (!(maction->esp_aes_gcm.ib_flags &
+	      IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) &&
+	    attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
+			   IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))
+		return -EINVAL;
+
+	memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs,
+	       sizeof(accel_attrs));
+
+	accel_attrs.esn = attr->esn;
+	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
+		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
+	else
+		accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
+
+	err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx,
+					 &accel_attrs);
+	if (err)
+		return err;
+
+	maction->esp_aes_gcm.ib_flags &=
+		~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
+	maction->esp_aes_gcm.ib_flags |=
+		attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
+
+	return 0;
+}
+
+static void destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
+{
+	switch (maction->flow_action_raw.sub_type) {
+	case MLX5_IB_FLOW_ACTION_MODIFY_HEADER:
+		mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev,
+					   maction->flow_action_raw.modify_hdr);
+		break;
+	case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT:
+		mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev,
+					     maction->flow_action_raw.pkt_reformat);
+		break;
+	case MLX5_IB_FLOW_ACTION_DECAP:
+		break;
+	default:
+		break;
+	}
+}
+
+static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action)
+{
+	struct mlx5_ib_flow_action *maction = to_mflow_act(action);
+
+	switch (action->type) {
+	case IB_FLOW_ACTION_ESP:
+		/*
+		 * We only support aes_gcm for now, so we implicitly know this
+		 * is the underlying crypto.
+		 */
+		mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
+		break;
+	case IB_FLOW_ACTION_UNSPECIFIED:
+		destroy_flow_action_raw(maction);
+		break;
+	default:
+		WARN_ON(true);
+		break;
+	}
+
+	kfree(maction);
+	return 0;
+}
+
+static int
+mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
+			     enum mlx5_flow_namespace_type *namespace)
+{
+	switch (table_type) {
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX:
+		*namespace = MLX5_FLOW_NAMESPACE_BYPASS;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX:
+		*namespace = MLX5_FLOW_NAMESPACE_EGRESS;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB:
+		*namespace = MLX5_FLOW_NAMESPACE_FDB;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX:
+		*namespace = MLX5_FLOW_NAMESPACE_RDMA_RX;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX:
+		*namespace = MLX5_FLOW_NAMESPACE_RDMA_TX;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct uverbs_attr_spec mlx5_ib_flow_type[] = {
+	[MLX5_IB_FLOW_TYPE_NORMAL] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		.u.ptr = {
+			.len = sizeof(u16), /* data is priority */
+			.min_len = sizeof(u16),
+		}
+	},
+	[MLX5_IB_FLOW_TYPE_SNIFFER] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		UVERBS_ATTR_NO_DATA(),
+	},
+	[MLX5_IB_FLOW_TYPE_ALL_DEFAULT] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		UVERBS_ATTR_NO_DATA(),
+	},
+	[MLX5_IB_FLOW_TYPE_MC_DEFAULT] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		UVERBS_ATTR_NO_DATA(),
+	},
+};
+
+static bool is_flow_dest(void *obj, int *dest_id, int *dest_type)
+{
+	struct devx_obj *devx_obj = obj;
+	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
+
+	switch (opcode) {
+	case MLX5_CMD_OP_DESTROY_TIR:
+		*dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+		*dest_id = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox,
+				    obj_id);
+		return true;
+
+	case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
+		*dest_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+		*dest_id = MLX5_GET(destroy_flow_table_in, devx_obj->dinbox,
+				    table_id);
+		return true;
+	default:
+		return false;
+	}
+}
+
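+/*
+ * Resolve the rule's destination from the method attributes: a DEVX object
+ * (TIR or flow table), a RAW_PACKET QP, or the port for TX namespaces, and
+ * validate the combination against the matcher's namespace and the flags.
+ */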
+static int get_dests(struct uverbs_attr_bundle *attrs,
+		     struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id,
+		     int *dest_type, struct ib_qp **qp, u32 *flags)
+{
+	bool dest_devx, dest_qp;
+	void *devx_obj;
+	int err;
+
+	dest_devx = uverbs_attr_is_valid(attrs,
+					 MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
+	dest_qp = uverbs_attr_is_valid(attrs,
+				       MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
+
+	*flags = 0;
+	err = uverbs_get_flags32(flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_FLAGS,
+				 MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS |
+					 MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP);
+	if (err)
+		return err;
+
+	/* The two flags are mutually exclusive */
+	if (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS &&
+	    *flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)
+		return -EINVAL;
+
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
+		if (dest_devx && (dest_qp || *flags))
+			return -EINVAL;
+		else if (dest_qp && *flags)
+			return -EINVAL;
+	}
+
+	/* For FDB, allow only a DEVX object or drop as the destination */
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !(dest_devx ||
+	     (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)))
+		return -EINVAL;
+
+	/* Allow only DEVX object or QP as dest when inserting to RDMA_RX */
+	if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+	    ((!dest_devx && !dest_qp) || (dest_devx && dest_qp)))
+		return -EINVAL;
+
+	*qp = NULL;
+	if (dest_devx) {
+		devx_obj =
+			uverbs_attr_get_obj(attrs,
+					    MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
+
+		/* Verify that the given DEVX object is a flow
+		 * steering destination.
+		 */
+		if (!is_flow_dest(devx_obj, dest_id, dest_type))
+			return -EINVAL;
+		/* Allow only flow table as dest when inserting to FDB or RDMA_RX */
+		if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB ||
+		     fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+		    *dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
+			return -EINVAL;
+	} else if (dest_qp) {
+		struct mlx5_ib_qp *mqp;
+
+		*qp = uverbs_attr_get_obj(attrs,
+					  MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
+		if (IS_ERR(*qp))
+			return PTR_ERR(*qp);
+
+		if ((*qp)->qp_type != IB_QPT_RAW_PACKET)
+			return -EINVAL;
+
+		mqp = to_mqp(*qp);
+		if (mqp->is_rss)
+			*dest_id = mqp->rss_qp.tirn;
+		else
+			*dest_id = mqp->raw_packet_qp.rq.tirn;
+		*dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS ||
+		   fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
+		*dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+	}
+
+	if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
+	    (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS ||
+	     fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX))
+		return -EINVAL;
+
+	return 0;
+}
+
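+/*
+ * Check that the DEVX object is a flow counter bulk and, if so, compute the
+ * counter id at the requested offset within the bulk.
+ */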
+static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id)
+{
+	struct devx_obj *devx_obj = obj;
+	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
+
+	if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) {
+		if (offset && offset >= devx_obj->flow_counter_bulk_size)
+			return false;
+
+		*counter_id = MLX5_GET(dealloc_flow_counter_in,
+				       devx_obj->dinbox,
+				       flow_counter_id);
+		*counter_id += offset;
+		return true;
+	}
+
+	return false;
+}
+
+#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2
+static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_flow_context flow_context = {.flow_tag =
+		MLX5_FS_DEFAULT_FLOW_TAG};
+	u32 *offset_attr, offset = 0, counter_id = 0;
+	int dest_id, dest_type = -1, inlen, len, ret, i;
+	struct mlx5_ib_flow_handler *flow_handler;
+	struct mlx5_ib_flow_matcher *fs_matcher;
+	struct ib_uobject **arr_flow_actions;
+	struct ib_uflow_resources *uflow_res;
+	struct mlx5_flow_act flow_act = {};
+	struct ib_qp *qp = NULL;
+	void *devx_obj, *cmd_in;
+	struct ib_uobject *uobj;
+	struct mlx5_ib_dev *dev;
+	u32 flags;
+
+	if (!capable(CAP_NET_RAW))
+		return -EPERM;
+
+	fs_matcher = uverbs_attr_get_obj(attrs,
+					 MLX5_IB_ATTR_CREATE_FLOW_MATCHER);
+	uobj =  uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE);
+	dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+
+	if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &flags))
+		return -EINVAL;
+
+	if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS)
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS;
+
+	if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
+
+	len = uverbs_attr_get_uobjs_arr(attrs,
+		MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
+	if (len) {
+		devx_obj = arr_flow_actions[0]->object;
+
+		if (uverbs_attr_is_valid(attrs,
+					 MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET)) {
+			int num_offsets = uverbs_attr_ptr_get_array_size(
+				attrs,
+				MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET,
+				sizeof(u32));
+
+			if (num_offsets != 1)
+				return -EINVAL;
+
+			offset_attr = uverbs_attr_get_alloced_ptr(
+				attrs,
+				MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET);
+			offset = *offset_attr;
+		}
+
+		if (!is_flow_counter(devx_obj, offset, &counter_id))
+			return -EINVAL;
+
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+	}
+
+	cmd_in = uverbs_attr_get_alloced_ptr(
+		attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
+	inlen = uverbs_attr_get_len(attrs,
+				    MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
+
+	uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS);
+	if (!uflow_res)
+		return -ENOMEM;
+
+	len = uverbs_attr_get_uobjs_arr(attrs,
+		MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions);
+	for (i = 0; i < len; i++) {
+		struct mlx5_ib_flow_action *maction =
+			to_mflow_act(arr_flow_actions[i]->object);
+
+		ret = parse_flow_flow_action(maction, false, &flow_act);
+		if (ret)
+			goto err_out;
+		flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE,
+				   arr_flow_actions[i]->object);
+	}
+
+	ret = uverbs_copy_from(&flow_context.flow_tag, attrs,
+			       MLX5_IB_ATTR_CREATE_FLOW_TAG);
+	if (!ret) {
+		if (flow_context.flow_tag >= BIT(24)) {
+			ret = -EINVAL;
+			goto err_out;
+		}
+		flow_context.flags |= FLOW_CONTEXT_HAS_TAG;
+	}
+
+	flow_handler =
+		raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act,
+				counter_id, cmd_in, inlen, dest_id, dest_type);
+	if (IS_ERR(flow_handler)) {
+		ret = PTR_ERR(flow_handler);
+		goto err_out;
+	}
+
+	ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res);
+
+	return 0;
+err_out:
+	ib_uverbs_flow_resources_free(uflow_res);
+	return ret;
+}
+
+static int flow_matcher_cleanup(struct ib_uobject *uobject,
+				enum rdma_remove_reason why,
+				struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_flow_matcher *obj = uobject->object;
+	int ret;
+
+	ret = ib_destroy_usecnt(&obj->usecnt, why, uobject);
+	if (ret)
+		return ret;
+
+	kfree(obj);
+	return 0;
+}
+
+static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
+			      struct mlx5_ib_flow_matcher *obj)
+{
+	enum mlx5_ib_uapi_flow_table_type ft_type =
+		MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX;
+	u32 flags;
+	int err;
+
+	/* New users should use MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE, and older
+	 * users should switch to it. The flags attribute is kept only so as
+	 * not to break existing userspace.
+	 */
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE) &&
+	    uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS))
+		return -EINVAL;
+
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE)) {
+		err = uverbs_get_const(&ft_type, attrs,
+				       MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE);
+		if (err)
+			return err;
+
+		err = mlx5_ib_ft_type_to_namespace(ft_type, &obj->ns_type);
+		if (err)
+			return err;
+
+		return 0;
+	}
+
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS)) {
+		err = uverbs_get_flags32(&flags, attrs,
+					 MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
+					 IB_FLOW_ATTR_FLAGS_EGRESS);
+		if (err)
+			return err;
+
+		if (flags) {
+			mlx5_ib_ft_type_to_namespace(
+				MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX,
+				&obj->ns_type);
+			return 0;
+		}
+	}
+
+	obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
+
+	return 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE);
+	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	struct mlx5_ib_flow_matcher *obj;
+	int err;
+
+	obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	obj->mask_len = uverbs_attr_get_len(
+		attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
+	err = uverbs_copy_from(&obj->matcher_mask,
+			       attrs,
+			       MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
+	if (err)
+		goto end;
+
+	obj->flow_type = uverbs_attr_get_enum_id(
+		attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE);
+
+	if (obj->flow_type == MLX5_IB_FLOW_TYPE_NORMAL) {
+		err = uverbs_copy_from(&obj->priority,
+				       attrs,
+				       MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE);
+		if (err)
+			goto end;
+	}
+
+	err = uverbs_copy_from(&obj->match_criteria_enable,
+			       attrs,
+			       MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA);
+	if (err)
+		goto end;
+
+	err = mlx5_ib_matcher_ns(attrs, obj);
+	if (err)
+		goto end;
+
+	uobj->object = obj;
+	obj->mdev = dev->mdev;
+	atomic_set(&obj->usecnt, 0);
+	return 0;
+
+end:
+	kfree(obj);
+	return err;
+}
+
+static struct ib_flow_action *
+mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
+			     enum mlx5_ib_uapi_flow_table_type ft_type,
+			     u8 num_actions, void *in)
+{
+	enum mlx5_flow_namespace_type namespace;
+	struct mlx5_ib_flow_action *maction;
+	int ret;
+
+	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
+	if (ret)
+		return ERR_PTR(-EINVAL);
+
+	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
+	if (!maction)
+		return ERR_PTR(-ENOMEM);
+
+	maction->flow_action_raw.modify_hdr =
+		mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in);
+
+	if (IS_ERR(maction->flow_action_raw.modify_hdr)) {
+		ret = PTR_ERR(maction->flow_action_raw.modify_hdr);
+		kfree(maction);
+		return ERR_PTR(ret);
+	}
+	maction->flow_action_raw.sub_type =
+		MLX5_IB_FLOW_ACTION_MODIFY_HEADER;
+	maction->flow_action_raw.dev = dev;
+
+	return &maction->ib_action;
+}
+
+static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev)
+{
+	return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					 max_modify_header_actions) ||
+	       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
+					 max_modify_header_actions) ||
+	       MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev,
+					 max_modify_header_actions);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE);
+	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	enum mlx5_ib_uapi_flow_table_type ft_type;
+	struct ib_flow_action *action;
+	int num_actions;
+	void *in;
+	int ret;
+
+	if (!mlx5_ib_modify_header_supported(mdev))
+		return -EOPNOTSUPP;
+
+	in = uverbs_attr_get_alloced_ptr(attrs,
+		MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
+
+	num_actions = uverbs_attr_ptr_get_array_size(
+		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
+		MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto));
+	if (num_actions < 0)
+		return num_actions;
+
+	ret = uverbs_get_const(&ft_type, attrs,
+			       MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE);
+	if (ret)
+		return ret;
+	action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in);
+	if (IS_ERR(action))
+		return PTR_ERR(action);
+
+	uverbs_flow_action_fill_action(action, uobj, &mdev->ib_dev,
+				       IB_FLOW_ACTION_UNSPECIFIED);
+
+	return 0;
+}
+
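+/*
+ * Check whether the device supports the requested packet reformat type for
+ * the given flow table type: encap types on NIC TX, decap types on NIC RX.
+ */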
+static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev,
+						      u8 packet_reformat_type,
+						      u8 ft_type)
+{
+	switch (packet_reformat_type) {
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
+			return MLX5_CAP_FLOWTABLE(ibdev->mdev,
+						  encap_general_header);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
+			return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev,
+				reformat_l2_to_l3_tunnel);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
+			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev,
+				reformat_l3_tunnel_to_l2);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
+			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap);
+		break;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt)
+{
+	switch (dv_prt) {
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
+		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL;
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
+		*prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
+		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5_ib_flow_action_create_packet_reformat_ctx(
+	struct mlx5_ib_dev *dev,
+	struct mlx5_ib_flow_action *maction,
+	u8 ft_type, u8 dv_prt,
+	void *in, size_t len)
+{
+	enum mlx5_flow_namespace_type namespace;
+	u8 prm_prt;
+	int ret;
+
+	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
+	if (ret)
+		return ret;
+
+	ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt);
+	if (ret)
+		return ret;
+
+	maction->flow_action_raw.pkt_reformat =
+		mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
+					   in, namespace);
+	if (IS_ERR(maction->flow_action_raw.pkt_reformat)) {
+		ret = PTR_ERR(maction->flow_action_raw.pkt_reformat);
+		return ret;
+	}
+
+	maction->flow_action_raw.sub_type =
+		MLX5_IB_FLOW_ACTION_PACKET_REFORMAT;
+	maction->flow_action_raw.dev = dev;
+
+	return 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
+		MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE);
+	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt;
+	enum mlx5_ib_uapi_flow_table_type ft_type;
+	struct mlx5_ib_flow_action *maction;
+	int ret;
+
+	ret = uverbs_get_const(&ft_type, attrs,
+			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE);
+	if (ret)
+		return ret;
+
+	ret = uverbs_get_const(&dv_prt, attrs,
+			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE);
+	if (ret)
+		return ret;
+
+	if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type))
+		return -EOPNOTSUPP;
+
+	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
+	if (!maction)
+		return -ENOMEM;
+
+	if (dv_prt ==
+	    MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) {
+		maction->flow_action_raw.sub_type =
+			MLX5_IB_FLOW_ACTION_DECAP;
+		maction->flow_action_raw.dev = mdev;
+	} else {
+		void *in;
+		int len;
+
+		in = uverbs_attr_get_alloced_ptr(attrs,
+			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
+		if (IS_ERR(in)) {
+			ret = PTR_ERR(in);
+			goto free_maction;
+		}
+
+		len = uverbs_attr_get_len(attrs,
+			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
+
+		ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev,
+			maction, ft_type, dv_prt, in, len);
+		if (ret)
+			goto free_maction;
+	}
+
+	uverbs_flow_action_fill_action(&maction->ib_action, uobj, &mdev->ib_dev,
+				       IB_FLOW_ACTION_UNSPECIFIED);
+	return 0;
+
+free_maction:
+	kfree(maction);
+	return ret;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_CREATE_FLOW,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
+			UVERBS_OBJECT_FLOW,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE,
+		UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_MATCHER,
+			MLX5_IB_OBJECT_FLOW_MATCHER,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_QP,
+			UVERBS_OBJECT_QP,
+			UVERBS_ACCESS_READ),
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX,
+			MLX5_IB_OBJECT_DEVX_OBJ,
+			UVERBS_ACCESS_READ),
+	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS,
+			     UVERBS_OBJECT_FLOW_ACTION,
+			     UVERBS_ACCESS_READ, 1,
+			     MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS,
+			     UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_OPTIONAL),
+	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
+			     MLX5_IB_OBJECT_DEVX_OBJ,
+			     UVERBS_ACCESS_READ, 1, 1,
+			     UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET,
+			   UVERBS_ATTR_MIN_SIZE(sizeof(u32)),
+			   UA_OPTIONAL,
+			   UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_FLAGS,
+			     enum mlx5_ib_create_flow_flags,
+			     UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_DESTROY_FLOW,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
+			UVERBS_OBJECT_FLOW,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+ADD_UVERBS_METHODS(mlx5_ib_fs,
+		   UVERBS_OBJECT_FLOW,
+		   &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW),
+		   &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE,
+			UVERBS_OBJECT_FLOW_ACTION,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
+			   UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES(
+				   set_add_copy_action_in_auto)),
+			   UA_MANDATORY,
+			   UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE,
+			UVERBS_OBJECT_FLOW_ACTION,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF,
+			   UVERBS_ATTR_MIN_SIZE(1),
+			   UA_ALLOC_AND_COPY,
+			   UA_OPTIONAL),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE,
+			     enum mlx5_ib_uapi_flow_action_packet_reformat_type,
+			     UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_MANDATORY));
+
+ADD_UVERBS_METHODS(
+	mlx5_ib_flow_actions,
+	UVERBS_OBJECT_FLOW_ACTION,
+	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER),
+	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_FLOW_MATCHER_CREATE,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE,
+			MLX5_IB_OBJECT_FLOW_MATCHER,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK,
+		UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)),
+		UA_MANDATORY),
+	UVERBS_ATTR_ENUM_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE,
+			    mlx5_ib_flow_type,
+			    UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA,
+			   UVERBS_ATTR_TYPE(u8),
+			   UA_MANDATORY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
+			     enum ib_flow_flags,
+			     UA_OPTIONAL),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_FLOW_MATCHER,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
+			    UVERBS_TYPE_ALLOC_IDR(flow_matcher_cleanup),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
+
+const struct uapi_definition mlx5_ib_flow_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_FLOW_MATCHER),
+	UAPI_DEF_CHAIN_OBJ_TREE(
+		UVERBS_OBJECT_FLOW,
+		&mlx5_ib_fs),
+	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
+				&mlx5_ib_flow_actions),
+	{},
+};
+
+static const struct ib_device_ops flow_ops = {
+	.create_flow = mlx5_ib_create_flow,
+	.destroy_flow = mlx5_ib_destroy_flow,
+	.destroy_flow_action = mlx5_ib_destroy_flow_action,
+};
+
+static const struct ib_device_ops flow_ipsec_ops = {
+	.create_flow_action_esp = mlx5_ib_create_flow_action_esp,
+	.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
+};
+
+int mlx5_ib_fs_init(struct mlx5_ib_dev *dev)
+{
+	dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
+
+	if (!dev->flow_db)
+		return -ENOMEM;
+
+	mutex_init(&dev->flow_db->lock);
+
+	ib_set_device_ops(&dev->ib_dev, &flow_ops);
+	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+	    MLX5_ACCEL_IPSEC_CAP_DEVICE)
+		ib_set_device_ops(&dev->ib_dev, &flow_ipsec_ops);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h
new file mode 100644
index 0000000..ad320ad
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/fs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_FS_H
+#define _MLX5_IB_FS_H
+
+#include "mlx5_ib.h"
+
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+int mlx5_ib_fs_init(struct mlx5_ib_dev *dev);
+#else
+static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev)
+{
+	dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
+
+	if (!dev->flow_db)
+		return -ENOMEM;
+
+	mutex_init(&dev->flow_db->lock);
+	return 0;
+}
+#endif
+static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev)
+{
+	kfree(dev->flow_db);
+}
+#endif /* _MLX5_IB_FS_H */
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 6f99ed0..fbc45a5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1,33 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
- * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
  */
 
 #include <linux/debugfs.h>
@@ -59,10 +32,13 @@
 #include "mlx5_ib.h"
 #include "ib_rep.h"
 #include "cmd.h"
+#include "devx.h"
+#include "fs.h"
 #include "srq.h"
 #include "qp.h"
 #include "wr.h"
-#include <linux/mlx5/fs_helpers.h>
+#include "restrack.h"
+#include "counters.h"
 #include <linux/mlx5/accel.h>
 #include <rdma/uverbs_std_types.h>
 #include <rdma/mlx5_user_ioctl_verbs.h>
@@ -311,9 +287,6 @@
 		*native_port_num = 1;
 
 	port = &ibdev->port[ib_port_num - 1];
-	if (!port)
-		return NULL;
-
 	spin_lock(&port->mp.mpi_lock);
 	mpi = ibdev->port[ib_port_num - 1].mp.mpi;
 	if (mpi && !mpi->unaffiliate) {
@@ -1765,6 +1738,92 @@
 	mlx5_ib_disable_lb(dev, true, false);
 }
 
+static int set_ucontext_resp(struct ib_ucontext *uctx,
+			     struct mlx5_ib_alloc_ucontext_resp *resp)
+{
+	struct ib_device *ibdev = uctx->device;
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
+	struct mlx5_bfreg_info *bfregi = &context->bfregi;
+	int err;
+
+	if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
+		err = mlx5_cmd_dump_fill_mkey(dev->mdev,
+					      &resp->dump_fill_mkey);
+		if (err)
+			return err;
+		resp->comp_mask |=
+			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
+	}
+
+	resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
+	if (dev->wc_support)
+		resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev,
+						      log_bf_reg_size);
+	resp->cache_line_size = cache_line_size();
+	resp->max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
+	resp->max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
+	resp->max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
+	resp->max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
+	resp->max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
+	resp->cqe_version = context->cqe_version;
+	resp->log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+				MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
+	resp->num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+					MLX5_CAP_GEN(dev->mdev,
+						     num_of_uars_per_page) : 1;
+
+	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+				MLX5_ACCEL_IPSEC_CAP_DEVICE) {
+		if (mlx5_get_flow_namespace(dev->mdev,
+				MLX5_FLOW_NAMESPACE_EGRESS))
+			resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
+		if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+				MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
+			resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
+		if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
+			resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
+		if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+				MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
+			resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
+		/* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
+	}
+
+	resp->tot_bfregs = bfregi->lib_uar_dyn ? 0 :
+			bfregi->total_num_bfregs - bfregi->num_dyn_bfregs;
+	resp->num_ports = dev->num_ports;
+	resp->cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
+				      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
+
+	if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
+		mlx5_query_min_inline(dev->mdev, &resp->eth_min_inline);
+		resp->eth_min_inline++;
+	}
+
+	if (dev->mdev->clock_info)
+		resp->clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
+
+	/*
+	 * We don't want to expose information from the PCI bar that is located
+	 * after 4096 bytes, so if the arch only supports larger pages, let's
+	 * pretend we don't support reading the HCA's core clock. This is also
+	 * forced by mmap function.
+	 */
+	if (PAGE_SIZE <= 4096) {
+		resp->comp_mask |=
+			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
+		resp->hca_core_clock_offset =
+			offsetof(struct mlx5_init_seg,
+				 internal_timer_h) % PAGE_SIZE;
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, ece_support))
+		resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE;
+
+	resp->num_dyn_bfregs = bfregi->num_dyn_bfregs;
+	return 0;
+}
+
 static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 				  struct ib_udata *udata)
 {
@@ -1772,14 +1831,12 @@
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
 	struct mlx5_ib_alloc_ucontext_resp resp = {};
-	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
 	struct mlx5_bfreg_info *bfregi;
 	int ver;
 	int err;
 	size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
 				     max_cqe_version);
-	u32 dump_fill_mkey;
 	bool lib_uar_4k;
 	bool lib_uar_dyn;
 
@@ -1808,37 +1865,6 @@
 	if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
 		return -EINVAL;
 
-	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
-	if (dev->wc_support)
-		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
-	resp.cache_line_size = cache_line_size();
-	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
-	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
-	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
-	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
-	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
-	resp.cqe_version = min_t(__u8,
-				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
-				 req.max_cqe_version);
-	resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
-				MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
-	resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
-					MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
-	resp.response_length = min(offsetof(typeof(resp), response_length) +
-				   sizeof(resp.response_length), udata->outlen);
-
-	if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) {
-		if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS))
-			resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
-		if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
-			resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
-		if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
-			resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
-		if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
-			resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
-		/* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
-	}
-
 	lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
 	lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
 	bfregi = &context->bfregi;
@@ -1887,87 +1913,24 @@
 	if (err)
 		goto out_devx;
 
-	if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
-		err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey);
-		if (err)
-			goto out_mdev;
-	}
-
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 
-	resp.tot_bfregs = lib_uar_dyn ? 0 : req.total_num_bfregs;
-	resp.num_ports = dev->num_ports;
+	context->cqe_version = min_t(__u8,
+				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
+				 req.max_cqe_version);
 
-	if (offsetofend(typeof(resp), cqe_version) <= udata->outlen)
-		resp.response_length += sizeof(resp.cqe_version);
+	err = set_ucontext_resp(uctx, &resp);
+	if (err)
+		goto out_mdev;
 
-	if (offsetofend(typeof(resp), cmds_supp_uhw) <= udata->outlen) {
-		resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
-				      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
-		resp.response_length += sizeof(resp.cmds_supp_uhw);
-	}
-
-	if (offsetofend(typeof(resp), eth_min_inline) <= udata->outlen) {
-		if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
-			mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
-			resp.eth_min_inline++;
-		}
-		resp.response_length += sizeof(resp.eth_min_inline);
-	}
-
-	if (offsetofend(typeof(resp), clock_info_versions) <= udata->outlen) {
-		if (mdev->clock_info)
-			resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
-		resp.response_length += sizeof(resp.clock_info_versions);
-	}
-
-	/*
-	 * We don't want to expose information from the PCI bar that is located
-	 * after 4096 bytes, so if the arch only supports larger pages, let's
-	 * pretend we don't support reading the HCA's core clock. This is also
-	 * forced by mmap function.
-	 */
-	if (offsetofend(typeof(resp), hca_core_clock_offset) <= udata->outlen) {
-		if (PAGE_SIZE <= 4096) {
-			resp.comp_mask |=
-				MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
-			resp.hca_core_clock_offset =
-				offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
-		}
-		resp.response_length += sizeof(resp.hca_core_clock_offset);
-	}
-
-	if (offsetofend(typeof(resp), log_uar_size) <= udata->outlen)
-		resp.response_length += sizeof(resp.log_uar_size);
-
-	if (offsetofend(typeof(resp), num_uars_per_page) <= udata->outlen)
-		resp.response_length += sizeof(resp.num_uars_per_page);
-
-	if (offsetofend(typeof(resp), num_dyn_bfregs) <= udata->outlen) {
-		resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
-		resp.response_length += sizeof(resp.num_dyn_bfregs);
-	}
-
-	if (offsetofend(typeof(resp), dump_fill_mkey) <= udata->outlen) {
-		if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
-			resp.dump_fill_mkey = dump_fill_mkey;
-			resp.comp_mask |=
-				MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
-		}
-		resp.response_length += sizeof(resp.dump_fill_mkey);
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, ece_support))
-		resp.comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE;
-
+	resp.response_length = min(udata->outlen, sizeof(resp));
 	err = ib_copy_to_udata(udata, &resp, resp.response_length);
 	if (err)
 		goto out_mdev;
 
 	bfregi->ver = ver;
 	bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
-	context->cqe_version = resp.cqe_version;
 	context->lib_caps = req.lib_caps;
 	print_lib_caps(dev, context->lib_caps);
 
@@ -2000,6 +1963,29 @@
 	return err;
 }
 
+static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext,
+				  struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_alloc_ucontext_resp uctx_resp = {};
+	int ret;
+
+	ret = set_ucontext_resp(ibcontext, &uctx_resp);
+	if (ret)
+		return ret;
+
+	uctx_resp.response_length =
+		min_t(size_t,
+		      uverbs_attr_get_len(attrs,
+				MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX),
+		      sizeof(uctx_resp));
+
+	ret = uverbs_copy_to_struct_or_zero(attrs,
+					MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
+					&uctx_resp,
+					sizeof(uctx_resp));
+	return ret;
+}
+
 static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
@@ -2591,1820 +2577,6 @@
 	mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
 }
 
-enum {
-	MATCH_CRITERIA_ENABLE_OUTER_BIT,
-	MATCH_CRITERIA_ENABLE_MISC_BIT,
-	MATCH_CRITERIA_ENABLE_INNER_BIT,
-	MATCH_CRITERIA_ENABLE_MISC2_BIT
-};
-
-#define HEADER_IS_ZERO(match_criteria, headers)			           \
-	!(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
-		    0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
-
-static u8 get_match_criteria_enable(u32 *match_criteria)
-{
-	u8 match_criteria_enable;
-
-	match_criteria_enable =
-		(!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
-		MATCH_CRITERIA_ENABLE_OUTER_BIT;
-	match_criteria_enable |=
-		(!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
-		MATCH_CRITERIA_ENABLE_MISC_BIT;
-	match_criteria_enable |=
-		(!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
-		MATCH_CRITERIA_ENABLE_INNER_BIT;
-	match_criteria_enable |=
-		(!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
-		MATCH_CRITERIA_ENABLE_MISC2_BIT;
-
-	return match_criteria_enable;
-}
-
-static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
-{
-	u8 entry_mask;
-	u8 entry_val;
-	int err = 0;
-
-	if (!mask)
-		goto out;
-
-	entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
-			      ip_protocol);
-	entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
-			     ip_protocol);
-	if (!entry_mask) {
-		MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
-		MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
-		goto out;
-	}
-	/* Don't override existing ip protocol */
-	if (mask != entry_mask || val != entry_val)
-		err = -EINVAL;
-out:
-	return err;
-}
-
-static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
-			   bool inner)
-{
-	if (inner) {
-		MLX5_SET(fte_match_set_misc,
-			 misc_c, inner_ipv6_flow_label, mask);
-		MLX5_SET(fte_match_set_misc,
-			 misc_v, inner_ipv6_flow_label, val);
-	} else {
-		MLX5_SET(fte_match_set_misc,
-			 misc_c, outer_ipv6_flow_label, mask);
-		MLX5_SET(fte_match_set_misc,
-			 misc_v, outer_ipv6_flow_label, val);
-	}
-}
-
-static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
-{
-	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
-	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
-	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
-	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
-}
-
-static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask)
-{
-	if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) &&
-	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL))
-		return -EOPNOTSUPP;
-
-	if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) &&
-	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP))
-		return -EOPNOTSUPP;
-
-	if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) &&
-	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS))
-		return -EOPNOTSUPP;
-
-	if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) &&
-	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL))
-		return -EOPNOTSUPP;
-
-	return 0;
-}
-
-#define LAST_ETH_FIELD vlan_tag
-#define LAST_IB_FIELD sl
-#define LAST_IPV4_FIELD tos
-#define LAST_IPV6_FIELD traffic_class
-#define LAST_TCP_UDP_FIELD src_port
-#define LAST_TUNNEL_FIELD tunnel_id
-#define LAST_FLOW_TAG_FIELD tag_id
-#define LAST_DROP_FIELD size
-#define LAST_COUNTERS_FIELD counters
-
-/* Field is the last supported field */
-#define FIELDS_NOT_SUPPORTED(filter, field)\
-	memchr_inv((void *)&filter.field  +\
-		   sizeof(filter.field), 0,\
-		   sizeof(filter) -\
-		   offsetof(typeof(filter), field) -\
-		   sizeof(filter.field))
-
-int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
-			   bool is_egress,
-			   struct mlx5_flow_act *action)
-{
-
-	switch (maction->ib_action.type) {
-	case IB_FLOW_ACTION_ESP:
-		if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
-				      MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
-			return -EINVAL;
-		/* Currently only AES_GCM keymat is supported by the driver */
-		action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
-		action->action |= is_egress ?
-			MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
-			MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
-		return 0;
-	case IB_FLOW_ACTION_UNSPECIFIED:
-		if (maction->flow_action_raw.sub_type ==
-		    MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
-			if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
-				return -EINVAL;
-			action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
-			action->modify_hdr =
-				maction->flow_action_raw.modify_hdr;
-			return 0;
-		}
-		if (maction->flow_action_raw.sub_type ==
-		    MLX5_IB_FLOW_ACTION_DECAP) {
-			if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
-				return -EINVAL;
-			action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
-			return 0;
-		}
-		if (maction->flow_action_raw.sub_type ==
-		    MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
-			if (action->action &
-			    MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
-				return -EINVAL;
-			action->action |=
-				MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
-			action->pkt_reformat =
-				maction->flow_action_raw.pkt_reformat;
-			return 0;
-		}
-		/* fall through */
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int parse_flow_attr(struct mlx5_core_dev *mdev,
-			   struct mlx5_flow_spec *spec,
-			   const union ib_flow_spec *ib_spec,
-			   const struct ib_flow_attr *flow_attr,
-			   struct mlx5_flow_act *action, u32 prev_type)
-{
-	struct mlx5_flow_context *flow_context = &spec->flow_context;
-	u32 *match_c = spec->match_criteria;
-	u32 *match_v = spec->match_value;
-	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
-					   misc_parameters);
-	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
-					   misc_parameters);
-	void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c,
-					    misc_parameters_2);
-	void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v,
-					    misc_parameters_2);
-	void *headers_c;
-	void *headers_v;
-	int match_ipv;
-	int ret;
-
-	if (ib_spec->type & IB_FLOW_SPEC_INNER) {
-		headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
-					 inner_headers);
-		headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
-					 inner_headers);
-		match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					ft_field_support.inner_ip_version);
-	} else {
-		headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
-					 outer_headers);
-		headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
-					 outer_headers);
-		match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					ft_field_support.outer_ip_version);
-	}
-
-	switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
-	case IB_FLOW_SPEC_ETH:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
-			return -EOPNOTSUPP;
-
-		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-					     dmac_47_16),
-				ib_spec->eth.mask.dst_mac);
-		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-					     dmac_47_16),
-				ib_spec->eth.val.dst_mac);
-
-		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-					     smac_47_16),
-				ib_spec->eth.mask.src_mac);
-		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-					     smac_47_16),
-				ib_spec->eth.val.src_mac);
-
-		if (ib_spec->eth.mask.vlan_tag) {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 cvlan_tag, 1);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 cvlan_tag, 1);
-
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 first_vid, ntohs(ib_spec->eth.val.vlan_tag));
-
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 first_cfi,
-				 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 first_cfi,
-				 ntohs(ib_spec->eth.val.vlan_tag) >> 12);
-
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 first_prio,
-				 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 first_prio,
-				 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
-		}
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-			 ethertype, ntohs(ib_spec->eth.mask.ether_type));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-			 ethertype, ntohs(ib_spec->eth.val.ether_type));
-		break;
-	case IB_FLOW_SPEC_IPV4:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
-			return -EOPNOTSUPP;
-
-		if (match_ipv) {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 ip_version, 0xf);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 ip_version, MLX5_FS_IPV4_VERSION);
-		} else {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 ethertype, 0xffff);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 ethertype, ETH_P_IP);
-		}
-
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
-		       &ib_spec->ipv4.mask.src_ip,
-		       sizeof(ib_spec->ipv4.mask.src_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
-		       &ib_spec->ipv4.val.src_ip,
-		       sizeof(ib_spec->ipv4.val.src_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
-		       &ib_spec->ipv4.mask.dst_ip,
-		       sizeof(ib_spec->ipv4.mask.dst_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
-		       &ib_spec->ipv4.val.dst_ip,
-		       sizeof(ib_spec->ipv4.val.dst_ip));
-
-		set_tos(headers_c, headers_v,
-			ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
-
-		if (set_proto(headers_c, headers_v,
-			      ib_spec->ipv4.mask.proto,
-			      ib_spec->ipv4.val.proto))
-			return -EINVAL;
-		break;
-	case IB_FLOW_SPEC_IPV6:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
-			return -EOPNOTSUPP;
-
-		if (match_ipv) {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 ip_version, 0xf);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 ip_version, MLX5_FS_IPV6_VERSION);
-		} else {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 ethertype, 0xffff);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 ethertype, ETH_P_IPV6);
-		}
-
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
-		       &ib_spec->ipv6.mask.src_ip,
-		       sizeof(ib_spec->ipv6.mask.src_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
-		       &ib_spec->ipv6.val.src_ip,
-		       sizeof(ib_spec->ipv6.val.src_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
-		       &ib_spec->ipv6.mask.dst_ip,
-		       sizeof(ib_spec->ipv6.mask.dst_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
-		       &ib_spec->ipv6.val.dst_ip,
-		       sizeof(ib_spec->ipv6.val.dst_ip));
-
-		set_tos(headers_c, headers_v,
-			ib_spec->ipv6.mask.traffic_class,
-			ib_spec->ipv6.val.traffic_class);
-
-		if (set_proto(headers_c, headers_v,
-			      ib_spec->ipv6.mask.next_hdr,
-			      ib_spec->ipv6.val.next_hdr))
-			return -EINVAL;
-
-		set_flow_label(misc_params_c, misc_params_v,
-			       ntohl(ib_spec->ipv6.mask.flow_label),
-			       ntohl(ib_spec->ipv6.val.flow_label),
-			       ib_spec->type & IB_FLOW_SPEC_INNER);
-		break;
-	case IB_FLOW_SPEC_ESP:
-		if (ib_spec->esp.mask.seq)
-			return -EOPNOTSUPP;
-
-		MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi,
-			 ntohl(ib_spec->esp.mask.spi));
-		MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
-			 ntohl(ib_spec->esp.val.spi));
-		break;
-	case IB_FLOW_SPEC_TCP:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
-					 LAST_TCP_UDP_FIELD))
-			return -EOPNOTSUPP;
-
-		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
-			return -EINVAL;
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
-			 ntohs(ib_spec->tcp_udp.mask.src_port));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
-			 ntohs(ib_spec->tcp_udp.val.src_port));
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
-			 ntohs(ib_spec->tcp_udp.mask.dst_port));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
-			 ntohs(ib_spec->tcp_udp.val.dst_port));
-		break;
-	case IB_FLOW_SPEC_UDP:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
-					 LAST_TCP_UDP_FIELD))
-			return -EOPNOTSUPP;
-
-		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
-			return -EINVAL;
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
-			 ntohs(ib_spec->tcp_udp.mask.src_port));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
-			 ntohs(ib_spec->tcp_udp.val.src_port));
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
-			 ntohs(ib_spec->tcp_udp.mask.dst_port));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
-			 ntohs(ib_spec->tcp_udp.val.dst_port));
-		break;
-	case IB_FLOW_SPEC_GRE:
-		if (ib_spec->gre.mask.c_ks_res0_ver)
-			return -EOPNOTSUPP;
-
-		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
-			return -EINVAL;
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
-			 0xff);
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
-			 IPPROTO_GRE);
-
-		MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol,
-			 ntohs(ib_spec->gre.mask.protocol));
-		MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol,
-			 ntohs(ib_spec->gre.val.protocol));
-
-		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
-				    gre_key.nvgre.hi),
-		       &ib_spec->gre.mask.key,
-		       sizeof(ib_spec->gre.mask.key));
-		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
-				    gre_key.nvgre.hi),
-		       &ib_spec->gre.val.key,
-		       sizeof(ib_spec->gre.val.key));
-		break;
-	case IB_FLOW_SPEC_MPLS:
-		switch (prev_type) {
-		case IB_FLOW_SPEC_UDP:
-			if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-						   ft_field_support.outer_first_mpls_over_udp),
-						   &ib_spec->mpls.mask.tag))
-				return -EOPNOTSUPP;
-
-			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
-					    outer_first_mpls_over_udp),
-			       &ib_spec->mpls.val.tag,
-			       sizeof(ib_spec->mpls.val.tag));
-			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
-					    outer_first_mpls_over_udp),
-			       &ib_spec->mpls.mask.tag,
-			       sizeof(ib_spec->mpls.mask.tag));
-			break;
-		case IB_FLOW_SPEC_GRE:
-			if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-						   ft_field_support.outer_first_mpls_over_gre),
-						   &ib_spec->mpls.mask.tag))
-				return -EOPNOTSUPP;
-
-			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
-					    outer_first_mpls_over_gre),
-			       &ib_spec->mpls.val.tag,
-			       sizeof(ib_spec->mpls.val.tag));
-			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
-					    outer_first_mpls_over_gre),
-			       &ib_spec->mpls.mask.tag,
-			       sizeof(ib_spec->mpls.mask.tag));
-			break;
-		default:
-			if (ib_spec->type & IB_FLOW_SPEC_INNER) {
-				if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-							   ft_field_support.inner_first_mpls),
-							   &ib_spec->mpls.mask.tag))
-					return -EOPNOTSUPP;
-
-				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
-						    inner_first_mpls),
-				       &ib_spec->mpls.val.tag,
-				       sizeof(ib_spec->mpls.val.tag));
-				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
-						    inner_first_mpls),
-				       &ib_spec->mpls.mask.tag,
-				       sizeof(ib_spec->mpls.mask.tag));
-			} else {
-				if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-							   ft_field_support.outer_first_mpls),
-							   &ib_spec->mpls.mask.tag))
-					return -EOPNOTSUPP;
-
-				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
-						    outer_first_mpls),
-				       &ib_spec->mpls.val.tag,
-				       sizeof(ib_spec->mpls.val.tag));
-				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
-						    outer_first_mpls),
-				       &ib_spec->mpls.mask.tag,
-				       sizeof(ib_spec->mpls.mask.tag));
-			}
-		}
-		break;
-	case IB_FLOW_SPEC_VXLAN_TUNNEL:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
-					 LAST_TUNNEL_FIELD))
-			return -EOPNOTSUPP;
-
-		MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
-			 ntohl(ib_spec->tunnel.mask.tunnel_id));
-		MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
-			 ntohl(ib_spec->tunnel.val.tunnel_id));
-		break;
-	case IB_FLOW_SPEC_ACTION_TAG:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
-					 LAST_FLOW_TAG_FIELD))
-			return -EOPNOTSUPP;
-		if (ib_spec->flow_tag.tag_id >= BIT(24))
-			return -EINVAL;
-
-		flow_context->flow_tag = ib_spec->flow_tag.tag_id;
-		flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
-		break;
-	case IB_FLOW_SPEC_ACTION_DROP:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
-					 LAST_DROP_FIELD))
-			return -EOPNOTSUPP;
-		action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
-		break;
-	case IB_FLOW_SPEC_ACTION_HANDLE:
-		ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
-			flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
-		if (ret)
-			return ret;
-		break;
-	case IB_FLOW_SPEC_ACTION_COUNT:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
-					 LAST_COUNTERS_FIELD))
-			return -EOPNOTSUPP;
-
-		/* for now support only one counters spec per flow */
-		if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
-			return -EINVAL;
-
-		action->counters = ib_spec->flow_count.counters;
-		action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/* If a flow could catch both multicast and unicast packets,
- * it won't fall into the multicast flow steering table and this rule
- * could steal other multicast packets.
- */
-static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr)
-{
-	union ib_flow_spec *flow_spec;
-
-	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
-	    ib_attr->num_of_specs < 1)
-		return false;
-
-	flow_spec = (union ib_flow_spec *)(ib_attr + 1);
-	if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
-		struct ib_flow_spec_ipv4 *ipv4_spec;
-
-		ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
-		if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
-			return true;
-
-		return false;
-	}
-
-	if (flow_spec->type == IB_FLOW_SPEC_ETH) {
-		struct ib_flow_spec_eth *eth_spec;
-
-		eth_spec = (struct ib_flow_spec_eth *)flow_spec;
-		return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
-		       is_multicast_ether_addr(eth_spec->val.dst_mac);
-	}
-
-	return false;
-}
-
-enum valid_spec {
-	VALID_SPEC_INVALID,
-	VALID_SPEC_VALID,
-	VALID_SPEC_NA,
-};
-
-static enum valid_spec
-is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
-		     const struct mlx5_flow_spec *spec,
-		     const struct mlx5_flow_act *flow_act,
-		     bool egress)
-{
-	const u32 *match_c = spec->match_criteria;
-	bool is_crypto =
-		(flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
-				     MLX5_FLOW_CONTEXT_ACTION_DECRYPT));
-	bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c);
-	bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP;
-
-	/*
-	 * Currently only crypto is supported in egress, when regular egress
-	 * rules would be supported, always return VALID_SPEC_NA.
-	 */
-	if (!is_crypto)
-		return VALID_SPEC_NA;
-
-	return is_crypto && is_ipsec &&
-		(!egress || (!is_drop &&
-			     !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
-		VALID_SPEC_VALID : VALID_SPEC_INVALID;
-}
-
-static bool is_valid_spec(struct mlx5_core_dev *mdev,
-			  const struct mlx5_flow_spec *spec,
-			  const struct mlx5_flow_act *flow_act,
-			  bool egress)
-{
-	/* We currently only support ipsec egress flow */
-	return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID;
-}
-
-static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
-			       const struct ib_flow_attr *flow_attr,
-			       bool check_inner)
-{
-	union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
-	int match_ipv = check_inner ?
-			MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					ft_field_support.inner_ip_version) :
-			MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					ft_field_support.outer_ip_version);
-	int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
-	bool ipv4_spec_valid, ipv6_spec_valid;
-	unsigned int ip_spec_type = 0;
-	bool has_ethertype = false;
-	unsigned int spec_index;
-	bool mask_valid = true;
-	u16 eth_type = 0;
-	bool type_valid;
-
-	/* Validate that ethertype is correct */
-	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-		if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
-		    ib_spec->eth.mask.ether_type) {
-			mask_valid = (ib_spec->eth.mask.ether_type ==
-				      htons(0xffff));
-			has_ethertype = true;
-			eth_type = ntohs(ib_spec->eth.val.ether_type);
-		} else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
-			   (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
-			ip_spec_type = ib_spec->type;
-		}
-		ib_spec = (void *)ib_spec + ib_spec->size;
-	}
-
-	type_valid = (!has_ethertype) || (!ip_spec_type);
-	if (!type_valid && mask_valid) {
-		ipv4_spec_valid = (eth_type == ETH_P_IP) &&
-			(ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
-		ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
-			(ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
-
-		type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
-			     (((eth_type == ETH_P_MPLS_UC) ||
-			       (eth_type == ETH_P_MPLS_MC)) && match_ipv);
-	}
-
-	return type_valid;
-}
-
-static bool is_valid_attr(struct mlx5_core_dev *mdev,
-			  const struct ib_flow_attr *flow_attr)
-{
-	return is_valid_ethertype(mdev, flow_attr, false) &&
-	       is_valid_ethertype(mdev, flow_attr, true);
-}
-
-static void put_flow_table(struct mlx5_ib_dev *dev,
-			   struct mlx5_ib_flow_prio *prio, bool ft_added)
-{
-	prio->refcount -= !!ft_added;
-	if (!prio->refcount) {
-		mlx5_destroy_flow_table(prio->flow_table);
-		prio->flow_table = NULL;
-	}
-}
-
-static void counters_clear_description(struct ib_counters *counters)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
-
-	mutex_lock(&mcounters->mcntrs_mutex);
-	kfree(mcounters->counters_data);
-	mcounters->counters_data = NULL;
-	mcounters->cntrs_max_index = 0;
-	mutex_unlock(&mcounters->mcntrs_mutex);
-}
-
-static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
-{
-	struct mlx5_ib_flow_handler *handler = container_of(flow_id,
-							  struct mlx5_ib_flow_handler,
-							  ibflow);
-	struct mlx5_ib_flow_handler *iter, *tmp;
-	struct mlx5_ib_dev *dev = handler->dev;
-
-	mutex_lock(&dev->flow_db->lock);
-
-	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
-		mlx5_del_flow_rules(iter->rule);
-		put_flow_table(dev, iter->prio, true);
-		list_del(&iter->list);
-		kfree(iter);
-	}
-
-	mlx5_del_flow_rules(handler->rule);
-	put_flow_table(dev, handler->prio, true);
-	if (handler->ibcounters &&
-	    atomic_read(&handler->ibcounters->usecnt) == 1)
-		counters_clear_description(handler->ibcounters);
-
-	mutex_unlock(&dev->flow_db->lock);
-	if (handler->flow_matcher)
-		atomic_dec(&handler->flow_matcher->usecnt);
-	kfree(handler);
-
-	return 0;
-}
-
-static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
-{
-	priority *= 2;
-	if (!dont_trap)
-		priority++;
-	return priority;
-}
-
-enum flow_table_type {
-	MLX5_IB_FT_RX,
-	MLX5_IB_FT_TX
-};
-
-#define MLX5_FS_MAX_TYPES	 6
-#define MLX5_FS_MAX_ENTRIES	 BIT(16)
-
-static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
-					   struct mlx5_ib_flow_prio *prio,
-					   int priority,
-					   int num_entries, int num_groups,
-					   u32 flags)
-{
-	struct mlx5_flow_table_attr ft_attr = {};
-	struct mlx5_flow_table *ft;
-
-	ft_attr.prio = priority;
-	ft_attr.max_fte = num_entries;
-	ft_attr.flags = flags;
-	ft_attr.autogroup.max_num_groups = num_groups;
-	ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
-	if (IS_ERR(ft))
-		return ERR_CAST(ft);
-
-	prio->flow_table = ft;
-	prio->refcount = 0;
-	return prio;
-}
-
-static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
-						struct ib_flow_attr *flow_attr,
-						enum flow_table_type ft_type)
-{
-	bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
-	struct mlx5_flow_namespace *ns = NULL;
-	struct mlx5_ib_flow_prio *prio;
-	struct mlx5_flow_table *ft;
-	int max_table_size;
-	int num_entries;
-	int num_groups;
-	bool esw_encap;
-	u32 flags = 0;
-	int priority;
-
-	max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-						       log_max_ft_size));
-	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
-		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
-	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
-		enum mlx5_flow_namespace_type fn_type;
-
-		if (flow_is_multicast_only(flow_attr) &&
-		    !dont_trap)
-			priority = MLX5_IB_FLOW_MCAST_PRIO;
-		else
-			priority = ib_prio_to_core_prio(flow_attr->priority,
-							dont_trap);
-		if (ft_type == MLX5_IB_FT_RX) {
-			fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
-			prio = &dev->flow_db->prios[priority];
-			if (!dev->is_rep && !esw_encap &&
-			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
-				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
-			if (!dev->is_rep && !esw_encap &&
-			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-					reformat_l3_tunnel_to_l2))
-				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-		} else {
-			max_table_size =
-				BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
-							      log_max_ft_size));
-			fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
-			prio = &dev->flow_db->egress_prios[priority];
-			if (!dev->is_rep && !esw_encap &&
-			    MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
-				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-		}
-		ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
-		num_entries = MLX5_FS_MAX_ENTRIES;
-		num_groups = MLX5_FS_MAX_TYPES;
-	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
-		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
-		ns = mlx5_get_flow_namespace(dev->mdev,
-					     MLX5_FLOW_NAMESPACE_LEFTOVERS);
-		build_leftovers_ft_param(&priority,
-					 &num_entries,
-					 &num_groups);
-		prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
-	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
-		if (!MLX5_CAP_FLOWTABLE(dev->mdev,
-					allow_sniffer_and_nic_rx_shared_tir))
-			return ERR_PTR(-ENOTSUPP);
-
-		ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
-					     MLX5_FLOW_NAMESPACE_SNIFFER_RX :
-					     MLX5_FLOW_NAMESPACE_SNIFFER_TX);
-
-		prio = &dev->flow_db->sniffer[ft_type];
-		priority = 0;
-		num_entries = 1;
-		num_groups = 1;
-	}
-
-	if (!ns)
-		return ERR_PTR(-ENOTSUPP);
-
-	max_table_size = min_t(int, num_entries, max_table_size);
-
-	ft = prio->flow_table;
-	if (!ft)
-		return _get_prio(ns, prio, priority, max_table_size, num_groups,
-				 flags);
-
-	return prio;
-}
-
-static void set_underlay_qp(struct mlx5_ib_dev *dev,
-			    struct mlx5_flow_spec *spec,
-			    u32 underlay_qpn)
-{
-	void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
-					   spec->match_criteria,
-					   misc_parameters);
-	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
-					   misc_parameters);
-
-	if (underlay_qpn &&
-	    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-				      ft_field_support.bth_dst_qp)) {
-		MLX5_SET(fte_match_set_misc,
-			 misc_params_v, bth_dst_qp, underlay_qpn);
-		MLX5_SET(fte_match_set_misc,
-			 misc_params_c, bth_dst_qp, 0xffffff);
-	}
-}
-
-static int read_flow_counters(struct ib_device *ibdev,
-			      struct mlx5_read_counters_attr *read_attr)
-{
-	struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-
-	return mlx5_fc_query(dev->mdev, fc,
-			     &read_attr->out[IB_COUNTER_PACKETS],
-			     &read_attr->out[IB_COUNTER_BYTES]);
-}
-
-/* flow counters currently expose two counters packets and bytes */
-#define FLOW_COUNTERS_NUM 2
-static int counters_set_description(struct ib_counters *counters,
-				    enum mlx5_ib_counters_type counters_type,
-				    struct mlx5_ib_flow_counters_desc *desc_data,
-				    u32 ncounters)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
-	u32 cntrs_max_index = 0;
-	int i;
-
-	if (counters_type != MLX5_IB_COUNTERS_FLOW)
-		return -EINVAL;
-
-	/* init the fields for the object */
-	mcounters->type = counters_type;
-	mcounters->read_counters = read_flow_counters;
-	mcounters->counters_num = FLOW_COUNTERS_NUM;
-	mcounters->ncounters = ncounters;
-	/* each counter entry have both description and index pair */
-	for (i = 0; i < ncounters; i++) {
-		if (desc_data[i].description > IB_COUNTER_BYTES)
-			return -EINVAL;
-
-		if (cntrs_max_index <= desc_data[i].index)
-			cntrs_max_index = desc_data[i].index + 1;
-	}
-
-	mutex_lock(&mcounters->mcntrs_mutex);
-	mcounters->counters_data = desc_data;
-	mcounters->cntrs_max_index = cntrs_max_index;
-	mutex_unlock(&mcounters->mcntrs_mutex);
-
-	return 0;
-}
-
-#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
-static int flow_counters_set_data(struct ib_counters *ibcounters,
-				  struct mlx5_ib_create_flow *ucmd)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
-	struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
-	struct mlx5_ib_flow_counters_desc *desc_data = NULL;
-	bool hw_hndl = false;
-	int ret = 0;
-
-	if (ucmd && ucmd->ncounters_data != 0) {
-		cntrs_data = ucmd->data;
-		if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
-			return -EINVAL;
-
-		desc_data = kcalloc(cntrs_data->ncounters,
-				    sizeof(*desc_data),
-				    GFP_KERNEL);
-		if (!desc_data)
-			return  -ENOMEM;
-
-		if (copy_from_user(desc_data,
-				   u64_to_user_ptr(cntrs_data->counters_data),
-				   sizeof(*desc_data) * cntrs_data->ncounters)) {
-			ret = -EFAULT;
-			goto free;
-		}
-	}
-
-	if (!mcounters->hw_cntrs_hndl) {
-		mcounters->hw_cntrs_hndl = mlx5_fc_create(
-			to_mdev(ibcounters->device)->mdev, false);
-		if (IS_ERR(mcounters->hw_cntrs_hndl)) {
-			ret = PTR_ERR(mcounters->hw_cntrs_hndl);
-			goto free;
-		}
-		hw_hndl = true;
-	}
-
-	if (desc_data) {
-		/* counters already bound to at least one flow */
-		if (mcounters->cntrs_max_index) {
-			ret = -EINVAL;
-			goto free_hndl;
-		}
-
-		ret = counters_set_description(ibcounters,
-					       MLX5_IB_COUNTERS_FLOW,
-					       desc_data,
-					       cntrs_data->ncounters);
-		if (ret)
-			goto free_hndl;
-
-	} else if (!mcounters->cntrs_max_index) {
-		/* counters not bound yet, must have udata passed */
-		ret = -EINVAL;
-		goto free_hndl;
-	}
-
-	return 0;
-
-free_hndl:
-	if (hw_hndl) {
-		mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
-				mcounters->hw_cntrs_hndl);
-		mcounters->hw_cntrs_hndl = NULL;
-	}
-free:
-	kfree(desc_data);
-	return ret;
-}
-
-static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev,
-					 struct mlx5_flow_spec *spec,
-					 struct mlx5_eswitch_rep *rep)
-{
-	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
-	void *misc;
-
-	if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
-				    misc_parameters_2);
-
-		MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
-			 mlx5_eswitch_get_vport_metadata_for_match(esw,
-								   rep->vport));
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
-				    misc_parameters_2);
-
-		MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
-			 mlx5_eswitch_get_vport_metadata_mask());
-	} else {
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
-				    misc_parameters);
-
-		MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport);
-
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
-				    misc_parameters);
-
-		MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
-	}
-}
-
-static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
-						      struct mlx5_ib_flow_prio *ft_prio,
-						      const struct ib_flow_attr *flow_attr,
-						      struct mlx5_flow_destination *dst,
-						      u32 underlay_qpn,
-						      struct mlx5_ib_create_flow *ucmd)
-{
-	struct mlx5_flow_table	*ft = ft_prio->flow_table;
-	struct mlx5_ib_flow_handler *handler;
-	struct mlx5_flow_act flow_act = {};
-	struct mlx5_flow_spec *spec;
-	struct mlx5_flow_destination dest_arr[2] = {};
-	struct mlx5_flow_destination *rule_dst = dest_arr;
-	const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
-	unsigned int spec_index;
-	u32 prev_type = 0;
-	int err = 0;
-	int dest_num = 0;
-	bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
-
-	if (!is_valid_attr(dev->mdev, flow_attr))
-		return ERR_PTR(-EINVAL);
-
-	if (dev->is_rep && is_egress)
-		return ERR_PTR(-EINVAL);
-
-	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
-	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
-	if (!handler || !spec) {
-		err = -ENOMEM;
-		goto free;
-	}
-
-	INIT_LIST_HEAD(&handler->list);
-
-	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-		err = parse_flow_attr(dev->mdev, spec,
-				      ib_flow, flow_attr, &flow_act,
-				      prev_type);
-		if (err < 0)
-			goto free;
-
-		prev_type = ((union ib_flow_spec *)ib_flow)->type;
-		ib_flow += ((union ib_flow_spec *)ib_flow)->size;
-	}
-
-	if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) {
-		memcpy(&dest_arr[0], dst, sizeof(*dst));
-		dest_num++;
-	}
-
-	if (!flow_is_multicast_only(flow_attr))
-		set_underlay_qp(dev, spec, underlay_qpn);
-
-	if (dev->is_rep) {
-		struct mlx5_eswitch_rep *rep;
-
-		rep = dev->port[flow_attr->port - 1].rep;
-		if (!rep) {
-			err = -EINVAL;
-			goto free;
-		}
-
-		mlx5_ib_set_rule_source_port(dev, spec, rep);
-	}
-
-	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
-
-	if (is_egress &&
-	    !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) {
-		err = -EINVAL;
-		goto free;
-	}
-
-	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-		struct mlx5_ib_mcounters *mcounters;
-
-		err = flow_counters_set_data(flow_act.counters, ucmd);
-		if (err)
-			goto free;
-
-		mcounters = to_mcounters(flow_act.counters);
-		handler->ibcounters = flow_act.counters;
-		dest_arr[dest_num].type =
-			MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dest_arr[dest_num].counter_id =
-			mlx5_fc_id(mcounters->hw_cntrs_hndl);
-		dest_num++;
-	}
-
-	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
-		if (!dest_num)
-			rule_dst = NULL;
-	} else {
-		if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)
-			flow_act.action |=
-				MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
-		if (is_egress)
-			flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
-		else if (dest_num)
-			flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
-	}
-
-	if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
-	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
-	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
-		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
-			     spec->flow_context.flow_tag, flow_attr->type);
-		err = -EINVAL;
-		goto free;
-	}
-	handler->rule = mlx5_add_flow_rules(ft, spec,
-					    &flow_act,
-					    rule_dst, dest_num);
-
-	if (IS_ERR(handler->rule)) {
-		err = PTR_ERR(handler->rule);
-		goto free;
-	}
-
-	ft_prio->refcount++;
-	handler->prio = ft_prio;
-	handler->dev = dev;
-
-	ft_prio->flow_table = ft;
-free:
-	if (err && handler) {
-		if (handler->ibcounters &&
-		    atomic_read(&handler->ibcounters->usecnt) == 1)
-			counters_clear_description(handler->ibcounters);
-		kfree(handler);
-	}
-	kvfree(spec);
-	return err ? ERR_PTR(err) : handler;
-}
-
-static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
-						     struct mlx5_ib_flow_prio *ft_prio,
-						     const struct ib_flow_attr *flow_attr,
-						     struct mlx5_flow_destination *dst)
-{
-	return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
-}
-
-enum {
-	LEFTOVERS_MC,
-	LEFTOVERS_UC,
-};
-
-static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
-							  struct mlx5_ib_flow_prio *ft_prio,
-							  struct ib_flow_attr *flow_attr,
-							  struct mlx5_flow_destination *dst)
-{
-	struct mlx5_ib_flow_handler *handler_ucast = NULL;
-	struct mlx5_ib_flow_handler *handler = NULL;
-
-	static struct {
-		struct ib_flow_attr	flow_attr;
-		struct ib_flow_spec_eth eth_flow;
-	} leftovers_specs[] = {
-		[LEFTOVERS_MC] = {
-			.flow_attr = {
-				.num_of_specs = 1,
-				.size = sizeof(leftovers_specs[0])
-			},
-			.eth_flow = {
-				.type = IB_FLOW_SPEC_ETH,
-				.size = sizeof(struct ib_flow_spec_eth),
-				.mask = {.dst_mac = {0x1} },
-				.val =  {.dst_mac = {0x1} }
-			}
-		},
-		[LEFTOVERS_UC] = {
-			.flow_attr = {
-				.num_of_specs = 1,
-				.size = sizeof(leftovers_specs[0])
-			},
-			.eth_flow = {
-				.type = IB_FLOW_SPEC_ETH,
-				.size = sizeof(struct ib_flow_spec_eth),
-				.mask = {.dst_mac = {0x1} },
-				.val = {.dst_mac = {} }
-			}
-		}
-	};
-
-	handler = create_flow_rule(dev, ft_prio,
-				   &leftovers_specs[LEFTOVERS_MC].flow_attr,
-				   dst);
-	if (!IS_ERR(handler) &&
-	    flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
-		handler_ucast = create_flow_rule(dev, ft_prio,
-						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
-						 dst);
-		if (IS_ERR(handler_ucast)) {
-			mlx5_del_flow_rules(handler->rule);
-			ft_prio->refcount--;
-			kfree(handler);
-			handler = handler_ucast;
-		} else {
-			list_add(&handler_ucast->list, &handler->list);
-		}
-	}
-
-	return handler;
-}
-
-static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
-							struct mlx5_ib_flow_prio *ft_rx,
-							struct mlx5_ib_flow_prio *ft_tx,
-							struct mlx5_flow_destination *dst)
-{
-	struct mlx5_ib_flow_handler *handler_rx;
-	struct mlx5_ib_flow_handler *handler_tx;
-	int err;
-	static const struct ib_flow_attr flow_attr  = {
-		.num_of_specs = 0,
-		.size = sizeof(flow_attr)
-	};
-
-	handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
-	if (IS_ERR(handler_rx)) {
-		err = PTR_ERR(handler_rx);
-		goto err;
-	}
-
-	handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
-	if (IS_ERR(handler_tx)) {
-		err = PTR_ERR(handler_tx);
-		goto err_tx;
-	}
-
-	list_add(&handler_tx->list, &handler_rx->list);
-
-	return handler_rx;
-
-err_tx:
-	mlx5_del_flow_rules(handler_rx->rule);
-	ft_rx->refcount--;
-	kfree(handler_rx);
-err:
-	return ERR_PTR(err);
-}
-
-static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
-					   struct ib_flow_attr *flow_attr,
-					   int domain,
-					   struct ib_udata *udata)
-{
-	struct mlx5_ib_dev *dev = to_mdev(qp->device);
-	struct mlx5_ib_qp *mqp = to_mqp(qp);
-	struct mlx5_ib_flow_handler *handler = NULL;
-	struct mlx5_flow_destination *dst = NULL;
-	struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
-	struct mlx5_ib_flow_prio *ft_prio;
-	bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
-	struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
-	size_t min_ucmd_sz, required_ucmd_sz;
-	int err;
-	int underlay_qpn;
-
-	if (udata && udata->inlen) {
-		min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) +
-				sizeof(ucmd_hdr.reserved);
-		if (udata->inlen < min_ucmd_sz)
-			return ERR_PTR(-EOPNOTSUPP);
-
-		err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
-		if (err)
-			return ERR_PTR(err);
-
-		/* currently supports only one counters data */
-		if (ucmd_hdr.ncounters_data > 1)
-			return ERR_PTR(-EINVAL);
-
-		required_ucmd_sz = min_ucmd_sz +
-			sizeof(struct mlx5_ib_flow_counters_data) *
-			ucmd_hdr.ncounters_data;
-		if (udata->inlen > required_ucmd_sz &&
-		    !ib_is_udata_cleared(udata, required_ucmd_sz,
-					 udata->inlen - required_ucmd_sz))
-			return ERR_PTR(-EOPNOTSUPP);
-
-		ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
-		if (!ucmd)
-			return ERR_PTR(-ENOMEM);
-
-		err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
-		if (err)
-			goto free_ucmd;
-	}
-
-	if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
-		err = -ENOMEM;
-		goto free_ucmd;
-	}
-
-	if (domain != IB_FLOW_DOMAIN_USER ||
-	    flow_attr->port > dev->num_ports ||
-	    (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP |
-				  IB_FLOW_ATTR_FLAGS_EGRESS))) {
-		err = -EINVAL;
-		goto free_ucmd;
-	}
-
-	if (is_egress &&
-	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
-	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
-		err = -EINVAL;
-		goto free_ucmd;
-	}
-
-	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
-	if (!dst) {
-		err = -ENOMEM;
-		goto free_ucmd;
-	}
-
-	mutex_lock(&dev->flow_db->lock);
-
-	ft_prio = get_flow_table(dev, flow_attr,
-				 is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX);
-	if (IS_ERR(ft_prio)) {
-		err = PTR_ERR(ft_prio);
-		goto unlock;
-	}
-	if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
-		ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
-		if (IS_ERR(ft_prio_tx)) {
-			err = PTR_ERR(ft_prio_tx);
-			ft_prio_tx = NULL;
-			goto destroy_ft;
-		}
-	}
-
-	if (is_egress) {
-		dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
-	} else {
-		dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
-		if (mqp->is_rss)
-			dst->tir_num = mqp->rss_qp.tirn;
-		else
-			dst->tir_num = mqp->raw_packet_qp.rq.tirn;
-	}
-
-	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
-		underlay_qpn = (mqp->flags & IB_QP_CREATE_SOURCE_QPN) ?
-				       mqp->underlay_qpn :
-				       0;
-		handler = _create_flow_rule(dev, ft_prio, flow_attr, dst,
-					    underlay_qpn, ucmd);
-	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
-		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
-		handler = create_leftovers_rule(dev, ft_prio, flow_attr,
-						dst);
-	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
-		handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
-	} else {
-		err = -EINVAL;
-		goto destroy_ft;
-	}
-
-	if (IS_ERR(handler)) {
-		err = PTR_ERR(handler);
-		handler = NULL;
-		goto destroy_ft;
-	}
-
-	mutex_unlock(&dev->flow_db->lock);
-	kfree(dst);
-	kfree(ucmd);
-
-	return &handler->ibflow;
-
-destroy_ft:
-	put_flow_table(dev, ft_prio, false);
-	if (ft_prio_tx)
-		put_flow_table(dev, ft_prio_tx, false);
-unlock:
-	mutex_unlock(&dev->flow_db->lock);
-	kfree(dst);
-free_ucmd:
-	kfree(ucmd);
-	return ERR_PTR(err);
-}
-
-static struct mlx5_ib_flow_prio *
-_get_flow_table(struct mlx5_ib_dev *dev,
-		struct mlx5_ib_flow_matcher *fs_matcher,
-		bool mcast)
-{
-	struct mlx5_flow_namespace *ns = NULL;
-	struct mlx5_ib_flow_prio *prio = NULL;
-	int max_table_size = 0;
-	bool esw_encap;
-	u32 flags = 0;
-	int priority;
-
-	if (mcast)
-		priority = MLX5_IB_FLOW_MCAST_PRIO;
-	else
-		priority = ib_prio_to_core_prio(fs_matcher->priority, false);
-
-	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
-		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
-	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
-		max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-					log_max_ft_size));
-		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
-		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-					      reformat_l3_tunnel_to_l2) &&
-		    !esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
-		max_table_size = BIT(
-			MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
-		if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
-		max_table_size = BIT(
-			MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
-		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
-		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) &&
-		    esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-		priority = FDB_BYPASS_PATH;
-	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) {
-		max_table_size =
-			BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
-						       log_max_ft_size));
-		priority = fs_matcher->priority;
-	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
-		max_table_size =
-			BIT(MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev,
-						       log_max_ft_size));
-		priority = fs_matcher->priority;
-	}
-
-	max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
-
-	ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
-	if (!ns)
-		return ERR_PTR(-ENOTSUPP);
-
-	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
-		prio = &dev->flow_db->prios[priority];
-	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
-		prio = &dev->flow_db->egress_prios[priority];
-	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
-		prio = &dev->flow_db->fdb;
-	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
-		prio = &dev->flow_db->rdma_rx[priority];
-	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)
-		prio = &dev->flow_db->rdma_tx[priority];
-
-	if (!prio)
-		return ERR_PTR(-EINVAL);
-
-	if (prio->flow_table)
-		return prio;
-
-	return _get_prio(ns, prio, priority, max_table_size,
-			 MLX5_FS_MAX_TYPES, flags);
-}
-
-static struct mlx5_ib_flow_handler *
-_create_raw_flow_rule(struct mlx5_ib_dev *dev,
-		      struct mlx5_ib_flow_prio *ft_prio,
-		      struct mlx5_flow_destination *dst,
-		      struct mlx5_ib_flow_matcher  *fs_matcher,
-		      struct mlx5_flow_context *flow_context,
-		      struct mlx5_flow_act *flow_act,
-		      void *cmd_in, int inlen,
-		      int dst_num)
-{
-	struct mlx5_ib_flow_handler *handler;
-	struct mlx5_flow_spec *spec;
-	struct mlx5_flow_table *ft = ft_prio->flow_table;
-	int err = 0;
-
-	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
-	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
-	if (!handler || !spec) {
-		err = -ENOMEM;
-		goto free;
-	}
-
-	INIT_LIST_HEAD(&handler->list);
-
-	memcpy(spec->match_value, cmd_in, inlen);
-	memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
-	       fs_matcher->mask_len);
-	spec->match_criteria_enable = fs_matcher->match_criteria_enable;
-	spec->flow_context = *flow_context;
-
-	handler->rule = mlx5_add_flow_rules(ft, spec,
-					    flow_act, dst, dst_num);
-
-	if (IS_ERR(handler->rule)) {
-		err = PTR_ERR(handler->rule);
-		goto free;
-	}
-
-	ft_prio->refcount++;
-	handler->prio = ft_prio;
-	handler->dev = dev;
-	ft_prio->flow_table = ft;
-
-free:
-	if (err)
-		kfree(handler);
-	kvfree(spec);
-	return err ? ERR_PTR(err) : handler;
-}
-
-static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
-				void *match_v)
-{
-	void *match_c;
-	void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4;
-	void *dmac, *dmac_mask;
-	void *ipv4, *ipv4_mask;
-
-	if (!(fs_matcher->match_criteria_enable &
-	      (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT)))
-		return false;
-
-	match_c = fs_matcher->matcher_mask.match_params;
-	match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v,
-					   outer_headers);
-	match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c,
-					   outer_headers);
-
-	dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
-			    dmac_47_16);
-	dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
-				 dmac_47_16);
-
-	if (is_multicast_ether_addr(dmac) &&
-	    is_multicast_ether_addr(dmac_mask))
-		return true;
-
-	ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
-			    dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
-
-	ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
-				 dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
-
-	if (ipv4_is_multicast(*(__be32 *)(ipv4)) &&
-	    ipv4_is_multicast(*(__be32 *)(ipv4_mask)))
-		return true;
-
-	return false;
-}
-
-struct mlx5_ib_flow_handler *
-mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
-			struct mlx5_ib_flow_matcher *fs_matcher,
-			struct mlx5_flow_context *flow_context,
-			struct mlx5_flow_act *flow_act,
-			u32 counter_id,
-			void *cmd_in, int inlen, int dest_id,
-			int dest_type)
-{
-	struct mlx5_flow_destination *dst;
-	struct mlx5_ib_flow_prio *ft_prio;
-	struct mlx5_ib_flow_handler *handler;
-	int dst_num = 0;
-	bool mcast;
-	int err;
-
-	if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
-		return ERR_PTR(-ENOMEM);
-
-	dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
-	if (!dst)
-		return ERR_PTR(-ENOMEM);
-
-	mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
-	mutex_lock(&dev->flow_db->lock);
-
-	ft_prio = _get_flow_table(dev, fs_matcher, mcast);
-	if (IS_ERR(ft_prio)) {
-		err = PTR_ERR(ft_prio);
-		goto unlock;
-	}
-
-	if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
-		dst[dst_num].type = dest_type;
-		dst[dst_num++].tir_num = dest_id;
-		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
-	} else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
-		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
-		dst[dst_num++].ft_num = dest_id;
-		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
-	} else  if (dest_type == MLX5_FLOW_DESTINATION_TYPE_PORT) {
-		dst[dst_num++].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
-		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
-	}
-
-
-	if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dst[dst_num].counter_id = counter_id;
-		dst_num++;
-	}
-
-	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
-					flow_context, flow_act,
-					cmd_in, inlen, dst_num);
-
-	if (IS_ERR(handler)) {
-		err = PTR_ERR(handler);
-		goto destroy_ft;
-	}
-
-	mutex_unlock(&dev->flow_db->lock);
-	atomic_inc(&fs_matcher->usecnt);
-	handler->flow_matcher = fs_matcher;
-
-	kfree(dst);
-
-	return handler;
-
-destroy_ft:
-	put_flow_table(dev, ft_prio, false);
-unlock:
-	mutex_unlock(&dev->flow_db->lock);
-	kfree(dst);
-
-	return ERR_PTR(err);
-}
-
-static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags)
-{
-	u32 flags = 0;
-
-	if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)
-		flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA;
-
-	return flags;
-}
-
-#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED	MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
-static struct ib_flow_action *
-mlx5_ib_create_flow_action_esp(struct ib_device *device,
-			       const struct ib_flow_action_attrs_esp *attr,
-			       struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_dev *mdev = to_mdev(device);
-	struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm;
-	struct mlx5_accel_esp_xfrm_attrs accel_attrs = {};
-	struct mlx5_ib_flow_action *action;
-	u64 action_flags;
-	u64 flags;
-	int err = 0;
-
-	err = uverbs_get_flags64(
-		&action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
-		((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1));
-	if (err)
-		return ERR_PTR(err);
-
-	flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags);
-
-	/* We currently only support a subset of the standard features. Only a
-	 * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and esn
-	 * (with overlap). Full offload mode isn't supported.
-	 */
-	if (!attr->keymat || attr->replay || attr->encap ||
-	    attr->spi || attr->seq || attr->tfc_pad ||
-	    attr->hard_limit_pkts ||
-	    (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
-			     IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)))
-		return ERR_PTR(-EOPNOTSUPP);
-
-	if (attr->keymat->protocol !=
-	    IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	aes_gcm = &attr->keymat->keymat.aes_gcm;
-
-	if (aes_gcm->icv_len != 16 ||
-	    aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	action = kmalloc(sizeof(*action), GFP_KERNEL);
-	if (!action)
-		return ERR_PTR(-ENOMEM);
-
-	action->esp_aes_gcm.ib_flags = attr->flags;
-	memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key,
-	       sizeof(accel_attrs.keymat.aes_gcm.aes_key));
-	accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8;
-	memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt,
-	       sizeof(accel_attrs.keymat.aes_gcm.salt));
-	memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv,
-	       sizeof(accel_attrs.keymat.aes_gcm.seq_iv));
-	accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8;
-	accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ;
-	accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
-
-	accel_attrs.esn = attr->esn;
-	if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED)
-		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
-	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
-		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
-
-	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)
-		accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT;
-
-	action->esp_aes_gcm.ctx =
-		mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags);
-	if (IS_ERR(action->esp_aes_gcm.ctx)) {
-		err = PTR_ERR(action->esp_aes_gcm.ctx);
-		goto err_parse;
-	}
-
-	action->esp_aes_gcm.ib_flags = attr->flags;
-
-	return &action->ib_action;
-
-err_parse:
-	kfree(action);
-	return ERR_PTR(err);
-}
-
-static int
-mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action,
-			       const struct ib_flow_action_attrs_esp *attr,
-			       struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_flow_action *maction = to_mflow_act(action);
-	struct mlx5_accel_esp_xfrm_attrs accel_attrs;
-	int err = 0;
-
-	if (attr->keymat || attr->replay || attr->encap ||
-	    attr->spi || attr->seq || attr->tfc_pad ||
-	    attr->hard_limit_pkts ||
-	    (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
-			     IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS |
-			     IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)))
-		return -EOPNOTSUPP;
-
-	/* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can
-	 * be modified.
-	 */
-	if (!(maction->esp_aes_gcm.ib_flags &
-	      IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) &&
-	    attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
-			   IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))
-		return -EINVAL;
-
-	memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs,
-	       sizeof(accel_attrs));
-
-	accel_attrs.esn = attr->esn;
-	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
-		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
-	else
-		accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
-
-	err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx,
-					 &accel_attrs);
-	if (err)
-		return err;
-
-	maction->esp_aes_gcm.ib_flags &=
-		~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
-	maction->esp_aes_gcm.ib_flags |=
-		attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
-
-	return 0;
-}
-
-static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action)
-{
-	struct mlx5_ib_flow_action *maction = to_mflow_act(action);
-
-	switch (action->type) {
-	case IB_FLOW_ACTION_ESP:
-		/*
-		 * We only support aes_gcm for now, so we implicitly know this is
-		 * the underlying crypto.
-		 */
-		mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
-		break;
-	case IB_FLOW_ACTION_UNSPECIFIED:
-		mlx5_ib_destroy_flow_action_raw(maction);
-		break;
-	default:
-		WARN_ON(true);
-		break;
-	}
-
-	kfree(maction);
-	return 0;
-}
-
 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
@@ -4848,137 +3020,6 @@
 	return __get_port_caps(dev, port);
 }
 
-static void destroy_umrc_res(struct mlx5_ib_dev *dev)
-{
-	int err;
-
-	err = mlx5_mr_cache_cleanup(dev);
-	if (err)
-		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
-
-	if (dev->umrc.qp)
-		mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
-	if (dev->umrc.cq)
-		ib_free_cq(dev->umrc.cq);
-	if (dev->umrc.pd)
-		ib_dealloc_pd(dev->umrc.pd);
-}
-
-enum {
-	MAX_UMR_WR = 128,
-};
-
-static int create_umr_res(struct mlx5_ib_dev *dev)
-{
-	struct ib_qp_init_attr *init_attr = NULL;
-	struct ib_qp_attr *attr = NULL;
-	struct ib_pd *pd;
-	struct ib_cq *cq;
-	struct ib_qp *qp;
-	int ret;
-
-	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
-	if (!attr || !init_attr) {
-		ret = -ENOMEM;
-		goto error_0;
-	}
-
-	pd = ib_alloc_pd(&dev->ib_dev, 0);
-	if (IS_ERR(pd)) {
-		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
-		ret = PTR_ERR(pd);
-		goto error_0;
-	}
-
-	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
-	if (IS_ERR(cq)) {
-		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
-		ret = PTR_ERR(cq);
-		goto error_2;
-	}
-
-	init_attr->send_cq = cq;
-	init_attr->recv_cq = cq;
-	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
-	init_attr->cap.max_send_wr = MAX_UMR_WR;
-	init_attr->cap.max_send_sge = 1;
-	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
-	init_attr->port_num = 1;
-	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
-	if (IS_ERR(qp)) {
-		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
-		ret = PTR_ERR(qp);
-		goto error_3;
-	}
-	qp->device     = &dev->ib_dev;
-	qp->real_qp    = qp;
-	qp->uobject    = NULL;
-	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
-	qp->send_cq    = init_attr->send_cq;
-	qp->recv_cq    = init_attr->recv_cq;
-
-	attr->qp_state = IB_QPS_INIT;
-	attr->port_num = 1;
-	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
-				IB_QP_PORT, NULL);
-	if (ret) {
-		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
-		goto error_4;
-	}
-
-	memset(attr, 0, sizeof(*attr));
-	attr->qp_state = IB_QPS_RTR;
-	attr->path_mtu = IB_MTU_256;
-
-	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
-	if (ret) {
-		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
-		goto error_4;
-	}
-
-	memset(attr, 0, sizeof(*attr));
-	attr->qp_state = IB_QPS_RTS;
-	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
-	if (ret) {
-		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
-		goto error_4;
-	}
-
-	dev->umrc.qp = qp;
-	dev->umrc.cq = cq;
-	dev->umrc.pd = pd;
-
-	sema_init(&dev->umrc.sem, MAX_UMR_WR);
-	ret = mlx5_mr_cache_init(dev);
-	if (ret) {
-		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
-		goto error_4;
-	}
-
-	kfree(attr);
-	kfree(init_attr);
-
-	return 0;
-
-error_4:
-	mlx5_ib_destroy_qp(qp, NULL);
-	dev->umrc.qp = NULL;
-
-error_3:
-	ib_free_cq(cq);
-	dev->umrc.cq = NULL;
-
-error_2:
-	ib_dealloc_pd(pd);
-	dev->umrc.pd = NULL;
-
-error_0:
-	kfree(attr);
-	kfree(init_attr);
-	return ret;
-}
-
 static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
 {
 	switch (umr_fence_cap) {
@@ -4991,18 +3032,20 @@
 	}
 }
 
-static int create_dev_resources(struct mlx5_ib_resources *devr)
+static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
 {
+	struct mlx5_ib_resources *devr = &dev->devr;
 	struct ib_srq_init_attr attr;
-	struct mlx5_ib_dev *dev;
 	struct ib_device *ibdev;
 	struct ib_cq_init_attr cq_attr = {.cqe = 1};
 	int port;
 	int ret = 0;
 
-	dev = container_of(devr, struct mlx5_ib_dev, devr);
 	ibdev = &dev->ib_dev;
 
+	if (!MLX5_CAP_GEN(dev->mdev, xrc))
+		return -EOPNOTSUPP;
+
 	mutex_init(&devr->mutex);
 
 	devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
@@ -5030,34 +3073,19 @@
 	if (ret)
 		goto err_create_cq;
 
-	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
-	if (IS_ERR(devr->x0)) {
-		ret = PTR_ERR(devr->x0);
+	ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0);
+	if (ret)
 		goto error2;
-	}
-	devr->x0->device = &dev->ib_dev;
-	devr->x0->inode = NULL;
-	atomic_set(&devr->x0->usecnt, 0);
-	mutex_init(&devr->x0->tgt_qp_mutex);
-	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
 
-	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
-	if (IS_ERR(devr->x1)) {
-		ret = PTR_ERR(devr->x1);
+	ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0);
+	if (ret)
 		goto error3;
-	}
-	devr->x1->device = &dev->ib_dev;
-	devr->x1->inode = NULL;
-	atomic_set(&devr->x1->usecnt, 0);
-	mutex_init(&devr->x1->tgt_qp_mutex);
-	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.attr.max_sge = 1;
 	attr.attr.max_wr = 1;
 	attr.srq_type = IB_SRQT_XRC;
 	attr.ext.cq = devr->c0;
-	attr.ext.xrc.xrcd = devr->x0;
 
 	devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
 	if (!devr->s0) {
@@ -5068,13 +3096,11 @@
 	devr->s0->device	= &dev->ib_dev;
 	devr->s0->pd		= devr->p0;
 	devr->s0->srq_type      = IB_SRQT_XRC;
-	devr->s0->ext.xrc.xrcd	= devr->x0;
 	devr->s0->ext.cq	= devr->c0;
 	ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
 	if (ret)
 		goto err_create;
 
-	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
 	atomic_inc(&devr->s0->ext.cq->usecnt);
 	atomic_inc(&devr->p0->usecnt);
 	atomic_set(&devr->s0->usecnt, 0);
@@ -5116,9 +3142,9 @@
 err_create:
 	kfree(devr->s0);
 error4:
-	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
+	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
 error3:
-	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
+	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
 error2:
 	mlx5_ib_destroy_cq(devr->c0, NULL);
 err_create_cq:
@@ -5130,16 +3156,17 @@
 	return ret;
 }
 
-static void destroy_dev_resources(struct mlx5_ib_resources *devr)
+static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
 {
+	struct mlx5_ib_resources *devr = &dev->devr;
 	int port;
 
 	mlx5_ib_destroy_srq(devr->s1, NULL);
 	kfree(devr->s1);
 	mlx5_ib_destroy_srq(devr->s0, NULL);
 	kfree(devr->s0);
-	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
-	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
+	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
+	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
 	mlx5_ib_destroy_cq(devr->c0, NULL);
 	kfree(devr->c0);
 	mlx5_ib_dealloc_pd(devr->p0, NULL);
@@ -5332,466 +3359,6 @@
 	mlx5_nic_vport_disable_roce(dev->mdev);
 }
 
-struct mlx5_ib_counter {
-	const char *name;
-	size_t offset;
-};
-
-#define INIT_Q_COUNTER(_name)		\
-	{ .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
-
-static const struct mlx5_ib_counter basic_q_cnts[] = {
-	INIT_Q_COUNTER(rx_write_requests),
-	INIT_Q_COUNTER(rx_read_requests),
-	INIT_Q_COUNTER(rx_atomic_requests),
-	INIT_Q_COUNTER(out_of_buffer),
-};
-
-static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
-	INIT_Q_COUNTER(out_of_sequence),
-};
-
-static const struct mlx5_ib_counter retrans_q_cnts[] = {
-	INIT_Q_COUNTER(duplicate_request),
-	INIT_Q_COUNTER(rnr_nak_retry_err),
-	INIT_Q_COUNTER(packet_seq_err),
-	INIT_Q_COUNTER(implied_nak_seq_err),
-	INIT_Q_COUNTER(local_ack_timeout_err),
-};
-
-#define INIT_CONG_COUNTER(_name)		\
-	{ .name = #_name, .offset =	\
-		MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
-
-static const struct mlx5_ib_counter cong_cnts[] = {
-	INIT_CONG_COUNTER(rp_cnp_ignored),
-	INIT_CONG_COUNTER(rp_cnp_handled),
-	INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
-	INIT_CONG_COUNTER(np_cnp_sent),
-};
-
-static const struct mlx5_ib_counter extended_err_cnts[] = {
-	INIT_Q_COUNTER(resp_local_length_error),
-	INIT_Q_COUNTER(resp_cqe_error),
-	INIT_Q_COUNTER(req_cqe_error),
-	INIT_Q_COUNTER(req_remote_invalid_request),
-	INIT_Q_COUNTER(req_remote_access_errors),
-	INIT_Q_COUNTER(resp_remote_access_errors),
-	INIT_Q_COUNTER(resp_cqe_flush_error),
-	INIT_Q_COUNTER(req_cqe_flush_error),
-};
-
-static const struct mlx5_ib_counter roce_accl_cnts[] = {
-	INIT_Q_COUNTER(roce_adp_retrans),
-	INIT_Q_COUNTER(roce_adp_retrans_to),
-	INIT_Q_COUNTER(roce_slow_restart),
-	INIT_Q_COUNTER(roce_slow_restart_cnps),
-	INIT_Q_COUNTER(roce_slow_restart_trans),
-};
-
-#define INIT_EXT_PPCNT_COUNTER(_name)		\
-	{ .name = #_name, .offset =	\
-	MLX5_BYTE_OFF(ppcnt_reg, \
-		      counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
-
-static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
-	INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
-};
-
-static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
-{
-	return MLX5_ESWITCH_MANAGER(mdev) &&
-	       mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
-		       MLX5_ESWITCH_OFFLOADS;
-}
-
-static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
-{
-	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
-	int num_cnt_ports;
-	int i;
-
-	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
-
-	MLX5_SET(dealloc_q_counter_in, in, opcode,
-		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
-
-	for (i = 0; i < num_cnt_ports; i++) {
-		if (dev->port[i].cnts.set_id) {
-			MLX5_SET(dealloc_q_counter_in, in, counter_set_id,
-				 dev->port[i].cnts.set_id);
-			mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
-		}
-		kfree(dev->port[i].cnts.names);
-		kfree(dev->port[i].cnts.offsets);
-	}
-}
-
-static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
-				    struct mlx5_ib_counters *cnts)
-{
-	u32 num_counters;
-
-	num_counters = ARRAY_SIZE(basic_q_cnts);
-
-	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
-		num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
-
-	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
-		num_counters += ARRAY_SIZE(retrans_q_cnts);
-
-	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
-		num_counters += ARRAY_SIZE(extended_err_cnts);
-
-	if (MLX5_CAP_GEN(dev->mdev, roce_accl))
-		num_counters += ARRAY_SIZE(roce_accl_cnts);
-
-	cnts->num_q_counters = num_counters;
-
-	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
-		cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
-		num_counters += ARRAY_SIZE(cong_cnts);
-	}
-	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
-		cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
-		num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
-	}
-	cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL);
-	if (!cnts->names)
-		return -ENOMEM;
-
-	cnts->offsets = kcalloc(num_counters,
-				sizeof(cnts->offsets), GFP_KERNEL);
-	if (!cnts->offsets)
-		goto err_names;
-
-	return 0;
-
-err_names:
-	kfree(cnts->names);
-	cnts->names = NULL;
-	return -ENOMEM;
-}
-
-static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
-				  const char **names,
-				  size_t *offsets)
-{
-	int i;
-	int j = 0;
-
-	for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
-		names[j] = basic_q_cnts[i].name;
-		offsets[j] = basic_q_cnts[i].offset;
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
-		for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
-			names[j] = out_of_seq_q_cnts[i].name;
-			offsets[j] = out_of_seq_q_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
-		for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
-			names[j] = retrans_q_cnts[i].name;
-			offsets[j] = retrans_q_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
-		for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
-			names[j] = extended_err_cnts[i].name;
-			offsets[j] = extended_err_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, roce_accl)) {
-		for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) {
-			names[j] = roce_accl_cnts[i].name;
-			offsets[j] = roce_accl_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
-		for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
-			names[j] = cong_cnts[i].name;
-			offsets[j] = cong_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
-		for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
-			names[j] = ext_ppcnt_cnts[i].name;
-			offsets[j] = ext_ppcnt_cnts[i].offset;
-		}
-	}
-}
-
-static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
-{
-	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
-	int num_cnt_ports;
-	int err = 0;
-	int i;
-	bool is_shared;
-
-	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
-	is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
-	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
-
-	for (i = 0; i < num_cnt_ports; i++) {
-		err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
-		if (err)
-			goto err_alloc;
-
-		mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
-				      dev->port[i].cnts.offsets);
-
-		MLX5_SET(alloc_q_counter_in, in, uid,
-			 is_shared ? MLX5_SHARED_RESOURCE_UID : 0);
-
-		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
-		if (err) {
-			mlx5_ib_warn(dev,
-				     "couldn't allocate queue counter for port %d, err %d\n",
-				     i + 1, err);
-			goto err_alloc;
-		}
-
-		dev->port[i].cnts.set_id =
-			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
-	}
-	return 0;
-
-err_alloc:
-	mlx5_ib_dealloc_counters(dev);
-	return err;
-}
-
-static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
-						   u8 port_num)
-{
-	return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
-						   &dev->port[port_num].cnts;
-}
-
-/**
- * mlx5_ib_get_counters_id - Returns counters id to use for device+port
- * @dev:	Pointer to mlx5 IB device
- * @port_num:	Zero based port number
- *
- * mlx5_ib_get_counters_id() returns the counter set id to use for the given
- * device and port combination in both switchdev and non-switchdev modes of
- * the parent device.
- */
-u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
-{
-	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
-
-	return cnts->set_id;
-}
-
-static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
-						    u8 port_num)
-{
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	const struct mlx5_ib_counters *cnts;
-	bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
-
-	if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
-		return NULL;
-
-	cnts = get_counters(dev, port_num - 1);
-
-	return rdma_alloc_hw_stats_struct(cnts->names,
-					  cnts->num_q_counters +
-					  cnts->num_cong_counters +
-					  cnts->num_ext_ppcnt_counters,
-					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
-}
-
-static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
-				    const struct mlx5_ib_counters *cnts,
-				    struct rdma_hw_stats *stats,
-				    u16 set_id)
-{
-	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {};
-	__be32 val;
-	int ret, i;
-
-	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
-	MLX5_SET(query_q_counter_in, in, counter_set_id, set_id);
-	ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out);
-	if (ret)
-		return ret;
-
-	for (i = 0; i < cnts->num_q_counters; i++) {
-		val = *(__be32 *)((void *)out + cnts->offsets[i]);
-		stats->value[i] = (u64)be32_to_cpu(val);
-	}
-
-	return 0;
-}
-
-static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
-					    const struct mlx5_ib_counters *cnts,
-					    struct rdma_hw_stats *stats)
-{
-	int offset = cnts->num_q_counters + cnts->num_cong_counters;
-	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
-	int ret, i;
-	void *out;
-
-	out = kvzalloc(sz, GFP_KERNEL);
-	if (!out)
-		return -ENOMEM;
-
-	ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out);
-	if (ret)
-		goto free;
-
-	for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
-		stats->value[i + offset] =
-			be64_to_cpup((__be64 *)(out +
-				    cnts->offsets[i + offset]));
-free:
-	kvfree(out);
-	return ret;
-}
-
-static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
-				struct rdma_hw_stats *stats,
-				u8 port_num, int index)
-{
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
-	struct mlx5_core_dev *mdev;
-	int ret, num_counters;
-	u8 mdev_port_num;
-
-	if (!stats)
-		return -EINVAL;
-
-	num_counters = cnts->num_q_counters +
-		       cnts->num_cong_counters +
-		       cnts->num_ext_ppcnt_counters;
-
-	/* q_counters are per IB device, query the master mdev */
-	ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
-	if (ret)
-		return ret;
-
-	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
-		ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
-		if (ret)
-			return ret;
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
-		mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
-						    &mdev_port_num);
-		if (!mdev) {
-			/* If the port is not affiliated yet, it is in the down
-			 * state, which has no counters yet, so the value would
-			 * be zero. There is no need to read from the HCA.
-			 */
-			goto done;
-		}
-		ret = mlx5_lag_query_cong_counters(dev->mdev,
-						   stats->value +
-						   cnts->num_q_counters,
-						   cnts->num_cong_counters,
-						   cnts->offsets +
-						   cnts->num_q_counters);
-
-		mlx5_ib_put_native_port_mdev(dev, port_num);
-		if (ret)
-			return ret;
-	}
-
-done:
-	return num_counters;
-}
-
-static struct rdma_hw_stats *
-mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
-{
-	struct mlx5_ib_dev *dev = to_mdev(counter->device);
-	const struct mlx5_ib_counters *cnts =
-		get_counters(dev, counter->port - 1);
-
-	return rdma_alloc_hw_stats_struct(cnts->names,
-					  cnts->num_q_counters +
-					  cnts->num_cong_counters +
-					  cnts->num_ext_ppcnt_counters,
-					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
-}
-
-static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
-{
-	struct mlx5_ib_dev *dev = to_mdev(counter->device);
-	const struct mlx5_ib_counters *cnts =
-		get_counters(dev, counter->port - 1);
-
-	return mlx5_ib_query_q_counters(dev->mdev, cnts,
-					counter->stats, counter->id);
-}
-
-static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
-{
-	struct mlx5_ib_dev *dev = to_mdev(counter->device);
-	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
-
-	if (!counter->id)
-		return 0;
-
-	MLX5_SET(dealloc_q_counter_in, in, opcode,
-		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
-	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id);
-	return mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
-}
-
-static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
-				   struct ib_qp *qp)
-{
-	struct mlx5_ib_dev *dev = to_mdev(qp->device);
-	int err;
-
-	if (!counter->id) {
-		u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
-		u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
-
-		MLX5_SET(alloc_q_counter_in, in, opcode,
-			 MLX5_CMD_OP_ALLOC_Q_COUNTER);
-		MLX5_SET(alloc_q_counter_in, in, uid, MLX5_SHARED_RESOURCE_UID);
-		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
-		if (err)
-			return err;
-		counter->id =
-			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
-	}
-
-	err = mlx5_ib_qp_set_counter(qp, counter);
-	if (err)
-		goto fail_set_counter;
-
-	return 0;
-
-fail_set_counter:
-	mlx5_ib_counter_dealloc(counter);
-	counter->id = 0;
-
-	return err;
-}
-
-static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
-{
-	return mlx5_ib_qp_set_counter(qp, NULL);
-}
-
 static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
 				 enum rdma_netdev_t type,
 				 struct rdma_netdev_alloc_params *params)
@@ -5802,23 +3369,6 @@
 	return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
 }
 
-static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
-{
-	if (!dev->delay_drop.dir_debugfs)
-		return;
-	debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
-	dev->delay_drop.dir_debugfs = NULL;
-}
-
-static void cancel_delay_drop(struct mlx5_ib_dev *dev)
-{
-	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
-		return;
-
-	cancel_work_sync(&dev->delay_drop.delay_drop_work);
-	delay_drop_debugfs_cleanup(dev);
-}
-
 static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
 				       size_t count, loff_t *pos)
 {
@@ -5858,40 +3408,6 @@
 	.read	= delay_drop_timeout_read,
 };
 
-static void delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
-{
-	struct dentry *root;
-
-	if (!mlx5_debugfs_root)
-		return;
-
-	root = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root);
-	dev->delay_drop.dir_debugfs = root;
-
-	debugfs_create_atomic_t("num_timeout_events", 0400, root,
-				&dev->delay_drop.events_cnt);
-	debugfs_create_atomic_t("num_rqs", 0400, root,
-				&dev->delay_drop.rqs_cnt);
-	debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
-			    &fops_delay_drop_timeout);
-}
-
-static void init_delay_drop(struct mlx5_ib_dev *dev)
-{
-	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
-		return;
-
-	mutex_init(&dev->delay_drop.lock);
-	dev->delay_drop.dev = dev;
-	dev->delay_drop.activate = false;
-	dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
-	INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
-	atomic_set(&dev->delay_drop.rqs_cnt, 0);
-	atomic_set(&dev->delay_drop.events_cnt, 0);
-
-	delay_drop_debugfs_init(dev);
-}
-
 static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
 				      struct mlx5_ib_multiport_info *mpi)
 {
@@ -6385,90 +3901,32 @@
 	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
 			     enum mlx5_ib_uapi_flow_action_flags));
 
+ADD_UVERBS_ATTRIBUTES_SIMPLE(
+	mlx5_ib_query_context,
+	UVERBS_OBJECT_DEVICE,
+	UVERBS_METHOD_QUERY_CONTEXT,
+	UVERBS_ATTR_PTR_OUT(
+		MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
+		UVERBS_ATTR_STRUCT(struct mlx5_ib_alloc_ucontext_resp,
+				   dump_fill_mkey),
+		UA_MANDATORY));
+
 static const struct uapi_definition mlx5_ib_defs[] = {
 	UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
 	UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
 	UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
+	UAPI_DEF_CHAIN(mlx5_ib_std_types_defs),
 
 	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
 				&mlx5_ib_flow_action),
 	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
+	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
 				UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
 	{}
 };
 
-static int mlx5_ib_read_counters(struct ib_counters *counters,
-				 struct ib_counters_read_attr *read_attr,
-				 struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
-	struct mlx5_read_counters_attr mread_attr = {};
-	struct mlx5_ib_flow_counters_desc *desc;
-	int ret, i;
-
-	mutex_lock(&mcounters->mcntrs_mutex);
-	if (mcounters->cntrs_max_index > read_attr->ncounters) {
-		ret = -EINVAL;
-		goto err_bound;
-	}
-
-	mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
-				 GFP_KERNEL);
-	if (!mread_attr.out) {
-		ret = -ENOMEM;
-		goto err_bound;
-	}
-
-	mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
-	mread_attr.flags = read_attr->flags;
-	ret = mcounters->read_counters(counters->device, &mread_attr);
-	if (ret)
-		goto err_read;
-
-	/* walk the counters data array and assign values according to the
-	 * description and index pairs
-	 */
-	desc = mcounters->counters_data;
-	for (i = 0; i < mcounters->ncounters; i++)
-		read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
-
-err_read:
-	kfree(mread_attr.out);
-err_bound:
-	mutex_unlock(&mcounters->mcntrs_mutex);
-	return ret;
-}
-
-static int mlx5_ib_destroy_counters(struct ib_counters *counters)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
-
-	counters_clear_description(counters);
-	if (mcounters->hw_cntrs_hndl)
-		mlx5_fc_destroy(to_mdev(counters->device)->mdev,
-				mcounters->hw_cntrs_hndl);
-
-	kfree(mcounters);
-
-	return 0;
-}
-
-static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
-						   struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_mcounters *mcounters;
-
-	mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL);
-	if (!mcounters)
-		return ERR_PTR(-ENOMEM);
-
-	mutex_init(&mcounters->mcntrs_mutex);
-
-	return &mcounters->ibcntrs;
-}
-
 static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
@@ -6547,21 +4005,16 @@
 	return -ENOMEM;
 }
 
-static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_enable_driver(struct ib_device *dev)
 {
-	dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
+	struct mlx5_ib_dev *mdev = to_mdev(dev);
+	int ret;
 
-	if (!dev->flow_db)
-		return -ENOMEM;
+	ret = mlx5_ib_test_wc(mdev);
+	mlx5_ib_dbg(mdev, "Write-Combining %s",
+		    mdev->wc_support ? "supported" : "not supported");
 
-	mutex_init(&dev->flow_db->lock);
-
-	return 0;
-}
-
-static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
-{
-	kfree(dev->flow_db);
+	return ret;
 }
 
 static const struct ib_device_ops mlx5_ib_dev_ops = {
@@ -6577,9 +4030,7 @@
 	.attach_mcast = mlx5_ib_mcg_attach,
 	.check_mr_status = mlx5_ib_check_mr_status,
 	.create_ah = mlx5_ib_create_ah,
-	.create_counters = mlx5_ib_create_counters,
 	.create_cq = mlx5_ib_create_cq,
-	.create_flow = mlx5_ib_create_flow,
 	.create_qp = mlx5_ib_create_qp,
 	.create_srq = mlx5_ib_create_srq,
 	.dealloc_pd = mlx5_ib_dealloc_pd,
@@ -6587,10 +4038,7 @@
 	.del_gid = mlx5_ib_del_gid,
 	.dereg_mr = mlx5_ib_dereg_mr,
 	.destroy_ah = mlx5_ib_destroy_ah,
-	.destroy_counters = mlx5_ib_destroy_counters,
 	.destroy_cq = mlx5_ib_destroy_cq,
-	.destroy_flow = mlx5_ib_destroy_flow,
-	.destroy_flow_action = mlx5_ib_destroy_flow_action,
 	.destroy_qp = mlx5_ib_destroy_qp,
 	.destroy_srq = mlx5_ib_destroy_srq,
 	.detach_mcast = mlx5_ib_mcg_detach,
@@ -6598,8 +4046,6 @@
 	.drain_rq = mlx5_ib_drain_rq,
 	.drain_sq = mlx5_ib_drain_sq,
 	.enable_driver = mlx5_ib_enable_driver,
-	.fill_res_entry = mlx5_ib_fill_res_entry,
-	.fill_stat_entry = mlx5_ib_fill_stat_entry,
 	.get_dev_fw_str = get_dev_fw_str,
 	.get_dma_mr = mlx5_ib_get_dma_mr,
 	.get_link_layer = mlx5_ib_port_link_layer,
@@ -6623,24 +4069,20 @@
 	.query_pkey = mlx5_ib_query_pkey,
 	.query_qp = mlx5_ib_query_qp,
 	.query_srq = mlx5_ib_query_srq,
-	.read_counters = mlx5_ib_read_counters,
+	.query_ucontext = mlx5_ib_query_ucontext,
 	.reg_user_mr = mlx5_ib_reg_user_mr,
 	.req_notify_cq = mlx5_ib_arm_cq,
 	.rereg_user_mr = mlx5_ib_rereg_user_mr,
 	.resize_cq = mlx5_ib_resize_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
 	INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
 };
 
-static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
-	.create_flow_action_esp = mlx5_ib_create_flow_action_esp,
-	.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
-};
-
 static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
 	.rdma_netdev_get_params = mlx5_ib_rn_get_params,
 };
@@ -6661,6 +4103,8 @@
 static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
 	.alloc_xrcd = mlx5_ib_alloc_xrcd,
 	.dealloc_xrcd = mlx5_ib_dealloc_xrcd,
+
+	INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx5_ib_xrcd, ibxrcd),
 };
 
 static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
@@ -6769,9 +4213,6 @@
 	    MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
 		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
 
-	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
-	    MLX5_ACCEL_IPSEC_CAP_DEVICE)
-		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
 	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
@@ -6829,65 +4270,36 @@
 	.modify_wq = mlx5_ib_modify_wq,
 };
 
-static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
-{
-	u8 port_num;
-
-	dev->ib_dev.uverbs_ex_cmd_mask |=
-			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
-			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
-			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
-			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
-			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
-	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
-
-	port_num = mlx5_core_native_port_num(dev->mdev) - 1;
-
-	/* Register only for native ports */
-	return mlx5_add_netdev_notifier(dev, port_num);
-}
-
-static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
-{
-	u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
-
-	mlx5_remove_netdev_notifier(dev, port_num);
-}
-
-static int mlx5_ib_stage_raw_eth_roce_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
 	enum rdma_link_layer ll;
 	int port_type_cap;
-	int err = 0;
-
-	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
-	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
-
-	if (ll == IB_LINK_LAYER_ETHERNET)
-		err = mlx5_ib_stage_common_roce_init(dev);
-
-	return err;
-}
-
-static void mlx5_ib_stage_raw_eth_roce_cleanup(struct mlx5_ib_dev *dev)
-{
-	mlx5_ib_stage_common_roce_cleanup(dev);
-}
-
-static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev)
-{
-	struct mlx5_core_dev *mdev = dev->mdev;
-	enum rdma_link_layer ll;
-	int port_type_cap;
+	u8 port_num = 0;
 	int err;
 
 	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 
 	if (ll == IB_LINK_LAYER_ETHERNET) {
-		err = mlx5_ib_stage_common_roce_init(dev);
-		if (err)
+		dev->ib_dev.uverbs_ex_cmd_mask |=
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
+			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
+
+		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+
+		/* Register only for native ports */
+		err = mlx5_add_netdev_notifier(dev, port_num);
+		if (err || dev->is_rep || !mlx5_is_roce_enabled(mdev))
+			/*
+			 * We don't enable the ETH interface for:
+			 * 1. IB representors
+			 * 2. Users who disabled RoCE through the devlink interface
+			 */
 			return err;
 
 		err = mlx5_enable_eth(dev);
@@ -6897,73 +4309,29 @@
 
 	return 0;
 cleanup:
-	mlx5_ib_stage_common_roce_cleanup(dev);
-
+	mlx5_remove_netdev_notifier(dev, port_num);
 	return err;
 }
 
-static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
 	enum rdma_link_layer ll;
 	int port_type_cap;
+	u8 port_num;
 
 	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 
 	if (ll == IB_LINK_LAYER_ETHERNET) {
-		mlx5_disable_eth(dev);
-		mlx5_ib_stage_common_roce_cleanup(dev);
+		if (!dev->is_rep)
+			mlx5_disable_eth(dev);
+
+		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+		mlx5_remove_netdev_notifier(dev, port_num);
 	}
 }
 
-static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
-{
-	return create_dev_resources(&dev->devr);
-}
-
-static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
-{
-	destroy_dev_resources(&dev->devr);
-}
-
-static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
-{
-	return mlx5_ib_odp_init_one(dev);
-}
-
-static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
-{
-	mlx5_ib_odp_cleanup_one(dev);
-}
-
-static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
-	.alloc_hw_stats = mlx5_ib_alloc_hw_stats,
-	.get_hw_stats = mlx5_ib_get_hw_stats,
-	.counter_bind_qp = mlx5_ib_counter_bind_qp,
-	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
-	.counter_dealloc = mlx5_ib_counter_dealloc,
-	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
-	.counter_update_stats = mlx5_ib_counter_update_stats,
-};
-
-static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
-{
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
-		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
-
-		return mlx5_ib_alloc_counters(dev);
-	}
-
-	return 0;
-}
-
-static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
-{
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
-		mlx5_ib_dealloc_counters(dev);
-}
-
 static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_init_cong_debugfs(dev,
@@ -7023,7 +4391,18 @@
 
 static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
-	destroy_umrc_res(dev);
+	int err;
+
+	err = mlx5_mr_cache_cleanup(dev);
+	if (err)
+		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
+
+	if (dev->umrc.qp)
+		mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
+	if (dev->umrc.cq)
+		ib_free_cq(dev->umrc.cq);
+	if (dev->umrc.pd)
+		ib_dealloc_pd(dev->umrc.pd);
 }
 
 static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
@@ -7031,21 +4410,162 @@
 	ib_unregister_device(&dev->ib_dev);
 }
 
+enum {
+	MAX_UMR_WR = 128,
+};
+
 static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 {
-	return create_umr_res(dev);
+	struct ib_qp_init_attr *init_attr = NULL;
+	struct ib_qp_attr *attr = NULL;
+	struct ib_pd *pd;
+	struct ib_cq *cq;
+	struct ib_qp *qp;
+	int ret;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
+	if (!attr || !init_attr) {
+		ret = -ENOMEM;
+		goto error_0;
+	}
+
+	pd = ib_alloc_pd(&dev->ib_dev, 0);
+	if (IS_ERR(pd)) {
+		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
+		ret = PTR_ERR(pd);
+		goto error_0;
+	}
+
+	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
+	if (IS_ERR(cq)) {
+		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
+		ret = PTR_ERR(cq);
+		goto error_2;
+	}
+
+	init_attr->send_cq = cq;
+	init_attr->recv_cq = cq;
+	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr->cap.max_send_wr = MAX_UMR_WR;
+	init_attr->cap.max_send_sge = 1;
+	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
+	init_attr->port_num = 1;
+	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
+	if (IS_ERR(qp)) {
+		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
+		ret = PTR_ERR(qp);
+		goto error_3;
+	}
+	qp->device     = &dev->ib_dev;
+	qp->real_qp    = qp;
+	qp->uobject    = NULL;
+	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
+	qp->send_cq    = init_attr->send_cq;
+	qp->recv_cq    = init_attr->recv_cq;
+
+	attr->qp_state = IB_QPS_INIT;
+	attr->port_num = 1;
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
+				IB_QP_PORT, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+		goto error_4;
+	}
+
+	memset(attr, 0, sizeof(*attr));
+	attr->qp_state = IB_QPS_RTR;
+	attr->path_mtu = IB_MTU_256;
+
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
+		goto error_4;
+	}
+
+	memset(attr, 0, sizeof(*attr));
+	attr->qp_state = IB_QPS_RTS;
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
+		goto error_4;
+	}
+
+	dev->umrc.qp = qp;
+	dev->umrc.cq = cq;
+	dev->umrc.pd = pd;
+
+	sema_init(&dev->umrc.sem, MAX_UMR_WR);
+	ret = mlx5_mr_cache_init(dev);
+	if (ret) {
+		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
+		goto error_4;
+	}
+
+	kfree(attr);
+	kfree(init_attr);
+
+	return 0;
+
+error_4:
+	mlx5_ib_destroy_qp(qp, NULL);
+	dev->umrc.qp = NULL;
+
+error_3:
+	ib_free_cq(cq);
+	dev->umrc.cq = NULL;
+
+error_2:
+	ib_dealloc_pd(pd);
+	dev->umrc.pd = NULL;
+
+error_0:
+	kfree(attr);
+	kfree(init_attr);
+	return ret;
 }
 
 static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
 {
-	init_delay_drop(dev);
+	struct dentry *root;
 
+	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
+		return 0;
+
+	mutex_init(&dev->delay_drop.lock);
+	dev->delay_drop.dev = dev;
+	dev->delay_drop.activate = false;
+	dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
+	INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
+	atomic_set(&dev->delay_drop.rqs_cnt, 0);
+	atomic_set(&dev->delay_drop.events_cnt, 0);
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	root = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root);
+	dev->delay_drop.dir_debugfs = root;
+
+	debugfs_create_atomic_t("num_timeout_events", 0400, root,
+				&dev->delay_drop.events_cnt);
+	debugfs_create_atomic_t("num_rqs", 0400, root,
+				&dev->delay_drop.rqs_cnt);
+	debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
+			    &fops_delay_drop_timeout);
 	return 0;
 }
 
 static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
 {
-	cancel_delay_drop(dev);
+	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
+		return;
+
+	cancel_work_sync(&dev->delay_drop.delay_drop_work);
+	if (!dev->delay_drop.dir_debugfs)
+		return;
+
+	debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
+	dev->delay_drop.dir_debugfs = NULL;
 }
 
 static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
@@ -7060,38 +4580,6 @@
 	mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
 }
 
-static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
-{
-	int uid;
-
-	uid = mlx5_ib_devx_create(dev, false);
-	if (uid > 0) {
-		dev->devx_whitelist_uid = uid;
-		mlx5_ib_devx_init_event_table(dev);
-	}
-
-	return 0;
-}
-static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
-{
-	if (dev->devx_whitelist_uid) {
-		mlx5_ib_devx_cleanup_event_table(dev);
-		mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
-	}
-}
-
-int mlx5_ib_enable_driver(struct ib_device *dev)
-{
-	struct mlx5_ib_dev *mdev = to_mdev(dev);
-	int ret;
-
-	ret = mlx5_ib_test_wc(mdev);
-	mlx5_ib_dbg(mdev, "Write-Combining %s",
-		    mdev->wc_support ? "supported" : "not supported");
-
-	return ret;
-}
-
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
 		      const struct mlx5_ib_profile *profile,
 		      int stage)
@@ -7139,9 +4627,9 @@
 	STAGE_CREATE(MLX5_IB_STAGE_INIT,
 		     mlx5_ib_stage_init_init,
 		     mlx5_ib_stage_init_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
-		     mlx5_ib_stage_flow_db_init,
-		     mlx5_ib_stage_flow_db_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_FS,
+		     mlx5_ib_fs_init,
+		     mlx5_ib_fs_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
 		     mlx5_ib_stage_caps_init,
 		     mlx5_ib_stage_caps_cleanup),
@@ -7149,8 +4637,8 @@
 		     mlx5_ib_stage_non_default_cb,
 		     NULL),
 	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
-		     mlx5_ib_stage_roce_init,
-		     mlx5_ib_stage_roce_cleanup),
+		     mlx5_ib_roce_init,
+		     mlx5_ib_roce_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_QP,
 		     mlx5_init_qp_table,
 		     mlx5_cleanup_qp_table),
@@ -7158,17 +4646,17 @@
 		     mlx5_init_srq_table,
 		     mlx5_cleanup_srq_table),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
-		     mlx5_ib_stage_dev_res_init,
-		     mlx5_ib_stage_dev_res_cleanup),
+		     mlx5_ib_dev_res_init,
+		     mlx5_ib_dev_res_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
 		     mlx5_ib_stage_dev_notifier_init,
 		     mlx5_ib_stage_dev_notifier_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_ODP,
-		     mlx5_ib_stage_odp_init,
-		     mlx5_ib_stage_odp_cleanup),
+		     mlx5_ib_odp_init_one,
+		     mlx5_ib_odp_cleanup_one),
 	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
-		     mlx5_ib_stage_counters_init,
-		     mlx5_ib_stage_counters_cleanup),
+		     mlx5_ib_counters_init,
+		     mlx5_ib_counters_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
 		     mlx5_ib_stage_cong_debugfs_init,
 		     mlx5_ib_stage_cong_debugfs_cleanup),
@@ -7182,8 +4670,8 @@
 		     NULL,
 		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
-		     mlx5_ib_stage_devx_init,
-		     mlx5_ib_stage_devx_cleanup),
+		     mlx5_ib_devx_init,
+		     mlx5_ib_devx_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
 		     mlx5_ib_stage_ib_reg_init,
 		     mlx5_ib_stage_ib_reg_cleanup),
@@ -7193,15 +4681,18 @@
 	STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
 		     mlx5_ib_stage_delay_drop_init,
 		     mlx5_ib_stage_delay_drop_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
+		     mlx5_ib_restrack_init,
+		     NULL),
 };
 
 const struct mlx5_ib_profile raw_eth_profile = {
 	STAGE_CREATE(MLX5_IB_STAGE_INIT,
 		     mlx5_ib_stage_init_init,
 		     mlx5_ib_stage_init_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
-		     mlx5_ib_stage_flow_db_init,
-		     mlx5_ib_stage_flow_db_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_FS,
+		     mlx5_ib_fs_init,
+		     mlx5_ib_fs_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
 		     mlx5_ib_stage_caps_init,
 		     mlx5_ib_stage_caps_cleanup),
@@ -7209,8 +4700,8 @@
 		     mlx5_ib_stage_raw_eth_non_default_cb,
 		     NULL),
 	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
-		     mlx5_ib_stage_raw_eth_roce_init,
-		     mlx5_ib_stage_raw_eth_roce_cleanup),
+		     mlx5_ib_roce_init,
+		     mlx5_ib_roce_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_QP,
 		     mlx5_init_qp_table,
 		     mlx5_cleanup_qp_table),
@@ -7218,14 +4709,14 @@
 		     mlx5_init_srq_table,
 		     mlx5_cleanup_srq_table),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
-		     mlx5_ib_stage_dev_res_init,
-		     mlx5_ib_stage_dev_res_cleanup),
+		     mlx5_ib_dev_res_init,
+		     mlx5_ib_dev_res_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
 		     mlx5_ib_stage_dev_notifier_init,
 		     mlx5_ib_stage_dev_notifier_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
-		     mlx5_ib_stage_counters_init,
-		     mlx5_ib_stage_counters_cleanup),
+		     mlx5_ib_counters_init,
+		     mlx5_ib_counters_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
 		     mlx5_ib_stage_cong_debugfs_init,
 		     mlx5_ib_stage_cong_debugfs_cleanup),
@@ -7239,14 +4730,17 @@
 		     NULL,
 		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
-		     mlx5_ib_stage_devx_init,
-		     mlx5_ib_stage_devx_cleanup),
+		     mlx5_ib_devx_init,
+		     mlx5_ib_devx_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
 		     mlx5_ib_stage_ib_reg_init,
 		     mlx5_ib_stage_ib_reg_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
 		     mlx5_ib_stage_post_ib_reg_umr_init,
 		     NULL),
+	STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
+		     mlx5_ib_restrack_init,
+		     NULL),
 };
 
 static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 5dbe3eb..5287fc8 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
- * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
  */
 
 #ifndef MLX5_IB_H
@@ -730,8 +703,8 @@
 
 struct mlx5_ib_resources {
 	struct ib_cq	*c0;
-	struct ib_xrcd	*x0;
-	struct ib_xrcd	*x1;
+	u32 xrcdn0;
+	u32 xrcdn1;
 	struct ib_pd	*p0;
 	struct ib_srq	*s0;
 	struct ib_srq	*s1;
@@ -832,7 +805,7 @@
 
 enum mlx5_ib_stages {
 	MLX5_IB_STAGE_INIT,
-	MLX5_IB_STAGE_FLOW_DB,
+	MLX5_IB_STAGE_FS,
 	MLX5_IB_STAGE_CAPS,
 	MLX5_IB_STAGE_NON_DEFAULT_CB,
 	MLX5_IB_STAGE_ROCE,
@@ -850,7 +823,7 @@
 	MLX5_IB_STAGE_IB_REG,
 	MLX5_IB_STAGE_POST_IB_REG_UMR,
 	MLX5_IB_STAGE_DELAY_DROP,
-	MLX5_IB_STAGE_CLASS_ATTR,
+	MLX5_IB_STAGE_RESTRACK,
 	MLX5_IB_STAGE_MAX,
 };
 
@@ -1078,11 +1051,6 @@
 	return container_of(core_qp, struct mlx5_ib_rwq, core_qp);
 }
 
-static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mkey *mmkey)
-{
-	return container_of(mmkey, struct mlx5_ib_mr, mmkey);
-}
-
 static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd)
 {
 	return container_of(ibpd, struct mlx5_ib_pd, ibpd);
@@ -1210,7 +1178,7 @@
 			  struct ib_pd *pd, struct ib_udata *udata);
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata);
+			       u32 max_num_sg);
 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
 					 u32 max_num_sg,
 					 u32 max_num_meta_sg);
@@ -1224,9 +1192,8 @@
 			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
 			const struct ib_mad *in, struct ib_mad *out,
 			size_t *out_mad_size, u16 *out_mad_pkey_index);
-struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
-				   struct ib_udata *udata);
-int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
+int mlx5_ib_alloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
+void mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
 int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset);
 int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port);
 int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev,
@@ -1375,46 +1342,12 @@
 						   u8 *native_port_num);
 void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev,
 				  u8 port_num);
-int mlx5_ib_fill_res_entry(struct sk_buff *msg,
-			   struct rdma_restrack_entry *res);
-int mlx5_ib_fill_stat_entry(struct sk_buff *msg,
-			    struct rdma_restrack_entry *res);
 
 extern const struct uapi_definition mlx5_ib_devx_defs[];
 extern const struct uapi_definition mlx5_ib_flow_defs[];
 extern const struct uapi_definition mlx5_ib_qos_defs[];
+extern const struct uapi_definition mlx5_ib_std_types_defs[];
 
-#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
-int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
-void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
-void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev);
-void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev);
-struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add(
-	struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
-	struct mlx5_flow_context *flow_context,
-	struct mlx5_flow_act *flow_act, u32 counter_id,
-	void *cmd_in, int inlen, int dest_id, int dest_type);
-bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type);
-bool mlx5_ib_devx_is_flow_counter(void *obj, u32 offset, u32 *counter_id);
-void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction);
-#else
-static inline int
-mlx5_ib_devx_create(struct mlx5_ib_dev *dev,
-			   bool is_user) { return -EOPNOTSUPP; }
-static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {}
-static inline void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) {}
-static inline void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) {}
-static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id,
-					     int *dest_type)
-{
-	return false;
-}
-static inline void
-mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
-{
-	return;
-};
-#endif
 static inline void init_query_mad(struct ib_smp *mad)
 {
 	mad->base_version  = 1;
@@ -1423,15 +1356,6 @@
 	mad->method	   = IB_MGMT_METHOD_GET;
 }
 
-static inline u8 convert_access(int acc)
-{
-	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
-	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
-	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
-	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
-	       MLX5_PERM_LOCAL_READ;
-}
-
 static inline int is_qp1(enum ib_qp_type qp_type)
 {
 	return qp_type == MLX5_IB_QPT_HW_GSI;
@@ -1518,9 +1442,6 @@
 			struct mlx5_bfreg_info *bfregi, u32 bfregn,
 			bool dyn_bfreg);
 
-int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
-u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num);
-
 static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev,
 				       bool do_modify_atomic, int access_flags)
 {
@@ -1533,14 +1454,18 @@
 		return false;
 
 	if (access_flags & IB_ACCESS_RELAXED_ORDERING &&
-	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) ||
-	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)))
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
+		return false;
+
+	if (access_flags & IB_ACCESS_RELAXED_ORDERING &&
+	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
+	     !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
 		return false;
 
 	return true;
 }
 
-int mlx5_ib_enable_driver(struct ib_device *dev);
 int mlx5_ib_test_wc(struct mlx5_ib_dev *dev);
 
 static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev)
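The reworked mlx5_ib_can_use_umr() check above now permits UMR reuse for relaxed-ordering MRs whenever the UMR path itself can update the relaxed-ordering bits; it only refuses when the device sets relaxed_ordering_write/read but the matching *_umr capability is absent. A condensed restatement of the rule, assuming only the capability bits shown in the hunk:

	if (access_flags & IB_ACCESS_RELAXED_ORDERING) {
		/* enforced by HW but not updatable through UMR: no UMR reuse */
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
		    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
			return false;
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
		    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
			return false;
	}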
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 4468307..3e6f2f9 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1961,7 +1961,7 @@
 }
 
 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata)
+			       u32 max_num_sg)
 {
 	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
 }
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 77dca1e..cfd7efa 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -816,6 +816,7 @@
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 
+	lockdep_assert_held(&mr->dev->odp_srcu);
 	if (unlikely(io_virt < mr->mmkey.iova))
 		return -EFAULT;
 
@@ -929,11 +930,6 @@
 		if (ret < 0)
 			goto srcu_unlock;
 
-		/*
-		 * When prefetching a page, page fault is generated
-		 * in order to bring the page to the main memory.
-		 * In the current flow, page faults are being counted.
-		 */
 		mlx5_update_odp_stats(mr, faults, ret);
 
 		npages += ret;
@@ -1770,13 +1766,26 @@
 {
 	struct prefetch_mr_work *work =
 		container_of(w, struct prefetch_mr_work, work);
+	struct mlx5_ib_dev *dev;
 	u32 bytes_mapped = 0;
+	int srcu_key;
+	int ret;
 	u32 i;
 
-	for (i = 0; i < work->num_sge; ++i)
-		pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
-			     work->frags[i].length, &bytes_mapped,
-			     work->pf_flags);
+	/* We rely on IB/core only executing this work when num_sge != 0. */
+	WARN_ON(!work->num_sge);
+	dev = work->frags[0].mr->dev;
+	/* SRCU must be held when calling mlx5_odp_populate_xlt() */
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
+	for (i = 0; i < work->num_sge; ++i) {
+		ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
+				   work->frags[i].length, &bytes_mapped,
+				   work->pf_flags);
+		if (ret <= 0)
+			continue;
+		mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
+	}
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 
 	destroy_prefetch_work(work);
 }
@@ -1832,6 +1841,7 @@
 				   &bytes_mapped, pf_flags);
 		if (ret < 0)
 			goto out;
+		mlx5_update_odp_stats(mr, prefetch, ret);
 	}
 	ret = 0;
 
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 1225b8d..59fce5f 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -38,6 +38,7 @@
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
 #include "ib_rep.h"
+#include "counters.h"
 #include "cmd.h"
 #include "qp.h"
 #include "wr.h"
@@ -2031,15 +2032,15 @@
 	switch (init_attr->qp_type) {
 	case IB_QPT_XRC_INI:
 		MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn);
-		MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn);
+		MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1);
 		MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn);
 		break;
 	default:
 		if (init_attr->srq) {
-			MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn);
+			MLX5_SET(qpc, qpc, xrcd, devr->xrcdn0);
 			MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(init_attr->srq)->msrq.srqn);
 		} else {
-			MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn);
+			MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1);
 			MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s1)->msrq.srqn);
 		}
 	}
@@ -2178,11 +2179,11 @@
 		MLX5_SET(qpc, qpc, no_sq, 1);
 
 	if (attr->srq) {
-		MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn);
+		MLX5_SET(qpc, qpc, xrcd, devr->xrcdn0);
 		MLX5_SET(qpc, qpc, srqn_rmpn_xrqn,
 			 to_msrq(attr->srq)->msrq.srqn);
 	} else {
-		MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn);
+		MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1);
 		MLX5_SET(qpc, qpc, srqn_rmpn_xrqn,
 			 to_msrq(devr->s1)->msrq.srqn);
 	}
@@ -3554,7 +3555,7 @@
 	switch (raw_qp_param->operation) {
 	case MLX5_CMD_OP_RST2INIT_QP:
 		rq_state = MLX5_RQC_STATE_RDY;
-		sq_state = MLX5_SQC_STATE_RDY;
+		sq_state = MLX5_SQC_STATE_RST;
 		break;
 	case MLX5_CMD_OP_2ERR_QP:
 		rq_state = MLX5_RQC_STATE_ERR;
@@ -3566,13 +3567,11 @@
 		break;
 	case MLX5_CMD_OP_RTR2RTS_QP:
 	case MLX5_CMD_OP_RTS2RTS_QP:
-		if (raw_qp_param->set_mask ==
-		    MLX5_RAW_QP_RATE_LIMIT) {
-			modify_rq = 0;
-			sq_state = sq->state;
-		} else {
-			return raw_qp_param->set_mask ? -EINVAL : 0;
-		}
+		if (raw_qp_param->set_mask & ~MLX5_RAW_QP_RATE_LIMIT)
+			return -EINVAL;
+
+		modify_rq = 0;
+		sq_state = MLX5_SQC_STATE_RDY;
 		break;
 	case MLX5_CMD_OP_INIT2INIT_QP:
 	case MLX5_CMD_OP_INIT2RTR_QP:
@@ -4114,9 +4113,9 @@
 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	enum ib_qp_state cur_state, new_state;
-	int err = 0;
 	int required = IB_QP_STATE;
 	void *dctc;
+	int err;
 
 	if (!(attr_mask & IB_QP_STATE))
 		return -EINVAL;
@@ -4208,11 +4207,9 @@
 		mlx5_ib_warn(dev, "Modify DCT: Invalid transition from %d to %d\n", cur_state, new_state);
 		return -EINVAL;
 	}
-	if (err)
-		qp->state = IB_QPS_ERR;
-	else
-		qp->state = new_state;
-	return err;
+
+	qp->state = new_state;
+	return 0;
 }
 
 int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
@@ -4450,7 +4447,7 @@
 			[MLX5_SQ_STATE_NA]	= IB_QPS_RESET,
 		},
 		[MLX5_RQC_STATE_RDY] = {
-			[MLX5_SQC_STATE_RST]	= MLX5_QP_STATE_BAD,
+			[MLX5_SQC_STATE_RST]	= MLX5_QP_STATE,
 			[MLX5_SQC_STATE_RDY]	= MLX5_QP_STATE,
 			[MLX5_SQC_STATE_ERR]	= IB_QPS_SQE,
 			[MLX5_SQ_STATE_NA]	= MLX5_QP_STATE,
@@ -4462,7 +4459,7 @@
 			[MLX5_SQ_STATE_NA]	= IB_QPS_ERR,
 		},
 		[MLX5_RQ_STATE_NA] = {
-			[MLX5_SQC_STATE_RST]    = IB_QPS_RESET,
+			[MLX5_SQC_STATE_RST]    = MLX5_QP_STATE,
 			[MLX5_SQC_STATE_RDY]	= MLX5_QP_STATE,
 			[MLX5_SQC_STATE_ERR]	= MLX5_QP_STATE,
 			[MLX5_SQ_STATE_NA]	= MLX5_QP_STATE_BAD,
@@ -4708,41 +4705,23 @@
 	return err;
 }
 
-struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
-				   struct ib_udata *udata)
+int mlx5_ib_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata)
 {
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_ib_xrcd *xrcd;
-	int err;
+	struct mlx5_ib_dev *dev = to_mdev(ibxrcd->device);
+	struct mlx5_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
 
 	if (!MLX5_CAP_GEN(dev->mdev, xrc))
-		return ERR_PTR(-ENOSYS);
+		return -EOPNOTSUPP;
 
-	xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL);
-	if (!xrcd)
-		return ERR_PTR(-ENOMEM);
-
-	err = mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, 0);
-	if (err) {
-		kfree(xrcd);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	return &xrcd->ibxrcd;
+	return mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, 0);
 }
 
-int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
+void mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
 	u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
-	int err;
 
-	err = mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, 0);
-	if (err)
-		mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
-
-	kfree(xrcd);
-	return 0;
+	mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, 0);
 }
 
 static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type)
diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
index 82ea2b9..ba899df 100644
--- a/drivers/infiniband/hw/mlx5/qp.h
+++ b/drivers/infiniband/hw/mlx5/qp.h
@@ -43,4 +43,5 @@
 
 int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn);
 int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn);
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
 #endif /* _MLX5_IB_QP_H */
diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c
index 8f6c04f..887270d 100644
--- a/drivers/infiniband/hw/mlx5/restrack.c
+++ b/drivers/infiniband/hw/mlx5/restrack.c
@@ -1,17 +1,85 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
- * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
+ * Copyright (c) 2019-2020, Mellanox Technologies Ltd. All rights reserved.
  */
 
 #include <uapi/rdma/rdma_netlink.h>
+#include <linux/mlx5/rsc_dump.h>
 #include <rdma/ib_umem_odp.h>
 #include <rdma/restrack.h>
 #include "mlx5_ib.h"
+#include "restrack.h"
 
-static int fill_stat_mr_entry(struct sk_buff *msg,
-			      struct rdma_restrack_entry *res)
+#define MAX_DUMP_SIZE 1024
+
+static int dump_rsc(struct mlx5_core_dev *dev, enum mlx5_sgmt_type type,
+		    int index, void *data, int *data_len)
 {
-	struct ib_mr *ibmr = container_of(res, struct ib_mr, res);
+	struct mlx5_core_dev *mdev = dev;
+	struct mlx5_rsc_dump_cmd *cmd;
+	struct mlx5_rsc_key key = {};
+	struct page *page;
+	int offset = 0;
+	int err = 0;
+	int cmd_err;
+	int size;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	key.size = PAGE_SIZE;
+	key.rsc = type;
+	key.index1 = index;
+	key.num_of_obj1 = 1;
+
+	cmd = mlx5_rsc_dump_cmd_create(mdev, &key);
+	if (IS_ERR(cmd)) {
+		err = PTR_ERR(cmd);
+		goto free_page;
+	}
+
+	do {
+		cmd_err = mlx5_rsc_dump_next(mdev, cmd, page, &size);
+		if (cmd_err < 0 || size + offset > MAX_DUMP_SIZE) {
+			err = cmd_err;
+			goto destroy_cmd;
+		}
+		memcpy(data + offset, page_address(page), size);
+		offset += size;
+	} while (cmd_err > 0);
+	*data_len = offset;
+
+destroy_cmd:
+	mlx5_rsc_dump_cmd_destroy(cmd);
+free_page:
+	__free_page(page);
+	return err;
+}
+
+static int fill_res_raw(struct sk_buff *msg, struct mlx5_ib_dev *dev,
+			enum mlx5_sgmt_type type, u32 key)
+{
+	int len = 0;
+	void *data;
+	int err;
+
+	data = kzalloc(MAX_DUMP_SIZE, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	err = dump_rsc(dev->mdev, type, key, data, &len);
+	if (err)
+		goto out;
+
+	err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data);
+out:
+	kfree(data);
+	return err;
+}
+
+static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
+{
 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
 	struct nlattr *table_attr;
 
@@ -31,6 +99,9 @@
 		    msg, "page_invalidations",
 		    atomic64_read(&mr->odp_stats.invalidations)))
 		goto err_table;
+	if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch",
+					 atomic64_read(&mr->odp_stats.prefetch)))
+		goto err_table;
 
 	nla_nest_end(msg, table_attr);
 	return 0;
@@ -41,10 +112,16 @@
 	return -EMSGSIZE;
 }
 
-static int fill_res_mr_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+static int fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ibmr)
 {
-	struct ib_mr *ibmr = container_of(res, struct ib_mr, res);
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+
+	return fill_res_raw(msg, mr->dev, MLX5_SGMT_TYPE_PRM_QUERY_MKEY,
+			    mlx5_mkey_to_idx(mr->mmkey.key));
+}
+
+static int fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
+{
 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
 	struct nlattr *table_attr;
 
@@ -71,20 +148,32 @@
 	return -EMSGSIZE;
 }
 
-int mlx5_ib_fill_res_entry(struct sk_buff *msg,
-			   struct rdma_restrack_entry *res)
+static int fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ibcq)
 {
-	if (res->type == RDMA_RESTRACK_MR)
-		return fill_res_mr_entry(msg, res);
+	struct mlx5_ib_dev *dev = to_mdev(ibcq->device);
+	struct mlx5_ib_cq *cq = to_mcq(ibcq);
 
-	return 0;
+	return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_CQ, cq->mcq.cqn);
 }
 
-int mlx5_ib_fill_stat_entry(struct sk_buff *msg,
-			    struct rdma_restrack_entry *res)
+static int fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp)
 {
-	if (res->type == RDMA_RESTRACK_MR)
-		return fill_stat_mr_entry(msg, res);
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 
+	return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_QP,
+			    ibqp->qp_num);
+}
+
+static const struct ib_device_ops restrack_ops = {
+	.fill_res_cq_entry_raw = fill_res_cq_entry_raw,
+	.fill_res_mr_entry = fill_res_mr_entry,
+	.fill_res_mr_entry_raw = fill_res_mr_entry_raw,
+	.fill_res_qp_entry_raw = fill_res_qp_entry_raw,
+	.fill_stat_mr_entry = fill_stat_mr_entry,
+};
+
+int mlx5_ib_restrack_init(struct mlx5_ib_dev *dev)
+{
+	ib_set_device_ops(&dev->ib_dev, &restrack_ops);
 	return 0;
 }
diff --git a/drivers/infiniband/hw/mlx5/restrack.h b/drivers/infiniband/hw/mlx5/restrack.h
new file mode 100644
index 0000000..e8d8127
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/restrack.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies Ltd. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_RESTRACK_H
+#define _MLX5_IB_RESTRACK_H
+
+#include "mlx5_ib.h"
+
+int mlx5_ib_restrack_init(struct mlx5_ib_dev *dev);
+
+#endif /* _MLX5_IB_RESTRACK_H */
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 6d1ff13..7e10cbc 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -274,10 +274,10 @@
 	if (srq->wq_sig)
 		in.flags |= MLX5_SRQ_FLAG_WQ_SIG;
 
-	if (init_attr->srq_type == IB_SRQT_XRC)
+	if (init_attr->srq_type == IB_SRQT_XRC && init_attr->ext.xrc.xrcd)
 		in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
 	else
-		in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn;
+		in.xrcd = dev->devr.xrcdn0;
 
 	if (init_attr->srq_type == IB_SRQT_TM) {
 		in.tm_log_list_size =
diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c
new file mode 100644
index 0000000..16145fd
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/std_types.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_ib.h"
+
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_PD_QUERY)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_pd *pd =
+		uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_QUERY_PD_HANDLE);
+	struct mlx5_ib_pd *mpd = to_mpd(pd);
+
+	return uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_PD_RESP_PDN,
+			      &mpd->pdn, sizeof(mpd->pdn));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_PD_QUERY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_QUERY_PD_HANDLE,
+			UVERBS_OBJECT_PD,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_PD_RESP_PDN,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY));
+
+ADD_UVERBS_METHODS(mlx5_ib_pd,
+		   UVERBS_OBJECT_PD,
+		   &UVERBS_METHOD(MLX5_IB_METHOD_PD_QUERY));
+
+const struct uapi_definition mlx5_ib_std_types_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE(
+		UVERBS_OBJECT_PD,
+		&mlx5_ib_pd),
+	{},
+};
diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c
index bc35dbe..4388097 100644
--- a/drivers/infiniband/hw/mlx5/wr.c
+++ b/drivers/infiniband/hw/mlx5/wr.c
@@ -263,7 +263,9 @@
 	return cpu_to_be64(result);
 }
 
-static __be64 get_umr_update_access_mask(int atomic)
+static __be64 get_umr_update_access_mask(int atomic,
+					 int relaxed_ordering_write,
+					 int relaxed_ordering_read)
 {
 	u64 result;
 
@@ -275,6 +277,12 @@
 	if (atomic)
 		result |= MLX5_MKEY_MASK_A;
 
+	if (relaxed_ordering_write)
+		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
+
+	if (relaxed_ordering_read)
+		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
+
 	return cpu_to_be64(result);
 }
 
@@ -289,17 +297,28 @@
 
 static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
 {
-	if ((mask & MLX5_MKEY_MASK_PAGE_SIZE &&
-	     MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) ||
-	    (mask & MLX5_MKEY_MASK_A &&
-	     MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)))
+	if (mask & MLX5_MKEY_MASK_PAGE_SIZE &&
+	    MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
 		return -EPERM;
+
+	if (mask & MLX5_MKEY_MASK_A &&
+	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
+		return -EPERM;
+
+	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
+		return -EPERM;
+
+	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
+		return -EPERM;
+
 	return 0;
 }
 
 static int set_reg_umr_segment(struct mlx5_ib_dev *dev,
 			       struct mlx5_wqe_umr_ctrl_seg *umr,
-			       const struct ib_send_wr *wr, int atomic)
+			       const struct ib_send_wr *wr)
 {
 	const struct mlx5_umr_wr *umrwr = umr_wr(wr);
 
@@ -325,7 +344,10 @@
 	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION)
 		umr->mkey_mask |= get_umr_update_translation_mask();
 	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) {
-		umr->mkey_mask |= get_umr_update_access_mask(atomic);
+		umr->mkey_mask |= get_umr_update_access_mask(
+			!!(MLX5_CAP_GEN(dev->mdev, atomic)),
+			!!(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)),
+			!!(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)));
 		umr->mkey_mask |= get_umr_update_pd_mask();
 	}
 	if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR)
@@ -383,20 +405,31 @@
 
 	memset(seg, 0, sizeof(*seg));
 	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
-		seg->status = MLX5_MKEY_STATUS_FREE;
+		MLX5_SET(mkc, seg, free, 1);
 
-	seg->flags = convert_access(umrwr->access_flags);
+	MLX5_SET(mkc, seg, a,
+		 !!(umrwr->access_flags & IB_ACCESS_REMOTE_ATOMIC));
+	MLX5_SET(mkc, seg, rw,
+		 !!(umrwr->access_flags & IB_ACCESS_REMOTE_WRITE));
+	MLX5_SET(mkc, seg, rr, !!(umrwr->access_flags & IB_ACCESS_REMOTE_READ));
+	MLX5_SET(mkc, seg, lw, !!(umrwr->access_flags & IB_ACCESS_LOCAL_WRITE));
+	MLX5_SET(mkc, seg, lr, 1);
+	MLX5_SET(mkc, seg, relaxed_ordering_write,
+		 !!(umrwr->access_flags & IB_ACCESS_RELAXED_ORDERING));
+	MLX5_SET(mkc, seg, relaxed_ordering_read,
+		 !!(umrwr->access_flags & IB_ACCESS_RELAXED_ORDERING));
+
 	if (umrwr->pd)
-		seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
+		MLX5_SET(mkc, seg, pd, to_mpd(umrwr->pd)->pdn);
 	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION &&
 	    !umrwr->length)
-		seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64);
+		MLX5_SET(mkc, seg, length64, 1);
 
-	seg->start_addr = cpu_to_be64(umrwr->virt_addr);
-	seg->len = cpu_to_be64(umrwr->length);
-	seg->log2_page_size = umrwr->page_shift;
-	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
-				       mlx5_mkey_variant(umrwr->mkey));
+	MLX5_SET64(mkc, seg, start_addr, umrwr->virt_addr);
+	MLX5_SET64(mkc, seg, len, umrwr->length);
+	MLX5_SET(mkc, seg, log_page_size, umrwr->page_shift);
+	MLX5_SET(mkc, seg, qpn, 0xffffff);
+	MLX5_SET(mkc, seg, mkey_7_0, mlx5_mkey_variant(umrwr->mkey));
 }
 
 static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg,
@@ -1224,8 +1257,7 @@
 
 	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
 	(*ctrl)->imm = cpu_to_be32(umr_wr(wr)->mkey);
-	err = set_reg_umr_segment(dev, *seg, wr,
-				  !!(MLX5_CAP_GEN(dev->mdev, atomic)));
+	err = set_reg_umr_segment(dev, *seg, wr);
 	if (unlikely(err))
 		goto out;
 	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index d11c743..6cdbec1 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -2901,7 +2901,7 @@
 }
 
 struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
-			      u32 max_num_sg, struct ib_udata *udata)
+			      u32 max_num_sg)
 {
 	int status;
 	struct ocrdma_mr *mr;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index 3a50108..df8e3b9 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -101,7 +101,7 @@
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
 				 u64 virt, int acc, struct ib_udata *);
 struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			      u32 max_num_sg, struct ib_udata *udata);
+			      u32 max_num_sg);
 int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		     unsigned int *sg_offset);
 
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index b1de8d6..d85f992 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -110,7 +110,6 @@
 	if (err)
 		return err;
 
-	immutable->pkey_tbl_len = 1;
 	immutable->gid_tbl_len = 1;
 	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
 	immutable->max_mad_size = 0;
@@ -179,6 +178,7 @@
 
 static const struct ib_device_ops qedr_roce_dev_ops = {
 	.get_port_immutable = qedr_roce_port_immutable,
+	.query_pkey = qedr_query_pkey,
 };
 
 static void qedr_roce_register_device(struct qedr_dev *dev)
@@ -221,7 +221,6 @@
 	.post_srq_recv = qedr_post_srq_recv,
 	.process_mad = qedr_process_mad,
 	.query_device = qedr_query_device,
-	.query_pkey = qedr_query_pkey,
 	.query_port = qedr_query_port,
 	.query_qp = qedr_query_qp,
 	.query_srq = qedr_query_srq,
diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index fdf90ec..4602921 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -235,6 +235,7 @@
 	u32 dpi_size;
 	u16 dpi;
 	bool db_rec;
+	u8 edpm_mode;
 };
 
 union db_prod32 {
@@ -344,10 +345,10 @@
 	u32 wqe_prod;
 	u32 sge_prod;
 	u32 wr_prod_cnt;
-	u32 wr_cons_cnt;
+	atomic_t wr_cons_cnt;
 	u32 num_elems;
 
-	u32 *virt_prod_pair_addr;
+	struct rdma_srq_producers *virt_prod_pair_addr;
 	dma_addr_t phy_prod_pair_addr;
 };
 
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index f5aa85a..4ce4e2e 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -239,7 +239,6 @@
 	attr->ip_gids = true;
 	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
 		attr->gid_tbl_len = 1;
-		attr->pkey_tbl_len = 1;
 	} else {
 		attr->gid_tbl_len = QEDR_MAX_SGID;
 		attr->pkey_tbl_len = QEDR_ROCE_PKEY_TABLE_LEN;
@@ -275,7 +274,8 @@
 			DP_ERR(dev, "Problem copying data from user space\n");
 			return -EFAULT;
 		}
-
+		ctx->edpm_mode = !!(ureq.context_flags &
+				    QEDR_ALLOC_UCTX_EDPM_MODE);
 		ctx->db_rec = !!(ureq.context_flags & QEDR_ALLOC_UCTX_DB_REC);
 	}
 
@@ -316,11 +316,15 @@
 		uresp.dpm_flags = QEDR_DPM_TYPE_IWARP_LEGACY;
 	else
 		uresp.dpm_flags = QEDR_DPM_TYPE_ROCE_ENHANCED |
-				  QEDR_DPM_TYPE_ROCE_LEGACY;
+				  QEDR_DPM_TYPE_ROCE_LEGACY |
+				  QEDR_DPM_TYPE_ROCE_EDPM_MODE;
 
-	uresp.dpm_flags |= QEDR_DPM_SIZES_SET;
-	uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE;
-	uresp.edpm_trans_size = QEDR_EDPM_TRANS_SIZE;
+	if (ureq.context_flags & QEDR_SUPPORT_DPM_SIZES) {
+		uresp.dpm_flags |= QEDR_DPM_SIZES_SET;
+		uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE;
+		uresp.edpm_trans_size = QEDR_EDPM_TRANS_SIZE;
+		uresp.edpm_limit_size = QEDR_EDPM_MAX_SIZE;
+	}
 
 	uresp.wids_enabled = 1;
 	uresp.wid_count = oparams.wid_count;
@@ -1754,7 +1758,7 @@
 	struct qed_rdma_create_qp_out_params out_params;
 	struct qedr_pd *pd = get_qedr_pd(ibpd);
 	struct qedr_create_qp_uresp uresp;
-	struct qedr_ucontext *ctx = NULL;
+	struct qedr_ucontext *ctx = pd ? pd->uctx : NULL;
 	struct qedr_create_qp_ureq ureq;
 	int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1);
 	int rc = -EINVAL;
@@ -1792,6 +1796,9 @@
 		in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa;
 	}
 
+	if (ctx)
+		SET_FIELD(in_params.flags, QED_ROCE_EDPM_MODE, ctx->edpm_mode);
+
 	qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx,
 					      &in_params, &out_params);
 
@@ -3004,7 +3011,7 @@
 }
 
 struct ib_mr *qedr_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
-			    u32 max_num_sg, struct ib_udata *udata)
+			    u32 max_num_sg)
 {
 	struct qedr_mr *mr;
 
@@ -3687,7 +3694,7 @@
 	 * count and consumer count and subtract it from max
 	 * work request supported so that we get elements left.
 	 */
-	used = hw_srq->wr_prod_cnt - hw_srq->wr_cons_cnt;
+	used = hw_srq->wr_prod_cnt - (u32)atomic_read(&hw_srq->wr_cons_cnt);
 
 	return hw_srq->max_wr - used;
 }
@@ -3702,7 +3709,6 @@
 	unsigned long flags;
 	int status = 0;
 	u32 num_sge;
-	u32 offset;
 
 	spin_lock_irqsave(&srq->lock, flags);
 
@@ -3715,7 +3721,8 @@
 		if (!qedr_srq_elem_left(hw_srq) ||
 		    wr->num_sge > srq->hw_srq.max_sges) {
 			DP_ERR(dev, "Can't post WR  (%d,%d) || (%d > %d)\n",
-			       hw_srq->wr_prod_cnt, hw_srq->wr_cons_cnt,
+			       hw_srq->wr_prod_cnt,
+			       atomic_read(&hw_srq->wr_cons_cnt),
 			       wr->num_sge, srq->hw_srq.max_sges);
 			status = -ENOMEM;
 			*bad_wr = wr;
@@ -3749,22 +3756,20 @@
 			hw_srq->sge_prod++;
 		}
 
-		/* Flush WQE and SGE information before
+		/* Update WQE and SGE information before
 		 * updating producer.
 		 */
-		wmb();
+		dma_wmb();
 
 		/* SRQ producer is 8 bytes. Need to update SGE producer index
 		 * in first 4 bytes and need to update WQE producer in
 		 * next 4 bytes.
 		 */
-		*srq->hw_srq.virt_prod_pair_addr = hw_srq->sge_prod;
-		offset = offsetof(struct rdma_srq_producers, wqe_prod);
-		*((u8 *)srq->hw_srq.virt_prod_pair_addr + offset) =
-			hw_srq->wqe_prod;
+		srq->hw_srq.virt_prod_pair_addr->sge_prod = hw_srq->sge_prod;
+		/* Make sure sge producer is updated first */
+		dma_wmb();
+		srq->hw_srq.virt_prod_pair_addr->wqe_prod = hw_srq->wqe_prod;
 
-		/* Flush producer after updating it. */
-		wmb();
 		wr = wr->next;
 	}
 
@@ -4183,7 +4188,7 @@
 	} else {
 		__process_resp_one(dev, qp, cq, wc, resp, wr_id);
 	}
-	srq->hw_srq.wr_cons_cnt++;
+	atomic_inc(&srq->hw_srq.wr_cons_cnt);
 
 	return 1;
 }
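The rewritten SRQ post path above publishes the producer pair in two stages; dma_wmb() is sufficient (instead of the previous wmb()) because virt_prod_pair_addr points at coherent DMA memory, not MMIO. A minimal sketch of the ordering it relies on, with field widths as declared by the driver (not shown in this excerpt):

	/* WQE and SGE contents must be visible before any producer index. */
	dma_wmb();
	srq->hw_srq.virt_prod_pair_addr->sge_prod = hw_srq->sge_prod;

	/* The SGE producer must be visible before the WQE producer, so the
	 * adapter never fetches a WQE whose SGEs it cannot yet see.
	 */
	dma_wmb();
	srq->hw_srq.virt_prod_pair_addr->wqe_prod = hw_srq->wqe_prod;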
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 5e02387..39dd628 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -84,7 +84,7 @@
 		   int sg_nents, unsigned int *sg_offset);
 
 struct ib_mr *qedr_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			    u32 max_num_sg, struct ib_udata *udata);
+			    u32 max_num_sg);
 int qedr_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc);
 int qedr_post_send(struct ib_qp *, const struct ib_send_wr *,
 		   const struct ib_send_wr **bad_wr);
diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c
index 7875883..398c4c0 100644
--- a/drivers/infiniband/hw/usnic/usnic_fwd.c
+++ b/drivers/infiniband/hw/usnic/usnic_fwd.c
@@ -214,7 +214,7 @@
 	if (!flow)
 		return ERR_PTR(-ENOMEM);
 
-	tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa);
+	tlv = dma_alloc_coherent(&pdev->dev, tlv_size, &tlv_pa, GFP_ATOMIC);
 	if (!tlv) {
 		usnic_err("Failed to allocate memory\n");
 		status = -ENOMEM;
@@ -258,7 +258,7 @@
 
 out_free_tlv:
 	spin_unlock(&ufdev->lock);
-	pci_free_consistent(pdev, tlv_size, tlv, tlv_pa);
+	dma_free_coherent(&pdev->dev, tlv_size, tlv, tlv_pa);
 	if (!status)
 		return flow;
 out_free_flow:
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
index b039f1f..77a010e68 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -202,7 +202,7 @@
  * @return: ib_mr pointer on success, otherwise returns an errno.
  */
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			      u32 max_num_sg, struct ib_udata *udata)
+			      u32 max_num_sg)
 {
 	struct pvrdma_dev *dev = to_vdev(pd->device);
 	struct pvrdma_user_mr *mr;
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index 2677022..699b208 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -406,7 +406,7 @@
 				 struct ib_udata *udata);
 int pvrdma_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			      u32 max_num_sg, struct ib_udata *udata);
+			      u32 max_num_sg);
 int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 		     int sg_nents, unsigned int *sg_offset);
 int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c
index 40480ad..75a04b1 100644
--- a/drivers/infiniband/sw/rdmavt/ah.c
+++ b/drivers/infiniband/sw/rdmavt/ah.c
@@ -90,8 +90,7 @@
 /**
  * rvt_create_ah - create an address handle
  * @ibah: the IB address handle
- * @ah_attr: the attributes of the AH
- * @create_flags: create address handle flags (see enum rdma_create_ah_flags)
+ * @init_attr: the attributes of the AH
  * @udata: pointer to user's input output buffer information.
  *
  * This may be called from interrupt context.
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 60864e5..2f7c25f 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -576,7 +576,7 @@
  * Return: the memory region on success, otherwise return an errno.
  */
 struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			   u32 max_num_sg, struct ib_udata *udata)
+			   u32 max_num_sg)
 {
 	struct rvt_mr *mr;
 
diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h
index 780fc63..b3aba35 100644
--- a/drivers/infiniband/sw/rdmavt/mr.h
+++ b/drivers/infiniband/sw/rdmavt/mr.h
@@ -71,7 +71,7 @@
 			      struct ib_udata *udata);
 int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			   u32 max_num_sg, struct ib_udata *udata);
+			   u32 max_num_sg);
 int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 		  int sg_nents, unsigned int *sg_offset);
 
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 5642eef..907203a 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -40,14 +40,6 @@
 MODULE_DESCRIPTION("Soft RDMA transport");
 MODULE_LICENSE("Dual BSD/GPL");
 
-/* free resources for all ports on a device */
-static void rxe_cleanup_ports(struct rxe_dev *rxe)
-{
-	kfree(rxe->port.pkey_tbl);
-	rxe->port.pkey_tbl = NULL;
-
-}
-
 /* free resources for a rxe device all objects created for this device must
  * have been destroyed
  */
@@ -66,8 +58,6 @@
 	rxe_pool_cleanup(&rxe->mc_grp_pool);
 	rxe_pool_cleanup(&rxe->mc_elem_pool);
 
-	rxe_cleanup_ports(rxe);
-
 	if (rxe->tfm)
 		crypto_free_shash(rxe->tfm);
 }
@@ -111,7 +101,7 @@
 }
 
 /* initialize port attributes */
-static int rxe_init_port_param(struct rxe_port *port)
+static void rxe_init_port_param(struct rxe_port *port)
 {
 	port->attr.state		= IB_PORT_DOWN;
 	port->attr.max_mtu		= IB_MTU_4096;
@@ -134,35 +124,19 @@
 	port->attr.phys_state		= RXE_PORT_PHYS_STATE;
 	port->mtu_cap			= ib_mtu_enum_to_int(IB_MTU_256);
 	port->subnet_prefix		= cpu_to_be64(RXE_PORT_SUBNET_PREFIX);
-
-	return 0;
 }
 
 /* initialize port state, note IB convention that HCA ports are always
  * numbered from 1
  */
-static int rxe_init_ports(struct rxe_dev *rxe)
+static void rxe_init_ports(struct rxe_dev *rxe)
 {
 	struct rxe_port *port = &rxe->port;
 
 	rxe_init_port_param(port);
-
-	if (!port->attr.pkey_tbl_len || !port->attr.gid_tbl_len)
-		return -EINVAL;
-
-	port->pkey_tbl = kcalloc(port->attr.pkey_tbl_len,
-			sizeof(*port->pkey_tbl), GFP_KERNEL);
-
-	if (!port->pkey_tbl)
-		return -ENOMEM;
-
-	port->pkey_tbl[0] = 0xffff;
 	addrconf_addr_eui48((unsigned char *)&port->port_guid,
 			    rxe->ndev->dev_addr);
-
 	spin_lock_init(&port->port_lock);
-
-	return 0;
 }
 
 /* init pools of managed objects */
@@ -252,13 +226,11 @@
 	/* init default device parameters */
 	rxe_init_device_param(rxe);
 
-	err = rxe_init_ports(rxe);
-	if (err)
-		goto err1;
+	rxe_init_ports(rxe);
 
 	err = rxe_init_pools(rxe);
 	if (err)
-		goto err2;
+		return err;
 
 	/* init pending mmap list */
 	spin_lock_init(&rxe->mmap_offset_lock);
@@ -268,11 +240,6 @@
 	mutex_init(&rxe->usdev_lock);
 
 	return 0;
-
-err2:
-	rxe_cleanup_ports(rxe);
-err1:
-	return err;
 }
 
 void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 775c23b..39dc3bf 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -103,8 +103,8 @@
 	from_mem_obj,
 };
 
-int rxe_mem_init_dma(struct rxe_pd *pd,
-		     int access, struct rxe_mem *mem);
+void rxe_mem_init_dma(struct rxe_pd *pd,
+		      int access, struct rxe_mem *mem);
 
 int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
 		      u64 length, u64 iova, int access, struct ib_udata *udata,
@@ -132,9 +132,6 @@
 
 int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length);
 
-int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
-		      u64 *page, int num_pages, u64 iova);
-
 void rxe_mem_cleanup(struct rxe_pool_entry *arg);
 
 int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
@@ -145,7 +142,6 @@
 struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
 				int paylen, struct rxe_pkt_info *pkt);
 int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc);
-enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num);
 const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num);
 struct device *rxe_dma_device(struct rxe_dev *rxe);
 int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid);
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index e83c7b5..cdd811a 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -144,8 +144,8 @@
 	return -ENOMEM;
 }
 
-int rxe_mem_init_dma(struct rxe_pd *pd,
-		     int access, struct rxe_mem *mem)
+void rxe_mem_init_dma(struct rxe_pd *pd,
+		      int access, struct rxe_mem *mem)
 {
 	rxe_mem_init(access, mem);
 
@@ -153,8 +153,6 @@
 	mem->access		= access;
 	mem->state		= RXE_MEM_STATE_VALID;
 	mem->type		= RXE_MEM_TYPE_DMA;
-
-	return 0;
 }
 
 int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
@@ -587,47 +585,3 @@
 
 	return mem;
 }
-
-int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
-		      u64 *page, int num_pages, u64 iova)
-{
-	int i;
-	int num_buf;
-	int err;
-	struct rxe_map **map;
-	struct rxe_phys_buf *buf;
-	int page_size;
-
-	if (num_pages > mem->max_buf) {
-		err = -EINVAL;
-		goto err1;
-	}
-
-	num_buf		= 0;
-	page_size	= 1 << mem->page_shift;
-	map		= mem->map;
-	buf		= map[0]->buf;
-
-	for (i = 0; i < num_pages; i++) {
-		buf->addr = *page++;
-		buf->size = page_size;
-		buf++;
-		num_buf++;
-
-		if (num_buf == RXE_BUF_PER_MAP) {
-			map++;
-			buf = map[0]->buf;
-			num_buf = 0;
-		}
-	}
-
-	mem->iova	= iova;
-	mem->va		= iova;
-	mem->length	= num_pages << mem->page_shift;
-	mem->state	= RXE_MEM_STATE_VALID;
-
-	return 0;
-
-err1:
-	return err;
-}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 312c2fc..0c38086 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -520,11 +520,6 @@
 	return rxe->ndev->name;
 }
 
-enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num)
-{
-	return IB_LINK_LAYER_ETHERNET;
-}
-
 int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
 {
 	int err;
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index 99e9d8b..2f381ae 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -100,7 +100,7 @@
 	RXE_MAX_SRQ_SGE			= 27,
 	RXE_MIN_SRQ_SGE			= 1,
 	RXE_MAX_FMR_PAGE_LIST_LEN	= 512,
-	RXE_MAX_PKEYS			= 64,
+	RXE_MAX_PKEYS			= 1,
 	RXE_LOCAL_CA_ACK_DELAY		= 15,
 
 	RXE_MAX_UCONTEXT		= 512,
@@ -148,7 +148,7 @@
 	RXE_PORT_INIT_TYPE_REPLY	= 0,
 	RXE_PORT_ACTIVE_WIDTH		= IB_WIDTH_1X,
 	RXE_PORT_ACTIVE_SPEED		= 1,
-	RXE_PORT_PKEY_TBL_LEN		= 64,
+	RXE_PORT_PKEY_TBL_LEN		= 1,
 	RXE_PORT_PHYS_STATE		= IB_PORT_PHYS_STATE_POLLING,
 	RXE_PORT_SUBNET_PREFIX		= 0xfe80000000000000ULL,
 };
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
index 831ad57..7e123d3 100644
--- a/drivers/infiniband/sw/rxe/rxe_recv.c
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -101,36 +101,15 @@
 static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
 		      u32 qpn, struct rxe_qp *qp)
 {
-	int i;
-	int found_pkey = 0;
 	struct rxe_port *port = &rxe->port;
 	u16 pkey = bth_pkey(pkt);
 
 	pkt->pkey_index = 0;
 
-	if (qpn == 1) {
-		for (i = 0; i < port->attr.pkey_tbl_len; i++) {
-			if (pkey_match(pkey, port->pkey_tbl[i])) {
-				pkt->pkey_index = i;
-				found_pkey = 1;
-				break;
-			}
-		}
-
-		if (!found_pkey) {
-			pr_warn_ratelimited("bad pkey = 0x%x\n", pkey);
-			set_bad_pkey_cntr(port);
-			goto err1;
-		}
-	} else {
-		if (unlikely(!pkey_match(pkey,
-					 port->pkey_tbl[qp->attr.pkey_index]
-					))) {
-			pr_warn_ratelimited("bad pkey = 0x%0x\n", pkey);
-			set_bad_pkey_cntr(port);
-			goto err1;
-		}
-		pkt->pkey_index = qp->attr.pkey_index;
+	if (!pkey_match(pkey, IB_DEFAULT_PKEY_FULL)) {
+		pr_warn_ratelimited("bad pkey = 0x%x\n", pkey);
+		set_bad_pkey_cntr(port);
+		goto err1;
 	}
 
 	if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) &&
@@ -330,10 +309,14 @@
 
 static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb)
 {
+	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
 	const struct ib_gid_attr *gid_attr;
 	union ib_gid dgid;
 	union ib_gid *pdgid;
 
+	if (pkt->mask & RXE_LOOPBACK_MASK)
+		return 0;
+
 	if (skb->protocol == htons(ETH_P_IP)) {
 		ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
 				       (struct in6_addr *)&dgid);
@@ -366,7 +349,7 @@
 	if (unlikely(skb->len < pkt->offset + RXE_BTH_BYTES))
 		goto drop;
 
-	if (unlikely(rxe_match_dgid(rxe, skb) < 0)) {
+	if (rxe_match_dgid(rxe, skb) < 0) {
 		pr_warn_ratelimited("failed matching dgid\n");
 		goto drop;
 	}
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index e503117..34df2b5 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -381,7 +381,6 @@
 				       struct rxe_pkt_info *pkt)
 {
 	struct rxe_dev		*rxe = to_rdev(qp->ibqp.device);
-	struct rxe_port		*port = &rxe->port;
 	struct sk_buff		*skb;
 	struct rxe_send_wr	*ibwr = &wqe->wr;
 	struct rxe_av		*av;
@@ -419,9 +418,7 @@
 			(pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) ==
 			(RXE_WRITE_MASK | RXE_IMMDT_MASK));
 
-	pkey = (qp_type(qp) == IB_QPT_GSI) ?
-		 port->pkey_tbl[ibwr->wr.ud.pkey_index] :
-		 port->pkey_tbl[qp->attr.pkey_index];
+	pkey = IB_DEFAULT_PKEY_FULL;
 
 	qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn :
 					 qp->attr.dest_qp_num;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index b8a22af..bb61e53 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -83,22 +83,11 @@
 static int rxe_query_pkey(struct ib_device *device,
 			  u8 port_num, u16 index, u16 *pkey)
 {
-	struct rxe_dev *rxe = to_rdev(device);
-	struct rxe_port *port;
+	if (index > 0)
+		return -EINVAL;
 
-	port = &rxe->port;
-
-	if (unlikely(index >= port->attr.pkey_tbl_len)) {
-		dev_warn(device->dev.parent, "invalid index = %d\n",
-			 index);
-		goto err1;
-	}
-
-	*pkey = port->pkey_tbl[index];
+	*pkey = IB_DEFAULT_PKEY_FULL;
 	return 0;
-
-err1:
-	return -EINVAL;
 }
 
 static int rxe_modify_device(struct ib_device *dev,
@@ -141,9 +130,7 @@
 static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
 					       u8 port_num)
 {
-	struct rxe_dev *rxe = to_rdev(dev);
-
-	return rxe_link_layer(rxe, port_num);
+	return IB_LINK_LAYER_ETHERNET;
 }
 
 static int rxe_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
@@ -684,6 +671,7 @@
 	unsigned int mask;
 	unsigned int length = 0;
 	int i;
+	struct ib_send_wr *next;
 
 	while (wr) {
 		mask = wr_opcode_mask(wr->opcode, qp);
@@ -700,6 +688,8 @@
 			break;
 		}
 
+		next = wr->next;
+
 		length = 0;
 		for (i = 0; i < wr->num_sge; i++)
 			length += wr->sg_list[i].length;
@@ -710,7 +700,7 @@
 			*bad_wr = wr;
 			break;
 		}
-		wr = wr->next;
+		wr = next;
 	}
 
 	rxe_run_task(&qp->req.task, 1);
@@ -901,30 +891,16 @@
 	struct rxe_dev *rxe = to_rdev(ibpd->device);
 	struct rxe_pd *pd = to_rpd(ibpd);
 	struct rxe_mem *mr;
-	int err;
 
 	mr = rxe_alloc(&rxe->mr_pool);
-	if (!mr) {
-		err = -ENOMEM;
-		goto err1;
-	}
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
 
 	rxe_add_index(mr);
-
 	rxe_add_ref(pd);
-
-	err = rxe_mem_init_dma(pd, access, mr);
-	if (err)
-		goto err2;
+	rxe_mem_init_dma(pd, access, mr);
 
 	return &mr->ibmr;
-
-err2:
-	rxe_drop_ref(pd);
-	rxe_drop_index(mr);
-	rxe_drop_ref(mr);
-err1:
-	return ERR_PTR(err);
 }
 
 static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
@@ -975,7 +951,7 @@
 }
 
 static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
-				  u32 max_num_sg, struct ib_udata *udata)
+				  u32 max_num_sg)
 {
 	struct rxe_dev *rxe = to_rdev(ibpd->device);
 	struct rxe_pd *pd = to_rpd(ibpd);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 92de39c..c664c7f 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -371,7 +371,6 @@
 
 struct rxe_port {
 	struct ib_port_attr	attr;
-	u16			*pkey_tbl;
 	__be64			port_guid;
 	__be64			subnet_prefix;
 	spinlock_t		port_lock; /* guard port */
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
index ed60c9e4..d862bec 100644
--- a/drivers/infiniband/sw/siw/siw_main.c
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -289,7 +289,6 @@
 	.post_srq_recv = siw_post_srq_recv,
 	.query_device = siw_query_device,
 	.query_gid = siw_query_gid,
-	.query_pkey = siw_query_pkey,
 	.query_port = siw_query_port,
 	.query_qp = siw_query_qp,
 	.query_srq = siw_query_srq,
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 987e2ba..adafa1b 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -176,7 +176,6 @@
 	attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 	attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
 		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
-	attr->pkey_tbl_len = 1;
 	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
 	attr->state = sdev->state;
 	/*
@@ -204,20 +203,12 @@
 	if (rv)
 		return rv;
 
-	port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
 	port_immutable->gid_tbl_len = attr.gid_tbl_len;
 	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
 
 	return 0;
 }
 
-int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
-{
-	/* Report the default pkey */
-	*pkey = 0xffff;
-	return 0;
-}
-
 int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
 		  union ib_gid *gid)
 {
@@ -1373,7 +1364,7 @@
 }
 
 struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			   u32 max_sge, struct ib_udata *udata)
+			   u32 max_sge)
 {
 	struct siw_device *sdev = to_siw_dev(pd->device);
 	struct siw_mr *mr = NULL;
diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h
index 1a73198..d957227 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.h
+++ b/drivers/infiniband/sw/siw/siw_verbs.h
@@ -46,7 +46,6 @@
 		  struct ib_udata *udata);
 int siw_query_port(struct ib_device *base_dev, u8 port,
 		   struct ib_port_attr *attr);
-int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey);
 int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
 		  union ib_gid *gid);
 int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
@@ -69,7 +68,7 @@
 struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
 			      u64 rnic_va, int rights, struct ib_udata *udata);
 struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
-			   u32 max_sge, struct ib_udata *udata);
+			   u32 max_sge);
 struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
 int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
 		  unsigned int *sg_off);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 9a3379c..3440dc4 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -515,7 +515,7 @@
 
 int ipoib_ib_dev_open_default(struct net_device *dev);
 int ipoib_ib_dev_open(struct net_device *dev);
-int ipoib_ib_dev_stop(struct net_device *dev);
+void ipoib_ib_dev_stop(struct net_device *dev);
 void ipoib_ib_dev_up(struct net_device *dev);
 void ipoib_ib_dev_down(struct net_device *dev);
 int ipoib_ib_dev_stop_default(struct net_device *dev);
@@ -527,7 +527,7 @@
 
 void ipoib_mcast_restart_task(struct work_struct *work);
 void ipoib_mcast_start_thread(struct net_device *dev);
-int ipoib_mcast_stop_thread(struct net_device *dev);
+void ipoib_mcast_stop_thread(struct net_device *dev);
 
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index da3c531..494f413 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -670,13 +670,12 @@
 	return rc;
 }
 
-static void __ipoib_reap_ah(struct net_device *dev)
+static void ipoib_reap_dead_ahs(struct ipoib_dev_priv *priv)
 {
-	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 	struct ipoib_ah *ah, *tah;
 	unsigned long flags;
 
-	netif_tx_lock_bh(dev);
+	netif_tx_lock_bh(priv->dev);
 	spin_lock_irqsave(&priv->lock, flags);
 
 	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
@@ -687,37 +686,37 @@
 		}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
-	netif_tx_unlock_bh(dev);
+	netif_tx_unlock_bh(priv->dev);
 }
 
 void ipoib_reap_ah(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
 		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
-	struct net_device *dev = priv->dev;
 
-	__ipoib_reap_ah(dev);
+	ipoib_reap_dead_ahs(priv);
 
 	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
 		queue_delayed_work(priv->wq, &priv->ah_reap_task,
 				   round_jiffies_relative(HZ));
 }
 
-static void ipoib_flush_ah(struct net_device *dev)
+static void ipoib_start_ah_reaper(struct ipoib_dev_priv *priv)
 {
-	struct ipoib_dev_priv *priv = ipoib_priv(dev);
-
-	cancel_delayed_work(&priv->ah_reap_task);
-	flush_workqueue(priv->wq);
-	ipoib_reap_ah(&priv->ah_reap_task.work);
+	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+	queue_delayed_work(priv->wq, &priv->ah_reap_task,
+			   round_jiffies_relative(HZ));
 }
 
-static void ipoib_stop_ah(struct net_device *dev)
+static void ipoib_stop_ah_reaper(struct ipoib_dev_priv *priv)
 {
-	struct ipoib_dev_priv *priv = ipoib_priv(dev);
-
 	set_bit(IPOIB_STOP_REAPER, &priv->flags);
-	ipoib_flush_ah(dev);
+	cancel_delayed_work(&priv->ah_reap_task);
+	/*
+	 * After ipoib_stop_ah_reaper() we always go through
+	 * ipoib_reap_dead_ahs(), which ensures the work is really stopped and
+	 * does a final flush of the dead_ahs list.
+	 */
 }
 
 static int recvs_pending(struct net_device *dev)
@@ -846,18 +845,6 @@
 	return 0;
 }
 
-int ipoib_ib_dev_stop(struct net_device *dev)
-{
-	struct ipoib_dev_priv *priv = ipoib_priv(dev);
-
-	priv->rn_ops->ndo_stop(dev);
-
-	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
-	ipoib_flush_ah(dev);
-
-	return 0;
-}
-
 int ipoib_ib_dev_open_default(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -901,10 +888,7 @@
 		return -1;
 	}
 
-	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
-	queue_delayed_work(priv->wq, &priv->ah_reap_task,
-			   round_jiffies_relative(HZ));
-
+	ipoib_start_ah_reaper(priv);
 	if (priv->rn_ops->ndo_open(dev)) {
 		pr_warn("%s: Failed to open dev\n", dev->name);
 		goto dev_stop;
@@ -915,13 +899,20 @@
 	return 0;
 
 dev_stop:
-	set_bit(IPOIB_STOP_REAPER, &priv->flags);
-	cancel_delayed_work(&priv->ah_reap_task);
-	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
-	ipoib_ib_dev_stop(dev);
+	ipoib_stop_ah_reaper(priv);
 	return -1;
 }
 
+void ipoib_ib_dev_stop(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = ipoib_priv(dev);
+
+	priv->rn_ops->ndo_stop(dev);
+
+	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
+	ipoib_stop_ah_reaper(priv);
+}
+
 void ipoib_pkey_dev_check_presence(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1232,7 +1223,7 @@
 		ipoib_mcast_dev_flush(dev);
 		if (oper_up)
 			set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
-		ipoib_flush_ah(dev);
+		ipoib_reap_dead_ahs(priv);
 	}
 
 	if (level >= IPOIB_FLUSH_NORMAL)
@@ -1307,7 +1298,7 @@
 	 * the neighbor garbage collection is stopped and reaped.
 	 * That should all be done now, so make a final ah flush.
 	 */
-	ipoib_stop_ah(dev);
+	ipoib_reap_dead_ahs(priv);
 
 	clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
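Consolidating the ipoib_ib.c hunks above, the AH reaper now has an explicit lifecycle; a brief sketch of the intended ordering (a restatement of this file's changes, not a literal call chain):

	/* open path */
	ipoib_start_ah_reaper(priv);	/* clear IPOIB_STOP_REAPER, queue ah_reap_task */

	/* stop path */
	ipoib_stop_ah_reaper(priv);	/* set IPOIB_STOP_REAPER, cancel_delayed_work() */
	ipoib_reap_dead_ahs(priv);	/* callers later do a final, synchronous sweep of dead_ahs */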
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 3cfb682..752581a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1892,8 +1892,15 @@
 
 	priv->max_ib_mtu = ppriv->max_ib_mtu;
 	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
-	memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
-	memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid));
+	if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN))
+		memcpy(&priv->local_gid, priv->dev->dev_addr + 4,
+		       sizeof(priv->local_gid));
+	else {
+		memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr,
+		       INFINIBAND_ALEN);
+		memcpy(&priv->local_gid, &ppriv->local_gid,
+		       sizeof(priv->local_gid));
+	}
 }
 
 static int ipoib_ndo_init(struct net_device *ndev)
@@ -1976,6 +1983,8 @@
 
 	/* no more works over the priv->wq */
 	if (priv->wq) {
+		/* See ipoib_mcast_carrier_on_task() */
+		WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags));
 		flush_workqueue(priv->wq);
 		destroy_workqueue(priv->wq);
 		priv->wq = NULL;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 9bfa514..86e4ed6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -680,15 +680,13 @@
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
-int ipoib_mcast_stop_thread(struct net_device *dev)
+void ipoib_mcast_stop_thread(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 
 	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
 
 	cancel_delayed_work_sync(&priv->mcast_task);
-
-	return 0;
 }
 
 static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 1d77c7f..78ee944 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -300,18 +300,6 @@
 struct iscsi_iser_task;
 
 /**
- * struct iser_comp - iSER completion context
- *
- * @cq:         completion queue
- * @active_qps: Number of active QPs attached
- *              to completion context
- */
-struct iser_comp {
-	struct ib_cq		*cq;
-	int                      active_qps;
-};
-
-/**
  * struct iser_device - iSER device handle
  *
  * @ib_device:     RDMA device
@@ -320,9 +308,6 @@
  * @event_handler: IB events handle routine
  * @ig_list:	   entry in devices list
  * @refcount:      Reference counter, dominated by open iser connections
- * @comps_used:    Number of completion contexts used, Min between online
- *                 cpus and device max completion vectors
- * @comps:         Dinamically allocated array of completion handlers
  */
 struct iser_device {
 	struct ib_device             *ib_device;
@@ -330,8 +315,6 @@
 	struct ib_event_handler      event_handler;
 	struct list_head             ig_list;
 	int                          refcount;
-	int			     comps_used;
-	struct iser_comp	     *comps;
 };
 
 /**
@@ -353,6 +336,7 @@
  * @list:           entry in connection fastreg pool
  * @rsc:            data buffer registration resources
  * @sig_protected:  is region protected indicator
+ * @all_list:       first and last list members
  */
 struct iser_fr_desc {
 	struct list_head		  list;
@@ -367,6 +351,7 @@
  * @list:                list of fastreg descriptors
  * @lock:                protects fastreg pool
  * @size:                size of the pool
+ * @all_list:            first and last list members
  */
 struct iser_fr_pool {
 	struct list_head        list;
@@ -380,11 +365,12 @@
  *
  * @cma_id:              rdma_cm connection maneger handle
  * @qp:                  Connection Queue-pair
+ * @cq:                  Connection completion queue
+ * @cq_size:             Maximum number of outstanding completions
  * @post_recv_buf_count: post receive counter
  * @sig_count:           send work request signal count
  * @rx_wr:               receive work request for batch posts
  * @device:              reference to iser device
- * @comp:                iser completion context
  * @fr_pool:             connection fast registration poool
  * @pi_support:          Indicate device T10-PI support
  * @reg_cqe:             completion handler
@@ -392,11 +378,12 @@
 struct ib_conn {
 	struct rdma_cm_id           *cma_id;
 	struct ib_qp	            *qp;
+	struct ib_cq		    *cq;
+	u32			    cq_size;
 	int                          post_recv_buf_count;
 	u8                           sig_count;
 	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
 	struct iser_device          *device;
-	struct iser_comp	    *comp;
 	struct iser_fr_pool          fr_pool;
 	bool			     pi_support;
 	struct ib_cqe		     reg_cqe;
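struct ib_conn now owns its CQ directly, taken from the core shared-CQ pool rather than the removed per-device comps[] array. A minimal sketch of the expected pairing, assuming the ib_cq_pool_put() counterpart from the shared-CQ API (the teardown side is outside this excerpt):

	cq_size = max_send_wr + ISER_QP_MAX_RECV_DTOS;
	/* -1 lets the core pick (and share) a completion vector */
	ib_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_SOFTIRQ);
	if (IS_ERR(ib_conn->cq))
		return PTR_ERR(ib_conn->cq);
	ib_conn->cq_size = cq_size;

	/* ... and on connection teardown ... */
	ib_cq_pool_put(ib_conn->cq, ib_conn->cq_size);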
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index c1f44c4..699e075 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -68,59 +68,23 @@
 static int iser_create_device_ib_res(struct iser_device *device)
 {
 	struct ib_device *ib_dev = device->ib_device;
-	int i, max_cqe;
 
 	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
 		iser_err("IB device does not support memory registrations\n");
 		return -1;
 	}
 
-	device->comps_used = min_t(int, num_online_cpus(),
-				 ib_dev->num_comp_vectors);
-
-	device->comps = kcalloc(device->comps_used, sizeof(*device->comps),
-				GFP_KERNEL);
-	if (!device->comps)
-		goto comps_err;
-
-	max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe);
-
-	iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
-		  device->comps_used, dev_name(&ib_dev->dev),
-		  ib_dev->num_comp_vectors, max_cqe);
-
 	device->pd = ib_alloc_pd(ib_dev,
 		iser_always_reg ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
 	if (IS_ERR(device->pd))
 		goto pd_err;
 
-	for (i = 0; i < device->comps_used; i++) {
-		struct iser_comp *comp = &device->comps[i];
-
-		comp->cq = ib_alloc_cq(ib_dev, comp, max_cqe, i,
-				       IB_POLL_SOFTIRQ);
-		if (IS_ERR(comp->cq)) {
-			comp->cq = NULL;
-			goto cq_err;
-		}
-	}
-
 	INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev,
 			      iser_event_handler);
 	ib_register_event_handler(&device->event_handler);
 	return 0;
 
-cq_err:
-	for (i = 0; i < device->comps_used; i++) {
-		struct iser_comp *comp = &device->comps[i];
-
-		if (comp->cq)
-			ib_free_cq(comp->cq);
-	}
-	ib_dealloc_pd(device->pd);
 pd_err:
-	kfree(device->comps);
-comps_err:
 	iser_err("failed to allocate an IB resource\n");
 	return -1;
 }
@@ -131,20 +95,9 @@
  */
 static void iser_free_device_ib_res(struct iser_device *device)
 {
-	int i;
-
-	for (i = 0; i < device->comps_used; i++) {
-		struct iser_comp *comp = &device->comps[i];
-
-		ib_free_cq(comp->cq);
-		comp->cq = NULL;
-	}
-
 	ib_unregister_event_handler(&device->event_handler);
 	ib_dealloc_pd(device->pd);
 
-	kfree(device->comps);
-	device->comps = NULL;
 	device->pd = NULL;
 }
 
@@ -287,70 +240,57 @@
 	struct ib_device	*ib_dev;
 	struct ib_qp_init_attr	init_attr;
 	int			ret = -ENOMEM;
-	int index, min_index = 0;
+	unsigned int max_send_wr, cq_size;
 
 	BUG_ON(ib_conn->device == NULL);
 
 	device = ib_conn->device;
 	ib_dev = device->ib_device;
 
-	memset(&init_attr, 0, sizeof init_attr);
+	if (ib_conn->pi_support)
+		max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
+	else
+		max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
+	max_send_wr = min_t(unsigned int, max_send_wr,
+			    (unsigned int)ib_dev->attrs.max_qp_wr);
 
-	mutex_lock(&ig.connlist_mutex);
-	/* select the CQ with the minimal number of usages */
-	for (index = 0; index < device->comps_used; index++) {
-		if (device->comps[index].active_qps <
-		    device->comps[min_index].active_qps)
-			min_index = index;
+	cq_size = max_send_wr + ISER_QP_MAX_RECV_DTOS;
+	ib_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_SOFTIRQ);
+	if (IS_ERR(ib_conn->cq)) {
+		ret = PTR_ERR(ib_conn->cq);
+		goto cq_err;
 	}
-	ib_conn->comp = &device->comps[min_index];
-	ib_conn->comp->active_qps++;
-	mutex_unlock(&ig.connlist_mutex);
-	iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn);
+	ib_conn->cq_size = cq_size;
+
+	memset(&init_attr, 0, sizeof(init_attr));
 
 	init_attr.event_handler = iser_qp_event_callback;
 	init_attr.qp_context	= (void *)ib_conn;
-	init_attr.send_cq	= ib_conn->comp->cq;
-	init_attr.recv_cq	= ib_conn->comp->cq;
+	init_attr.send_cq	= ib_conn->cq;
+	init_attr.recv_cq	= ib_conn->cq;
 	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
 	init_attr.cap.max_send_sge = 2;
 	init_attr.cap.max_recv_sge = 1;
 	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;
 	init_attr.qp_type	= IB_QPT_RC;
-	if (ib_conn->pi_support) {
-		init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
+	init_attr.cap.max_send_wr = max_send_wr;
+	if (ib_conn->pi_support)
 		init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
-		iser_conn->max_cmds =
-			ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
-	} else {
-		if (ib_dev->attrs.max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
-			init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS + 1;
-			iser_conn->max_cmds =
-				ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
-		} else {
-			init_attr.cap.max_send_wr = ib_dev->attrs.max_qp_wr;
-			iser_conn->max_cmds =
-				ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr);
-			iser_dbg("device %s supports max_send_wr %d\n",
-				 dev_name(&device->ib_device->dev),
-				 ib_dev->attrs.max_qp_wr);
-		}
-	}
+	iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(max_send_wr - 1);
 
 	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
 	if (ret)
 		goto out_err;
 
 	ib_conn->qp = ib_conn->cma_id->qp;
-	iser_info("setting conn %p cma_id %p qp %p\n",
+	iser_info("setting conn %p cma_id %p qp %p max_send_wr %d\n",
 		  ib_conn, ib_conn->cma_id,
-		  ib_conn->cma_id->qp);
+		  ib_conn->cma_id->qp, max_send_wr);
 	return ret;
 
 out_err:
-	mutex_lock(&ig.connlist_mutex);
-	ib_conn->comp->active_qps--;
-	mutex_unlock(&ig.connlist_mutex);
+	ib_cq_pool_put(ib_conn->cq, ib_conn->cq_size);
+cq_err:
 	iser_err("unable to alloc mem or create resource, err %d\n", ret);
 
 	return ret;
@@ -462,10 +402,8 @@
 		  iser_conn, ib_conn->cma_id, ib_conn->qp);
 
 	if (ib_conn->qp != NULL) {
-		mutex_lock(&ig.connlist_mutex);
-		ib_conn->comp->active_qps--;
-		mutex_unlock(&ig.connlist_mutex);
 		rdma_destroy_qp(ib_conn->cma_id);
+		ib_cq_pool_put(ib_conn->cq, ib_conn->cq_size);
 		ib_conn->qp = NULL;
 	}
 
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index b7df38e..61e2f7f 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -24,13 +24,6 @@
 
 #include "ib_isert.h"
 
-#define	ISERT_MAX_CONN		8
-#define ISER_MAX_RX_CQ_LEN	(ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN)
-#define ISER_MAX_TX_CQ_LEN \
-	((ISERT_QP_MAX_REQ_DTOS + ISCSI_DEF_XMIT_CMDS_MAX) * ISERT_MAX_CONN)
-#define ISER_MAX_CQ_LEN		(ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \
-				 ISERT_MAX_CONN)
-
 static int isert_debug_level;
 module_param_named(debug_level, isert_debug_level, int, 0644);
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
@@ -82,53 +75,34 @@
 	}
 }
 
-static struct isert_comp *
-isert_comp_get(struct isert_conn *isert_conn)
-{
-	struct isert_device *device = isert_conn->device;
-	struct isert_comp *comp;
-	int i, min = 0;
-
-	mutex_lock(&device_list_mutex);
-	for (i = 0; i < device->comps_used; i++)
-		if (device->comps[i].active_qps <
-		    device->comps[min].active_qps)
-			min = i;
-	comp = &device->comps[min];
-	comp->active_qps++;
-	mutex_unlock(&device_list_mutex);
-
-	isert_info("conn %p, using comp %p min_index: %d\n",
-		   isert_conn, comp, min);
-
-	return comp;
-}
-
-static void
-isert_comp_put(struct isert_comp *comp)
-{
-	mutex_lock(&device_list_mutex);
-	comp->active_qps--;
-	mutex_unlock(&device_list_mutex);
-}
-
 static struct ib_qp *
 isert_create_qp(struct isert_conn *isert_conn,
-		struct isert_comp *comp,
 		struct rdma_cm_id *cma_id)
 {
+	u32 cq_size = ISERT_QP_MAX_REQ_DTOS + ISERT_QP_MAX_RECV_DTOS + 2;
 	struct isert_device *device = isert_conn->device;
+	struct ib_device *ib_dev = device->ib_device;
 	struct ib_qp_init_attr attr;
-	int ret;
+	int ret, factor;
+
+	isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_WORKQUEUE);
+	if (IS_ERR(isert_conn->cq)) {
+		isert_err("Unable to allocate cq\n");
+		ret = PTR_ERR(isert_conn->cq);
+		return ERR_PTR(ret);
+	}
+	isert_conn->cq_size = cq_size;
 
 	memset(&attr, 0, sizeof(struct ib_qp_init_attr));
 	attr.event_handler = isert_qp_event_callback;
 	attr.qp_context = isert_conn;
-	attr.send_cq = comp->cq;
-	attr.recv_cq = comp->cq;
+	attr.send_cq = isert_conn->cq;
+	attr.recv_cq = isert_conn->cq;
 	attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1;
 	attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
-	attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX;
+	factor = rdma_rw_mr_factor(device->ib_device, cma_id->port_num,
+				   ISCSI_ISER_MAX_SG_TABLESIZE);
+	attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX * factor;
 	attr.cap.max_send_sge = device->ib_device->attrs.max_send_sge;
 	attr.cap.max_recv_sge = 1;
 	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -139,6 +113,8 @@
 	ret = rdma_create_qp(cma_id, device->pd, &attr);
 	if (ret) {
 		isert_err("rdma_create_qp failed for cma_id %d\n", ret);
+		ib_cq_pool_put(isert_conn->cq, isert_conn->cq_size);
+
 		return ERR_PTR(ret);
 	}
 
@@ -146,25 +122,6 @@
 }
 
 static int
-isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id)
-{
-	struct isert_comp *comp;
-	int ret;
-
-	comp = isert_comp_get(isert_conn);
-	isert_conn->qp = isert_create_qp(isert_conn, comp, cma_id);
-	if (IS_ERR(isert_conn->qp)) {
-		ret = PTR_ERR(isert_conn->qp);
-		goto err;
-	}
-
-	return 0;
-err:
-	isert_comp_put(comp);
-	return ret;
-}
-
-static int
 isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
 {
 	struct isert_device *device = isert_conn->device;
@@ -231,61 +188,6 @@
 	isert_conn->rx_descs = NULL;
 }
 
-static void
-isert_free_comps(struct isert_device *device)
-{
-	int i;
-
-	for (i = 0; i < device->comps_used; i++) {
-		struct isert_comp *comp = &device->comps[i];
-
-		if (comp->cq)
-			ib_free_cq(comp->cq);
-	}
-	kfree(device->comps);
-}
-
-static int
-isert_alloc_comps(struct isert_device *device)
-{
-	int i, max_cqe, ret = 0;
-
-	device->comps_used = min(ISERT_MAX_CQ, min_t(int, num_online_cpus(),
-				 device->ib_device->num_comp_vectors));
-
-	isert_info("Using %d CQs, %s supports %d vectors support "
-		   "pi_capable %d\n",
-		   device->comps_used, dev_name(&device->ib_device->dev),
-		   device->ib_device->num_comp_vectors,
-		   device->pi_capable);
-
-	device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp),
-				GFP_KERNEL);
-	if (!device->comps)
-		return -ENOMEM;
-
-	max_cqe = min(ISER_MAX_CQ_LEN, device->ib_device->attrs.max_cqe);
-
-	for (i = 0; i < device->comps_used; i++) {
-		struct isert_comp *comp = &device->comps[i];
-
-		comp->device = device;
-		comp->cq = ib_alloc_cq(device->ib_device, comp, max_cqe, i,
-				IB_POLL_WORKQUEUE);
-		if (IS_ERR(comp->cq)) {
-			isert_err("Unable to allocate cq\n");
-			ret = PTR_ERR(comp->cq);
-			comp->cq = NULL;
-			goto out_cq;
-		}
-	}
-
-	return 0;
-out_cq:
-	isert_free_comps(device);
-	return ret;
-}
-
 static int
 isert_create_device_ib_res(struct isert_device *device)
 {
@@ -296,16 +198,12 @@
 		  ib_dev->attrs.max_send_sge, ib_dev->attrs.max_recv_sge);
 	isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd);
 
-	ret = isert_alloc_comps(device);
-	if (ret)
-		goto out;
-
 	device->pd = ib_alloc_pd(ib_dev, 0);
 	if (IS_ERR(device->pd)) {
 		ret = PTR_ERR(device->pd);
 		isert_err("failed to allocate pd, device %p, ret=%d\n",
 			  device, ret);
-		goto out_cq;
+		return ret;
 	}
 
 	/* Check signature cap */
@@ -313,13 +211,6 @@
 			     IB_DEVICE_INTEGRITY_HANDOVER ? true : false;
 
 	return 0;
-
-out_cq:
-	isert_free_comps(device);
-out:
-	if (ret > 0)
-		ret = -EINVAL;
-	return ret;
 }
 
 static void
@@ -328,7 +219,6 @@
 	isert_info("device %p\n", device);
 
 	ib_dealloc_pd(device->pd);
-	isert_free_comps(device);
 }
 
 static void
@@ -490,6 +380,13 @@
 	}
 }
 
+static void
+isert_destroy_qp(struct isert_conn *isert_conn)
+{
+	ib_destroy_qp(isert_conn->qp);
+	ib_cq_pool_put(isert_conn->cq, isert_conn->cq_size);
+}
+
 static int
 isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 {
@@ -530,17 +427,19 @@
 
 	isert_set_nego_params(isert_conn, &event->param.conn);
 
-	ret = isert_conn_setup_qp(isert_conn, cma_id);
-	if (ret)
+	isert_conn->qp = isert_create_qp(isert_conn, cma_id);
+	if (IS_ERR(isert_conn->qp)) {
+		ret = PTR_ERR(isert_conn->qp);
 		goto out_conn_dev;
+	}
 
 	ret = isert_login_post_recv(isert_conn);
 	if (ret)
-		goto out_conn_dev;
+		goto out_destroy_qp;
 
 	ret = isert_rdma_accept(isert_conn);
 	if (ret)
-		goto out_conn_dev;
+		goto out_destroy_qp;
 
 	mutex_lock(&isert_np->mutex);
 	list_add_tail(&isert_conn->node, &isert_np->accepted);
@@ -548,6 +447,8 @@
 
 	return 0;
 
+out_destroy_qp:
+	isert_destroy_qp(isert_conn);
 out_conn_dev:
 	isert_device_put(device);
 out_rsp_dma_map:
@@ -572,12 +473,8 @@
 	    !isert_conn->dev_removed)
 		rdma_destroy_id(isert_conn->cm_id);
 
-	if (isert_conn->qp) {
-		struct isert_comp *comp = isert_conn->qp->recv_cq->cq_context;
-
-		isert_comp_put(comp);
-		ib_destroy_qp(isert_conn->qp);
-	}
+	if (isert_conn->qp)
+		isert_destroy_qp(isert_conn);
 
 	if (isert_conn->login_req_buf)
 		isert_free_login_buf(isert_conn);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index 3b296ba..c55f7d9 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -63,7 +63,8 @@
 		(ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge) + \
 		 sizeof(struct ib_cqe) + sizeof(bool)))
 
-#define ISCSI_ISER_SG_TABLESIZE		256
+/* Maximum support is 16MB I/O size */
+#define ISCSI_ISER_MAX_SG_TABLESIZE	4096
 
 enum isert_desc_type {
 	ISCSI_TX_CONTROL,
@@ -155,6 +156,8 @@
 	struct iser_tx_desc	login_tx_desc;
 	struct rdma_cm_id	*cm_id;
 	struct ib_qp		*qp;
+	struct ib_cq		*cq;
+	u32			cq_size;
 	struct isert_device	*device;
 	struct mutex		mutex;
 	struct kref		kref;
@@ -165,22 +168,6 @@
 	bool			dev_removed;
 };
 
-#define ISERT_MAX_CQ 64
-
-/**
- * struct isert_comp - iSER completion context
- *
- * @device:     pointer to device handle
- * @cq:         completion queue
- * @active_qps: Number of active QPs attached
- *              to completion context
- */
-struct isert_comp {
-	struct isert_device     *device;
-	struct ib_cq		*cq;
-	int                      active_qps;
-};
-
 struct isert_device {
 	bool			pi_capable;
 	int			refcount;
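
The new ISCSI_ISER_MAX_SG_TABLESIZE of 4096 entries matches the 16MB comment
above under the usual 4KiB page size (4096 * 4KiB = 16MiB). In ib_isert.c the
RDMA R/W API is then asked how many MRs such a transfer needs per command, and
max_rdma_ctxs is scaled by that factor. A hedged sketch of the sizing (helper
name illustrative; rdma_rw_mr_factor() is the real API):

	/* Illustrative sizing helper, mirroring the isert_create_qp() change */
	static u32 example_max_rdma_ctxs(struct ib_device *dev, u8 port_num)
	{
		int factor = rdma_rw_mr_factor(dev, port_num,
					       ISCSI_ISER_MAX_SG_TABLESIZE);

		return ISCSI_DEF_XMIT_CMDS_MAX * factor;
	}
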
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
index d324312..f645198 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
@@ -118,12 +118,17 @@
  * struct opa_vesw_info - OPA vnic switch information
  * @fabric_id: 10-bit fabric id
  * @vesw_id: 12-bit virtual ethernet switch id
+ * @rsvd0: reserved bytes
  * @def_port_mask: bitmask of default ports
+ * @rsvd1: reserved bytes
  * @pkey: partition key
+ * @rsvd2: reserved bytes
  * @u_mcast_dlid: unknown multicast dlid
  * @u_ucast_dlid: array of unknown unicast dlids
+ * @rsvd3: reserved bytes
  * @rc: routing control
  * @eth_mtu: Ethernet MTU
+ * @rsvd4: reserved bytes
  */
 struct opa_vesw_info {
 	__be16  fabric_id;
@@ -150,12 +155,14 @@
  * struct opa_per_veswport_info - OPA vnic per port information
  * @port_num: port number
  * @eth_link_status: current ethernet link state
+ * @rsvd0: reserved bytes
  * @base_mac_addr: base mac address
  * @config_state: configured port state
  * @oper_state: operational port state
  * @max_mac_tbl_ent: max number of mac table entries
  * @max_smac_ent: max smac entries in mac table
  * @mac_tbl_digest: mac table digest
+ * @rsvd1: reserved bytes
  * @encap_slid: base slid for the port
  * @pcp_to_sc_uc: sc by pcp index for unicast ethernet packets
  * @pcp_to_vl_uc: vl by pcp index for unicast ethernet packets
@@ -165,8 +172,10 @@
  * @non_vlan_vl_uc: vl for non-vlan unicast ethernet packets
  * @non_vlan_sc_mc: sc for non-vlan multicast ethernet packets
  * @non_vlan_vl_mc: vl for non-vlan multicast ethernet packets
+ * @rsvd2: reserved bytes
  * @uc_macs_gen_count: generation count for unicast macs list
  * @mc_macs_gen_count: generation count for multicast macs list
+ * @rsvd3: reserved bytes
  */
 struct opa_per_veswport_info {
 	__be32  port_num;
@@ -294,6 +303,7 @@
  * @rx_512_1023: received packet length is >=512 and < 1023 bytes
  * @rx_1024_1518: received packet length is >=1024 and < 1518 bytes
  * @rx_1519_max: received packet length >= 1519 bytes
+ * @reserved: reserved bytes
  *
  * All the above are counters of corresponding conditions.
  */
@@ -347,16 +357,26 @@
  * @veswport_num: virtual ethernet switch port number
  * @tx_errors: transmit errors
  * @rx_errors: receive errors
+ * @rsvd0: reserved bytes
  * @tx_smac_filt: smac filter errors
+ * @rsvd1: reserved bytes
+ * @rsvd2: reserved bytes
+ * @rsvd3: reserved bytes
  * @tx_dlid_zero: transmit packets with invalid dlid
+ * @rsvd4: reserved bytes
  * @tx_logic: other transmit errors
+ * @rsvd5: reserved bytes
  * @tx_drop_state: packet transmission in non-forward port state
  * @rx_bad_veswid: received packet with invalid vesw id
+ * @rsvd6: reserved bytes
  * @rx_runt: received ethernet packet with length < 64 bytes
  * @rx_oversize: received ethernet packet with length > MTU size
+ * @rsvd7: reserved bytes
  * @rx_eth_down: received packets when interface is down
  * @rx_drop_state: received packets in non-forwarding port state
  * @rx_logic: other receive errors
+ * @rsvd8: reserved bytes
+ * @rsvd9: reserved bytes
  *
  * All the above are counters of corresponding error conditions.
  */
@@ -447,6 +467,7 @@
  * struct opa_vnic_vema_mad - Generic VEMA MAD
  * @mad_hdr: Generic MAD header
  * @rmpp_hdr: RMPP header for vendor specific MADs
+ * @reserved: reserved bytes
  * @oui: Unique org identifier
  * @data: MAD data
  */
@@ -467,6 +488,7 @@
  * @trap_num: Trap number
  * @toggle_count: Notice toggle bit and count value
  * @issuer_lid: Trap issuer's lid
+ * @reserved: reserved bytes
  * @issuer_gid: Issuer GID (only if Report method)
  * @raw_data: Trap message body
  */
@@ -487,6 +509,7 @@
  * struct opa_vnic_vema_mad_trap - Generic VEMA MAD Trap
  * @mad_hdr: Generic MAD header
  * @rmpp_hdr: RMPP header for vendor specific MADs
+ * @reserved: reserved bytes
  * @oui: Unique org identifier
  * @notice: Notice structure
  */
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
index 564388a..776e892 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
@@ -12,6 +12,7 @@
 
 #include <linux/module.h>
 #include <linux/rculist.h>
+#include <linux/random.h>
 
 #include "rtrs-clt.h"
 #include "rtrs-log.h"
@@ -23,6 +24,12 @@
  * leads to "false positives" failed reconnect attempts
  */
 #define RTRS_RECONNECT_BACKOFF 1000
+/*
+ * Wait for additional random time between 0 and 8 seconds
+ * before starting to reconnect to avoid clients reconnecting
+ * all at once in case of a major network outage
+ */
+#define RTRS_RECONNECT_SEED 8
 
 MODULE_DESCRIPTION("RDMA Transport Client");
 MODULE_LICENSE("GPL");
@@ -306,7 +313,8 @@
 		 */
 		delay_ms = clt->reconnect_delay_sec * 1000;
 		queue_delayed_work(rtrs_wq, &sess->reconnect_dwork,
-				   msecs_to_jiffies(delay_ms));
+				   msecs_to_jiffies(delay_ms +
+						    prandom_u32() % RTRS_RECONNECT_SEED));
 	} else {
 		/*
 		 * Error can happen just on establishing new connection,
@@ -2503,7 +2511,9 @@
 		sess->stats->reconnects.fail_cnt++;
 		delay_ms = clt->reconnect_delay_sec * 1000;
 		queue_delayed_work(rtrs_wq, &sess->reconnect_dwork,
-				   msecs_to_jiffies(delay_ms));
+				   msecs_to_jiffies(delay_ms +
+						    prandom_u32() %
+						    RTRS_RECONNECT_SEED));
 	}
 }
 
@@ -2972,7 +2982,7 @@
 		pr_err("Failed to create rtrs-client dev class\n");
 		return PTR_ERR(rtrs_clt_dev_class);
 	}
-	rtrs_wq = alloc_workqueue("rtrs_client_wq", WQ_MEM_RECLAIM, 0);
+	rtrs_wq = alloc_workqueue("rtrs_client_wq", 0, 0);
 	if (!rtrs_wq) {
 		class_destroy(rtrs_clt_dev_class);
 		return -ENOMEM;
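
The two rtrs-clt hunks above add a small random offset to the reconnect delay
so that clients do not all retry in lock-step after a fabric-wide outage. A
minimal sketch of the pattern (helper name hypothetical; prandom_u32(),
msecs_to_jiffies() and queue_delayed_work() are the real kernel APIs):

	/* Hypothetical helper mirroring the jittered reschedule above */
	static void example_queue_reconnect(struct workqueue_struct *wq,
					    struct delayed_work *dwork,
					    unsigned int delay_ms)
	{
		unsigned int jitter = prandom_u32() % RTRS_RECONNECT_SEED;

		queue_delayed_work(wq, dwork,
				   msecs_to_jiffies(delay_ms + jitter));
	}
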
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
index 0d9241f..a219bd1 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -2150,7 +2150,7 @@
 		err = PTR_ERR(rtrs_dev_class);
 		goto out_chunk_pool;
 	}
-	rtrs_wq = alloc_workqueue("rtrs_server_wq", WQ_MEM_RECLAIM, 0);
+	rtrs_wq = alloc_workqueue("rtrs_server_wq", 0, 0);
 	if (!rtrs_wq) {
 		err = -ENOMEM;
 		goto out_dev_class;
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index ef7fcd3..0065eb1 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -869,7 +869,7 @@
 
 static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct srpt_rdma_ch *ch = cq->cq_context;
+	struct srpt_rdma_ch *ch = wc->qp->qp_context;
 
 	pr_debug("%s-%d wc->status %d\n", ch->sess_name, ch->qp->qp_num,
 		 wc->status);
@@ -1322,7 +1322,7 @@
  */
 static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct srpt_rdma_ch *ch = cq->cq_context;
+	struct srpt_rdma_ch *ch = wc->qp->qp_context;
 	struct srpt_send_ioctx *ioctx =
 		container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
 
@@ -1683,7 +1683,7 @@
 
 static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct srpt_rdma_ch *ch = cq->cq_context;
+	struct srpt_rdma_ch *ch = wc->qp->qp_context;
 	struct srpt_recv_ioctx *ioctx =
 		container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe);
 
@@ -1744,7 +1744,7 @@
  */
 static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct srpt_rdma_ch *ch = cq->cq_context;
+	struct srpt_rdma_ch *ch = wc->qp->qp_context;
 	struct srpt_send_ioctx *ioctx =
 		container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);
 	enum srpt_command_state state;
@@ -1791,7 +1791,7 @@
 		goto out;
 
 retry:
-	ch->cq = ib_alloc_cq_any(sdev->device, ch, ch->rq_size + sq_size,
+	ch->cq = ib_cq_pool_get(sdev->device, ch->rq_size + sq_size, -1,
 				 IB_POLL_WORKQUEUE);
 	if (IS_ERR(ch->cq)) {
 		ret = PTR_ERR(ch->cq);
@@ -1799,6 +1799,7 @@
 		       ch->rq_size + sq_size, ret);
 		goto out;
 	}
+	ch->cq_size = ch->rq_size + sq_size;
 
 	qp_init->qp_context = (void *)ch;
 	qp_init->event_handler
@@ -1843,7 +1844,7 @@
 		if (retry) {
 			pr_debug("failed to create queue pair with sq_size = %d (%d) - retrying\n",
 				 sq_size, ret);
-			ib_free_cq(ch->cq);
+			ib_cq_pool_put(ch->cq, ch->cq_size);
 			sq_size = max(sq_size / 2, MIN_SRPT_SQ_SIZE);
 			goto retry;
 		} else {
@@ -1869,14 +1870,14 @@
 
 err_destroy_cq:
 	ch->qp = NULL;
-	ib_free_cq(ch->cq);
+	ib_cq_pool_put(ch->cq, ch->cq_size);
 	goto out;
 }
 
 static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch)
 {
 	ib_destroy_qp(ch->qp);
-	ib_free_cq(ch->cq);
+	ib_cq_pool_put(ch->cq, ch->cq_size);
 }
 
 /**
@@ -2156,9 +2157,6 @@
 
 	WARN_ON_ONCE(irqs_disabled());
 
-	if (WARN_ON(!sdev || !req))
-		return -EINVAL;
-
 	it_iu_len = be32_to_cpu(req->req_it_iu_len);
 
 	pr_info("Received SRP_LOGIN_REQ with i_port_id %pI6, t_port_id %pI6 and it_iu_len %d on port %d (guid=%pI6); pkey %#04x\n",
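
With the srpt CQ now coming from the shared pool, cq->cq_context belongs to the
pool rather than to a single channel, which is why every completion handler
above switches to the QP context of the completed work request. The pattern, as
a standalone sketch (handler name illustrative; ib_wc::qp and ib_qp::qp_context
are the real fields):

	/* Sketch of a pooled-CQ completion handler recovering its channel */
	static void example_send_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct srpt_rdma_ch *ch = wc->qp->qp_context;

		if (wc->status != IB_WC_SUCCESS)
			pr_debug("%s: wc status %d\n", ch->sess_name,
				 wc->status);
	}
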
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index f31c349..41435a6 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -300,6 +300,7 @@
 		} rdma_cm;
 	};
 	struct ib_cq		*cq;
+	u32			cq_size;
 	struct ib_cqe		zw_cqe;
 	struct rcu_head		rcu;
 	struct kref		kref;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index 42198e6..8db4b5f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -299,11 +299,18 @@
 }
 EXPORT_SYMBOL_GPL(mlx5_fill_page_array);
 
-void mlx5_fill_page_frag_array(struct mlx5_frag_buf *buf, __be64 *pas)
+void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm)
 {
 	int i;
 
+	WARN_ON(perm & 0xfc);
 	for (i = 0; i < buf->npages; i++)
-		pas[i] = cpu_to_be64(buf->frags[i].map);
+		pas[i] = cpu_to_be64(buf->frags[i].map | perm);
+}
+EXPORT_SYMBOL_GPL(mlx5_fill_page_frag_array_perm);
+
+void mlx5_fill_page_frag_array(struct mlx5_frag_buf *buf, __be64 *pas)
+{
+	mlx5_fill_page_frag_array_perm(buf, pas, 0);
 }
 EXPORT_SYMBOL_GPL(mlx5_fill_page_frag_array);
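
mlx5_fill_page_frag_array_perm() ORs the permission bits into the low bits of
every page address, and the WARN_ON rejects anything outside the two MTT
permission bits. A hedged caller sketch using the MLX5_MTT_PERM_* flags added
to mlx5_ifc.h later in this series (the wrapper name is illustrative):

	/* Illustrative caller: populate PAS entries with read/write MTT
	 * permission bits set; buf and pas are prepared by the caller. */
	static void example_fill_pas_rw(struct mlx5_frag_buf *buf, __be64 *pas)
	{
		mlx5_fill_page_frag_array_perm(buf, pas, MLX5_MTT_PERM_RW);
	}
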
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 7e70a817..9ccec5f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1577,6 +1577,7 @@
 static bool counter_is_valid(u32 action)
 {
 	return (action & (MLX5_FLOW_CONTEXT_ACTION_DROP |
+			  MLX5_FLOW_CONTEXT_ACTION_ALLOW |
 			  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST));
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 5763965..97b5fcb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -226,13 +226,20 @@
 
 int mlx5i_create_underlay_qp(struct mlx5e_priv *priv)
 {
+	unsigned char *dev_addr = priv->netdev->dev_addr;
 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(create_qp_in)] = {};
 	struct mlx5i_priv *ipriv = priv->ppriv;
 	void *addr_path;
+	int qpn = 0;
 	int ret = 0;
 	void *qpc;
 
+	if (MLX5_CAP_GEN(priv->mdev, mkey_by_name)) {
+		qpn = (dev_addr[1] << 16) + (dev_addr[2] << 8) + dev_addr[3];
+		MLX5_SET(create_qp_in, in, input_qpn, qpn);
+	}
+
 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD);
 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index e32d46c33..ce43e3f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -557,6 +557,9 @@
 	if (MLX5_CAP_GEN_MAX(dev, release_all_pages))
 		MLX5_SET(cmd_hca_cap, set_hca_cap, release_all_pages, 1);
 
+	if (MLX5_CAP_GEN_MAX(dev, mkey_by_name))
+		MLX5_SET(cmd_hca_cap, set_hca_cap, mkey_by_name, 1);
+
 	return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
 }
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 57db125..4d3376e 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -276,7 +276,9 @@
 	MLX5_MKEY_MASK_RW		= 1ull << 20,
 	MLX5_MKEY_MASK_A		= 1ull << 21,
 	MLX5_MKEY_MASK_SMALL_FENCE	= 1ull << 23,
-	MLX5_MKEY_MASK_FREE		= 1ull << 29,
+	MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE	= 1ull << 25,
+	MLX5_MKEY_MASK_FREE			= 1ull << 29,
+	MLX5_MKEY_MASK_RELAXED_ORDERING_READ	= 1ull << 47,
 };
 
 enum {
@@ -1007,7 +1009,6 @@
 	MLX5_MKEY_REMOTE_INVAL	= 1 << 24,
 	MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29,
 	MLX5_MKEY_BSF_EN	= 1 << 30,
-	MLX5_MKEY_LEN64		= 1 << 31,
 };
 
 struct mlx5_mkey_seg {
@@ -1361,11 +1362,11 @@
 	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca_cur[MLX5_CAP_DEV_EVENT], cap)
 
 #define MLX5_CAP_DEV_VDPA_EMULATION(mdev, cap)\
-	MLX5_GET(device_virtio_emulation_cap, \
+	MLX5_GET(virtio_emulation_cap, \
 		(mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap)
 
 #define MLX5_CAP64_DEV_VDPA_EMULATION(mdev, cap)\
-	MLX5_GET64(device_virtio_emulation_cap, \
+	MLX5_GET64(virtio_emulation_cap, \
 		(mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap)
 
 #define MLX5_CAP_IPSEC(mdev, cap)\
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a0fcc4d..c145de0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -975,6 +975,7 @@
 void mlx5_unregister_debugfs(void);
 
 void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas);
+void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm);
 void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
 int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
 		    unsigned int *irqn);
@@ -1057,6 +1058,7 @@
 enum {
 	MLX5_INTERFACE_PROTOCOL_IB  = 0,
 	MLX5_INTERFACE_PROTOCOL_ETH = 1,
+	MLX5_INTERFACE_PROTOCOL_VDPA = 2,
 };
 
 struct mlx5_interface {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 35e4dc6..de1ffb4 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -93,6 +93,7 @@
 
 enum {
 	MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b,
+	MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d,
 	MLX5_OBJ_TYPE_MKEY = 0xff01,
 	MLX5_OBJ_TYPE_QP = 0xff02,
 	MLX5_OBJ_TYPE_PSV = 0xff03,
@@ -985,17 +986,40 @@
 	u8         user_unaffiliated_events[4][0x40];
 };
 
-struct mlx5_ifc_device_virtio_emulation_cap_bits {
-	u8         reserved_at_0[0x20];
+struct mlx5_ifc_virtio_emulation_cap_bits {
+	u8         desc_tunnel_offload_type[0x1];
+	u8         eth_frame_offload_type[0x1];
+	u8         virtio_version_1_0[0x1];
+	u8         device_features_bits_mask[0xd];
+	u8         event_mode[0x8];
+	u8         virtio_queue_type[0x8];
 
-	u8         reserved_at_20[0x13];
+	u8         max_tunnel_desc[0x10];
+	u8         reserved_at_30[0x3];
 	u8         log_doorbell_stride[0x5];
 	u8         reserved_at_38[0x3];
 	u8         log_doorbell_bar_size[0x5];
 
 	u8         doorbell_bar_offset[0x40];
 
-	u8         reserved_at_80[0x780];
+	u8         max_emulated_devices[0x8];
+	u8         max_num_virtio_queues[0x18];
+
+	u8         reserved_at_a0[0x60];
+
+	u8         umem_1_buffer_param_a[0x20];
+
+	u8         umem_1_buffer_param_b[0x20];
+
+	u8         umem_2_buffer_param_a[0x20];
+
+	u8         umem_2_buffer_param_b[0x20];
+
+	u8         umem_3_buffer_param_a[0x20];
+
+	u8         umem_3_buffer_param_b[0x20];
+
+	u8         reserved_at_1c0[0x640];
 };
 
 enum {
@@ -1220,7 +1244,11 @@
 
 	u8         max_sgl_for_optimized_performance[0x8];
 	u8         log_max_cq_sz[0x8];
-	u8         reserved_at_d0[0xb];
+	u8         relaxed_ordering_write_umr[0x1];
+	u8         relaxed_ordering_read_umr[0x1];
+	u8         reserved_at_d2[0x7];
+	u8         virtio_net_device_emualtion_manager[0x1];
+	u8         virtio_blk_device_emualtion_manager[0x1];
 	u8         log_max_cq[0x5];
 
 	u8         log_max_eq_sz[0x8];
@@ -1396,7 +1424,10 @@
 	u8         bf[0x1];
 	u8         driver_version[0x1];
 	u8         pad_tx_eth_packet[0x1];
-	u8         reserved_at_263[0x8];
+	u8         reserved_at_263[0x3];
+	u8         mkey_by_name[0x1];
+	u8         reserved_at_267[0x4];
+
 	u8         log_bf_reg_size[0x5];
 
 	u8         reserved_at_270[0x8];
@@ -2953,7 +2984,7 @@
 	struct mlx5_ifc_fpga_cap_bits fpga_cap;
 	struct mlx5_ifc_tls_cap_bits tls_cap;
 	struct mlx5_ifc_device_mem_cap_bits device_mem_cap;
-	struct mlx5_ifc_device_virtio_emulation_cap_bits virtio_emulation_cap;
+	struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap;
 	u8         reserved_at_0[0x8000];
 };
 
@@ -3302,15 +3333,18 @@
 };
 
 struct mlx5_ifc_rqtc_bits {
-	u8         reserved_at_0[0xa0];
+	u8    reserved_at_0[0xa0];
 
-	u8         reserved_at_a0[0x10];
-	u8         rqt_max_size[0x10];
+	u8    reserved_at_a0[0x5];
+	u8    list_q_type[0x3];
+	u8    reserved_at_a8[0x8];
+	u8    rqt_max_size[0x10];
 
-	u8         reserved_at_c0[0x10];
-	u8         rqt_actual_size[0x10];
+	u8    rq_vhca_id_format[0x1];
+	u8    reserved_at_c1[0xf];
+	u8    rqt_actual_size[0x10];
 
-	u8         reserved_at_e0[0x6a0];
+	u8    reserved_at_e0[0x6a0];
 
 	struct mlx5_ifc_rq_num_bits rq_num[];
 };
@@ -7092,7 +7126,7 @@
 
 struct mlx5_ifc_destroy_mkey_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -7723,8 +7757,10 @@
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         reserved_at_40[0x8];
+	u8         input_qpn[0x18];
 
+	u8         reserved_at_60[0x20];
 	u8         opt_param_mask[0x20];
 
 	u8         ece[0x20];
@@ -7788,7 +7824,7 @@
 
 struct mlx5_ifc_create_mkey_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -10346,6 +10382,40 @@
 	struct mlx5_ifc_umem_bits  umem;
 };
 
+struct mlx5_ifc_create_umem_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x8];
+	u8         umem_id[0x18];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_umem_in_bits {
+	u8        opcode[0x10];
+	u8        uid[0x10];
+
+	u8        reserved_at_20[0x10];
+	u8        op_mod[0x10];
+
+	u8        reserved_at_40[0x8];
+	u8        umem_id[0x18];
+
+	u8        reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_umem_out_bits {
+	u8        status[0x8];
+	u8        reserved_at_8[0x18];
+
+	u8        syndrome[0x20];
+
+	u8        reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_create_uctx_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
@@ -10358,6 +10428,18 @@
 	struct mlx5_ifc_uctx_bits  uctx;
 };
 
+struct mlx5_ifc_create_uctx_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_60[0x20];
+};
+
 struct mlx5_ifc_destroy_uctx_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
@@ -10371,6 +10453,15 @@
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_destroy_uctx_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8          reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_create_sw_icm_in_bits {
 	struct mlx5_ifc_general_obj_in_cmd_hdr_bits   hdr;
 	struct mlx5_ifc_sw_icm_bits		      sw_icm;
@@ -10687,4 +10778,10 @@
 	u8         hw_offset_record_number[0x18];
 };
 
+enum {
+	MLX5_MTT_PERM_READ	= 1 << 0,
+	MLX5_MTT_PERM_WRITE	= 1 << 1,
+	MLX5_MTT_PERM_RW	= MLX5_MTT_PERM_READ | MLX5_MTT_PERM_WRITE,
+};
+
 #endif /* MLX5_IFC_H */
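
The new cmd_hca_cap bits above are read through the existing MLX5_CAP_GEN()
accessor; for example, a driver that wants to update relaxed ordering on an
MKey via UMR would be expected to gate that on both new capability bits (sketch
only, helper name hypothetical):

	static bool example_umr_can_modify_relaxed_ordering(struct mlx5_core_dev *mdev)
	{
		return MLX5_CAP_GEN(mdev, relaxed_ordering_write_umr) &&
		       MLX5_CAP_GEN(mdev, relaxed_ordering_read_umr);
	}
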
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
index fe2fc9e..83139b9 100644
--- a/include/rdma/ib.h
+++ b/include/rdma/ib.h
@@ -1,36 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2010 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(_RDMA_IB_H)
+#ifndef _RDMA_IB_H
 #define _RDMA_IB_H
 
 #include <linux/types.h>
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index 2734c89..b0e636a 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -1,37 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(IB_ADDR_H)
+#ifndef IB_ADDR_H
 #define IB_ADDR_H
 
 #include <linux/in.h>
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index e06d133..66a8f36 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -1,35 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Intel Corporation. All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef _IB_CACHE_H
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 0f1ea5f..382427a 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -6,6 +6,7 @@
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
  */
+
 #ifndef IB_CM_H
 #define IB_CM_H
 
diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h
index 9a90bd0..57c1ac8 100644
--- a/include/rdma/ib_hdrs.h
+++ b/include/rdma/ib_hdrs.h
@@ -1,48 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /*
  * Copyright(c) 2016 - 2018 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
  */
 
 #ifndef IB_HDRS_H
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 8c093fc..8dfb1dd 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -1,40 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
  * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
  * Copyright (c) 2004 Intel Corporation.  All rights reserved.
  * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
  * Copyright (c) 2004-2006 Voltaire Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(IB_MAD_H)
+#ifndef IB_MAD_H
 #define IB_MAD_H
 
 #include <linux/list.h>
diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h
index 8ebf84a..1838869 100644
--- a/include/rdma/ib_marshall.h
+++ b/include/rdma/ib_marshall.h
@@ -1,36 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(IB_USER_MARSHALL_H)
+#ifndef IB_USER_MARSHALL_H
 #define IB_USER_MARSHALL_H
 
 #include <rdma/ib_verbs.h>
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index 7ea1382..a9162f25 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef IB_PACK_H
diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h
index 2f8a65c..44c6182 100644
--- a/include/rdma/ib_pma.h
+++ b/include/rdma/ib_pma.h
@@ -1,38 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
  * All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(IB_PMA_H)
+#ifndef IB_PMA_H
 #define IB_PMA_H
 
 #include <rdma/ib_mad.h>
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 1952097..693285e 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -1,35 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
  * Copyright (c) 2006 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef IB_SA_H
diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h
index 7be0028..fdb8633 100644
--- a/include/rdma/ib_smi.h
+++ b/include/rdma/ib_smi.h
@@ -1,40 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
  * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
  * Copyright (c) 2004 Intel Corporation.  All rights reserved.
  * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
  * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(IB_SMI_H)
+#ifndef IB_SMI_H
 #define IB_SMI_H
 
 #include <rdma/ib_mad.h>
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index e3518fd..71f573a 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2007 Cisco Systems.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef IB_UMEM_H
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 64314ff..d16d2c1 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef IB_UMEM_ODP_H
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index ef2f398..c0b2fa7 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
  * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
@@ -6,37 +7,9 @@
  * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(IB_VERBS_H)
+#ifndef IB_VERBS_H
 #define IB_VERBS_H
 
 #include <linux/types.h>
@@ -75,6 +48,7 @@
 struct ib_uqp_object;
 struct ib_usrq_object;
 struct ib_uwq_object;
+struct rdma_cm_id;
 
 extern struct workqueue_struct *ib_wq;
 extern struct workqueue_struct *ib_comp_wq;
@@ -1566,9 +1540,8 @@
 	struct ib_device       *device;
 	atomic_t		usecnt; /* count all exposed resources */
 	struct inode	       *inode;
-
-	struct mutex		tgt_qp_mutex;
-	struct list_head	tgt_qp_list;
+	struct rw_semaphore	tgt_qps_rwsem;
+	struct xarray		tgt_qps;
 };
 
 struct ib_ah {
@@ -2270,6 +2243,7 @@
 struct ib_odp_counters {
 	atomic64_t faults;
 	atomic64_t invalidations;
+	atomic64_t prefetch;
 };
 
 struct ib_counters {
@@ -2476,7 +2450,7 @@
 			     struct ib_pd *pd, struct ib_udata *udata);
 	int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata);
 	struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type,
-				  u32 max_num_sg, struct ib_udata *udata);
+				  u32 max_num_sg);
 	struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd,
 					    u32 max_num_data_sg,
 					    u32 max_num_meta_sg);
@@ -2493,9 +2467,8 @@
 	int (*dealloc_mw)(struct ib_mw *mw);
 	int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
 	int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
-	struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,
-				      struct ib_udata *udata);
-	int (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata);
+	int (*alloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata);
+	void (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata);
 	struct ib_flow *(*create_flow)(struct ib_qp *qp,
 				       struct ib_flow_attr *flow_attr,
 				       int domain, struct ib_udata *udata);
@@ -2539,9 +2512,9 @@
 	struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm,
 				   struct ib_dm_mr_attr *attr,
 				   struct uverbs_attr_bundle *attrs);
-	struct ib_counters *(*create_counters)(
-		struct ib_device *device, struct uverbs_attr_bundle *attrs);
-	int (*destroy_counters)(struct ib_counters *counters);
+	int (*create_counters)(struct ib_counters *counters,
+			       struct uverbs_attr_bundle *attrs);
+	void (*destroy_counters)(struct ib_counters *counters);
 	int (*read_counters)(struct ib_counters *counters,
 			     struct ib_counters_read_attr *counters_read_attr,
 			     struct uverbs_attr_bundle *attrs);
@@ -2581,8 +2554,13 @@
 	/**
 	 * Allows rdma drivers to add their own restrack attributes.
 	 */
-	int (*fill_res_entry)(struct sk_buff *msg,
-			      struct rdma_restrack_entry *entry);
+	int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr);
+	int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr);
+	int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq);
+	int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq);
+	int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp);
+	int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp);
+	int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id);
 
 	/* Device lifecycle callbacks */
 	/*
@@ -2637,14 +2615,19 @@
 	 * Allows rdma drivers to add their own restrack attributes
 	 * dumped via 'rdma stat' iproute2 command.
 	 */
-	int (*fill_stat_entry)(struct sk_buff *msg,
-			       struct rdma_restrack_entry *entry);
+	int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr);
+
+	/* query driver for its ucontext properties */
+	int (*query_ucontext)(struct ib_ucontext *context,
+			      struct uverbs_attr_bundle *attrs);
 
 	DECLARE_RDMA_OBJ_SIZE(ib_ah);
+	DECLARE_RDMA_OBJ_SIZE(ib_counters);
 	DECLARE_RDMA_OBJ_SIZE(ib_cq);
 	DECLARE_RDMA_OBJ_SIZE(ib_pd);
 	DECLARE_RDMA_OBJ_SIZE(ib_srq);
 	DECLARE_RDMA_OBJ_SIZE(ib_ucontext);
+	DECLARE_RDMA_OBJ_SIZE(ib_xrcd);
 };
 
 struct ib_core_device {
@@ -4263,14 +4246,8 @@
 	return ib_dereg_mr_user(mr, NULL);
 }
 
-struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata);
-
-static inline struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
-					enum ib_mr_type mr_type, u32 max_num_sg)
-{
-	return ib_alloc_mr_user(pd, mr_type, max_num_sg, NULL);
-}
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			  u32 max_num_sg);
 
 struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
 				    u32 max_num_data_sg,
@@ -4321,21 +4298,9 @@
  */
 int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
 
-/**
- * ib_alloc_xrcd - Allocates an XRC domain.
- * @device: The device on which to allocate the XRC domain.
- * @caller: Module name for kernel consumers
- */
-struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller);
-#define ib_alloc_xrcd(device) \
-	__ib_alloc_xrcd((device), KBUILD_MODNAME)
-
-/**
- * ib_dealloc_xrcd - Deallocates an XRC domain.
- * @xrcd: The XRC domain to deallocate.
- * @udata: Valid user data or NULL for kernel object
- */
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
+struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device,
+				   struct inode *inode, struct ib_udata *udata);
+int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
 
 static inline int ib_check_mr_access(int flags)
 {
@@ -4417,9 +4382,6 @@
 int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr,
 		 u32 wq_attr_mask);
-struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
-						 struct ib_rwq_ind_table_init_attr*
-						 wq_ind_table_init_attr);
 int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
 
 int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
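
The hunks above split the single fill_res_entry/fill_stat_entry restrack callbacks into per-object hooks in ib_device_ops. A minimal driver-side sketch, assuming a hypothetical driver (the mydrv_* names are invented for illustration; rdma_nl_put_driver_u32() is the existing restrack netlink helper), might look like this; the new *_raw variants instead dump an opaque device blob carried by the RDMA_NLDEV_ATTR_RES_RAW attribute added further below:

#include <rdma/ib_verbs.h>
#include <rdma/restrack.h>

/* Emit one driver-specific MR attribute; 0 on success, -EMSGSIZE on overflow. */
static int mydrv_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
{
	return rdma_nl_put_driver_u32(msg, "lkey", ibmr->lkey);
}

static const struct ib_device_ops mydrv_dev_ops = {
	.fill_res_mr_entry = mydrv_fill_res_mr_entry,
};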
diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h
index 5aa8a9c..9197540 100644
--- a/include/rdma/iw_cm.h
+++ b/include/rdma/iw_cm.h
@@ -1,35 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
  * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
+
 #ifndef IW_CM_H
 #define IW_CM_H
 
diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h
index c895350..6dbc1a2 100644
--- a/include/rdma/iw_portmap.h
+++ b/include/rdma/iw_portmap.h
@@ -1,35 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2014 Intel Corporation. All rights reserved.
  * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *	  copyright notice, this list of conditions and the following
- *	  disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *	  copyright notice, this list of conditions and the following
- *	  disclaimer in the documentation and/or other materials
- *	  provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
+
 #ifndef _IW_PORTMAP_H
 #define _IW_PORTMAP_H
 
diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h
index 66d4393..c6bfa26 100644
--- a/include/rdma/opa_addr.h
+++ b/include/rdma/opa_addr.h
@@ -1,48 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /*
  * Copyright(c) 2017 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
  */
 
 #ifndef OPA_ADDR_H
diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h
index 0d9e6d7..73bcac9 100644
--- a/include/rdma/opa_port_info.h
+++ b/include/rdma/opa_port_info.h
@@ -1,36 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2014-2020 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(OPA_PORT_INFO_H)
+#ifndef OPA_PORT_INFO_H
 #define OPA_PORT_INFO_H
 
 #include <rdma/opa_smi.h>
diff --git a/include/rdma/opa_smi.h b/include/rdma/opa_smi.h
index c7b2ef1..a9f1b57 100644
--- a/include/rdma/opa_smi.h
+++ b/include/rdma/opa_smi.h
@@ -1,36 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2014 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(OPA_SMI_H)
+#ifndef OPA_SMI_H
 #define OPA_SMI_H
 
 #include <rdma/ib_mad.h>
diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h
index 6f244e7..cbe3c28 100644
--- a/include/rdma/opa_vnic.h
+++ b/include/rdma/opa_vnic.h
@@ -1,52 +1,11 @@
-#ifndef _OPA_VNIC_H
-#define _OPA_VNIC_H
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /*
  * Copyright(c) 2017 - 2020 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
  */
 
+#ifndef _OPA_VNIC_H
+#define _OPA_VNIC_H
+
 /*
  * This file contains Intel Omni-Path (OPA) Virtual Network Interface
  * Controller (VNIC) specific declarations.
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 939d7ab..cf5da2a 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -1,37 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(RDMA_CM_H)
+#ifndef RDMA_CM_H
 #define RDMA_CM_H
 
 #include <linux/socket.h>
diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h
index 6a69d71..8354e7d 100644
--- a/include/rdma/rdma_cm_ib.h
+++ b/include/rdma/rdma_cm_ib.h
@@ -1,36 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2006 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
-#if !defined(RDMA_CM_IB_H)
+#ifndef RDMA_CM_IB_H
 #define RDMA_CM_IB_H
 
 #include <rdma/rdma_cm.h>
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index ab22759..2758d9d 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -1,8 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+
 #ifndef _RDMA_NETLINK_H
 #define _RDMA_NETLINK_H
 
-
 #include <linux/netlink.h>
 #include <uapi/rdma/rdma_netlink.h>
 
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index ac5a943..9fd217b 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -1,53 +1,11 @@
-#ifndef DEF_RDMA_VT_H
-#define DEF_RDMA_VT_H
-
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /*
  * Copyright(c) 2016 - 2019 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
  */
 
+#ifndef DEF_RDMA_VT_H
+#define DEF_RDMA_VT_H
+
 /*
  * Structure that low level drivers will populate in order to register with the
  * rdmavt layer.
diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h
index 574eb72..1fe2bb5 100644
--- a/include/rdma/rdmavt_cq.h
+++ b/include/rdma/rdmavt_cq.h
@@ -1,56 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright(c) 2016 - 2018 Intel Corporation.
+ */
+
 #ifndef DEF_RDMAVT_INCCQ_H
 #define DEF_RDMAVT_INCCQ_H
 
-/*
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 - 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
 #include <linux/kthread.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_verbs.h>
diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h
index ce6c888..c3367e9 100644
--- a/include/rdma/rdmavt_mr.h
+++ b/include/rdma/rdmavt_mr.h
@@ -1,53 +1,11 @@
-#ifndef DEF_RDMAVT_INCMR_H
-#define DEF_RDMAVT_INCMR_H
-
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /*
  * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
  */
 
+#ifndef DEF_RDMAVT_INCMR_H
+#define DEF_RDMAVT_INCMR_H
+
 /*
  * For Memory Regions. This stuff should probably be moved into rdmavt/mr.h once
  * drivers no longer need access to the MR directly.
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 2f1fc23..8275954 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -1,53 +1,11 @@
-#ifndef DEF_RDMAVT_INCQP_H
-#define DEF_RDMAVT_INCQP_H
-
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /*
  * Copyright(c) 2016 - 2020 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
  */
 
+#ifndef DEF_RDMAVT_INCQP_H
+#define DEF_RDMAVT_INCQP_H
+
 #include <rdma/rdma_vt.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_verbs.h>
diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
index 86de10e..b00270c 100644
--- a/include/rdma/uverbs_ioctl.h
+++ b/include/rdma/uverbs_ioctl.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2017, Mellanox Technologies inc.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef _UVERBS_IOCTL_
@@ -652,6 +625,7 @@
 	struct ib_udata ucore;
 	struct ib_uverbs_file *ufile;
 	struct ib_ucontext *context;
+	struct ib_uobject *uobject;
 	DECLARE_BITMAP(attr_present, UVERBS_API_ATTR_BKEY_LEN);
 	struct uverbs_attr attrs[];
 };
diff --git a/include/rdma/uverbs_named_ioctl.h b/include/rdma/uverbs_named_ioctl.h
index 6ae6cf8..f04f512 100644
--- a/include/rdma/uverbs_named_ioctl.h
+++ b/include/rdma/uverbs_named_ioctl.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef _UVERBS_NAMED_IOCTL_
diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h
index bf0392a..fe05121 100644
--- a/include/rdma/uverbs_std_types.h
+++ b/include/rdma/uverbs_std_types.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2017, Mellanox Technologies inc.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef _UVERBS_STD_TYPES__
@@ -110,6 +83,20 @@
 	rdma_alloc_abort_uobject(uobj, attrs, false);
 }
 
+static inline void uobj_finalize_uobj_create(struct ib_uobject *uobj,
+					     struct uverbs_attr_bundle *attrs)
+{
+	/*
+	 * Tell the core code that the write() handler has completed
+	 * initializing the object and that the core should commit or
+	 * abort this object based upon the return code from the write()
+	 * method. Similar to what uverbs_finalize_uobj_create() does for
+	 * ioctl()
+	 */
+	WARN_ON(attrs->uobject);
+	attrs->uobject = uobj;
+}
+
 static inline struct ib_uobject *
 __uobj_alloc(const struct uverbs_api_object *obj,
 	     struct uverbs_attr_bundle *attrs, struct ib_device **ib_dev)
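
A minimal sketch of the write()-handler pattern the new uobj_finalize_uobj_create() helper is meant for (the handler name and the UVERBS_OBJECT_PD choice are illustrative only, not a specific in-tree handler): the core commits the uobject when the handler returns 0 and aborts it otherwise.

static int mydrv_alloc_thing(struct uverbs_attr_bundle *attrs)
{
	struct ib_device *ib_dev;
	struct ib_uobject *uobj;

	uobj = uobj_alloc(UVERBS_OBJECT_PD, attrs, &ib_dev);
	if (IS_ERR(uobj))
		return PTR_ERR(uobj);

	/* ... object-specific initialisation using ib_dev ... */

	/* Hand the uobject to the core: commit on 0, abort on error. */
	uobj_finalize_uobj_create(uobj, attrs);
	return 0;
}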
diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
index c15b298..06db27e 100644
--- a/include/rdma/uverbs_types.h
+++ b/include/rdma/uverbs_types.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2017, Mellanox Technologies inc.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
  */
 
 #ifndef _UVERBS_TYPES_
diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
index 53b6e20..507a286 100644
--- a/include/uapi/rdma/efa-abi.h
+++ b/include/uapi/rdma/efa-abi.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef EFA_ABI_USER_H
@@ -20,6 +20,16 @@
  * hex bit offset of the field.
  */
 
+enum {
+	EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH  = 1 << 0,
+	EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1,
+};
+
+struct efa_ibv_alloc_ucontext_cmd {
+	__u32 comp_mask;
+	__u8 reserved_20[4];
+};
+
 enum efa_ibv_user_cmds_supp_udata {
 	EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0,
 	EFA_USER_CMDS_SUPP_UDATA_CREATE_AH    = 1 << 1,
@@ -31,6 +41,9 @@
 	__u16 sub_cqs_per_cq;
 	__u16 inline_buf_size;
 	__u32 max_llq_size; /* bytes */
+	__u16 max_tx_batch; /* units of 64 bytes */
+	__u16 min_sq_wr;
+	__u8 reserved_a0[4];
 };
 
 struct efa_ibv_alloc_pd_resp {
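
Userspace-side sketch of the new alloc_ucontext handshake, using only names defined in this ABI (how the command reaches the kernel through the provider library is omitted): the consumer sets comp_mask bits for the response fields it is able to parse, and the kernel fills the corresponding max_tx_batch/min_sq_wr fields.

struct efa_ibv_alloc_ucontext_cmd cmd = {
	/* Advertise which of the new response fields this consumer understands. */
	.comp_mask = EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH |
		     EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR,
};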
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index 4961d5e..99dcabf 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -69,6 +69,7 @@
 	UVERBS_METHOD_INFO_HANDLES,
 	UVERBS_METHOD_QUERY_PORT,
 	UVERBS_METHOD_GET_CONTEXT,
+	UVERBS_METHOD_QUERY_CONTEXT,
 };
 
 enum uverbs_attrs_invoke_write_cmd_attr_ids {
@@ -87,6 +88,11 @@
 	UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT,
 };
 
+enum uverbs_attrs_query_context_attr_ids {
+	UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS,
+	UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT,
+};
+
 enum uverbs_attrs_create_cq_cmd_attr_ids {
 	UVERBS_ATTR_CREATE_CQ_HANDLE,
 	UVERBS_ATTR_CREATE_CQ_CQE,
@@ -242,6 +248,7 @@
 	UVERBS_METHOD_DM_MR_REG,
 	UVERBS_METHOD_MR_DESTROY,
 	UVERBS_METHOD_ADVISE_MR,
+	UVERBS_METHOD_QUERY_MR,
 };
 
 enum uverbs_attrs_mr_destroy_ids {
@@ -255,6 +262,14 @@
 	UVERBS_ATTR_ADVISE_MR_SGE_LIST,
 };
 
+enum uverbs_attrs_query_mr_cmd_attr_ids {
+	UVERBS_ATTR_QUERY_MR_HANDLE,
+	UVERBS_ATTR_QUERY_MR_RESP_LKEY,
+	UVERBS_ATTR_QUERY_MR_RESP_RKEY,
+	UVERBS_ATTR_QUERY_MR_RESP_LENGTH,
+	UVERBS_ATTR_QUERY_MR_RESP_IOVA,
+};
+
 enum uverbs_attrs_create_counters_cmd_attr_ids {
 	UVERBS_ATTR_CREATE_COUNTERS_HANDLE,
 };
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index 8e316ef..e24d66d 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -228,6 +228,10 @@
 	MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
 };
 
+enum mlx5_ib_device_query_context_attrs {
+	MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT),
+};
+
 #define MLX5_IB_DW_MATCH_PARAM 0x80
 
 struct mlx5_ib_match_params {
@@ -259,7 +263,7 @@
 	MLX5_IB_ATTR_CREATE_FLOW_FLAGS,
 };
 
-enum mlx5_ib_destoy_flow_attrs {
+enum mlx5_ib_destroy_flow_attrs {
 	MLX5_IB_ATTR_DESTROY_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
 };
 
@@ -286,4 +290,14 @@
 	MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF,
 };
 
+enum mlx5_ib_query_pd_attrs {
+	MLX5_IB_ATTR_QUERY_PD_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_QUERY_PD_RESP_PDN,
+};
+
+enum mlx5_ib_pd_methods {
+	MLX5_IB_METHOD_PD_QUERY = (1U << UVERBS_ID_NS_SHIFT),
+
+};
+
 #endif
diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h
index a0b83c9..bf7333b 100644
--- a/include/uapi/rdma/qedr-abi.h
+++ b/include/uapi/rdma/qedr-abi.h
@@ -39,8 +39,9 @@
 
 /* user kernel communication data structures. */
 enum qedr_alloc_ucontext_flags {
-	QEDR_ALLOC_UCTX_RESERVED	= 1 << 0,
-	QEDR_ALLOC_UCTX_DB_REC		= 1 << 1
+	QEDR_ALLOC_UCTX_EDPM_MODE	= 1 << 0,
+	QEDR_ALLOC_UCTX_DB_REC		= 1 << 1,
+	QEDR_SUPPORT_DPM_SIZES		= 1 << 2,
 };
 
 struct qedr_alloc_ucontext_req {
@@ -50,13 +51,14 @@
 
 #define QEDR_LDPM_MAX_SIZE	(8192)
 #define QEDR_EDPM_TRANS_SIZE	(64)
+#define QEDR_EDPM_MAX_SIZE	(ROCE_REQ_MAX_INLINE_DATA_SIZE)
 
 enum qedr_rdma_dpm_type {
 	QEDR_DPM_TYPE_NONE		= 0,
 	QEDR_DPM_TYPE_ROCE_ENHANCED	= 1 << 0,
 	QEDR_DPM_TYPE_ROCE_LEGACY	= 1 << 1,
 	QEDR_DPM_TYPE_IWARP_LEGACY	= 1 << 2,
-	QEDR_DPM_TYPE_RESERVED		= 1 << 3,
+	QEDR_DPM_TYPE_ROCE_EDPM_MODE	= 1 << 3,
 	QEDR_DPM_SIZES_SET		= 1 << 4,
 };
 
@@ -77,6 +79,8 @@
 	__u16 ldpm_limit_size;
 	__u8 edpm_trans_size;
 	__u8 reserved;
+	__u16 edpm_limit_size;
+	__u8 padding[6];
 };
 
 struct qedr_alloc_pd_ureq {
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 8e27778..d2f5b83 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -287,6 +287,12 @@
 
 	RDMA_NLDEV_CMD_STAT_DEL,
 
+	RDMA_NLDEV_CMD_RES_QP_GET_RAW,
+
+	RDMA_NLDEV_CMD_RES_CQ_GET_RAW,
+
+	RDMA_NLDEV_CMD_RES_MR_GET_RAW,
+
 	RDMA_NLDEV_NUM_OPS
 };
 
@@ -525,6 +531,8 @@
 	 */
 	RDMA_NLDEV_ATTR_DEV_DIM,                /* u8 */
 
+	RDMA_NLDEV_ATTR_RES_RAW,	/* binary */
+
 	/*
 	 * Always the end
 	 */
@@ -561,5 +569,6 @@
  */
 enum rdma_nl_counter_mask {
 	RDMA_COUNTER_MASK_QP_TYPE = 1,
+	RDMA_COUNTER_MASK_PID = 1 << 1,
 };
 #endif /* _UAPI_RDMA_NETLINK_H */
diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h
index d92d272..53c5518 100644
--- a/include/uapi/rdma/rdma_user_ioctl.h
+++ b/include/uapi/rdma/rdma_user_ioctl.h
@@ -43,7 +43,7 @@
 
 /*
  * General blocks assignments
- * It is closed on purpose do not expose it it user space
+ * It is closed on purpose - do not expose it to user space
  * #define MAD_CMD_BASE		0x00
  * #define HFI1_CMD_BAS		0xE0
  */