Merge tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux Pull xfs updates from Darrick Wong: "For this cycle we have the usual pile of cleanups and bug fixes, some performance improvements for online metadata scrubbing, massive speedups in the directory entry creation code, some performance improvement in the file ACL lookup code, a fix for a logging stall during mount, and fixes for concurrency problems. It has survived a couple of weeks of xfstests runs and merges cleanly. Summary: - Remove KM_SLEEP/KM_NOSLEEP. - Ensure that memory buffers for IO are properly sector-aligned to avoid problems that the block layer doesn't check. - Make the bmap scrubber more efficient in its record checking. - Don't crash xfs_db when superblock inode geometry is corrupt. - Fix btree key helper functions. - Remove unneeded error returns for things that can't fail. - Fix buffer logging bugs in repair. - Clean up iterator return values. - Speed up directory entry creation. - Enable allocation of xattr value memory buffer during lookup. - Fix readahead racing with truncate/punch hole. - Other minor cleanups. - Fix one AGI/AGF deadlock with RENAME_WHITEOUT. - More BUG -> WARN whackamole. - Fix various problems with the log failing to advance under certain circumstances, which results in stalls during mount" * tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (45 commits) xfs: push the grant head when the log head moves forward xfs: push iclog state cleaning into xlog_state_clean_log xfs: factor iclog state processing out of xlog_state_do_callback() xfs: factor callbacks out of xlog_state_do_callback() xfs: factor debug code out of xlog_state_do_callback() xfs: prevent CIL push holdoff in log recovery xfs: fix missed wakeup on l_flush_wait xfs: push the AIL in xlog_grant_head_wake xfs: Use WARN_ON_ONCE for bailout mount-operation xfs: Fix deadlock between AGI and AGF with RENAME_WHITEOUT xfs: define a flags field for the AG geometry ioctl structure xfs: add a xfs_valid_startblock helper xfs: remove the unused XFS_ALLOC_USERDATA flag xfs: cleanup xfs_fsb_to_db xfs: fix the dax supported check in xfs_ioctl_setattr_dax_invalidate xfs: Fix stale data exposure when readahead races with hole punch fs: Export generic_fadvise() mm: Handle MADV_WILLNEED through vfs_fadvise() xfs: allocate xattr buffer on demand xfs: consolidate attribute value copying ...

commit: b41dae061bbd722b9d7fa828f35d22035b218e18 [log] [tgz]
author: Linus Torvalds <torvalds@linux-foundation.org> Wed Sep 18 18:32:43 2019 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> Wed Sep 18 18:32:43 2019 -0700
tree: a5c0bade0c3d221483b54204bfc47e4fdbf09316
parent: e6bc9de714972cac34daa1dc1567ee48a47a9342 [diff]
parent: 14e15f1bcd738dc13dd7c1e78e4800e8bc577980 [diff]
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 16bb9a3..da031b9 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c

@@ -3,10 +3,10 @@
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
-#include <linux/sched/mm.h>
+#include "xfs.h"
 #include <linux/backing-dev.h>
-#include "kmem.h"
 #include "xfs_message.h"
+#include "xfs_trace.h"
 
 void *
 kmem_alloc(size_t size, xfs_km_flags_t flags)
@@ -15,9 +15,11 @@
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_alloc(size, flags, _RET_IP_);
+
 	do {
 		ptr = kmalloc(size, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
@@ -28,28 +30,24 @@
 	} while (1);
 }
 
-void *
-kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+
+/*
+ * __vmalloc() will allocate data pages and auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
+ * we need to tell memory reclaim that we are in such a context via
+ * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
+ * and potentially deadlocking.
+ */
+static void *
+__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
 {
 	unsigned nofs_flag = 0;
 	void	*ptr;
-	gfp_t	lflags;
+	gfp_t	lflags = kmem_flags_convert(flags);
 
-	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
-	if (ptr)
-		return ptr;
-
-	/*
-	 * __vmalloc() will allocate data pages and auxillary structures (e.g.
-	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
-	 * here. Hence we need to tell memory reclaim that we are in such a
-	 * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
-	 * the filesystem here and potentially deadlocking.
-	 */
 	if (flags & KM_NOFS)
 		nofs_flag = memalloc_nofs_save();
 
-	lflags = kmem_flags_convert(flags);
 	ptr = __vmalloc(size, lflags, PAGE_KERNEL);
 
 	if (flags & KM_NOFS)
@@ -58,6 +56,44 @@
 	return ptr;
 }
 
+/*
+ * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
+ * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
+ * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
+ * aligned region.
+ */
+void *
+kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	trace_kmem_alloc_io(size, flags, _RET_IP_);
+
+	if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
+		align_mask = PAGE_SIZE - 1;
+
+	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+	if (ptr) {
+		if (!((uintptr_t)ptr & align_mask))
+			return ptr;
+		kfree(ptr);
+	}
+	return __kmem_vmalloc(size, flags);
+}
+
+void *
+kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	trace_kmem_alloc_large(size, flags, _RET_IP_);
+
+	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+	if (ptr)
+		return ptr;
+	return __kmem_vmalloc(size, flags);
+}
+
 void *
 kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 {
@@ -65,9 +101,11 @@
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_realloc(newsize, flags, _RET_IP_);
+
 	do {
 		ptr = krealloc(old, newsize, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
@@ -85,9 +123,10 @@
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
 	do {
 		ptr = kmem_cache_alloc(zone, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,

diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 267655a..8170d95 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h

@@ -16,8 +16,6 @@
  */
 
 typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_SLEEP	((__force xfs_km_flags_t)0x0001u)
-#define KM_NOSLEEP	((__force xfs_km_flags_t)0x0002u)
 #define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
 #define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
 #define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
@@ -32,15 +30,11 @@
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+	BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
 
-	if (flags & KM_NOSLEEP) {
-		lflags = GFP_ATOMIC | __GFP_NOWARN;
-	} else {
-		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if (flags & KM_NOFS)
-			lflags &= ~__GFP_FS;
-	}
+	lflags = GFP_KERNEL | __GFP_NOWARN;
+	if (flags & KM_NOFS)
+		lflags &= ~__GFP_FS;
 
 	/*
 	 * Default page/slab allocator behavior is to retry for ever
@@ -59,6 +53,7 @@
 }
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
 extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
 static inline void  kmem_free(const void *ptr)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 372ad55..533b04a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c

@@ -2205,7 +2205,7 @@
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 	ASSERT(oinfo != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
 	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
 	new->xefi_blockcount = 1;
 	new->xefi_oinfo = *oinfo;

diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d6ed5d2..58fa85c 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h

@@ -81,10 +81,9 @@
 /*
  * Defines for datatype
  */
-#define XFS_ALLOC_USERDATA		(1 << 0)/* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)/* special case start of file */
-#define XFS_ALLOC_USERDATA_ZERO		(1 << 2)/* zero extent on allocation */
-#define XFS_ALLOC_NOBUSY		(1 << 3)/* Busy extents not allowed */
+#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 0)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO		(1 << 1)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY		(1 << 2)/* Busy extents not allowed */
 
 static inline bool
 xfs_alloc_is_userdata(int datatype)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index d48fcf1..510ca69 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c

@@ -97,7 +97,10 @@
  * Overall external interface routines.
  *========================================================================*/
 
-/* Retrieve an extended attribute and its value.  Must have ilock. */
+/*
+ * Retrieve an extended attribute and its value.  Must have ilock.
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
 int
 xfs_attr_get_ilocked(
 	struct xfs_inode	*ip,
@@ -115,12 +118,28 @@
 		return xfs_attr_node_get(args);
 }
 
-/* Retrieve an extended attribute by name, and its value. */
+/*
+ * Retrieve an extended attribute by name, and its value if requested.
+ *
+ * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
+ * just an indication whether the attribute exists and the size of the value if
+ * it exists. The size is returned in @valuelenp,
+ *
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * @valuelenp, return -ERANGE with the size of the attribute that was found in
+ * @valuelenp.
+ *
+ * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
+ * existence of the attribute has been determined. On success, return that
+ * buffer to the caller and leave them to free it. On failure, free any
+ * allocated buffer and ensure the buffer pointer returned to the caller is
+ * null.
+ */
 int
 xfs_attr_get(
 	struct xfs_inode	*ip,
 	const unsigned char	*name,
-	unsigned char		*value,
+	unsigned char		**value,
 	int			*valuelenp,
 	int			flags)
 {
@@ -128,6 +147,8 @@
 	uint			lock_mode;
 	int			error;
 
+	ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
+
 	XFS_STATS_INC(ip->i_mount, xs_attr_get);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -137,17 +158,29 @@
 	if (error)
 		return error;
 
-	args.value = value;
-	args.valuelen = *valuelenp;
 	/* Entirely possible to look up a name which doesn't exist */
 	args.op_flags = XFS_DA_OP_OKNOENT;
+	if (flags & ATTR_ALLOC)
+		args.op_flags |= XFS_DA_OP_ALLOCVAL;
+	else
+		args.value = *value;
+	args.valuelen = *valuelenp;
 
 	lock_mode = xfs_ilock_attr_map_shared(ip);
 	error = xfs_attr_get_ilocked(ip, &args);
 	xfs_iunlock(ip, lock_mode);
-
 	*valuelenp = args.valuelen;
-	return error == -EEXIST ? 0 : error;
+
+	/* on error, we have to clean up allocated value buffers */
+	if (error) {
+		if (flags & ATTR_ALLOC) {
+			kmem_free(args.value);
+			*value = NULL;
+		}
+		return error;
+	}
+	*value = args.value;
+	return 0;
 }
 
 /*
@@ -768,6 +801,8 @@
  *
  * This leaf block cannot have a "remote" value, we only call this routine
  * if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
 xfs_attr_leaf_get(xfs_da_args_t *args)
@@ -789,9 +824,6 @@
 	}
 	error = xfs_attr3_leaf_getvalue(bp, args);
 	xfs_trans_brelse(args->trans, bp);
-	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
-		error = xfs_attr_rmtval_get(args);
-	}
 	return error;
 }
 
@@ -1268,11 +1300,13 @@
 }
 
 /*
- * Look up a filename in a node attribute list.
+ * Retrieve the attribute data from a node attribute list.
  *
  * This routine gets called for any attribute fork that has more than one
  * block, ie: both true Btree attr lists and for single-leaf-blocks with
  * "remote" values taking up more blocks.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
 xfs_attr_node_get(xfs_da_args_t *args)
@@ -1294,24 +1328,21 @@
 	error = xfs_da3_node_lookup_int(state, &retval);
 	if (error) {
 		retval = error;
-	} else if (retval == -EEXIST) {
-		blk = &state->path.blk[ state->path.active-1 ];
-		ASSERT(blk->bp != NULL);
-		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-
-		/*
-		 * Get the value, local or "remote"
-		 */
-		retval = xfs_attr3_leaf_getvalue(blk->bp, args);
-		if (!retval && (args->rmtblkno > 0)
-		    && !(args->flags & ATTR_KERNOVAL)) {
-			retval = xfs_attr_rmtval_get(args);
-		}
+		goto out_release;
 	}
+	if (retval != -EEXIST)
+		goto out_release;
+
+	/*
+	 * Get the value, local or "remote"
+	 */
+	blk = &state->path.blk[state->path.active - 1];
+	retval = xfs_attr3_leaf_getvalue(blk->bp, args);
 
 	/*
 	 * If not in a transaction, we have to release all the buffers.
 	 */
+out_release:
 	for (i = 0; i < state->path.active; i++) {
 		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
 		state->path.blk[i].bp = NULL;

diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index ff28ebf..94badfa 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h

@@ -37,6 +37,7 @@
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
 
 #define ATTR_INCOMPLETE	0x4000	/* [kernel] return INCOMPLETE attr keys */
+#define ATTR_ALLOC	0x8000	/* allocate xattr buffer on demand */
 
 #define XFS_ATTR_FLAGS \
 	{ ATTR_DONTFOLLOW, 	"DONTFOLLOW" }, \
@@ -47,7 +48,8 @@
 	{ ATTR_REPLACE,		"REPLACE" }, \
 	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
 	{ ATTR_KERNOVAL,	"KERNOVAL" }, \
-	{ ATTR_INCOMPLETE,	"INCOMPLETE" }
+	{ ATTR_INCOMPLETE,	"INCOMPLETE" }, \
+	{ ATTR_ALLOC,		"ALLOC" }
 
 /*
  * The maximum size (into the kernel or returned from the kernel) of an
@@ -143,7 +145,7 @@
 int xfs_inode_hasattr(struct xfs_inode *ip);
 int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
 int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-		 unsigned char *value, int *valuelenp, int flags);
+		 unsigned char **value, int *valuelenp, int flags);
 int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 		 unsigned char *value, int valuelen, int flags);
 int xfs_attr_set_args(struct xfs_da_args *args);

diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 70eb941..b9f0196 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c

@@ -393,6 +393,50 @@
 	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
 }
 
+static int
+xfs_attr_copy_value(
+	struct xfs_da_args	*args,
+	unsigned char		*value,
+	int			valuelen)
+{
+	/*
+	 * No copy if all we have to do is get the length
+	 */
+	if (args->flags & ATTR_KERNOVAL) {
+		args->valuelen = valuelen;
+		return 0;
+	}
+
+	/*
+	 * No copy if the length of the existing buffer is too small
+	 */
+	if (args->valuelen < valuelen) {
+		args->valuelen = valuelen;
+		return -ERANGE;
+	}
+
+	if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
+		args->value = kmem_alloc_large(valuelen, 0);
+		if (!args->value)
+			return -ENOMEM;
+	}
+	args->valuelen = valuelen;
+
+	/* remote block xattr requires IO for copy-in */
+	if (args->rmtblkno)
+		return xfs_attr_rmtval_get(args);
+
+	/*
+	 * This is to prevent a GCC warning because the remote xattr case
+	 * doesn't have a value to pass in. In that case, we never reach here,
+	 * but GCC can't work that out and so throws a "passing NULL to
+	 * memcpy" warning.
+	 */
+	if (!value)
+		return -EINVAL;
+	memcpy(args->value, value, valuelen);
+	return 0;
+}
 
 /*========================================================================
  * External routines when attribute fork size < XFS_LITINO(mp).
@@ -720,15 +764,19 @@
 }
 
 /*
- * Look up a name in a shortform attribute list structure.
+ * Retreive the attribute value and length.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
  */
-/*ARGSUSED*/
 int
-xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+xfs_attr_shortform_getvalue(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int i;
+	struct xfs_attr_shortform *sf;
+	struct xfs_attr_sf_entry *sfe;
+	int			i;
 
 	ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
 	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
@@ -741,18 +789,8 @@
 			continue;
 		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = sfe->valuelen;
-			return -EEXIST;
-		}
-		if (args->valuelen < sfe->valuelen) {
-			args->valuelen = sfe->valuelen;
-			return -ERANGE;
-		}
-		args->valuelen = sfe->valuelen;
-		memcpy(args->value, &sfe->nameval[args->namelen],
-						    args->valuelen);
-		return -EEXIST;
+		return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+						sfe->valuelen);
 	}
 	return -ENOATTR;
 }
@@ -782,7 +820,7 @@
 	ifp = dp->i_afp;
 	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
 	size = be16_to_cpu(sf->hdr.totsize);
-	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	tmpbuffer = kmem_alloc(size, 0);
 	ASSERT(tmpbuffer != NULL);
 	memcpy(tmpbuffer, ifp->if_u1.if_data, size);
 	sf = (xfs_attr_shortform_t *)tmpbuffer;
@@ -985,7 +1023,7 @@
 
 	trace_xfs_attr_leaf_to_sf(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
 	if (!tmpbuffer)
 		return -ENOMEM;
 
@@ -1448,7 +1486,7 @@
 
 	trace_xfs_attr_leaf_compact(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
 	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
 	memset(bp->b_addr, 0, args->geo->blksize);
 	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -2167,7 +2205,7 @@
 		struct xfs_attr_leafblock *tmp_leaf;
 		struct xfs_attr3_icleaf_hdr tmphdr;
 
-		tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+		tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
 
 		/*
 		 * Copy the header into the temp leaf so that all the stuff
@@ -2350,6 +2388,10 @@
 /*
  * Get the value associated with an attribute name from a leaf attribute
  * list structure.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
  */
 int
 xfs_attr3_leaf_getvalue(
@@ -2361,7 +2403,6 @@
 	struct xfs_attr_leaf_entry *entry;
 	struct xfs_attr_leaf_name_local *name_loc;
 	struct xfs_attr_leaf_name_remote *name_rmt;
-	int			valuelen;
 
 	leaf = bp->b_addr;
 	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
@@ -2373,36 +2414,19 @@
 		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
 		ASSERT(name_loc->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
-		valuelen = be16_to_cpu(name_loc->valuelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = valuelen;
-			return 0;
-		}
-		if (args->valuelen < valuelen) {
-			args->valuelen = valuelen;
-			return -ERANGE;
-		}
-		args->valuelen = valuelen;
-		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		ASSERT(name_rmt->namelen == args->namelen);
-		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
-		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-						       args->rmtvaluelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = args->rmtvaluelen;
-			return 0;
-		}
-		if (args->valuelen < args->rmtvaluelen) {
-			args->valuelen = args->rmtvaluelen;
-			return -ERANGE;
-		}
-		args->valuelen = args->rmtvaluelen;
+		return xfs_attr_copy_value(args,
+					&name_loc->nameval[args->namelen],
+					be16_to_cpu(name_loc->valuelen));
 	}
-	return 0;
+
+	name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+	ASSERT(name_rmt->namelen == args->namelen);
+	ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+	args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+	args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+	args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+					       args->rmtvaluelen);
+	return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
 }
 
 /*========================================================================

diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4eb30d3..3e39b7d 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c

@@ -358,6 +358,8 @@
 /*
  * Read the value associated with an attribute from the out-of-line buffer
  * that we stored it in.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 int
 xfs_attr_rmtval_get(

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 07aad70..054b4ce 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c

@@ -553,7 +553,7 @@
 #endif
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
 	new->xefi_startblock = bno;
 	new->xefi_blockcount = (xfs_extlen_t)len;
 	if (oinfo)
@@ -1099,7 +1099,7 @@
 	if (error)
 		goto trans_cancel;
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
 	switch (ip->i_d.di_format) {
@@ -1985,11 +1985,8 @@
 	}
 
 	/* add reverse mapping unless caller opted out */
-	if (!(bma->flags & XFS_BMAPI_NORMAP)) {
-		error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
-		if (error)
-			goto done;
-	}
+	if (!(bma->flags & XFS_BMAPI_NORMAP))
+		xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2471,9 +2468,7 @@
 	}
 
 	/* update reverse mappings */
-	error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
-	if (error)
-		goto done;
+	xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -2832,11 +2827,8 @@
 	}
 
 	/* add reverse mapping unless caller opted out */
-	if (!(flags & XFS_BMAPI_NORMAP)) {
-		error = xfs_rmap_map_extent(tp, ip, whichfork, new);
-		if (error)
-			goto done;
-	}
+	if (!(flags & XFS_BMAPI_NORMAP))
+		xfs_rmap_map_extent(tp, ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -4050,12 +4042,8 @@
 	 */
 	if (!(bma->flags & XFS_BMAPI_METADATA)) {
 		bma->datatype = XFS_ALLOC_NOBUSY;
-		if (whichfork == XFS_DATA_FORK) {
-			if (bma->offset == 0)
-				bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
-			else
-				bma->datatype |= XFS_ALLOC_USERDATA;
-		}
+		if (whichfork == XFS_DATA_FORK && bma->offset == 0)
+			bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
 		if (bma->flags & XFS_BMAPI_ZERO)
 			bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
 	}
@@ -4401,12 +4389,9 @@
 			 * If this is a CoW allocation, record the data in
 			 * the refcount btree for orphan recovery.
 			 */
-			if (whichfork == XFS_COW_FORK) {
-				error = xfs_refcount_alloc_cow_extent(tp,
-						bma.blkno, bma.length);
-				if (error)
-					goto error0;
-			}
+			if (whichfork == XFS_COW_FORK)
+				xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+						bma.length);
 		}
 
 		/* Deal with the allocated space we found.  */
@@ -4530,7 +4515,7 @@
 	if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
 		goto out_finish;
 	error = -EFSCORRUPTED;
-	if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
 		goto out_finish;
 
 	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
@@ -4540,12 +4525,8 @@
 	*imap = bma.got;
 	*seq = READ_ONCE(ifp->if_seq);
 
-	if (whichfork == XFS_COW_FORK) {
-		error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
-				bma.length);
-		if (error)
-			goto out_finish;
-	}
+	if (whichfork == XFS_COW_FORK)
+		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
 
 	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
 			whichfork);
@@ -5149,18 +5130,14 @@
 	}
 
 	/* remove reverse mapping */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, del);
-	if (error)
-		goto done;
+	xfs_rmap_unmap_extent(tp, ip, whichfork, del);
 
 	/*
 	 * If we need to, add to list of extents to delete.
 	 */
 	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
-			error = xfs_refcount_decrease_extent(tp, del);
-			if (error)
-				goto done;
+			xfs_refcount_decrease_extent(tp, del);
 		} else {
 			__xfs_bmap_add_free(tp, del->br_startblock,
 					del->br_blockcount, NULL,
@@ -5651,12 +5628,11 @@
 			&new);
 
 	/* update reverse mapping. rmap functions merge the rmaps for us */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, got);
-	if (error)
-		return error;
+	xfs_rmap_unmap_extent(tp, ip, whichfork, got);
 	memcpy(&new, got, sizeof(new));
 	new.br_startoff = left->br_startoff + left->br_blockcount;
-	return xfs_rmap_map_extent(tp, ip, whichfork, &new);
+	xfs_rmap_map_extent(tp, ip, whichfork, &new);
+	return 0;
 }
 
 static int
@@ -5695,10 +5671,9 @@
 			got);
 
 	/* update reverse mapping */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
-	if (error)
-		return error;
-	return xfs_rmap_map_extent(tp, ip, whichfork, got);
+	xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
+	xfs_rmap_map_extent(tp, ip, whichfork, got);
+	return 0;
 }
 
 int
@@ -6094,7 +6069,7 @@
 			bmap->br_blockcount,
 			bmap->br_state);
 
-	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
 	INIT_LIST_HEAD(&bi->bi_list);
 	bi->bi_type = type;
 	bi->bi_owner = ip;
@@ -6106,29 +6081,29 @@
 }
 
 /* Map an extent into a file. */
-int
+void
 xfs_bmap_map_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_bmap_is_update_needed(PREV))
-		return 0;
+		return;
 
-	return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
 }
 
 /* Unmap an extent out of a file. */
-int
+void
 xfs_bmap_unmap_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_bmap_is_update_needed(PREV))
-		return 0;
+		return;
 
-	return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
 }
 
 /*

diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8f597f9..5bb446d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h

@@ -171,6 +171,13 @@
 		!isnullstartblock(irec->br_startblock);
 }
 
+/*
+ * Check the mapping for obviously garbage allocations that could trash the
+ * filesystem immediately.
+ */
+#define xfs_valid_startblock(ip, startblock) \
+	((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+
 void	xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
 		xfs_filblks_t len);
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
@@ -254,9 +261,9 @@
 		enum xfs_bmap_intent_type type, int whichfork,
 		xfs_fileoff_t startoff, xfs_fsblock_t startblock,
 		xfs_filblks_t *blockcount, xfs_exntst_t state);
-int	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
-int	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
 
 static inline int xfs_bmap_fork_to_state(int whichfork)

diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fbb18ba..ffe608d 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c

@@ -400,8 +400,20 @@
 	union xfs_btree_key	*k1,
 	union xfs_btree_key	*k2)
 {
-	return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) -
-			  be64_to_cpu(k2->bmbt.br_startoff);
+	uint64_t		a = be64_to_cpu(k1->bmbt.br_startoff);
+	uint64_t		b = be64_to_cpu(k2->bmbt.br_startoff);
+
+	/*
+	 * Note: This routine previously casted a and b to int64 and subtracted
+	 * them to generate a result.  This lead to problems if b was the
+	 * "maximum" key value (all ones) being signed incorrectly, hence this
+	 * somewhat less efficient version.
+	 */
+	if (a > b)
+		return 1;
+	if (b > a)
+		return -1;
+	return 0;
 }
 
 static xfs_failaddr_t

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f1048ef..71de937 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c

@@ -4466,8 +4466,6 @@
  *				      btree block
  *
  * @bp: buffer containing the btree block
- * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
- * @pag_max_level: pointer to the per-ag max level field
  */
 xfs_failaddr_t
 xfs_btree_sblock_v5hdr_verify(
@@ -4600,7 +4598,7 @@
 
 		/* Callback */
 		error = fn(cur, recp, priv);
-		if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+		if (error)
 			break;
 
 advloop:
@@ -4702,8 +4700,7 @@
 			 */
 			if (ldiff >= 0 && hdiff >= 0) {
 				error = fn(cur, recp, priv);
-				if (error < 0 ||
-				    error == XFS_BTREE_QUERY_RANGE_ABORT)
+				if (error)
 					break;
 			} else if (hdiff < 0) {
 				/* Record is larger than high key; pop. */
@@ -4774,8 +4771,7 @@
  * Query a btree for all records overlapping a given interval of keys.  The
  * supplied function will be called with each record found; return one of the
  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
- * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
- * negative error code.
+ * code.  This function returns -ECANCELED, zero, or a negative error code.
  */
 int
 xfs_btree_query_range(
@@ -4891,7 +4887,7 @@
 	union xfs_btree_rec		*rec,
 	void				*priv)
 {
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /* Is there a record covering a given range of keys? */
@@ -4906,7 +4902,7 @@
 
 	error = xfs_btree_query_range(cur, low, high,
 			&xfs_btree_has_record_helper, NULL);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+	if (error == -ECANCELED) {
 		*exists = true;
 		return 0;
 	}

diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index fa3cd8a..ced1e65 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h

@@ -464,9 +464,13 @@
 uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
 unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
 
-/* return codes */
-#define XFS_BTREE_QUERY_RANGE_CONTINUE	(XFS_ITER_CONTINUE) /* keep iterating */
-#define XFS_BTREE_QUERY_RANGE_ABORT	(XFS_ITER_ABORT)    /* stop iterating */
+/*
+ * Return codes for the query range iterator function are 0 to continue
+ * iterating, and non-zero to stop iterating.  Any non-zero value will be
+ * passed up to the _query_range caller.  The special value -ECANCELED can be
+ * used to stop iteration, because _query_range never generates that error
+ * code on its own.
+ */
 typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
 		union xfs_btree_rec *rec, void *priv);
 

diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 0bf56e9..4fd1223 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c

@@ -2098,7 +2098,7 @@
 		 * If we didn't get it and the block might work if fragmented,
 		 * try without the CONTIG flag.  Loop until we get it all.
 		 */
-		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		mapp = kmem_alloc(sizeof(*mapp) * count, 0);
 		for (b = *bno, mapi = 0; b < *bno + count; ) {
 			nmap = min(XFS_BMAP_MAX_NMAP, count);
 			c = (int)(*bno + count - b);
@@ -2480,7 +2480,7 @@
 
 	if (nirecs > 1) {
 		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
-				  KM_SLEEP | KM_NOFS);
+				  KM_NOFS);
 		if (!map)
 			return -ENOMEM;
 		*mapp = map;
@@ -2539,7 +2539,7 @@
 		 */
 		if (nfsb != 1)
 			irecs = kmem_zalloc(sizeof(irec) * nfsb,
-					    KM_SLEEP | KM_NOFS);
+					    KM_NOFS);
 
 		nirecs = nfsb;
 		error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,

diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 84dd865..ae0bbd2 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h

@@ -81,13 +81,15 @@
 #define XFS_DA_OP_ADDNAME	0x0004	/* this is an add operation */
 #define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
 #define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
+#define XFS_DA_OP_ALLOCVAL	0x0020	/* lookup to alloc buffer if found  */
 
 #define XFS_DA_OP_FLAGS \
 	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
 	{ XFS_DA_OP_RENAME,	"RENAME" }, \
 	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
 	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
-	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }
+	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }, \
+	{ XFS_DA_OP_ALLOCVAL,	"ALLOCVAL" }
 
 /*
  * Storage for holding state during Btree searches and split/join ops.

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eb2be2a..2255752 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c

@@ -517,7 +517,7 @@
 	}
 	if (!dfp) {
 		dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
-				KM_SLEEP | KM_NOFS);
+				KM_NOFS);
 		dfp->dfp_type = type;
 		dfp->dfp_intent = NULL;
 		dfp->dfp_done = NULL;

diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 6784072..867c5de 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c

@@ -110,9 +110,9 @@
 
 	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
 	mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				    KM_SLEEP | KM_MAYFAIL);
+				    KM_MAYFAIL);
 	mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				     KM_SLEEP | KM_MAYFAIL);
+				     KM_MAYFAIL);
 	if (!mp->m_dir_geo || !mp->m_attr_geo) {
 		kmem_free(mp->m_dir_geo);
 		kmem_free(mp->m_attr_geo);
@@ -217,7 +217,7 @@
 	if (error)
 		return error;
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -254,7 +254,7 @@
 		XFS_STATS_INC(dp->i_mount, xs_dir_create);
 	}
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -353,7 +353,7 @@
 	 * lockdep Doing this avoids having to add a bunch of lockdep class
 	 * annotations into the reclaim path for the ilock.
 	 */
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
 	args->namelen = name->len;
@@ -422,7 +422,7 @@
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
 	XFS_STATS_INC(dp->i_mount, xs_dir_remove);
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -483,7 +483,7 @@
 	if (rval)
 		return rval;
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 

diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index a6fb0cc..9595ced 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c

@@ -1092,7 +1092,7 @@
 	 * Copy the directory into a temporary buffer.
 	 * Then pitch the incore inode data so we can make extents.
 	 */
-	sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+	sfp = kmem_alloc(ifp->if_bytes, 0);
 	memcpy(sfp, oldsfp, ifp->if_bytes);
 
 	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);

diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 1fc44ef..705c4f5 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c

@@ -32,8 +32,6 @@
 static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
 				 int index, xfs_da_state_blk_t *dblk,
 				 int *rval);
-static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
-				     xfs_da_state_blk_t *fblk);
 
 /*
  * Check internal consistency of a leafn block.
@@ -1611,6 +1609,353 @@
 }
 
 /*
+ * Add a new data block to the directory at the free space index that the caller
+ * has specified.
+ */
+static int
+xfs_dir2_node_add_datablk(
+	struct xfs_da_args	*args,
+	struct xfs_da_state_blk	*fblk,
+	xfs_dir2_db_t		*dbno,
+	struct xfs_buf		**dbpp,
+	struct xfs_buf		**fbpp,
+	int			*findex)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_dir2_data_free *bf;
+	struct xfs_dir2_data_hdr *hdr;
+	struct xfs_dir2_free	*free = NULL;
+	xfs_dir2_db_t		fbno;
+	struct xfs_buf		*fbp;
+	struct xfs_buf		*dbp;
+	__be16			*bests = NULL;
+	int			error;
+
+	/* Not allowed to allocate, return failure. */
+	if (args->total == 0)
+		return -ENOSPC;
+
+	/* Allocate and initialize the new data block.  */
+	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
+	if (error)
+		return error;
+	error = xfs_dir3_data_init(args, *dbno, &dbp);
+	if (error)
+		return error;
+
+	/*
+	 * Get the freespace block corresponding to the data block
+	 * that was just allocated.
+	 */
+	fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
+	error = xfs_dir2_free_try_read(tp, dp,
+			       xfs_dir2_db_to_da(args->geo, fbno), &fbp);
+	if (error)
+		return error;
+
+	/*
+	 * If there wasn't a freespace block, the read will
+	 * return a NULL fbp.  Allocate and initialize a new one.
+	 */
+	if (!fbp) {
+		error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
+		if (error)
+			return error;
+
+		if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
+			xfs_alert(mp,
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
+				__func__, (unsigned long long)dp->i_ino,
+				(long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
+				(long long)*dbno, (long long)fbno);
+			if (fblk) {
+				xfs_alert(mp,
+			" fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
+					fblk, (unsigned long long)fblk->blkno,
+					fblk->index, fblk->magic);
+			} else {
+				xfs_alert(mp, " ... fblk is NULL");
+			}
+			XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+			return -EFSCORRUPTED;
+		}
+
+		/* Get a buffer for the new block. */
+		error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+		if (error)
+			return error;
+		free = fbp->b_addr;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+		/* Remember the first slot as our empty slot. */
+		freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
+							XFS_DIR2_FREE_OFFSET)) *
+				dp->d_ops->free_max_bests(args->geo);
+	} else {
+		free = fbp->b_addr;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+	}
+
+	/* Set the freespace block index from the data block number. */
+	*findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
+
+	/* Extend the freespace table if the new data block is off the end. */
+	if (*findex >= freehdr.nvalid) {
+		ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
+		freehdr.nvalid = *findex + 1;
+		bests[*findex] = cpu_to_be16(NULLDATAOFF);
+	}
+
+	/*
+	 * If this entry was for an empty data block (this should always be
+	 * true) then update the header.
+	 */
+	if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
+		freehdr.nused++;
+		dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+		xfs_dir2_free_log_header(args, fbp);
+	}
+
+	/* Update the freespace value for the new block in the table. */
+	hdr = dbp->b_addr;
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	bests[*findex] = bf[0].length;
+
+	*dbpp = dbp;
+	*fbpp = fbp;
+	return 0;
+}
+
+static int
+xfs_dir2_node_find_freeblk(
+	struct xfs_da_args	*args,
+	struct xfs_da_state_blk	*fblk,
+	xfs_dir2_db_t		*dbnop,
+	struct xfs_buf		**fbpp,
+	int			*findexp,
+	int			length)
+{
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_dir2_free	*free = NULL;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_buf		*fbp = NULL;
+	xfs_dir2_db_t		firstfbno;
+	xfs_dir2_db_t		lastfbno;
+	xfs_dir2_db_t		ifbno = -1;
+	xfs_dir2_db_t		dbno = -1;
+	xfs_dir2_db_t		fbno;
+	xfs_fileoff_t		fo;
+	__be16			*bests = NULL;
+	int			findex = 0;
+	int			error;
+
+	/*
+	 * If we came in with a freespace block that means that lookup
+	 * found an entry with our hash value.  This is the freespace
+	 * block for that data entry.
+	 */
+	if (fblk) {
+		fbp = fblk->bp;
+		free = fbp->b_addr;
+		findex = fblk->index;
+		if (findex >= 0) {
+			/* caller already found the freespace for us. */
+			bests = dp->d_ops->free_bests_p(free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+			ASSERT(findex < freehdr.nvalid);
+			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+			ASSERT(be16_to_cpu(bests[findex]) >= length);
+			dbno = freehdr.firstdb + findex;
+			goto found_block;
+		}
+
+		/*
+		 * The data block looked at didn't have enough room.
+		 * We'll start at the beginning of the freespace entries.
+		 */
+		ifbno = fblk->blkno;
+		xfs_trans_brelse(tp, fbp);
+		fbp = NULL;
+		fblk->bp = NULL;
+	}
+
+	/*
+	 * If we don't have a data block yet, we're going to scan the freespace
+	 * data for a data block with enough free space in it.
+	 */
+	error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
+	if (error)
+		return error;
+	lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+	firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
+
+	for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
+		/* If it's ifbno we already looked at it. */
+		if (fbno == ifbno)
+			continue;
+
+		/*
+		 * Read the block.  There can be holes in the freespace blocks,
+		 * so this might not succeed.  This should be really rare, so
+		 * there's no reason to avoid it.
+		 */
+		error = xfs_dir2_free_try_read(tp, dp,
+				xfs_dir2_db_to_da(args->geo, fbno),
+				&fbp);
+		if (error)
+			return error;
+		if (!fbp)
+			continue;
+
+		free = fbp->b_addr;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+		/* Scan the free entry array for a large enough free space. */
+		for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
+			if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+			    be16_to_cpu(bests[findex]) >= length) {
+				dbno = freehdr.firstdb + findex;
+				goto found_block;
+			}
+		}
+
+		/* Didn't find free space, go on to next free block */
+		xfs_trans_brelse(tp, fbp);
+	}
+
+found_block:
+	*dbnop = dbno;
+	*fbpp = fbp;
+	*findexp = findex;
+	return 0;
+}
+
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int
+xfs_dir2_node_addname_int(
+	struct xfs_da_args	*args,		/* operation arguments */
+	struct xfs_da_state_blk	*fblk)		/* optional freespace block */
+{
+	struct xfs_dir2_data_unused *dup;	/* data unused entry pointer */
+	struct xfs_dir2_data_entry *dep;	/* data entry pointer */
+	struct xfs_dir2_data_hdr *hdr;		/* data block header */
+	struct xfs_dir2_data_free *bf;
+	struct xfs_dir2_free	*free = NULL;	/* freespace block structure */
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_buf		*dbp;		/* data block buffer */
+	struct xfs_buf		*fbp;		/* freespace buffer */
+	xfs_dir2_data_aoff_t	aoff;
+	xfs_dir2_db_t		dbno;		/* data block number */
+	int			error;		/* error return value */
+	int			findex;		/* freespace entry index */
+	int			length;		/* length of the new entry */
+	int			logfree = 0;	/* need to log free entry */
+	int			needlog = 0;	/* need to log data header */
+	int			needscan = 0;	/* need to rescan data frees */
+	__be16			*tagp;		/* data entry tag pointer */
+	__be16			*bests;
+
+	length = dp->d_ops->data_entsize(args->namelen);
+	error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
+					   length);
+	if (error)
+		return error;
+
+	/*
+	 * Now we know if we must allocate blocks, so if we are checking whether
+	 * we can insert without allocation then we can return now.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+		if (dbno == -1)
+			return -ENOSPC;
+		return 0;
+	}
+
+	/*
+	 * If we don't have a data block, we need to allocate one and make
+	 * the freespace entries refer to it.
+	 */
+	if (dbno == -1) {
+		/* we're going to have to log the free block index later */
+		logfree = 1;
+		error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
+						  &findex);
+	} else {
+		/* Read the data block in. */
+		error = xfs_dir3_data_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, dbno),
+					   -1, &dbp);
+	}
+	if (error)
+		return error;
+
+	/* setup for data block up now */
+	hdr = dbp->b_addr;
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	ASSERT(be16_to_cpu(bf[0].length) >= length);
+
+	/* Point to the existing unused space. */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)hdr + be16_to_cpu(bf[0].offset));
+
+	/* Mark the first part of the unused space, inuse for us. */
+	aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+	error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
+			&needlog, &needscan);
+	if (error) {
+		xfs_trans_brelse(tp, dbp);
+		return error;
+	}
+
+	/* Fill in the new entry and log it. */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, dep->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, dbp, dep);
+
+	/* Rescan the freespace and log the data block if needed. */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+
+	/* If the freespace block entry is now wrong, update it. */
+	free = fbp->b_addr;
+	bests = dp->d_ops->free_bests_p(free);
+	if (bests[findex] != bf[0].length) {
+		bests[findex] = bf[0].length;
+		logfree = 1;
+	}
+
+	/* Log the freespace entry if needed. */
+	if (logfree)
+		xfs_dir2_free_log_bests(args, fbp, findex, findex);
+
+	/* Return the data block and offset in args. */
+	args->blkno = (xfs_dablk_t)dbno;
+	args->index = be16_to_cpu(*tagp);
+	return 0;
+}
+
+/*
  * Top-level node form directory addname routine.
  */
 int						/* error */
@@ -1680,383 +2025,6 @@
 }
 
 /*
- * Add the data entry for a node-format directory name addition.
- * The leaf entry is added in xfs_dir2_leafn_add.
- * We may enter with a freespace block that the lookup found.
- */
-static int					/* error */
-xfs_dir2_node_addname_int(
-	xfs_da_args_t		*args,		/* operation arguments */
-	xfs_da_state_blk_t	*fblk)		/* optional freespace block */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_dir2_db_t		dbno;		/* data block number */
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* data unused entry pointer */
-	int			error;		/* error return value */
-	xfs_dir2_db_t		fbno;		/* freespace block number */
-	struct xfs_buf		*fbp;		/* freespace buffer */
-	int			findex;		/* freespace entry index */
-	xfs_dir2_free_t		*free=NULL;	/* freespace block structure */
-	xfs_dir2_db_t		ifbno;		/* initial freespace block no */
-	xfs_dir2_db_t		lastfbno=0;	/* highest freespace block no */
-	int			length;		/* length of the new entry */
-	int			logfree;	/* need to log free entry */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log data header */
-	int			needscan;	/* need to rescan data frees */
-	__be16			*tagp;		/* data entry tag pointer */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	__be16			*bests;
-	struct xfs_dir3_icfree_hdr freehdr;
-	struct xfs_dir2_data_free *bf;
-	xfs_dir2_data_aoff_t	aoff;
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	length = dp->d_ops->data_entsize(args->namelen);
-	/*
-	 * If we came in with a freespace block that means that lookup
-	 * found an entry with our hash value.  This is the freespace
-	 * block for that data entry.
-	 */
-	if (fblk) {
-		fbp = fblk->bp;
-		/*
-		 * Remember initial freespace block number.
-		 */
-		ifbno = fblk->blkno;
-		free = fbp->b_addr;
-		findex = fblk->index;
-		bests = dp->d_ops->free_bests_p(free);
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-		/*
-		 * This means the free entry showed that the data block had
-		 * space for our entry, so we remembered it.
-		 * Use that data block.
-		 */
-		if (findex >= 0) {
-			ASSERT(findex < freehdr.nvalid);
-			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
-			ASSERT(be16_to_cpu(bests[findex]) >= length);
-			dbno = freehdr.firstdb + findex;
-		} else {
-			/*
-			 * The data block looked at didn't have enough room.
-			 * We'll start at the beginning of the freespace entries.
-			 */
-			dbno = -1;
-			findex = 0;
-		}
-	} else {
-		/*
-		 * Didn't come in with a freespace block, so no data block.
-		 */
-		ifbno = dbno = -1;
-		fbp = NULL;
-		findex = 0;
-	}
-
-	/*
-	 * If we don't have a data block yet, we're going to scan the
-	 * freespace blocks looking for one.  Figure out what the
-	 * highest freespace block number is.
-	 */
-	if (dbno == -1) {
-		xfs_fileoff_t	fo;		/* freespace block number */
-
-		if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
-			return error;
-		lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
-		fbno = ifbno;
-	}
-	/*
-	 * While we haven't identified a data block, search the freeblock
-	 * data for a good data block.  If we find a null freeblock entry,
-	 * indicating a hole in the data blocks, remember that.
-	 */
-	while (dbno == -1) {
-		/*
-		 * If we don't have a freeblock in hand, get the next one.
-		 */
-		if (fbp == NULL) {
-			/*
-			 * Happens the first time through unless lookup gave
-			 * us a freespace block to start with.
-			 */
-			if (++fbno == 0)
-				fbno = xfs_dir2_byte_to_db(args->geo,
-							XFS_DIR2_FREE_OFFSET);
-			/*
-			 * If it's ifbno we already looked at it.
-			 */
-			if (fbno == ifbno)
-				fbno++;
-			/*
-			 * If it's off the end we're done.
-			 */
-			if (fbno >= lastfbno)
-				break;
-			/*
-			 * Read the block.  There can be holes in the
-			 * freespace blocks, so this might not succeed.
-			 * This should be really rare, so there's no reason
-			 * to avoid it.
-			 */
-			error = xfs_dir2_free_try_read(tp, dp,
-					xfs_dir2_db_to_da(args->geo, fbno),
-					&fbp);
-			if (error)
-				return error;
-			if (!fbp)
-				continue;
-			free = fbp->b_addr;
-			findex = 0;
-		}
-		/*
-		 * Look at the current free entry.  Is it good enough?
-		 *
-		 * The bests initialisation should be where the bufer is read in
-		 * the above branch. But gcc is too stupid to realise that bests
-		 * and the freehdr are actually initialised if they are placed
-		 * there, so we have to do it here to avoid warnings. Blech.
-		 */
-		bests = dp->d_ops->free_bests_p(free);
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
-		    be16_to_cpu(bests[findex]) >= length)
-			dbno = freehdr.firstdb + findex;
-		else {
-			/*
-			 * Are we done with the freeblock?
-			 */
-			if (++findex == freehdr.nvalid) {
-				/*
-				 * Drop the block.
-				 */
-				xfs_trans_brelse(tp, fbp);
-				fbp = NULL;
-				if (fblk && fblk->bp)
-					fblk->bp = NULL;
-			}
-		}
-	}
-	/*
-	 * If we don't have a data block, we need to allocate one and make
-	 * the freespace entries refer to it.
-	 */
-	if (unlikely(dbno == -1)) {
-		/*
-		 * Not allowed to allocate, return failure.
-		 */
-		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
-			return -ENOSPC;
-
-		/*
-		 * Allocate and initialize the new data block.
-		 */
-		if (unlikely((error = xfs_dir2_grow_inode(args,
-							 XFS_DIR2_DATA_SPACE,
-							 &dbno)) ||
-		    (error = xfs_dir3_data_init(args, dbno, &dbp))))
-			return error;
-
-		/*
-		 * If (somehow) we have a freespace block, get rid of it.
-		 */
-		if (fbp)
-			xfs_trans_brelse(tp, fbp);
-		if (fblk && fblk->bp)
-			fblk->bp = NULL;
-
-		/*
-		 * Get the freespace block corresponding to the data block
-		 * that was just allocated.
-		 */
-		fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
-		error = xfs_dir2_free_try_read(tp, dp,
-				       xfs_dir2_db_to_da(args->geo, fbno),
-				       &fbp);
-		if (error)
-			return error;
-
-		/*
-		 * If there wasn't a freespace block, the read will
-		 * return a NULL fbp.  Allocate and initialize a new one.
-		 */
-		if (!fbp) {
-			error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
-						    &fbno);
-			if (error)
-				return error;
-
-			if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
-				xfs_alert(mp,
-"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d",
-					__func__, (unsigned long long)dp->i_ino,
-					(long long)dp->d_ops->db_to_fdb(
-								args->geo, dbno),
-					(long long)dbno, (long long)fbno,
-					(unsigned long long)ifbno, lastfbno);
-				if (fblk) {
-					xfs_alert(mp,
-				" fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
-						fblk,
-						(unsigned long long)fblk->blkno,
-						fblk->index,
-						fblk->magic);
-				} else {
-					xfs_alert(mp, " ... fblk is NULL");
-				}
-				XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
-						 XFS_ERRLEVEL_LOW, mp);
-				return -EFSCORRUPTED;
-			}
-
-			/*
-			 * Get a buffer for the new block.
-			 */
-			error = xfs_dir3_free_get_buf(args, fbno, &fbp);
-			if (error)
-				return error;
-			free = fbp->b_addr;
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-			/*
-			 * Remember the first slot as our empty slot.
-			 */
-			freehdr.firstdb =
-				(fbno - xfs_dir2_byte_to_db(args->geo,
-							XFS_DIR2_FREE_OFFSET)) *
-					dp->d_ops->free_max_bests(args->geo);
-		} else {
-			free = fbp->b_addr;
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		}
-
-		/*
-		 * Set the freespace block index from the data block number.
-		 */
-		findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
-		/*
-		 * If it's after the end of the current entries in the
-		 * freespace block, extend that table.
-		 */
-		if (findex >= freehdr.nvalid) {
-			ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
-			freehdr.nvalid = findex + 1;
-			/*
-			 * Tag new entry so nused will go up.
-			 */
-			bests[findex] = cpu_to_be16(NULLDATAOFF);
-		}
-		/*
-		 * If this entry was for an empty data block
-		 * (this should always be true) then update the header.
-		 */
-		if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
-			freehdr.nused++;
-			dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
-			xfs_dir2_free_log_header(args, fbp);
-		}
-		/*
-		 * Update the real value in the table.
-		 * We haven't allocated the data entry yet so this will
-		 * change again.
-		 */
-		hdr = dbp->b_addr;
-		bf = dp->d_ops->data_bestfree_p(hdr);
-		bests[findex] = bf[0].length;
-		logfree = 1;
-	}
-	/*
-	 * We had a data block so we don't have to make a new one.
-	 */
-	else {
-		/*
-		 * If just checking, we succeeded.
-		 */
-		if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-			return 0;
-
-		/*
-		 * Read the data block in.
-		 */
-		error = xfs_dir3_data_read(tp, dp,
-					   xfs_dir2_db_to_da(args->geo, dbno),
-					   -1, &dbp);
-		if (error)
-			return error;
-		hdr = dbp->b_addr;
-		bf = dp->d_ops->data_bestfree_p(hdr);
-		logfree = 0;
-	}
-	ASSERT(be16_to_cpu(bf[0].length) >= length);
-	/*
-	 * Point to the existing unused space.
-	 */
-	dup = (xfs_dir2_data_unused_t *)
-	      ((char *)hdr + be16_to_cpu(bf[0].offset));
-	needscan = needlog = 0;
-	/*
-	 * Mark the first part of the unused space, inuse for us.
-	 */
-	aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
-	error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
-			&needlog, &needscan);
-	if (error) {
-		xfs_trans_brelse(tp, dbp);
-		return error;
-	}
-	/*
-	 * Fill in the new entry and log it.
-	 */
-	dep = (xfs_dir2_data_entry_t *)dup;
-	dep->inumber = cpu_to_be64(args->inumber);
-	dep->namelen = args->namelen;
-	memcpy(dep->name, args->name, dep->namelen);
-	dp->d_ops->data_put_ftype(dep, args->filetype);
-	tagp = dp->d_ops->data_entry_tag_p(dep);
-	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	xfs_dir2_data_log_entry(args, dbp, dep);
-	/*
-	 * Rescan the block for bestfree if needed.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	/*
-	 * Log the data block header if needed.
-	 */
-	if (needlog)
-		xfs_dir2_data_log_header(args, dbp);
-	/*
-	 * If the freespace entry is now wrong, update it.
-	 */
-	bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
-	if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
-		bests[findex] = bf[0].length;
-		logfree = 1;
-	}
-	/*
-	 * Log the freespace entry if needed.
-	 */
-	if (logfree)
-		xfs_dir2_free_log_bests(args, fbp, findex, findex);
-	/*
-	 * Return the data block and offset in args, then drop the data block.
-	 */
-	args->blkno = (xfs_dablk_t)dbno;
-	args->index = be16_to_cpu(*tagp);
-	return 0;
-}
-
-/*
  * Lookup an entry in a node-format directory.
  * All the real work happens in xfs_da3_node_lookup_int.
  * The only real output is the inode number of the entry.

diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 0335892..85f14fc 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c

@@ -164,7 +164,7 @@
 	 * can free the block and copy the formatted data into the inode literal
 	 * area.
 	 */
-	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+	dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
 	hdr = bp->b_addr;
 
 	/*
@@ -436,7 +436,7 @@
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	old_isize = (int)dp->i_d.di_size;
-	buf = kmem_alloc(old_isize, KM_SLEEP);
+	buf = kmem_alloc(old_isize, 0);
 	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
 	memcpy(oldsfp, sfp, old_isize);
 	/*
@@ -1096,7 +1096,7 @@
 	 * Don't want xfs_idata_realloc copying the data here.
 	 */
 	oldsize = dp->i_df.if_bytes;
-	buf = kmem_alloc(oldsize, KM_SLEEP);
+	buf = kmem_alloc(oldsize, 0);
 	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	ASSERT(oldsfp->i8count == 1);
 	memcpy(buf, oldsfp, oldsize);
@@ -1169,7 +1169,7 @@
 	 * Don't want xfs_idata_realloc copying the data here.
 	 */
 	oldsize = dp->i_df.if_bytes;
-	buf = kmem_alloc(oldsize, KM_SLEEP);
+	buf = kmem_alloc(oldsize, 0);
 	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	ASSERT(oldsfp->i8count == 0);
 	memcpy(buf, oldsfp, oldsize);

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 52d03a3..39dd2b9 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h

@@ -287,7 +287,7 @@
 	uint32_t	ag_ifree;	/* o: inodes free */
 	uint32_t	ag_sick;	/* o: sick things in ag */
 	uint32_t	ag_checked;	/* o: checked metadata in ag */
-	uint32_t	ag_reserved32;	/* o: zero */
+	uint32_t	ag_flags;	/* i/o: flags for this ag */
 	uint64_t	ag_reserved[12];/* o: zero */
 };
 #define XFS_AG_GEOM_SICK_SB	(1 << 0)  /* superblock */

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 04377ab..588d446 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c

@@ -2787,8 +2787,13 @@
 	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
 			inodes);
 
-	/* Set the maximum inode count for this filesystem. */
-	if (sbp->sb_imax_pct) {
+	/*
+	 * Set the maximum inode count for this filesystem, being careful not
+	 * to use obviously garbage sb_inopblog/sb_inopblock values.  Regular
+	 * users should never get here due to failing sb verification, but
+	 * certain users (xfs_db) need to be usable even with corrupt metadata.
+	 */
+	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
 		/*
 		 * Make sure the maximum inode count is a multiple
 		 * of the units we allocate inodes in.

diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 27aa3f2..7bc8740 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c

@@ -616,7 +616,7 @@
  * sequence counter is seen before the modifications to the extent tree itself
  * take effect.
  */
-static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
+static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
 {
 	WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
 }
@@ -633,7 +633,7 @@
 	struct xfs_iext_leaf	*new = NULL;
 	int			nr_entries, i;
 
-	xfs_iext_inc_seq(ifp, state);
+	xfs_iext_inc_seq(ifp);
 
 	if (ifp->if_height == 0)
 		xfs_iext_alloc_root(ifp, cur);
@@ -875,7 +875,7 @@
 	ASSERT(ifp->if_u1.if_root != NULL);
 	ASSERT(xfs_iext_valid(ifp, cur));
 
-	xfs_iext_inc_seq(ifp, state);
+	xfs_iext_inc_seq(ifp);
 
 	nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
 	for (i = cur->pos; i < nr_entries; i++)
@@ -983,7 +983,7 @@
 {
 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
 
-	xfs_iext_inc_seq(ifp, state);
+	xfs_iext_inc_seq(ifp);
 
 	if (cur->pos == 0) {
 		struct xfs_bmbt_irec	old;

diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bf3e040..c643bee 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c

@@ -94,7 +94,7 @@
 		return 0;
 
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
 
 	switch (dip->di_aformat) {
 	case XFS_DINODE_FMT_LOCAL:
@@ -147,7 +147,7 @@
 
 	if (size) {
 		real_size = roundup(mem_size, 4);
-		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
 		memcpy(ifp->if_u1.if_data, data, size);
 		if (zero_terminate)
 			ifp->if_u1.if_data[size] = '\0';
@@ -302,7 +302,7 @@
 	}
 
 	ifp->if_broot_bytes = size;
-	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+	ifp->if_broot = kmem_alloc(size, KM_NOFS);
 	ASSERT(ifp->if_broot != NULL);
 	/*
 	 * Copy and convert from the on-disk structure
@@ -367,7 +367,7 @@
 		 */
 		if (ifp->if_broot_bytes == 0) {
 			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
-			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+			ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
 			ifp->if_broot_bytes = (int)new_size;
 			return;
 		}
@@ -382,7 +382,7 @@
 		new_max = cur_max + rec_diff;
 		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
 		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-				KM_SLEEP | KM_NOFS);
+				KM_NOFS);
 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -408,7 +408,7 @@
 	else
 		new_size = 0;
 	if (new_size > 0) {
-		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+		new_broot = kmem_alloc(new_size, KM_NOFS);
 		/*
 		 * First copy over the btree block header.
 		 */
@@ -492,7 +492,7 @@
 	 * We enforce that here.
 	 */
 	ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
-			roundup(new_size, 4), KM_SLEEP | KM_NOFS);
+			roundup(new_size, 4), KM_NOFS);
 	ifp->if_bytes = new_size;
 }
 
@@ -683,7 +683,7 @@
 		return;
 
 	ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
-				       KM_SLEEP | KM_NOFS);
+				       KM_NOFS);
 	ip->i_cowfp->if_flags = XFS_IFEXTENTS;
 	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
 	ip->i_cnextents = 0;

diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 51bb9bd..9a7fadb 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c

@@ -1174,7 +1174,7 @@
 /*
  * Record a refcount intent for later processing.
  */
-static int
+static void
 __xfs_refcount_add(
 	struct xfs_trans		*tp,
 	enum xfs_refcount_intent_type	type,
@@ -1189,44 +1189,43 @@
 			blockcount);
 
 	ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
-			KM_SLEEP | KM_NOFS);
+			KM_NOFS);
 	INIT_LIST_HEAD(&ri->ri_list);
 	ri->ri_type = type;
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
 
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
-	return 0;
 }
 
 /*
  * Increase the reference count of the blocks backing a file's extent.
  */
-int
+void
 xfs_refcount_increase_extent(
 	struct xfs_trans		*tp,
 	struct xfs_bmbt_irec		*PREV)
 {
 	if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
-		return 0;
+		return;
 
-	return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE,
-			PREV->br_startblock, PREV->br_blockcount);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+			PREV->br_blockcount);
 }
 
 /*
  * Decrease the reference count of the blocks backing a file's extent.
  */
-int
+void
 xfs_refcount_decrease_extent(
 	struct xfs_trans		*tp,
 	struct xfs_bmbt_irec		*PREV)
 {
 	if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
-		return 0;
+		return;
 
-	return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE,
-			PREV->br_startblock, PREV->br_blockcount);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+			PREV->br_blockcount);
 }
 
 /*
@@ -1541,47 +1540,40 @@
 }
 
 /* Record a CoW staging extent in the refcount btree. */
-int
+void
 xfs_refcount_alloc_cow_extent(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	int				error;
 
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
-		return 0;
+		return;
 
-	error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
-	if (error)
-		return error;
+	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
 
 	/* Add rmap entry */
-	return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+	xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
 			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
-int
+void
 xfs_refcount_free_cow_extent(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	int				error;
 
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
-		return 0;
+		return;
 
 	/* Remove rmap entry */
-	error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+	xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
 			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
-	if (error)
-		return error;
-
-	return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
 }
 
 struct xfs_refcount_recovery {
@@ -1602,7 +1594,7 @@
 	if (be32_to_cpu(rec->refc.rc_refcount) != 1)
 		return -EFSCORRUPTED;
 
-	rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP);
+	rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
 	xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
 	list_add_tail(&rr->rr_list, debris);
 
@@ -1679,10 +1671,8 @@
 		/* Free the orphan record */
 		agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
 		fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
-		error = xfs_refcount_free_cow_extent(tp, fsb,
+		xfs_refcount_free_cow_extent(tp, fsb,
 				rr->rr_rrec.rc_blockcount);
-		if (error)
-			goto out_trans;
 
 		/* Free the block. */
 		xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);

diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 1d9c518..2097955 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h

@@ -29,9 +29,9 @@
 	xfs_extlen_t				ri_blockcount;
 };
 
-extern int xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
-extern int xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
 
 extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
@@ -45,10 +45,10 @@
 		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
 		xfs_extlen_t *flen, bool find_end_of_shared);
 
-extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp,
-		xfs_fsblock_t fsb, xfs_extlen_t len);
-extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp,
-		xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+		xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+		xfs_extlen_t len);
 extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
 		xfs_agnumber_t agno);
 

diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index e6aeb39..38e9414 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c

@@ -168,7 +168,6 @@
 	union xfs_btree_rec	*rec,
 	struct xfs_rmap_irec	*irec)
 {
-	irec->rm_flags = 0;
 	irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
 	irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
 	irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
@@ -254,15 +253,15 @@
 			rec->rm_flags);
 
 	if (rec->rm_owner != info->high.rm_owner)
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
 	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
 	    rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 
 	*info->irec = *rec;
 	*info->stat = 1;
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /*
@@ -305,7 +304,7 @@
 
 	error = xfs_rmap_query_range(cur, &info.high, &info.high,
 			xfs_rmap_find_left_neighbor_helper, &info);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+	if (error == -ECANCELED)
 		error = 0;
 	if (*stat)
 		trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
@@ -330,16 +329,16 @@
 			rec->rm_flags);
 
 	if (rec->rm_owner != info->high.rm_owner)
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
 	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
 	    (rec->rm_offset > info->high.rm_offset ||
 	     rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 
 	*info->irec = *rec;
 	*info->stat = 1;
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /*
@@ -377,7 +376,7 @@
 			cur->bc_private.a.agno, bno, 0, owner, offset, flags);
 	error = xfs_rmap_query_range(cur, &info.high, &info.high,
 			xfs_rmap_lookup_le_range_helper, &info);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+	if (error == -ECANCELED)
 		error = 0;
 	if (*stat)
 		trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
@@ -2268,7 +2267,7 @@
  * Record a rmap intent; the list is kept sorted first by AG and then by
  * increasing age.
  */
-static int
+static void
 __xfs_rmap_add(
 	struct xfs_trans		*tp,
 	enum xfs_rmap_intent_type	type,
@@ -2287,7 +2286,7 @@
 			bmap->br_blockcount,
 			bmap->br_state);
 
-	ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS);
+	ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
 	INIT_LIST_HEAD(&ri->ri_list);
 	ri->ri_type = type;
 	ri->ri_owner = owner;
@@ -2295,11 +2294,10 @@
 	ri->ri_bmap = *bmap;
 
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
-	return 0;
 }
 
 /* Map an extent into a file. */
-int
+void
 xfs_rmap_map_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
@@ -2307,15 +2305,15 @@
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
-		return 0;
+		return;
 
-	return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+	__xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
 			XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
 			whichfork, PREV);
 }
 
 /* Unmap an extent out of a file. */
-int
+void
 xfs_rmap_unmap_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
@@ -2323,9 +2321,9 @@
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
-		return 0;
+		return;
 
-	return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+	__xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
 			XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
 			whichfork, PREV);
 }
@@ -2336,7 +2334,7 @@
  * Note that tp can be NULL here as no transaction is used for COW fork
  * unwritten conversion.
  */
-int
+void
 xfs_rmap_convert_extent(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
@@ -2345,15 +2343,15 @@
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_rmap_update_is_needed(mp, whichfork))
-		return 0;
+		return;
 
-	return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+	__xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
 			XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
 			whichfork, PREV);
 }
 
 /* Schedule the creation of an rmap for non-file data. */
-int
+void
 xfs_rmap_alloc_extent(
 	struct xfs_trans	*tp,
 	xfs_agnumber_t		agno,
@@ -2364,18 +2362,18 @@
 	struct xfs_bmbt_irec	bmap;
 
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
-		return 0;
+		return;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
 	bmap.br_blockcount = len;
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
 
-	return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+	__xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
 }
 
 /* Schedule the deletion of an rmap for non-file data. */
-int
+void
 xfs_rmap_free_extent(
 	struct xfs_trans	*tp,
 	xfs_agnumber_t		agno,
@@ -2386,14 +2384,14 @@
 	struct xfs_bmbt_irec	bmap;
 
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
-		return 0;
+		return;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
 	bmap.br_blockcount = len;
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
 
-	return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+	__xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
 }
 
 /* Compare rmap records.  Returns -1 if a < b, 1 if a > b, and 0 if equal. */
@@ -2511,7 +2509,7 @@
 	    ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
 		return 0;
 	rks->has_rmap = true;
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /*
@@ -2540,8 +2538,11 @@
 
 	error = xfs_rmap_query_range(cur, &low, &high,
 			xfs_rmap_has_other_keys_helper, &rks);
+	if (error < 0)
+		return error;
+
 	*has_rmap = rks.has_rmap;
-	return error;
+	return 0;
 }
 
 const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {

diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index e21ed02..abe6334 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h

@@ -68,6 +68,7 @@
 	if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
 		return -EFSCORRUPTED;
 	irec->rm_offset = XFS_RMAP_OFF(offset);
+	irec->rm_flags = 0;
 	if (offset & XFS_RMAP_OFF_ATTR_FORK)
 		irec->rm_flags |= XFS_RMAP_ATTR_FORK;
 	if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
@@ -161,16 +162,16 @@
 };
 
 /* functions for updating the rmapbt based on bmbt map/unmap operations */
-int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		int whichfork, struct xfs_bmbt_irec *imap);
-int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		int whichfork, struct xfs_bmbt_irec *imap);
-int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
+void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
 		struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *imap);
-int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
 		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
 		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
 
 void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,

diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index e0641b7..c45acbd 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h

@@ -177,10 +177,4 @@
 	unsigned int	agino_log;	/* #bits for agino in inum */
 };
 
-/* Keep iterating the data structure. */
-#define XFS_ITER_CONTINUE	(0)
-
-/* Stop iterating the data structure. */
-#define XFS_ITER_ABORT		(1)
-
 #endif /* __XFS_SHARED_H__ */

diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 802b34c..300b3e9 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h

@@ -169,6 +169,14 @@
 	xfs_exntst_t	br_state;	/* extent state */
 } xfs_bmbt_irec_t;
 
+/* per-AG block reservation types */
+enum xfs_ag_resv_type {
+	XFS_AG_RESV_NONE = 0,
+	XFS_AG_RESV_AGFL,
+	XFS_AG_RESV_METADATA,
+	XFS_AG_RESV_RMAPBT,
+};
+
 /*
  * Type verifier functions
  */

diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 16b09b9..ba0f747 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c

@@ -639,7 +639,7 @@
 	xchk_agfl_block_xref(sc, agbno);
 
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return XFS_ITER_ABORT;
+		return -ECANCELED;
 
 	return 0;
 }
@@ -730,7 +730,7 @@
 	/* Check the blocks in the AGFL. */
 	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
 			sc->sa.agfl_bp, xchk_agfl_block, &sai);
-	if (error == XFS_ITER_ABORT) {
+	if (error == -ECANCELED) {
 		error = 0;
 		goto out_free;
 	}

diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 1afc58b..0edc7f8 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c

@@ -80,7 +80,7 @@
 	 * without the inode lock held, which means we can sleep.
 	 */
 	if (sc->flags & XCHK_TRY_HARDER) {
-		error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP);
+		error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
 		if (error)
 			return error;
 	}
@@ -163,8 +163,6 @@
 	args.valuelen = valuelen;
 
 	error = xfs_attr_get_ilocked(context->dp, &args);
-	if (error == -EEXIST)
-		error = 0;
 	if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
 			&error))
 		goto fail_xref;
@@ -173,7 +171,7 @@
 					     args.blkno);
 fail_xref:
 	if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		context->seen_enough = XFS_ITER_ABORT;
+		context->seen_enough = 1;
 	return;
 }
 

diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 1bd29fd..fa6ea64 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c

@@ -75,6 +75,7 @@
 	xfs_fileoff_t		lastoff;
 	bool			is_rt;
 	bool			is_shared;
+	bool			was_loaded;
 	int			whichfork;
 };
 
@@ -213,25 +214,20 @@
 
 /* Cross-reference a single rtdev extent record. */
 STATIC void
-xchk_bmap_rt_extent_xref(
-	struct xchk_bmap_info	*info,
+xchk_bmap_rt_iextent_xref(
 	struct xfs_inode	*ip,
-	struct xfs_btree_cur	*cur,
+	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
-	if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return;
-
 	xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
 			irec->br_blockcount);
 }
 
 /* Cross-reference a single datadev extent record. */
 STATIC void
-xchk_bmap_extent_xref(
-	struct xchk_bmap_info	*info,
+xchk_bmap_iextent_xref(
 	struct xfs_inode	*ip,
-	struct xfs_btree_cur	*cur,
+	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
 	struct xfs_mount	*mp = info->sc->mp;
@@ -240,9 +236,6 @@
 	xfs_extlen_t		len;
 	int			error;
 
-	if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return;
-
 	agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
 	agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
 	len = irec->br_blockcount;
@@ -300,20 +293,15 @@
 
 /* Scrub a single extent record. */
 STATIC int
-xchk_bmap_extent(
+xchk_bmap_iextent(
 	struct xfs_inode	*ip,
-	struct xfs_btree_cur	*cur,
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
 	struct xfs_mount	*mp = info->sc->mp;
-	struct xfs_buf		*bp = NULL;
 	xfs_filblks_t		end;
 	int			error = 0;
 
-	if (cur)
-		xfs_btree_get_block(cur, 0, &bp);
-
 	/*
 	 * Check for out-of-order extents.  This record could have come
 	 * from the incore list, for which there is no ordering check.
@@ -364,10 +352,13 @@
 		xchk_fblock_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
+	if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
 	if (info->is_rt)
-		xchk_bmap_rt_extent_xref(info, ip, cur, irec);
+		xchk_bmap_rt_iextent_xref(ip, info, irec);
 	else
-		xchk_bmap_extent_xref(info, ip, cur, irec);
+		xchk_bmap_iextent_xref(ip, info, irec);
 
 	info->lastoff = irec->br_startoff + irec->br_blockcount;
 	return error;
@@ -380,10 +371,13 @@
 	union xfs_btree_rec	*rec)
 {
 	struct xfs_bmbt_irec	irec;
+	struct xfs_bmbt_irec	iext_irec;
+	struct xfs_iext_cursor	icur;
 	struct xchk_bmap_info	*info = bs->private;
 	struct xfs_inode	*ip = bs->cur->bc_private.b.ip;
 	struct xfs_buf		*bp = NULL;
 	struct xfs_btree_block	*block;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, info->whichfork);
 	uint64_t		owner;
 	int			i;
 
@@ -402,9 +396,26 @@
 		}
 	}
 
-	/* Set up the in-core record and scrub it. */
+	/*
+	 * Check that the incore extent tree contains an extent that matches
+	 * this one exactly.  We validate those cached bmaps later, so we don't
+	 * need to check them here.  If the incore extent tree was just loaded
+	 * from disk by the scrubber, we assume that its contents match what's
+	 * on disk (we still hold the ILOCK) and skip the equivalence check.
+	 */
+	if (!info->was_loaded)
+		return 0;
+
 	xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
-	return xchk_bmap_extent(ip, bs->cur, info, &irec);
+	if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
+				&iext_irec) ||
+	    irec.br_startoff != iext_irec.br_startoff ||
+	    irec.br_startblock != iext_irec.br_startblock ||
+	    irec.br_blockcount != iext_irec.br_blockcount ||
+	    irec.br_state != iext_irec.br_state)
+		xchk_fblock_set_corrupt(bs->sc, info->whichfork,
+				irec.br_startoff);
+	return 0;
 }
 
 /* Scan the btree records. */
@@ -415,15 +426,26 @@
 	struct xchk_bmap_info	*info)
 {
 	struct xfs_owner_info	oinfo;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(sc->ip, whichfork);
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_inode	*ip = sc->ip;
 	struct xfs_btree_cur	*cur;
 	int			error;
 
+	/* Load the incore bmap cache if it's not loaded. */
+	info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
+	if (!info->was_loaded) {
+		error = xfs_iread_extents(sc->tp, ip, whichfork);
+		if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
+			goto out;
+	}
+
+	/* Check the btree structure. */
 	cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
 	error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
 	xfs_btree_del_cursor(cur, error);
+out:
 	return error;
 }
 
@@ -500,7 +522,7 @@
 
 out:
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return XFS_BTREE_QUERY_RANGE_ABORT;
+		return -ECANCELED;
 	return 0;
 }
 
@@ -529,7 +551,7 @@
 	sbcri.sc = sc;
 	sbcri.whichfork = whichfork;
 	error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+	if (error == -ECANCELED)
 		error = 0;
 
 	xfs_btree_del_cursor(cur, error);
@@ -671,13 +693,6 @@
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 		goto out;
 
-	/* Now try to scrub the in-memory extent list. */
-        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(sc->tp, ip, whichfork);
-		if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
-			goto out;
-	}
-
 	/* Find the offset of the last extent in the mapping. */
 	error = xfs_bmap_last_offset(ip, &endoff, whichfork);
 	if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
@@ -689,7 +704,7 @@
 	for_each_xfs_iext(ifp, &icur, &irec) {
 		if (xchk_should_terminate(sc, &error) ||
 		    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
-			break;
+			goto out;
 		if (isnullstartblock(irec.br_startblock))
 			continue;
 		if (irec.br_startoff >= endoff) {
@@ -697,7 +712,7 @@
 					irec.br_startoff);
 			goto out;
 		}
-		error = xchk_bmap_extent(ip, NULL, &info, &irec);
+		error = xchk_bmap_iextent(ip, &info, &irec);
 		if (error)
 			goto out;
 	}

diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index fc3f510..98f82d7c 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c

@@ -125,7 +125,7 @@
 	struct xchk_fscounters	*fsc;
 	int			error;
 
-	sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP);
+	sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
 	if (!sc->buf)
 		return -ENOMEM;
 	fsc = sc->buf;

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4cfeec5..b70a88b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c

@@ -351,7 +351,7 @@
 	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
 	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
-	xfs_trans_log_buf(tp, bp, 0, bp->b_length);
+	xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
 	bp->b_ops = ops;
 	*bpp = bp;
 
@@ -664,7 +664,7 @@
 {
 	xfs_agblock_t		*agbno = priv;
 
-	return (*agbno == bno) ? XFS_ITER_ABORT : 0;
+	return (*agbno == bno) ? -ECANCELED : 0;
 }
 
 /* Does this block match the btree information passed in? */
@@ -694,7 +694,7 @@
 	if (owner == XFS_RMAP_OWN_AG) {
 		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
 				xrep_findroot_agfl_walk, &agbno);
-		if (error == XFS_ITER_ABORT)
+		if (error == -ECANCELED)
 			return 0;
 		if (error)
 			return error;

diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 99c0b12..5641ae5 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c

@@ -22,7 +22,7 @@
 	struct xfs_inode	*ip)
 {
 	/* Allocate the buffer without the inode lock held. */
-	sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP);
+	sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
 	if (!sc->buf)
 		return -ENOMEM;
 

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cbda40d..96d7071 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c

@@ -112,7 +112,7 @@
 {
 	struct xfs_inode *ip = XFS_I(inode);
 	struct posix_acl *acl = NULL;
-	struct xfs_acl *xfs_acl;
+	struct xfs_acl *xfs_acl = NULL;
 	unsigned char *ea_name;
 	int error;
 	int len;
@@ -135,12 +135,8 @@
 	 * go out to the disk.
 	 */
 	len = XFS_ACL_MAX_SIZE(ip->i_mount);
-	xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
-	if (!xfs_acl)
-		return ERR_PTR(-ENOMEM);
-
-	error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
-							&len, ATTR_ROOT);
+	error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
+				ATTR_ALLOC | ATTR_ROOT);
 	if (error) {
 		/*
 		 * If the attribute doesn't exist make sure we have a negative
@@ -151,8 +147,8 @@
 	} else  {
 		acl = xfs_acl_from_disk(xfs_acl, len,
 					XFS_ACL_MAX_ENTRIES(ip->i_mount));
+		kmem_free(xfs_acl);
 	}
-	kmem_free(xfs_acl);
 	return acl;
 }
 
@@ -180,7 +176,7 @@
 		struct xfs_acl *xfs_acl;
 		int len = XFS_ACL_MAX_SIZE(ip->i_mount);
 
-		xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+		xfs_acl = kmem_zalloc_large(len, 0);
 		if (!xfs_acl)
 			return -ENOMEM;
 

diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index dc93c51..a640a28 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c

@@ -147,7 +147,7 @@
 	 * Allocate storage for a list of all the "remote" value extents.
 	 */
 	size = count * sizeof(xfs_attr_inactive_list_t);
-	list = kmem_alloc(size, KM_SLEEP);
+	list = kmem_alloc(size, 0);
 
 	/*
 	 * Identify each of the "remote" value extents.

diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 58fc820..00758fd 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c

@@ -109,7 +109,7 @@
 	 * It didn't all fit, so we have to sort everything on hashval.
 	 */
 	sbsize = sf->hdr.count * sizeof(*sbuf);
-	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+	sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
 
 	/*
 	 * Scan the attribute list for the rest of the entries, storing

diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9fa4a7e..83d24e98 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c

@@ -141,7 +141,7 @@
 {
 	struct xfs_bui_log_item		*buip;
 
-	buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP);
+	buip = kmem_zone_zalloc(xfs_bui_zone, 0);
 
 	xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
 	buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -218,7 +218,7 @@
 {
 	struct xfs_bud_log_item		*budp;
 
-	budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
+	budp = kmem_zone_zalloc(xfs_bud_zone, 0);
 	xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
 			  &xfs_bud_item_ops);
 	budp->bud_buip = buip;
@@ -542,9 +542,7 @@
 		irec.br_blockcount = count;
 		irec.br_startoff = bmap->me_startoff;
 		irec.br_state = state;
-		error = xfs_bmap_unmap_extent(tp, ip, &irec);
-		if (error)
-			goto err_inode;
+		xfs_bmap_unmap_extent(tp, ip, &irec);
 	}
 
 	set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 98c6a7a..0910cb7 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c

@@ -39,9 +39,9 @@
 xfs_daddr_t
 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 {
-	return (XFS_IS_REALTIME_INODE(ip) ? \
-		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
-		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+	if (XFS_IS_REALTIME_INODE(ip))
+		return XFS_FSB_TO_BB(ip->i_mount, fsb);
+	return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
 }
 
 /*
@@ -1532,24 +1532,16 @@
 			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
 
 			/* Remove the mapping from the donor file. */
-			error = xfs_bmap_unmap_extent(tp, tip, &uirec);
-			if (error)
-				goto out;
+			xfs_bmap_unmap_extent(tp, tip, &uirec);
 
 			/* Remove the mapping from the source file. */
-			error = xfs_bmap_unmap_extent(tp, ip, &irec);
-			if (error)
-				goto out;
+			xfs_bmap_unmap_extent(tp, ip, &irec);
 
 			/* Map the donor file's blocks into the source file. */
-			error = xfs_bmap_map_extent(tp, ip, &uirec);
-			if (error)
-				goto out;
+			xfs_bmap_map_extent(tp, ip, &uirec);
 
 			/* Map the source file's blocks into the donor file. */
-			error = xfs_bmap_map_extent(tp, tip, &irec);
-			if (error)
-				goto out;
+			xfs_bmap_map_extent(tp, tip, &irec);
 
 			error = xfs_defer_finish(tpp);
 			tp = *tpp;

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ca08490..120ef99 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c

@@ -353,7 +353,8 @@
 	 */
 	size = BBTOB(bp->b_length);
 	if (size < PAGE_SIZE) {
-		bp->b_addr = kmem_alloc(size, KM_NOFS);
+		int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
+		bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
 		if (!bp->b_addr) {
 			/* low memory - use alloc_page loop instead */
 			goto use_alloc_page;
@@ -368,7 +369,7 @@
 		}
 		bp->b_offset = offset_in_page(bp->b_addr);
 		bp->b_pages = bp->b_page_array;
-		bp->b_pages[0] = virt_to_page(bp->b_addr);
+		bp->b_pages[0] = kmem_to_page(bp->b_addr);
 		bp->b_page_count = 1;
 		bp->b_flags |= _XBF_KMEM;
 		return 0;
@@ -1741,7 +1742,7 @@
 {
 	xfs_buftarg_t		*btp;
 
-	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
+	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
 
 	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;

diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c6e57a3..f6ce17d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h

@@ -350,6 +350,12 @@
 #define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
 
+static inline int
+xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
+{
+	return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
+}
+
 int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
 bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
 bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7dcaec5..d74fbd1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c

@@ -702,7 +702,7 @@
 	}
 
 	bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
-				KM_SLEEP);
+				0);
 	if (!bip->bli_formats)
 		return -ENOMEM;
 	return 0;
@@ -747,7 +747,7 @@
 		return 0;
 	}
 
-	bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
+	bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
 	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
 	bip->bli_buf = bp;
 

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index fb1ad44..aeb95e7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c

@@ -440,7 +440,7 @@
 {
 	struct xfs_dquot	*dqp;
 
-	dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
+	dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
 
 	dqp->dq_flags = type;
 	dqp->q_core.d_id = cpu_to_be32(id);
@@ -1239,7 +1239,7 @@
 /*
  * Iterate every dquot of a particular type.  The caller must ensure that the
  * particular quota type is active.  iter_fn can return negative error codes,
- * or XFS_ITER_ABORT to indicate that it wants to stop iterating.
+ * or -ECANCELED to indicate that it wants to stop iterating.
  */
 int
 xfs_qm_dqiterate(

diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 282ec5a..d60647d 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c

@@ -347,7 +347,7 @@
 {
 	struct xfs_qoff_logitem	*qf;
 
-	qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
+	qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
 
 	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
 			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);

diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 544c948..849fd447 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c

@@ -213,7 +213,7 @@
 	struct xfs_mount	*mp)
 {
 	mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
-			KM_SLEEP | KM_MAYFAIL);
+			KM_MAYFAIL);
 	if (!mp->m_errortag)
 		return -ENOMEM;
 

diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed6837..2183d87 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c

@@ -33,7 +33,7 @@
 	struct rb_node		**rbp;
 	struct rb_node		*parent = NULL;
 
-	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
+	new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
 	new->agno = agno;
 	new->bno = bno;
 	new->length = len;

diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 86f6512..e44efc4 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c

@@ -163,9 +163,9 @@
 	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efi_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efip = kmem_zalloc(size, KM_SLEEP);
+		efip = kmem_zalloc(size, 0);
 	} else {
-		efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
+		efip = kmem_zone_zalloc(xfs_efi_zone, 0);
 	}
 
 	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -333,9 +333,9 @@
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
 				(nextents - 1) * sizeof(struct xfs_extent),
-				KM_SLEEP);
+				0);
 	} else {
-		efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
+		efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
 	}
 
 	xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 28101bbc..d952d59 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c

@@ -28,6 +28,7 @@
 #include <linux/falloc.h>
 #include <linux/backing-dev.h>
 #include <linux/mman.h>
+#include <linux/fadvise.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -933,6 +934,30 @@
 	return error;
 }
 
+STATIC int
+xfs_file_fadvise(
+	struct file	*file,
+	loff_t		start,
+	loff_t		end,
+	int		advice)
+{
+	struct xfs_inode *ip = XFS_I(file_inode(file));
+	int ret;
+	int lockflags = 0;
+
+	/*
+	 * Operations creating pages in page cache need protection from hole
+	 * punching and similar ops
+	 */
+	if (advice == POSIX_FADV_WILLNEED) {
+		lockflags = XFS_IOLOCK_SHARED;
+		xfs_ilock(ip, lockflags);
+	}
+	ret = generic_fadvise(file, start, end, advice);
+	if (lockflags)
+		xfs_iunlock(ip, lockflags);
+	return ret;
+}
 
 STATIC loff_t
 xfs_file_remap_range(
@@ -1232,6 +1257,7 @@
 	.fsync		= xfs_file_fsync,
 	.get_unmapped_area = thp_get_unmapped_area,
 	.fallocate	= xfs_file_fallocate,
+	.fadvise	= xfs_file_fadvise,
 	.remap_file_range = xfs_file_remap_range,
 };
 

diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a8f964..d082143 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c

@@ -250,7 +250,7 @@
 		rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
 		if (info->next_daddr < rec_daddr)
 			info->next_daddr = rec_daddr;
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 	}
 
 	/* Are we just counting mappings? */
@@ -259,14 +259,14 @@
 			info->head->fmh_entries++;
 
 		if (info->last)
-			return XFS_BTREE_QUERY_RANGE_CONTINUE;
+			return 0;
 
 		info->head->fmh_entries++;
 
 		rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
 		if (info->next_daddr < rec_daddr)
 			info->next_daddr = rec_daddr;
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 	}
 
 	/*
@@ -276,7 +276,7 @@
 	 */
 	if (rec_daddr > info->next_daddr) {
 		if (info->head->fmh_entries >= info->head->fmh_count)
-			return XFS_BTREE_QUERY_RANGE_ABORT;
+			return -ECANCELED;
 
 		fmr.fmr_device = info->dev;
 		fmr.fmr_physical = info->next_daddr;
@@ -295,7 +295,7 @@
 
 	/* Fill out the extent we found */
 	if (info->head->fmh_entries >= info->head->fmh_count)
-		return XFS_BTREE_QUERY_RANGE_ABORT;
+		return -ECANCELED;
 
 	trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
 
@@ -328,7 +328,7 @@
 	rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
 	if (info->next_daddr < rec_daddr)
 		info->next_daddr = rec_daddr;
-	return XFS_BTREE_QUERY_RANGE_CONTINUE;
+	return 0;
 }
 
 /* Transform a rmapbt irec into a fsmap */

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0b0fd10..944add5 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c

@@ -40,7 +40,7 @@
 	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
 	 * code up to do this anyway.
 	 */
-	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	ip = kmem_zone_alloc(xfs_inode_zone, 0);
 	if (!ip)
 		return NULL;
 	if (inode_init_always(mp->m_super, VFS_I(ip))) {

diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d99a0a3..3ebd1b7 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c

@@ -89,7 +89,7 @@
 {
 	struct xfs_icreate_item	*icp;
 
-	icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+	icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
 
 	xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
 			  &xfs_icreate_item_ops);

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6467d5e..18f4b26 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c

@@ -2018,7 +2018,7 @@
 	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
 		return 0;
 
-	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+	iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
 	iu->iu_agino = prev_agino;
 	iu->iu_next_unlinked = this_agino;
 
@@ -3282,7 +3282,8 @@
 					spaceres);
 
 	/*
-	 * Set up the target.
+	 * Check for expected errors before we dirty the transaction
+	 * so we can return an error without a transaction abort.
 	 */
 	if (target_ip == NULL) {
 		/*
@@ -3294,6 +3295,46 @@
 			if (error)
 				goto out_trans_cancel;
 		}
+	} else {
+		/*
+		 * If target exists and it's a directory, check that whether
+		 * it can be destroyed.
+		 */
+		if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
+		    (!xfs_dir_isempty(target_ip) ||
+		     (VFS_I(target_ip)->i_nlink > 2))) {
+			error = -EEXIST;
+			goto out_trans_cancel;
+		}
+	}
+
+	/*
+	 * Directory entry creation below may acquire the AGF. Remove
+	 * the whiteout from the unlinked list first to preserve correct
+	 * AGI/AGF locking order. This dirties the transaction so failures
+	 * after this point will abort and log recovery will clean up the
+	 * mess.
+	 *
+	 * For whiteouts, we need to bump the link count on the whiteout
+	 * inode. After this point, we have a real link, clear the tmpfile
+	 * state flag from the inode so it doesn't accidentally get misused
+	 * in future.
+	 */
+	if (wip) {
+		ASSERT(VFS_I(wip)->i_nlink == 0);
+		error = xfs_iunlink_remove(tp, wip);
+		if (error)
+			goto out_trans_cancel;
+
+		xfs_bumplink(tp, wip);
+		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
+		VFS_I(wip)->i_state &= ~I_LINKABLE;
+	}
+
+	/*
+	 * Set up the target.
+	 */
+	if (target_ip == NULL) {
 		/*
 		 * If target does not exist and the rename crosses
 		 * directories, adjust the target directory link count
@@ -3312,22 +3353,6 @@
 		}
 	} else { /* target_ip != NULL */
 		/*
-		 * If target exists and it's a directory, check that both
-		 * target and source are directories and that target can be
-		 * destroyed, or that neither is a directory.
-		 */
-		if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
-			/*
-			 * Make sure target dir is empty.
-			 */
-			if (!(xfs_dir_isempty(target_ip)) ||
-			    (VFS_I(target_ip)->i_nlink > 2)) {
-				error = -EEXIST;
-				goto out_trans_cancel;
-			}
-		}
-
-		/*
 		 * Link the source inode under the target name.
 		 * If the source inode is a directory and we are moving
 		 * it across directories, its ".." entry will be
@@ -3417,30 +3442,6 @@
 	if (error)
 		goto out_trans_cancel;
 
-	/*
-	 * For whiteouts, we need to bump the link count on the whiteout inode.
-	 * This means that failures all the way up to this point leave the inode
-	 * on the unlinked list and so cleanup is a simple matter of dropping
-	 * the remaining reference to it. If we fail here after bumping the link
-	 * count, we're shutting down the filesystem so we'll never see the
-	 * intermediate state on disk.
-	 */
-	if (wip) {
-		ASSERT(VFS_I(wip)->i_nlink == 0);
-		xfs_bumplink(tp, wip);
-		error = xfs_iunlink_remove(tp, wip);
-		if (error)
-			goto out_trans_cancel;
-		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
-
-		/*
-		 * Now we have a real link, clear the "I'm a tmpfile" state
-		 * flag from the inode so it doesn't accidentally get misused in
-		 * future.
-		 */
-		VFS_I(wip)->i_state &= ~I_LINKABLE;
-	}
-
 	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
 	if (new_parent)

diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c9a502ee..bb8f076 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c

@@ -651,7 +651,7 @@
 	struct xfs_inode_log_item *iip;
 
 	ASSERT(ip->i_itemp == NULL);
-	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
+	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
 
 	iip->ili_inode = ip;
 	xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index affa557..d58f0d6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c

@@ -396,7 +396,7 @@
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
 	if (!kbuf)
 		goto out_dput;
 
@@ -434,11 +434,11 @@
 
 	if (*len > XFS_XATTR_SIZE_MAX)
 		return -EINVAL;
-	kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+	kbuf = kmem_zalloc_large(*len, 0);
 	if (!kbuf)
 		return -ENOMEM;
 
-	error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
+	error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
 	if (error)
 		goto out_kfree;
 
@@ -831,7 +831,7 @@
 /*
  * Check the incoming bulk request @hdr from userspace and initialize the
  * internal @breq bulk request appropriately.  Returns 0 if the bulk request
- * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual
+ * should proceed; -ECANCELED if there's nothing to do; or the usual
  * negative error code.
  */
 static int
@@ -889,13 +889,13 @@
 
 		/* Asking for an inode past the end of the AG?  We're done! */
 		if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
-			return XFS_ITER_ABORT;
+			return -ECANCELED;
 	} else if (hdr->agno)
 		return -EINVAL;
 
 	/* Asking for an inode past the end of the FS?  We're done! */
 	if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
-		return XFS_ITER_ABORT;
+		return -ECANCELED;
 
 	return 0;
 }
@@ -936,7 +936,7 @@
 		return -EFAULT;
 
 	error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
-	if (error == XFS_ITER_ABORT)
+	if (error == -ECANCELED)
 		goto out_teardown;
 	if (error < 0)
 		return error;
@@ -986,7 +986,7 @@
 		return -EFAULT;
 
 	error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
-	if (error == XFS_ITER_ABORT)
+	if (error == -ECANCELED)
 		goto out_teardown;
 	if (error < 0)
 		return error;
@@ -1038,6 +1038,10 @@
 
 	if (copy_from_user(&ageo, arg, sizeof(ageo)))
 		return -EFAULT;
+	if (ageo.ag_flags)
+		return -EINVAL;
+	if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
+		return -EINVAL;
 
 	error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
 	if (error)
@@ -1309,8 +1313,7 @@
 	if (fa->fsx_xflags & FS_XFLAG_DAX) {
 		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
 			return -EINVAL;
-		if (S_ISREG(inode->i_mode) &&
-		    !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+		if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
 				sb->s_blocksize))
 			return -EINVAL;
 	}
@@ -1881,7 +1884,7 @@
 	info.mp = ip->i_mount;
 	info.data = arg;
 	error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+	if (error == -ECANCELED) {
 		error = 0;
 		aborted = true;
 	} else if (error)

diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7bd7534..1e08bf7 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c

@@ -381,7 +381,7 @@
 		return PTR_ERR(dentry);
 
 	error = -ENOMEM;
-	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
 	if (!kbuf)
 		goto out_dput;
 

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3a4310d..f780e223 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c

@@ -58,7 +58,7 @@
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
-	if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
 		return xfs_alert_fsblock_zero(ip, imap);
 
 	if (imap->br_startblock == HOLESTARTBLOCK) {
@@ -297,7 +297,7 @@
 		goto out_unlock;
 	}
 
-	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
 		error = xfs_alert_fsblock_zero(ip, imap);
 
 out_unlock:
@@ -814,7 +814,7 @@
 		if (error)
 			return error;
 
-		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
+		if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
 			return xfs_alert_fsblock_zero(ip, &imap);
 
 		if ((numblks_fsb = imap.br_blockcount) == 0) {

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f5c955d..884950a 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c

@@ -137,7 +137,7 @@
 	xfs_irele(ip);
 
 	error = bc->formatter(bc->breq, buf);
-	if (error == XFS_IBULK_ABORT)
+	if (error == -ECANCELED)
 		goto out_advance;
 	if (error)
 		goto out;
@@ -169,7 +169,7 @@
 	ASSERT(breq->icount == 1);
 
 	bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
-			KM_SLEEP | KM_MAYFAIL);
+			KM_MAYFAIL);
 	if (!bc.buf)
 		return -ENOMEM;
 
@@ -181,7 +181,7 @@
 	 * If we reported one inode to userspace then we abort because we hit
 	 * the end of the buffer.  Don't leak that back to userspace.
 	 */
-	if (error == XFS_IWALK_ABORT)
+	if (error == -ECANCELED)
 		error = 0;
 
 	return error;
@@ -243,7 +243,7 @@
 		return 0;
 
 	bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
-			KM_SLEEP | KM_MAYFAIL);
+			KM_MAYFAIL);
 	if (!bc.buf)
 		return -ENOMEM;
 
@@ -342,7 +342,7 @@
 	int			error;
 
 	error = ic->formatter(ic->breq, &inogrp);
-	if (error && error != XFS_IBULK_ABORT)
+	if (error && error != -ECANCELED)
 		return error;
 
 	ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +

diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index e90c1fc..96a1e2a 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h

@@ -18,9 +18,6 @@
 /* Only iterate within the same AG as startino */
 #define XFS_IBULK_SAME_AG	(XFS_IWALK_SAME_AG)
 
-/* Return value that means we want to abort the walk. */
-#define XFS_IBULK_ABORT		(XFS_IWALK_ABORT)
-
 /*
  * Advance the user buffer pointer by one record of the given size.  If the
  * buffer is now full, return the appropriate error code.
@@ -34,13 +31,21 @@
 
 	breq->ubuffer = b + bytes;
 	breq->ocount++;
-	return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0;
+	return breq->ocount == breq->icount ? -ECANCELED : 0;
 }
 
 /*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
 
+/*
+ * Return codes for the formatter function are 0 to continue iterating, and
+ * non-zero to stop iterating.  Any non-zero value will be passed up to the
+ * bulkstat/inumbers caller.  The special value -ECANCELED can be used to stop
+ * iteration, as neither bulkstat nor inumbers will ever generate that error
+ * code on their own.
+ */
+
 typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
 		const struct xfs_bulkstat *bstat);
 

diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 8c7d7271..aa375cf 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c

@@ -31,7 +31,7 @@
  * inode it finds, it calls a walk function with the relevant inode number and
  * a pointer to caller-provided data.  The walk function can return the usual
  * negative error code to stop the iteration; 0 to continue the iteration; or
- * XFS_IWALK_ABORT to stop the iteration.  This return value is returned to the
+ * -ECANCELED to stop the iteration.  This return value is returned to the
  * caller.
  *
  * Internally, we allow the walk function to do anything, which means that we
@@ -616,7 +616,7 @@
 		if (xfs_pwork_ctl_want_abort(&pctl))
 			break;
 
-		iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
+		iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
 		iwag->mp = mp;
 		iwag->iwalk_fn = iwalk_fn;
 		iwag->data = data;

diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 6c960e1..37a795f 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h

@@ -6,12 +6,17 @@
 #ifndef __XFS_IWALK_H__
 #define __XFS_IWALK_H__
 
+/*
+ * Return codes for the inode/inobt walk function are 0 to continue iterating,
+ * and non-zero to stop iterating.  Any non-zero value will be passed up to the
+ * iwalk or inobt_walk caller.  The special value -ECANCELED can be used to
+ * stop iteration, as neither iwalk nor inobt_walk will ever generate that
+ * error code on their own.
+ */
+
 /* Walk all inodes in the filesystem starting from @startino. */
 typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
 			    xfs_ino_t ino, void *data);
-/* Return values for xfs_iwalk_fn. */
-#define XFS_IWALK_CONTINUE	(XFS_ITER_CONTINUE)
-#define XFS_IWALK_ABORT		(XFS_ITER_ABORT)
 
 int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
 		unsigned int flags, xfs_iwalk_fn iwalk_fn,
@@ -30,8 +35,6 @@
 				 xfs_agnumber_t agno,
 				 const struct xfs_inobt_rec_incore *irec,
 				 void *data);
-/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */
-#define XFS_INOBT_WALK_ABORT	(XFS_IWALK_ABORT)
 
 int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
 		xfs_ino_t startino, unsigned int flags,

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7fc3c1a..a2beee9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c

@@ -214,15 +214,42 @@
 {
 	struct xlog_ticket	*tic;
 	int			need_bytes;
+	bool			woken_task = false;
 
 	list_for_each_entry(tic, &head->waiters, t_queue) {
+
+		/*
+		 * There is a chance that the size of the CIL checkpoints in
+		 * progress at the last AIL push target calculation resulted in
+		 * limiting the target to the log head (l_last_sync_lsn) at the
+		 * time. This may not reflect where the log head is now as the
+		 * CIL checkpoints may have completed.
+		 *
+		 * Hence when we are woken here, it may be that the head of the
+		 * log that has moved rather than the tail. As the tail didn't
+		 * move, there still won't be space available for the
+		 * reservation we require.  However, if the AIL has already
+		 * pushed to the target defined by the old log head location, we
+		 * will hang here waiting for something else to update the AIL
+		 * push target.
+		 *
+		 * Therefore, if there isn't space to wake the first waiter on
+		 * the grant head, we need to push the AIL again to ensure the
+		 * target reflects both the current log tail and log head
+		 * position before we wait for the tail to move again.
+		 */
+
 		need_bytes = xlog_ticket_reservation(log, head, tic);
-		if (*free_bytes < need_bytes)
+		if (*free_bytes < need_bytes) {
+			if (!woken_task)
+				xlog_grant_push_ail(log, need_bytes);
 			return false;
+		}
 
 		*free_bytes -= need_bytes;
 		trace_xfs_log_grant_wake_up(log, tic);
 		wake_up_process(tic->t_task);
+		woken_task = true;
 	}
 
 	return true;
@@ -428,8 +455,7 @@
 	XFS_STATS_INC(mp, xs_try_logspace);
 
 	ASSERT(*ticp == NULL);
-	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
-				KM_SLEEP);
+	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
 	*ticp = tic;
 
 	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -1404,6 +1430,7 @@
 	 */
 	ASSERT(log->l_iclog_size >= 4096);
 	for (i = 0; i < log->l_iclog_bufs; i++) {
+		int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
 		size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
 				sizeof(struct bio_vec);
 
@@ -1415,8 +1442,8 @@
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
 
-		iclog->ic_data = kmem_alloc_large(log->l_iclog_size,
-				KM_MAYFAIL);
+		iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
+						KM_MAYFAIL);
 		if (!iclog->ic_data)
 			goto out_free_iclog;
 #ifdef DEBUG
@@ -2496,21 +2523,35 @@
  *****************************************************************************
  */
 
-/* Clean iclogs starting from the head.  This ordering must be
- * maintained, so an iclog doesn't become ACTIVE beyond one that
- * is SYNCING.  This is also required to maintain the notion that we use
- * a ordered wait queue to hold off would be writers to the log when every
- * iclog is trying to sync to disk.
+/*
+ * An iclog has just finished IO completion processing, so we need to update
+ * the iclog state and propagate that up into the overall log state. Hence we
+ * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
+ * starting from the head, and then wake up any threads that are waiting for the
+ * iclog to be marked clean.
  *
- * State Change: DIRTY -> ACTIVE
+ * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
+ * doesn't become ACTIVE beyond one that is SYNCING.  This is also required to
+ * maintain the notion that we use a ordered wait queue to hold off would be
+ * writers to the log when every iclog is trying to sync to disk.
+ *
+ * Caller must hold the icloglock before calling us.
+ *
+ * State Change: !IOERROR -> DIRTY -> ACTIVE
  */
 STATIC void
-xlog_state_clean_log(
-	struct xlog *log)
+xlog_state_clean_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*dirty_iclog)
 {
-	xlog_in_core_t	*iclog;
-	int changed = 0;
+	struct xlog_in_core	*iclog;
+	int			changed = 0;
 
+	/* Prepare the completed iclog. */
+	if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
+		dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+
+	/* Walk all the iclogs to update the ordered active state. */
 	iclog = log->l_iclog;
 	do {
 		if (iclog->ic_state == XLOG_STATE_DIRTY) {
@@ -2548,7 +2589,13 @@
 		iclog = iclog->ic_next;
 	} while (iclog != log->l_iclog);
 
-	/* log is locked when we are called */
+
+	/*
+	 * Wake up threads waiting in xfs_log_force() for the dirty iclog
+	 * to be cleaned.
+	 */
+	wake_up_all(&dirty_iclog->ic_force_wait);
+
 	/*
 	 * Change state for the dummy log recording.
 	 * We usually go to NEED. But we go to NEED2 if the changed indicates
@@ -2582,7 +2629,7 @@
 			ASSERT(0);
 		}
 	}
-}	/* xlog_state_clean_log */
+}
 
 STATIC xfs_lsn_t
 xlog_get_lowest_lsn(
@@ -2603,30 +2650,205 @@
 	return lowest_lsn;
 }
 
+/*
+ * Completion of a iclog IO does not imply that a transaction has completed, as
+ * transactions can be large enough to span many iclogs. We cannot change the
+ * tail of the log half way through a transaction as this may be the only
+ * transaction in the log and moving the tail to point to the middle of it
+ * will prevent recovery from finding the start of the transaction. Hence we
+ * should only update the last_sync_lsn if this iclog contains transaction
+ * completion callbacks on it.
+ *
+ * We have to do this before we drop the icloglock to ensure we are the only one
+ * that can update it.
+ *
+ * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
+ * the reservation grant head pushing. This is due to the fact that the push
+ * target is bound by the current last_sync_lsn value. Hence if we have a large
+ * amount of log space bound up in this committing transaction then the
+ * last_sync_lsn value may be the limiting factor preventing tail pushing from
+ * freeing space in the log. Hence once we've updated the last_sync_lsn we
+ * should push the AIL to ensure the push target (and hence the grant head) is
+ * no longer bound by the old log head location and can move forwards and make
+ * progress again.
+ */
+static void
+xlog_state_set_callback(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	xfs_lsn_t		header_lsn)
+{
+	iclog->ic_state = XLOG_STATE_CALLBACK;
+
+	ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+			   header_lsn) <= 0);
+
+	if (list_empty_careful(&iclog->ic_callbacks))
+		return;
+
+	atomic64_set(&log->l_last_sync_lsn, header_lsn);
+	xlog_grant_push_ail(log, 0);
+}
+
+/*
+ * Return true if we need to stop processing, false to continue to the next
+ * iclog. The caller will need to run callbacks if the iclog is returned in the
+ * XLOG_STATE_CALLBACK state.
+ */
+static bool
+xlog_state_iodone_process_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	struct xlog_in_core	*completed_iclog,
+	bool			*ioerror)
+{
+	xfs_lsn_t		lowest_lsn;
+	xfs_lsn_t		header_lsn;
+
+	/* Skip all iclogs in the ACTIVE & DIRTY states */
+	if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+		return false;
+
+	/*
+	 * Between marking a filesystem SHUTDOWN and stopping the log, we do
+	 * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
+	 * want things to go smoothly in case of just a SHUTDOWN  w/o a
+	 * LOG_IO_ERROR.
+	 */
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		*ioerror = true;
+		return false;
+	}
+
+	/*
+	 * Can only perform callbacks in order.  Since this iclog is not in the
+	 * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean
+	 * up.  If we set our iclog to DO_CALLBACK, we will not process it when
+	 * we retry since a previous iclog is in the CALLBACK and the state
+	 * cannot change since we are holding the l_icloglock.
+	 */
+	if (!(iclog->ic_state &
+			(XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
+		if (completed_iclog &&
+		    (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
+			completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
+		}
+		return true;
+	}
+
+	/*
+	 * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
+	 * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught
+	 * by the above if and are going to clean (i.e. we aren't doing their
+	 * callbacks) see the above if.
+	 *
+	 * We will do one more check here to see if we have chased our tail
+	 * around. If this is not the lowest lsn iclog, then we will leave it
+	 * for another completion to process.
+	 */
+	header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+	lowest_lsn = xlog_get_lowest_lsn(log);
+	if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
+		return false;
+
+	xlog_state_set_callback(log, iclog, header_lsn);
+	return false;
+
+}
+
+/*
+ * Keep processing entries in the iclog callback list until we come around and
+ * it is empty.  We need to atomically see that the list is empty and change the
+ * state to DIRTY so that we don't miss any more callbacks being added.
+ *
+ * This function is called with the icloglock held and returns with it held. We
+ * drop it while running callbacks, however, as holding it over thousands of
+ * callbacks is unnecessary and causes excessive contention if we do.
+ */
+static void
+xlog_state_do_iclog_callbacks(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	bool			aborted)
+{
+	spin_unlock(&log->l_icloglock);
+	spin_lock(&iclog->ic_callback_lock);
+	while (!list_empty(&iclog->ic_callbacks)) {
+		LIST_HEAD(tmp);
+
+		list_splice_init(&iclog->ic_callbacks, &tmp);
+
+		spin_unlock(&iclog->ic_callback_lock);
+		xlog_cil_process_committed(&tmp, aborted);
+		spin_lock(&iclog->ic_callback_lock);
+	}
+
+	/*
+	 * Pick up the icloglock while still holding the callback lock so we
+	 * serialise against anyone trying to add more callbacks to this iclog
+	 * now we've finished processing.
+	 */
+	spin_lock(&log->l_icloglock);
+	spin_unlock(&iclog->ic_callback_lock);
+}
+
+#ifdef DEBUG
+/*
+ * Make one last gasp attempt to see if iclogs are being left in limbo.  If the
+ * above loop finds an iclog earlier than the current iclog and in one of the
+ * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
+ * are deferred to the completion of the earlier iclog. Walk the iclogs in order
+ * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
+ * one of the syncing states.
+ *
+ * Note that SYNCING|IOERROR is a valid state so we cannot just check for
+ * ic_state == SYNCING.
+ */
+static void
+xlog_state_callback_check_state(
+	struct xlog		*log)
+{
+	struct xlog_in_core	*first_iclog = log->l_iclog;
+	struct xlog_in_core	*iclog = first_iclog;
+
+	do {
+		ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
+		/*
+		 * Terminate the loop if iclogs are found in states
+		 * which will cause other threads to clean up iclogs.
+		 *
+		 * SYNCING - i/o completion will go through logs
+		 * DONE_SYNC - interrupt thread should be waiting for
+		 *              l_icloglock
+		 * IOERROR - give up hope all ye who enter here
+		 */
+		if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+		    iclog->ic_state & XLOG_STATE_SYNCING ||
+		    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
+		    iclog->ic_state == XLOG_STATE_IOERROR )
+			break;
+		iclog = iclog->ic_next;
+	} while (first_iclog != iclog);
+}
+#else
+#define xlog_state_callback_check_state(l)	((void)0)
+#endif
+
 STATIC void
 xlog_state_do_callback(
 	struct xlog		*log,
 	bool			aborted,
 	struct xlog_in_core	*ciclog)
 {
-	xlog_in_core_t	   *iclog;
-	xlog_in_core_t	   *first_iclog;	/* used to know when we've
-						 * processed all iclogs once */
-	int		   flushcnt = 0;
-	xfs_lsn_t	   lowest_lsn;
-	int		   ioerrors;	/* counter: iclogs with errors */
-	int		   loopdidcallbacks; /* flag: inner loop did callbacks*/
-	int		   funcdidcallbacks; /* flag: function did callbacks */
-	int		   repeats;	/* for issuing console warnings if
-					 * looping too many times */
-	int		   wake = 0;
+	struct xlog_in_core	*iclog;
+	struct xlog_in_core	*first_iclog;
+	bool			did_callbacks = false;
+	bool			cycled_icloglock;
+	bool			ioerror;
+	int			flushcnt = 0;
+	int			repeats = 0;
 
 	spin_lock(&log->l_icloglock);
-	first_iclog = iclog = log->l_iclog;
-	ioerrors = 0;
-	funcdidcallbacks = 0;
-	repeats = 0;
-
 	do {
 		/*
 		 * Scan all iclogs starting with the one pointed to by the
@@ -2638,137 +2860,34 @@
 		 */
 		first_iclog = log->l_iclog;
 		iclog = log->l_iclog;
-		loopdidcallbacks = 0;
+		cycled_icloglock = false;
+		ioerror = false;
 		repeats++;
 
 		do {
+			if (xlog_state_iodone_process_iclog(log, iclog,
+							ciclog, &ioerror))
+				break;
 
-			/* skip all iclogs in the ACTIVE & DIRTY states */
-			if (iclog->ic_state &
-			    (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+			if (!(iclog->ic_state &
+			      (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
 				iclog = iclog->ic_next;
 				continue;
 			}
 
 			/*
-			 * Between marking a filesystem SHUTDOWN and stopping
-			 * the log, we do flush all iclogs to disk (if there
-			 * wasn't a log I/O error). So, we do want things to
-			 * go smoothly in case of just a SHUTDOWN  w/o a
-			 * LOG_IO_ERROR.
+			 * Running callbacks will drop the icloglock which means
+			 * we'll have to run at least one more complete loop.
 			 */
-			if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
-				/*
-				 * Can only perform callbacks in order.  Since
-				 * this iclog is not in the DONE_SYNC/
-				 * DO_CALLBACK state, we skip the rest and
-				 * just try to clean up.  If we set our iclog
-				 * to DO_CALLBACK, we will not process it when
-				 * we retry since a previous iclog is in the
-				 * CALLBACK and the state cannot change since
-				 * we are holding the l_icloglock.
-				 */
-				if (!(iclog->ic_state &
-					(XLOG_STATE_DONE_SYNC |
-						 XLOG_STATE_DO_CALLBACK))) {
-					if (ciclog && (ciclog->ic_state ==
-							XLOG_STATE_DONE_SYNC)) {
-						ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
-					}
-					break;
-				}
-				/*
-				 * We now have an iclog that is in either the
-				 * DO_CALLBACK or DONE_SYNC states. The other
-				 * states (WANT_SYNC, SYNCING, or CALLBACK were
-				 * caught by the above if and are going to
-				 * clean (i.e. we aren't doing their callbacks)
-				 * see the above if.
-				 */
+			cycled_icloglock = true;
+			xlog_state_do_iclog_callbacks(log, iclog, aborted);
 
-				/*
-				 * We will do one more check here to see if we
-				 * have chased our tail around.
-				 */
-
-				lowest_lsn = xlog_get_lowest_lsn(log);
-				if (lowest_lsn &&
-				    XFS_LSN_CMP(lowest_lsn,
-						be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
-					iclog = iclog->ic_next;
-					continue; /* Leave this iclog for
-						   * another thread */
-				}
-
-				iclog->ic_state = XLOG_STATE_CALLBACK;
-
-
-				/*
-				 * Completion of a iclog IO does not imply that
-				 * a transaction has completed, as transactions
-				 * can be large enough to span many iclogs. We
-				 * cannot change the tail of the log half way
-				 * through a transaction as this may be the only
-				 * transaction in the log and moving th etail to
-				 * point to the middle of it will prevent
-				 * recovery from finding the start of the
-				 * transaction. Hence we should only update the
-				 * last_sync_lsn if this iclog contains
-				 * transaction completion callbacks on it.
-				 *
-				 * We have to do this before we drop the
-				 * icloglock to ensure we are the only one that
-				 * can update it.
-				 */
-				ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
-					be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
-				if (!list_empty_careful(&iclog->ic_callbacks))
-					atomic64_set(&log->l_last_sync_lsn,
-						be64_to_cpu(iclog->ic_header.h_lsn));
-
-			} else
-				ioerrors++;
-
-			spin_unlock(&log->l_icloglock);
-
-			/*
-			 * Keep processing entries in the callback list until
-			 * we come around and it is empty.  We need to
-			 * atomically see that the list is empty and change the
-			 * state to DIRTY so that we don't miss any more
-			 * callbacks being added.
-			 */
-			spin_lock(&iclog->ic_callback_lock);
-			while (!list_empty(&iclog->ic_callbacks)) {
-				LIST_HEAD(tmp);
-
-				list_splice_init(&iclog->ic_callbacks, &tmp);
-
-				spin_unlock(&iclog->ic_callback_lock);
-				xlog_cil_process_committed(&tmp, aborted);
-				spin_lock(&iclog->ic_callback_lock);
-			}
-
-			loopdidcallbacks++;
-			funcdidcallbacks++;
-
-			spin_lock(&log->l_icloglock);
-			spin_unlock(&iclog->ic_callback_lock);
-			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
-				iclog->ic_state = XLOG_STATE_DIRTY;
-
-			/*
-			 * Transition from DIRTY to ACTIVE if applicable.
-			 * NOP if STATE_IOERROR.
-			 */
-			xlog_state_clean_log(log);
-
-			/* wake up threads waiting in xfs_log_force() */
-			wake_up_all(&iclog->ic_force_wait);
-
+			xlog_state_clean_iclog(log, iclog);
 			iclog = iclog->ic_next;
 		} while (first_iclog != iclog);
 
+		did_callbacks |= cycled_icloglock;
+
 		if (repeats > 5000) {
 			flushcnt += repeats;
 			repeats = 0;
@@ -2776,50 +2895,15 @@
 				"%s: possible infinite loop (%d iterations)",
 				__func__, flushcnt);
 		}
-	} while (!ioerrors && loopdidcallbacks);
+	} while (!ioerror && cycled_icloglock);
 
-#ifdef DEBUG
-	/*
-	 * Make one last gasp attempt to see if iclogs are being left in limbo.
-	 * If the above loop finds an iclog earlier than the current iclog and
-	 * in one of the syncing states, the current iclog is put into
-	 * DO_CALLBACK and the callbacks are deferred to the completion of the
-	 * earlier iclog. Walk the iclogs in order and make sure that no iclog
-	 * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
-	 * states.
-	 *
-	 * Note that SYNCING|IOABORT is a valid state so we cannot just check
-	 * for ic_state == SYNCING.
-	 */
-	if (funcdidcallbacks) {
-		first_iclog = iclog = log->l_iclog;
-		do {
-			ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
-			/*
-			 * Terminate the loop if iclogs are found in states
-			 * which will cause other threads to clean up iclogs.
-			 *
-			 * SYNCING - i/o completion will go through logs
-			 * DONE_SYNC - interrupt thread should be waiting for
-			 *              l_icloglock
-			 * IOERROR - give up hope all ye who enter here
-			 */
-			if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
-			    iclog->ic_state & XLOG_STATE_SYNCING ||
-			    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
-			    iclog->ic_state == XLOG_STATE_IOERROR )
-				break;
-			iclog = iclog->ic_next;
-		} while (first_iclog != iclog);
-	}
-#endif
+	if (did_callbacks)
+		xlog_state_callback_check_state(log);
 
 	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
-		wake = 1;
-	spin_unlock(&log->l_icloglock);
-
-	if (wake)
 		wake_up_all(&log->l_flush_wait);
+
+	spin_unlock(&log->l_icloglock);
 }
 
 
@@ -3919,7 +4003,9 @@
 	 * item committed callback functions will do this again under lock to
 	 * avoid races.
 	 */
+	spin_lock(&log->l_cilp->xc_push_lock);
 	wake_up_all(&log->l_cilp->xc_commit_wait);
+	spin_unlock(&log->l_cilp->xc_push_lock);
 	xlog_state_do_callback(log, true, NULL);
 
 #ifdef XFSERRORDEBUG

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index fa5602d..ef652abd 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c

@@ -38,7 +38,7 @@
 	struct xlog_ticket *tic;
 
 	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
-				KM_SLEEP|KM_NOFS);
+				KM_NOFS);
 
 	/*
 	 * set the current reservation to zero so we know to steal the basic
@@ -186,7 +186,7 @@
 			 */
 			kmem_free(lip->li_lv_shadow);
 
-			lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS);
+			lv = kmem_alloc_large(buf_size, KM_NOFS);
 			memset(lv, 0, xlog_cil_iovec_space(niovecs));
 
 			lv->lv_item = lip;
@@ -660,7 +660,7 @@
 	if (!cil)
 		return 0;
 
-	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
 	new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
 	down_write(&cil->xc_ctx_lock);
@@ -1179,11 +1179,11 @@
 	struct xfs_cil	*cil;
 	struct xfs_cil_ctx *ctx;
 
-	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+	cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
 	if (!cil)
 		return -ENOMEM;
 
-	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+	ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
 	if (!ctx) {
 		kmem_free(cil);
 		return -ENOMEM;

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13d1d3e..5083190 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c

@@ -97,6 +97,8 @@
 	struct xlog	*log,
 	int		nbblks)
 {
+	int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
+
 	/*
 	 * Pass log block 0 since we don't have an addr yet, buffer will be
 	 * verified on read.
@@ -125,7 +127,7 @@
 	if (nbblks > 1 && log->l_sectBBsize > 1)
 		nbblks += log->l_sectBBsize;
 	nbblks = round_up(nbblks, log->l_sectBBsize);
-	return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL);
+	return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
 }
 
 /*
@@ -1960,7 +1962,7 @@
 		}
 	}
 
-	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
 	bcp->bc_blkno = buf_f->blf_blkno;
 	bcp->bc_len = buf_f->blf_len;
 	bcp->bc_refcount = 1;
@@ -2930,7 +2932,7 @@
 	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
 		in_f = item->ri_buf[0].i_addr;
 	} else {
-		in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP);
+		in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
 		need_free = 1;
 		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
 		if (error)
@@ -4161,7 +4163,7 @@
 {
 	xlog_recover_item_t	*item;
 
-	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
+	item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
 	INIT_LIST_HEAD(&item->ri_list);
 	list_add_tail(&item->ri_list, head);
 }
@@ -4201,7 +4203,7 @@
 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
 
-	ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
+	ptr = kmem_realloc(old_ptr, len + old_len, 0);
 	memcpy(&ptr[old_len], dp, len);
 	item->ri_buf[item->ri_cnt-1].i_len += len;
 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4261,7 +4263,7 @@
 		return 0;
 	}
 
-	ptr = kmem_alloc(len, KM_SLEEP);
+	ptr = kmem_alloc(len, 0);
 	memcpy(ptr, dp, len);
 	in_f = (struct xfs_inode_log_format *)ptr;
 
@@ -4289,7 +4291,7 @@
 		item->ri_total = in_f->ilf_size;
 		item->ri_buf =
 			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
-				    KM_SLEEP);
+				    0);
 	}
 	ASSERT(item->ri_total > item->ri_cnt);
 	/* Description region is ri_buf[0] */
@@ -4423,7 +4425,7 @@
 	 * This is a new transaction so allocate a new recovery container to
 	 * hold the recovery ops that will follow.
 	 */
-	trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
+	trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
 	trans->r_log_tid = tid;
 	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
 	INIT_LIST_HEAD(&trans->r_itemq);
@@ -5022,16 +5024,27 @@
 }
 
 /*
- * xlog_iunlink_recover
+ * Recover AGI unlinked lists
  *
- * This is called during recovery to process any inodes which
- * we unlinked but not freed when the system crashed.  These
- * inodes will be on the lists in the AGI blocks.  What we do
- * here is scan all the AGIs and fully truncate and free any
- * inodes found on the lists.  Each inode is removed from the
- * lists when it has been fully truncated and is freed.  The
- * freeing of the inode and its removal from the list must be
- * atomic.
+ * This is called during recovery to process any inodes which we unlinked but
+ * not freed when the system crashed.  These inodes will be on the lists in the
+ * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
+ * any inodes found on the lists. Each inode is removed from the lists when it
+ * has been fully truncated and is freed. The freeing of the inode and its
+ * removal from the list must be atomic.
+ *
+ * If everything we touch in the agi processing loop is already in memory, this
+ * loop can hold the cpu for a long time. It runs without lock contention,
+ * memory allocation contention, the need wait for IO, etc, and so will run
+ * until we either run out of inodes to process, run low on memory or we run out
+ * of log space.
+ *
+ * This behaviour is bad for latency on single CPU and non-preemptible kernels,
+ * and can prevent other filesytem work (such as CIL pushes) from running. This
+ * can lead to deadlocks if the recovery process runs out of log reservation
+ * space. Hence we need to yield the CPU when there is other kernel work
+ * scheduled on this CPU to ensure other scheduled work can run without undue
+ * latency.
  */
 STATIC void
 xlog_recover_process_iunlinks(
@@ -5078,6 +5091,7 @@
 			while (agino != NULLAGINO) {
 				agino = xlog_recover_process_one_iunlink(mp,
 							agno, agino, bucket);
+				cond_resched();
 			}
 		}
 		xfs_buf_rele(agibp);
@@ -5527,7 +5541,7 @@
 	 */
 	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
 						 sizeof(struct list_head),
-						 KM_SLEEP);
+						 0);
 	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
 		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
 

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 322da69..ba5b6f3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c

@@ -82,7 +82,7 @@
 	if (hole < 0) {
 		xfs_uuid_table = kmem_realloc(xfs_uuid_table,
 			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
-			KM_SLEEP);
+			0);
 		hole = xfs_uuid_table_size++;
 	}
 	xfs_uuid_table[hole] = *uuid;
@@ -214,7 +214,7 @@
 
 		spin_lock(&mp->m_perag_lock);
 		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
-			BUG();
+			WARN_ON_ONCE(1);
 			spin_unlock(&mp->m_perag_lock);
 			radix_tree_preload_end();
 			error = -EEXIST;

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4adb683..fdb60e0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h

@@ -327,13 +327,6 @@
 }
 
 /* per-AG block reservation data structures*/
-enum xfs_ag_resv_type {
-	XFS_AG_RESV_NONE = 0,
-	XFS_AG_RESV_AGFL,
-	XFS_AG_RESV_METADATA,
-	XFS_AG_RESV_RMAPBT,
-};
-
 struct xfs_ag_resv {
 	/* number of blocks originally reserved here */
 	xfs_extlen_t			ar_orig_reserved;

diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 7473881..a06661d 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c

@@ -333,12 +333,12 @@
 	if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
 		return -EINVAL;
 
-	if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
+	if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
 		return -ENOMEM;
 
 	/* An extra list is needed to avoid reaping up to a grp_time early. */
 	mru->grp_count = grp_count + 1;
-	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
+	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
 
 	if (!mru->lists) {
 		err = -ENOMEM;

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5e7a37f..ecd8ce1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c

@@ -642,7 +642,7 @@
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
-	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
 
 	error = list_lru_init(&qinf->qi_lru);
 	if (error)
@@ -978,7 +978,7 @@
 	if (qip->i_d.di_nblocks == 0)
 		return 0;
 
-	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
 
 	lblkno = 0;
 	maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);

diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d8288aa..2328268 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c

@@ -144,9 +144,9 @@
 	ASSERT(nextents > 0);
 	if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
 		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
-				KM_SLEEP);
+				0);
 	else
-		cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP);
+		cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
 
 	xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
 	cuip->cui_format.cui_nextents = nextents;
@@ -223,7 +223,7 @@
 {
 	struct xfs_cud_log_item		*cudp;
 
-	cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
+	cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
 	xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
 			  &xfs_cud_item_ops);
 	cudp->cud_cuip = cuip;
@@ -555,26 +555,24 @@
 			irec.br_blockcount = new_len;
 			switch (type) {
 			case XFS_REFCOUNT_INCREASE:
-				error = xfs_refcount_increase_extent(tp, &irec);
+				xfs_refcount_increase_extent(tp, &irec);
 				break;
 			case XFS_REFCOUNT_DECREASE:
-				error = xfs_refcount_decrease_extent(tp, &irec);
+				xfs_refcount_decrease_extent(tp, &irec);
 				break;
 			case XFS_REFCOUNT_ALLOC_COW:
-				error = xfs_refcount_alloc_cow_extent(tp,
+				xfs_refcount_alloc_cow_extent(tp,
 						irec.br_startblock,
 						irec.br_blockcount);
 				break;
 			case XFS_REFCOUNT_FREE_COW:
-				error = xfs_refcount_free_cow_extent(tp,
+				xfs_refcount_free_cow_extent(tp,
 						irec.br_startblock,
 						irec.br_blockcount);
 				break;
 			default:
 				ASSERT(0);
 			}
-			if (error)
-				goto abort_error;
 			requeue_only = true;
 		}
 	}

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index edbe37b..0f08153 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c

@@ -495,10 +495,8 @@
 			ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
 
 			/* Free the CoW orphan record. */
-			error = xfs_refcount_free_cow_extent(*tpp,
-					del.br_startblock, del.br_blockcount);
-			if (error)
-				break;
+			xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
+					del.br_blockcount);
 
 			xfs_bmap_add_free(*tpp, del.br_startblock,
 					  del.br_blockcount, NULL);
@@ -675,15 +673,10 @@
 	trace_xfs_reflink_cow_remap(ip, &del);
 
 	/* Free the CoW orphan record. */
-	error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
-			del.br_blockcount);
-	if (error)
-		goto out_cancel;
+	xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
 
 	/* Map the new blocks into the data fork. */
-	error = xfs_bmap_map_extent(tp, ip, &del);
-	if (error)
-		goto out_cancel;
+	xfs_bmap_map_extent(tp, ip, &del);
 
 	/* Charge this new data fork mapping to the on-disk quota. */
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1070,14 +1063,10 @@
 				uirec.br_blockcount, uirec.br_startblock);
 
 		/* Update the refcount tree */
-		error = xfs_refcount_increase_extent(tp, &uirec);
-		if (error)
-			goto out_cancel;
+		xfs_refcount_increase_extent(tp, &uirec);
 
 		/* Map the new blocks into the data fork. */
-		error = xfs_bmap_map_extent(tp, ip, &uirec);
-		if (error)
-			goto out_cancel;
+		xfs_bmap_map_extent(tp, ip, &uirec);
 
 		/* Update quota accounting. */
 		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,

diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 77ed557..8939e0e 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c

@@ -142,9 +142,9 @@
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
-		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
 	else
-		ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
+		ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
 
 	xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
 	ruip->rui_format.rui_nextents = nextents;
@@ -244,7 +244,7 @@
 {
 	struct xfs_rud_log_item		*rudp;
 
-	rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP);
+	rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
 	xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
 			  &xfs_rud_item_ops);
 	rudp->rud_ruip = ruip;

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5fa4db3..4a48a8c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c

@@ -865,7 +865,7 @@
 	 * lower bound on the minimum level with any free extents. We can
 	 * continue without the cache if it couldn't be allocated.
 	 */
-	mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP);
+	mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
 	if (!mp->m_rsum_cache)
 		xfs_warn(mp, "could not allocate realtime summary cache");
 }
@@ -963,7 +963,7 @@
 	/*
 	 * Allocate a new (fake) mount/sb.
 	 */
-	nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
+	nmp = kmem_alloc(sizeof(*nmp), 0);
 	/*
 	 * Loop over the bitmap blocks.
 	 * We will do everything one bitmap block at a time.

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f945023..391b474 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c

@@ -818,7 +818,8 @@
 		goto out_destroy_buf;
 
 	mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
-			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+			WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
+			0, mp->m_fsname);
 	if (!mp->m_cil_workqueue)
 		goto out_destroy_unwritten;
 

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8094b19..eaae275 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h

@@ -23,6 +23,7 @@
 struct xlog_ticket;
 struct xlog_recover;
 struct xlog_recover_item;
+struct xlog_rec_header;
 struct xfs_buf_log_format;
 struct xfs_inode_log_format;
 struct xfs_bmbt_irec;
@@ -30,6 +31,10 @@
 struct xfs_refcount_irec;
 struct xfs_fsmap;
 struct xfs_rmap_irec;
+struct xfs_icreate_log;
+struct xfs_owner_info;
+struct xfs_trans_res;
+struct xfs_inobt_rec_incore;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -3575,6 +3580,35 @@
 		  __entry->nr_threads, __entry->pid)
 )
 
+DECLARE_EVENT_CLASS(xfs_kmem_class,
+	TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
+	TP_ARGS(size, flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(ssize_t, size)
+		__field(int, flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->size = size;
+		__entry->flags = flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("size %zd flags 0x%x caller %pS",
+		  __entry->size,
+		  __entry->flags,
+		  (char *)__entry->caller_ip)
+)
+
+#define DEFINE_KMEM_EVENT(name) \
+DEFINE_EVENT(xfs_kmem_class, name, \
+	TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
+	TP_ARGS(size, flags, caller_ip))
+DEFINE_KMEM_EVENT(kmem_alloc);
+DEFINE_KMEM_EVENT(kmem_alloc_io);
+DEFINE_KMEM_EVENT(kmem_alloc_large);
+DEFINE_KMEM_EVENT(kmem_realloc);
+DEFINE_KMEM_EVENT(kmem_zone_alloc);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d42a68d..f4795fd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c

@@ -90,7 +90,7 @@
 
 	trace_xfs_trans_dup(tp, _RET_IP_);
 
-	ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
 
 	/*
 	 * Initialize the new transaction structure.
@@ -263,7 +263,7 @@
 	 * GFP_NOFS allocation context so that we avoid lockdep false positives
 	 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
 	 */
-	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	tp = kmem_zone_zalloc(xfs_trans_zone, 0);
 	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_start_intwrite(mp->m_super);
 

diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 1027c9c..1645746 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c

@@ -863,7 +863,7 @@
 xfs_trans_alloc_dqinfo(
 	xfs_trans_t	*tp)
 {
-	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
+	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
 }
 
 void

diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 3123b5a..cb895b1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c

@@ -30,7 +30,7 @@
 		value = NULL;
 	}
 
-	error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+	error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
 	if (error)
 		return error;
 	return asize;

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae66481..ffe35d9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h

@@ -3543,6 +3543,8 @@
 /* mm/fadvise.c */
 extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 		       int advice);
+extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
+			   int advice);
 
 #if defined(CONFIG_IO_URING)
 extern struct sock *io_uring_get_socket(struct file *file);

diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd0..4f17c83 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c

@@ -27,8 +27,7 @@
  * deactivate the pages and clear PG_Referenced.
  */
 
-static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
-			   int advice)
+int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 {
 	struct inode *inode;
 	struct address_space *mapping;
@@ -178,6 +177,7 @@
 	}
 	return 0;
 }
+EXPORT_SYMBOL(generic_fadvise);
 
 int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 {

diff --git a/mm/madvise.c b/mm/madvise.c
index 968df3a..bac973b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c

@@ -14,6 +14,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
+#include <linux/fadvise.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
@@ -275,6 +276,7 @@
 			     unsigned long start, unsigned long end)
 {
 	struct file *file = vma->vm_file;
+	loff_t offset;
 
 	*prev = vma;
 #ifdef CONFIG_SWAP
@@ -298,12 +300,20 @@
 		return 0;
 	}
 
-	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-
-	force_page_cache_readahead(file->f_mapping, file, start, end - start);
+	/*
+	 * Filesystem's fadvise may need to take various locks.  We need to
+	 * explicitly grab a reference because the vma (and hence the
+	 * vma's reference to the file) can go away as soon as we drop
+	 * mmap_sem.
+	 */
+	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
+	get_file(file);
+	up_read(&current->mm->mmap_sem);
+	offset = (loff_t)(start - vma->vm_start)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
+	fput(file);
+	down_read(&current->mm->mmap_sem);
 	return 0;
 }
commit	b41dae061bbd722b9d7fa828f35d22035b218e18	[log] [tgz]
author	Linus Torvalds <torvalds@linux-foundation.org>	Wed Sep 18 18:32:43 2019 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	Wed Sep 18 18:32:43 2019 -0700
tree	a5c0bade0c3d221483b54204bfc47e4fdbf09316
parent	e6bc9de714972cac34daa1dc1567ee48a47a9342 [diff]
parent	14e15f1bcd738dc13dd7c1e78e4800e8bc577980 [diff]