From e2ac836307e346969737c60075fdb01bed1af503 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Jun 2018 23:24:38 -0700 Subject: [PATCH 01/11] xfs: simplify xfs_bmap_punch_delalloc_range Instead of using xfs_bmapi_read to find delalloc extents and then punch them out using xfs_bunmapi, opencode the loop to iterate over the extents and call xfs_bmap_del_extent_delay directly. This both simplifies the code and reduces the number of extent tree lookups required. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 85 ++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 53 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c35009a866995..c195b9d857af4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -685,12 +685,10 @@ xfs_getbmap( } /* - * dead simple method of punching delalyed allocation blocks from a range in - * the inode. Walks a block at a time so will be slow, but is only executed in - * rare error cases so the overhead is not critical. This will always punch out - * both the start and end blocks, even if the ranges only partially overlap - * them, so it is up to the caller to ensure that partial blocks are not - * passed in. + * Dead simple method of punching delalyed allocation blocks from a range in + * the inode. This will always punch out both the start and end blocks, even + * if the ranges only partially overlap them, so it is up to the caller to + * ensure that partial blocks are not passed in. */ int xfs_bmap_punch_delalloc_range( @@ -698,63 +696,44 @@ xfs_bmap_punch_delalloc_range( xfs_fileoff_t start_fsb, xfs_fileoff_t length) { - xfs_fileoff_t remaining = length; + struct xfs_ifork *ifp = &ip->i_df; + xfs_fileoff_t end_fsb = start_fsb + length; + struct xfs_bmbt_irec got, del; + struct xfs_iext_cursor icur; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - do { - int done; - xfs_bmbt_irec_t imap; - int nimaps = 1; - xfs_fsblock_t firstblock; - struct xfs_defer_ops dfops; + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... - */ - error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, - XFS_BMAPI_ENTIRE); + if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) + return 0; - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "Failed delalloc mapping lookup ino %lld fsb %lld.", - ip->i_ino, start_fsb); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_block; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_block; - } - WARN_ON(imap.br_blockcount == 0); + while (got.br_startoff + got.br_blockcount > start_fsb) { + del = got; + xfs_trim_extent(&del, start_fsb, length); /* - * Note: while we initialise the firstblock/dfops pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. + * A delete can push the cursor forward. Step back to the + * previous extent on non-delalloc or extents outside the + * target range. */ - xfs_defer_init(&dfops, &firstblock); - error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, - &dfops, &done); - if (error) - break; + if (!del.br_blockcount || + !isnullstartblock(del.br_startblock)) { + if (!xfs_iext_prev_extent(ifp, &icur, &got)) + break; + continue; + } - ASSERT(!xfs_defer_has_unfinished_work(&dfops)); -next_block: - start_fsb++; - remaining--; - } while(remaining > 0); + error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, + &got, &del); + if (error || !xfs_iext_get_extent(ifp, &icur, &got)) + break; + } return error; } From 23fcb3340d033d9f081e21e6c12c2db7eaa541d3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Jun 2018 23:25:57 -0700 Subject: [PATCH 02/11] xfs: More robust inode extent count validation When the inode is in extent format, it can't have more extents that fit in the inode fork. We don't currenty check this, and so this corruption goes unnoticed by the inode verifiers. This can lead to crashes operating on invalid in-memory structures. Attempts to access such a inode will now error out in the verifier rather than allowing modification operations to proceed. Reported-by: Wen Xu Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong [darrick: fix a typedef, add some braces and breaks to shut up compiler warnings] Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_format.h | 3 ++ fs/xfs/libxfs/xfs_inode_buf.c | 76 ++++++++++++++++++++++------------- 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 1c5a8aaf2bfce..7b4a43deb83e0 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -962,6 +962,9 @@ typedef enum xfs_dinode_fmt { XFS_DFORK_DSIZE(dip, mp) : \ XFS_DFORK_ASIZE(dip, mp)) +#define XFS_DFORK_MAXEXT(dip, mp, w) \ + (XFS_DFORK_SIZE(dip, mp, w) / sizeof(struct xfs_bmbt_rec)) + /* * Return pointers to the data or attribute forks. */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d38d724534c48..33dc34655ac3d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -374,6 +374,47 @@ xfs_log_dinode_to_disk( } } +static xfs_failaddr_t +xfs_dinode_verify_fork( + struct xfs_dinode *dip, + struct xfs_mount *mp, + int whichfork) +{ + uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + + switch (XFS_DFORK_FORMAT(dip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISREG(be16_to_cpu(dip->di_mode))) + return __this_address; + if (be64_to_cpu(dip->di_size) > + XFS_DFORK_SIZE(dip, mp, whichfork)) + return __this_address; + } + if (di_nextents) + return __this_address; + break; + case XFS_DINODE_FMT_EXTENTS: + if (di_nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork)) + return __this_address; + break; + case XFS_DINODE_FMT_BTREE: + if (whichfork == XFS_ATTR_FORK) { + if (di_nextents > MAXAEXTNUM) + return __this_address; + } else if (di_nextents > MAXEXTNUM) { + return __this_address; + } + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -441,24 +482,9 @@ xfs_dinode_verify( case S_IFREG: case S_IFLNK: case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: - /* - * no local regular files yet - */ - if (S_ISREG(mode)) - return __this_address; - if (di_size > XFS_DFORK_DSIZE(dip, mp)) - return __this_address; - if (dip->di_nextents) - return __this_address; - /* fall through */ - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - break; - default: - return __this_address; - } + fa = xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK); + if (fa) + return fa; break; case 0: /* Uninitialized inode ok. */ @@ -468,17 +494,9 @@ xfs_dinode_verify( } if (XFS_DFORK_Q(dip)) { - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - if (dip->di_anextents) - return __this_address; - /* fall through */ - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - break; - default: - return __this_address; - } + fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); + if (fa) + return fa; } else { /* * If there is no fork offset, this may be a freshly-made inode From e53946dbd31a21f4bef155f8febba556933d62bf Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Jun 2018 23:26:05 -0700 Subject: [PATCH 03/11] xfs: xfs_iflush_abort() can be called twice on cluster writeback failure When a corrupt inode is detected during xfs_iflush_cluster, we can get a shutdown ASSERT failure like this: XFS (pmem1): Metadata corruption detected at xfs_symlink_shortform_verify+0x5c/0xa0, inode 0x86627 data fork XFS (pmem1): Unmount and run xfs_repair XFS (pmem1): xfs_do_force_shutdown(0x8) called from line 3372 of file fs/xfs/xfs_inode.c. Return address = ffffffff814f4116 XFS (pmem1): Corruption of in-memory data detected. Shutting down filesystem XFS (pmem1): xfs_do_force_shutdown(0x1) called from line 222 of file fs/xfs/libxfs/xfs_defer.c. Return address = ffffffff814a8a88 XFS (pmem1): xfs_do_force_shutdown(0x1) called from line 222 of file fs/xfs/libxfs/xfs_defer.c. Return address = ffffffff814a8ef9 XFS (pmem1): Please umount the filesystem and rectify the problem(s) XFS: Assertion failed: xfs_isiflocked(ip), file: fs/xfs/xfs_inode.h, line: 258 ..... Call Trace: xfs_iflush_abort+0x10a/0x110 xfs_iflush+0xf3/0x390 xfs_inode_item_push+0x126/0x1e0 xfsaild+0x2c5/0x890 kthread+0x11c/0x140 ret_from_fork+0x24/0x30 Essentially, xfs_iflush_abort() has been called twice on the original inode that that was flushed. This happens because the inode has been flushed to teh buffer successfully via xfs_iflush_int(), and so when another inode is detected as corrupt in xfs_iflush_cluster, the buffer is marked stale and EIO, and iodone callbacks are run on it. Running the iodone callbacks walks across the original inode and calls xfs_iflush_abort() on it. When xfs_iflush_cluster() returns to xfs_iflush(), it runs the error path for that function, and that calls xfs_iflush_abort() on the inode a second time, leading to the above assert failure as the inode is not flush locked anymore. This bug has been there a long time. The simple fix would be to just avoid calling xfs_iflush_abort() in xfs_iflush() if we've got a failure from xfs_iflush_cluster(). However, xfs_iflush_cluster() has magic delwri buffer handling that means it may or may not have run IO completion on the buffer, and hence sometimes we have to call xfs_iflush_abort() from xfs_iflush(), and sometimes we shouldn't. After reading through all the error paths and the delwri buffer code, it's clear that the error handling in xfs_iflush_cluster() is unnecessary. If the buffer is delwri, it leaves it on the delwri list so that when the delwri list is submitted it sees a shutdown fliesystem in xfs_buf_submit() and that marks the buffer stale, EIO and runs IO completion. i.e. exactly what xfs+iflush_cluster() does when it's not a delwri buffer. Further, marking a buffer stale clears the _XBF_DELWRI_Q flag on the buffer, which means when submission of the buffer occurs, it just skips over it and releases it. IOWs, the error handling in xfs_iflush_cluster doesn't need to care if the buffer is already on a the delwri queue or not - it just needs to mark the buffer stale, EIO and run completions. That means we can just use the easy fix for xfs_iflush() to avoid the double abort. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 57 +++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7a96c4e0ab5c6..5df4de666cc11 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3236,7 +3236,6 @@ xfs_iflush_cluster( struct xfs_inode *cip; int nr_found; int clcount = 0; - int bufwasdelwri; int i; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -3360,37 +3359,22 @@ xfs_iflush_cluster( * inode buffer and shut down the filesystem. */ rcu_read_unlock(); - /* - * Clean up the buffer. If it was delwri, just release it -- - * brelse can handle it with no problems. If not, shut down the - * filesystem before releasing the buffer. - */ - bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); - if (bufwasdelwri) - xfs_buf_relse(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if (!bufwasdelwri) { - /* - * Just like incore_relse: if we have b_iodone functions, - * mark the buffer as an error and call them. Otherwise - * mark it as stale and brelse. - */ - if (bp->b_iodone) { - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - } else { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - } - } - /* - * Unlocks the flush lock + * We'll always have an inode attached to the buffer for completion + * process by the time we are called from xfs_iflush(). Hence we have + * always need to do IO completion processing to abort the inodes + * attached to the buffer. handle them just like the shutdown case in + * xfs_buf_submit(). */ + ASSERT(bp->b_iodone); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); + + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(cip, false); kmem_free(cilist); xfs_perag_put(pag); @@ -3486,12 +3470,17 @@ xfs_iflush( xfs_log_force(mp, 0); /* - * inode clustering: - * see if other inodes can be gathered into this write + * inode clustering: try to gather other inodes into this write + * + * Note: Any error during clustering will result in the filesystem + * being shut down and completion callbacks run on the cluster buffer. + * As we have already flushed and attached this inode to the buffer, + * it has already been aborted and released by xfs_iflush_cluster() and + * so we have no further error handling to do here. */ error = xfs_iflush_cluster(ip, bp); if (error) - goto cluster_corrupt_out; + return error; *bpp = bp; return 0; @@ -3500,12 +3489,8 @@ xfs_iflush( if (bp) xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -cluster_corrupt_out: - error = -EFSCORRUPTED; abort_out: - /* - * Unlocks the flush lock - */ + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(ip, false); return error; } From 10ee25268e1f8475905e1deb85bb83627dca561e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Jun 2018 23:26:55 -0700 Subject: [PATCH 04/11] xfs: allow empty transactions while frozen In commit e89c041338ed6ef ("xfs: implement the GETFSMAP ioctl") we created the ability to obtain empty transactions. These transactions have no log or block reservations and therefore can't modify anything. Since they're also NO_WRITECOUNT they can run while the fs is frozen, so we don't need to WARN_ON about that usage. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index e040af120b69b..524f543c5b820 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -258,7 +258,12 @@ xfs_trans_alloc( if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + /* + * Zero-reservation ("empty") transactions can't modify anything, so + * they're allowed to run while we're frozen. + */ + WARN_ON(resp->tr_logres > 0 && + mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, From aafe12cee0b132824f5187987f8a3fb704b9f685 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Jun 2018 23:26:56 -0700 Subject: [PATCH 05/11] xfs: don't trip over negative free space in xfs_reserve_blocks If we somehow end up with a filesystem that has fewer free blocks than the blocks set aside to avoid ENOSPC deadlocks, it's possible that the free space calculation in xfs_reserve_blocks will spit out a negative number (because percpu_counter_sum returns s64). We fail to notice this negative number and set fdblks_delta to it. Now we increment fdblocks(!) and the unsigned type of m_resblks means that we end up setting a ridiculously huge m_resblks reservation. Avoid this comedy of errors by detecting the negative free space and returning -ENOSPC. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_fsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a7afcad6b7114..3f2bd6032cf86 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -387,7 +387,7 @@ xfs_reserve_blocks( do { free = percpu_counter_sum(&mp->m_fdblocks) - mp->m_alloc_set_aside; - if (!free) + if (free <= 0) break; delta = request - mp->m_resblks; From f62cb48e43195f66c7a40bbfcf11531fc1ff8999 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Jun 2018 23:26:57 -0700 Subject: [PATCH 06/11] xfs: don't allow insert-range to shift extents past the maximum offset Zorro Lang reports that generic/485 blows an assert on a filesystem with 512 byte blocks. The test tries to fallocate a post-eof extent at the maximum file size and calls insert range to shift the extents right by two blocks. On a 512b block filesystem this causes startoff to overflow the 54-bit startoff field, leading to the assert. Therefore, always check the rightmost extent to see if it would overflow prior to invoking the insert range machinery. Reported-by: zlang@redhat.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200137 Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 26 ++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_bmap.h | 2 ++ fs/xfs/libxfs/xfs_format.h | 2 ++ fs/xfs/xfs_bmap_util.c | 4 ++++ 4 files changed, 34 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 01628f0c9a0c2..7205268b30bc5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5780,6 +5780,32 @@ xfs_bmap_collapse_extents( return error; } +/* Make sure we won't be right-shifting an extent past the maximum bound. */ +int +xfs_bmap_can_insert_extents( + struct xfs_inode *ip, + xfs_fileoff_t off, + xfs_fileoff_t shift) +{ + struct xfs_bmbt_irec got; + int is_empty; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_last_extent(NULL, ip, XFS_DATA_FORK, &got, &is_empty); + if (!error && !is_empty && got.br_startoff >= off && + ((got.br_startoff + shift) & BMBT_STARTOFF_MASK) < got.br_startoff) + error = -EINVAL; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + int xfs_bmap_insert_extents( struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 99dddbd0fcc6c..9b49ddf99c411 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -227,6 +227,8 @@ int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops); +int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, + xfs_fileoff_t shift); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 7b4a43deb83e0..059bc44c27e83 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1529,6 +1529,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTBLOCK_BITLEN 52 #define BMBT_BLOCKCOUNT_BITLEN 21 +#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) + typedef struct xfs_bmbt_rec { __be64 l0, l1; } xfs_bmbt_rec_t; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c195b9d857af4..bb417156e3bf7 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1383,6 +1383,10 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); + error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); + if (error) + return error; + error = xfs_prepare_shift(ip, offset); if (error) return error; From 5bd88d153998c1b189fdeb8b8bd4cce36b5acf62 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Jun 2018 23:26:57 -0700 Subject: [PATCH 07/11] xfs: recheck reflink state after grabbing ILOCK_SHARED for a write The reflink iflag could have changed since the earlier unlocked check, so if we got ILOCK_SHARED for a write and but we're now a reflink inode we have to switch to ILOCK_EXCL and relock. This helps us avoid blowing lock assertions in things like generic/166: XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_EXCL), file: fs/xfs/xfs_reflink.c, line: 383 WARNING: CPU: 1 PID: 24707 at fs/xfs/xfs_message.c:104 assfail+0x25/0x30 [xfs] Modules linked in: deadline_iosched dm_snapshot dm_bufio ext4 mbcache jbd2 dm_flakey xfs libcrc32c dax_pmem device_dax nd_pmem sch_fq_codel af_packet [last unloaded: scsi_debug] CPU: 1 PID: 24707 Comm: xfs_io Not tainted 4.18.0-rc1-djw #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-1ubuntu1 04/01/2014 RIP: 0010:assfail+0x25/0x30 [xfs] Code: ff 0f 0b c3 90 66 66 66 66 90 48 89 f1 41 89 d0 48 c7 c6 e8 ef 1b a0 48 89 fa 31 ff e8 54 f9 ff ff 80 3d fd ba 0f 00 00 75 03 <0f> 0b c3 0f 0b 66 0f 1f 44 00 00 66 66 66 66 90 48 63 f6 49 89 f9 RSP: 0018:ffffc90006423ad8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880030b65e80 RCX: 0000000000000000 RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffffa01b0447 RBP: ffffc90006423c10 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88003d43fc30 R11: f000000000000000 R12: ffff880077cda000 R13: 0000000000000000 R14: ffffc90006423c30 R15: ffffc90006423bf9 FS: 00007feba8986800(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000138ab58 CR3: 000000003d40a000 CR4: 00000000000006a0 Call Trace: xfs_reflink_allocate_cow+0x24c/0x3d0 [xfs] xfs_file_iomap_begin+0x6d2/0xeb0 [xfs] ? iomap_to_fiemap+0x80/0x80 iomap_apply+0x5e/0x130 iomap_dio_rw+0x2e0/0x400 ? iomap_to_fiemap+0x80/0x80 ? xfs_file_dio_aio_write+0x133/0x4a0 [xfs] xfs_file_dio_aio_write+0x133/0x4a0 [xfs] xfs_file_write_iter+0x7b/0xb0 [xfs] __vfs_write+0x16f/0x1f0 vfs_write+0xc8/0x1c0 ksys_pwrite64+0x74/0x90 do_syscall_64+0x56/0x180 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iomap.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 49f5492eed3bd..55876dd02f0c8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -963,12 +963,13 @@ xfs_ilock_for_iomap( unsigned *lockmode) { unsigned mode = XFS_ILOCK_SHARED; + bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) { + if (xfs_is_reflink_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -989,6 +990,7 @@ xfs_ilock_for_iomap( mode = XFS_ILOCK_EXCL; } +relock: if (flags & IOMAP_NOWAIT) { if (!xfs_ilock_nowait(ip, mode)) return -EAGAIN; @@ -996,6 +998,17 @@ xfs_ilock_for_iomap( xfs_ilock(ip, mode); } + /* + * The reflink iflag could have changed since the earlier unlocked + * check, so if we got ILOCK_SHARED for a write and but we're now a + * reflink inode we have to switch to ILOCK_EXCL and relock. + */ + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + xfs_iunlock(ip, mode); + mode = XFS_ILOCK_EXCL; + goto relock; + } + *lockmode = mode; return 0; } From 232d0a24b0fc197c50e95fe65f236027b5fa1b74 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Jun 2018 23:26:57 -0700 Subject: [PATCH 08/11] xfs: fix uninitialized field in rtbitmap fsmap backend Initialize the extent count field of the high key so that when we use the high key to synthesize an 'unknown owner' record (i.e. used space record) at the end of the queried range we have a field with which to compute rm_blockcount. This is not strictly necessary because the synthesizer never uses the rm_blockcount field, but we can shut up the static code analysis anyway. Coverity-id: 1437358 Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_fsmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index c34fa9c342f25..c7157bc48bd19 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -513,8 +513,8 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_trans *tp, struct xfs_getfsmap_info *info) { - struct xfs_rtalloc_rec alow; - struct xfs_rtalloc_rec ahigh; + struct xfs_rtalloc_rec alow = { 0 }; + struct xfs_rtalloc_rec ahigh = { 0 }; int error; xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); From a3a374bf1889b1b401b25e6aada3ca4151a99d15 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Jun 2018 23:26:58 -0700 Subject: [PATCH 09/11] xfs: fix off-by-one error in xfs_rtalloc_query_range In commit 8ad560d2565e6 ("xfs: strengthen rtalloc query range checks") we strengthened the input parameter checks in the rtbitmap range query function, but introduced an off-by-one error in the process. The call to xfs_rtfind_forw deals with the high key being rextents, but we clamp the high key to rextents - 1. This causes the returned results to stop one block short of the end of the rtdev, which is incorrect. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 65fc4ed2e9a10..b228c821bae68 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1029,8 +1029,8 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - if (high_rec->ar_startext >= mp->m_sb.sb_rextents) - high_rec->ar_startext = mp->m_sb.sb_rextents - 1; + if (high_rec->ar_startext > mp->m_sb.sb_rextents) + high_rec->ar_startext = mp->m_sb.sb_rextents; /* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; From e53c4b598372001a13901b77649dc1b4afec3e85 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Jun 2018 23:26:58 -0700 Subject: [PATCH 10/11] xfs: ensure post-EOF zeroing happens after zeroing part of a file If a user asks us to zero_range part of a file, the end of the range is EOF, and not aligned to a page boundary, invoke writeback of the EOF page to ensure that the post-EOF part of the page is zeroed. This ensures that we don't expose stale memory contents via mmap, if in a clumsy manner. Found by running generic/127 when it runs zero_range and mapread at EOF one after the other. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Allison Henderson --- fs/xfs/xfs_bmap_util.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index bb417156e3bf7..83b1e8c6c18f9 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1187,7 +1187,22 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + if (error) + return error; + + /* + * If we zeroed right up to EOF and EOF straddles a page boundary we + * must make sure that the post-EOF area is also zeroed because the + * page could be mmap'd and iomap_zero_range doesn't do that for us. + * Writeback of the eof page will do this, albeit clumsily. + */ + if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + (offset + len) & ~PAGE_MASK, LLONG_MAX); + } + + return error; } /* From d8cb5e42378918e00eda58792f3ab86dd1e7d214 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Jun 2018 23:26:56 -0700 Subject: [PATCH 11/11] xfs: fix fdblocks accounting w/ RMAPBT per-AG reservation In __xfs_ag_resv_init we incorrectly calculate the amount by which to decrease fdblocks when reserving blocks for the rmapbt. Because rmapbt allocations do not decrease fdblocks, we must decrease fdblocks by the entire size of the requested reservation in order to achieve our goal of always having enough free blocks to satisfy an rmapbt expansion. This is in contrast to the refcountbt/finobt, which /do/ subtract from fdblocks whenever they allocate a block. For this allocation type we preserve the existing behavior where we decrease fdblocks only by the requested reservation minus the size of the existing tree. This fixes the problem where the available block counts reported by statfs change across a remount if there had been an rmapbt size change since mount time. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson --- fs/xfs/libxfs/xfs_ag_resv.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 84db76e0e3e3c..fecd187fcf2c3 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -157,6 +157,7 @@ __xfs_ag_resv_free( error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); resv->ar_reserved = 0; resv->ar_asked = 0; + resv->ar_orig_reserved = 0; if (error) trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, @@ -189,13 +190,34 @@ __xfs_ag_resv_init( struct xfs_mount *mp = pag->pag_mount; struct xfs_ag_resv *resv; int error; - xfs_extlen_t reserved; + xfs_extlen_t hidden_space; if (used > ask) ask = used; - reserved = ask - used; - error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); + switch (type) { + case XFS_AG_RESV_RMAPBT: + /* + * Space taken by the rmapbt is not subtracted from fdblocks + * because the rmapbt lives in the free space. Here we must + * subtract the entire reservation from fdblocks so that we + * always have blocks available for rmapbt expansion. + */ + hidden_space = ask; + break; + case XFS_AG_RESV_METADATA: + /* + * Space taken by all other metadata btrees are accounted + * on-disk as used space. We therefore only hide the space + * that is reserved but not used by the trees. + */ + hidden_space = ask - used; + break; + default: + ASSERT(0); + return -EINVAL; + } + error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); @@ -216,7 +238,8 @@ __xfs_ag_resv_init( resv = xfs_perag_resv(pag, type); resv->ar_asked = ask; - resv->ar_reserved = resv->ar_orig_reserved = reserved; + resv->ar_orig_reserved = hidden_space; + resv->ar_reserved = ask - used; trace_xfs_ag_resv_init(pag, type, ask); return 0;