xfs: use per-AG reservations for the finobt

Currently we try to rely on the global reserved block pool for block
allocations for the free inode btree, but I have customer reports
(fairly complex workload, need to find an easier reproducer) where that
is not enough: the AG in which we free an inode that requires a new
finobt block is entirely full.  This forces us to cancel a dirty
transaction and thus shut down the file system.

I think the right way to guard against this is to treat the finobt the
same way as the refcount btree and have a per-AG reservation for its
possible worst-case size, and the patch below implements that.
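
For a sense of the numbers, here is a minimal standalone sketch (plain C;
the helper name and the example geometry are made up, and this is not the
kernel implementation) of the pessimistic sizing the patch obtains from
xfs_btree_calc_size() with the finobt's minrecs limits, i.e. a tree in
which every block is only minimally full:

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch (not kernel code): worst-case btree block count for
 * nrecs records, assuming every block holds only its minimum number of
 * entries (minrecs[0] for leaves, minrecs[1] for interior nodes).
 */
static uint64_t worst_case_btree_blocks(const unsigned minrecs[2],
					uint64_t nrecs)
{
	uint64_t blocks = 0;
	unsigned per_block = minrecs[0];

	while (nrecs > 1) {
		/* blocks needed at this level, rounding up */
		nrecs = (nrecs + per_block - 1) / per_block;
		blocks += nrecs;
		per_block = minrecs[1];
	}
	return blocks;
}

int main(void)
{
	/*
	 * Made-up example geometry: an AG of 1048576 4k blocks with 32
	 * inodes per block; one finobt record covers a 64-inode chunk
	 * (XFS_INODES_PER_CHUNK), which is the record cap the patch
	 * computes in xfs_inobt_max_size() below.
	 */
	const unsigned minrecs[2] = { 127, 126 };
	uint64_t max_recs = (uint64_t)1048576 * 32 / 64;

	printf("worst-case finobt blocks: %llu\n",
	       (unsigned long long)worst_case_btree_blocks(minrecs, max_recs));
	return 0;
}

With this example geometry the sketch prints 4163 blocks for 524288
possible finobt records per AG; that per-AG worst case is what the
reservation hides from regular allocations until the tree actually grows.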

Note that this could increase mount times with large finobt trees.  In
an ideal world we would have added a field for the number of finobt
blocks to the AGI, similar to what we did for the refcount blocks.
We should add it the next time we rev the AGI or AGF format by adding
new fields.
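
Purely to illustrate that remark (the struct and field below are invented
for this note; they are not part of this patch or of any on-disk format),
such a rev would let the mount-time reservation read a single field
instead of walking the whole tree:

/*
 * Hypothetical sketch only -- invented here, not in this patch and not
 * an on-disk format.  A revved AGI could record the finobt block count
 * so that xfs_finobt_calc_reserves() need not walk the tree at mount
 * time.
 */
struct xfs_agi_rev_sketch {
	/* ... all existing AGI fields ... */
	__be32		agi_fino_blocks;	/* finobt blocks (invented) */
};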

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
commit 76d771b4cb (parent 4dfa2b8411)
Author:    Christoph Hellwig <hch@lst.de>, 2017-01-25 07:49:35 -08:00
Committer: Darrick J. Wong <darrick.wong@oracle.com>
5 changed files with 144 additions and 20 deletions

fs/xfs/libxfs/xfs_ag_resv.c

@@ -39,6 +39,7 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_btree.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_ialloc_btree.h"
 
 /*
  * Per-AG Block Reservations
@@ -210,6 +211,9 @@ __xfs_ag_resv_init(
 	if (error) {
 		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
 				error, _RET_IP_);
+		xfs_warn(mp,
+"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
+				pag->pag_agno);
 		return error;
 	}
 
@@ -228,6 +232,8 @@ int
 xfs_ag_resv_init(
 	struct xfs_perag		*pag)
 {
+	struct xfs_mount		*mp = pag->pag_mount;
+	xfs_agnumber_t			agno = pag->pag_agno;
 	xfs_extlen_t			ask;
 	xfs_extlen_t			used;
 	int				error = 0;
@@ -236,23 +242,45 @@ xfs_ag_resv_init(
 	if (pag->pag_meta_resv.ar_asked == 0) {
 		ask = used = 0;
-		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
-				pag->pag_agno, &ask, &used);
+
+		error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
+		if (error)
+			goto out;
+
+		error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
 
 		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 				ask, used);
-		if (error)
-			goto out;
+		if (error) {
+			/*
+			 * Because we didn't have per-AG reservations when the
+			 * finobt feature was added we might not be able to
+			 * reserve all needed blocks.  Warn and fall back to the
+			 * old and potentially buggy code in that case, but
+			 * ensure we do have the reservation for the refcountbt.
+			 */
+			ask = used = 0;
+			mp->m_inotbt_nores = true;
+			error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
+					&used);
+			if (error)
+				goto out;
+			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+					ask, used);
+			if (error)
+				goto out;
+		}
 	}
 
 	/* Create the AGFL metadata reservation */
 	if (pag->pag_agfl_resv.ar_asked == 0) {
 		ask = used = 0;
-		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
-				&ask, &used);
+		error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
@@ -261,9 +289,16 @@
 			goto out;
 	}
 
+#ifdef DEBUG
+	/* need to read in the AGF for the ASSERT below to work */
+	error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
+	if (error)
+		return error;
+
 	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
 	       xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
 	       pag->pagf_freeblks + pag->pagf_flcount);
+#endif
 out:
 	return error;
 }
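
A note on the mechanism this file implements: each *_calc_reserves()
helper adds its worst case into *ask and its current tree size into
*used, and the reservation then hides ask - used free blocks from
regular allocation, consuming one reserved block whenever the tree
actually grows.  Below is a toy standalone model of that bookkeeping
(plain C, made-up names, illustrative only; the used > ask clamp is an
assumption about __xfs_ag_resv_init()'s behavior, which is not shown in
these hunks):

#include <stdio.h>

/*
 * Toy model (not kernel code) of a per-AG metadata reservation: "ask"
 * is the worst-case tree size, "used" the blocks the tree occupies
 * today, and ask - used free blocks stay hidden from normal allocation.
 */
struct toy_ag_resv {
	unsigned	ask;
	unsigned	used;
	unsigned	reserved;	/* hidden free blocks */
};

static void toy_resv_init(struct toy_ag_resv *r, unsigned ask, unsigned used)
{
	if (used > ask)		/* tree already larger than the estimate */
		ask = used;
	r->ask = ask;
	r->used = used;
	r->reserved = ask - used;
}

/* A block allocated for the tree is paid for out of the reservation. */
static void toy_resv_alloc_block(struct toy_ag_resv *r)
{
	if (r->reserved)
		r->reserved--;
	r->used++;
}

int main(void)
{
	struct toy_ag_resv r;

	toy_resv_init(&r, 4163, 12);	/* worst case 4163, 12 in the tree */
	toy_resv_alloc_block(&r);	/* the finobt grew by one block */
	printf("hidden free blocks now: %u\n", r.reserved);	/* 4150 */
	return 0;
}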

fs/xfs/libxfs/xfs_ialloc_btree.c

@@ -82,11 +82,12 @@ xfs_finobt_set_root(
 }
 
 STATIC int
-xfs_inobt_alloc_block(
+__xfs_inobt_alloc_block(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_ptr	*start,
 	union xfs_btree_ptr	*new,
-	int			*stat)
+	int			*stat,
+	enum xfs_ag_resv_type	resv)
 {
 	xfs_alloc_arg_t		args;	/* block allocation args */
 	int			error;	/* error return value */
@@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
 	args.maxlen = 1;
 	args.prod = 1;
 	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.resv = resv;
 
 	error = xfs_alloc_vextent(&args);
 	if (error) {
@@ -122,6 +124,27 @@
 	return 0;
 }
 
+STATIC int
+xfs_inobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	return __xfs_inobt_alloc_block(cur, start, new, stat,
+			XFS_AG_RESV_METADATA);
+}
+
 STATIC int
 xfs_inobt_free_block(
 	struct xfs_btree_cur	*cur,
@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
 	.dup_cursor		= xfs_inobt_dup_cursor,
 	.set_root		= xfs_finobt_set_root,
-	.alloc_block		= xfs_inobt_alloc_block,
+	.alloc_block		= xfs_finobt_alloc_block,
 	.free_block		= xfs_inobt_free_block,
 	.get_minrecs		= xfs_inobt_get_minrecs,
 	.get_maxrecs		= xfs_inobt_get_maxrecs,
@@ -480,3 +503,64 @@ xfs_inobt_rec_check_count(
 	return 0;
 }
 #endif	/* DEBUG */
+
+static xfs_extlen_t
+xfs_inobt_max_size(
+	struct xfs_mount	*mp)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_inobt_mxr[0] == 0)
+		return 0;
+
+	return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
+		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
+				XFS_INODES_PER_CHUNK);
+}
+
+static int
+xfs_inobt_count_blocks(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum,
+	xfs_extlen_t		*tree_blocks)
+{
+	struct xfs_buf		*agbp;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+	if (error)
+		return error;
+
+	cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
+	error = xfs_btree_count_blocks(cur, tree_blocks);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+
+	return error;
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_finobt_calc_reserves(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		*ask,
+	xfs_extlen_t		*used)
+{
+	xfs_extlen_t		tree_len = 0;
+	int			error;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return 0;
+
+	error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
+	if (error)
+		return error;
+
+	*ask += xfs_inobt_max_size(mp);
+	*used += tree_len;
+	return 0;
+}

fs/xfs/libxfs/xfs_ialloc_btree.h

@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
 #define xfs_inobt_rec_check_count(mp, rec)	0
 #endif	/* DEBUG */
 
+int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_extlen_t *ask, xfs_extlen_t *used);
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */

fs/xfs/xfs_inode.c

@@ -1792,22 +1792,23 @@ xfs_inactive_ifree(
 	int			error;
 
 	/*
-	 * The ifree transaction might need to allocate blocks for record
-	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
-	 * allow ifree to dip into the reserved block pool if necessary.
-	 *
-	 * Freeing large sets of inodes generally means freeing inode chunks,
-	 * directory and file data blocks, so this should be relatively safe.
-	 * Only under severe circumstances should it be possible to free enough
-	 * inodes to exhaust the reserve block pool via finobt expansion while
-	 * at the same time not creating free space in the filesystem.
+	 * We try to use a per-AG reservation for any block needed by the finobt
+	 * tree, but as the finobt feature predates the per-AG reservation
+	 * support a degraded file system might not have enough space for the
+	 * reservation at mount time. In that case try to dip into the reserved
+	 * pool and pray.
 	 *
 	 * Send a warning if the reservation does happen to fail, as the inode
 	 * now remains allocated and sits on the unlinked list until the fs is
 	 * repaired.
 	 */
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
-			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+	if (unlikely(mp->m_inotbt_nores)) {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
+				&tp);
+	} else {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
+	}
 	if (error) {
 		if (error == -ENOSPC) {
 			xfs_warn_ratelimited(mp,
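
For background on the fallback path above, a toy standalone model (plain
C with made-up names, not kernel code): XFS_TRANS_RESERVE lets a
transaction dip into the small global reserve pool once normal free
space is exhausted.  Note why that was never a complete fix, and why
this patch exists: the pool is global, so a successful reservation still
cannot produce blocks inside the one AG that is full, which is exactly
where a new finobt block must be allocated.

#include <errno.h>
#include <stdio.h>

/*
 * Toy model of transaction block reservation.  An ordinary transaction
 * fails with ENOSPC when free blocks run out; a RESERVE-style one may
 * also consume the small emergency pool.  (In real XFS the pool is
 * global, so it cannot help when one specific AG is full.)
 */
struct toy_fs {
	unsigned long	fdblocks;	/* normal free blocks */
	unsigned long	resblocks;	/* global reserve pool */
};

static int toy_trans_reserve(struct toy_fs *fs, unsigned long want,
			     int may_use_reserve)
{
	if (fs->fdblocks >= want) {
		fs->fdblocks -= want;
		return 0;
	}
	if (may_use_reserve && fs->resblocks >= want) {
		fs->resblocks -= want;	/* dip into the emergency pool */
		return 0;
	}
	return -ENOSPC;
}

int main(void)
{
	struct toy_fs fs = { .fdblocks = 0, .resblocks = 1024 };

	printf("plain:   %d\n", toy_trans_reserve(&fs, 8, 0)); /* -ENOSPC */
	printf("reserve: %d\n", toy_trans_reserve(&fs, 8, 1)); /* 0 */
	return 0;
}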

fs/xfs/xfs_mount.h

@@ -140,6 +140,7 @@ typedef struct xfs_mount {
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
 	__uint64_t		m_flags;	/* global mount flags */
+	bool			m_inotbt_nores; /* no per-AG finobt resv. */
 	int			m_ialloc_inos;	/* inodes in inode allocation */
 	int			m_ialloc_blks;	/* blocks in inode allocation */
 	int			m_ialloc_min_blks;/* min blocks in sparse inode