* [PATCH 0/2] Context sensitive memory shrinker support @ 2010-04-13 0:24 Dave Chinner 2010-04-13 0:24 ` [PATCH 1/2] mm: add context argument to shrinker callback Dave Chinner 2010-04-13 0:24 ` [PATCH 2/2] xfs: add a shrinker to background inode reclaim Dave Chinner 0 siblings, 2 replies; 18+ messages in thread From: Dave Chinner @ 2010-04-13 0:24 UTC (permalink / raw) To: linux-kernel; +Cc: linux-mm, linux-fsdevel, xfs Recently I made the XFS inode reclaim operate entirely in the background for both clean and dirty inodes as it simplified the code a lot and is somewhat more efficient. Unfortunately, there are some workloads where the background reclaim is not freeing memory fast enough, so the reclaim needs an extra push when memory is low. The inode caches are per-filesystem on XFS, so to make effective use of the shrinker callbacks when memory is low, we need a context to be passed through the shrinker to give us the filesystem context to run the reclaim from. The two patches introduce the shrinker context and implement the XFS inode reclaim shrinkers. ^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-13 0:24 [PATCH 0/2] Context sensitive memory shrinker support Dave Chinner @ 2010-04-13 0:24 ` Dave Chinner 2010-04-13 8:17 ` KOSAKI Motohiro ` (2 more replies) 2010-04-13 0:24 ` [PATCH 2/2] xfs: add a shrinker to background inode reclaim Dave Chinner 1 sibling, 3 replies; 18+ messages in thread From: Dave Chinner @ 2010-04-13 0:24 UTC (permalink / raw) To: linux-kernel; +Cc: linux-mm, linux-fsdevel, xfs From: Dave Chinner <dchinner@redhat.com> The current shrinker implementation requires the registered callback to have global state to work from. This makes it difficult to shrink caches that are not global (e.g. per-filesystem caches). Add a context argument to the shrinker callback so that it can easily be used in such situations. Signed-off-by: Dave Chinner <dchinner@redhat.com> --- arch/x86/kvm/mmu.c | 2 +- drivers/gpu/drm/i915/i915_gem.c | 2 +- fs/dcache.c | 2 +- fs/gfs2/glock.c | 2 +- fs/gfs2/quota.c | 2 +- fs/gfs2/quota.h | 2 +- fs/inode.c | 2 +- fs/mbcache.c | 5 +++-- fs/nfs/dir.c | 2 +- fs/nfs/internal.h | 2 +- fs/quota/dquot.c | 2 +- fs/ubifs/shrinker.c | 2 +- fs/ubifs/ubifs.h | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 5 +++-- fs/xfs/quota/xfs_qm.c | 7 +++++-- include/linux/mm.h | 13 +++++++------ mm/vmscan.c | 9 ++++++--- 17 files changed, 36 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 48aeee8..d8ecb5b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2915,7 +2915,7 @@ static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) kvm_mmu_zap_page(kvm, page); } -static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) +static int mmu_shrink(void *ctx, int nr_to_scan, gfp_t gfp_mask) { struct kvm *kvm; struct kvm *kvm_freed = NULL; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 368d726..ed94bd6 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -5058,7 +5058,7 @@ void 
i915_gem_release(struct drm_device * dev, struct drm_file *file_priv) } static int -i915_gem_shrink(int nr_to_scan, gfp_t gfp_mask) +i915_gem_shrink(void *ctx, int nr_to_scan, gfp_t gfp_mask) { drm_i915_private_t *dev_priv, *next_dev; struct drm_i915_gem_object *obj_priv, *next_obj; diff --git a/fs/dcache.c b/fs/dcache.c index f1358e5..fdfe379 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -897,7 +897,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(void *ctx, int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 454d4b4..9969572 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1345,7 +1345,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(void *ctx, int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6dbcbad..3e3156a 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(void *ctx, int nr, gfp_t gfp_mask) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 195f60c..6218c00 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(void *ctx, int nr, gfp_t gfp_mask); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/inode.c b/fs/inode.c index 407bf39..b47e48a 100644 --- a/fs/inode.c +++ b/fs/inode.c 
@@ -514,7 +514,7 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, gfp_t gfp_mask) +static int shrink_icache_memory(void *ctx, int nr, gfp_t gfp_mask) { if (nr) { /* diff --git a/fs/mbcache.c b/fs/mbcache.c index ec88ff3..725ea66 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache) * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(void *ctx, int nr_to_scan, gfp_t gfp_mask); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -192,12 +192,13 @@ forget: * gets low. * * @nr_to_scan: Number of objects to scan + * @ctx: (ignored) * @gfp_mask: (ignored) * * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(void *ctx, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free_list); struct list_head *l, *ltmp; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c6f2750..8869f61 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1667,7 +1667,7 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry) smp_mb__after_atomic_dec(); } -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(void *ctx, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); struct nfs_inode *nfsi; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 11f82f0..ea55452 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -205,7 +205,7 @@ extern struct rpc_procinfo nfs4_procedures[]; void nfs_close_context(struct nfs_open_context *ctx, int is_sync); /* dir.c */ -extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); +extern int nfs_access_cache_shrinker(void *ctx, int nr_to_scan, gfp_t gfp_mask); /* 
inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index e0b870f..883bbfd 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -670,7 +670,7 @@ static void prune_dqcache(int count) * more memory */ -static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dqcache_memory(void *ctx, int nr, gfp_t gfp_mask) { if (nr) { spin_lock(&dq_list_lock); diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 02feb59..8ba73bf 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,7 +277,7 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(int nr, gfp_t gfp_mask) +int ubifs_shrinker(void *ctx, int nr, gfp_t gfp_mask) { int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index bd2542d..7244260 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); +int ubifs_shrinker(void *ctx, int nr_to_scan, gfp_t gfp_mask); /* commit.c */ int ubifs_bg_thread(void *info); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 44c2b0e..0df4b2e 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -44,7 +44,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(int, gfp_t); +STATIC int xfsbufd_wakeup(void *ctx, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct shrinker xfs_buf_shake = { .shrink = xfsbufd_wakeup, @@ -339,7 +339,7 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(0, gfp_mask); + xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -1756,6 +1756,7 @@ xfs_buf_runall_queues( STATIC int 
xfsbufd_wakeup( + void *ctx, int priority, gfp_t mask) { diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 417e61e..82ac964 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -72,7 +72,7 @@ STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(int, gfp_t); +STATIC int xfs_qm_shake(void *, int, gfp_t); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -2088,7 +2088,10 @@ xfs_qm_shake_freelist( */ /* ARGSUSED */ STATIC int -xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) +xfs_qm_shake( + void *ctx, + int nr_to_scan, + gfp_t gfp_mask) { int ndqused, nfree, n; diff --git a/include/linux/mm.h b/include/linux/mm.h index e70f21b..7d48942 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -982,11 +982,11 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) /* * A callback you can register to apply pressure to ageable caches. * - * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should - * look through the least-recently-used 'nr_to_scan' entries and - * attempt to free them up. It should return the number of objects - * which remain in the cache. If it returns -1, it means it cannot do - * any scanning at this time (eg. there is a risk of deadlock). + * 'shrink' is passed a context 'ctx', a count 'nr_to_scan' and a 'gfpmask'. + * It should look through the least-recently-used 'nr_to_scan' entries and + * attempt to free them up. It should return the number of objects which + * remain in the cache. If it returns -1, it means it cannot do any scanning + * at this time (eg. there is a risk of deadlock). * * The 'gfpmask' refers to the allocation we are currently trying to * fulfil. @@ -995,7 +995,8 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) * querying the cache size, so a fastpath for that case is appropriate. 
*/ struct shrinker { - int (*shrink)(int nr_to_scan, gfp_t gfp_mask); + int (*shrink)(void *ctx, int nr_to_scan, gfp_t gfp_mask); + void *ctx; /* user callback context */ int seeks; /* seeks to recreate an obj */ /* These are for internal use */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 5321ac4..40f27d2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -215,8 +215,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); + unsigned long max_pass; + max_pass = (*shrinker->shrink)(shrinker->ctx, 0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); @@ -244,8 +245,10 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; - nr_before = (*shrinker->shrink)(0, gfp_mask); - shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); + nr_before = (*shrinker->shrink)(shrinker->ctx, + 0, gfp_mask); + shrink_ret = (*shrinker->shrink)(shrinker->ctx, + this_scan, gfp_mask); if (shrink_ret == -1) break; if (shrink_ret < nr_before) -- 1.6.5 ^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-13 0:24 ` [PATCH 1/2] mm: add context argument to shrinker callback Dave Chinner @ 2010-04-13 8:17 ` KOSAKI Motohiro 2010-04-18 0:15 ` Christoph Hellwig 2010-04-28 9:39 ` Avi Kivity 2 siblings, 0 replies; 18+ messages in thread From: KOSAKI Motohiro @ 2010-04-13 8:17 UTC (permalink / raw) To: Dave Chinner; +Cc: kosaki.motohiro, linux-kernel, linux-mm, linux-fsdevel, xfs > diff --git a/include/linux/mm.h b/include/linux/mm.h > index e70f21b..7d48942 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -982,11 +982,11 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) > /* > * A callback you can register to apply pressure to ageable caches. > * > - * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should > - * look through the least-recently-used 'nr_to_scan' entries and > - * attempt to free them up. It should return the number of objects > - * which remain in the cache. If it returns -1, it means it cannot do > - * any scanning at this time (eg. there is a risk of deadlock). > + * 'shrink' is passed a context 'ctx', a count 'nr_to_scan' and a 'gfpmask'. > + * It should look through the least-recently-used 'nr_to_scan' entries and > + * attempt to free them up. It should return the number of objects which > + * remain in the cache. If it returns -1, it means it cannot do any scanning > + * at this time (eg. there is a risk of deadlock). > * > * The 'gfpmask' refers to the allocation we are currently trying to > * fulfil. > @@ -995,7 +995,8 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) > * querying the cache size, so a fastpath for that case is appropriate. 
> */ > struct shrinker { > - int (*shrink)(int nr_to_scan, gfp_t gfp_mask); > + int (*shrink)(void *ctx, int nr_to_scan, gfp_t gfp_mask); > + void *ctx; /* user callback context */ > int seeks; /* seeks to recreate an obj */ > > /* These are for internal use */ > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 5321ac4..40f27d2 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -215,8 +215,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, > list_for_each_entry(shrinker, &shrinker_list, list) { > unsigned long long delta; > unsigned long total_scan; > - unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); > + unsigned long max_pass; > > + max_pass = (*shrinker->shrink)(shrinker->ctx, 0, gfp_mask); > delta = (4 * scanned) / shrinker->seeks; > delta *= max_pass; > do_div(delta, lru_pages + 1); > @@ -244,8 +245,10 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, > int shrink_ret; > int nr_before; > > - nr_before = (*shrinker->shrink)(0, gfp_mask); > - shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); > + nr_before = (*shrinker->shrink)(shrinker->ctx, > + 0, gfp_mask); > + shrink_ret = (*shrinker->shrink)(shrinker->ctx, > + this_scan, gfp_mask); > if (shrink_ret == -1) > break; > if (shrink_ret < nr_before) Looks good about this mm part. Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> off-topic: shrink_slab() was introduced for page/[id]-cache scan balancing at first. now it still have hardcorded shrinker->nr calculation for slab although now lots another subsystem using it. shrinker->seeks seems no intuitive knob. probably we should try generalization it in future. but it is another story. I think this patch provide good first step. delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); shrinker->nr += delta; Thanks. ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-13 0:24 ` [PATCH 1/2] mm: add context argument to shrinker callback Dave Chinner 2010-04-13 8:17 ` KOSAKI Motohiro @ 2010-04-18 0:15 ` Christoph Hellwig 2010-04-19 14:00 ` Nick Piggin 2010-04-28 9:39 ` Avi Kivity 2 siblings, 1 reply; 18+ messages in thread From: Christoph Hellwig @ 2010-04-18 0:15 UTC (permalink / raw) To: Dave Chinner; +Cc: linux-kernel, linux-fsdevel, linux-mm, xfs Any chance we can still get this into 2.6.34? It's really needed to fix a regression in XFS that would be hard to impossible to work around inside the fs. While it touches quite a few places the changes are trivial and well understood. ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-18 0:15 ` Christoph Hellwig @ 2010-04-19 14:00 ` Nick Piggin 2010-04-20 0:41 ` Dave Chinner 0 siblings, 1 reply; 18+ messages in thread From: Nick Piggin @ 2010-04-19 14:00 UTC (permalink / raw) To: Christoph Hellwig Cc: Dave Chinner, linux-kernel, linux-fsdevel, linux-mm, xfs, Andrew Morton On Sat, Apr 17, 2010 at 08:15:14PM -0400, Christoph Hellwig wrote: > Any chance we can still get this into 2.6.34? It's really needed to fix > a regression in XFS that would be hard to impossible to work around > inside the fs. While it touches quite a few places the changes are > trivial and well understood. Why do you even need this context argument? Reclaim is not doing anything smart about this, it would just call each shrinker in turn. Do you not have an easily traversable list of mountpoints? Can you just make a list of them? It would be cheaper than putting a whole shrinker structure into them anyway. The main reason I would be against proliferation of dynamic shrinker registration would be that it could change reclaim behaviour depending on how they get ordered (in the case the caches are semi-dependent, like inode cache and dentry cache). Unless there is a reason I missed, I would much prefer not to do it like this. ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-19 14:00 ` Nick Piggin @ 2010-04-20 0:41 ` Dave Chinner 2010-04-20 8:38 ` Nick Piggin 0 siblings, 1 reply; 18+ messages in thread From: Dave Chinner @ 2010-04-20 0:41 UTC (permalink / raw) To: Nick Piggin Cc: Christoph Hellwig, linux-kernel, linux-fsdevel, linux-mm, xfs, Andrew Morton On Tue, Apr 20, 2010 at 12:00:39AM +1000, Nick Piggin wrote: > On Sat, Apr 17, 2010 at 08:15:14PM -0400, Christoph Hellwig wrote: > > Any chance we can still get this into 2.6.34? It's really needed to fix > > a regression in XFS that would be hard to impossible to work around > > inside the fs. While it touches quite a few places the changes are > > trivial and well understood. > > Why do you even need this context argument? Reclaim is not doing anything > smart about this, it would just call each shrinker in turn. It's not being smart, but it is determining how many objects to reclaim in each shrinker call based on memory pressure and the number of reclaimable objects in the cache the shrinker works on. That's exactly the semantics I want for per-filesystem inode cache reclaim. > Do you not have an easily traversable list of mountpoints? No, XFS does not have one, and I'm actively trying to remove any global state that crosses mounts that does exist (e.g. the global dquot caches and freelist). > Can you just > make a list of them? It would be cheaper than putting a whole shrinker > structure into them anyway. It's not cheaper or simpler. To make it work properly, I'd need to aggregate counters over all the filesystems in the list, work out how much to reclaim from each, etc. It is quite messy compared to dereferencing the context to check one variable and either return or invoke the existing inode reclaim code. 
I also don't want to have a situation where I have to implement fairness heuristics to avoid reclaiming one filesystem too much or only end up reclaiming one or two inodes per filesystem per shrinker call because the number of filesystems is similar to the shrinker batch size. The high level shrinker code already does this reclaim proportioning and does it far better than can be done in the scope of a shrinker callback. IOWs, adding a context allows XFS to do inode reclaim far more efficiently than if it was implemented through global state and a single shrinker. FWIW, we have this problem in the inode and dentry cache - we've got all sorts of complexity for being fair about reclaiming across all superblocks. I don't want to duplicate that complexity - instead I want to avoid it entirely. > The main reason I would be against proliferation of dynamic shrinker > registration would be that it could change reclaim behaviour depending > on how they get ordered (in the case the caches are semi-dependent, > like inode cache and dentry cache). Adding a context does not change that implicit ordering based on registration order. Any filesystem based shrinker is going to be registered after the core infrastructure shrinkers, so they are not going to perturb the current ordering. And if this is enough of a problem to disallow context based cache shrinkers, then let's fix the interface so that we encode the dependencies explicitly in the registration interface rather than doing it implicitly. IOWs, I don't think this is a valid reason for not allowing a context to be passed with a shrinker because it is easily fixed. Cheers, Dave. -- Dave Chinner david@fromorbit.com ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-20 0:41 ` Dave Chinner @ 2010-04-20 8:38 ` Nick Piggin 2010-04-20 10:32 ` Dave Chinner 0 siblings, 1 reply; 18+ messages in thread From: Nick Piggin @ 2010-04-20 8:38 UTC (permalink / raw) To: Dave Chinner Cc: Christoph Hellwig, linux-kernel, linux-fsdevel, linux-mm, xfs, Andrew Morton On Tue, Apr 20, 2010 at 10:41:49AM +1000, Dave Chinner wrote: > On Tue, Apr 20, 2010 at 12:00:39AM +1000, Nick Piggin wrote: > > On Sat, Apr 17, 2010 at 08:15:14PM -0400, Christoph Hellwig wrote: > > > Any chance we can still get this into 2.6.34? It's really needed to fix > > > a regression in XFS that would be hard to impossible to work around > > > inside the fs. While it touches quite a few places the changes are > > > trivial and well understood. > > > > Why do you even need this context argument? Reclaim is not doing anything > > smart about this, it would just call each call shrinker in turn. > > It's not being smart, but it is detemining how many objects to > reclaim in each shrinker call based on memory pressure and the > number of reclimable objects in the cache the shrinker works on. > That's exactly the semantics I want for per-filesystem inode cache > reclaim. And you can easily do that in the filesystem code because either way you have to know the number of objects you have. > > > Do you not have an easily traversable list of mountpoints? > > No, XFS does not have one, and I'm actively trying to remove any > global state that crosses mounts that does exist (e.g. the global > dquot caches and freelist). The shrinker list is global state too, so it's not a big deal. A simple list and an rwsem will work fine and be smaller than adding the full shrinker structure in the mount point. And that way you get a mountpoint list to potentially use for other things. Wheras with the shrinker you still cannot. > > Can you just > > make a list of them? 
It would be cheaper than putting a whole shrinker > > structure into them anyway. > > It's not cheaper or simpler. To make it work properly, I'd > need to aggregate counters over all the filesystems in the list, > work out how much to reclaim from each, etc. It is quite messy > compared to dereferencing the context to check one variable and either > return or invoke the existing inode reclaim code. It most definitely is cheaper, in terms of memory footprint. For cost of doing the traversals, letting the shrinker code do it is doing the *exact* same thing -- it's just traversing all your mount points. > I also don't want to have a situation where I have to implement > fairness heuristics to avoid reclaiming one filesystem too much or > only end up reclaiming one or two inodes per filesystem per shrinker > call because the number of filesystems is similar to the shrinker > batch size. The high level shrinker code already does this reclaim > proportioning and does it far better than can be done in the scope > of a shrinker callback. IOWs, adding a context allows XFS to do > inode reclaim far more efficiently than if it was implemented > through global state and a single shrinker. You would just do it proportionately with the size of each filesystem, simple, dumb, and exactly what the shrinker does. > FWIW, we have this problem in the inode and dentry cache - we've got > all sorts of complexity for being fair about reclaiming across all > superblocks. I don't want to duplicate that complexity - instead I > want to avoid it entirely. The dcache pruning is not complex. if (prune_ratio != 1) w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1; else w_count = sb->s_nr_dentry_unused; Inode pruning is the same. > > The main reason I would be against proliferation of dynamic shrinker > > registration would be that it could change reclaim behaviour depending > > on how they get ordered (in the case the caches are semi-dependent, > > like inode cache and dentry cache). 
> > Adding a context does not change that implicit ordering based on > registration order. Any filesystem based shrinker is going to be > registered after the core infrastructure shrnikers, so they are not > going to perturb the current ordering. By definition the ordering changes based on the order of registration. You can't argue with that. > And if this is enough of a problem to disallow context based cache > shrinkers, then lets fix the interface so that we encode the > dependencies explicitly in the registration interface rather than > doing it implicitly. > > IOWs, I don't think this is a valid reason for not allowing a > context to be passed with a shrinker because it is easily fixed. Well yeah you could do all that maybe. I think it would definitely be required if we were to do context shrinkers like this. But AFAIKS there is simply no need at all. Definitely it is not preventing XFS from following more like the existing shrinker implementations. ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-20 8:38 ` Nick Piggin @ 2010-04-20 10:32 ` Dave Chinner 2010-04-21 8:40 ` Nick Piggin 0 siblings, 1 reply; 18+ messages in thread From: Dave Chinner @ 2010-04-20 10:32 UTC (permalink / raw) To: Nick Piggin Cc: Christoph Hellwig, linux-kernel, linux-fsdevel, linux-mm, xfs, Andrew Morton On Tue, Apr 20, 2010 at 06:38:40PM +1000, Nick Piggin wrote: > On Tue, Apr 20, 2010 at 10:41:49AM +1000, Dave Chinner wrote: > > And if this is enough of a problem to disallow context based cache > > shrinkers, then lets fix the interface so that we encode the > > dependencies explicitly in the registration interface rather than > > doing it implicitly. > > > > IOWs, I don't think this is a valid reason for not allowing a > > context to be passed with a shrinker because it is easily fixed. > > Well yeah you could do all that maybe. I think it would definitely be > required if we were to do context shrinkers like this. But AFAIKS there > is simply no need at all. Definitely it is not preventing XFS from > following more like the existing shrinker implementations. So you're basically saying that we shouldn't improve the shrinker interface because you don't think that anyone should be doing anything different to what is already there. If a change of interface means that we end up with shorter call chains, less global state, more flexibilty, better batching and IO patterns, less duplication of code and algorithms and it doesn't cause any regressions, then where's the problem? Cheers, Dave. -- Dave Chinner david@fromorbit.com ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-20 10:32 ` Dave Chinner @ 2010-04-21 8:40 ` Nick Piggin 2010-04-22 16:32 ` Christoph Hellwig 0 siblings, 1 reply; 18+ messages in thread From: Nick Piggin @ 2010-04-21 8:40 UTC (permalink / raw) To: Dave Chinner Cc: Christoph Hellwig, linux-kernel, linux-fsdevel, linux-mm, xfs, Andrew Morton On Tue, Apr 20, 2010 at 08:32:16PM +1000, Dave Chinner wrote: > On Tue, Apr 20, 2010 at 06:38:40PM +1000, Nick Piggin wrote: > > On Tue, Apr 20, 2010 at 10:41:49AM +1000, Dave Chinner wrote: > > > And if this is enough of a problem to disallow context based cache > > > shrinkers, then lets fix the interface so that we encode the > > > dependencies explicitly in the registration interface rather than > > > doing it implicitly. > > > > > > IOWs, I don't think this is a valid reason for not allowing a > > > context to be passed with a shrinker because it is easily fixed. > > > > Well yeah you could do all that maybe. I think it would definitely be > > required if we were to do context shrinkers like this. But AFAIKS there > > is simply no need at all. Definitely it is not preventing XFS from > > following more like the existing shrinker implementations. > > So you're basically saying that we shouldn't improve the shrinker > interface because you don't think that anyone should be doing > anything different to what is already there. I'm saying that dynamic registration is no good, if we don't have a way to order the shrinkers. > If a change of interface means that we end up with shorter call > chains, less global state, more flexibilty, better batching and IO > patterns, less duplication of code and algorithms and it doesn't > cause any regressions, then where's the problem? Yep that would all be great but I don't see how the interface change enables any of that at all. 
It seems to me that the advantage goes the other way because it doesn't put as much crap into your mount structure and you end up with a useful traversable list of mounts as a side-effect. ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-21 8:40 ` Nick Piggin @ 2010-04-22 16:32 ` Christoph Hellwig 2010-04-22 16:38 ` Nick Piggin 0 siblings, 1 reply; 18+ messages in thread From: Christoph Hellwig @ 2010-04-22 16:32 UTC (permalink / raw) To: Nick Piggin Cc: Dave Chinner, linux-kernel, xfs, Christoph Hellwig, linux-mm, linux-fsdevel, Andrew Morton On Wed, Apr 21, 2010 at 06:40:04PM +1000, Nick Piggin wrote: > I'm saying that dynamic registration is no good, if we don't have a > way to order the shrinkers. We can happily throw in a priority field into the shrinker structure, but at this stage in the release process I'd rather have an as simple as possible fix for the regression. And just adding the context pointer which is a no-op for all existing shrinkers fits that scheme very well. If it makes you happier I can queue up a patch to add the priorities for 2.6.35. I think figuring out any meaningful priorities will be much harder than that, though. > > If a change of interface means that we end up with shorter call > > chains, less global state, more flexibility, better batching and IO > > patterns, less duplication of code and algorithms and it doesn't > > cause any regressions, then where's the problem? > > Yep that would all be great but I don't see how the interface change > enables any of that at all. It seems to me that the advantage goes > the other way because it doesn't put as much crap into your mount > structure and you end up with a useful traversable list of mounts as > a side-effect. There really is no need for that. The Linux VFS is designed to have superblocks independent, which actually is a good thing as global state gets in the way of scalability or just clean code. Note that a mounts list would be even worse as filesystems basically are not concerned with mount points themselves. ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-22 16:32 ` Christoph Hellwig @ 2010-04-22 16:38 ` Nick Piggin 2010-04-22 16:42 ` Christoph Hellwig 2010-04-28 3:38 ` Dave Chinner 0 siblings, 2 replies; 18+ messages in thread From: Nick Piggin @ 2010-04-22 16:38 UTC (permalink / raw) To: Christoph Hellwig Cc: Dave Chinner, linux-kernel, xfs, linux-mm, linux-fsdevel, Andrew Morton On Thu, Apr 22, 2010 at 12:32:11PM -0400, Christoph Hellwig wrote: > On Wed, Apr 21, 2010 at 06:40:04PM +1000, Nick Piggin wrote: > > I'm saying that dynamic registration is no good, if we don't have a > > way to order the shrinkers. > > We can happily throw in a priority field into the shrinker structure, > but at this stage in the release process I'd rather have an as simple > as possible fix for the regression. And just adding the context pointer > which is a no-op for all existing shrinkers fits that scheme very well. > > If it makes you happier I can queue up a patch to add the priorities > for 2.6.35. I think figuring out any meaningful priorities will be > much harder than that, though. I don't understand, it should be implemented like just all the other shrinkers AFAIKS. Like the dcache one that has to shrink multiple superblocks. There is absolutely no requirement for this API change to implement it in XFS. If you then add a patch to change the API and can show how it improves the situation, then fine. > > > > If a change of interface means that we end up with shorter call > > > chains, less global state, more flexibilty, better batching and IO > > > patterns, less duplication of code and algorithms and it doesn't > > > cause any regressions, then where's the problem? > > > > Yep that would all be great but I don't see how the interface change > > enables any of that at all. 
It seems to me that the advantage goes > > the other way because it doesn't put as much crap into your mount > > structure and you end up with a useful traversable list of mounts as > > a side-effect. > > There really is no need for that. The Linux VFS is designed to have > superblocks independent, which actually is a good thing as global > state gets in the way of scalability or just clean code. Note that > a mounts list would be even worse as filesystems basically are not > concerned with mount points themselves. But the shrinker list *is* a global list. The downside of it in the way it was done in the XFS patch is that 1) it is much larger than a simple list head, and 2) not usable by anything other than the shrinker. OK, maybe there will never be other uses for it other than shrinker, but that's not a disadvantage (just one less advantage). ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-22 16:38 ` Nick Piggin @ 2010-04-22 16:42 ` Christoph Hellwig 2010-04-22 16:57 ` Nick Piggin 2010-04-23 1:58 ` Dave Chinner 1 sibling, 2 replies; 18+ messages in thread From: Christoph Hellwig @ 2010-04-22 16:42 UTC (permalink / raw) To: Nick Piggin Cc: Christoph Hellwig, Dave Chinner, linux-kernel, xfs, linux-mm, linux-fsdevel, Andrew Morton On Fri, Apr 23, 2010 at 02:38:01AM +1000, Nick Piggin wrote: > I don't understand, it should be implemented like just all the other > shrinkers AFAIKS. Like the dcache one that has to shrink multiple > superblocks. There is absolutely no requirement for this API change > to implement it in XFS. The dcache shrinker is an example for a complete mess. > But the shrinker list *is* a global list. The downside of it in the way > it was done in the XFS patch is that 1) it is much larger than a simple > list head, and 2) not usable by anything other than the shrinker. It is an existing global list just made more useful. Whenever a driver has multiple instances of pool that need shrinking this comes in useful, it's not related to filesystems at all. ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-22 16:42 ` Christoph Hellwig @ 2010-04-22 16:57 ` Nick Piggin 2010-04-23 1:58 ` Dave Chinner 1 sibling, 0 replies; 18+ messages in thread From: Nick Piggin @ 2010-04-22 16:57 UTC (permalink / raw) To: Christoph Hellwig Cc: Dave Chinner, linux-kernel, xfs, linux-mm, linux-fsdevel, Andrew Morton On Thu, Apr 22, 2010 at 12:42:47PM -0400, Christoph Hellwig wrote: > On Fri, Apr 23, 2010 at 02:38:01AM +1000, Nick Piggin wrote: > > I don't understand, it should be implemented like just all the other > > shrinkers AFAIKS. Like the dcache one that has to shrink multiple > > superblocks. There is absolutely no requirement for this API change > > to implement it in XFS. > > The dcache shrinker is an example for a complete mess. I don't know. It's not really caused by not registering multiple shrinkers. It seems to be caused more by the locking, which is not going away when you have multiple shrinkers. The XFS patch seems to be pinning the mount structure when it is registered, so it would have no such locking/refcounting problems using a private list AFAIKS. > > But the shrinker list *is* a global list. The downside of it in the way > > it was done in the XFS patch is that 1) it is much larger than a simple > > list head, and 2) not usable by anything other than the shrinker. > > It is an existing global list just made more useful. Whenever a driver > has multiple instances of pool that need shrinking this comes in useful, > it's not related to filesystems at all. I would say less useful, because shrinker structure cannot be used by anything but the shrinker, whereas a private list can be used by anything, including the applicable shrinker. ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-22 16:42 ` Christoph Hellwig 2010-04-22 16:57 ` Nick Piggin @ 2010-04-23 1:58 ` Dave Chinner 1 sibling, 0 replies; 18+ messages in thread From: Dave Chinner @ 2010-04-23 1:58 UTC (permalink / raw) To: Christoph Hellwig Cc: Nick Piggin, linux-kernel, xfs, linux-mm, linux-fsdevel, Andrew Morton On Thu, Apr 22, 2010 at 12:42:47PM -0400, Christoph Hellwig wrote: > On Fri, Apr 23, 2010 at 02:38:01AM +1000, Nick Piggin wrote: > > I don't understand, it should be implemented like just all the other > > shrinkers AFAIKS. Like the dcache one that has to shrink multiple > > superblocks. There is absolutely no requirement for this API change > > to implement it in XFS. > > The dcache shrinker is an example for a complete mess. Yes, it is, and one that I think we can clean up significantly by the use of context based shrinkers. IMO, a better approach to the VFS shrinkers is to leverage the fact we already have per-sb dentry LRUs and convert the inode cache to a per-sb LRU as well. We can then remove the current dependency problems by moving to a single context based shrinker (i.e. per-sb) to reclaim from the per-sb dentry LRU, followed by the per-sb inode LRU via a single shrinker. That is, remove the global scope from them because that is the cause of the shrinker call-order dependency problems. Further, if we then add a filesystem callout to the new superblock shrinker callout, we can handle all the needs of XFS (and other filesystems) without requiring them to have global filesystem lists and without introducing new dependencies between registered shrinkers. And given that the shrinker currently handles proportioning reclaim based on the number of objects reported by the cache, it also allows us to further simplify the dentry cache reclaim by removing all the proportioning stuff it does right now... Cheers, Dave. 
-- Dave Chinner david@fromorbit.com ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-22 16:38 ` Nick Piggin 2010-04-22 16:42 ` Christoph Hellwig @ 2010-04-28 3:38 ` Dave Chinner 1 sibling, 0 replies; 18+ messages in thread From: Dave Chinner @ 2010-04-28 3:38 UTC (permalink / raw) To: Nick Piggin Cc: Christoph Hellwig, linux-kernel, xfs, linux-mm, linux-fsdevel, Andrew Morton On Fri, Apr 23, 2010 at 02:38:01AM +1000, Nick Piggin wrote: > On Thu, Apr 22, 2010 at 12:32:11PM -0400, Christoph Hellwig wrote: > > On Wed, Apr 21, 2010 at 06:40:04PM +1000, Nick Piggin wrote: > > > I'm saying that dynamic registration is no good, if we don't have a > > > way to order the shrinkers. > > > > We can happily throw in a priority field into the shrinker structure, > > but at this stage in the release process I'd rather have an as simple > > as possible fix for the regression. And just adding the context pointer > > which is a no-op for all existing shrinkers fits that scheme very well. > > > > If it makes you happier I can queue up a patch to add the priorities > > for 2.6.35. I think figuring out any meaningful priorities will be > > much harder than that, though. > > I don't understand, it should be implemented like just all the other > shrinkers AFAIKS. Like the dcache one that has to shrink multiple > superblocks. There is absolutely no requirement for this API change > to implement it in XFS. Well, I've gone and done this global shrinker because I need a fix for the problem before .34 releases, not because I like it. Now my problem is that the accepted method of using global shrinkers (i.e. split nr_to_scan into portions based on per-fs usage) is causing a regression compared to not having a shrinker at all. The context based shrinker did not cause this regression, either. The regression is oom-killer panics with "no killable tasks" - it kills my 1GB RAM VM dead. 
Without a shrinker or with the context based shrinkers I will see one or two dd processes getting OOM-killed maybe once every 10 or so runs on this VM, but the machine continues to stay up. The global shrinker is turning this into a panic, and it is happening about twice as often. To fix this I've had to remove all the code that proportions the reclaim across all the XFS filesystems in the system. Basically it now walks from the first filesystem in the list to the last every time and effectively it only reclaims from the first filesystem it finds with reclaimable inodes. This is exactly the behaviour the context based shrinkers give me, without the need for adding global lists, additional locking and traverses. Also, context based shrinkers won't re-traverse all the filesystems, avoiding the potential for starving some filesystems of shrinker based reclaim if filesystems earlier in the list are putting more inodes into reclaim concurrently. Given that this behaviour matches pretty closely to the reasons I've already given for preferring context based per-fs shrinkers over a global shrinker and list, can we please move forward with this API change, Nick? As it is, I'm going to cross my fingers and ship this global shrinker because of time limitations, but I am certainly hoping that for .35 we can move to context based shrinking.... Cheers, Dave. -- Dave Chinner david@fromorbit.com ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-13 0:24 ` [PATCH 1/2] mm: add context argument to shrinker callback Dave Chinner 2010-04-13 8:17 ` KOSAKI Motohiro 2010-04-18 0:15 ` Christoph Hellwig @ 2010-04-28 9:39 ` Avi Kivity 2010-04-28 13:45 ` Dave Chinner 2 siblings, 1 reply; 18+ messages in thread From: Avi Kivity @ 2010-04-28 9:39 UTC (permalink / raw) To: Dave Chinner; +Cc: linux-kernel, linux-mm, linux-fsdevel, xfs On 04/13/2010 03:24 AM, Dave Chinner wrote: > From: Dave Chinner<dchinner@redhat.com> > > The current shrinker implementation requires the registered callback > to have global state to work from. This makes it difficult to shrink > caches that are not global (e.g. per-filesystem caches). Add a > context argument to the shrinker callback so that it can easily be > used in such situations. > > @@ -995,7 +995,8 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) > * querying the cache size, so a fastpath for that case is appropriate. > */ > struct shrinker { > - int (*shrink)(int nr_to_scan, gfp_t gfp_mask); > + int (*shrink)(void *ctx, int nr_to_scan, gfp_t gfp_mask); > + void *ctx; /* user callback context */ > int seeks; /* seeks to recreate an obj */ > It's nicer (and slightly cheaper) to have int (*shrink)(struct shrinker *shrinker, int nr_to_scan, gfp_t gfp_mask); /* no void *ctx; */ Clients can use container_of() to reach their context from the shrinker argument. -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] mm: add context argument to shrinker callback 2010-04-28 9:39 ` Avi Kivity @ 2010-04-28 13:45 ` Dave Chinner 0 siblings, 0 replies; 18+ messages in thread From: Dave Chinner @ 2010-04-28 13:45 UTC (permalink / raw) To: Avi Kivity; +Cc: linux-kernel, linux-mm, linux-fsdevel, xfs On Wed, Apr 28, 2010 at 12:39:44PM +0300, Avi Kivity wrote: > On 04/13/2010 03:24 AM, Dave Chinner wrote: > >From: Dave Chinner<dchinner@redhat.com> > > > >The current shrinker implementation requires the registered callback > >to have global state to work from. This makes it difficult to shrink > >caches that are not global (e.g. per-filesystem caches). Add a > >context argument to the shrinker callback so that it can easily be > >used in such situations. > > >@@ -995,7 +995,8 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) > > * querying the cache size, so a fastpath for that case is appropriate. > > */ > > struct shrinker { > >- int (*shrink)(int nr_to_scan, gfp_t gfp_mask); > >+ int (*shrink)(void *ctx, int nr_to_scan, gfp_t gfp_mask); > >+ void *ctx; /* user callback context */ > > int seeks; /* seeks to recreate an obj */ > > > It's nicer (and slightly cheaper) to have > > int (*shrink)(struct shrinker *shrinker, int nr_to_scan, gfp_t gfp_mask); > /* no void *ctx; */ > > Clients can use container_of() to reach their context from the > shrinker argument. Agreed, that makes a lot of sense. I'll change it for the next version. Cheers, Dave. -- Dave Chinner david@fromorbit.com ^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 2/2] xfs: add a shrinker to background inode reclaim 2010-04-13 0:24 [PATCH 0/2] Context sensitive memory shrinker support Dave Chinner 2010-04-13 0:24 ` [PATCH 1/2] mm: add context argument to shrinker callback Dave Chinner @ 2010-04-13 0:24 ` Dave Chinner 1 sibling, 0 replies; 18+ messages in thread From: Dave Chinner @ 2010-04-13 0:24 UTC (permalink / raw) To: linux-kernel; +Cc: linux-mm, linux-fsdevel, xfs From: Dave Chinner <dchinner@redhat.com> On low memory boxes or those with highmem, kernel can OOM when the application scans large numbers of inodes. In this case, the XFS background inode reclaim scan run by xfssyncd does not run soon enough to free clean, reclaimable inodes. Add a per-filesystem shrinker to XFS to run inode reclaim so that it is expedited when memory is low and hence prevent OOM conditions from being hit. Signed-off-by: Dave Chinner <dchinner@redhat.com> --- fs/xfs/linux-2.6/xfs_super.c | 3 ++ fs/xfs/linux-2.6/xfs_sync.c | 71 ++++++++++++++++++++++++++++++++++++---- fs/xfs/linux-2.6/xfs_sync.h | 5 ++- fs/xfs/quota/xfs_qm_syscalls.c | 3 +- fs/xfs/xfs_ag.h | 1 + fs/xfs/xfs_mount.h | 1 + 6 files changed, 75 insertions(+), 9 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 52e06b4..855ba45 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1209,6 +1209,7 @@ xfs_fs_put_super( xfs_unmountfs(mp); xfs_freesb(mp); + xfs_inode_shrinker_unregister(mp); xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); xfs_dmops_put(mp); @@ -1622,6 +1623,8 @@ xfs_fs_fill_super( if (error) goto fail_vnrele; + xfs_inode_shrinker_register(mp); + kfree(mtpt); return 0; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 05cd853..774c5c0 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -95,7 +95,8 @@ xfs_inode_ag_walk( struct xfs_perag *pag, int flags), int flags, int tag, - int exclusive) + int exclusive, + int *nr_to_scan) { uint32_t 
first_index; int last_error = 0; @@ -134,7 +135,7 @@ restart: if (error == EFSCORRUPTED) break; - } while (1); + } while ((*nr_to_scan)--); if (skipped) { delay(1); @@ -150,11 +151,16 @@ xfs_inode_ag_iterator( struct xfs_perag *pag, int flags), int flags, int tag, - int exclusive) + int exclusive, + int nr_to_scan) { int error = 0; int last_error = 0; xfs_agnumber_t ag; + int nr = nr_to_scan; + + if (!nr) + nr = INT_MAX; for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { struct xfs_perag *pag; @@ -165,7 +171,7 @@ xfs_inode_ag_iterator( continue; } error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, - exclusive); + exclusive, &nr); xfs_perag_put(pag); if (error) { last_error = error; @@ -291,7 +297,7 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG, 0); + XFS_ICI_NO_TAG, 0, 0); if (error) return XFS_ERROR(error); @@ -310,7 +316,7 @@ xfs_sync_attr( ASSERT((flags & ~SYNC_WAIT) == 0); return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG, 0); + XFS_ICI_NO_TAG, 0, 0); } STATIC int @@ -673,6 +679,7 @@ __xfs_inode_set_reclaim_tag( radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); + pag->pag_ici_reclaimable++; } /* @@ -705,6 +712,7 @@ __xfs_inode_clear_reclaim_tag( { radix_tree_tag_clear(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + pag->pag_ici_reclaimable--; } /* @@ -854,5 +862,54 @@ xfs_reclaim_inodes( int mode) { return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, - XFS_ICI_RECLAIM_TAG, 1); + XFS_ICI_RECLAIM_TAG, 1, 0); +} + +static int +xfs_reclaim_inode_shrink( + void *ctx, + int nr_to_scan, + gfp_t gfp_mask) +{ + struct xfs_mount *mp = ctx; + xfs_agnumber_t ag; + int reclaimable = 0; + + if (nr_to_scan) { + if (!(gfp_mask & __GFP_FS)) + return -1; + + xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, + XFS_ICI_RECLAIM_TAG, 1, nr_to_scan); + } + + for (ag = 0; 
ag < mp->m_sb.sb_agcount; ag++) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, ag); + if (!pag->pag_ici_init) { + xfs_perag_put(pag); + continue; + } + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + +void +xfs_inode_shrinker_register( + struct xfs_mount *mp) +{ + mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; + mp->m_inode_shrink.ctx = mp; + mp->m_inode_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&mp->m_inode_shrink); +} + +void +xfs_inode_shrinker_unregister( + struct xfs_mount *mp) +{ + unregister_shrinker(&mp->m_inode_shrink); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index d480c34..e6c631b 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -53,6 +53,9 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag, int write_lock); + int flags, int tag, int write_lock, int nr_to_scan); + +void xfs_inode_shrinker_register(struct xfs_mount *mp); +void xfs_inode_shrinker_unregister(struct xfs_mount *mp); #endif diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 5d0ee8d..94c0cac 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -891,7 +891,8 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, + XFS_ICI_NO_TAG, 0, 0); } /*------------------------------------------------------------------------*/ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index b1a5a1f..abb8222 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -223,6 +223,7 @@ typedef struct xfs_perag { int pag_ici_init; /* incore inode cache initialised 
*/ rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ #endif int pagb_count; /* pagb slots in use */ xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 4fa0bc7..1fe7662 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -259,6 +259,7 @@ typedef struct xfs_mount { wait_queue_head_t m_wait_single_sync_task; __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ + struct shrinker m_inode_shrink; /* inode reclaim shrinker */ } xfs_mount_t; /* -- 1.6.5 ^ permalink raw reply related [flat|nested] 18+ messages in thread
end of thread, other threads:[~2010-04-28 13:45 UTC | newest] Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2010-04-13 0:24 [PATCH 0/2] Context sensitive memory shrinker support Dave Chinner 2010-04-13 0:24 ` [PATCH 1/2] mm: add context argument to shrinker callback Dave Chinner 2010-04-13 8:17 ` KOSAKI Motohiro 2010-04-18 0:15 ` Christoph Hellwig 2010-04-19 14:00 ` Nick Piggin 2010-04-20 0:41 ` Dave Chinner 2010-04-20 8:38 ` Nick Piggin 2010-04-20 10:32 ` Dave Chinner 2010-04-21 8:40 ` Nick Piggin 2010-04-22 16:32 ` Christoph Hellwig 2010-04-22 16:38 ` Nick Piggin 2010-04-22 16:42 ` Christoph Hellwig 2010-04-22 16:57 ` Nick Piggin 2010-04-23 1:58 ` Dave Chinner 2010-04-28 3:38 ` Dave Chinner 2010-04-28 9:39 ` Avi Kivity 2010-04-28 13:45 ` Dave Chinner 2010-04-13 0:24 ` [PATCH 2/2] xfs: add a shrinker to background inode reclaim Dave Chinner
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).