From: Christoph Rohland <cr@sap.com>
To: Linus Torvalds <torvalds@transmeta.com>
Cc: Marcelo Tosatti <marcelo@conectiva.com.br>, linux-kernel@vger.kernel.org
Subject: [Patch] shmem_unuse race fix
Date: 27 Dec 2000 11:32:35 +0100
Message-ID: <m3k88mb28v.fsf@linux.local>

Hi Linus,

The following patch (against clean test13-pre4) removes the races in
shmem_unuse. I changed fs/inode.c to not lock the inode if there is no
write_inode method, so that I can grab the inode while holding the
spinlock.
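
For orientation, the new shmem_unuse() loop boils down to the
following (a condensed sketch using the names from the diff below;
the shmem_unuse_inode() internals and error details are elided). The
fs/inode.c change is what makes the igrab() inside grab_next_inode()
safe to call while shmem_ilock is held:

	spin_lock(&shmem_ilock);
	for (inode = grab_next_inode(&p); inode; inode = next_inode) {
		/* also grab the next inode, so the list pointer
		   stays valid once we drop shmem_ilock */
		next_inode = grab_next_inode(&p);
		spin_unlock(&shmem_ilock);

		down(&inode->i_sem);
		spin_lock(&inode->u.shmem_i.lock);
		ret = shmem_unuse_inode(&inode->u.shmem_i,
					inode->i_mapping, entry, page);
		spin_unlock(&inode->u.shmem_i.lock);
		up(&inode->i_sem);

		iput(inode);	/* may sleep, so the spinlock is dropped */
		spin_lock(&shmem_ilock);
		if (ret)
			break;
	}
	spin_unlock(&shmem_ilock);
	iput(next_inode);	/* NULL-safe; drops the lookahead reference */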

It also optimises the shmem_truncate behaviour: the inode info now
tracks the highest index in use (max_index), so truncation can return
early when the file grows and only scans the indirect blocks that
were actually used.
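
The idea, condensed from the shmem_truncate() hunks below (the
direct-block pass, clear_base handling and freed-page accounting are
elided):

	spin_lock(&info->lock);
	index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (index >= info->max_index) {
		info->max_index = index;	/* growing: nothing to free */
		spin_unlock(&info->lock);
		return;
	}

	/* walk only the indirect blocks that were ever used */
	last = base + ((info->max_index - SHMEM_NR_DIRECT
			+ ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE);
	for (ptr = base; ptr < last; ptr++)
		start = shmem_truncate_part(*ptr, ENTRIES_PER_PAGE,
					    start, &freed);
	info->max_index = index;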

BTW: the generic swapoff path itself still has races: if a process is
paging in a page which try_to_unuse has just freed from swap, it gives
'VM: bad swap entries' and worse. But this is not shmem
specific. Marcelo, would you like to look into this?

Greetings
                Christoph

--- 4-13-4/fs/inode.c	Sun Dec 17 12:54:00 2000
+++ c/fs/inode.c	Wed Dec 27 11:02:59 2000
@@ -168,13 +168,6 @@
 		__wait_on_inode(inode);
 }
 
-
-static inline void write_inode(struct inode *inode, int sync)
-{
-	if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode)
-		inode->i_sb->s_op->write_inode(inode, sync);
-}
-
 static inline void __iget(struct inode * inode)
 {
 	if (atomic_read(&inode->i_count)) {
@@ -202,16 +195,20 @@
 		list_add(&inode->i_list, atomic_read(&inode->i_count)
 							? &inode_in_use
 							: &inode_unused);
-		/* Set I_LOCK, reset I_DIRTY */
-		inode->i_state |= I_LOCK;
-		inode->i_state &= ~I_DIRTY;
-		spin_unlock(&inode_lock);
+		if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode) {
+			/* Set I_LOCK, reset I_DIRTY */
+			inode->i_state |= I_LOCK;
+			inode->i_state &= ~I_DIRTY;
+			spin_unlock(&inode_lock);
 
-		write_inode(inode, sync);
+			inode->i_sb->s_op->write_inode(inode, sync);
 
-		spin_lock(&inode_lock);
-		inode->i_state &= ~I_LOCK;
-		wake_up(&inode->i_wait);
+			spin_lock(&inode_lock);
+			inode->i_state &= ~I_LOCK;
+			wake_up(&inode->i_wait);
+			return;
+		}
+		inode->i_state &= ~I_DIRTY;
 	}
 }
 
diff -uNr 4-13-4/include/linux/shmem_fs.h c/include/linux/shmem_fs.h
--- 4-13-4/include/linux/shmem_fs.h	Fri Dec 22 10:05:38 2000
+++ c/include/linux/shmem_fs.h	Tue Dec 26 18:52:29 2000
@@ -19,6 +19,7 @@
 
 struct shmem_inode_info {
 	spinlock_t	lock;
+	unsigned long	max_index;
 	swp_entry_t	i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
 	swp_entry_t   **i_indirect; /* doubly indirect blocks */
 	unsigned long	swapped;
diff -uNr 4-13-4/mm/shmem.c c/mm/shmem.c
--- 4-13-4/mm/shmem.c	Fri Dec 22 10:05:38 2000
+++ c/mm/shmem.c	Tue Dec 26 20:00:32 2000
@@ -51,11 +51,16 @@
 
 static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index) 
 {
+	unsigned long offset;
+
 	if (index < SHMEM_NR_DIRECT)
 		return info->i_direct+index;
 
 	index -= SHMEM_NR_DIRECT;
-	if (index >= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
+	offset = index % ENTRIES_PER_PAGE;
+	index /= ENTRIES_PER_PAGE;
+
+	if (index >= ENTRIES_PER_PAGE)
 		return NULL;
 
 	if (!info->i_indirect) {
@@ -63,13 +68,13 @@
 		if (!info->i_indirect)
 			return NULL;
 	}
-	if(!(info->i_indirect[index/ENTRIES_PER_PAGE])) {
-		info->i_indirect[index/ENTRIES_PER_PAGE] = (swp_entry_t *) get_zeroed_page(GFP_USER);
-		if (!info->i_indirect[index/ENTRIES_PER_PAGE])
+	if(!(info->i_indirect[index])) {
+		info->i_indirect[index] = (swp_entry_t *) get_zeroed_page(GFP_USER);
+		if (!info->i_indirect[index])
 			return NULL;
 	}
 	
-	return info->i_indirect[index/ENTRIES_PER_PAGE]+index%ENTRIES_PER_PAGE;
+	return info->i_indirect[index]+offset;
 }
 
 static int shmem_free_swp(swp_entry_t *dir, unsigned int count)
@@ -99,7 +104,6 @@
  * @dir:	pointer to swp_entries 
  * @size:	number of entries in dir
  * @start:	offset to start from
- * @inode:	inode for statistics
  * @freed:	counter for freed pages
  *
  * It frees the swap entries from dir+start til dir+size
@@ -109,7 +113,7 @@
 
 static unsigned long 
 shmem_truncate_part (swp_entry_t * dir, unsigned long size, 
-		     unsigned long start, struct inode * inode, unsigned long *freed) {
+		     unsigned long start, unsigned long *freed) {
 	if (start > size)
 		return start - size;
 	if (dir)
@@ -121,21 +125,27 @@
 static void shmem_truncate (struct inode * inode)
 {
 	int clear_base;
-	unsigned long start;
+	unsigned long index, start;
 	unsigned long mmfreed, freed = 0;
-	swp_entry_t **base, **ptr;
+	swp_entry_t **base, **ptr, **last;
 	struct shmem_inode_info * info = &inode->u.shmem_i;
 
 	spin_lock (&info->lock);
-	start = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (index >= info->max_index) {
+		info->max_index = index;
+		spin_unlock (&info->lock);
+		return;
+	}
 
-	start = shmem_truncate_part (info->i_direct, SHMEM_NR_DIRECT, start, inode, &freed);
+	start = shmem_truncate_part (info->i_direct, SHMEM_NR_DIRECT, index, &freed);
 
 	if (!(base = info->i_indirect))
-		goto out;;
+		goto out;
 
 	clear_base = 1;
-	for (ptr = base; ptr < base + ENTRIES_PER_PAGE; ptr++) {
+	last = base + ((info->max_index - SHMEM_NR_DIRECT + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE);
+	for (ptr = base; ptr < last; ptr++) {
 		if (!start) {
 			if (!*ptr)
 				continue;
@@ -145,16 +155,16 @@
 			continue;
 		}
 		clear_base = 0;
-		start = shmem_truncate_part (*ptr, ENTRIES_PER_PAGE, start, inode, &freed);
+		start = shmem_truncate_part (*ptr, ENTRIES_PER_PAGE, start, &freed);
 	}
 
-	if (!clear_base) 
-		goto out;
-
-	free_page ((unsigned long)base);
-	info->i_indirect = 0;
+	if (clear_base) {
+		free_page ((unsigned long)base);
+		info->i_indirect = 0;
+	}
 
 out:
+	info->max_index = index;
 
 	/*
 	 * We have to calculate the free blocks since we do not know
@@ -181,14 +191,14 @@
 {
 	struct shmem_sb_info *info = &inode->i_sb->u.shmem_sb;
 
-	spin_lock (&shmem_ilock);
-	list_del (&inode->u.shmem_i.list);
-	spin_unlock (&shmem_ilock);
 	inode->i_size = 0;
 	shmem_truncate (inode);
 	spin_lock (&info->stat_lock);
 	info->free_inodes++;
 	spin_unlock (&info->stat_lock);
+	spin_lock (&shmem_ilock);
+	list_del (&inode->u.shmem_i.list);
+	spin_unlock (&shmem_ilock);
 	clear_inode(inode);
 }
 
@@ -215,21 +225,23 @@
 	info = &((struct inode *)page->mapping->host)->u.shmem_i;
 	if (info->locked)
 		return 1;
+	spin_lock(&info->lock);
 	swap = __get_swap_page(2);
-	if (!swap.val)
+	if (!swap.val) {
+		spin_unlock(&info->lock);
 		return 1;
+	}
 
-	spin_lock(&info->lock);
-	entry = shmem_swp_entry (info, page->index);
+	entry = shmem_swp_entry(info, page->index);
 	if (!entry)	/* this had been allocated on page allocation */
 		BUG();
 	error = -EAGAIN;
 	if (entry->val) {
-                __swap_free(swap, 2);
+		__swap_free(swap, 2);
 		goto out;
-        }
+	}
 
-        *entry = swap;
+	*entry = swap;
 	error = 0;
 	/* Remove the page from the page cache */
 	lru_cache_del(page);
@@ -756,21 +768,21 @@
 	return -1;
 }
 
-static int shmem_unuse_inode (struct inode *inode, swp_entry_t entry, struct page *page)
+static inline int shmem_unuse_inode (struct shmem_inode_info *info, struct address_space *mapping, swp_entry_t entry, struct page *page)
 {
 	swp_entry_t **base, **ptr;
 	unsigned long idx;
 	int offset;
 	
 	idx = 0;
-	if ((offset = shmem_clear_swp (entry, inode->u.shmem_i.i_direct, SHMEM_NR_DIRECT)) >= 0)
+	if ((offset = shmem_clear_swp (entry, info->i_direct, SHMEM_NR_DIRECT)) >= 0)
 		goto found;
 
 	idx = SHMEM_NR_DIRECT;
-	if (!(base = inode->u.shmem_i.i_indirect))
+	if (!(base = info->i_indirect))
 		return 0;
 
-	for (ptr = base; ptr < base + ENTRIES_PER_PAGE; ptr++) {
+	for (ptr = base; idx <= info->max_index; ptr++) {
 		if (*ptr &&
 		    (offset = shmem_clear_swp (entry, *ptr, ENTRIES_PER_PAGE)) >= 0)
 			goto found;
@@ -778,35 +790,71 @@
 	}
 	return 0;
 found:
-	delete_from_swap_cache (page);
-	add_to_page_cache (page, inode->i_mapping, idx);
+	if (PageSwapCache(page))
+		BUG();
+	add_to_page_cache (page, mapping, offset + idx);
 	SetPageDirty (page);
 	SetPageUptodate (page);
 	UnlockPage (page);
-	spin_lock (&inode->u.shmem_i.lock);
-	inode->u.shmem_i.swapped--;
-	spin_unlock (&inode->u.shmem_i.lock);
+	info->swapped--;
 	return 1;
 }
 
+static struct inode * grab_next_inode(struct list_head **listp)
+{
+	struct inode *inode;
+
+	for (*listp = (*listp)->next; *listp != &shmem_inodes; *listp = (*listp)->next) {
+		inode = list_entry(*listp, struct inode, u.shmem_i.list);
+
+		if (igrab(inode))
+			return inode;
+	}
+
+	return NULL;
+}
+
 /*
- * unuse_shmem() search for an eventually swapped out shmem page.
+ * shmem_unuse() searches for a possibly swapped-out shmem page.
  */
 void shmem_unuse(swp_entry_t entry, struct page *page)
 {
-	struct list_head *p;
-	struct inode * inode;
+	struct list_head *p = &shmem_inodes;
+	struct inode * inode, *next_inode = NULL;
+	struct shmem_inode_info *info;
+	int ret;
 
-	spin_lock (&shmem_ilock);
-	list_for_each(p, &shmem_inodes) {
-		inode = list_entry(p, struct inode, u.shmem_i.list);
+	/*
+	 * This is tricky:
+	 *
+	 * - We cannot use a semaphore for the list since deletion
+	 *   would take the locks in the reverse order.
+	 * - We need to hold the inode semaphore since the swap cache
+	 *   has no atomic operation for getting an entry.
+	 * - We cannot drop the inode while holding the lock.
+	 *
+	 * So we grab the current inode and the next one to make sure
+	 *   that the next pointer in the list is valid.
+	 */
 
-		if (shmem_unuse_inode(inode, entry, page))
+	spin_lock(&shmem_ilock);
+	for (inode = grab_next_inode(&p); inode; inode = next_inode) {
+		next_inode = grab_next_inode(&p);
+		info = &inode->u.shmem_i;
+		spin_unlock(&shmem_ilock);
+		down(&inode->i_sem);
+		spin_lock (&info->lock);
+		ret = shmem_unuse_inode(info, inode->i_mapping, entry, page);
+		spin_unlock (&info->lock);
+		up(&inode->i_sem);
+		iput (inode);
+		spin_lock(&shmem_ilock);
+		if (ret)
 			break;
 	}
 	spin_unlock (&shmem_ilock);
+	iput(next_inode);
 }
-
 
 /*
  * shmem_file_setup - get an unlinked file living in shmem fs

-


Thread overview: 7+ messages
2000-12-27 10:32 Christoph Rohland [this message]
2000-12-27 15:14 ` [Patch] shmem_unuse race fix Marcelo Tosatti
2000-12-27 18:36   ` Christoph Rohland
2000-12-28  4:35     ` Linus Torvalds
2000-12-28 12:02       ` Christoph Rohland
2000-12-28 18:07         ` Linus Torvalds
2000-12-28 22:13           ` Christoph Rohland
