From 168a9fd6a1bf91041adf9909f6c72cf747f0ca8c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 12 Jul 2005 13:58:10 -0700 Subject: [PATCH] __wait_on_freeing_inode fix This patch fixes queer behavior in __wait_on_freeing_inode(). If I_LOCK was not set, it called yield(), effectively busy-waiting for the removal of the inode from the hash. This change was introduced within "[PATCH] eliminate inode waitqueue hashtable" Changeset 1.1938.166.16 last October by wli. The solution is to restore the old behavior, of unconditionally waiting on the waitqueue. It doesn't matter if I_LOCK is not set initially, the task will go to sleep, and wake up when wake_up_inode() is called from generic_delete_inode() after removing the inode from the hash chain. Comment is also updated to better reflect current behavior. This condition is very hard to trigger normally (simultaneous clear_inode() with iget()) so probably only heavy stress testing can reveal any change of behavior. Signed-off-by: Miklos Szeredi Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 6d695037a0a3..0116d06731c2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1244,29 +1244,21 @@ int inode_wait(void *word) } /* - * If we try to find an inode in the inode hash while it is being deleted, we - * have to wait until the filesystem completes its deletion before reporting - * that it isn't found. This is because iget will immediately call - * ->read_inode, and we want to be sure that evidence of the deletion is found - * by ->read_inode. + * If we try to find an inode in the inode hash while it is being + * deleted, we have to wait until the filesystem completes its + * deletion before reporting that it isn't found. This function waits + * until the deletion _might_ have completed. 
Callers are responsible + * to recheck inode state. + * + * It doesn't matter if I_LOCK is not set initially, a call to + * wake_up_inode() after removing from the hash list will DTRT. + * * This is called with inode_lock held. */ static void __wait_on_freeing_inode(struct inode *inode) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); - - /* - * I_FREEING and I_CLEAR are cleared in process context under - * inode_lock, so we have to give the tasks who would clear them - * a chance to run and acquire inode_lock. - */ - if (!(inode->i_state & I_LOCK)) { - spin_unlock(&inode_lock); - yield(); - spin_lock(&inode_lock); - return; - } wq = bit_waitqueue(&inode->i_state, __I_LOCK); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode_lock); -- cgit v1.2.1 From 4120db47198d21d8cd3b2cdbbe1ea6118a50bcd4 Mon Sep 17 00:00:00 2001 From: "Artem B. Bityuckiy" Date: Tue, 12 Jul 2005 13:58:12 -0700 Subject: [PATCH] bugfix: two read_inode() calls without clear_inode() call between Bug symptoms ~~~~~~~~~~~~ For the same inode VFS calls read_inode() twice and doesn't call clear_inode() between the two read_inode() invocations. Bug description ~~~~~~~~~~~~~~~ Suppose we have an inode which has zero reference count but is still in the inode cache. Suppose kswapd invokes shrink_icache_memory() to free some RAM. In prune_icache() inodes are removed from i_hash. prune_icache() is then going to call clear_inode(), but drops the inode_lock spinlock before this. If at this moment another task calls iget() for an inode which was just removed from i_hash by prune_icache(), then iget() invokes read_inode() for this inode, because it is *already removed* from i_hash. The end result is: we call iget(#N) then iput(#N); inode #N has zero i_count now and is in the inode cache; kswapd starts. kswapd removes the inode #N from i_hash and is preempted; we call iget(#N) again; read_inode() is invoked as a result; but we expect clear_inode() to have been called before that. 
Fix ~~~~~~~ To fix the bug I remove inodes from i_hash later, when clear_inode() is actually called. I remove them from i_hash under spinlock protection. Since the i_state is set to I_FREEING, it is safe to do this. The others will sleep waiting for the inode state change. I also postpone removing inodes from i_sb_list. It is not compulsory to do so but I do it for readability reasons. Inodes are added/removed to the lists together everywhere in the code and there is no point to change this rule. This is harmless because the only user of i_sb_list which somehow may interfere with me (invalidate_list()) is excluded by the iprune_sem mutex. The same race is possible in invalidate_list() so I do the same for it. Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 0116d06731c2..5bc97507eeaa 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -282,6 +282,13 @@ static void dispose_list(struct list_head *head) if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); + + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_lock); + + wake_up_inode(inode); destroy_inode(inode); nr_disposed++; } @@ -317,8 +324,6 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) inode = list_entry(tmp, struct inode, i_sb_list); invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { - hlist_del_init(&inode->i_hash); - list_del(&inode->i_sb_list); list_move(&inode->i_list, dispose); inode->i_state |= I_FREEING; count++; @@ -439,8 +444,6 @@ static void prune_icache(int nr_to_scan) if (!can_unuse(inode)) continue; } - hlist_del_init(&inode->i_hash); - list_del_init(&inode->i_sb_list); list_move(&inode->i_list, &freeable); inode->i_state |= I_FREEING; nr_pruned++; -- cgit v1.2.1 From 
0eeca28300df110bd6ed54b31193c83b87921443 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Tue, 12 Jul 2005 17:06:03 -0400 Subject: [PATCH] inotify inotify is intended to correct the deficiencies of dnotify, particularly its inability to scale and its terrible user interface: * dnotify requires the opening of one fd per each directory that you intend to watch. This quickly results in too many open files and pins removable media, preventing unmount. * dnotify is directory-based. You only learn about changes to directories. Sure, a change to a file in a directory affects the directory, but you are then forced to keep a cache of stat structures. * dnotify's interface to user-space is awful. Signals? inotify provides a more usable, simple, powerful solution to file change notification: * inotify's interface is a system call that returns a fd, not SIGIO. You get a single fd, which is select()-able. * inotify has an event that says "the filesystem that the item you were watching is on was unmounted." * inotify can watch directories or files. Inotify is currently used by Beagle (a desktop search infrastructure), Gamin (a FAM replacement), and other projects. See Documentation/filesystems/inotify.txt. 
Signed-off-by: Robert Love Cc: John McCutchan Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 5bc97507eeaa..96364fae0844 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -202,6 +203,10 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); spin_lock_init(&inode->i_lock); i_size_ordered_init(inode); +#ifdef CONFIG_INOTIFY + INIT_LIST_HEAD(&inode->inotify_watches); + sema_init(&inode->inotify_sem, 1); +#endif } EXPORT_SYMBOL(inode_init_once); @@ -351,6 +356,7 @@ int invalidate_inodes(struct super_block * sb) down(&iprune_sem); spin_lock(&inode_lock); + inotify_unmount_inodes(&sb->s_inodes); busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); -- cgit v1.2.1