ceph: fix deadlock bugs by making iput() calls asynchronous

MaxKellermann · intel-lab-lkp · commit ea85520061fa · 2025-09-17T20:47:11.000+08:00
The iput() function is a dangerous one - if the reference counter goes to zero, the function may block for a long time due to: - inode_wait_for_writeback() waits until writeback on this inode completes - the filesystem-specific "evict_inode" callback can do similar things; e.g. all netfs-based filesystems will call netfs_wait_for_outstanding_io() which is similar to inode_wait_for_writeback() Therefore, callers must carefully evaluate the context they're in and check whether invoking iput() is a good idea at all. Most of the time, this is not a problem because the dcache holds references to all inodes, and the dcache is usually the one to release the last reference. But this assumption is fragile. For example, under (memcg) memory pressure, the dcache shrinker is more likely to release inode references, moving the inode eviction to contexts where that was extremely unlikely to occur. Our production servers "found" at least two deadlock bugs in the Ceph filesystem that were caused by this iput() behavior: 1. Writeback may lead to iput() calls in Ceph (e.g. from ceph_put_wrbuffer_cap_refs()) which deadlocks in inode_wait_for_writeback(). Waiting for writeback completion from within writeback will obviously never be able to make any progress. This leads to blocked kworkers like this: INFO: task kworker/u777:6:1270802 blocked for more than 122 seconds. Not tainted 6.16.7-i1-es torvalds#773 task:kworker/u777:6 state:D stack:0 pid:1270802 tgid:1270802 ppid:2 task_flags:0x4208060 flags:0x00004000 Workqueue: writeback wb_workfn (flush-ceph-3) Call Trace: <TASK> __schedule+0x4ea/0x17d0 schedule+0x1c/0xc0 inode_wait_for_writeback+0x71/0xb0 evict+0xcf/0x200 ceph_put_wrbuffer_cap_refs+0xdd/0x220 ceph_invalidate_folio+0x97/0xc0 ceph_writepages_start+0x127b/0x14d0 do_writepages+0xba/0x150 __writeback_single_inode+0x34/0x290 writeback_sb_inodes+0x203/0x470 __writeback_inodes_wb+0x4c/0xe0 wb_writeback+0x189/0x2b0 wb_workfn+0x30b/0x3d0 process_one_work+0x143/0x2b0 worker_thread+0x30a/0x450 2. In the Ceph messenger thread (net/ceph/messenger*.c), any iput() call may invoke ceph_evict_inode() which will deadlock in netfs_wait_for_outstanding_io(); since this blocks the messenger thread, completions from the Ceph servers will not ever be received and handled. It looks like these deadlock bugs have been in the Ceph filesystem code since forever (therefore no "Fixes" tag in this patch). There may be various ways to solve this: - make iput() asynchronous and defer the actual eviction like fput() (may add overhead) - make iput() only asynchronous if I_SYNC is set (doesn't solve random things happening inside the "evict_inode" callback) - add iput_deferred() to make this asynchronous behavior/overhead optional and explicit - refactor Ceph to avoid iput() calls from within writeback and messenger (if that is even possible) - add a Ceph-specific workaround After advice from Mateusz Guzik, I decided to do the latter. The implementation is simple because it piggybacks on the existing work_struct for ceph_queue_inode_work() - ceph_inode_work() calls iput() at the end which means we can donate the last reference to it. Since Ceph has a few iput() callers in a loop, it seemed simple enough to pass this counter and use atomic_sub() instead of atomic_dec(). This patch adds ceph_iput_n_async() and converts lots of iput() calls to it - at least those that may come through writeback and the messenger. Signed-off-by: Max Kellermann <max.kellermann@ionos.com> Cc: Mateusz Guzik <mjguzik@gmail.com> Cc: stable@vger.kernel.org
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
@@ -265,7 +265,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
 	subreq->error = err;
 	trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
 	netfs_read_subreq_terminated(subreq);
-	iput(req->r_inode);
+	ceph_iput_async(req->r_inode);
 	ceph_dec_osd_stopping_blocker(fsc->mdsc);
 }
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
@@ -1810,7 +1810,7 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
 	spin_unlock(&mdsc->snap_flush_lock);
 
 	if (need_put)
-		iput(inode);
+		ceph_iput_async(inode);
 }
 
 /*
@@ -3389,8 +3389,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
 	}
 	if (wake)
 		wake_up_all(&ci->i_cap_wq);
-	while (put-- > 0)
-		iput(inode);
+	if (put > 0)
+		ceph_iput_n_async(inode, put);
 }
 
 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
@@ -3492,9 +3492,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 	}
 	if (complete_capsnap)
 		wake_up_all(&ci->i_cap_wq);
-	while (put-- > 0) {
-		iput(inode);
-	}
+	if (put > 0)
+		ceph_iput_n_async(inode, put);
 }
 
 /*
@@ -3998,7 +3997,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	if (wake_mdsc)
 		wake_up_all(&mdsc->cap_flushing_wq);
 	if (drop)
-		iput(inode);
+		ceph_iput_async(inode);
 }
 
 void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
@@ -4091,7 +4090,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 			wake_up_all(&ci->i_cap_wq);
 		if (wake_mdsc)
 			wake_up_all(&mdsc->cap_flushing_wq);
-		iput(inode);
+		ceph_iput_async(inode);
 	}
 }
 
@@ -4654,7 +4653,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 done:
 	mutex_unlock(&session->s_mutex);
 done_unlocked:
-	iput(inode);
+	ceph_iput_async(inode);
 out:
 	ceph_dec_mds_stopping_blocker(mdsc);
 
@@ -4747,7 +4746,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 			doutc(cl, "on %p %llx.%llx\n", inode,
 			      ceph_vinop(inode));
 			ceph_check_caps(ci, 0);
-			iput(inode);
+			ceph_iput_async(inode);
 			spin_lock(&mdsc->cap_delay_lock);
 		}
 
@@ -4786,7 +4785,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
 		spin_unlock(&mdsc->cap_dirty_lock);
 		ceph_wait_on_async_create(inode);
 		ceph_check_caps(ci, CHECK_CAPS_FLUSH);
-		iput(inode);
+		ceph_iput_async(inode);
 		spin_lock(&mdsc->cap_dirty_lock);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
@@ -1289,7 +1289,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
 		ceph_mdsc_free_path_info(&path_info);
 	}
 out:
-	iput(req->r_old_inode);
+	ceph_iput_async(req->r_old_inode);
 	ceph_mdsc_release_dir_caps(req);
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
@@ -2217,6 +2217,48 @@ void ceph_queue_inode_work(struct inode *inode, int work_bit)
 	}
 }
 
+/**
+ * Queue an asynchronous iput() call in a worker thread.  Use this
+ * instead of iput() in contexts where evicting the inode is unsafe.
+ * For example, inode eviction may cause deadlocks in
+ * inode_wait_for_writeback() (when called from within writeback) or
+ * in netfs_wait_for_outstanding_io() (when called from within the
+ * Ceph messenger).
+ *
+ * @n: how many references to put
+ */
+void ceph_iput_n_async(struct inode *inode, int n)
+{
+	if (unlikely(!inode))
+		return;
+
+	if (likely(atomic_sub_return(n, &inode->i_count) > 0))
+		/* somebody else is holding another reference -
+		 * nothing left to do for us
+		 */
+		return;
+
+	doutc(ceph_inode_to_fs_client(inode)->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
+
+	/* the reference counter is now 0, i.e. nobody else is holding
+	 * a reference to this inode; restore it to 1 and donate it to
+	 * ceph_inode_work() which will call iput() at the end
+	 */
+	atomic_set(&inode->i_count, 1);
+
+	/* simply queue a ceph_inode_work() without setting
+	 * i_work_mask bit; other than putting the reference, there is
+	 * nothing to do
+	 */
+	WARN_ON_ONCE(!queue_work(ceph_inode_to_fs_client(inode)->inode_wq,
+				 &ceph_inode(inode)->i_work));
+
+	/* note: queue_work() cannot fail; it i_work were already
+	 * queued, then it would be holding another reference, but no
+	 * such reference exists
+	 */
+}
+
 static void ceph_do_invalidate_pages(struct inode *inode)
 {
 	struct ceph_client *cl = ceph_inode_to_client(inode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
@@ -1098,14 +1098,14 @@ void ceph_mdsc_release_request(struct kref *kref)
 		ceph_msg_put(req->r_reply);
 	if (req->r_inode) {
 		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
-		iput(req->r_inode);
+		ceph_iput_async(req->r_inode);
 	}
 	if (req->r_parent) {
 		ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
-		iput(req->r_parent);
+		ceph_iput_async(req->r_parent);
 	}
-	iput(req->r_target_inode);
-	iput(req->r_new_inode);
+	ceph_iput_async(req->r_target_inode);
+	ceph_iput_async(req->r_new_inode);
 	if (req->r_dentry)
 		dput(req->r_dentry);
 	if (req->r_old_dentry)
@@ -1119,7 +1119,7 @@ void ceph_mdsc_release_request(struct kref *kref)
 		 */
 		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
 				  CEPH_CAP_PIN);
-		iput(req->r_old_dentry_dir);
+		ceph_iput_async(req->r_old_dentry_dir);
 	}
 	kfree(req->r_path1);
 	kfree(req->r_path2);
@@ -1241,7 +1241,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 	}
 
 	if (req->r_unsafe_dir) {
-		iput(req->r_unsafe_dir);
+		ceph_iput_async(req->r_unsafe_dir);
 		req->r_unsafe_dir = NULL;
 	}
 
@@ -1414,7 +1414,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
 	if (!cap) {
 		spin_unlock(&ci->i_ceph_lock);
-		iput(inode);
+		ceph_iput_async(inode);
 		goto random;
 	}
 	mds = cap->session->s_mds;
@@ -1423,7 +1423,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	      cap == ci->i_auth_cap ? "auth " : "", cap);
 	spin_unlock(&ci->i_ceph_lock);
 out:
-	iput(inode);
+	ceph_iput_async(inode);
 	return mds;
 
 random:
@@ -1845,7 +1845,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
 		spin_unlock(&session->s_cap_lock);
 
 		if (last_inode) {
-			iput(last_inode);
+			ceph_iput_async(last_inode);
 			last_inode = NULL;
 		}
 		if (old_cap) {
@@ -1878,7 +1878,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
 	session->s_cap_iterator = NULL;
 	spin_unlock(&session->s_cap_lock);
 
-	iput(last_inode);
+	ceph_iput_async(last_inode);
 	if (old_cap)
 		ceph_put_cap(session->s_mdsc, old_cap);
 
@@ -1907,8 +1907,8 @@ static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
 		wake_up_all(&ci->i_cap_wq);
 	if (invalidate)
 		ceph_queue_invalidate(inode);
-	while (iputs--)
-		iput(inode);
+	if (iputs > 0)
+		ceph_iput_n_async(inode, iputs);
 	return 0;
 }
 
@@ -1948,7 +1948,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
 			spin_unlock(&session->s_cap_lock);
 
 			inode = ceph_find_inode(sb, vino);
-			iput(inode);
+			ceph_iput_async(inode);
 
 			spin_lock(&session->s_cap_lock);
 		}
@@ -2544,7 +2544,7 @@ static void ceph_cap_unlink_work(struct work_struct *work)
 			doutc(cl, "on %p %llx.%llx\n", inode,
 			      ceph_vinop(inode));
 			ceph_check_caps(ci, CHECK_CAPS_FLUSH);
-			iput(inode);
+			ceph_iput_async(inode);
 			spin_lock(&mdsc->cap_delay_lock);
 		}
 	}
@@ -3973,7 +3973,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		    !req->r_reply_info.has_create_ino) {
 			/* This should never happen on an async create */
 			WARN_ON_ONCE(req->r_deleg_ino);
-			iput(in);
+			ceph_iput_async(in);
 			in = NULL;
 		}
 
@@ -5357,7 +5357,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 
 out:
 	mutex_unlock(&session->s_mutex);
-	iput(inode);
+	ceph_iput_async(inode);
 
 	ceph_dec_mds_stopping_blocker(mdsc);
 	return;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
@@ -76,7 +76,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
 		            le64_to_cpu(h->max_files));
 	spin_unlock(&ci->i_ceph_lock);
 
-	iput(inode);
+	ceph_iput_async(inode);
 out:
 	ceph_dec_mds_stopping_blocker(mdsc);
 }
@@ -190,7 +190,7 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
 		node = rb_first(&mdsc->quotarealms_inodes);
 		qri = rb_entry(node, struct ceph_quotarealm_inode, node);
 		rb_erase(node, &mdsc->quotarealms_inodes);
-		iput(qri->inode);
+		ceph_iput_async(qri->inode);
 		kfree(qri);
 	}
 	mutex_unlock(&mdsc->quotarealms_inodes_mutex);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
@@ -740,7 +740,7 @@ static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
 		if (!inode)
 			continue;
 		spin_unlock(&realm->inodes_with_caps_lock);
-		iput(lastinode);
+		ceph_iput_async(lastinode);
 		lastinode = inode;
 
 		/*
@@ -767,7 +767,7 @@ static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
 		spin_lock(&realm->inodes_with_caps_lock);
 	}
 	spin_unlock(&realm->inodes_with_caps_lock);
-	iput(lastinode);
+	ceph_iput_async(lastinode);
 
 	if (capsnap)
 		kmem_cache_free(ceph_cap_snap_cachep, capsnap);
@@ -960,7 +960,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
 		ihold(inode);
 		spin_unlock(&mdsc->snap_flush_lock);
 		ceph_flush_snaps(ci, &session);
-		iput(inode);
+		ceph_iput_async(inode);
 		spin_lock(&mdsc->snap_flush_lock);
 	}
 	spin_unlock(&mdsc->snap_flush_lock);
@@ -1121,12 +1121,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			ceph_get_snap_realm(mdsc, realm);
 			ceph_change_snap_realm(inode, realm);
 			spin_unlock(&ci->i_ceph_lock);
-			iput(inode);
+			ceph_iput_async(inode);
 			continue;
 
 skip_inode:
 			spin_unlock(&ci->i_ceph_lock);
-			iput(inode);
+			ceph_iput_async(inode);
 		}
 
 		/* we may have taken some of the old realm's children. */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
@@ -1123,6 +1123,13 @@ static inline void ceph_queue_flush_snaps(struct inode *inode)
 	ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
 }
 
+void ceph_iput_n_async(struct inode *inode, int n);
+
+static inline void ceph_iput_async(struct inode *inode)
+{
+	ceph_iput_n_async(inode, 1);
+}
+
 extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask);
 extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 			     int mask, bool force);

Original file line number	Diff line number	Diff line change
`@@ -265,7 +265,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)`
`265`	`265`	`subreq->error = err;`
`266`	`266`	`trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);`
`267`	`267`	`netfs_read_subreq_terminated(subreq);`
`268`		`- iput(req->r_inode);`
	`268`	`+ ceph_iput_async(req->r_inode);`
`269`	`269`	`ceph_dec_osd_stopping_blocker(fsc->mdsc);`
`270`	`270`	`}`
`271`	`271`
Original file line number	Diff line number	Diff line change
`@@ -1289,7 +1289,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,`
`1289`	`1289`	`ceph_mdsc_free_path_info(&path_info);`
`1290`	`1290`	`}`
`1291`	`1291`	`out:`
`1292`		`- iput(req->r_old_inode);`
	`1292`	`+ ceph_iput_async(req->r_old_inode);`
`1293`	`1293`	`ceph_mdsc_release_dir_caps(req);`
`1294`	`1294`	`}`
`1295`	`1295`