
Commit ea85520

MaxKellermann authored and intel-lab-lkp committed
ceph: fix deadlock bugs by making iput() calls asynchronous
The iput() function is a dangerous one - if the reference counter goes
to zero, the function may block for a long time due to:

- inode_wait_for_writeback() waits until writeback on this inode
  completes

- the filesystem-specific "evict_inode" callback can do similar
  things; e.g. all netfs-based filesystems will call
  netfs_wait_for_outstanding_io(), which is similar to
  inode_wait_for_writeback()

Therefore, callers must carefully evaluate the context they're in and
check whether invoking iput() is a good idea at all.

Most of the time, this is not a problem because the dcache holds
references to all inodes, and the dcache is usually the one to release
the last reference.  But this assumption is fragile.  For example,
under (memcg) memory pressure, the dcache shrinker is more likely to
release inode references, moving inode eviction to contexts where it
was previously extremely unlikely to occur.

Our production servers "found" at least two deadlock bugs in the Ceph
filesystem that were caused by this iput() behavior:

1. Writeback may lead to iput() calls in Ceph (e.g. from
   ceph_put_wrbuffer_cap_refs()) which deadlock in
   inode_wait_for_writeback().  Waiting for writeback completion from
   within writeback will obviously never make any progress.  This
   leads to blocked kworkers like this:

    INFO: task kworker/u777:6:1270802 blocked for more than 122 seconds.
          Not tainted 6.16.7-i1-es #773
    task:kworker/u777:6 state:D stack:0 pid:1270802 tgid:1270802 ppid:2 task_flags:0x4208060 flags:0x00004000
    Workqueue: writeback wb_workfn (flush-ceph-3)
    Call Trace:
     <TASK>
     __schedule+0x4ea/0x17d0
     schedule+0x1c/0xc0
     inode_wait_for_writeback+0x71/0xb0
     evict+0xcf/0x200
     ceph_put_wrbuffer_cap_refs+0xdd/0x220
     ceph_invalidate_folio+0x97/0xc0
     ceph_writepages_start+0x127b/0x14d0
     do_writepages+0xba/0x150
     __writeback_single_inode+0x34/0x290
     writeback_sb_inodes+0x203/0x470
     __writeback_inodes_wb+0x4c/0xe0
     wb_writeback+0x189/0x2b0
     wb_workfn+0x30b/0x3d0
     process_one_work+0x143/0x2b0
     worker_thread+0x30a/0x450

2. In the Ceph messenger thread (net/ceph/messenger*.c), any iput()
   call may invoke ceph_evict_inode(), which will deadlock in
   netfs_wait_for_outstanding_io(); since this blocks the messenger
   thread, completions from the Ceph servers will never be received
   and handled.

It looks like these deadlock bugs have been in the Ceph filesystem
code since forever (therefore no "Fixes" tag in this patch).

There may be various ways to solve this:

- make iput() asynchronous and defer the actual eviction like fput()
  (may add overhead)

- make iput() only asynchronous if I_SYNC is set (doesn't solve random
  things happening inside the "evict_inode" callback)

- add iput_deferred() to make this asynchronous behavior/overhead
  optional and explicit

- refactor Ceph to avoid iput() calls from within writeback and the
  messenger (if that is even possible)

- add a Ceph-specific workaround

After advice from Mateusz Guzik, I decided to go with the last option.
The implementation is simple because it piggybacks on the existing
work_struct for ceph_queue_inode_work() - ceph_inode_work() calls
iput() at the end, which means we can donate the last reference to it.

Since Ceph has a few iput() callers in a loop, it seemed simple enough
to pass this counter and use atomic_sub() instead of atomic_dec().

This patch adds ceph_iput_n_async() and converts many iput() calls to
it - at least those that may come through writeback and the messenger.

Signed-off-by: Max Kellermann <[email protected]>
Cc: Mateusz Guzik <[email protected]>
Cc: [email protected]
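[Editor's note: to make the chosen approach concrete before reading the diff, here is a minimal userspace analogue of the pattern the commit message describes. This is NOT the patch itself; object_put_n_async(), the pthread worker, and all other names below are illustrative stand-ins for ceph_iput_n_async() and the existing inode_wq / ceph_inode_work() machinery. It shows the two steps: atomically drop n references, and if that hit zero, restore the count to 1 and donate that last reference to a worker that performs the potentially blocking teardown outside the caller's context.]

    /* deferred_put.c - userspace sketch of the deferred-put pattern.
     * Build: cc -std=c11 -pthread deferred_put.c
     */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>
    #include <unistd.h>

    struct object {
            atomic_int refcount;
            struct object *next;    /* linkage for the deferred-release list */
    };

    static struct object *deferred_head;
    static pthread_mutex_t deferred_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t deferred_cond = PTHREAD_COND_INITIALIZER;

    /* Stand-in for evict(): in the kernel this is where a caller could block
     * in inode_wait_for_writeback() or netfs_wait_for_outstanding_io().
     */
    static void teardown(struct object *obj)
    {
            free(obj);
    }

    /* Drop @n references without ever tearing down inline - safe to call
     * from contexts that must not block on teardown.
     */
    static void object_put_n_async(struct object *obj, int n)
    {
            if (!obj)
                    return;

            if (atomic_fetch_sub(&obj->refcount, n) - n > 0)
                    return;         /* somebody else still holds a reference */

            /* We dropped the last reference: restore it to 1 and donate it
             * to the worker, which does the blocking part for us.
             */
            atomic_store(&obj->refcount, 1);
            pthread_mutex_lock(&deferred_lock);
            obj->next = deferred_head;
            deferred_head = obj;
            pthread_cond_signal(&deferred_cond);
            pthread_mutex_unlock(&deferred_lock);
    }

    static void *worker(void *arg)
    {
            (void)arg;
            for (;;) {
                    pthread_mutex_lock(&deferred_lock);
                    while (!deferred_head)
                            pthread_cond_wait(&deferred_cond, &deferred_lock);
                    struct object *obj = deferred_head;
                    deferred_head = obj->next;
                    pthread_mutex_unlock(&deferred_lock);

                    /* Drop the donated reference; teardown happens here,
                     * outside the original caller's context.
                     */
                    if (atomic_fetch_sub(&obj->refcount, 1) == 1)
                            teardown(obj);
            }
            return NULL;
    }

    int main(void)
    {
            pthread_t tid;
            struct object *obj = calloc(1, sizeof(*obj));

            atomic_init(&obj->refcount, 1);
            pthread_create(&tid, NULL, worker, NULL);
            object_put_n_async(obj, 1);     /* final put, handled by the worker */
            sleep(1);                       /* give the worker time to drain */
            return 0;
    }

[In the actual patch, no new infrastructure is added: the "worker" is the already-existing inode_wq work item, and ceph_inode_work() already ends with an iput(), so the donated reference is released there; see the fs/ceph/inode.c hunk below.]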
1 parent 0fc02d1 commit ea85520

File tree: 8 files changed (+84, -36 lines)

fs/ceph/addr.c (1 addition, 1 deletion)

@@ -265,7 +265,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
         subreq->error = err;
         trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
         netfs_read_subreq_terminated(subreq);
-        iput(req->r_inode);
+        ceph_iput_async(req->r_inode);
         ceph_dec_osd_stopping_blocker(fsc->mdsc);
 }

fs/ceph/caps.c (10 additions, 11 deletions)

@@ -1810,7 +1810,7 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
         spin_unlock(&mdsc->snap_flush_lock);

         if (need_put)
-                iput(inode);
+                ceph_iput_async(inode);
 }

 /*
@@ -3389,8 +3389,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
         }
         if (wake)
                 wake_up_all(&ci->i_cap_wq);
-        while (put-- > 0)
-                iput(inode);
+        if (put > 0)
+                ceph_iput_n_async(inode, put);
 }

 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
@@ -3492,9 +3492,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
         }
         if (complete_capsnap)
                 wake_up_all(&ci->i_cap_wq);
-        while (put-- > 0) {
-                iput(inode);
-        }
+        if (put > 0)
+                ceph_iput_n_async(inode, put);
 }

 /*
@@ -3998,7 +3997,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
         if (wake_mdsc)
                 wake_up_all(&mdsc->cap_flushing_wq);
         if (drop)
-                iput(inode);
+                ceph_iput_async(inode);
 }

 void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
@@ -4091,7 +4090,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
                 wake_up_all(&ci->i_cap_wq);
                 if (wake_mdsc)
                         wake_up_all(&mdsc->cap_flushing_wq);
-                iput(inode);
+                ceph_iput_async(inode);
         }
 }

@@ -4654,7 +4653,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 done:
         mutex_unlock(&session->s_mutex);
 done_unlocked:
-        iput(inode);
+        ceph_iput_async(inode);
 out:
         ceph_dec_mds_stopping_blocker(mdsc);

@@ -4747,7 +4746,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
                 doutc(cl, "on %p %llx.%llx\n", inode,
                       ceph_vinop(inode));
                 ceph_check_caps(ci, 0);
-                iput(inode);
+                ceph_iput_async(inode);
                 spin_lock(&mdsc->cap_delay_lock);
         }

@@ -4786,7 +4785,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
                 spin_unlock(&mdsc->cap_dirty_lock);
                 ceph_wait_on_async_create(inode);
                 ceph_check_caps(ci, CHECK_CAPS_FLUSH);
-                iput(inode);
+                ceph_iput_async(inode);
                 spin_lock(&mdsc->cap_dirty_lock);
         }
         spin_unlock(&mdsc->cap_dirty_lock);

fs/ceph/dir.c (1 addition, 1 deletion)

@@ -1289,7 +1289,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
                 ceph_mdsc_free_path_info(&path_info);
         }
 out:
-        iput(req->r_old_inode);
+        ceph_iput_async(req->r_old_inode);
         ceph_mdsc_release_dir_caps(req);
 }

fs/ceph/inode.c (42 additions, 0 deletions)

@@ -2217,6 +2217,48 @@ void ceph_queue_inode_work(struct inode *inode, int work_bit)
         }
 }

+/**
+ * Queue an asynchronous iput() call in a worker thread.  Use this
+ * instead of iput() in contexts where evicting the inode is unsafe.
+ * For example, inode eviction may cause deadlocks in
+ * inode_wait_for_writeback() (when called from within writeback) or
+ * in netfs_wait_for_outstanding_io() (when called from within the
+ * Ceph messenger).
+ *
+ * @n: how many references to put
+ */
+void ceph_iput_n_async(struct inode *inode, int n)
+{
+        if (unlikely(!inode))
+                return;
+
+        if (likely(atomic_sub_return(n, &inode->i_count) > 0))
+                /* somebody else is holding another reference -
+                 * nothing left to do for us
+                 */
+                return;
+
+        doutc(ceph_inode_to_fs_client(inode)->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
+
+        /* the reference counter is now 0, i.e. nobody else is holding
+         * a reference to this inode; restore it to 1 and donate it to
+         * ceph_inode_work() which will call iput() at the end
+         */
+        atomic_set(&inode->i_count, 1);
+
+        /* simply queue a ceph_inode_work() without setting a
+         * i_work_mask bit; other than putting the reference, there is
+         * nothing to do
+         */
+        WARN_ON_ONCE(!queue_work(ceph_inode_to_fs_client(inode)->inode_wq,
+                                 &ceph_inode(inode)->i_work));
+
+        /* note: queue_work() cannot fail; if i_work were already
+         * queued, then it would be holding another reference, but no
+         * such reference exists
+         */
+}
+
 static void ceph_do_invalidate_pages(struct inode *inode)
 {
         struct ceph_client *cl = ceph_inode_to_client(inode);

fs/ceph/mds_client.c (16 additions, 16 deletions)

@@ -1098,14 +1098,14 @@ void ceph_mdsc_release_request(struct kref *kref)
         ceph_msg_put(req->r_reply);
         if (req->r_inode) {
                 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
-                iput(req->r_inode);
+                ceph_iput_async(req->r_inode);
         }
         if (req->r_parent) {
                 ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
-                iput(req->r_parent);
+                ceph_iput_async(req->r_parent);
         }
-        iput(req->r_target_inode);
-        iput(req->r_new_inode);
+        ceph_iput_async(req->r_target_inode);
+        ceph_iput_async(req->r_new_inode);
         if (req->r_dentry)
                 dput(req->r_dentry);
         if (req->r_old_dentry)
@@ -1119,7 +1119,7 @@ void ceph_mdsc_release_request(struct kref *kref)
                  */
                 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
                                   CEPH_CAP_PIN);
-                iput(req->r_old_dentry_dir);
+                ceph_iput_async(req->r_old_dentry_dir);
         }
         kfree(req->r_path1);
         kfree(req->r_path2);
@@ -1241,7 +1241,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
         }

         if (req->r_unsafe_dir) {
-                iput(req->r_unsafe_dir);
+                ceph_iput_async(req->r_unsafe_dir);
                 req->r_unsafe_dir = NULL;
         }

@@ -1414,7 +1414,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
                 if (!cap) {
                         spin_unlock(&ci->i_ceph_lock);
-                        iput(inode);
+                        ceph_iput_async(inode);
                         goto random;
                 }
                 mds = cap->session->s_mds;
@@ -1423,7 +1423,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                       cap == ci->i_auth_cap ? "auth " : "", cap);
                 spin_unlock(&ci->i_ceph_lock);
 out:
-        iput(inode);
+        ceph_iput_async(inode);
         return mds;

 random:
@@ -1845,7 +1845,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
                 spin_unlock(&session->s_cap_lock);

                 if (last_inode) {
-                        iput(last_inode);
+                        ceph_iput_async(last_inode);
                         last_inode = NULL;
                 }
                 if (old_cap) {
@@ -1878,7 +1878,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
         session->s_cap_iterator = NULL;
         spin_unlock(&session->s_cap_lock);

-        iput(last_inode);
+        ceph_iput_async(last_inode);
         if (old_cap)
                 ceph_put_cap(session->s_mdsc, old_cap);

@@ -1907,8 +1907,8 @@ static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
                 wake_up_all(&ci->i_cap_wq);
         if (invalidate)
                 ceph_queue_invalidate(inode);
-        while (iputs--)
-                iput(inode);
+        if (iputs > 0)
+                ceph_iput_n_async(inode, iputs);
         return 0;
 }

@@ -1948,7 +1948,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
                         spin_unlock(&session->s_cap_lock);

                         inode = ceph_find_inode(sb, vino);
-                        iput(inode);
+                        ceph_iput_async(inode);

                         spin_lock(&session->s_cap_lock);
                 }
@@ -2544,7 +2544,7 @@ static void ceph_cap_unlink_work(struct work_struct *work)
                 doutc(cl, "on %p %llx.%llx\n", inode,
                       ceph_vinop(inode));
                 ceph_check_caps(ci, CHECK_CAPS_FLUSH);
-                iput(inode);
+                ceph_iput_async(inode);
                 spin_lock(&mdsc->cap_delay_lock);
         }
 }
@@ -3973,7 +3973,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                     !req->r_reply_info.has_create_ino) {
                         /* This should never happen on an async create */
                         WARN_ON_ONCE(req->r_deleg_ino);
-                        iput(in);
+                        ceph_iput_async(in);
                         in = NULL;
                 }

@@ -5357,7 +5357,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,

 out:
         mutex_unlock(&session->s_mutex);
-        iput(inode);
+        ceph_iput_async(inode);

         ceph_dec_mds_stopping_blocker(mdsc);
         return;

fs/ceph/quota.c (2 additions, 2 deletions)

@@ -76,7 +76,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
               le64_to_cpu(h->max_files));
         spin_unlock(&ci->i_ceph_lock);

-        iput(inode);
+        ceph_iput_async(inode);
 out:
         ceph_dec_mds_stopping_blocker(mdsc);
 }
@@ -190,7 +190,7 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
                 node = rb_first(&mdsc->quotarealms_inodes);
                 qri = rb_entry(node, struct ceph_quotarealm_inode, node);
                 rb_erase(node, &mdsc->quotarealms_inodes);
-                iput(qri->inode);
+                ceph_iput_async(qri->inode);
                 kfree(qri);
         }
         mutex_unlock(&mdsc->quotarealms_inodes_mutex);

fs/ceph/snap.c (5 additions, 5 deletions)

@@ -740,7 +740,7 @@ static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
                 if (!inode)
                         continue;
                 spin_unlock(&realm->inodes_with_caps_lock);
-                iput(lastinode);
+                ceph_iput_async(lastinode);
                 lastinode = inode;

                 /*
@@ -767,7 +767,7 @@ static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
                 spin_lock(&realm->inodes_with_caps_lock);
         }
         spin_unlock(&realm->inodes_with_caps_lock);
-        iput(lastinode);
+        ceph_iput_async(lastinode);

         if (capsnap)
                 kmem_cache_free(ceph_cap_snap_cachep, capsnap);
@@ -960,7 +960,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
                 ihold(inode);
                 spin_unlock(&mdsc->snap_flush_lock);
                 ceph_flush_snaps(ci, &session);
-                iput(inode);
+                ceph_iput_async(inode);
                 spin_lock(&mdsc->snap_flush_lock);
         }
         spin_unlock(&mdsc->snap_flush_lock);
@@ -1121,12 +1121,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
                         ceph_get_snap_realm(mdsc, realm);
                         ceph_change_snap_realm(inode, realm);
                         spin_unlock(&ci->i_ceph_lock);
-                        iput(inode);
+                        ceph_iput_async(inode);
                         continue;

 skip_inode:
                         spin_unlock(&ci->i_ceph_lock);
-                        iput(inode);
+                        ceph_iput_async(inode);
                 }

                 /* we may have taken some of the old realm's children. */

fs/ceph/super.h (7 additions, 0 deletions)

@@ -1123,6 +1123,13 @@ static inline void ceph_queue_flush_snaps(struct inode *inode)
         ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
 }

+void ceph_iput_n_async(struct inode *inode, int n);
+
+static inline void ceph_iput_async(struct inode *inode)
+{
+        ceph_iput_n_async(inode, 1);
+}
+
 extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask);
 extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
                              int mask, bool force);
