rds: ib: Add cm_id generation scheme in order to detect new ones

Hakon-Bugge · Brian Maly · commit 0405046276ef · 2025-06-23T19:56:59.000-04:00
In rds_rdma_cm_event_handler_cmn() we have a simple address compare of the supplied cm_id and the one stored in the IB Connection struct, ic: if (ic->i_cm_id != cm_id) { But this address compare is flaky. Although the address is the same, the cm_id per se, does not need to be the same. The cm_id may be destroyed and re-created. Since we here talk about free() + kzalloc(), there is a fair probability that the new cm_id ends up at the very same address as the old one. If that happens, there is a dissonance between the RDS IB Connection state and the state of the cm_id. This bug used to manifest itself as a NULL ptr deref in rds_rdma_cm_event_handler_cmn(), but that was before commit a71a60c ("rds: ib: Remove incorrect update of the path record sl and qos_class fields"). Kernels with a71a60c may observe the following crash: BUG: kernel NULL pointer dereference, address: 0000000000000034 RIP: 0010:dma_direct_unmap_sg+0x5a/0x220 Call Trace: rds_ib_recv_cqe_handler+0xed/0x395 [rds_rdma] poll_rcq+0x72/0xa0 [rds_rdma] rds_ib_rx+0xb9/0x2b0 [rds_rdma] rds_ib_tasklet_fn_recv+0x2f/0x40 [rds_rdma] tasklet_action_common+0x153/0x240 handle_softirqs+0xe4/0x2ac __irq_exit_rcu+0xab/0xd0 common_interrupt+0x85/0xa0 Drgn analyses of the crash: >>> trace = prog.crashed_thread().stack_trace() >>> trace #0 sg_dma_is_bus_address (./include/linux/scatterlist.h:297:11) #1 dma_direct_unmap_sg (kernel/dma/direct.c:452:7) #2 ib_dma_unmap_sg_attrs (./include/rdma/ib_verbs.h:4232:3) #3 ib_dma_unmap_sg (./include/rdma/ib_verbs.h:4294:2) #4 rds_ib_recv_cqe_handler (net/rds/ib_recv.c:1397:2) #5 poll_rcq (net/rds/ib_cm.c:777:4) #6 rds_ib_rx (net/rds/ib_cm.c:870:2) #7 rds_ib_recv_cb (net/rds/ib_cm.c:916:2) #8 rds_ib_tasklet_fn_recv (net/rds/ib_cm.c:925:2) #9 tasklet_action_common (kernel/softirq.c:795:7) #10 handle_softirqs (kernel/softirq.c:561:3) #11 __do_softirq (kernel/softirq.c:595:2) #12 invoke_softirq (kernel/softirq.c:435:3) #13 __irq_exit_rcu (kernel/softirq.c:644:3) #14 common_interrupt (arch/x86/kernel/irq.c:280:1) #15 asm_common_interrupt+0x26/0x2b (./arch/x86/include/asm/idtentry.h:693) #16 cpuidle_enter_state (drivers/cpuidle/cpuidle.c:288:5) #17 cpuidle_enter (drivers/cpuidle/cpuidle.c:385:9) #18 cpuidle_idle_call (kernel/sched/idle.c:230:19) #19 do_idle (kernel/sched/idle.c:326:4) #20 cpu_startup_entry (kernel/sched/idle.c:424:3) #21 rest_init (init/main.c:747:2) #22 start_kernel (init/main.c:1105:2) #23 x86_64_start_reservations (arch/x86/kernel/head64.c:507:2) #24 x86_64_start_kernel (arch/x86/kernel/head64.c:488:2) #25 secondary_startup_64+0x15d/0x15f (arch/x86/kernel/head_64.S:413) >>> trace[4]["ic"].i_recvs[0] (struct rds_ib_recv_work){ .r_ibinc = (struct rds_ib_incoming *)0x0, .r_frag = (struct rds_page_frag *)0x0, .r_wr = (struct ib_recv_wr){ .next = (struct ib_recv_wr *)0x0, .wr_id = (u64)0, .wr_cqe = (struct ib_cqe *)0x0, .sg_list = (struct ib_sge *)0xffffb91c1a5b5030, .num_sge = (int)5, }, .r_sge = (struct ib_sge [8]){ { .addr = (u64)33819643904, .length = (u32)48, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, }, .r_ic = (struct rds_ib_connection *)0x0, .r_posted = (int)0, } The prosaic version: We do get an event and processes a recv CQE, but when performing DMA unmap, the addresses for SGE entry 1-4 are all zero. That explains the crash. If you look at rds_ib_recv_init_ring(), the state will be exactly what we see above when this function has been called. So, how can this happen? We do get an RDMA_CM_EVENT_ROUTE_RESOLVED event with a new cm_id which happens to have the same address as the one stored in the RDS IB Connection (ic). Now, without a71a60c, we do not crash in rds_rdma_cm_event_handler_cmn() but calls: trans->cm_initiate_connect() rds_ib_cm_initiate_connect() rds_ib_recv_init_ring() and the latter does: sge = recv->r_sge; sge->addr = ic->i_recv_hdrs_dma[i]; sge->length = sizeof(struct rds_header); sge->lkey = ic->i_mr->lkey; for (j = 1; j < num_send_sge; j++) { sge = recv->r_sge + j; sge->addr = 0; <==== Note sge->length = PAGE_SIZE; sge->lkey = ic->i_mr->lkey; } This fits 100% with the r_sge pasted above. We fix this bug by introducing a generation number and embed that in the last three bits of cm_id->context, since these bits are zero due to alignment requirement. When we use cm_id->context to find the RDS Connection (conn), we mask away the three lower bits. When we create the cm_id, we insert the next generation and make a copy of it in the ic. In rds_rdma_cm_event_handler_cmn(), when we have established that the two addresses are equal, we add an additional check to see if the cm_id generation is the same between the rdma_cm supplied cm_id and the generation stored in ic. If the generations differ, we increment the s_ib_cm_id_resurrected statistics counter and return. When handling an incoming connect, we set the cm_id generation count stored in ic to zero. Orabug: 38105083 Fixes: 3ae09d7 ("net/rds: Don't block workqueues "cma_wq" and "cm.wq"") Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com> Reviewed-by: Sharath Srinivasan <sharath.srinivasan@oracle.com> Signed-off-by: Brian Maly <brian.maly@oracle.com>
diff --git a/net/rds/ib.h b/net/rds/ib.h
@@ -92,6 +92,11 @@ enum rds_ib_conn_flags {
 
 #define RDS_IB_CQ_FOLLOW_AFFINITY_THROTTLE	100 /* msec */
 
+#define RDS_IB_CM_IB_GEN_MASK 7
+#define RDS_IB_CM_ID_ADD_GEN(conn, gen) ((void *)((uintptr_t)(conn) + (gen)))
+#define RDS_IB_CM_ID_EXTRACT_GEN(id) ((uintptr_t)(id)->context & RDS_IB_CM_IB_GEN_MASK)
+#define RDS_IB_CM_ID_EXTRACT_CONN(id) ((void *)((uintptr_t)(id)->context & ~RDS_IB_CM_IB_GEN_MASK))
+
 enum rds_ib_preferred_cpu_options {
 	RDS_IB_PREFER_CPU_CQ		= 1 << 0,
 	RDS_IB_PREFER_CPU_NUMA		= 1 << 1,
@@ -358,6 +363,7 @@ struct rds_ib_connection {
 	u16			i_frag_sz;	/* IB fragment size */
 	u16			i_frag_cache_sz;
 	u8			i_frag_pages;
+	u8			i_cm_id_gen:3;
 	unsigned long		i_flags;
 	u16			i_frag_cache_inx;
 	u16			i_hca_sge;
@@ -629,6 +635,7 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_frag_pages_allocated;
 	uint64_t	s_ib_frag_pages_in_ib_recv_queue;
 	uint64_t	s_ib_frag_pages_in_caches;
+	uint64_t	s_ib_cm_id_resurrected;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
@@ -1836,6 +1836,7 @@ static int rds_ib_cm_accept(struct rds_connection *conn,
 	BUG_ON(ic->i_cm_id);
 
 	ic->i_cm_id = cm_id;
+	ic->i_cm_id_gen = 0;
 	cm_id->context = conn;
 
 	if (isv6) {
@@ -2214,7 +2215,7 @@ void rds_ib_conn_destroy_init(struct rds_connection *conn)
 
 int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
 {
-	struct rds_connection *conn = cm_id->context;
+	struct rds_connection *conn = RDS_IB_CM_ID_EXTRACT_CONN(cm_id);
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rdma_conn_param conn_param;
 	union rds_ib_conn_priv dp;
@@ -2419,8 +2420,9 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
 		handler = rds_rdma_cm_event_handler;
 
 	WARN_ON(ic->i_cm_id);
-	ic->i_cm_id = rdma_create_id(rds_conn_net(conn),
-				     handler, conn, RDMA_PS_TCP, IB_QPT_RC);
+	ic->i_cm_id = rdma_create_id(rds_conn_net(conn), handler,
+				     RDS_IB_CM_ID_ADD_GEN(conn, ++ic->i_cm_id_gen),
+				     RDMA_PS_TCP, IB_QPT_RC);
 
 	if (IS_ERR(ic->i_cm_id)) {
 		ret = PTR_ERR(ic->i_cm_id);
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2025, Oracle and/or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -98,6 +98,7 @@ static char *rds_ib_stat_names[] = {
 	"ib_frag_pages_allocated",
 	"ib_frag_pages_in_ib_recv_queue",
 	"ib_frag_pages_in_caches",
+	"ib_cm_id_resurrected",
 };
 
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
@@ -52,6 +52,7 @@ struct rds_rdma_cm_event_handler_info {
 	struct rdma_cm_event   event;
 	struct rds_connection *conn;
 	bool                   isv6;
+	int		       cm_id_gen;
 	char                   private_data[];
 };
 
@@ -110,6 +111,7 @@ static inline bool __rds_rdma_chk_dev(struct rds_connection *conn)
 }
 
 static void rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
+					  int cm_id_gen,
 					  struct rdma_cm_event *event,
 					  struct rds_connection *conn,
 					  bool isv6)
@@ -198,6 +200,14 @@ static void rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
                 return;
 	}
 
+	/* Although same address, could be different generation */
+	if (cm_id_gen != ic->i_cm_id_gen) {
+		rds_ib_stats_inc(s_ib_cm_id_resurrected);
+		up_read(&ic->i_cm_id_free_lock);
+		mutex_unlock(&conn->c_cm_lock);
+		return;
+	}
+
 	/* Even though this function no longer accesses "ic->i_cm_id" past this point
 	 * and "cma.c" always blocks "rdma_destroy_id" until "event_callback" is done,
 	 * we still need to hang on to the "i_cm_id_free_lock" until return,
@@ -388,7 +398,8 @@ static void rds_rdma_cm_event_handler_worker(struct work_struct *work)
 								   struct rds_rdma_cm_event_handler_info,
 								   work);
 
-	rds_rdma_cm_event_handler_cmn(info->cm_id, &info->event, info->conn, info->isv6);
+	rds_rdma_cm_event_handler_cmn(info->cm_id, info->cm_id_gen, &info->event,
+				      info->conn, info->isv6);
 
 	kfree(info);
 }
@@ -397,7 +408,8 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 					    struct rdma_cm_event *event,
 					    bool isv6)
 {
-	struct rds_connection *conn = cm_id->context;
+	int cm_id_gen = RDS_IB_CM_ID_EXTRACT_GEN(cm_id);
+	struct rds_connection *conn = RDS_IB_CM_ID_EXTRACT_CONN(cm_id);
 	struct rds_ib_connection *ic = conn ? conn->c_transport_data : NULL;
 	struct workqueue_struct *wq;
 	struct rds_rdma_cm_event_handler_info *info;
@@ -408,13 +420,13 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 		wq = rds_aux_wq;
 
 	if (!wq) {
-		rds_rdma_cm_event_handler_cmn(cm_id, event, conn, isv6);
+		rds_rdma_cm_event_handler_cmn(cm_id, cm_id_gen, event, conn, isv6);
 		return;
 	}
 
 	info = kmalloc(sizeof(*info) + event->param.conn.private_data_len, GFP_KERNEL);
 	if (!info) {
-		rds_rdma_cm_event_handler_cmn(cm_id, event, conn, isv6);
+		rds_rdma_cm_event_handler_cmn(cm_id, cm_id_gen, event, conn, isv6);
 		return;
 	}
 
@@ -424,6 +436,7 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 	memcpy(&info->event, event, sizeof(*event));
 	info->conn = conn;
 	info->isv6 = isv6;
+	info->cm_id_gen = cm_id_gen;
 
 	if (event->param.conn.private_data &&
 	    event->param.conn.private_data_len) {