From 873a788c7b70128bda3d4be771b069a075040f4f Mon Sep 17 00:00:00 2001 From: Richard Li Date: Fri, 2 May 2025 16:26:11 +0000 Subject: [PATCH 1/2] lockup: detect tasks waiting for RCU grace period and possible blockers Orabug: 37899681 Signed-off-by: Richard Li --- drgn_tools/lockup.py | 55 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/drgn_tools/lockup.py b/drgn_tools/lockup.py index 8c1039f6..e0cfa56d 100644 --- a/drgn_tools/lockup.py +++ b/drgn_tools/lockup.py @@ -1,14 +1,18 @@ -# Copyright (c) 2024, Oracle and/or its affiliates. +# Copyright (c) 2025, Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import argparse +import typing +import drgn from drgn import Program from drgn.helpers.common import escape_ascii_string from drgn.helpers.linux.cpumask import for_each_online_cpu from drgn.helpers.linux.percpu import per_cpu from drgn_tools.bt import bt +from drgn_tools.bt import bt_has_any from drgn_tools.corelens import CorelensModule +from drgn_tools.table import print_table from drgn_tools.task import task_lastrun2now from drgn_tools.util import timestamp_str @@ -17,7 +21,7 @@ def scan_lockup( prog: Program, min_run_time_seconds: int = 1, skip_swapper: bool = True ) -> None: """ - Scan potential lockups on cpus. + Scan potential lockups on cpus and tasks waiting for RCU. :param prog: drgn program :param min_run_time_seconds: int @@ -47,13 +51,52 @@ def scan_lockup( print() nr_processes += 1 - print( - f"We found {nr_processes} processes running more than {min_run_time_seconds} seconds." - ) + print(f"We found {nr_processes} processes running more than {min_run_time_seconds} seconds") + + dump_tasks_waiting_rcu_gp(prog, min_run_time_seconds) + + +def tasks_waiting_rcu_gp(prog: Program) -> typing.List[typing.Tuple[drgn.Object, drgn.StackFrame]]: + """ + Detects tasks waiting RCU grace period + + :param prog: drgn program + """ + rcu_gp_fn = ["percpu_ref_switch_to_atomic_sync", "__wait_rcu_gp"] + return bt_has_any(prog, rcu_gp_fn) + + +def dump_tasks_waiting_rcu_gp(prog: Program, min_run_time_seconds: int) -> None: + """ + Prints tasks waiting on rcu grace period with details + + :param prog: drgn program + :param min_run_time_seconds: int + """ + tasks_waiting = tasks_waiting_rcu_gp(prog) + output = [["TASK", "NAME", "PID", "PENDING_TIME"]] + tasks_pids = set() # remove duplicates + if tasks_waiting: + for t, _ in tasks_waiting: + pending_time = timestamp_str(task_lastrun2now(t)) + pid = t.pid.value_() + if pid not in tasks_pids and task_lastrun2now(t) > min_run_time_seconds * 1e9: + output.append( + [ + hex(t.value_()), + escape_ascii_string(t.comm.string_()), + pid, + pending_time + ] + ) + tasks_pids.add(pid) + print() + print(f"We found below tasks waiting for rcu grace period over {min_run_time_seconds} seconds:") + print_table(output) class LockUp(CorelensModule): - """Print tasks which have been on-cpu for too long""" + """Print tasks which have been on-cpu for too long (possible RCU blockers) and tasks waiting RCU grace period if any""" name = "lockup" From 674b22cbaf6c9e0a5c3bcf5d1f7e82ea3d5d1bd1 Mon Sep 17 00:00:00 2001 From: Richard Li Date: Mon, 20 Oct 2025 22:15:31 +0000 Subject: [PATCH 2/2] lockup: detect tasks waiting for spinlocks and fsnotify with code refactored Orabug: 38559536 Signed-off-by: Richard Li --- drgn_tools/lockup.py | 139 +++++++++++++++++++++++++++++++------------ 1 file changed, 100 insertions(+), 39 deletions(-) diff --git a/drgn_tools/lockup.py b/drgn_tools/lockup.py index e0cfa56d..6475bc51 100644 --- a/drgn_tools/lockup.py +++ b/drgn_tools/lockup.py @@ -2,6 +2,7 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import argparse import typing +from typing import Callable import drgn from drgn import Program @@ -17,6 +18,103 @@ from drgn_tools.util import timestamp_str +def dump_tasks_waiting_on_event( + prog: Program, + min_run_time_seconds: int, + tasks_waiting_func: Callable, + wait_desc: str, +) -> None: + """ + Prints tasks waiting on a specific event with details. + + :param prog: drgn program + :param min_run_time_seconds: int + :param tasks_waiting_func: function to fetch waiting tasks + :param wait_desc: description for the wait type (for printing) + """ + tasks_waiting = tasks_waiting_func(prog) + output = [["TASK", "NAME", "PID", "PENDING_TIME"]] + tasks_pids = set() + if tasks_waiting: + for t, _ in tasks_waiting: + pending_time = timestamp_str(task_lastrun2now(t)) + pid = t.pid.value_() + if pid not in tasks_pids and task_lastrun2now(t) > min_run_time_seconds * 1e9: + output.append( + [ + hex(t.value_()), + escape_ascii_string(t.comm.string_()), + pid, + pending_time, + ] + ) + tasks_pids.add(pid) + print() + print(f"We found below tasks waiting for {wait_desc} over {min_run_time_seconds} seconds:") + print_table(output) + + +def tasks_waiting_rcu_gp(prog: Program) -> typing.List[typing.Tuple[drgn.Object, drgn.StackFrame]]: + """ + Detects tasks waiting RCU grace period + + :param prog: drgn program + """ + rcu_gp_fn = ["percpu_ref_switch_to_atomic_sync", "__wait_rcu_gp"] + return bt_has_any(prog, rcu_gp_fn) + + +def tasks_waiting_spinlock(prog: Program) -> typing.List[typing.Tuple[drgn.Object, drgn.StackFrame]]: + """ + Detects tasks waiting on spinlocks + + :param prog: drgn program + """ + spinlock_spinner_fn = [ + "__pv_queued_spin_lock_slowpath", + "native_queued_spin_lock_slowpath", + "queued_spin_lock_slowpath", + ] + return bt_has_any(prog, spinlock_spinner_fn) + + +def tasks_waiting_fsnotify(prog: Program) -> typing.List[typing.Tuple[drgn.Object, drgn.StackFrame]]: + """ + Detects tasks waiting on fsnotify + + :param prog: drgn program + """ + spinlock_spinner_fn = ["__fsnotify_update_child_dentry_flags"] + return bt_has_any(prog, spinlock_spinner_fn) + + +def dump_tasks_waiting_rcu_gp(prog, min_run_time_seconds: int) -> None: + dump_tasks_waiting_on_event( + prog, + min_run_time_seconds, + tasks_waiting_rcu_gp, + "rcu grace period", + ) + + +def dump_tasks_waiting_spinlock(prog, min_run_time_seconds: int) -> None: + dump_tasks_waiting_on_event( + prog, + min_run_time_seconds, + tasks_waiting_spinlock, + "spinlock", + ) + + +def dump_tasks_waiting_fsnotify(prog, min_run_time_seconds: int) -> None: + dump_tasks_waiting_on_event( + prog, + min_run_time_seconds, + tasks_waiting_fsnotify, + "fsnotify", + ) + + def scan_lockup( prog: Program, min_run_time_seconds: int = 1, skip_swapper: bool = True ) -> None: @@ -54,45 +152,8 @@ def scan_lockup( print(f"We found {nr_processes} processes running more than {min_run_time_seconds} seconds") dump_tasks_waiting_rcu_gp(prog, min_run_time_seconds) - - -def tasks_waiting_rcu_gp(prog: Program) -> typing.List[typing.Tuple[drgn.Object, drgn.StackFrame]]: - """ - Detects tasks waiting RCU grace period - - :param prog: drgn program - """ - rcu_gp_fn = ["percpu_ref_switch_to_atomic_sync", "__wait_rcu_gp"] - return bt_has_any(prog, rcu_gp_fn) - - -def dump_tasks_waiting_rcu_gp(prog: Program, min_run_time_seconds: int) -> None: - """ - Prints tasks waiting on rcu grace period with details - - :param prog: drgn program - :param min_run_time_seconds: int - """ - tasks_waiting = tasks_waiting_rcu_gp(prog) - output = [["TASK", "NAME", "PID", "PENDING_TIME"]] - tasks_pids = set() # remove duplicates - if tasks_waiting: - for t, _ in tasks_waiting: - pending_time = timestamp_str(task_lastrun2now(t)) - pid = t.pid.value_() - if pid not in tasks_pids and task_lastrun2now(t) > min_run_time_seconds * 1e9: - output.append( - [ - hex(t.value_()), - escape_ascii_string(t.comm.string_()), - pid, - pending_time - ] - ) - tasks_pids.add(pid) - print() - print(f"We found below tasks waiting for rcu grace period over {min_run_time_seconds} seconds:") - print_table(output) + dump_tasks_waiting_spinlock(prog, min_run_time_seconds) + dump_tasks_waiting_fsnotify(prog, min_run_time_seconds) class LockUp(CorelensModule):