Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 110 additions & 6 deletions drgn_tools/lockup.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,125 @@
# Copyright (c) 2024, Oracle and/or its affiliates.
# Copyright (c) 2025, Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import argparse
import typing
from typing import Callable

import drgn
from drgn import Program
from drgn.helpers.common import escape_ascii_string
from drgn.helpers.linux.cpumask import for_each_online_cpu
from drgn.helpers.linux.percpu import per_cpu

from drgn_tools.bt import bt
from drgn_tools.bt import bt_has_any
from drgn_tools.corelens import CorelensModule
from drgn_tools.table import print_table
from drgn_tools.task import task_lastrun2now
from drgn_tools.util import timestamp_str


def dump_tasks_waiting_on_event(
    prog: Program,
    min_run_time_seconds: int,
    tasks_waiting_func: Callable,
    wait_desc: str,
) -> None:
    """
    Print a table of tasks that have been waiting on a specific event.

    ``tasks_waiting_func`` is called with ``prog`` and is expected to return
    an iterable of ``(task, frame)`` pairs; only the task is used here. A
    task gets one row per PID, and only when its ``task_lastrun2now`` value
    exceeds ``min_run_time_seconds``. Nothing is printed when the lookup
    returns no tasks at all.

    :param prog: drgn program
    :param min_run_time_seconds: threshold, in seconds, a task must exceed to
      be listed
    :param tasks_waiting_func: function to fetch waiting tasks
    :param wait_desc: description for the wait type (for printing)
    """
    tasks_waiting = tasks_waiting_func(prog)
    if not tasks_waiting:
        return

    # The threshold is scaled by 1e9 before being compared with
    # task_lastrun2now(), so that helper presumably reports nanoseconds.
    min_pending = min_run_time_seconds * 1e9

    output = [["TASK", "NAME", "PID", "PENDING_TIME"]]
    tasks_pids = set()
    for t, _ in tasks_waiting:
        pid = t.pid.value_()
        if pid in tasks_pids:
            # Already reported this PID; keep a single row per task.
            continue

        # Read the value once so the filter decision and the PENDING_TIME
        # column are guaranteed to come from the same reading.
        pending = task_lastrun2now(t)
        if pending <= min_pending:
            continue

        output.append(
            [
                hex(t.value_()),
                escape_ascii_string(t.comm.string_()),
                pid,
                timestamp_str(pending),
            ]
        )
        tasks_pids.add(pid)

    print()
    print(
        f"We found below tasks waiting for {wait_desc} over {min_run_time_seconds} seconds:"
    )
    print_table(output)


def tasks_waiting_rcu_gp(
    prog: Program,
) -> typing.List[typing.Tuple[drgn.Object, drgn.StackFrame]]:
    """
    Look up tasks waiting for an RCU grace period.

    Delegates to ``bt_has_any`` with the function names used here to
    recognise an RCU grace-period wait.

    :param prog: drgn program
    :returns: the ``bt_has_any`` result for those function names
    """
    return bt_has_any(
        prog,
        ["percpu_ref_switch_to_atomic_sync", "__wait_rcu_gp"],
    )


def tasks_waiting_spinlock(
    prog: Program,
) -> typing.List[typing.Tuple[drgn.Object, drgn.StackFrame]]:
    """
    Look up tasks waiting on spinlocks.

    Delegates to ``bt_has_any`` with the queued-spinlock slowpath function
    names listed below.

    :param prog: drgn program
    :returns: the ``bt_has_any`` result for those function names
    """
    return bt_has_any(
        prog,
        [
            "__pv_queued_spin_lock_slowpath",
            "native_queued_spin_lock_slowpath",
            "queued_spin_lock_slowpath",
        ],
    )


def tasks_waiting_fsnotify(
    prog: Program,
) -> typing.List[typing.Tuple[drgn.Object, drgn.StackFrame]]:
    """
    Look up tasks waiting on fsnotify.

    Delegates to ``bt_has_any`` with the single function name
    ``__fsnotify_update_child_dentry_flags``.

    :param prog: drgn program
    :returns: the ``bt_has_any`` result for that function name
    """
    return bt_has_any(prog, ["__fsnotify_update_child_dentry_flags"])


def dump_tasks_waiting_rcu_gp(prog, min_run_time_seconds: int) -> None:
    """
    Print the tasks found by ``tasks_waiting_rcu_gp``.

    Thin wrapper around ``dump_tasks_waiting_on_event``.

    :param prog: drgn program
    :param min_run_time_seconds: threshold forwarded unchanged
    """
    dump_tasks_waiting_on_event(
        prog=prog,
        min_run_time_seconds=min_run_time_seconds,
        tasks_waiting_func=tasks_waiting_rcu_gp,
        wait_desc="rcu grace period",
    )


def dump_tasks_waiting_spinlock(prog, min_run_time_seconds: int) -> None:
    """
    Print the tasks found by ``tasks_waiting_spinlock``.

    Thin wrapper around ``dump_tasks_waiting_on_event``.

    :param prog: drgn program
    :param min_run_time_seconds: threshold forwarded unchanged
    """
    dump_tasks_waiting_on_event(
        prog=prog,
        min_run_time_seconds=min_run_time_seconds,
        tasks_waiting_func=tasks_waiting_spinlock,
        wait_desc="spinlock",
    )


def dump_tasks_waiting_fsnotify(prog, min_run_time_seconds: int) -> None:
    """
    Print the tasks found by ``tasks_waiting_fsnotify``.

    Thin wrapper around ``dump_tasks_waiting_on_event``.

    :param prog: drgn program
    :param min_run_time_seconds: threshold forwarded unchanged
    """
    dump_tasks_waiting_on_event(
        prog=prog,
        min_run_time_seconds=min_run_time_seconds,
        tasks_waiting_func=tasks_waiting_fsnotify,
        wait_desc="fsnotify",
    )


def scan_lockup(
prog: Program, min_run_time_seconds: int = 1, skip_swapper: bool = True
) -> None:
"""
Scan potential lockups on cpus.
Scan potential lockups on cpus and tasks waiting for RCU.

:param prog: drgn program
:param min_run_time_seconds: int
Expand Down Expand Up @@ -47,13 +149,15 @@ def scan_lockup(
print()
nr_processes += 1

print(
f"We found {nr_processes} processes running more than {min_run_time_seconds} seconds."
)
print(f"We found {nr_processes} processes running more than {min_run_time_seconds} seconds")

dump_tasks_waiting_rcu_gp(prog, min_run_time_seconds)
dump_tasks_waiting_spinlock(prog, min_run_time_seconds)
dump_tasks_waiting_fsnotify(prog, min_run_time_seconds)


class LockUp(CorelensModule):
"""Print tasks which have been on-cpu for too long"""
"""Print tasks which have been on-cpu for too long (possible RCU blockers) and tasks waiting RCU grace period if any"""

name = "lockup"

Expand Down