Commit 9d659ae

Peter Zijlstra authored and Ingo Molnar committed
locking/mutex: Add lock handoff to avoid starvation
Implement lock handoff to avoid lock starvation.

Lock starvation is possible because mutex_lock() allows lock stealing,
where a running (or optimistic spinning) task beats the woken waiter
to the acquire.

Lock stealing is an important performance optimization because waiting
for a waiter to wake up and get runtime can take a significant time,
during which everybody would stall on the lock.

The down-side is of course that it allows for starvation.

This patch has the waiter requesting a handoff if it fails to acquire
the lock upon waking. This re-introduces some of the wait time, because
once we do a handoff we have to wait for the waiter to wake up again.

A future patch will add a round of optimistic spinning to attempt to
alleviate this penalty, but if that turns out to not be enough, we can
add a counter and only request handoff after multiple failed wakeups.

There are a few tricky implementation details:

 - accepting a handoff must only be done in the wait-loop. Since the
   handoff condition is owner == current, it can easily cause recursive
   locking trouble.

 - accepting the handoff must be careful to provide the ACQUIRE
   semantics.

 - having the HANDOFF bit set on unlock requires care, we must not
   clear the owner.

 - we must be careful to not leave HANDOFF set after we've acquired the
   lock. The tricky scenario is setting the HANDOFF bit on an unlocked
   mutex.

Tested-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Waiman Long <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent a3ea3d9 commit 9d659ae
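
For readers who have not followed the earlier owner-field rework, the HANDOFF bit only makes sense together with the way the mutex owner word is encoded. Below is a minimal illustrative sketch of that encoding; it mirrors the MUTEX_FLAG_* definitions and the __owner_task()/__owner_flags() helpers in kernel/locking/mutex.c, but it is a reconstruction for explanation, not the verbatim file contents.

/*
 * Sketch of the owner-word encoding (illustrative, not verbatim kernel
 * source): the owning task_struct pointer and two low flag bits share
 * a single atomic word, so lock, unlock and the handoff path can all
 * communicate through one cmpxchg target.
 */
struct task_struct;			/* opaque here; defined by the kernel */

#define MUTEX_FLAG_WAITERS	0x01	/* wait list non-empty, unlock must wake */
#define MUTEX_FLAG_HANDOFF	0x02	/* unlock must hand off to the top waiter */
#define MUTEX_FLAGS		0x03

static inline struct task_struct *__owner_task(unsigned long owner)
{
	/* task_struct is aligned well past 4 bytes, so the low bits are free */
	return (struct task_struct *)(owner & ~MUTEX_FLAGS);
}

static inline unsigned long __owner_flags(unsigned long owner)
{
	return owner & MUTEX_FLAGS;
}

With this layout, setting MUTEX_FLAG_HANDOFF is a single atomic OR on the same word the unlock path operates on, which is what makes the handshake in the diff below possible.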


kernel/locking/mutex.c

Lines changed: 119 additions & 23 deletions
@@ -54,8 +54,10 @@ EXPORT_SYMBOL(__mutex_init);
  * bits to store extra state.
  *
  * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
+ * Bit1 indicates unlock needs to hand the lock to the top-waiter
  */
 #define MUTEX_FLAG_WAITERS	0x01
+#define MUTEX_FLAG_HANDOFF	0x02
 
 #define MUTEX_FLAGS		0x03
 
@@ -71,20 +73,48 @@ static inline unsigned long __owner_flags(unsigned long owner)
 
 /*
  * Actual trylock that will work on any unlocked state.
+ *
+ * When setting the owner field, we must preserve the low flag bits.
+ *
+ * Be careful with @handoff, only set that in a wait-loop (where you set
+ * HANDOFF) to avoid recursive lock attempts.
  */
-static inline bool __mutex_trylock(struct mutex *lock)
+static inline bool __mutex_trylock(struct mutex *lock, const bool handoff)
 {
 	unsigned long owner, curr = (unsigned long)current;
 
 	owner = atomic_long_read(&lock->owner);
 	for (;;) { /* must loop, can race against a flag */
-		unsigned long old;
+		unsigned long old, flags = __owner_flags(owner);
+
+		if (__owner_task(owner)) {
+			if (handoff && unlikely(__owner_task(owner) == current)) {
+				/*
+				 * Provide ACQUIRE semantics for the lock-handoff.
+				 *
+				 * We cannot easily use load-acquire here, since
+				 * the actual load is a failed cmpxchg, which
+				 * doesn't imply any barriers.
+				 *
+				 * Also, this is a fairly unlikely scenario, and
+				 * this contains the cost.
+				 */
+				smp_mb(); /* ACQUIRE */
+				return true;
+			}
 
-		if (__owner_task(owner))
 			return false;
+		}
+
+		/*
+		 * We set the HANDOFF bit, we must make sure it doesn't live
+		 * past the point where we acquire it. This would be possible
+		 * if we (accidentally) set the bit on an unlocked mutex.
+		 */
+		if (handoff)
+			flags &= ~MUTEX_FLAG_HANDOFF;
 
-		old = atomic_long_cmpxchg_acquire(&lock->owner, owner,
-						  curr | __owner_flags(owner));
+		old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
 		if (old == owner)
 			return true;
 
@@ -134,6 +164,39 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag)
 	atomic_long_andnot(flag, &lock->owner);
 }
 
+static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
+{
+	return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
+}
+
+/*
+ * Give up ownership to a specific task, when @task = NULL, this is equivalent
+ * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE
+ * semantics like a regular unlock, the __mutex_trylock() provides matching
+ * ACQUIRE semantics for the handoff.
+ */
+static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
+{
+	unsigned long owner = atomic_long_read(&lock->owner);
+
+	for (;;) {
+		unsigned long old, new;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+		DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+#endif
+
+		new = (owner & MUTEX_FLAG_WAITERS);
+		new |= (unsigned long)task;
+
+		old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
+		if (old == owner)
+			break;
+
+		owner = old;
+	}
+}
+
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
 /*
  * We split the mutex lock/unlock logic into separate fastpath and
@@ -398,7 +461,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 			break;
 
 		/* Try to acquire the mutex if it is unlocked. */
-		if (__mutex_trylock(lock)) {
+		if (__mutex_trylock(lock, false)) {
 			osq_unlock(&lock->osq);
 			return true;
 		}
@@ -523,6 +586,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	struct task_struct *task = current;
 	struct mutex_waiter waiter;
 	unsigned long flags;
+	bool first = false;
 	int ret;
 
 	if (use_ww_ctx) {
@@ -534,7 +598,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
-	if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+	if (__mutex_trylock(lock, false) ||
+	    mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
 		if (use_ww_ctx) {
@@ -551,7 +616,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	/*
 	 * After waiting to acquire the wait_lock, try again.
 	 */
-	if (__mutex_trylock(lock))
+	if (__mutex_trylock(lock, false))
 		goto skip_wait;
 
 	debug_mutex_lock_common(lock, &waiter);
@@ -561,13 +626,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	list_add_tail(&waiter.list, &lock->wait_list);
 	waiter.task = task;
 
-	if (list_first_entry(&lock->wait_list, struct mutex_waiter, list) == &waiter)
+	if (__mutex_waiter_is_first(lock, &waiter))
 		__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
 
 	lock_contended(&lock->dep_map, ip);
 
 	for (;;) {
-		if (__mutex_trylock(lock))
+		if (__mutex_trylock(lock, first))
 			break;
 
 		/*
@@ -586,17 +651,20 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		}
 
 		__set_task_state(task, state);
-
-		/* didn't get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
 		schedule_preempt_disabled();
 		spin_lock_mutex(&lock->wait_lock, flags);
+
+		if (!first && __mutex_waiter_is_first(lock, &waiter)) {
+			first = true;
+			__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
+		}
 	}
 	__set_task_state(task, TASK_RUNNING);
 
 	mutex_remove_waiter(lock, &waiter, task);
 	if (likely(list_empty(&lock->wait_list)))
-		__mutex_clear_flag(lock, MUTEX_FLAG_WAITERS);
+		__mutex_clear_flag(lock, MUTEX_FLAGS);
 
 	debug_mutex_free_waiter(&waiter);
 
@@ -724,33 +792,61 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
  */
 static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
 {
+	struct task_struct *next = NULL;
 	unsigned long owner, flags;
 	WAKE_Q(wake_q);
 
 	mutex_release(&lock->dep_map, 1, ip);
 
 	/*
-	 * Release the lock before (potentially) taking the spinlock
-	 * such that other contenders can get on with things ASAP.
+	 * Release the lock before (potentially) taking the spinlock such that
+	 * other contenders can get on with things ASAP.
+	 *
+	 * Except when HANDOFF, in that case we must not clear the owner field,
+	 * but instead set it to the top waiter.
 	 */
-	owner = atomic_long_fetch_and_release(MUTEX_FLAGS, &lock->owner);
-	if (!__owner_flags(owner))
-		return;
+	owner = atomic_long_read(&lock->owner);
+	for (;;) {
+		unsigned long old;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+		DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+#endif
+
+		if (owner & MUTEX_FLAG_HANDOFF)
+			break;
+
+		old = atomic_long_cmpxchg_release(&lock->owner, owner,
+						  __owner_flags(owner));
+		if (old == owner) {
+			if (owner & MUTEX_FLAG_WAITERS)
+				break;
+
+			return;
+		}
+
+		owner = old;
+	}
 
 	spin_lock_mutex(&lock->wait_lock, flags);
 	debug_mutex_unlock(lock);
-
 	if (!list_empty(&lock->wait_list)) {
 		/* get the first entry from the wait-list: */
 		struct mutex_waiter *waiter =
-			list_entry(lock->wait_list.next,
-				   struct mutex_waiter, list);
+			list_first_entry(&lock->wait_list,
					 struct mutex_waiter, list);
+
+		next = waiter->task;
 
 		debug_mutex_wake_waiter(lock, waiter);
-		wake_q_add(&wake_q, waiter->task);
+		wake_q_add(&wake_q, next);
 	}
 
+	if (owner & MUTEX_FLAG_HANDOFF)
+		__mutex_handoff(lock, next);
+
 	spin_unlock_mutex(&lock->wait_lock, flags);
+
 	wake_up_q(&wake_q);
 }
@@ -853,7 +949,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
  */
 int __sched mutex_trylock(struct mutex *lock)
 {
-	bool locked = __mutex_trylock(lock);
+	bool locked = __mutex_trylock(lock, false);
 
 	if (locked)
 		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
