Skip to content

Commit e70cb51

Browse files
committed
mm: introduce BPF struct ops for OOM handling
Introduce a bpf struct ops for implementing custom OOM handling policies. It's possible to load one bpf_oom_ops for the system and one bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the cgroup tree is traversed from the OOM'ing memcg up to the root and the corresponding BPF OOM handlers are executed until some memory is freed. If no memory is freed, the kernel OOM killer is invoked. The struct ops provides the bpf_handle_out_of_memory() callback, which is expected to return 1 if it was able to free some memory and 0 otherwise. If 1 is returned, the kernel also checks the bpf_memory_freed field of the oom_control structure, which is expected to be set by kfuncs suitable for releasing memory. If both are set, the OOM is considered handled; otherwise the next OOM handler in the chain (e.g. the BPF OOM attached to the parent cgroup or the in-kernel OOM killer) is executed. The bpf_handle_out_of_memory() callback program is sleepable to enable using iterators, e.g. cgroup iterators. The callback receives struct oom_control as an argument, so it can determine the scope of the OOM event: whether this is a memcg-wide or system-wide OOM. The callback is executed just before the kernel victim task selection algorithm, so all heuristics and sysctls like panic_on_oom and sysctl_oom_kill_allocating_task are respected. BPF OOM struct ops provides the handle_cgroup_offline() callback, which is useful for releasing the struct ops when the corresponding cgroup is deleted. The struct ops also has the name field, which allows defining a custom name for the implemented policy. It's printed in the OOM report in the oom_policy=<policy> format. "default" is printed if bpf is not used or the policy name is not specified. 
[ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 oom_policy=bpf_test_policy [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc torvalds#102 PREEMPT(full) [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 [ 112.698167] Call Trace: [ 112.698177] <TASK> [ 112.698182] dump_stack_lvl+0x4d/0x70 [ 112.698192] dump_header+0x59/0x1c6 [ 112.698199] oom_kill_process.cold+0x8/0xef [ 112.698206] bpf_oom_kill_process+0x59/0xb0 [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 [ 112.698240] bpf_handle_oom+0x11a/0x1e0 [ 112.698250] out_of_memory+0xab/0x5c0 [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 [ 112.698274] try_charge_memcg+0x4b5/0x7e0 [ 112.698288] charge_memcg+0x2f/0xc0 [ 112.698293] __mem_cgroup_charge+0x30/0xc0 [ 112.698299] do_anonymous_page+0x40f/0xa50 [ 112.698311] __handle_mm_fault+0xbba/0x1140 [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 [ 112.698335] handle_mm_fault+0xe6/0x370 [ 112.698343] do_user_addr_fault+0x211/0x6a0 [ 112.698354] exc_page_fault+0x75/0x1d0 [ 112.698363] asm_exc_page_fault+0x26/0x30 [ 112.698366] RIP: 0033:0x7fa97236db00 Signed-off-by: Roman Gushchin <[email protected]>
1 parent 60ff610 commit e70cb51

File tree

7 files changed

+357
-2
lines changed

7 files changed

+357
-2
lines changed

include/linux/bpf_oom.h

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/* SPDX-License-Identifier: GPL-2.0+ */
2+
3+
#ifndef __BPF_OOM_H
4+
#define __BPF_OOM_H
5+
6+
struct oom_control;
7+
8+
#define BPF_OOM_NAME_MAX_LEN 64
9+
10+
struct bpf_oom_ctx {
11+
/*
12+
* If bpf_oom_ops is attached to a cgroup, id of this cgroup.
13+
* 0 otherwise.
14+
*/
15+
u64 cgroup_id;
16+
};
17+
18+
struct bpf_oom_ops {
19+
/**
20+
* @handle_out_of_memory: Out of memory bpf handler, called before
21+
* the in-kernel OOM killer.
22+
* @oc: OOM control structure
23+
* @ctx: Execution context
24+
*
25+
* Should return 1 if some memory was freed up, otherwise
26+
* the in-kernel OOM killer is invoked.
27+
*/
28+
int (*handle_out_of_memory)(struct oom_control *oc, struct bpf_oom_ctx *ctx);
29+
30+
/**
31+
* @handle_cgroup_offline: Cgroup offline callback
32+
* @cgroup_id: Id of deleted cgroup
33+
*
34+
* Called if the cgroup with the attached bpf_oom_ops is deleted.
35+
*/
36+
void (*handle_cgroup_offline)(u64 cgroup_id, struct bpf_oom_ctx *ctx);
37+
38+
/**
39+
* @name: BPF OOM policy name
40+
*/
41+
char name[BPF_OOM_NAME_MAX_LEN];
42+
};
43+
44+
#ifdef CONFIG_BPF_SYSCALL
45+
/**
46+
* @bpf_handle_oom: handle out of memory condition using bpf
47+
* @oc: OOM control structure
48+
*
49+
* Returns true if some memory was freed.
50+
*/
51+
bool bpf_handle_oom(struct oom_control *oc);
52+
53+
54+
/**
55+
* @bpf_oom_memcg_offline: handle memcg offlining
56+
* @memcg: Memory cgroup is offlined
57+
*
58+
* When a memory cgroup is about to be deleted and there is an
59+
* attached BPF OOM structure, it has to be detached.
60+
*/
61+
void bpf_oom_memcg_offline(struct mem_cgroup *memcg);
62+
63+
#else /* CONFIG_BPF_SYSCALL */
64+
static inline bool bpf_handle_oom(struct oom_control *oc)
65+
{
66+
return false;
67+
}
68+
69+
static inline void bpf_oom_memcg_offline(struct mem_cgroup *memcg) {}
70+
71+
#endif /* CONFIG_BPF_SYSCALL */
72+
73+
#endif /* __BPF_OOM_H */

include/linux/memcontrol.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ struct obj_cgroup;
2929
struct page;
3030
struct mm_struct;
3131
struct kmem_cache;
32+
struct bpf_oom_ops;
3233

3334
/* Cgroup-specific page state, on top of universal node page state */
3435
enum memcg_stat_item {
@@ -226,6 +227,10 @@ struct mem_cgroup {
226227
*/
227228
bool oom_group;
228229

230+
#ifdef CONFIG_BPF_SYSCALL
231+
struct bpf_oom_ops *bpf_oom;
232+
#endif
233+
229234
int swappiness;
230235

231236
/* memory.events and memory.events.local */

include/linux/oom.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ struct oom_control {
5151

5252
/* Used to print the constraint info. */
5353
enum oom_constraint constraint;
54+
55+
#ifdef CONFIG_BPF_SYSCALL
56+
/* Used by the bpf oom implementation to mark the forward progress */
57+
bool bpf_memory_freed;
58+
59+
/* Policy name */
60+
const char *bpf_policy_name;
61+
#endif
5462
};
5563

5664
extern struct mutex oom_lock;

mm/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
105105
ifdef CONFIG_SWAP
106106
obj-$(CONFIG_MEMCG) += swap_cgroup.o
107107
endif
108+
ifdef CONFIG_BPF_SYSCALL
109+
obj-y += bpf_oom.o
110+
endif
108111
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
109112
obj-$(CONFIG_GUP_TEST) += gup_test.o
110113
obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o

mm/bpf_oom.c

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
// SPDX-License-Identifier: GPL-2.0-or-later
2+
/*
3+
* BPF-driven OOM killer customization
4+
*
5+
* Author: Roman Gushchin <[email protected]>
6+
*/
7+
8+
#include <linux/bpf.h>
9+
#include <linux/oom.h>
10+
#include <linux/bpf_oom.h>
11+
#include <linux/srcu.h>
12+
#include <linux/cgroup.h>
13+
#include <linux/memcontrol.h>
14+
15+
DEFINE_STATIC_SRCU(bpf_oom_srcu);
16+
static struct bpf_oom_ops *system_bpf_oom;
17+
18+
static int bpf_ops_handle_oom(struct bpf_oom_ops *bpf_oom_ops,
19+
struct mem_cgroup *memcg,
20+
struct oom_control *oc)
21+
{
22+
struct bpf_oom_ctx exec_ctx;
23+
int ret;
24+
25+
if (memcg)
26+
exec_ctx.cgroup_id = cgroup_id(memcg->css.cgroup);
27+
else
28+
exec_ctx.cgroup_id = 0;
29+
30+
oc->bpf_policy_name = &bpf_oom_ops->name[0];
31+
oc->bpf_memory_freed = false;
32+
ret = bpf_oom_ops->handle_out_of_memory(oc, &exec_ctx);
33+
oc->bpf_policy_name = NULL;
34+
35+
return ret;
36+
}
37+
38+
bool bpf_handle_oom(struct oom_control *oc)
39+
{
40+
struct bpf_oom_ops *bpf_oom_ops = NULL;
41+
struct mem_cgroup *memcg;
42+
int idx, ret = 0;
43+
44+
/* All bpf_oom_ops structures are protected using bpf_oom_srcu */
45+
idx = srcu_read_lock(&bpf_oom_srcu);
46+
47+
/* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */
48+
for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) {
49+
bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
50+
if (!bpf_oom_ops)
51+
continue;
52+
53+
/* Call BPF OOM handler */
54+
ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc);
55+
if (ret && oc->bpf_memory_freed)
56+
goto exit;
57+
}
58+
/*
59+
* System-wide OOM or per-memcg BPF OOM handler wasn't successful?
60+
* Try system_bpf_oom.
61+
*/
62+
bpf_oom_ops = READ_ONCE(system_bpf_oom);
63+
if (!bpf_oom_ops)
64+
goto exit;
65+
66+
/* Call BPF OOM handler */
67+
ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc);
68+
exit:
69+
srcu_read_unlock(&bpf_oom_srcu, idx);
70+
return ret && oc->bpf_memory_freed;
71+
}
72+
73+
static int __handle_out_of_memory(struct oom_control *oc,
74+
struct bpf_oom_ctx *exec_ctx)
75+
{
76+
return 0;
77+
}
78+
79+
static void __handle_cgroup_offline(u64 cgroup_id, struct bpf_oom_ctx *exec_ctx)
80+
{
81+
}
82+
83+
static struct bpf_oom_ops __bpf_oom_ops = {
84+
.handle_out_of_memory = __handle_out_of_memory,
85+
.handle_cgroup_offline = __handle_cgroup_offline,
86+
};
87+
88+
static const struct bpf_func_proto *
89+
bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
90+
{
91+
return tracing_prog_func_proto(func_id, prog);
92+
}
93+
94+
static bool bpf_oom_ops_is_valid_access(int off, int size,
95+
enum bpf_access_type type,
96+
const struct bpf_prog *prog,
97+
struct bpf_insn_access_aux *info)
98+
{
99+
return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
100+
}
101+
102+
static const struct bpf_verifier_ops bpf_oom_verifier_ops = {
103+
.get_func_proto = bpf_oom_func_proto,
104+
.is_valid_access = bpf_oom_ops_is_valid_access,
105+
};
106+
107+
static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
108+
{
109+
struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
110+
struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
111+
struct bpf_oom_ops *bpf_oom_ops = kdata;
112+
struct mem_cgroup *memcg = NULL;
113+
int err = 0;
114+
115+
if (ops_link->cgroup_id) {
116+
/* Attach to a memory cgroup? */
117+
memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
118+
if (IS_ERR_OR_NULL(memcg))
119+
return PTR_ERR(memcg);
120+
bpf_oom_ops_ptr = &memcg->bpf_oom;
121+
} else {
122+
/* System-wide OOM handler */
123+
bpf_oom_ops_ptr = &system_bpf_oom;
124+
}
125+
126+
/* Another struct ops attached? */
127+
if (READ_ONCE(*bpf_oom_ops_ptr)) {
128+
err = -EBUSY;
129+
goto exit;
130+
}
131+
132+
/* Expose bpf_oom_ops structure */
133+
WRITE_ONCE(*bpf_oom_ops_ptr, bpf_oom_ops);
134+
exit:
135+
mem_cgroup_put(memcg);
136+
return err;
137+
}
138+
139+
static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link)
140+
{
141+
struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
142+
struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
143+
struct bpf_oom_ops *bpf_oom_ops = kdata;
144+
struct mem_cgroup *memcg = NULL;
145+
146+
if (ops_link->cgroup_id) {
147+
/* Detach from a memory cgroup? */
148+
memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
149+
if (IS_ERR_OR_NULL(memcg))
150+
goto exit;
151+
bpf_oom_ops_ptr = &memcg->bpf_oom;
152+
} else {
153+
/* System-wide OOM handler */
154+
bpf_oom_ops_ptr = &system_bpf_oom;
155+
}
156+
157+
/* Hide bpf_oom_ops from new callers */
158+
if (!WARN_ON(READ_ONCE(*bpf_oom_ops_ptr) != bpf_oom_ops))
159+
WRITE_ONCE(*bpf_oom_ops_ptr, NULL);
160+
161+
mem_cgroup_put(memcg);
162+
163+
exit:
164+
/* Release bpf_oom_ops after a srcu grace period */
165+
synchronize_srcu(&bpf_oom_srcu);
166+
}
167+
168+
void bpf_oom_memcg_offline(struct mem_cgroup *memcg)
169+
{
170+
struct bpf_oom_ops *bpf_oom_ops;
171+
struct bpf_oom_ctx exec_ctx;
172+
u64 cgrp_id;
173+
int idx;
174+
175+
/* All bpf_oom_ops structures are protected using bpf_oom_srcu */
176+
idx = srcu_read_lock(&bpf_oom_srcu);
177+
178+
bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
179+
WRITE_ONCE(memcg->bpf_oom, NULL);
180+
181+
if (bpf_oom_ops && bpf_oom_ops->handle_cgroup_offline) {
182+
cgrp_id = cgroup_id(memcg->css.cgroup);
183+
exec_ctx.cgroup_id = cgrp_id;
184+
bpf_oom_ops->handle_cgroup_offline(cgrp_id, &exec_ctx);
185+
}
186+
187+
srcu_read_unlock(&bpf_oom_srcu, idx);
188+
}
189+
190+
static int bpf_oom_ops_check_member(const struct btf_type *t,
191+
const struct btf_member *member,
192+
const struct bpf_prog *prog)
193+
{
194+
u32 moff = __btf_member_bit_offset(t, member) / 8;
195+
196+
switch (moff) {
197+
case offsetof(struct bpf_oom_ops, handle_out_of_memory):
198+
if (!prog)
199+
return -EINVAL;
200+
break;
201+
}
202+
203+
return 0;
204+
}
205+
206+
static int bpf_oom_ops_init_member(const struct btf_type *t,
207+
const struct btf_member *member,
208+
void *kdata, const void *udata)
209+
{
210+
const struct bpf_oom_ops *uops = udata;
211+
struct bpf_oom_ops *ops = kdata;
212+
u32 moff = __btf_member_bit_offset(t, member) / 8;
213+
214+
switch (moff) {
215+
case offsetof(struct bpf_oom_ops, name):
216+
if (uops->name[0])
217+
strscpy_pad(ops->name, uops->name, sizeof(ops->name));
218+
else
219+
strscpy_pad(ops->name, "bpf_defined_policy");
220+
return 1;
221+
}
222+
return 0;
223+
}
224+
225+
static int bpf_oom_ops_init(struct btf *btf)
226+
{
227+
return 0;
228+
}
229+
230+
static struct bpf_struct_ops bpf_oom_bpf_ops = {
231+
.verifier_ops = &bpf_oom_verifier_ops,
232+
.reg = bpf_oom_ops_reg,
233+
.unreg = bpf_oom_ops_unreg,
234+
.check_member = bpf_oom_ops_check_member,
235+
.init_member = bpf_oom_ops_init_member,
236+
.init = bpf_oom_ops_init,
237+
.name = "bpf_oom_ops",
238+
.owner = THIS_MODULE,
239+
.cfi_stubs = &__bpf_oom_ops
240+
};
241+
242+
static int __init bpf_oom_struct_ops_init(void)
243+
{
244+
return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops);
245+
}
246+
late_initcall(bpf_oom_struct_ops_init);

mm/memcontrol.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
#include <linux/seq_buf.h>
6464
#include <linux/sched/isolation.h>
6565
#include <linux/kmemleak.h>
66+
#include <linux/bpf_oom.h>
6667
#include "internal.h"
6768
#include <net/sock.h>
6869
#include <net/ip.h>
@@ -3885,6 +3886,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
38853886

38863887
zswap_memcg_offline_cleanup(memcg);
38873888

3889+
bpf_oom_memcg_offline(memcg);
38883890
memcg_offline_kmem(memcg);
38893891
reparent_shrinker_deferred(memcg);
38903892
wb_memcg_offline(memcg);

0 commit comments

Comments
 (0)