|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | +/* |
| 3 | + * BPF-driven OOM killer customization |
| 4 | + * |
| 5 | + * Author: Roman Gushchin <[email protected]> |
| 6 | + */ |
| 7 | + |
| 8 | +#include <linux/bpf.h> |
| 9 | +#include <linux/oom.h> |
| 10 | +#include <linux/bpf_oom.h> |
| 11 | +#include <linux/srcu.h> |
| 12 | +#include <linux/cgroup.h> |
| 13 | +#include <linux/memcontrol.h> |
| 14 | + |
/*
 * Currently attached system-wide BPF OOM policy, or NULL when none is
 * attached. It is consulted by bpf_handle_oom() after the per-memcg
 * policies (if any) did not resolve the OOM.
 */
static struct bpf_oom_ops *system_bpf_oom;

/*
 * SRCU domain covering every published bpf_oom_ops pointer: callers that
 * invoke a policy hold the read side, and bpf_oom_ops_unreg() waits for a
 * grace period with synchronize_srcu() after unpublishing the pointer.
 */
DEFINE_STATIC_SRCU(bpf_oom_srcu);
| 17 | + |
| 18 | +static int bpf_ops_handle_oom(struct bpf_oom_ops *bpf_oom_ops, |
| 19 | + struct mem_cgroup *memcg, |
| 20 | + struct oom_control *oc) |
| 21 | +{ |
| 22 | + struct bpf_oom_ctx exec_ctx; |
| 23 | + int ret; |
| 24 | + |
| 25 | + if (memcg) |
| 26 | + exec_ctx.cgroup_id = cgroup_id(memcg->css.cgroup); |
| 27 | + else |
| 28 | + exec_ctx.cgroup_id = 0; |
| 29 | + |
| 30 | + oc->bpf_policy_name = &bpf_oom_ops->name[0]; |
| 31 | + oc->bpf_memory_freed = false; |
| 32 | + ret = bpf_oom_ops->handle_out_of_memory(oc, &exec_ctx); |
| 33 | + oc->bpf_policy_name = NULL; |
| 34 | + |
| 35 | + return ret; |
| 36 | +} |
| 37 | + |
| 38 | +bool bpf_handle_oom(struct oom_control *oc) |
| 39 | +{ |
| 40 | + struct bpf_oom_ops *bpf_oom_ops = NULL; |
| 41 | + struct mem_cgroup *memcg; |
| 42 | + int idx, ret = 0; |
| 43 | + |
| 44 | + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */ |
| 45 | + idx = srcu_read_lock(&bpf_oom_srcu); |
| 46 | + |
| 47 | + /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */ |
| 48 | + for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) { |
| 49 | + bpf_oom_ops = READ_ONCE(memcg->bpf_oom); |
| 50 | + if (!bpf_oom_ops) |
| 51 | + continue; |
| 52 | + |
| 53 | + /* Call BPF OOM handler */ |
| 54 | + ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc); |
| 55 | + if (ret && oc->bpf_memory_freed) |
| 56 | + goto exit; |
| 57 | + } |
| 58 | + /* |
| 59 | + * System-wide OOM or per-memcg BPF OOM handler wasn't successful? |
| 60 | + * Try system_bpf_oom. |
| 61 | + */ |
| 62 | + bpf_oom_ops = READ_ONCE(system_bpf_oom); |
| 63 | + if (!bpf_oom_ops) |
| 64 | + goto exit; |
| 65 | + |
| 66 | + /* Call BPF OOM handler */ |
| 67 | + ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc); |
| 68 | +exit: |
| 69 | + srcu_read_unlock(&bpf_oom_srcu, idx); |
| 70 | + return ret && oc->bpf_memory_freed; |
| 71 | +} |
| 72 | + |
| 73 | +static int __handle_out_of_memory(struct oom_control *oc, |
| 74 | + struct bpf_oom_ctx *exec_ctx) |
| 75 | +{ |
| 76 | + return 0; |
| 77 | +} |
| 78 | + |
| 79 | +static void __handle_cgroup_offline(u64 cgroup_id, struct bpf_oom_ctx *exec_ctx) |
| 80 | +{ |
| 81 | +} |
| 82 | + |
| 83 | +static struct bpf_oom_ops __bpf_oom_ops = { |
| 84 | + .handle_out_of_memory = __handle_out_of_memory, |
| 85 | + .handle_cgroup_offline = __handle_cgroup_offline, |
| 86 | +}; |
| 87 | + |
| 88 | +static const struct bpf_func_proto * |
| 89 | +bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) |
| 90 | +{ |
| 91 | + return tracing_prog_func_proto(func_id, prog); |
| 92 | +} |
| 93 | + |
| 94 | +static bool bpf_oom_ops_is_valid_access(int off, int size, |
| 95 | + enum bpf_access_type type, |
| 96 | + const struct bpf_prog *prog, |
| 97 | + struct bpf_insn_access_aux *info) |
| 98 | +{ |
| 99 | + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); |
| 100 | +} |
| 101 | + |
| 102 | +static const struct bpf_verifier_ops bpf_oom_verifier_ops = { |
| 103 | + .get_func_proto = bpf_oom_func_proto, |
| 104 | + .is_valid_access = bpf_oom_ops_is_valid_access, |
| 105 | +}; |
| 106 | + |
| 107 | +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link) |
| 108 | +{ |
| 109 | + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link); |
| 110 | + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL; |
| 111 | + struct bpf_oom_ops *bpf_oom_ops = kdata; |
| 112 | + struct mem_cgroup *memcg = NULL; |
| 113 | + int err = 0; |
| 114 | + |
| 115 | + if (ops_link->cgroup_id) { |
| 116 | + /* Attach to a memory cgroup? */ |
| 117 | + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id); |
| 118 | + if (IS_ERR_OR_NULL(memcg)) |
| 119 | + return PTR_ERR(memcg); |
| 120 | + bpf_oom_ops_ptr = &memcg->bpf_oom; |
| 121 | + } else { |
| 122 | + /* System-wide OOM handler */ |
| 123 | + bpf_oom_ops_ptr = &system_bpf_oom; |
| 124 | + } |
| 125 | + |
| 126 | + /* Another struct ops attached? */ |
| 127 | + if (READ_ONCE(*bpf_oom_ops_ptr)) { |
| 128 | + err = -EBUSY; |
| 129 | + goto exit; |
| 130 | + } |
| 131 | + |
| 132 | + /* Expose bpf_oom_ops structure */ |
| 133 | + WRITE_ONCE(*bpf_oom_ops_ptr, bpf_oom_ops); |
| 134 | +exit: |
| 135 | + mem_cgroup_put(memcg); |
| 136 | + return err; |
| 137 | +} |
| 138 | + |
| 139 | +static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link) |
| 140 | +{ |
| 141 | + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link); |
| 142 | + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL; |
| 143 | + struct bpf_oom_ops *bpf_oom_ops = kdata; |
| 144 | + struct mem_cgroup *memcg = NULL; |
| 145 | + |
| 146 | + if (ops_link->cgroup_id) { |
| 147 | + /* Detach from a memory cgroup? */ |
| 148 | + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id); |
| 149 | + if (IS_ERR_OR_NULL(memcg)) |
| 150 | + goto exit; |
| 151 | + bpf_oom_ops_ptr = &memcg->bpf_oom; |
| 152 | + } else { |
| 153 | + /* System-wide OOM handler */ |
| 154 | + bpf_oom_ops_ptr = &system_bpf_oom; |
| 155 | + } |
| 156 | + |
| 157 | + /* Hide bpf_oom_ops from new callers */ |
| 158 | + if (!WARN_ON(READ_ONCE(*bpf_oom_ops_ptr) != bpf_oom_ops)) |
| 159 | + WRITE_ONCE(*bpf_oom_ops_ptr, NULL); |
| 160 | + |
| 161 | + mem_cgroup_put(memcg); |
| 162 | + |
| 163 | +exit: |
| 164 | + /* Release bpf_oom_ops after a srcu grace period */ |
| 165 | + synchronize_srcu(&bpf_oom_srcu); |
| 166 | +} |
| 167 | + |
| 168 | +void bpf_oom_memcg_offline(struct mem_cgroup *memcg) |
| 169 | +{ |
| 170 | + struct bpf_oom_ops *bpf_oom_ops; |
| 171 | + struct bpf_oom_ctx exec_ctx; |
| 172 | + u64 cgrp_id; |
| 173 | + int idx; |
| 174 | + |
| 175 | + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */ |
| 176 | + idx = srcu_read_lock(&bpf_oom_srcu); |
| 177 | + |
| 178 | + bpf_oom_ops = READ_ONCE(memcg->bpf_oom); |
| 179 | + WRITE_ONCE(memcg->bpf_oom, NULL); |
| 180 | + |
| 181 | + if (bpf_oom_ops && bpf_oom_ops->handle_cgroup_offline) { |
| 182 | + cgrp_id = cgroup_id(memcg->css.cgroup); |
| 183 | + exec_ctx.cgroup_id = cgrp_id; |
| 184 | + bpf_oom_ops->handle_cgroup_offline(cgrp_id, &exec_ctx); |
| 185 | + } |
| 186 | + |
| 187 | + srcu_read_unlock(&bpf_oom_srcu, idx); |
| 188 | +} |
| 189 | + |
| 190 | +static int bpf_oom_ops_check_member(const struct btf_type *t, |
| 191 | + const struct btf_member *member, |
| 192 | + const struct bpf_prog *prog) |
| 193 | +{ |
| 194 | + u32 moff = __btf_member_bit_offset(t, member) / 8; |
| 195 | + |
| 196 | + switch (moff) { |
| 197 | + case offsetof(struct bpf_oom_ops, handle_out_of_memory): |
| 198 | + if (!prog) |
| 199 | + return -EINVAL; |
| 200 | + break; |
| 201 | + } |
| 202 | + |
| 203 | + return 0; |
| 204 | +} |
| 205 | + |
| 206 | +static int bpf_oom_ops_init_member(const struct btf_type *t, |
| 207 | + const struct btf_member *member, |
| 208 | + void *kdata, const void *udata) |
| 209 | +{ |
| 210 | + const struct bpf_oom_ops *uops = udata; |
| 211 | + struct bpf_oom_ops *ops = kdata; |
| 212 | + u32 moff = __btf_member_bit_offset(t, member) / 8; |
| 213 | + |
| 214 | + switch (moff) { |
| 215 | + case offsetof(struct bpf_oom_ops, name): |
| 216 | + if (uops->name[0]) |
| 217 | + strscpy_pad(ops->name, uops->name, sizeof(ops->name)); |
| 218 | + else |
| 219 | + strscpy_pad(ops->name, "bpf_defined_policy"); |
| 220 | + return 1; |
| 221 | + } |
| 222 | + return 0; |
| 223 | +} |
| 224 | + |
| 225 | +static int bpf_oom_ops_init(struct btf *btf) |
| 226 | +{ |
| 227 | + return 0; |
| 228 | +} |
| 229 | + |
| 230 | +static struct bpf_struct_ops bpf_oom_bpf_ops = { |
| 231 | + .verifier_ops = &bpf_oom_verifier_ops, |
| 232 | + .reg = bpf_oom_ops_reg, |
| 233 | + .unreg = bpf_oom_ops_unreg, |
| 234 | + .check_member = bpf_oom_ops_check_member, |
| 235 | + .init_member = bpf_oom_ops_init_member, |
| 236 | + .init = bpf_oom_ops_init, |
| 237 | + .name = "bpf_oom_ops", |
| 238 | + .owner = THIS_MODULE, |
| 239 | + .cfi_stubs = &__bpf_oom_ops |
| 240 | +}; |
| 241 | + |
| 242 | +static int __init bpf_oom_struct_ops_init(void) |
| 243 | +{ |
| 244 | + return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops); |
| 245 | +} |
| 246 | +late_initcall(bpf_oom_struct_ops_init); |
0 commit comments