概述
socket filter,在BPF中的类型为BPF_PROG_TYPE_SOCKET_FILTER,顾名思义,实现的是socket的过滤器。
本文会分析BPF_PROG_TYPE_SOCKET_FILTER类型程序的实现原理,一直到埋点函数。
内核中有示例代码,位置在samples/bpf/sock_example.c、samples/bpf/sockex1_kern.c等。
一般会将socket filter程序的段名定义成SEC("socketxxx")
下文的代码分析,基于5.15.99版本的内核
prog加载
这里通过samples/bpf/sock_example.c学习。
一些注释和文件头
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
/* eBPF example program: * - creates arraymap in kernel with key 4 bytes and value 8 bytes * * - loads eBPF program: * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)]; * *(u32*)(fp - 4) = r0; * // assuming packet is IPv4, lookup ip->proto in a map * value = bpf_map_lookup_elem(map_fd, fp - 4); * if (value) * (*(u64*)value) += 1; * * - attaches this program to loopback interface "lo" raw socket * * - every second user space reads map[tcp], map[udp], map[icmp] to see * how many packets of given protocol were seen on "lo" */#include <stdio.h>#include <unistd.h>#include <assert.h>#include <linux/bpf.h>#include <string.h>#include <stdlib.h>#include <errno.h>#include <sys/socket.h>#include <arpa/inet.h>#include <linux/if_ether.h>#include <linux/ip.h>#include <stddef.h>#include <bpf/bpf.h>#include "bpf_insn.h"#include "sock_example.h" |
创建map,使用libbpf提供的用户态函数bpf_create_map(内部通过bpf()系统调用的BPF_MAP_CREATE命令在内核中创建map)
|
1
2
3
4
5
6
7
8
9
|
int sock = -1, map_fd, prog_fd, i, key;long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 256, 0);if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); goto cleanup;} |
用字节码的形式定义的BPF prog程序本体
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
struct bpf_insn prog[] = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ BPF_LD_MAP_FD(BPF_REG_1, map_fd), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(),};size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); |
这里的bpf_insn是bpf程序底层的字节码指令,可以看作一种抽象的汇编代码。各种高级库最终都要把程序转换成这种形式,再由内核解释执行,或经JIT编译成本机指令。
|
1
2
3
4
5
6
7
|
struct bpf_insn { __u8 code; /* opcode */ __u8 dst_reg:4; /* dest register */ __u8 src_reg:4; /* source register */ __s16 off; /* signed offset */ __s32 imm; /* signed immediate constant */}; |
以上实现的程序:
|
1
2
3
4
5
6
|
r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];*(u32*)(fp - 4) = r0;// assuming packet is IPv4, lookup ip->proto in a mapvalue = bpf_map_lookup_elem(map_fd, fp - 4);if (value) (*(u64*)value) += 1; |
用libbpf提供的用户态函数bpf_load_program(内部通过bpf()系统调用的BPF_PROG_LOAD命令)装载prog程序,程序类型参数为BPF_PROG_TYPE_SOCKET_FILTER,表示socket filter类型
|
1
2
3
4
5
6
|
prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt, "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);if (prog_fd < 0) { printf("failed to load prog '%s'\n", strerror(errno)); goto cleanup;} |
open_raw_sock创建一个raw_socket,调用setsockopt将bpf prog附着到这个socket上,参数为SO_ATTACH_BPF。
|
1
2
3
4
5
6
7
|
sock = open_raw_sock("lo");if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) < 0) { printf("setsockopt %s\n", strerror(errno)); goto cleanup;} |
点位跟踪
接下来重点关注setsockopt如何访问sock/filter的点位。
查找setsockopt的源码,寻找SO_ATTACH_BPF参数逻辑。在5.15.99的net/core/sock.c:1169行,找到了处理逻辑
|
1
2
3
4
5
6
7
8
9
10
11
12
|
case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; |
跳转到sk_attach_bpf函数(net/core/filter.c:1571)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
int sk_attach_bpf(u32 ufd, struct sock *sk){ struct bpf_prog *prog = __get_bpf(ufd, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; } return 0;} |
排除关于bpf的操作(一些对prog程序的操作),跟进__sk_attach_prog。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk){ //创建socket_filter的对象 // struct sk_filter { // refcount_t refcnt; // struct rcu_head rcu; // struct bpf_prog *prog; // }; struct sk_filter *fp, *old_fp; fp = kmalloc(sizeof(*fp), GFP_KERNEL); if (!fp) return -ENOMEM; fp->prog = prog; // 为fp sk_filter对象分配一个socket的引用,如果失败,释放fp空间 if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } refcount_set(&fp->refcnt, 1); // 获取原先socket/filter过滤器 old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); // 将sk->sk_filter的值变为我们新分配的fp rcu_assign_pointer(sk->sk_filter, fp); // 如果有淘汰下来的旧prog,需要对空间进行清理 if (old_fp) sk_filter_uncharge(sk, old_fp); return 0;} |
经过上述代码(见注释)的操作后,sk->sk_filter->prog就指向了我们加载的prog对象。
sk_filter
查找sk_filter代码,寻找调用函数。可以在很多函数中找到踪迹,在include/linux/filter.h中找到函数的原型。
|
1
2
3
4
5
|
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);static inline int sk_filter(struct sock *sk, struct sk_buff *skb){ return sk_filter_trim_cap(sk, skb, 1);} |
sk_filter是封装好的sk->sk_filter调用原型,也有其他代码通过获取sk->sk_filter或者直接调用sk_filter_trim_cap来进行SOCKET_FILTER程序的运行。
sk_filter_trim_cap
跟进sk_filter_trim_cap函数,net/core/filter.c。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
/ * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap){ int err; struct sk_filter *filter; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ // 检查SKB是否分配PF_MEMALLOC标志位 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) return err; //lsm框架hook点 err = security_sock_rcv_skb(sk, skb); if (err) return err; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { struct sock *save_sk = skb->sk; unsigned int pkt_len; skb->sk = sk; pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); return err;}EXPORT_SYMBOL(sk_filter_trim_cap); |
PF_MEMALLOC含义:
当前进程有很多可以释放的内存,如果能分配一点紧急内存给当前进程,那么当前进程可以返回更多的内存给系统。非内存管理子系统不应该使用这个标记,除非这次分配保证会释放更大的内存给系统。如果每个子系统都滥用这个标记,可能会耗尽内存管理子系统的保留内存。
程序首先通过skb_pfmemalloc检查 skb 是否是从pfmemalloc保留内存(紧急内存储备)中分配的,如果是的话,只有设置了 SOCK_MEMALLOC 标志的 socket 才能使用它,否则就返回 -ENOMEM 并增加统计计数器 LINUX_MIB_PFMEMALLOCDROP。这是为了防止内存不足的情况下,非紧急的 socket 占用有限的内存资源。
下一步,调用 BPF_CGROUP_RUN_PROG_INET_INGRESS() 函数,执行 cgroup 的 ingress hook上的 eBPF 程序,如果返回err,就return err。这是为了实现 cgroup 的网络隔离和限制功能。
这里如果开启了CGROUP_BPF的CGROUP_INET_INGRESS点,则调用__cgroup_bpf_run_filter_skb函数,执行CGROUP的filter程序。若没有开启,返回0值,继续执行代码。
cgroup细节之后讨论。
|
1
2
3
4
5
6
7
8
9
10
|
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) ({ int __ret = 0; if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) __ret = __cgroup_bpf_run_filter_skb(sk, skb, CGROUP_INET_INGRESS); __ret; }) |
调用 security_sock_rcv_skb 函数,这是LSM的预留hook点,检查 socket 是否有权限接收 skb。
接下来,通过rcu_read_lock进入RCU读侧临界区,保证在使用期间 sk_filter 不会被释放,再用rcu_dereference从 sk 中获取 sk_filter 结构体指针。
取出sk->sk_filter后,先把skb->sk保存到save_sk,再将skb->sk设置为当前传入的socket,然后调用bpf_prog_run_save_cb执行bpf程序,执行完毕后把skb->sk恢复为之前保存的值。
|
1
2
3
4
5
6
7
8
9
10
11
12
|
rcu_read_lock();filter = rcu_dereference(sk->sk_filter);if (filter) { struct sock *save_sk = skb->sk; unsigned int pkt_len; skb->sk = sk; pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;}rcu_read_unlock(); |
如果返回的长度不为 0,就调用 pskb_trim 函数,将 skb 的数据部分裁剪到 cap 和返回的长度中的较大值,如果裁剪失败,就返回错误码;如果返回的长度为 0,就将错误码设置为 -EPERM,表示要丢弃 packet。
bpf_prog_run_save_cb
bpf prog执行的细节可以简单看一下。
其中涉及细节放到BPF系统源码分析里讲。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb){ u32 res; migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); migrate_enable(); return res;}/* Must be invoked with migration disabled */static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, const void *ctx){ const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; if (unlikely(prog->cb_access)) { memcpy(cb_saved, cb_data, sizeof(cb_saved)); memset(cb_data, 0, sizeof(cb_saved)); } res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res;}static inline u8 *bpf_skb_cb(const struct sk_buff *skb){ /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with * tc, that scratch memory is mapped to qdisc_skb_cb's data area. * * In some socket filter cases, the cb unfortunately needs to be * saved/restored so that protocol specific skb->cb[] data won't * be lost. In any case, due to unpriviledged eBPF programs * attached to sockets, we need to clear the bpf_skb_cb() area * to not leak previous contents to user space. 
*/ BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN); BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != sizeof_field(struct qdisc_skb_cb, data)); return qdisc_skb_cb(skb)->data;}static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx){ return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);}static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, const void *ctx, bpf_dispatcher_fn dfunc){ u32 ret; cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; u64 start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } return ret;} |
调用链分析
可以通过搜索sk->sk_filter、sk_filter、sk_filter_trim_cap,分析filter程序的调用
SOCKET_RAW
查找函数引用,回溯一下调用链。
查找调用sk_filter的函数,定位到sock_queue_rcv_skb(net/core/sock.c)(内核中很多函数都带有注释说明)
|
1
2
3
4
5
6
7
8
9
10
11
|
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb){ int err; err = sk_filter(sk, skb); if (err) return err; return __sock_queue_rcv_skb(sk, skb);}EXPORT_SYMBOL(sock_queue_rcv_skb); |
代码中的sk_filter就是埋点函数。
代码中有非常多的调用,很多都是各种协议的适配,比如J1939,搜索后发现是汽车的CAN总线通信协议。这里我们关注net/ieee802154/socket.c。
可以看到,dgram_rcv_skb(数据报SOCKET)和raw_rcv_skb都调用了sock_queue_rcv_skb。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
|
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb){ skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS;} |
ipv4的raw_rcv_skb(net/ipv4/raw.c)逻辑也差不多,它由同文件中的raw_rcv调用
|
1
2
3
4
5
6
7
8
9
10
11
12
|
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb){ /* Charge it to the socket. */ ipv4_pktinfo_prepare(sk, skb); if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS;} |
跟进到net/ipv4/raw.c的 raw_rcv。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
int raw_rcv(struct sock *sk, struct sk_buff *skb){ // 安全策略检查 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); kfree_skb(skb); return NET_RX_DROP; } //NFHOOK埋点,重置跟踪信息 nf_reset_ct(skb); skb_push(skb, skb->data - skb_network_header(skb)); raw_rcv_skb(sk, skb); return 0;} |
跟进raw_v4_input。这个函数主要做socket_raw的RX方向sk分配。SOCKET_RAW允许多个socket同时接收同一个数据包,因此会对每个匹配的socket克隆一份skb,再交给raw_rcv处理。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
/* IP input processing comes here for RAW socket delivery. * Caller owns SKB, so we must make clones. * * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash){ ...... // 根据网络设备寻找匹配的socket net = dev_net(skb->dev); sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, iph->saddr, iph->daddr, dif, sdif); while (sk) { delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, skb->dev->ifindex, sdif)) { // clone的目的是不共享数据包,socket拥有自己的数据包 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) raw_rcv(sk, clone); } sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, iph->saddr, iph->daddr, dif, sdif); }out: read_unlock(&raw_v4_hashinfo.lock); return delivered;} |
跟进到raw_local_deliver
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
int raw_local_deliver(struct sk_buff *skb, int protocol){ int hash; struct sock *raw_sk; // 根据协议获取哈希值,从raw_v4_hashinfo链表获得socket对象 hash = protocol & (RAW_HTABLE_SIZE - 1); raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); /* If there maybe a raw socket we must check - if not we * don't care less */ if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) raw_sk = NULL; return raw_sk != NULL;} |
跟进ip_local_deliver->ip_local_deliver_finish->ip_protocol_deliver_rcu->raw_local_deliver,这就来到了网络层转发到传输层的函数入口了。ip_local_deliver负责网络层转发到上层协议。由于SOCKET_RAW跳过传输层,因此检查设置在了这,具体细节可以看网络系统文章。
[Linux内核源码分析]网络子系统
SOCKET_STREAM
net/ipv4/tcp_ipv4.c的tcp_filter函数调用了sk_filter_trim_cap
|
1
2
3
4
5
6
7
|
int tcp_filter(struct sock *sk, struct sk_buff *skb){ struct tcphdr *th = (struct tcphdr *)skb->data; return sk_filter_trim_cap(sk, skb, th->doff * 4);}EXPORT_SYMBOL(tcp_filter); |
跟进到tcp_v4_rcv(AF_INET_tcp的recv函数)
在TCP_NEW_SYN_RECV的处理逻辑以及主体函数逻辑中,都有tcp_filter的函数调用
tcp_v4_rcv函数中会对TCP_NEW_SYN_RECV状态进行处理:如果连接检查成功,则需要新建控制块来处理连接,这个新建控制块将使用TCP_SYN_RECV状态。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
/* * From tcp_input.c */int tcp_v4_rcv(struct sk_buff *skb){ struct net *net = dev_net(skb->dev); struct sk_buff *skb_to_free; const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; ...... ......... th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); // 获取等待tcp包的合适socketlookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket;process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; // 进入TCP_NEW_SYN_RECV逻辑 if (sk->sk_state == TCP_NEW_SYN_RECV) { ......... if (!tcp_filter(sk, skb)) { th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } ......... if (tcp_filter(sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; } ...... if (!sock_owned_by_user(sk)) { skb_to_free = sk->sk_rx_skb_cache; sk->sk_rx_skb_cache = NULL; ret = tcp_v4_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb)) goto discard_and_relse; skb_to_free = NULL; } ......... switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), sdif); if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; goto process; } } /* to ACK */ fallthrough; case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb); break; case TCP_TW_RST: tcp_v4_send_reset(sk, skb); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS:; } goto discard_it;} |
SOCKET_DGRAM
搜索sk_filter找到了udp_queue_rcv_one_skb函数。这个函数由udp_queue_rcv_skb调用
|
1
2
3
4
5
6
7
8
9
|
static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb){ struct sk_buff *next, *segs; int ret; if (likely(!udp_unexpected_gso(sk, skb))) return udp_queue_rcv_one_skb(sk, skb); ......} |
逆向一路向上跟进至udp_rcv,可知检测逻辑在UDP协议栈rcv处理函数内部。从udp_rcv顺序分析
|
1
2
3
4
|
int udp_rcv(struct sk_buff *skb){ return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);} |
跟进__udp4_lib_rcv
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
/* * All we need to do is get the socket, and then do a checksum. */int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto){ struct sock *sk; struct udphdr *uh; struct rtable *rt = skb_rtable(skb); __be32 saddr, daddr; struct net *net = dev_net(skb->dev); ...... sk = skb_steal_sock(skb, &refcounted); if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp_sk_rx_dst_set(sk, dst); ret = udp_unicast_rcv_skb(sk, skb, uh); if (refcounted) sock_put(sk); return ret; } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); ......} |
udp_unicast_rcv_skb和__udp4_lib_mcast_deliver都调用了udp_queue_rcv_skb,而udp_queue_rcv_skb内部又调用了udp_queue_rcv_one_skb。
最后在udp_queue_rcv_one_skb中调用了sk_filter_trim_cap。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
/* returns: * -1: error * 0: success * >0: "udp encap" protocol resubmission * * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb){ struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); /* * Charge it to the socket, dropping if the queue is full. */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; nf_reset_ct(skb); ...... if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) goto drop; ......} |
调用链:udp_rcv->__udp4_lib_rcv->udp_unicast_rcv_skb/__udp4_lib_mcast_deliver->udp_queue_rcv_skb->udp_queue_rcv_one_skb->sk_filter_trim_cap
其他协议
还有一些内核函数也调用sk_filter相关函数,但是属于通用sock处理逻辑(__sk_receive_skb),一些其他协议使用,比如DCCP、pppoe、l2tp等,这里就不加以分析。
SOCKET析构逻辑
跟踪点是__sk_destruct(net/core/sock.c),其中会检查sk->sk_filter是否还存在,若存在则调用sk_filter_uncharge释放该过滤器,并将sk->sk_filter置为NULL。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
/* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners. */static void __sk_destruct(struct rcu_head *head){ struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } ......#ifdef CONFIG_BPF_SYSCALL bpf_sk_storage_free(sk);#endif ...... sk_prot_free(sk->sk_prot_creator, sk);} |
跟进到sk_destruct
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
void sk_destruct(struct sock *sk){ bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; } if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu);} |
跟进__sk_free
|
1
2
3
4
5
6
7
8
9
10
|
static void __sk_free(struct sock *sk){ if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk);} |
跟进至sk_free
|
1
2
3
4
5
6
7
8
9
10
11
|
void sk_free(struct sock *sk){ /* * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. * If not null, sock_wfree() will call __sk_free(sk) later */ if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk);}EXPORT_SYMBOL(sk_free); |
sk_free是内核删除socket对象的函数。内核通过sk_alloc分配socket对象。以下为tipc_sk_create(net/tipc/socket.c)的示例。先通过sk_alloc创建socket对象,如果后续tipc_sk_insert失败,则调用sk_free释放该对象。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
/* Allocate socket's protocol area */// sk_alloc - All socket objects are allocated heresk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern);if (sk == NULL) return -ENOMEM;tsk = tipc_sk(sk);tsk->max_pkt = MAX_PKT_DEFAULT;tsk->maxnagle = 0;tsk->nagle_start = NAGLE_START_INIT;INIT_LIST_HEAD(&tsk->publications);INIT_LIST_HEAD(&tsk->cong_links);msg = &tsk->phdr;/* Finish initializing socket data structures */sock->ops = ops;sock_init_data(sock, sk);tipc_set_sk_state(sk, TIPC_OPEN);if (tipc_sk_insert(tsk)) { sk_free(sk); pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL;} |
处理逻辑调用链:sk_free->__sk_free->sk_destruct->__sk_destruct
SOCKET_PACKET
net/packet/af_packet.c的run_filter会取出sk->sk_filter->prog程序,并通过bpf_prog_run_clear_cb执行该bpf程序
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
static unsigned int run_filter(struct sk_buff *skb, const struct sock *sk, unsigned int res){ struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res;} |
跟进到packet_rcv
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
/* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev){ struct sock *sk; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; bool is_drop_n_account = false; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; ...... res = run_filter(skb, sk, snaplen); ......} |
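跟进packet_create,这是PF_PACKET协议族的create函数,其中创建了packet_sock,并且把packet_rcv指针赋值给po->prot_hook.func,作为该socket的收包处理函数。当系统创建socket时,socket()系统调用会进入__sock_create,根据协议族从net_families数组中取出已注册的net_proto_family并调用其create回调;AF_INET对应的是inet_create,而PF_PACKET对应的就是这里的packet_create。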
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
/* * Create a packet of type SOCK_PACKET. */static int packet_create(struct net *net, struct socket *sock, int protocol, int kern){ struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; ...... po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; __register_prot_hook(sk); } ......} |
packet_sock的结构
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; struct packet_fanout *fanout; union tpacket_stats_u stats; struct packet_ring_buffer rx_ring; struct packet_ring_buffer tx_ring; int copy_thresh; spinlock_t bind_lock; struct mutex pg_vec_lock; unsigned int running; /* bind_lock must be held */ unsigned int auxdata:1, /* writer must hold sock lock */ origdev:1, has_vnet_hdr:1, tp_loss:1, tp_tx_has_off:1; int pressure; int ifindex; /* bound device */ __be16 num; struct packet_rollover *rollover; struct packet_mclist *mclist; atomic_t mapped; enum tpacket_versions tp_version; unsigned int tp_hdrlen; unsigned int tp_reserve; unsigned int tp_tstamp; struct completion skb_completion; struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; atomic_t tp_drops ____cacheline_aligned_in_smp;}; |
总结
BPF_PROG_TYPE_SOCKET_FILTER类型的bpf程序,需要利用setsockopt函数绑定,埋点函数位于net/core/filter.c:sk_filter_trim_cap。
调用链总结
- SOCKET_RAW:ip_local_deliver->ip_local_deliver_finish->ip_protocol_deliver_rcu->raw_local_deliver->raw_v4_input->raw_rcv->raw_rcv_skb->sock_queue_rcv_skb->sk_filter->sk_filter_trim_cap
- SOCKET_STREAM:tcp_v4_rcv->tcp_filter->sk_filter_trim_cap
- SOCKET_DGRAM:udp_rcv->__udp4_lib_rcv->udp_unicast_rcv_skb/__udp4_lib_mcast_deliver->udp_queue_rcv_skb->udp_queue_rcv_one_skb->sk_filter_trim_cap
- SOCKET析构:sk_free->__sk_free->sk_destruct->__sk_destruct
- SOCKET_PACKET:packet_create(注册po->prot_hook.func = packet_rcv);收包时packet_rcv->run_filter->bpf_prog_run_clear_cb
![[eBPF源码分析]Socket_filter类型调用链埋点分析](https://www.83ym.com/wp-content/uploads/foasc/25/12/1766217918/694658becc1c4.jpg)