转载: ip_route_output_key函数分析

上面的文章读了net/ipv4/route.c 中的ip_route_input函数,是协议站对收到报文的路由查找函数。继续阅读一下协议栈发包的时候路由查找的调用函数ip_roue_output_key。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
return ip_route_output_flow(net, rp, flp, NULL, 0);
}

//只是一个函数封装,真正的处理函数是ip_route_output_flow.
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
struct sock *sk, int flags)
{
int err;

/*路由查找*/
if ((err = __ip_route_output_key(net, rp, flp)) != 0)
return err;

/*IPSec 的处理代码*/
if (flp->proto) {
if (!flp->fl4_src)
flp->fl4_src = (*rp)->rt_src;
if (!flp->fl4_dst)
flp->fl4_dst = (*rp)->rt_dst;
err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
flags ? XFRM_LOOKUP_WAIT : 0);
if (err == -EREMOTE)
err = ipv4_dst_blackhole(net, rp, flp);

return err;
}

return 0;
}

//很眼熟的__ip_route_output_key函数。
int __ip_route_output_key(struct net *net, struct rtable **rp,
const struct flowi *flp)
{
unsigned hash;
struct rtable *rth;

if (!rt_caching(net))
goto slow_output;

/*类似于ip_route_input,先在cache中查找路由, 找到就返回*/
hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

rcu_read_lock_bh();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.dst.rt_next)) {
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
rth->fl.iif == 0 &&
rth->fl.oif == flp->oif &&
rth->fl.mark == flp->mark &&
!((rth->fl.fl4_tos ^ flp->fl4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->u.dst.dev), net) &&
!rt_is_expired(rth)) {
dst_use(&rth->u.dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
*rp = rth;
return 0;
}
RT_CACHE_STAT_INC(out_hlist_search);
}
rcu_read_unlock_bh();

/*不支持cache 或在cache中没找到相应的路由信息,在路由表中查找*/
slow_output:
return ip_route_output_slow(net, rp, flp);
}

/*
* Major route resolver routine.
*/
static int ip_route_output_slow(struct net *net, struct rtable **rp,
const struct flowi *oldflp)
{
u32 tos = RT_FL_TOS(oldflp); /*获取tos和当前的RTO_ONLINK(?)标志*/
struct flowi fl = { .nl_u = { .ip4_u =
{ .daddr = oldflp->fl4_dst,
.saddr = oldflp->fl4_src,
.tos = tos & IPTOS_RT_MASK,
.scope = ((tos & RTO_ONLINK) ? /*根据这个标志,得出路由的scope*/
RT_SCOPE_LINK :
RT_SCOPE_UNIVERSE),
} },
.mark = oldflp->mark,
.iif = net->loopback_dev->ifindex, /*设备号为lo的设备号?*/
.oif = oldflp->oif };
struct fib_result res;
unsigned flags = 0;
struct net_device *dev_out = NULL;
int free_res = 0;
int err;


res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES


res.r = NULL;
#endif

/*先是对源地址, 发包接口号和目的地址进行判断分类处理。下面的每一个红色跳转就是一种情况*/
if (oldflp->fl4_src) { /*源*/
err = -EINVAL;
if (ipv4_is_multicast(oldflp->fl4_src) ||
ipv4_is_lbcast(oldflp->fl4_src) ||
ipv4_is_zeronet(oldflp->fl4_src))
goto out;

/*上面是对报文源地址的合理性检查,源地址是多播,广播或0地址时,返回错误*/

/* I removed check for oif == dev_out->oif here.
It was wrong for two reasons:
我在这里删去检查oif == dev_out->oif是否成立,因为有两个原因说明这个检查时错误的:
1. ip_dev_find(net, saddr) can return wrong iface, if saddr
is assigned to multiple interfaces
如果源地址是一个多播接口的地址,函数ip_dev_find(net, saddr)可能返回错误的设备接口。
2. Moreover, we are allowed to send packets with saddr
of another iface. --ANK
而且可以用另外设备接口的源地址发送报文
*/

if (oldflp->oif == 0
&& (ipv4_is_multicast(oldflp->fl4_dst) ||
oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /*发包接口为lo,目的地址是广播或多播时查找发包设备,ip_dev_find返回与所给定的源地址相等的第一个设备*/
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = ip_dev_find(net, oldflp->fl4_src);
if (dev_out == NULL)
goto out;

/* Special hack: user can direct multicasts
and limited broadcast via necessary interface
without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
This hack is not just for fun, it allows
vic,vat and friends to work.
They bind socket to loopback, set ttl to zero
and expect that it will work.
From the viewpoint of routing cache they are broken,
because we are not allowed to build multicast path
with loopback source addr (look, routing cache
cannot know, that ttl is zero, so that packet
will not leave this host and route is valid).
Luckily, this hack is good workaround.
*/

/*当报文初始化的出接口为lo接口源地址不为空目的地址是多播或广播地址时,找到源地址所对应的接口重新为出接口赋值, 然后创建cache路由项*/
fl.oif = dev_out->ifindex;
goto make_route;
}

/*?????*/
if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = ip_dev_find(net, oldflp->fl4_src);
if (dev_out == NULL)
goto out;
dev_put(dev_out);
dev_out = NULL;
}
}


if (oldflp->oif) {/*发包设备不为空*/
/*检测出接口是否存在*/
dev_out = dev_get_by_index(net, oldflp->oif);
err = -ENODEV;
if (dev_out == NULL)
goto out;

/* RACE: Check return value of inet_select_addr instead. */
/*看设备是否是多地址*/
if (__in_dev_get_rtnl(dev_out) == NULL) {
dev_put(dev_out);
goto out; /* Wrong error code */
}

/*当目的地址是本地多播地址或广播地址,并且报文源地址为空时,找出出接口设备上IP地址scope小于RT_SCOPE_LINK的地址,并赋值,然后往cache中添加路由表项*/
if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
if (!fl.fl4_src)
fl.fl4_src = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
goto make_route;
}
/*目的地址是单播地址或空,源地址为空,那就选一个小于特定scope的IP地址*/
if (!fl.fl4_src) {
if (ipv4_is_multicast(oldflp->fl4_dst))
fl.fl4_src = inet_select_addr(dev_out, 0,
fl.fl4_scope);
else if (!oldflp->fl4_dst)
fl.fl4_src = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
}

if (!fl.fl4_dst) {/*目的地址为空*/
fl.fl4_dst = fl.fl4_src;
if (!fl.fl4_dst)
fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);/*目的和源地址都是空,则赋值为lo接口地址*/
if (dev_out)
dev_put(dev_out);
dev_out = net->loopback_dev;
dev_hold(dev_out);
fl.oif = net->loopback_dev->ifindex;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
/*为发给本机的报文添加cache路由*/
goto make_route;
}

/*一种情况是源地址目的地址不为空,目的地址为空,出接口为lo*/
/*还有其他几种情况,就是目的地址和出接口必须对应*/
if (fib_lookup(net, &fl, &res)) {
res.fi = NULL;
if (oldflp->oif) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.

WHY? DW.
Because we are allowed to send to iface
even if it has NO routes and NO assigned
addresses. When oif is specified, routing
tables are looked up with only one purpose:
to catch if destination is gatewayed, rather than
direct. Moreover, if MSG_DONTROUTE is set,
we send packet, ignoring both routing tables
and ifaddr state. --ANK


We could make it even if oif is unknown,
likely IPv6, but we do not.
*/

if (fl.fl4_src == 0)
fl.fl4_src = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
res.type = RTN_UNICAST;
/*没有查到路由,并且出接口不为lo*/
goto make_route;
}
if (dev_out)
dev_put(dev_out);
err = -ENETUNREACH;
goto out;
}
/*找到路由*/
free_res = 1;

/*路由指向本地*/
if (res.type == RTN_LOCAL) {
if (!fl.fl4_src)
fl.fl4_src = fl.fl4_dst;
if (dev_out)
dev_put(dev_out);
dev_out = net->loopback_dev;
dev_hold(dev_out);
fl.oif = dev_out->ifindex;
if (res.fi)
fib_info_put(res.fi);
res.fi = NULL;
flags |= RTCF_LOCAL;
goto make_route;
}

/*是否支持多路径路由*/
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi->fib_nhs > 1 && fl.oif == 0)
fib_select_multipath(&fl, &res);
else
#endif
if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
fib_select_default(net, &fl, &res);

if (!fl.fl4_src)
fl.fl4_src = FIB_RES_PREFSRC(res);

if (dev_out)
dev_put(dev_out);
dev_out = FIB_RES_DEV(res);
dev_hold(dev_out);
fl.oif = dev_out->ifindex;

/*往cache中添加相应的路由项*/
make_route:
err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


if (free_res)
fib_res_put(&res);
if (dev_out)
dev_put(dev_out);
out: return err;
}

转载于:https://www.cnblogs.com/listenerln/p/6846335.html


转载: ip_route_output_key函数分析
http://blog.uanet.cn/NETWORK/route/转载: ip_route_output_key函数分析.html
作者
dnsnat
发布于
2022年5月26日
许可协议