1。ip分片的结构体组织形式
先记录以下特殊的字段:
(1)sk_buff的cb字段 char cb[48],是一个控制缓冲区,协议栈各层处理报文时可以在其中存放本层的私有数据,内容由各层自行定义。在ip层,这里存放的是struct inet_skb_parm结构体(通过IPCB(skb)宏访问),该结构体包含两个成员:ip选项 struct ip_options opt; 和标识字段flags。
(2)sk_buff数据缓冲区的末尾存储的是分片信息,即紧跟在数据缓冲区之后(skb->end所指位置)的连续内存中,存放着包含分片数据信息的结构体struct skb_shared_info,可通过skb_shinfo(skb)访问。
分片信息存储区分:
快速分片:此时,分片信息存储在skb_shared_info的frag_list链表中,该链表由sk_buff组成,链表上每个sk_buff是一个分片。
慢速分片:此时,数据存储在sk_buff的data字段所指的线性区,以及skb_shared_info的skb_frag_struct frags[MAX_SKB_FRAGS]数组各元素的page字段所指的页中。
ip分片分两种情况,即快速分片和慢速分片:
快速分片是指数据已经在L4层被切分好,这些分片以sk_buff链表的形式存储在结构体skb_shared_info的成员frag_list中。由于分片已经做好,分片函数只需循环为每个分片增加ip头,然后交由底层协议,最后至网卡驱动函数。
慢速分片是指上层发送的数据存储在sk_buff的data成员中(没有预先切分好),需要分片函数对数据进行拆分,为每个分片组织ip头信息,然后交由底层协议,最后至网卡驱动函数。
2。ip分片时机
{ ip_push_pending_frames,ip_queue_xmit } -》dst_output-》{ ip_mc_output,ip_output } -》NF_HOOK_COND(netfilter ) -》ip_finish_output -》ip_fragment -》ip_finish_output2-》邻居子系统的输出函数()-》dev_queue_xmit-》。。。
ip_queue_xmit中会进行路由查找,确定报文的发送网卡接口
3。ip分片函数解释
ip_fragment函数:
/*
 * ip_fragment - split an IP datagram into fragments and send each one.
 *
 * @skb:    the datagram to fragment; its IP header is read via ip_hdr(skb).
 * @output: caller-supplied function that each finished fragment is passed to.
 *
 * Two strategies are used:
 *  - fast path: if skb already carries a frag_list whose members have a
 *    suitable geometry, each member gets an IP header pushed in front of it
 *    and is sent as-is, without copying payload;
 *  - slow path: otherwise a new sk_buff is allocated for every fragment and
 *    header and payload are copied into it.
 *
 * Returns 0 on success, -EMSGSIZE if the header has DF set and
 * skb->local_df is clear, -ENOMEM if a fragment buffer cannot be allocated,
 * or the non-zero value returned by output().
 *
 * On every path visible here skb is either handed to output() or released
 * with kfree_skb().
 */
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
struct iphdr *iph;
int ptr;
struct net_device *dev;
struct sk_buff *skb2;
unsigned int mtu, hlen, left, len, ll_rs;
int offset;
__be16 not_last_frag;
struct rtable *rt = skb_rtable(skb);
int err = 0;
dev = rt->dst.dev;
/*
* Point into the IP datagram header.
*/
// Pointer to the IP header of the datagram.
iph = ip_hdr(skb);
// DF is set in the header and skb->local_df is clear: count a fragmentation
// failure, send an ICMP "destination unreachable / fragmentation needed"
// message carrying the destination MTU, drop the packet and bail out.
if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(ip_skb_dst_mtu(skb)));
kfree_skb(skb);
return -EMSGSIZE;
}
/*
* Setup starting values.
*/
// IP header length in bytes (ihl is expressed in 4-byte units).
hlen = iph->ihl * 4;
// Payload bytes available per fragment: the MTU taken from the route's
// dst entry minus the IP header length.
mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
// For bridged traffic, shrink the usable size further by whatever
// nf_bridge_mtu_reduction() reports.
#ifdef CONFIG_BRIDGE_NETFILTER
if (skb->nf_bridge)
mtu -= nf_bridge_mtu_reduction(skb);
#endif
// Set the IPSKB_FRAG_COMPLETE flag in the skb's IP control block.
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
* some transformers could create wrong frag_list or break existing
* one, it is not prohibited. In this case fall back to copying.
*
* LATER: this step can be merged to real generation of fragments,
* we can switch to copy when see the first bad fragment.
*/
// A non-empty frag_list means the payload is already split into a chain of
// sk_buffs (presumably prepared by the upper layer, e.g. L4 -- not visible
// here). Try to send those buffers directly.
if (skb_has_frag_list(skb)) {
struct sk_buff *frag, *frag2;
int first_len = skb_pagelen(skb); // length used for the first fragment (becomes skb->len and tot_len below), IP header included
// Fall back to the slow path if the first fragment's payload exceeds mtu,
// is not a multiple of 8 bytes, the datagram is itself already a fragment
// (MF or a non-zero offset is set), or the skb is cloned.
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
(iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
skb_cloned(skb))
goto slow_path;
// Check every frag_list member; a single unsuitable one forces the slow
// path: longer than mtu, a non-final member whose length is not a multiple
// of 8, not enough headroom for the IP header, or a shared buffer.
skb_walk_frags(skb, frag) {
/* Correct geometry. */
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
skb_headroom(frag) < hlen)
goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
goto slow_path_clean;
BUG_ON(frag->sk);
// Attribute the fragment to the sending socket (if any) and take its
// truesize out of the head skb; slow_path_clean reverses both steps.
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
}
skb->truesize -= frag->truesize;
}
/* Everything is OK. Generate! */
// Fast path: the head skb becomes the first fragment and the frag_list
// members are sent as the following fragments.
err = 0;
offset = 0;
// Cursor over the remaining fragments, starting at the head of frag_list.
frag = skb_shinfo(skb)->frag_list;
// Reset the head skb's frag_list now that the chain is held in 'frag'.
skb_frag_list_init(skb);
// Shrink the head skb to the first fragment: fix up its lengths, set MF
// with offset 0, and recompute the header checksum.
skb->data_len = first_len - skb_headlen(skb);
skb->len = first_len;
iph->tot_len = htons(first_len);
iph->frag_off = htons(IP_MF);
ip_send_check(iph);
// Each iteration prepares the IP header of the next fragment (if any) and
// then sends the current one through output().
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
if (frag) {
frag->ip_summed = CHECKSUM_NONE;
skb_reset_transport_header(frag);
__skb_push(frag, hlen);
skb_reset_network_header(frag);
memcpy(skb_network_header(frag), iph, hlen);
iph = ip_hdr(frag); // from here on iph is the next fragment's own header
iph->tot_len = htons(frag->len); // total length field = length of this fragment
ip_copy_metadata(frag, skb);
// Only while preparing the first frag_list member (offset is still 0):
// run ip_options_fragment() on it. Later fragments copy their header from
// the previous fragment via the memcpy above.
// NOTE(review): presumably this adjusts the IP options for non-first
// fragments -- confirm against ip_options_fragment().
if (offset == 0)
ip_options_fragment(frag);
offset += skb->len - hlen; // advance by the payload size of the fragment about to be sent (skb)
iph->frag_off = htons(offset>>3); // the offset field is stored in 8-byte units
if (frag->next != NULL) // not the last fragment: set MF
iph->frag_off |= htons(IP_MF);
/* Ready, complete checksum */
ip_send_check(iph); // recompute the header checksum
}
err = output(skb); // hand the current fragment to the caller-supplied output function
if (!err)
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
if (err || !frag)
break;
// Advance: the prepared fragment becomes the current one and is unlinked
// from the chain.
skb = frag;
frag = skb->next;
skb->next = NULL;
}
if (err == 0) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
return 0;
}
// output() failed: free the fragments that were never handed to it.
while (frag) {
skb = frag->next;
kfree_skb(frag);
frag = skb;
}
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
// Reached when a frag_list member failed validation: undo the socket and
// truesize changes made to the members walked before the offending one
// ('frag'), then fall through to the slow path.
slow_path_clean:
skb_walk_frags(skb, frag2) {
if (frag2 == frag)
break;
frag2->sk = NULL;
frag2->destructor = NULL;
skb->truesize += frag2->truesize;
}
}
// Slow path: allocate a new sk_buff per fragment and copy the data into it.
slow_path:
// Payload bytes still to be sent (datagram length minus the IP header).
left = skb->len - hlen; /* Space per frame */
// Byte offset into skb of the next chunk to copy; advances by the amount
// sent on each iteration.
ptr = hlen; /* Where to start from */
/* for bridged IP traffic encapsulated inside f.e. a vlan header,
* we need to make room for the encapsulating header
*/
ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
/*
* Fragment the datagram.
*/
// Starting fragment offset (converted to bytes) and the MF bit of the
// incoming datagram, which matter when it is itself a fragment.
offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
not_last_frag = iph->frag_off & htons(IP_MF);
/*
* Keep copying data until we run out.
*/
while (left > 0) {
len = left;
/* IF: it doesn't fit, use 'mtu' - the data space left */
if (len > mtu) // cap the fragment payload at mtu
len = mtu;
/* IF: we are not sending upto and including the packet end
then align the next start on an eight byte boundary */
// Non-final fragments carry a payload rounded down to a multiple of 8.
if (len < left) {
len &= ~7;
}
/*
* Allocate buffer.
*/
// Allocate an sk_buff for this fragment: payload + IP header + the
// link-layer reserve computed above.
if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
err = -ENOMEM;
goto fail;
}
/*
* Set up data on packet
*/
// Copy the metadata from skb, reserve the link-layer headroom, make room
// for header + payload, and set the network/transport header offsets.
ip_copy_metadata(skb2, skb);
skb_reserve(skb2, ll_rs);
skb_put(skb2, len + hlen);
skb_reset_network_header(skb2);
skb2->transport_header = skb2->network_header + hlen;
/*
* Charge the memory for the fragment to any owner
* it might possess
*/
// Associate skb2 with the socket that owns skb, if there is one.
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
/*
* Copy the packet header into the new buffer.
*/
// Copy hlen bytes of header from skb into skb2's network header.
skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
/*
* Copy a block of the IP datagram.
*/
// Copy len payload bytes, starting at offset ptr in skb, to just after
// skb2's IP header.
if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
BUG();
// That much less payload remains.
left -= len;
/*
* Fill in the new header fields.
*/
// Store the fragment offset (in 8-byte units) in skb2's header.
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
* on the initial skb, so that all the following fragments
* will inherit fixed options.
*/
// Done once, after the first fragment's header has been copied: fix the
// options in the original skb so that every later fragment, whose header
// is copied from skb, inherits them.
if (offset == 0)
ip_options_fragment(skb);
/*
* Added AC : If we are fragmenting a fragment that's not the
* last fragment then keep MF on each bit
*/
// Set MF if payload is still left to send, or if the incoming datagram
// was itself a non-final fragment.
if (left > 0 || not_last_frag)
iph->frag_off |= htons(IP_MF);
ptr += len; // move the source position forward
offset += len; // advance the fragment offset by the bytes just taken
/*
* Put this fragment into the sending queue.
*/
// Total length of this fragment: payload plus IP header.
iph->tot_len = htons(len + hlen);
// Recompute the header checksum.
ip_send_check(iph);
// Hand the fragment to the caller-supplied output function (presumably it
// continues down through the neighbour layer to the device driver; not
// visible here).
err = output(skb2);
if (err)
goto fail;
// Count one more fragment created.
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
}
// All fragments sent: release the original datagram and count a success.
kfree_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
return err;
// Allocation or output() failure: release the original datagram and count
// a fragmentation failure.
fail:
kfree_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
}
EXPORT_SYMBOL(ip_fragment);
来源:CSDN
作者:种菜的
链接:https://blog.csdn.net/ljq32/article/details/103643683