A Brief Analysis of the TCP/IP Stack Implementation (Part 2): Packet Reception Functions and Flow
Table of Contents
- Recap of the Previous Part
- Receiving Packets
- Function Prototypes
- Device Driver Layer Flow
- The ixgb NIC
- The tun/tap NIC
- Link Layer Processing
- IP Layer Flow
- TCP Layer Processing
- Socket Layer Flow
- VFS Layer Flow
- Resource Cleanup
- How tcpdump Captures Packets
- References
Recap of the Previous Part
The previous part traced packet transmission in the order a packet passes through the stack: the socket layer, the transport layer, the network layer, the routing subsystem, the neighbour subsystem, the device subsystem, and finally the network device driver. This part walks through the receive path, which runs in the opposite direction: from the NIC up to user space.
Receiving Packets
Function Prototypes
The receive calls mirror the send family almost exactly:
ssize_t recv(int sockfd, void *buf, size_t len, int flags);
ssize_t recvfrom(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
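As a quick reminder of how these look from user space, here is a minimal sketch (not from the article; the UDP socket, port number and buffer size are arbitrary assumptions) that receives one datagram with recvfrom:

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[2048];
    struct sockaddr_in peer;
    socklen_t plen = sizeof(peer);

    int fd = socket(AF_INET, SOCK_DGRAM, 0);          /* assumed UDP socket for brevity */
    struct sockaddr_in addr = { .sin_family = AF_INET,
                                .sin_port   = htons(9000),          /* hypothetical port */
                                .sin_addr.s_addr = htonl(INADDR_ANY) };
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));

    /* recvfrom() also reports the sender address; recv() would drop it */
    ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
                         (struct sockaddr *)&peer, &plen);
    printf("got %zd bytes\n", n);
    close(fd);
    return 0;
}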
Device Driver Layer Flow
When the NIC receives a packet it has to notify the operating system. It can raise an interrupt to tell the CPU, but under heavy traffic a constant stream of hard interrupts would seriously disrupt normal execution.
Besides hard interrupts, the other obvious option is polling: check for packets whenever there is spare time. But if no packets arrive for a long stretch, polling just wastes CPU cycles.
NAPI exists for exactly this reason: it combines interrupts and polling to handle packet reception.
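Before diving into a real driver, here is a condensed sketch of that pattern (a hypothetical demo driver, not real kernel code; the demo_* names and struct are assumptions): the interrupt handler only masks the device's interrupts and schedules NAPI, and the poll callback does the real work later in softirq context:

#include <linux/netdevice.h>
#include <linux/interrupt.h>

/* hypothetical per-device state, just enough to show the pattern */
struct demo_adapter {
    struct napi_struct napi;
    /* ... hardware registers, rings, ... */
};

static void demo_mask_rx_irq(struct demo_adapter *ad)   { /* device specific */ }
static void demo_unmask_rx_irq(struct demo_adapter *ad) { /* device specific */ }
static int  demo_clean_rx_ring(struct demo_adapter *ad, int budget) { return 0; /* device specific */ }

static irqreturn_t demo_isr(int irq, void *data)
{
    struct demo_adapter *ad = data;

    if (napi_schedule_prep(&ad->napi)) {
        demo_mask_rx_irq(ad);            /* stop further RX interrupts */
        __napi_schedule(&ad->napi);      /* defer the real work to NET_RX_SOFTIRQ */
    }
    return IRQ_HANDLED;
}

static int demo_poll(struct napi_struct *napi, int budget)
{
    struct demo_adapter *ad = container_of(napi, struct demo_adapter, napi);
    int done = demo_clean_rx_ring(ad, budget);   /* pull at most budget packets */

    if (done < budget) {                 /* ring drained: go back to interrupt mode */
        napi_complete_done(napi, done);
        demo_unmask_rx_irq(ad);
    }
    return done;
}

/* at probe time: netif_napi_add(netdev, &ad->napi, demo_poll, NAPI_POLL_WEIGHT); */

The real ixgb code below follows this exact shape.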
Next we use the ixgb NIC and the tun/tap device as examples of the device driver layer.
The ixgb NIC
Let's first look at the interrupt handling that runs after the NIC receives a packet. The previous part mentioned that cleaning up resources after transmission completes is also driven from this hard-interrupt path; that is the flow referred to there.
When the interface is brought up, ndo_open is invoked, which for this driver is ixgb_open:
static const struct net_device_ops ixgb_netdev_ops = {
    .ndo_open = ixgb_open,
    //...
};

/**
 * ixgb_open - Called when a network interface is made active
 * @netdev: network interface device structure
 *
 * Returns 0 on success, negative value on failure
 *
 * The open entry point is called when a network interface is made
 * active by the system (IFF_UP). At this point all resources needed
 * for transmit and receive operations are allocated, the interrupt
 * handler is registered with the OS, the watchdog timer is started,
 * and the stack is notified that the interface is ready.
 **/
static int
ixgb_open(struct net_device *netdev)
{
    struct ixgb_adapter *adapter = netdev_priv(netdev);
    int err;

    /* allocate transmit descriptors */
    // allocate the TX descriptor ring (coherent DMA mapping)
    err = ixgb_setup_tx_resources(adapter);
    if (err)
        goto err_setup_tx;

    netif_carrier_off(netdev);

    /* allocate receive descriptors */
    // allocate the RX descriptor ring (coherent DMA mapping)
    err = ixgb_setup_rx_resources(adapter);
    if (err)
        goto err_setup_rx;

    // bring the device up
    err = ixgb_up(adapter);
    if (err)
        goto err_up;

    netif_start_queue(netdev);
    //...
}

int
ixgb_up(struct ixgb_adapter *adapter)
{
    //...
    // register the hardware interrupt handler
    err = request_irq(adapter->pdev->irq, ixgb_intr, irq_flags,
                      netdev->name, netdev);
    //...
    napi_enable(&adapter->napi);
    ixgb_irq_enable(adapter);
    netif_wake_queue(netdev);
    mod_timer(&adapter->watchdog_timer, jiffies);
    return 0;
}

/**
 * ixgb_intr - Interrupt Handler
 * @irq: interrupt number
 * @data: pointer to a network interface device structure
 **/
static irqreturn_t
ixgb_intr(int irq, void *data)
{
    struct net_device *netdev = data;
    struct ixgb_adapter *adapter = netdev_priv(netdev);
    struct ixgb_hw *hw = &adapter->hw;
    //...
    if (napi_schedule_prep(&adapter->napi)) {
        /* Disable interrupts and register for poll. The flush
           of the posted write is intentionally left out. */
        IXGB_WRITE_REG(&adapter->hw, IMC, ~0);
        __napi_schedule(&adapter->napi);
    }
    return IRQ_HANDLED;
}
When a packet arrives, the hardware interrupt fires, ixgb_intr runs, and it ends up calling __napi_schedule:
/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;

    local_irq_save(flags);
    // softnet_data is a per-CPU variable; the previous part's net_tx_action also
    // used it -- its output_queue defers transmission when the NIC is busy
    ____napi_schedule(this_cpu_ptr(&softnet_data), n);
    local_irq_restore(flags);
}

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
While __napi_schedule runs, interrupts are temporarily disabled and the real work is deferred: the current device's napi_struct is placed on the poll_list of struct softnet_data, and the deferred (softirq) half later works through the devices on that poll_list.
The handler registered for the NET_RX_SOFTIRQ softirq is net_rx_action:
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    //...
    int budget = netdev_budget;
    //...
    for (;;) {
        struct napi_struct *n;
        //...
        n = list_first_entry(&list, struct napi_struct, poll_list);
        // poll this device
        budget -= napi_poll(n, &repoll);

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 ||
                     time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }
    //...
}

static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    void *have;
    int work, weight;

    list_del_init(&n->poll_list);

    have = netpoll_poll_lock(n);

    weight = n->weight;

    /* This NAPI_STATE_SCHED test is for avoiding a race
     * with netpoll's poll_napi(). Only the entity which
     * obtains the lock and sees NAPI_STATE_SCHED set will
     * actually make the ->poll() call. Therefore we avoid
     * accidentally calling ->poll() when NAPI is not scheduled.
     */
    work = 0;
    if (test_bit(NAPI_STATE_SCHED, &n->state)) {
        work = n->poll(n, weight);
        trace_napi_poll(n, work, weight);
    }
    //...
}
The previous part already showed where ixgb_probe registers the NAPI poll function:
netif_napi_add(netdev, &adapter->napi, ixgb_clean, 64);

void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
{
    //...
    napi->poll = poll;
    //...
}
So for the ixgb NIC, n->poll is ixgb_clean.
In other words, net_rx_action loops over poll_list, taking out the devices on which packets have arrived and calling napi_poll on each; napi_poll in turn invokes the poll function registered when the device was initialized, which for the ixgb driver is ixgb_clean:
/**
 * ixgb_clean - NAPI Rx polling callback
 * @napi: napi struct pointer
 * @budget: max number of receives to clean
 **/
static int
ixgb_clean(struct napi_struct *napi, int budget)
{
    struct ixgb_adapter *adapter = container_of(napi, struct ixgb_adapter, napi);
    int work_done = 0;

    // reclaim TX resources for descriptors whose transmission has completed
    ixgb_clean_tx_irq(adapter);
    // process the RX ring
    ixgb_clean_rx_irq(adapter, &work_done, budget);

    /* If budget not fully consumed, exit the polling mode */
    if (work_done < budget) {
        napi_complete_done(napi, work_done);
        if (!test_bit(__IXGB_DOWN, &adapter->flags))
            ixgb_irq_enable(adapter);
    }

    return work_done;
}
ixgb_clean appeared in the previous part as well and is copied here unchanged. That part explained how ixgb_clean_tx_irq releases ring entries whose transmission has completed; now let's look at the receive side, ixgb_clean_rx_irq:
/**
 * ixgb_clean_rx_irq - Send received data up the network stack,
 * @adapter: board private structure
 * @work_done: output pointer to amount of packets cleaned
 * @work_to_do: how much work we can complete
 **/
static bool
ixgb_clean_rx_irq(struct ixgb_adapter *adapter, int *work_done, int work_to_do)
{
    struct ixgb_desc_ring *rx_ring = &adapter->rx_ring;
    struct net_device *netdev = adapter->netdev;
    struct pci_dev *pdev = adapter->pdev;
    struct ixgb_rx_desc *rx_desc, *next_rxd;
    struct ixgb_buffer *buffer_info, *next_buffer, *next2_buffer;
    u32 length;
    unsigned int i, j;
    int cleaned_count = 0;
    bool cleaned = false;

    i = rx_ring->next_to_clean;
    rx_desc = IXGB_RX_DESC(*rx_ring, i);
    buffer_info = &rx_ring->buffer_info[i];

    // the DD (descriptor done) bit tells us whether this descriptor actually holds received data
    while (rx_desc->status & IXGB_RX_DESC_STATUS_DD) {
        struct sk_buff *skb;
        u8 status;

        // budget exhausted
        if (*work_done >= work_to_do)
            break;

        (*work_done)++;
        rmb();  /* read descriptor and rx_buffer_info after status DD */
        status = rx_desc->status;
        skb = buffer_info->skb;
        buffer_info->skb = NULL;

        // prefetch the packet data into the cache
        prefetch(skb->data - NET_IP_ALIGN);

        // wrap around at the end of the ring buffer
        if (++i == rx_ring->count)
            i = 0;
        next_rxd = IXGB_RX_DESC(*rx_ring, i);
        // prefetch the next descriptor into the cache
        prefetch(next_rxd);

        j = i + 1;
        if (j == rx_ring->count)
            j = 0;
        // prefetch the buffer after the next one as well
        next2_buffer = &rx_ring->buffer_info[j];
        prefetch(next2_buffer);

        // grab the next buffer
        next_buffer = &rx_ring->buffer_info[i];

        cleaned = true;
        cleaned_count++;

        dma_unmap_single(&pdev->dev,
                         buffer_info->dma,
                         buffer_info->length,
                         DMA_FROM_DEVICE);
        buffer_info->dma = 0;

        length = le16_to_cpu(rx_desc->length);
        rx_desc->length = 0;
        //...
        // copy the data into an skb
        ixgb_check_copybreak(&adapter->napi, buffer_info, length, &skb);

        // append the data to the skb's tail
        /* Good Receive */
        skb_put(skb, length);
        //...
        // set the layer-2 protocol
        skb->protocol = eth_type_trans(skb, netdev);
        //...
        // hand the packet to the protocol stack
        netif_receive_skb(skb);
        //...
}
The tun/tap NIC
int fd = open("/dev/net/tun", O_RDWR);
//... (attach the fd to a tun interface, then inject a packet:)
write(fd, buf, len);
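For context, here is a minimal user-space sketch of how such a tun fd is typically obtained (a hedged example, not from the article; the helper name open_tun and the interface name are assumptions). The TUNSETIFF ioctl attaches the fd to a tun interface, and every write() on it then enters the kernel as a packet "received" on that interface:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <net/if.h>
#include <linux/if_tun.h>

int open_tun(const char *name)
{
    struct ifreq ifr;
    int fd = open("/dev/net/tun", O_RDWR);
    if (fd < 0)
        return -1;

    memset(&ifr, 0, sizeof(ifr));
    ifr.ifr_flags = IFF_TUN | IFF_NO_PI;          /* raw IP packets, no packet-info header */
    strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
    if (ioctl(fd, TUNSETIFF, &ifr) < 0) {         /* attach this fd to interface "name" */
        close(fd);
        return -1;
    }
    return fd;
}

/* Usage sketch: fd = open_tun("tun0"); write(fd, ip_packet, packet_len);
 * The write() lands in tun_chr_write_iter() -> tun_get_user(), shown below. */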
As we saw in the previous part, write eventually calls the file's write_iter; for the tun device that is tun_chr_write_iter:
static const struct file_operations tun_fops = {
    .owner      = THIS_MODULE,
    .llseek     = no_llseek,
    .read_iter  = tun_chr_read_iter,
    .write_iter = tun_chr_write_iter,
    //...
};

static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    //...
    result = tun_get_user(tun, tfile, NULL, from, noblock, false);

    tun_put(tun);
    return result;
}
So the kernel pulls the packet out of the user-space buffer in tun_get_user.
/* Get packet from user space buffer */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                            void *msg_control, struct iov_iter *from,
                            int noblock, bool more)
{
    //...
    // if a packet-info header is present, read it first
    if (!(tun->flags & IFF_NO_PI)) {
        if (len < sizeof(pi))
            return -EINVAL;
        len -= sizeof(pi);

        if (!copy_from_iter_full(&pi, sizeof(pi), from))
            return -EFAULT;
    }
    //...
    good_linear = SKB_MAX_HEAD(align);
    //...
    // build the skb
    if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
        /* For the packet that is not easy to be processed
         * (e.g gso or jumbo packet), we will do it at after
         * skb was created with generic XDP routine.
         */
        skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
        //...
    } else {
        //...
        skb = tun_alloc_skb(tfile, align, copylen, linear,
                            noblock);
    }
    //...
    if (zerocopy)
        err = zerocopy_sg_from_iter(skb, from);
    else
        err = skb_copy_datagram_from_iter(skb, 0, from, len);
    //...
    skb_reset_network_header(skb);
    skb_probe_transport_header(skb);
    skb_record_rx_queue(skb, tfile->queue_index);
    //...
    // hand the packet to the receive path
    if (frags) {
        // skipped
    } else if (tfile->napi_enabled) {
        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
        int queue_len;

        spin_lock_bh(&queue->lock);
        __skb_queue_tail(queue, skb);
        queue_len = skb_queue_len(queue);
        spin_unlock(&queue->lock);

        if (!more || queue_len > NAPI_POLL_WEIGHT)
            napi_schedule(&tfile->napi);

        local_bh_enable();
    } else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
        tun_rx_batched(tun, tfile, skb, more);
    } else {
        netif_rx_ni(skb);
    }
    //...
}
As you can see, the tun device can hand packets to the stack in several ways: napi_schedule, tun_rx_batched, or netif_rx_ni.
napi_schedule eventually calls ____napi_schedule to raise the NET_RX_SOFTIRQ softirq, so net_rx_action runs afterwards, exactly as above:
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
netif_rx_ni disables preemption and enqueues the skb onto the per-CPU backlog queue, which is later drained by the backlog NAPI poll routine (process_backlog).
Link Layer Processing
From here the call chain is netif_receive_skb -> netif_receive_skb_internal -> __netif_receive_skb -> __netif_receive_skb_one_core -> __netif_receive_skb_core; the intermediate wrappers don't do much worth examining.
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
    struct net_device *orig_dev = skb->dev;
    struct packet_type *pt_prev = NULL;
    int ret;

    ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
    if (pt_prev)
        ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
                                 skb->dev, pt_prev, orig_dev);
    return ret;
}
This first calls __netif_receive_skb_core and then invokes the func of the pt_prev that it returns.
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
                                    struct packet_type **ppt_prev)
{
    //...
another_round:
    skb->skb_iif = skb->dev->ifindex;

    __this_cpu_inc(softnet_data.processed);
    //...
    // strip the VLAN tag if present
    if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
        skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
        skb = skb_vlan_untag(skb);
        if (unlikely(!skb))
            goto out;
    }
    //...
    // deliver to the ptype_all taps (this is where tcpdump hooks in)
    list_for_each_entry_rcu(ptype, &ptype_all, list) {
        if (pt_prev)
            ret = deliver_skb(skb, pt_prev, orig_dev);
        pt_prev = ptype;
    }

    list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
        if (pt_prev)
            ret = deliver_skb(skb, pt_prev, orig_dev);
        pt_prev = ptype;
    }
    //...
    skb_reset_redirect(skb);

skip_classify:
    if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
        goto drop;

    if (skb_vlan_tag_present(skb)) {
        if (pt_prev) {
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }
        if (vlan_do_receive(&skb))
            goto another_round;
        else if (unlikely(!skb))
            goto out;
    }
    //...
    type = skb->protocol;

    /* deliver only exact match when indicated */
    if (likely(!deliver_exact)) {
        // deliver to the upper-layer protocol handlers
        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                               &ptype_base[ntohs(type) &
                                           PTYPE_HASH_MASK]);
    }

    deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                           &orig_dev->ptype_specific);
    //...
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
                                          struct packet_type **pt,
                                          struct net_device *orig_dev,
                                          __be16 type,
                                          struct list_head *ptype_list)
{
    struct packet_type *ptype, *pt_prev = *pt;

    list_for_each_entry_rcu(ptype, ptype_list, list) {
        if (ptype->type != type)
            continue;
        if (pt_prev)
            deliver_skb(skb, pt_prev, orig_dev);
        pt_prev = ptype;
    }
    *pt = pt_prev;
}

static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
    if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
        return -ENOMEM;
    refcount_inc(&skb->users);
    return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
pt_prev->func is simply the receive handler each protocol registered in ptype_base. For IPv4 that registration is dev_add_pack(&ip_packet_type) (the registration path itself is omitted here):
static struct packet_type ip_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),
    .func = ip_rcv,
    .list_func = ip_list_rcv,
};
So the function invoked here is ip_rcv.
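As an aside, any module can hook into this same dispatch table. The following is a hypothetical sketch (the demo_* names and the experimental EtherType 0x88B5 are assumptions, not from the article) of registering a packet_type with dev_add_pack in the same way ip_packet_type is registered:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int demo_rcv(struct sk_buff *skb, struct net_device *dev,
                    struct packet_type *pt, struct net_device *orig_dev)
{
    pr_info("demo: %u bytes on %s\n", skb->len, dev->name);
    kfree_skb(skb);                 /* consume the reference deliver_skb() took */
    return NET_RX_SUCCESS;
}

static struct packet_type demo_ptype __read_mostly = {
    .type = cpu_to_be16(0x88B5),    /* experimental EtherType; ETH_P_ALL would land in ptype_all (the tcpdump path) instead */
    .func = demo_rcv,
};

static int __init demo_init(void)
{
    dev_add_pack(&demo_ptype);      /* adds the handler to ptype_base */
    return 0;
}

static void __exit demo_exit(void)
{
    dev_remove_pack(&demo_ptype);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");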
IP Layer Flow
/*
 * IP receive entry point
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev)
{
    struct net *net = dev_net(dev);

    skb = ip_rcv_core(skb, net);
    if (skb == NULL)
        return NET_RX_DROP;

    return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
                   net, NULL, skb, dev, NULL,
                   ip_rcv_finish);
}
ip_rcv_core performs the sanity checks; the packet then passes through the NF_INET_PRE_ROUTING netfilter hook before ip_rcv_finish is called.
/*
 * Main IP Receive routine.
 */
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
    const struct iphdr *iph;
    u32 len;

    /* When the interface is in promisc. mode, drop all the crap
     * that it receives, do not try to analyse it.
     */
    // drop packets addressed to another host (seen only because the NIC is in promiscuous mode)
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;
    //...
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;

    iph = ip_hdr(skb);

    // a series of checks follow, e.g. header length and checksum validation
    iph = ip_hdr(skb);
    skb->transport_header = skb->network_header + iph->ihl*4;
    //...
    return skb;
    //...
}
Once these checks pass, ip_rcv_finish is called:
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    struct net_device *dev = skb->dev;
    int ret;

    /* if ingress device is enslaved to an L3 master device pass the
     * skb to its handler for processing
     */
    skb = l3mdev_ip_rcv(skb);
    if (!skb)
        return NET_RX_SUCCESS;

    ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
    if (ret != NET_RX_DROP)
        ret = dst_input(skb);
    return ret;
}

static int ip_rcv_finish_core(struct net *net, struct sock *sk,
                              struct sk_buff *skb, struct net_device *dev,
                              const struct sk_buff *hint)
{
    //...
    /*
     * Initialise the virtual path cache for the packet. It describes
     * how the packet travels inside Linux networking.
     */
    if (!skb_valid_dst(skb)) {
        err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                   iph->tos, dev);
        if (unlikely(err))
            goto drop_error;
    }
    //...
    rt = skb_rtable(skb);
    if (rt->rt_type == RTN_MULTICAST) {
        __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
    } else if (rt->rt_type == RTN_BROADCAST) {
        __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
    } else if (skb->pkt_type == PACKET_BROADCAST ||
               skb->pkt_type == PACKET_MULTICAST) {
        struct in_device *in_dev = __in_dev_get_rcu(dev);

        /* RFC 1122 3.3.6:
         *
         *   When a host sends a datagram to a link-layer broadcast
         *   address, the IP destination address MUST be a legal IP
         *   broadcast or IP multicast address.
         *
         *   A host SHOULD silently discard a datagram that is received
         *   via a link-layer broadcast (see Section 2.4) but does not
         *   specify an IP multicast or broadcast destination address.
         *
         * This doesn't explicitly say L2 *broadcast*, but broadcast is
         * in a way a form of multicast and the most common use case for
         * this is 802.11 protecting against cross-station spoofing (the
         * so-called "hole-196" attack) so do it for both.
         */
        if (in_dev &&
            IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
            goto drop;
    }

    return NET_RX_SUCCESS;

drop:
    kfree_skb(skb);
    return NET_RX_DROP;

drop_error:
    if (err == -EXDEV)
        __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
    goto drop;
}

/* Input packet from network to transport. */
static inline int dst_input(struct sk_buff *skb)
{
    return skb_dst(skb)->input(skb);
}
For packets destined to the local host, the route lookup (ip_route_input_noref, via rt_dst_alloc) sets dst->input to ip_local_deliver, so dst_input ends up calling:
/*
 * Deliver IP Packets to the higher protocol layers.
 */
int ip_local_deliver(struct sk_buff *skb)
{
    /*
     * Reassemble IP fragments.
     */
    struct net *net = dev_net(skb->dev);

    // if the packet is a fragment, reassemble it first
    if (ip_is_fragment(ip_hdr(skb))) {
        if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
            return 0;
    }

    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
                   net, NULL, skb, skb->dev, NULL,
                   ip_local_deliver_finish);
}

static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    __skb_pull(skb, skb_network_header_len(skb));

    rcu_read_lock();
    ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
    rcu_read_unlock();

    return 0;
}

void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
    const struct net_protocol *ipprot;
    int raw, ret;

resubmit:
    // deliver a copy to any matching raw sockets
    raw = raw_local_deliver(skb, protocol);

    // hand the packet to the upper-layer protocol, e.g. TCP or ICMP
    ipprot = rcu_dereference(inet_protos[protocol]);
    if (ipprot) {
        //...
        ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
                              skb);
        //...
    } else {
        if (!raw) {
            if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                //...
                // send an ICMP "protocol unreachable" back to the sender
                icmp_send(skb, ICMP_DEST_UNREACH,
                          ICMP_PROT_UNREACH, 0);
            }
            kfree_skb(skb);
        } else {
            __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
            consume_skb(skb);
        }
    }
}
Entries in inet_protos are registered through inet_add_protocol; for TCP:
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)

/* thinking of making this const? Don't.
 * early_demux can change based on sysctl.
 */
static struct net_protocol tcp_protocol = {
    .early_demux         = tcp_v4_early_demux,
    .early_demux_handler = tcp_v4_early_demux,
    .handler             = tcp_v4_rcv,
    .err_handler         = tcp_v4_err,
    .no_policy           = 1,
    .netns_ok            = 1,
    .icmp_strict_tag_validation = 1,
};
So the next step is tcp_v4_rcv.
TCP Layer Processing
/*
 * From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
    //...
    th = (const struct tcphdr *)skb->data;
    iph = ip_hdr(skb);
lookup:
    // look up the sock this segment belongs to
    sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
                           th->dest, sdif, &refcounted);
    if (!sk)
        goto no_tcp_socket;

process:
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;

    // covered in the connection-establishment part: this is the path for the
    // request sock from which the newly accepted socket is created
    if (sk->sk_state == TCP_NEW_SYN_RECV) {
        //...
    }
    //...
    th = (const struct tcphdr *)skb->data;
    iph = ip_hdr(skb);
    tcp_v4_fill_cb(skb, iph, th);

    skb->dev = NULL;

    // also covered earlier: a listening socket receiving the client's SYN
    // and replying with SYN+ACK
    if (sk->sk_state == TCP_LISTEN) {
        ret = tcp_v4_do_rcv(sk, skb);
        goto put_and_return;
    }
    //...
    // no user-space task currently owns the socket
    if (!sock_owned_by_user(sk)) {
        skb_to_free = sk->sk_rx_skb_cache;
        sk->sk_rx_skb_cache = NULL;
        ret = tcp_v4_do_rcv(sk, skb);
    } else {
        // a user is receiving: queue the segment to the backlog
        if (tcp_add_backlog(sk, skb))
            goto discard_and_relse;
        skb_to_free = NULL;
    }
    bh_unlock_sock(sk);
    if (skb_to_free)
        __kfree_skb(skb_to_free);

put_and_return:
    if (refcounted)
        sock_put(sk);

    return ret;
    //...
}
Receiving a packet involves three queues: the backlog queue, the prequeue, and the sk_receive_queue. (The prequeue was removed in newer kernels, which is why it does not appear in the code shown here; the out_of_order_queue additionally holds out-of-order segments.)
Why three queues? Because three different actors may end up processing incoming data:
- The softirq handler: while tcp_v4_rcv executes we are still inside softirq processing, so the work occupies the softirq.
- The user-space process: when it issues a read system call, it also has to fetch data from a queue.
- The kernel protocol stack itself: even if the user process never calls read, packets that arrive still need somewhere to be queued.
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    // handling for the ESTABLISHED state
    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        struct dst_entry *dst = sk->sk_rx_dst;

        sock_rps_save_rxhash(sk, skb);
        sk_mark_napi_id(sk, skb);
        if (dst) {
            if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                !dst->ops->check(dst, 0)) {
                dst_release(dst);
                sk->sk_rx_dst = NULL;
            }
        }
        // receive the segment
        tcp_rcv_established(sk, skb);
        return 0;
    }
    //...
    // handling for the other states, analysed in the connection-establishment part
    if (tcp_rcv_state_process(sk, skb)) {
        rsk = sk;
        goto reset;
    }
    return 0;
    //...
}

/*
 * TCP receive function for the ESTABLISHED state.
 *
 * It is split into a fast path and a slow path. The fast path is
 * disabled when:
 * - A zero window was announced from us - zero window probing
 *   is only handled properly in the slow path.
 * - Out of order segments arrived.
 * - Urgent data is expected.
 * - There is no buffer space left
 * - Unexpected TCP flags/window values/header lengths are received
 *   (detected by checking the TCP header against pred_flags)
 * - Data is sent in both directions. Fast path only supports pure senders
 *   or pure receivers (this means either the sequence number or the ack
 *   value must stay constant)
 * - Unexpected TCP option.
 *
 * When these conditions are not satisfied it drops into a standard
 * receive procedure patterned after RFC793 to handle all cases.
 * The first three cases are guaranteed by proper pred_flags setting,
 * the rest is checked inline. Fast processing is turned on in
 * tcp_data_queue when everything is OK.
 */
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
    // only the slow path is shown here
slow_path:
    //...
    /* step 7: process the segment text */
    // queue the skb onto the socket
    tcp_data_queue(sk, skb);
    //...
}

static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
    // the incoming segment is exactly the next one we expect (seq == rcv_nxt)
    /* Queue data for delivery to the user.
     * Packets in sequence go to the receive queue.
     * Out of sequence packets to the out_of_order_queue.
     */
    if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
        if (tcp_receive_window(tp) == 0) {
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
            goto out_of_window;
        }

        /* Ok. In sequence. In window. */
queue_and_out:
        if (skb_queue_len(&sk->sk_receive_queue) == 0)
            sk_forced_mem_schedule(sk, skb->truesize);
        else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
            sk->sk_data_ready(sk);
            goto drop;
        }

        // put the skb onto sk_receive_queue
        eaten = tcp_queue_rcv(sk, skb, &fragstolen);
        if (skb->len)
            tcp_event_data_recv(sk, skb);
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
            tcp_fin(sk);

        // now that the missing segment is in sk_receive_queue, move previously
        // out-of-order segments from out_of_order_queue into sk_receive_queue
        if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
            tcp_ofo_queue(sk);

            /* RFC5681. 4.2. SHOULD send immediate ACK, when
             * gap in queue is filled.
             */
            if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
        }
        //...
    }

    // end_seq is not after rcv_nxt: we already have everything this segment
    // carries, so our earlier ACK was probably lost -- force another ACK
    if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
        tcp_rcv_spurious_retrans(sk, skb);
        /* A retransmit, 2nd most common case. Force an immediate ack. */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
        tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

out_of_window:
        tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
        inet_csk_schedule_ack(sk);
drop:
        tcp_drop(sk, skb);
        return;
    }

    // seq is at or beyond rcv_nxt + tcp_receive_window: the sender has overrun
    // our receive window; reply with an ACK that advertises the (zero) window
    // so the peer knows it must stop sending
    /* Out of window. F.e. zero window probe. */
    if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
        goto out_of_window;

    // seq < rcv_nxt but end_seq > rcv_nxt: the head of the segment was already
    // received (its ACK likely got lost and it was retransmitted), while the
    // part from rcv_nxt to end_seq is new and can go into sk_receive_queue
    if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
        /* Partial packet, seq < rcv_next < end_seq */
        tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

        /* If window is closed, drop tail of packet. But after
         * remembering D-SACK for its head made in previous line.
         */
        if (!tcp_receive_window(tp)) {
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
            goto out_of_window;
        }
        goto queue_and_out;
    }

    // the segment lies inside the receive window but is not the one expected
    // next (not rcv_nxt), i.e. it arrived out of order: add it to out_of_order_queue
    tcp_data_queue_ofo(sk, skb);
}

static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
                                      bool *fragstolen)
{
    int eaten;
    struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);

    eaten = (tail &&
             // try to coalesce with the tail skb
             tcp_try_coalesce(sk, tail,
                              skb, fragstolen)) ? 1 : 0;
    // advance tp->rcv_nxt to end_seq: having accepted this segment,
    // update the next expected sequence number
    tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
    if (!eaten) {
        // append to the receive queue
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        skb_set_owner_r(skb, sk);
    }
    return eaten;
}
After the data is queued, sk_data_ready is called to wake up the user-space process so it can read. Where does that process wait in the first place? In tcp_recvmsg:
/*
 * This routine copies from a sock struct into the user buffer.
 *
 * Technical note: in 2.3 we work on _locked_ socket, so that
 * tricks with *seq access order and skb->users are not required.
 * Probably, code can be easily improved even more.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                int flags, int *addr_len)
{
    //...
    do {
        u32 offset;
        //...
        /* Next get a buffer. */
        // take an skb from the receive queue
        last = skb_peek_tail(&sk->sk_receive_queue);
        skb_queue_walk(&sk->sk_receive_queue, skb) {
            last = skb;
            //...
            offset = *seq - TCP_SKB_CB(skb)->seq;
            //...
            // data available in sk_receive_queue
            if (offset < skb->len)
                goto found_ok_skb;
            //...
        }

        /* Well, if we have backlog, try to process it now yet. */
        // backlog queue handling
        if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
            break;
        //...
        // backlog queue handling
        if (copied >= target) {
            /* Do not sleep, just process backlog. */
            release_sock(sk);
            lock_sock(sk);
        } else {
            // nothing to read yet: wait for data
            sk_wait_data(sk, &timeo, last);
        }
        //...
found_ok_skb:
        /* Ok so how much can we use? */
        used = skb->len - offset;
        if (len < used)
            used = len;
        //...
        if (!(flags & MSG_TRUNC)) {
            // copy the data to the user process
            err = skb_copy_datagram_msg(skb, offset, msg, used);
            //...
        }
        //...
}

void release_sock(struct sock *sk)
{
    spin_lock_bh(&sk->sk_lock.slock);

    if (sk->sk_backlog.tail)
        __release_sock(sk);

    /* Warning : release_cb() might need to release sk ownership,
     * ie call sock_release_ownership(sk) before us.
     */
    if (sk->sk_prot->release_cb)
        sk->sk_prot->release_cb(sk);

    sock_release_ownership(sk);
    if (waitqueue_active(&sk->sk_lock.wq))
        wake_up(&sk->sk_lock.wq);
    spin_unlock_bh(&sk->sk_lock.slock);
}

void __release_sock(struct sock *sk)
    __releases(&sk->sk_lock.slock)
    __acquires(&sk->sk_lock.slock)
{
    struct sk_buff *skb, *next;

    while ((skb = sk->sk_backlog.head) != NULL) {
        sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

        spin_unlock_bh(&sk->sk_lock.slock);

        do {
            next = skb->next;
            prefetch(next);
            WARN_ON_ONCE(skb_dst_is_noref(skb));
            skb_mark_not_on_list(skb);
            // receive the queued segment
            sk_backlog_rcv(sk, skb);

            cond_resched();

            skb = next;
        } while (skb != NULL);

        spin_lock_bh(&sk->sk_lock.slock);
    }

    /*
     * Doing the zeroing here guarantee we can not loop forever
     * while a wild producer attempts to flood us.
     */
    sk->sk_backlog.len = 0;
}

static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
    if (sk_memalloc_socks() && skb_pfmemalloc(skb))
        return __sk_backlog_rcv(sk, skb);

    return sk->sk_backlog_rcv(sk, skb);
}
For TCP, sk->sk_backlog_rcv points to tcp_v4_do_rcv.
Moving up a level, let's look at the callers of tcp_recvmsg, i.e. the socket layer.
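From user space, the two situations tcp_recvmsg distinguishes look like this (a hedged illustration; connected_fd is assumed to be an already-established TCP socket): with MSG_DONTWAIT the call returns EAGAIN instead of sleeping in sk_wait_data, while a plain blocking recv sleeps until sk_data_ready wakes it:

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

void demo_recv(int connected_fd)              /* assumed: an ESTABLISHED TCP socket */
{
    char buf[4096];
    ssize_t n;

    n = recv(connected_fd, buf, sizeof(buf), MSG_DONTWAIT);
    if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
        printf("receive queue empty, not sleeping\n");

    n = recv(connected_fd, buf, sizeof(buf), 0);  /* blocks in tcp_recvmsg()/sk_wait_data() */
    printf("woken up, copied %zd bytes\n", n);
}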
Socket Layer Flow
inet_recvmsg is what calls tcp_recvmsg:
/**
 * sock_recvmsg - receive a message from @sock
 * @sock: socket
 * @msg: message to receive
 * @flags: message flags
 *
 * Receives @msg from @sock, passing through LSM. Returns the total number
 * of bytes received, or an error.
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
    int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);

    return err ?: sock_recvmsg_nosec(sock, msg, flags);
}

static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                                     int flags)
{
    return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
                              inet_recvmsg, sock, msg, msg_data_left(msg),
                              flags);
}

int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                 int flags)
{
    struct sock *sk = sock->sk;
    int addr_len = 0;
    int err;

    if (likely(!(flags & MSG_ERRQUEUE)))
        sock_rps_record_flow(sk);

    err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
                          sk, msg, size, flags & MSG_DONTWAIT,
                          flags & ~MSG_DONTWAIT, &addr_len);
    if (err >= 0)
        msg->msg_namelen = addr_len;
    return err;
}
Continuing upward, the VFS layer:
VFS Layer Flow
An earlier chapter analysed the write path; here is the corresponding read path, again starting from the file operations table:
/*
 * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 * in the operation structures but are done directly via the socketcall() multiplexor.
 */
static const struct file_operations socket_file_ops = {
    .owner      = THIS_MODULE,
    .llseek     = no_llseek,
    .read_iter  = sock_read_iter,
    .write_iter = sock_write_iter,
    //...
};

static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
    //...
    res = sock_recvmsg(sock, &msg, msg.msg_flags);
    *to = msg.msg_iter;
    return res;
}
sock_read_iter then calls into the socket-layer interface (sock_recvmsg) described above.
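This is why, from user space, a plain read() on a connected socket behaves the same as recv() with flags of 0: both funnel through sock_read_iter/sock_recvmsg into tcp_recvmsg. A small hedged illustration:

#include <unistd.h>
#include <sys/socket.h>

/* read() on a socket fd takes the VFS path shown above (sock_read_iter ->
 * sock_recvmsg), so it ends up in the same tcp_recvmsg as recv() does. */
ssize_t read_some(int sockfd, void *buf, size_t len)
{
    return read(sockfd, buf, len);        /* equivalent to recv(sockfd, buf, len, 0) */
}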
Resource Cleanup
How tcpdump Captures Packets
References
图解Linux网络包接收过程
用户态 tcpdump 如何实现抓到内核网络包的?