A Brief Analysis of the TCP/IP Stack Implementation (Part 2): Packet Reception Functions and Flow
Table of Contents
- Recap of the Previous Part
- Receiving Packets
- Function Prototypes
- Device Driver Layer Flow
- The ixgb NIC
- The tun/tap NIC
- Link Layer Processing
- IP Layer Flow
- TCP Layer Processing
- Socket Layer Flow
- VFS Layer Flow
- Resource Cleanup
- How tcpdump Captures Packets
- References
Recap of the Previous Part
The previous part traced packet transmission in the order a packet passes through the stack: the socket layer, the transport layer, the network layer, the routing subsystem, the neighbour subsystem, the device subsystem, and finally the network device driver. This part walks through the receive path, which runs in the opposite direction: from the NIC up to user space.
Receiving Packets
Function Prototypes
The receive calls mirror the send family almost exactly:
ssize_t recv(int sockfd, void *buf, size_t len, int flags);
ssize_t recvfrom(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
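As a quick reminder of how these look from user space, here is a minimal sketch (not from the article; the UDP socket, port number and buffer size are arbitrary assumptions) that receives one datagram with recvfrom:

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[2048];
    struct sockaddr_in peer;
    socklen_t plen = sizeof(peer);

    int fd = socket(AF_INET, SOCK_DGRAM, 0);          /* assumed UDP socket for brevity */
    struct sockaddr_in addr = { .sin_family = AF_INET,
                                .sin_port   = htons(9000),          /* hypothetical port */
                                .sin_addr.s_addr = htonl(INADDR_ANY) };
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));

    /* recvfrom() also reports the sender address; recv() would drop it */
    ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
                         (struct sockaddr *)&peer, &plen);
    printf("got %zd bytes\n", n);
    close(fd);
    return 0;
}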
Device Driver Layer Flow
When the NIC receives a packet it has to notify the operating system. It can raise an interrupt to tell the CPU, but under heavy traffic a constant stream of hard interrupts would seriously disrupt normal execution.
Besides hard interrupts, the other obvious option is polling: check for packets whenever there is spare time. But if no packets arrive for a long stretch, polling just wastes CPU cycles.
NAPI exists for exactly this reason: it combines interrupts and polling to handle packet reception.
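Before diving into a real driver, here is a condensed sketch of that pattern (a hypothetical demo driver, not real kernel code; the demo_* names and struct are assumptions): the interrupt handler only masks the device's interrupts and schedules NAPI, and the poll callback does the real work later in softirq context:

#include <linux/netdevice.h>
#include <linux/interrupt.h>

/* hypothetical per-device state, just enough to show the pattern */
struct demo_adapter {
    struct napi_struct napi;
    /* ... hardware registers, rings, ... */
};

static void demo_mask_rx_irq(struct demo_adapter *ad)   { /* device specific */ }
static void demo_unmask_rx_irq(struct demo_adapter *ad) { /* device specific */ }
static int  demo_clean_rx_ring(struct demo_adapter *ad, int budget) { return 0; /* device specific */ }

static irqreturn_t demo_isr(int irq, void *data)
{
    struct demo_adapter *ad = data;

    if (napi_schedule_prep(&ad->napi)) {
        demo_mask_rx_irq(ad);            /* stop further RX interrupts */
        __napi_schedule(&ad->napi);      /* defer the real work to NET_RX_SOFTIRQ */
    }
    return IRQ_HANDLED;
}

static int demo_poll(struct napi_struct *napi, int budget)
{
    struct demo_adapter *ad = container_of(napi, struct demo_adapter, napi);
    int done = demo_clean_rx_ring(ad, budget);   /* pull at most budget packets */

    if (done < budget) {                 /* ring drained: go back to interrupt mode */
        napi_complete_done(napi, done);
        demo_unmask_rx_irq(ad);
    }
    return done;
}

/* at probe time: netif_napi_add(netdev, &ad->napi, demo_poll, NAPI_POLL_WEIGHT); */

The real ixgb code below follows this exact shape.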
Next we use the ixgb NIC and the tun/tap device as examples of the device driver layer.
The ixgb NIC
Let's first look at the interrupt handling that runs after the NIC receives a packet. The previous part mentioned that cleaning up resources after transmission completes is also driven from this hard-interrupt path; that is the flow referred to there.
When the interface is brought up, ndo_open is invoked, which for this driver is ixgb_open:
static const struct net_device_ops ixgb_netdev_ops = {
    .ndo_open = ixgb_open,
    //...
};

/**
 * ixgb_open - Called when a network interface is made active
 * @netdev: network interface device structure
 *
 * Returns 0 on success, negative value on failure
 *
 * The open entry point is called when a network interface is made
 * active by the system (IFF_UP). At this point all resources needed
 * for transmit and receive operations are allocated, the interrupt
 * handler is registered with the OS, the watchdog timer is started,
 * and the stack is notified that the interface is ready.
 **/
static int
ixgb_open(struct net_device *netdev)
{
    struct ixgb_adapter *adapter = netdev_priv(netdev);
    int err;

    /* allocate transmit descriptors */
    // allocate the TX descriptor ring (coherent DMA mapping)
    err = ixgb_setup_tx_resources(adapter);
    if (err)
        goto err_setup_tx;

    netif_carrier_off(netdev);

    /* allocate receive descriptors */
    // allocate the RX descriptor ring (coherent DMA mapping)
    err = ixgb_setup_rx_resources(adapter);
    if (err)
        goto err_setup_rx;

    // bring the device up
    err = ixgb_up(adapter);
    if (err)
        goto err_up;

    netif_start_queue(netdev);
    //...
}

int
ixgb_up(struct ixgb_adapter *adapter)
{
    //...
    // register the hardware interrupt handler
    err = request_irq(adapter->pdev->irq, ixgb_intr, irq_flags,
                      netdev->name, netdev);
    //...
    napi_enable(&adapter->napi);
    ixgb_irq_enable(adapter);
    netif_wake_queue(netdev);
    mod_timer(&adapter->watchdog_timer, jiffies);
    return 0;
}

/**
 * ixgb_intr - Interrupt Handler
 * @irq: interrupt number
 * @data: pointer to a network interface device structure
 **/
static irqreturn_t
ixgb_intr(int irq, void *data)
{
    struct net_device *netdev = data;
    struct ixgb_adapter *adapter = netdev_priv(netdev);
    struct ixgb_hw *hw = &adapter->hw;
    //...
    if (napi_schedule_prep(&adapter->napi)) {
        /* Disable interrupts and register for poll. The flush
           of the posted write is intentionally left out. */
        IXGB_WRITE_REG(&adapter->hw, IMC, ~0);
        __napi_schedule(&adapter->napi);
    }
    return IRQ_HANDLED;
}
When a packet arrives, the hardware interrupt fires, ixgb_intr runs, and it ends up calling __napi_schedule:
/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;

    local_irq_save(flags);
    // softnet_data is a per-CPU variable; the previous part's net_tx_action also
    // used it -- its output_queue defers transmission when the NIC is busy
    ____napi_schedule(this_cpu_ptr(&softnet_data), n);
    local_irq_restore(flags);
}

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
While __napi_schedule runs, interrupts are temporarily disabled and the real work is deferred: the current device's napi_struct is placed on the poll_list of struct softnet_data, and the deferred (softirq) half later works through the devices on that poll_list.
The handler registered for the NET_RX_SOFTIRQ softirq is net_rx_action:
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    //...
    int budget = netdev_budget;
    //...
    for (;;) {
        struct napi_struct *n;
        //...
        n = list_first_entry(&list, struct napi_struct, poll_list);
        // poll this device
        budget -= napi_poll(n, &repoll);

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 ||
                     time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }
    //...
}

static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    void *have;
    int work, weight;

    list_del_init(&n->poll_list);

    have = netpoll_poll_lock(n);

    weight = n->weight;

    /* This NAPI_STATE_SCHED test is for avoiding a race
     * with netpoll's poll_napi(). Only the entity which
     * obtains the lock and sees NAPI_STATE_SCHED set will
     * actually make the ->poll() call. Therefore we avoid
     * accidentally calling ->poll() when NAPI is not scheduled.
     */
    work = 0;
    if (test_bit(NAPI_STATE_SCHED, &n->state)) {
        work = n->poll(n, weight);
        trace_napi_poll(n, work, weight);
    }
    //...
}
The previous part already showed where ixgb_probe registers the NAPI poll function:
netif_napi_add(netdev, &adapter->napi, ixgb_clean, 64);

void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
{
    //...
    napi->poll = poll;
    //...
}
So for the ixgb NIC, n->poll is ixgb_clean.
In other words, net_rx_action loops over poll_list, taking out the devices on which packets have arrived and calling napi_poll on each; napi_poll in turn invokes the poll function registered when the device was initialized, which for the ixgb driver is ixgb_clean:
/**
 * ixgb_clean - NAPI Rx polling callback
 * @napi: napi struct pointer
 * @budget: max number of receives to clean
 **/
static int
ixgb_clean(struct napi_struct *napi, int budget)
{
    struct ixgb_adapter *adapter = container_of(napi, struct ixgb_adapter, napi);
    int work_done = 0;

    // reclaim TX resources for descriptors whose transmission has completed
    ixgb_clean_tx_irq(adapter);
    // process the RX ring
    ixgb_clean_rx_irq(adapter, &work_done, budget);

    /* If budget not fully consumed, exit the polling mode */
    if (work_done < budget) {
        napi_complete_done(napi, work_done);
        if (!test_bit(__IXGB_DOWN, &adapter->flags))
            ixgb_irq_enable(adapter);
    }

    return work_done;
}
ixgb_clean appeared in the previous part as well and is copied here unchanged. That part explained how ixgb_clean_tx_irq releases ring entries whose transmission has completed; now let's look at the receive side, ixgb_clean_rx_irq:
/**
 * ixgb_clean_rx_irq - Send received data up the network stack,
 * @adapter: board private structure
 * @work_done: output pointer to amount of packets cleaned
 * @work_to_do: how much work we can complete
 **/
static bool
ixgb_clean_rx_irq(struct ixgb_adapter *adapter, int *work_done, int work_to_do)
{
    struct ixgb_desc_ring *rx_ring = &adapter->rx_ring;
    struct net_device *netdev = adapter->netdev;
    struct pci_dev *pdev = adapter->pdev;
    struct ixgb_rx_desc *rx_desc, *next_rxd;
    struct ixgb_buffer *buffer_info, *next_buffer, *next2_buffer;
    u32 length;
    unsigned int i, j;
    int cleaned_count = 0;
    bool cleaned = false;

    i = rx_ring->next_to_clean;
    rx_desc = IXGB_RX_DESC(*rx_ring, i);
    buffer_info = &rx_ring->buffer_info[i];

    // the DD (descriptor done) bit tells us whether this descriptor actually holds received data
    while (rx_desc->status & IXGB_RX_DESC_STATUS_DD) {
        struct sk_buff *skb;
        u8 status;

        // budget exhausted
        if (*work_done >= work_to_do)
            break;

        (*work_done)++;
        rmb();  /* read descriptor and rx_buffer_info after status DD */
        status = rx_desc->status;
        skb = buffer_info->skb;
        buffer_info->skb = NULL;

        // prefetch the packet data into the cache
        prefetch(skb->data - NET_IP_ALIGN);

        // wrap around at the end of the ring buffer
        if (++i == rx_ring->count)
            i = 0;
        next_rxd = IXGB_RX_DESC(*rx_ring, i);
        // prefetch the next descriptor into the cache
        prefetch(next_rxd);

        j = i + 1;
        if (j == rx_ring->count)
            j = 0;
        // prefetch the buffer after the next one as well
        next2_buffer = &rx_ring->buffer_info[j];
        prefetch(next2_buffer);

        // grab the next buffer
        next_buffer = &rx_ring->buffer_info[i];

        cleaned = true;
        cleaned_count++;

        dma_unmap_single(&pdev->dev,
                         buffer_info->dma,
                         buffer_info->length,
                         DMA_FROM_DEVICE);
        buffer_info->dma = 0;

        length = le16_to_cpu(rx_desc->length);
        rx_desc->length = 0;
        //...
        // copy the data into an skb
        ixgb_check_copybreak(&adapter->napi, buffer_info, length, &skb);

        // append the data to the skb's tail
        /* Good Receive */
        skb_put(skb, length);
        //...
        // set the layer-2 protocol
        skb->protocol = eth_type_trans(skb, netdev);
        //...
        // hand the packet to the protocol stack
        netif_receive_skb(skb);
        //...
}
The tun/tap NIC
int fd = open("/dev/net/tun", O_RDWR);
//... (attach the fd to a tun interface, then inject a packet:)
write(fd, buf, len);
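For context, here is a minimal user-space sketch of how such a tun fd is typically obtained (a hedged example, not from the article; the helper name open_tun and the interface name are assumptions). The TUNSETIFF ioctl attaches the fd to a tun interface, and every write() on it then enters the kernel as a packet "received" on that interface:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <net/if.h>
#include <linux/if_tun.h>

int open_tun(const char *name)
{
    struct ifreq ifr;
    int fd = open("/dev/net/tun", O_RDWR);
    if (fd < 0)
        return -1;

    memset(&ifr, 0, sizeof(ifr));
    ifr.ifr_flags = IFF_TUN | IFF_NO_PI;          /* raw IP packets, no packet-info header */
    strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
    if (ioctl(fd, TUNSETIFF, &ifr) < 0) {         /* attach this fd to interface "name" */
        close(fd);
        return -1;
    }
    return fd;
}

/* Usage sketch: fd = open_tun("tun0"); write(fd, ip_packet, packet_len);
 * The write() lands in tun_chr_write_iter() -> tun_get_user(), shown below. */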
As we saw in the previous part, write eventually calls the file's write_iter; for the tun device that is tun_chr_write_iter:
static const struct file_operations tun_fops = {
    .owner      = THIS_MODULE,
    .llseek     = no_llseek,
    .read_iter  = tun_chr_read_iter,
    .write_iter = tun_chr_write_iter,
    //...
};

static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    //...
    result = tun_get_user(tun, tfile, NULL, from, noblock, false);

    tun_put(tun);
    return result;
}
So the kernel pulls the packet out of the user-space buffer in tun_get_user.
/* Get packet from user space buffer */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                            void *msg_control, struct iov_iter *from,
                            int noblock, bool more)
{
    //...
    // if a packet-info header is present, read it first
    if (!(tun->flags & IFF_NO_PI)) {
        if (len < sizeof(pi))
            return -EINVAL;
        len -= sizeof(pi);

        if (!copy_from_iter_full(&pi, sizeof(pi), from))
            return -EFAULT;
    }
    //...
    good_linear = SKB_MAX_HEAD(align);
    //...
    // build the skb
    if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
        /* For the packet that is not easy to be processed
         * (e.g gso or jumbo packet), we will do it at after
         * skb was created with generic XDP routine.
         */
        skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
        //...
    } else {
        //...
        skb = tun_alloc_skb(tfile, align, copylen, linear,
                            noblock);
    }
    //...
    if (zerocopy)
        err = zerocopy_sg_from_iter(skb, from);
    else
        err = skb_copy_datagram_from_iter(skb, 0, from, len);
    //...
    skb_reset_network_header(skb);
    skb_probe_transport_header(skb);
    skb_record_rx_queue(skb, tfile->queue_index);
    //...
    // hand the packet to the receive path
    if (frags) {
        // skipped
    } else if (tfile->napi_enabled) {
        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
        int queue_len;

        spin_lock_bh(&queue->lock);
        __skb_queue_tail(queue, skb);
        queue_len = skb_queue_len(queue);
        spin_unlock(&queue->lock);

        if (!more || queue_len > NAPI_POLL_WEIGHT)
            napi_schedule(&tfile->napi);

        local_bh_enable();
    } else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
        tun_rx_batched(tun, tfile, skb, more);
    } else {
        netif_rx_ni(skb);
    }
    //...
}
As you can see, the tun device can hand packets to the stack in several ways: napi_schedule, tun_rx_batched, or netif_rx_ni.
napi_schedule eventually calls ____napi_schedule to raise the NET_RX_SOFTIRQ softirq, so net_rx_action runs afterwards, exactly as above:
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
netif_rx_ni disables preemption and enqueues the skb onto the per-CPU backlog queue, which is later drained by the backlog NAPI poll routine (process_backlog).
Link Layer Processing
From here the call chain is netif_receive_skb -> netif_receive_skb_internal -> __netif_receive_skb -> __netif_receive_skb_one_core -> __netif_receive_skb_core; the intermediate wrappers don't do much worth examining.
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
    struct net_device *orig_dev = skb->dev;
    struct packet_type *pt_prev = NULL;
    int ret;

    ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
    if (pt_prev)
        ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
                                 skb->dev, pt_prev, orig_dev);
    return ret;
}
This first calls __netif_receive_skb_core and then invokes the func of the pt_prev that it returns.
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
                                    struct packet_type **ppt_prev)
{
    //...
another_round:
    skb->skb_iif = skb->dev->ifindex;

    __this_cpu_inc(softnet_data.processed);
    //...
    // strip the VLAN tag if present
    if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
        skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
        skb = skb_vlan_untag(skb);
        if (unlikely(!skb))
            goto out;
    }
    //...
    // deliver to the ptype_all taps (this is where tcpdump hooks in)
    list_for_each_entry_rcu(ptype, &ptype_all, list) {
        if (pt_prev)
            ret = deliver_skb(skb, pt_prev, orig_dev);
        pt_prev = ptype;
    }

    list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
        if (pt_prev)
            ret = deliver_skb(skb, pt_prev, orig_dev);
        pt_prev = ptype;
    }
    //...
    skb_reset_redirect(skb);

skip_classify:
    if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
        goto drop;

    if (skb_vlan_tag_present(skb)) {
        if (pt_prev) {
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }
        if (vlan_do_receive(&skb))
            goto another_round;
        else if (unlikely(!skb))
            goto out;
    }
    //...
    type = skb->protocol;

    /* deliver only exact match when indicated */
    if (likely(!deliver_exact)) {
        // deliver to the upper-layer protocol handlers
        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                               &ptype_base[ntohs(type) &
                                           PTYPE_HASH_MASK]);
    }

    deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                           &orig_dev->ptype_specific);
    //...
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
                                          struct packet_type **pt,
                                          struct net_device *orig_dev,
                                          __be16 type,
                                          struct list_head *ptype_list)
{
    struct packet_type *ptype, *pt_prev = *pt;

    list_for_each_entry_rcu(ptype, ptype_list, list) {
        if (ptype->type != type)
            continue;
        if (pt_prev)
            deliver_skb(skb, pt_prev, orig_dev);
        pt_prev = ptype;
    }
    *pt = pt_prev;
}

static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
    if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
        return -ENOMEM;
    refcount_inc(&skb->users);
    return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
pt_prev->func is simply the receive handler each protocol registered in ptype_base. For IPv4 that registration is dev_add_pack(&ip_packet_type) (the registration path itself is omitted here):
static struct packet_type ip_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),
    .func = ip_rcv,
    .list_func = ip_list_rcv,
};
So the function invoked here is ip_rcv.
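As an aside, any module can hook into this same dispatch table. The following is a hypothetical sketch (the demo_* names and the experimental EtherType 0x88B5 are assumptions, not from the article) of registering a packet_type with dev_add_pack in the same way ip_packet_type is registered:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int demo_rcv(struct sk_buff *skb, struct net_device *dev,
                    struct packet_type *pt, struct net_device *orig_dev)
{
    pr_info("demo: %u bytes on %s\n", skb->len, dev->name);
    kfree_skb(skb);                 /* consume the reference deliver_skb() took */
    return NET_RX_SUCCESS;
}

static struct packet_type demo_ptype __read_mostly = {
    .type = cpu_to_be16(0x88B5),    /* experimental EtherType; ETH_P_ALL would land in ptype_all (the tcpdump path) instead */
    .func = demo_rcv,
};

static int __init demo_init(void)
{
    dev_add_pack(&demo_ptype);      /* adds the handler to ptype_base */
    return 0;
}

static void __exit demo_exit(void)
{
    dev_remove_pack(&demo_ptype);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");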
IP Layer Flow
/*
 * IP receive entry point
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev)
{
    struct net *net = dev_net(dev);

    skb = ip_rcv_core(skb, net);
    if (skb == NULL)
        return NET_RX_DROP;

    return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
                   net, NULL, skb, dev, NULL,
                   ip_rcv_finish);
}
ip_rcv_core performs the sanity checks; the packet then passes through the NF_INET_PRE_ROUTING netfilter hook before ip_rcv_finish is called.
/*
 * Main IP Receive routine.
 */
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
    const struct iphdr *iph;
    u32 len;

    /* When the interface is in promisc. mode, drop all the crap
     * that it receives, do not try to analyse it.
     */
    // drop packets addressed to another host (seen only because the NIC is in promiscuous mode)
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;
    //...
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;

    iph = ip_hdr(skb);

    // a series of checks follow, e.g. header length and checksum validation
    iph = ip_hdr(skb);
    skb->transport_header = skb->network_header + iph->ihl*4;
    //...
    return skb;
    //...
}
Once these checks pass, ip_rcv_finish is called:
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    struct net_device *dev = skb->dev;
    int ret;

    /* if ingress device is enslaved to an L3 master device pass the
     * skb to its handler for processing
     */
    skb = l3mdev_ip_rcv(skb);
    if (!skb)
        return NET_RX_SUCCESS;

    ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
    if (ret != NET_RX_DROP)
        ret = dst_input(skb);
    return ret;
}

static int ip_rcv_finish_core(struct net *net, struct sock *sk,
                              struct sk_buff *skb, struct net_device *dev,
                              const struct sk_buff *hint)
{
    //...
    /*
     * Initialise the virtual path cache for the packet. It describes
     * how the packet travels inside Linux networking.
     */
    if (!skb_valid_dst(skb)) {
        err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                   iph->tos, dev);
        if (unlikely(err))
            goto drop_error;
    }
    //...
    rt = skb_rtable(skb);
    if (rt->rt_type == RTN_MULTICAST) {
        __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
    } else if (rt->rt_type == RTN_BROADCAST) {
        __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
    } else if (skb->pkt_type == PACKET_BROADCAST ||
               skb->pkt_type == PACKET_MULTICAST) {
        struct in_device *in_dev = __in_dev_get_rcu(dev);

        /* RFC 1122 3.3.6:
         *
         *   When a host sends a datagram to a link-layer broadcast
         *   address, the IP destination address MUST be a legal IP
         *   broadcast or IP multicast address.
         *
         *   A host SHOULD silently discard a datagram that is received
         *   via a link-layer broadcast (see Section 2.4) but does not
         *   specify an IP multicast or broadcast destination address.
         *
         * This doesn't explicitly say L2 *broadcast*, but broadcast is
         * in a way a form of multicast and the most common use case for
         * this is 802.11 protecting against cross-station spoofing (the
         * so-called "hole-196" attack) so do it for both.
         */
        if (in_dev &&
            IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
            goto drop;
    }

    return NET_RX_SUCCESS;

drop:
    kfree_skb(skb);
    return NET_RX_DROP;

drop_error:
    if (err == -EXDEV)
        __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
    goto drop;
}

/* Input packet from network to transport. */
static inline int dst_input(struct sk_buff *skb)
{
    return skb_dst(skb)->input(skb);
}
For packets destined to the local host, the route lookup (ip_route_input_noref, via rt_dst_alloc) sets dst->input to ip_local_deliver, so dst_input ends up calling:
/*
 * Deliver IP Packets to the higher protocol layers.
 */
int ip_local_deliver(struct sk_buff *skb)
{
    /*
     * Reassemble IP fragments.
     */
    struct net *net = dev_net(skb->dev);

    // if the packet is a fragment, reassemble it first
    if (ip_is_fragment(ip_hdr(skb))) {
        if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
            return 0;
    }

    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
                   net, NULL, skb, skb->dev, NULL,
                   ip_local_deliver_finish);
}

static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    __skb_pull(skb, skb_network_header_len(skb));

    rcu_read_lock();
    ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
    rcu_read_unlock();

    return 0;
}

void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
    const struct net_protocol *ipprot;
    int raw, ret;

resubmit:
    // deliver a copy to any matching raw sockets
    raw = raw_local_deliver(skb, protocol);

    // hand the packet to the upper-layer protocol, e.g. TCP or ICMP
    ipprot = rcu_dereference(inet_protos[protocol]);
    if (ipprot) {
        //...
        ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
                              skb);
        //...
    } else {
        if (!raw) {
            if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                //...
                // send an ICMP "protocol unreachable" back to the sender
                icmp_send(skb, ICMP_DEST_UNREACH,
                          ICMP_PROT_UNREACH, 0);
            }
            kfree_skb(skb);
        } else {
            __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
            consume_skb(skb);
        }
    }
}
Entries in inet_protos are registered through inet_add_protocol; for TCP:
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)

/* thinking of making this const? Don't.
 * early_demux can change based on sysctl.
 */
static struct net_protocol tcp_protocol = {
    .early_demux         = tcp_v4_early_demux,
    .early_demux_handler = tcp_v4_early_demux,
    .handler             = tcp_v4_rcv,
    .err_handler         = tcp_v4_err,
    .no_policy           = 1,
    .netns_ok            = 1,
    .icmp_strict_tag_validation = 1,
};
So the next step is tcp_v4_rcv.
TCP Layer Processing
/*
 * From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
    //...
    th = (const struct tcphdr *)skb->data;
    iph = ip_hdr(skb);
lookup:
    // look up the sock this segment belongs to
    sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
                           th->dest, sdif, &refcounted);
    if (!sk)
        goto no_tcp_socket;

process:
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;

    // covered in the connection-establishment part: this is the path for the
    // request sock from which the newly accepted socket is created
    if (sk->sk_state == TCP_NEW_SYN_RECV) {
        //...
    }
    //...
    th = (const struct tcphdr *)skb->data;
    iph = ip_hdr(skb);
    tcp_v4_fill_cb(skb, iph, th);

    skb->dev = NULL;

    // also covered earlier: a listening socket receiving the client's SYN
    // and replying with SYN+ACK
    if (sk->sk_state == TCP_LISTEN) {
        ret = tcp_v4_do_rcv(sk, skb);
        goto put_and_return;
    }
    //...
    // no user-space task currently owns the socket
    if (!sock_owned_by_user(sk)) {
        skb_to_free = sk->sk_rx_skb_cache;
        sk->sk_rx_skb_cache = NULL;
        ret = tcp_v4_do_rcv(sk, skb);
    } else {
        // a user is receiving: queue the segment to the backlog
        if (tcp_add_backlog(sk, skb))
            goto discard_and_relse;
        skb_to_free = NULL;
    }
    bh_unlock_sock(sk);
    if (skb_to_free)
        __kfree_skb(skb_to_free);

put_and_return:
    if (refcounted)
        sock_put(sk);

    return ret;
    //...
}
Receiving a packet involves three queues: the backlog queue, the prequeue, and the sk_receive_queue. (The prequeue was removed in newer kernels, which is why it does not appear in the code shown here; the out_of_order_queue additionally holds out-of-order segments.)
Why three queues? Because three different actors may end up processing incoming data:
- The softirq handler: while tcp_v4_rcv executes we are still inside softirq processing, so the work occupies the softirq.
- The user-space process: when it issues a read system call, it also has to fetch data from a queue.
- The kernel protocol stack itself: even if the user process never calls read, packets that arrive still need somewhere to be queued.
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    // handling for the ESTABLISHED state
    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        struct dst_entry *dst = sk->sk_rx_dst;

        sock_rps_save_rxhash(sk, skb);
        sk_mark_napi_id(sk, skb);
        if (dst) {
            if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                !dst->ops->check(dst, 0)) {
                dst_release(dst);
                sk->sk_rx_dst = NULL;
            }
        }
        // receive the segment
        tcp_rcv_established(sk, skb);
        return 0;
    }
    //...
    // handling for the other states, analysed in the connection-establishment part
    if (tcp_rcv_state_process(sk, skb)) {
        rsk = sk;
        goto reset;
    }
    return 0;
    //...
}

/*
 * TCP receive function for the ESTABLISHED state.
 *
 * It is split into a fast path and a slow path. The fast path is
 * disabled when:
 * - A zero window was announced from us - zero window probing
 *   is only handled properly in the slow path.
 * - Out of order segments arrived.
 * - Urgent data is expected.
 * - There is no buffer space left
 * - Unexpected TCP flags/window values/header lengths are received
 *   (detected by checking the TCP header against pred_flags)
 * - Data is sent in both directions. Fast path only supports pure senders
 *   or pure receivers (this means either the sequence number or the ack
 *   value must stay constant)
 * - Unexpected TCP option.
 *
 * When these conditions are not satisfied it drops into a standard
 * receive procedure patterned after RFC793 to handle all cases.
 * The first three cases are guaranteed by proper pred_flags setting,
 * the rest is checked inline. Fast processing is turned on in
 * tcp_data_queue when everything is OK.
 */
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
    // only the slow path is shown here
slow_path:
    //...
    /* step 7: process the segment text */
    // queue the skb onto the socket
    tcp_data_queue(sk, skb);
    //...
}

static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
    // the incoming segment is exactly the next one we expect (seq == rcv_nxt)
    /* Queue data for delivery to the user.
     * Packets in sequence go to the receive queue.
     * Out of sequence packets to the out_of_order_queue.
     */
    if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
        if (tcp_receive_window(tp) == 0) {
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
            goto out_of_window;
        }

        /* Ok. In sequence. In window. */
queue_and_out:
        if (skb_queue_len(&sk->sk_receive_queue) == 0)
            sk_forced_mem_schedule(sk, skb->truesize);
        else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
            sk->sk_data_ready(sk);
            goto drop;
        }

        // put the skb onto sk_receive_queue
        eaten = tcp_queue_rcv(sk, skb, &fragstolen);
        if (skb->len)
            tcp_event_data_recv(sk, skb);
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
            tcp_fin(sk);

        // now that the missing segment is in sk_receive_queue, move previously
        // out-of-order segments from out_of_order_queue into sk_receive_queue
        if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
            tcp_ofo_queue(sk);

            /* RFC5681. 4.2. SHOULD send immediate ACK, when
             * gap in queue is filled.
             */
            if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
        }
        //...
    }

    // end_seq is not after rcv_nxt: we already have everything this segment
    // carries, so our earlier ACK was probably lost -- force another ACK
    if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
        tcp_rcv_spurious_retrans(sk, skb);
        /* A retransmit, 2nd most common case. Force an immediate ack. */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
        tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

out_of_window:
        tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
        inet_csk_schedule_ack(sk);
drop:
        tcp_drop(sk, skb);
        return;
    }

    // seq is at or beyond rcv_nxt + tcp_receive_window: the sender has overrun
    // our receive window; reply with an ACK that advertises the (zero) window
    // so the peer knows it must stop sending
    /* Out of window. F.e. zero window probe. */
    if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
        goto out_of_window;

    // seq < rcv_nxt but end_seq > rcv_nxt: the head of the segment was already
    // received (its ACK likely got lost and it was retransmitted), while the
    // part from rcv_nxt to end_seq is new and can go into sk_receive_queue
    if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
        /* Partial packet, seq < rcv_next < end_seq */
        tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

        /* If window is closed, drop tail of packet. But after
         * remembering D-SACK for its head made in previous line.
         */
        if (!tcp_receive_window(tp)) {
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
            goto out_of_window;
        }
        goto queue_and_out;
    }

    // the segment lies inside the receive window but is not the one expected
    // next (not rcv_nxt), i.e. it arrived out of order: add it to out_of_order_queue
    tcp_data_queue_ofo(sk, skb);
}

static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
                                      bool *fragstolen)
{
    int eaten;
    struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);

    eaten = (tail &&
             // try to coalesce with the tail skb
             tcp_try_coalesce(sk, tail,
                              skb, fragstolen)) ? 1 : 0;
    // advance tp->rcv_nxt to end_seq: having accepted this segment,
    // update the next expected sequence number
    tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
    if (!eaten) {
        // append to the receive queue
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        skb_set_owner_r(skb, sk);
    }
    return eaten;
}
After the data is queued, sk_data_ready is called to wake up the user-space process so it can read. Where does that process wait in the first place? In tcp_recvmsg:
/*
 * This routine copies from a sock struct into the user buffer.
 *
 * Technical note: in 2.3 we work on _locked_ socket, so that
 * tricks with *seq access order and skb->users are not required.
 * Probably, code can be easily improved even more.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                int flags, int *addr_len)
{
    //...
    do {
        u32 offset;
        //...
        /* Next get a buffer. */
        // take an skb from the receive queue
        last = skb_peek_tail(&sk->sk_receive_queue);
        skb_queue_walk(&sk->sk_receive_queue, skb) {
            last = skb;
            //...
            offset = *seq - TCP_SKB_CB(skb)->seq;
            //...
            // data available in sk_receive_queue
            if (offset < skb->len)
                goto found_ok_skb;
            //...
        }

        /* Well, if we have backlog, try to process it now yet. */
        // backlog queue handling
        if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
            break;
        //...
        // backlog queue handling
        if (copied >= target) {
            /* Do not sleep, just process backlog. */
            release_sock(sk);
            lock_sock(sk);
        } else {
            // nothing to read yet: wait for data
            sk_wait_data(sk, &timeo, last);
        }
        //...
found_ok_skb:
        /* Ok so how much can we use? */
        used = skb->len - offset;
        if (len < used)
            used = len;
        //...
        if (!(flags & MSG_TRUNC)) {
            // copy the data to the user process
            err = skb_copy_datagram_msg(skb, offset, msg, used);
            //...
        }
        //...
}

void release_sock(struct sock *sk)
{
    spin_lock_bh(&sk->sk_lock.slock);

    if (sk->sk_backlog.tail)
        __release_sock(sk);

    /* Warning : release_cb() might need to release sk ownership,
     * ie call sock_release_ownership(sk) before us.
     */
    if (sk->sk_prot->release_cb)
        sk->sk_prot->release_cb(sk);

    sock_release_ownership(sk);
    if (waitqueue_active(&sk->sk_lock.wq))
        wake_up(&sk->sk_lock.wq);
    spin_unlock_bh(&sk->sk_lock.slock);
}

void __release_sock(struct sock *sk)
    __releases(&sk->sk_lock.slock)
    __acquires(&sk->sk_lock.slock)
{
    struct sk_buff *skb, *next;

    while ((skb = sk->sk_backlog.head) != NULL) {
        sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

        spin_unlock_bh(&sk->sk_lock.slock);

        do {
            next = skb->next;
            prefetch(next);
            WARN_ON_ONCE(skb_dst_is_noref(skb));
            skb_mark_not_on_list(skb);
            // receive the queued segment
            sk_backlog_rcv(sk, skb);

            cond_resched();

            skb = next;
        } while (skb != NULL);

        spin_lock_bh(&sk->sk_lock.slock);
    }

    /*
     * Doing the zeroing here guarantee we can not loop forever
     * while a wild producer attempts to flood us.
     */
    sk->sk_backlog.len = 0;
}

static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
    if (sk_memalloc_socks() && skb_pfmemalloc(skb))
        return __sk_backlog_rcv(sk, skb);

    return sk->sk_backlog_rcv(sk, skb);
}
For TCP, sk->sk_backlog_rcv points to tcp_v4_do_rcv.
Moving up a level, let's look at the callers of tcp_recvmsg, i.e. the socket layer.
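From user space, the two situations tcp_recvmsg distinguishes look like this (a hedged illustration; connected_fd is assumed to be an already-established TCP socket): with MSG_DONTWAIT the call returns EAGAIN instead of sleeping in sk_wait_data, while a plain blocking recv sleeps until sk_data_ready wakes it:

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

void demo_recv(int connected_fd)              /* assumed: an ESTABLISHED TCP socket */
{
    char buf[4096];
    ssize_t n;

    n = recv(connected_fd, buf, sizeof(buf), MSG_DONTWAIT);
    if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
        printf("receive queue empty, not sleeping\n");

    n = recv(connected_fd, buf, sizeof(buf), 0);  /* blocks in tcp_recvmsg()/sk_wait_data() */
    printf("woken up, copied %zd bytes\n", n);
}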
Socket Layer Flow
inet_recvmsg is what calls tcp_recvmsg:
/**
 * sock_recvmsg - receive a message from @sock
 * @sock: socket
 * @msg: message to receive
 * @flags: message flags
 *
 * Receives @msg from @sock, passing through LSM. Returns the total number
 * of bytes received, or an error.
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
    int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);

    return err ?: sock_recvmsg_nosec(sock, msg, flags);
}

static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                                     int flags)
{
    return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
                              inet_recvmsg, sock, msg, msg_data_left(msg),
                              flags);
}

int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                 int flags)
{
    struct sock *sk = sock->sk;
    int addr_len = 0;
    int err;

    if (likely(!(flags & MSG_ERRQUEUE)))
        sock_rps_record_flow(sk);

    err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
                          sk, msg, size, flags & MSG_DONTWAIT,
                          flags & ~MSG_DONTWAIT, &addr_len);
    if (err >= 0)
        msg->msg_namelen = addr_len;
    return err;
}
Continuing upward, the VFS layer:
VFS Layer Flow
An earlier chapter analysed the write path; here is the corresponding read path, again starting from the file operations table:
/*
 * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 * in the operation structures but are done directly via the socketcall() multiplexor.
 */
static const struct file_operations socket_file_ops = {
    .owner      = THIS_MODULE,
    .llseek     = no_llseek,
    .read_iter  = sock_read_iter,
    .write_iter = sock_write_iter,
    //...
};

static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
    //...
    res = sock_recvmsg(sock, &msg, msg.msg_flags);
    *to = msg.msg_iter;
    return res;
}
sock_read_iter then calls into the socket-layer interface (sock_recvmsg) described above.
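This is why, from user space, a plain read() on a connected socket behaves the same as recv() with flags of 0: both funnel through sock_read_iter/sock_recvmsg into tcp_recvmsg. A small hedged illustration:

#include <unistd.h>
#include <sys/socket.h>

/* read() on a socket fd takes the VFS path shown above (sock_read_iter ->
 * sock_recvmsg), so it ends up in the same tcp_recvmsg as recv() does. */
ssize_t read_some(int sockfd, void *buf, size_t len)
{
    return read(sockfd, buf, len);        /* equivalent to recv(sockfd, buf, len, 0) */
}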
Resource Cleanup
How tcpdump Captures Packets
References
图解Linux网络包接收过程
用户态 tcpdump 如何实现抓到内核网络包的?