目录
- 数据接收流程
- 驱动层
- 网络层
- ip_local_deliver
- ip_local_deliver_finish
- 传输层
- tcp_v4_rcv
- tcp_v4_do_rcv
- tcp_rcv_established
- tcp_recvmsg
linux内核源码下载:https://cdn.kernel.org/pub/linux/kernel/
我下载的是:linux-5.11.1.tar.gz(注意:下文摘录的部分代码来自较早的内核版本,例如其中的prequeue队列、CONFIG_NET_DMA等在新版本内核中已被移除,阅读时请以实际下载的源码为准。)
数据接收流程
1,网卡一般通过触发中断来通知有数据到达;网卡driver接收到数据后,调用kernel的api:netif_rx(),把数据包交给内核。
2,接着数据被送到IP层ip_local_deliver_finish(),经过剥离ip头部,把数据往TCP层发。
3,tcp层tcp_v4_rcv()收到数据后,再调用tcp_rcv_established()处理(对端发来的ACK由其中的tcp_ack()处理,回复给对端的ACK则由tcp_ack_snd_check()触发)。如果当前用户进程没有正在读取数据,则将skb插入到接收队列中:tcp_queue_rcv()判断接收队列是否为空,如果不为空,则尝试将skb合并到接收队列的尾部。最后由tcp_recvmsg()从接收队列中逐个地将skb读取到用户设置的buffer中去。
4,上层通过recvmsg等函数去接收数据时,就是从buffer中去读取的。
调用栈:
1、从下往上:IP数据报 -> tcp_v4_rcv -> tcp_v4_do_rcv -> tcp_rcv_established -> tcp_data_queue ->sk_data_ready
2、从上往下:应用层recvfrom -> SYSCALL_DEFINE6(recvfrom, ...) -> __sys_recvfrom -> sock_recvmsg -> sock_recvmsg_nosec -> inet_recvmsg -> tcp_recvmsg
驱动层
netif_rx()
所在目录:/linux-5.11.1/net/core/dev.c。
该函数在网卡驱动程序与linux内核之间建立了一道桥梁,将网卡接收上来的数据包(sk_buff形式)插入内核维护的接收缓冲区队列当中。
它的主要任务是把数据帧添加到CPU的输入队列input_pkt_queue中,随后标记软中断,由软中断负责后续把数据帧上传给TCP/IP协议栈。
网络层
所在目录:/linux-5.11.1/net/ipv4/ip_input.c。
ip_local_deliver(解析IP头部,组包)
ip_local_deliver_finish(根据IP报头的protocol字段,找到对应的L4协议,TCP/UDP)。
ip_local_deliver
/*
 * ip_local_deliver - pass a locally destined IPv4 packet up the stack.
 *
 * A fragmented packet is first handed to ip_defrag(); when that call
 * returns non-zero, processing stops here and 0 is returned.  Otherwise
 * the packet traverses the NF_INET_LOCAL_IN netfilter hook, with
 * ip_local_deliver_finish() as the continuation.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	struct net *netns = dev_net(skb->dev);

	/* Fragment reassembly: only attempted when the header says so. */
	if (ip_is_fragment(ip_hdr(skb)) &&
	    ip_defrag(netns, skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	/* Run the LOCAL_IN hook, then continue in ip_local_deliver_finish(). */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
		       netns, NULL, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
ip_local_deliver_finish
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{/* 去掉ip头 */__skb_pull(skb, skb_network_header_len(skb));rcu_read_lock();{/* 获取协议 */int protocol = ip_hdr(skb)->protocol;const struct net_protocol *ipprot;int raw;resubmit:/* 原始套接口,复制一个副本,输出到该套接口 */raw = raw_local_deliver(skb, protocol);/* 获取协议处理结构 */ipprot = rcu_dereference(inet_protos[protocol]);if (ipprot) {int ret;if (!ipprot->no_policy) {if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {kfree_skb(skb);goto out;}nf_reset(skb);}/* 协议上层收包处理函数 */ret = ipprot->handler(skb);if (ret < 0) {protocol = -ret;goto resubmit;}__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);} /* 没有协议接收该数据包 */else {/* 原始套接口未接收或接收异常 */if (!raw) {if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);/* 发送icmp */icmp_send(skb, ICMP_DEST_UNREACH,ICMP_PROT_UNREACH, 0);}/* 丢包 */kfree_skb(skb);} /* 原始套接口接收 */else {__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);/* 释放包 */consume_skb(skb);}}}out:rcu_read_unlock();return 0;
}
传输层
根据IP报头协议,以TCP为例,TCP接收函数为:int tcp_v4_rcv(struct sk_buff *skb)。
所在目录:/linux-5.11.1/net/ipv4/tcp_ipv4.c。
tcp_v4_rcv
int tcp_v4_rcv(struct sk_buff *skb)
{const struct iphdr *iph;struct tcphdr *th;struct sock *sk;int ret;//非本机数据包扔掉if (skb->pkt_type != PACKET_HOST)goto discard_it;/* Count it even if it's bad */TCP_INC_STATS_BH(TCP_MIB_INSEGS);//下面主要是对TCP段的长度进行校验。注意pskb_may_pull()除了校验,还有一个额外的功能,//如果一个TCP段在传输过程中被网络层分片,那么在目的端的网络层会重新组包,这会导致传给//TCP的skb的分片结构中包含多个skb,这种情况下,该函数会将分片结构重组到线性数据区//保证skb的线性区域至少有20个字节数据if (!pskb_may_pull(skb, sizeof(struct tcphdr)))goto discard_it;th = tcp_hdr(skb);if (th->doff < sizeof(struct tcphdr) / 4)goto bad_packet;//保证skb的线性区域至少包括实际的TCP首部if (!pskb_may_pull(skb, th->doff * 4))goto discard_it;//数据包校验相关,校验失败,则悄悄丢弃,不产生任何的差错报文/* An explanation is required here, I think.* Packet length and doff are validated by header prediction,* provided case of th->doff==0 is eliminated.* So, we defer the checks. */if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))goto bad_packet;//初始化skb中的控制块th = tcp_hdr(skb);iph = ip_hdr(skb);TCP_SKB_CB(skb)->seq = ntohl(th->seq);TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +skb->len - th->doff * 4);TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);TCP_SKB_CB(skb)->when = 0;TCP_SKB_CB(skb)->flags = iph->tos;TCP_SKB_CB(skb)->sacked = 0;//根据传入段的源和目的地址信息从ehash或者bhash中查询对应的TCB,这一步决定了//输入数据包应该由哪个套接字处理,获取到TCB时,还会持有一个引用计数sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr,th->source, iph->daddr, th->dest, inet_iif(skb));if (!sk)goto no_tcp_socket;process://TCP_TIME_WAIT需要做特殊处理,这里先不关注if (sk->sk_state == TCP_TIME_WAIT)goto do_time_wait;//IPSec相关if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))goto discard_and_relse;nf_reset(skb);//TCP套接字过滤器,如果数据包被过滤掉了,结束处理过程if (sk_filter(sk, skb))goto discard_and_relse;//到了传输层,该字段已经没有意义,将其置为空skb->dev = NULL;//先持锁,这样进程上下文和其它软中断则无法操作该TCBbh_lock_sock_nested(sk);ret = 0;//如果当前TCB没有被进程上下文锁定,首先尝试将数据包放入prequeue队列,//如果prequeue队列没有处理,再将其处理后放入receive队列。如果TCB已//经被进程上下文锁定,那么直接将数据包放入backlog队列if (!sock_owned_by_user(sk)) {//DMA部分,忽略
#ifdef CONFIG_NET_DMAstruct tcp_sock *tp = tcp_sk(sk);if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)tp->ucopy.dma_chan = get_softnet_dma();if (tp->ucopy.dma_chan)ret = tcp_v4_do_rcv(sk, skb);else
#endif{//prequeue没有接收该数据包时返回0,那么交由tcp_v4_do_rcv()处理if (!tcp_prequeue(sk, skb))ret = tcp_v4_do_rcv(sk, skb);}} else {//TCB被用户进程锁定,直接将数据包放入backlog队列sk_add_backlog(sk, skb);}//释放锁bh_unlock_sock(sk);//释放TCB引用计数sock_put(sk);//返回处理结果return ret;no_tcp_socket:if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))goto discard_it;if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:TCP_INC_STATS_BH(TCP_MIB_INERRS);} else {tcp_v4_send_reset(NULL, skb);}discard_it:/* Discard frame. */kfree_skb(skb);return 0;discard_and_relse:sock_put(sk);goto discard_it;do_time_wait:
...
}
tcp_v4_do_rcv
所在目录:/linux-5.11.1/net/ipv4/tcp_ipv4.c。
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{struct sock *rsk;#ifdef CONFIG_TCP_MD5SIG/* We really want to reject the packet as early as possible if :* We're expecting an MD5'd packet and this is no MD5 tcp option.* There is an MD5 option and we're not expecting one.*/if (tcp_v4_inbound_md5_hash(sk, skb))goto discard;
#endif/* 当状态为ESTABLISHED时,用tcp_rcv_established()接收处理 */if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */struct dst_entry *dst = sk->sk_rx_dst;sock_rps_save_rxhash(sk, skb);if (dst) {if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, 0) == NULL) {dst_release(dst);sk->sk_rx_dst = NULL;}}/* 连接已建立时的处理路径 */tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);return 0;}/* 检查报文长度、报文校验和 */if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))goto csum_err;/* 如果这个sock处于监听状态,被动打开时的处理,包括收到SYN或ACK */if (sk->sk_state == TCP_LISTEN) {/* 返回值:* NULL,错误* nsk == sk,接收到SYN* nsk != sk,接收到ACK*/struct sock *nsk = tcp_v4_hnd_req(sk, skb);if (! nsk)goto discard;if (nsk != sk) { /* 接收到ACK时 */sock_rps_save_rxhash(nsk, skb);if (tcp_child_process(sk, nsk, skb)) { /* 处理新的sock */rsk = nsk;goto reset;}return 0;}} elsesock_rps_save_rx(sk, skb);/* 处理除了ESTABLISHED和TIME_WAIT之外的所有状态,包括SYN_SENT状态 */if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {rsk = sk;goto reset;}return 0;reset:tcp_v4_send_reset(rsk, skb); /* 发送被动的RST包 */discard:kfree_skb(skb);return 0;csum_err:TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);goto discard;
}
tcp_rcv_established
1、状态为ESTABLISHED时,用tcp_rcv_established()接收处理。
2、状态为LISTEN时,说明这个sock处于监听状态,用于被动打开的接收处理,包括SYN和ACK。
3、当状态不为ESTABLISHED或TIME_WAIT时,用tcp_rcv_state_process()处理。
所在目录:/linux-5.11.1/net/ipv4/tcp_input.c。
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,struct tcphdr *th, unsigned len)
{struct tcp_sock *tp = tcp_sk(sk);int res;/** Header prediction.* The code loosely follows the one in the famous* "30 instruction TCP receive" Van Jacobson mail.** Van's trick is to deposit buffers into socket queue* on a device interrupt, to call tcp_recv function* on the receive process context and checksum and copy* the buffer to user space. smart...** Our current scheme is not silly either but we take the* extra cost of the net_bh soft interrupt processing...* We do checksum and copy also but from device to kernel.*/tp->rx_opt.saw_tstamp = 0;/* pred_flags is 0xS?10 << 16 + snd_wnd* if header_prediction is to be made* 'S' will always be tp->tcp_header_len >> 2* '?' will be 0 for the fast path, otherwise pred_flags is 0 to* turn it off (when there are holes in the receive* space for instance)* PSH flag is ignored.*///预定向标志和输入数据段的标志比较//数据段序列号是否正确if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {int tcp_header_len = tp->tcp_header_len;/* Timestamp header prediction: tcp_header_len* is automatically equal to th->doff*4 due to pred_flags* match.*//* Check timestamp *///时间戳选项之外如果还有别的选项就送给Slow Path处理if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {/* No? Slow path! 
*/if (!tcp_parse_aligned_timestamp(tp, th))goto slow_path;//对数据包做PAWS快速检查,如果检查走Slow Path处理/* If PAWS failed, check it more carefully in slow path */if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)goto slow_path;/* DO NOT update ts_recent here, if checksum fails* and timestamp was corrupted part, it will result* in a hung connection since we will drop all* future packets due to the PAWS test.*/}//数据包长度太小if (len <= tcp_header_len) {/* Bulk data transfer: sender */if (len == tcp_header_len) {/* Predicted packet is in window by definition.* seq == rcv_nxt and rcv_wup <= rcv_nxt.* Hence, check seq<=rcv_wup reduces to:*/if (tcp_header_len ==(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&tp->rcv_nxt == tp->rcv_wup)tcp_store_ts_recent(tp);/* We know that such packets are checksummed* on entry.*/tcp_ack(sk, skb, 0);__kfree_skb(skb);tcp_data_snd_check(sk);return 0;} else { /* Header too small */TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);goto discard;}} else {int eaten = 0;int copied_early = 0;//tp->copied_seq表示未读的数据包序列号//tp->rcv_nxt表示下一个期望读取的数据包序列号//len-tcp_header_len小于tp->ucpoy.len表示数据包还没有复制完if (tp->copied_seq == tp->rcv_nxt &&len - tcp_header_len <= tp->ucopy.len) {
#ifdef CONFIG_NET_DMAif (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {copied_early = 1;eaten = 1;}
#endif//当前进程是否有锁定//当前进程的全局指针current//tp->ucopy.task指针是否等于当前进程if (tp->ucopy.task == current &&sock_owned_by_user(sk) && !copied_early) {__set_current_state(TASK_RUNNING);//将数据包复制到应用层空间if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))eaten = 1;}//复制成功if (eaten) {/* Predicted packet is in window by definition.* seq == rcv_nxt and rcv_wup <= rcv_nxt.* Hence, check seq<=rcv_wup reduces to:*/if (tcp_header_len ==(sizeof(struct tcphdr) +TCPOLEN_TSTAMP_ALIGNED) &&tp->rcv_nxt == tp->rcv_wup)tcp_store_ts_recent(tp);tcp_rcv_rtt_measure_ts(sk, skb);__skb_pull(skb, tcp_header_len);tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);}//清除prequeue队列中已经复制的数据包,并回复ackif (copied_early)tcp_cleanup_rbuf(sk, skb->len);}//复制不成功if (!eaten) {//从新计算校验和if (tcp_checksum_complete_user(sk, skb))goto csum_error;/* Predicted packet is in window by definition.* seq == rcv_nxt and rcv_wup <= rcv_nxt.* Hence, check seq<=rcv_wup reduces to:*/if (tcp_header_len ==(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&tp->rcv_nxt == tp->rcv_wup)tcp_store_ts_recent(tp);tcp_rcv_rtt_measure_ts(sk, skb);if ((int)skb->truesize > sk->sk_forward_alloc)goto step5;//大块数据传送NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);/* Bulk data transfer: receiver *///去掉tcp头部__skb_pull(skb, tcp_header_len);//将数据包加入到sk_receive_queue队列中__skb_queue_tail(&sk->sk_receive_queue, skb);skb_set_owner_r(skb, sk);tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;}//更新延迟回答时钟超时间隔值tcp_event_data_recv(sk, skb);if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {/* Well, only one small jumplet in fast path... */tcp_ack(sk, skb, FLAG_DATA);tcp_data_snd_check(sk);if (!inet_csk_ack_scheduled(sk))goto no_ack;}//收到数据后回复ack确认if (!copied_early || tp->rcv_nxt != tp->rcv_wup)__tcp_ack_snd_check(sk, 0);
no_ack:
#ifdef CONFIG_NET_DMAif (copied_early)__skb_queue_tail(&sk->sk_async_wait_queue, skb);else
#endifif (eaten)__kfree_skb(skb);else//no_ack标签表明套接字已经准备好下一次应用读sk->sk_data_ready(sk, 0);return 0;}}slow_path:if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))goto csum_error;/** Standard slow path.*/res = tcp_validate_incoming(sk, skb, th, 1);if (res <= 0)return -res;step5:if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)goto discard;tcp_rcv_rtt_measure_ts(sk, skb);/* Process urgent data. *///紧急数据段处理tcp_urg(sk, skb, th);/* step 7: process the segment text *///根据情况将数据复制到应用层或者//将数据加入sk_receive_queue常规队列中tcp_data_queue(sk, skb);tcp_data_snd_check(sk);tcp_ack_snd_check(sk);return 0;csum_error:TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);discard:__kfree_skb(skb);return 0;
}
tcp_recvmsg
用户进程调用recvfrom读取套接字缓冲区上的数据,实际是调用tcp_recvmsg函数将数据包从内核地址空间复制到用户地址空间。
所在目录:/linux-5.11.1/net/ipv4/tcp.c。
/*
 * tcp_recvmsg - copy received TCP data from the socket to the user buffer.
 *
 * Walks sk_receive_queue, copies in-sequence data to msg->msg_iov, and,
 * when no more queued data is available, processes the prequeue/backlog or
 * sleeps until data arrives, depending on how much has been copied so far.
 *
 * Returns `copied` on the normal path; the loop may have set it to a
 * negative error code when nothing was copied.  On the early-exit paths
 * (listening socket, MSG_OOB) it returns `err`.
 */
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
	/* TCP-specific view of the socket. */
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct task_struct *user_recv = NULL;
	int copied_early = 0;
	struct sk_buff *skb;
	u32 urg_hole = 0;

	/*
	 * Take the socket lock.  While it is held, tcp_v4_rcv() sees the
	 * socket as owned by a user context and puts incoming segments on
	 * the backlog queue.
	 */
	lock_sock(sk);

	TCP_CHECK_TIMER(sk);

	err = -ENOTCONN;
	/* A listening socket has nothing to read. */
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	/* Receive timeout; per the original note it is 0 when non-blocking. */
	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	/* Sequence number of the next byte to hand to the user. */
	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		/* Peeking works on a private copy so copied_seq is untouched. */
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	/* Minimum amount to read, as computed by sock_rcvlowat(). */
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	/* With DMA configured, the user pages may be pinned for early copy. */
#ifdef CONFIG_NET_DMA
	tp->ucopy.dma_chan = NULL;
	preempt_disable();
	skb = skb_peek_tail(&sk->sk_receive_queue);
	{
		int available = 0;

		if (skb)
			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
		if ((available < target) &&
		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
		    !sysctl_tcp_low_latency &&
		    dma_find_channel(DMA_MEMCPY)) {
			preempt_enable_no_resched();
			tp->ucopy.pinned_list =
					dma_pin_iovec_pages(msg->msg_iov, len);
		} else {
			preempt_enable_no_resched();
		}
	}
#endif

	/* Main loop: runs while len > 0 unless one of the breaks fires. */
	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				/* Error depends on whether a timeout is set. */
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		/* Walk the receive queue. */
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 KERN_INFO "recvmsg bug: copied %X "
				 "seq %X rcvnxt %X fl %X\n", *seq,
				 TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			/*
			 * Offset of the next unread byte inside this skb; if
			 * it is smaller than skb->len, this skb holds the
			 * data we are looking for.
			 */
			offset = *seq - TCP_SKB_CB(skb)->seq;
			/* Adjust the offset by one when SYN is set. */
			if (tcp_hdr(skb)->syn)
				offset--;
			/* Found the skb: go copy from it. */
			if (offset < skb->len)
				goto found_ok_skb;
			/* A FIN segment: handle it at found_fin_ok. */
			if (tcp_hdr(skb)->fin)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
					"copied %X seq %X rcvnxt %X fl %X\n",
					*seq, TCP_SKB_CB(skb)->seq,
					tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		/* Enough data copied and the backlog is empty: done. */
		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			/*
			 * Some data was copied: stop on error, closed socket,
			 * receive shutdown, zero timeout or pending signal.
			 */
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			/*
			 * Nothing copied yet: work out why and pick the
			 * matching return value.
			 */
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			/* Zero timeout: return -EAGAIN at once. */
			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			/* Interrupted by a signal. */
			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		/* Clean up what was read so far (per the note, this may ACK). */
		tcp_cleanup_rbuf(sk, copied);

		/*
		 * The receive queue has nothing more for us; set up prequeue
		 * processing, which runs in the context of the reading task.
		 */
		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				/* The current task becomes the prequeue reader. */
				user_recv = current;
				tp->ucopy.task = user_recv;
				/* Destination buffer supplied by the caller. */
				tp->ucopy.iov = msg->msg_iov;
			}

			/* Remaining length to copy. */
			tp->ucopy.len = len;

			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
				!(flags & (MSG_PEEK | MSG_TRUNC)));

			/*
			 * If the prequeue is not empty it must be processed
			 * before the socket is released, otherwise segment
			 * order would be broken.  The (pseudo)queues are, in
			 * order: packets in flight, backlog, prequeue,
			 * receive_queue.  The prequeue can be non-empty again
			 * on a later iteration, hence the jump to do_prequeue.
			 */
			if (!skb_queue_empty(&tp->ucopy.prequeue))
				goto do_prequeue;

			/* __ Set realtime policy in scheduler __ */
		}

#ifdef CONFIG_NET_DMA
		if (tp->ucopy.dma_chan)
			dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
#endif
		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else
			/* Not enough data yet: wait for more to arrive. */
			sk_wait_data(sk, &timeo);

#ifdef CONFIG_NET_DMA
		tcp_service_net_dma(sk, false);  /* Don't block */
		tp->ucopy.wakeup = 0;
#endif

		if (user_recv) {
			int chunk;

			/* __ Restore normal policy in scheduler __ */

			if ((chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				/* Account for data copied directly. */
				len -= chunk;
				copied += chunk;
			}

			/* Everything received so far has been read, yet the prequeue has data. */
			if (tp->rcv_nxt == tp->copied_seq &&
			    !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
				/* Process the prequeue. */
				tcp_prequeue_process(sk);

				if ((chunk = len - tp->ucopy.len) != 0) {
					NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
					/* Account for data copied from the prequeue. */
					len -= chunk;
					copied += chunk;
				}
			}
		}
		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			if (net_ratelimit())
				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				       current->comm, task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

	/* Copy data out of an skb found on sk_receive_queue. */
	found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		/*
		 * With SOCK_URGINLINE set the urgent byte is read in line;
		 * otherwise it is skipped here.
		 */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;

			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
				tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);

			if (tp->ucopy.dma_chan) {
				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
					tp->ucopy.dma_chan, skb, offset,
					msg->msg_iov, used,
					tp->ucopy.pinned_list);

				if (tp->ucopy.dma_cookie < 0) {

					printk(KERN_ALERT "dma_cookie < 0\n");

					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}

				dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);

				if ((offset + used) == skb->len)
					copied_early = 1;

			} else
#endif
			{
				/* Copy from the skb to the user iovec. */
				err = skb_copy_datagram_iovec(skb, offset,
						msg->msg_iov, used);
				if (err) {
					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}
			}
		}

		/* Advance the read sequence and the byte counters. */
		*seq += used;
		copied += used;
		len -= used;

		/* Re-tune the receive buffer space. */
		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			/* Urgent data consumed: re-evaluate the fast path. */
			tcp_fast_path_check(sk);
		}
		if (used + offset < skb->len)
			continue;

		if (tcp_hdr(skb)->fin)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = 0;
		}
		continue;

	/* The skb carries a FIN. */
	found_fin_ok:
		/* Process the FIN. */
		/* The FIN takes one sequence number. */
		++*seq;
		if (!(flags & MSG_PEEK)) {
			/* The skb is fully consumed: release it. */
			sk_eat_skb(sk, skb, copied_early);
			copied_early = 0;
		}
		break;
	} while (len > 0);

	/* After the loop, any data still on the prequeue must be processed. */
	if (user_recv) {
		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
			int chunk;

			tp->ucopy.len = copied > 0 ? len : 0;

			tcp_prequeue_process(sk);

			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				len -= chunk;
				copied += chunk;
			}
		}

		tp->ucopy.task = NULL;
		tp->ucopy.len = 0;
	}

#ifdef CONFIG_NET_DMA
	tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
	tp->ucopy.dma_chan = NULL;

	if (tp->ucopy.pinned_list) {
		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
		tp->ucopy.pinned_list = NULL;
	}
#endif

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket. I was just happy when found this 8) --ANK
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	TCP_CHECK_TIMER(sk);
	release_sock(sk);

	if (copied > 0)
		uid_stat_tcp_rcv(current_uid(), copied);
	return copied;

out:
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return err;

recv_urg:
	/* Urgent data: copy it to the user buffer. */
	err = tcp_recv_urg(sk, msg, len, flags);
	if (err > 0)
		uid_stat_tcp_rcv(current_uid(), err);
	goto out;
}