1.新建socket 函数原形: static int inet_create(struct socket *sock, int protocol) 在net/ipv4/af_inet.c中 详细解释

/* Create a new AF_INET socket: pick the protocol ops for the requested
 * socket type, allocate and initialise the struct sock. */
static int inet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct proto *prot;

	sock->state = SS_UNCONNECTED;           /* mark the socket as not yet connected */
	sk = sk_alloc(PF_INET, GFP_KERNEL, 1);  /* allocate the sock structure */ /* net/core/sock.c */
	if (sk == NULL)
		goto do_oom;

	switch (sock->type) {
	case SOCK_STREAM:                       /* TCP */
		if (protocol && protocol != IPPROTO_TCP)
			goto free_and_noproto;
		protocol = IPPROTO_TCP;
		prot = &tcp_prot;               /* tcp_prot is defined in net/ipv4/tcp_ipv4.c */
		sock->ops = &inet_stream_ops;   /* socket operations for STREAM sockets */
		break;
	case SOCK_SEQPACKET:                    /* not supported */
		goto free_and_badtype;
	case SOCK_DGRAM:                        /* UDP */
		if (protocol && protocol != IPPROTO_UDP)
			goto free_and_noproto;
		protocol = IPPROTO_UDP;
		sk->no_check = UDP_CSUM_DEFAULT;
		prot=&udp_prot;                 /* udp_prot is defined in net/ipv4/udp.c */
		sock->ops = &inet_dgram_ops;    /* socket operations for DGRAM sockets */
		break;
	case SOCK_RAW:                          /* RAW */
		if (!capable(CAP_NET_RAW))      /* check permission to create a SOCK_RAW socket */
			goto free_and_badperm;
		if (!protocol)                  /* protocol must not be 0 */
			goto free_and_noproto;
		prot = &raw_prot;               /* raw_prot is defined in net/ipv4/raw.c */
		sk->reuse = 1;                  /* allow address reuse */
		sk->num = protocol;
		sock->ops = &inet_dgram_ops;    /* RAW shares some behaviour with DGRAM */
		if (protocol == IPPROTO_RAW)
			sk->protinfo.af_inet.hdrincl = 1;  /* caller builds its own IP header */
		break;
	default:
		goto free_and_badtype;
	}

	if (ipv4_config.no_pmtu_disc)
		sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
	else
		sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT;

	sk->protinfo.af_inet.id = 0;

	sock_init_data(sock,sk);                /* initialise the generic sock fields */ /* net/core/sock.c */

	sk->destruct = inet_sock_destruct;      /* inet_sock_destruct is called when the socket is destroyed */
	sk->zapped = 0;
	sk->family = PF_INET;
	sk->protocol = protocol;
	sk->prot = prot;
	sk->backlog_rcv = prot->backlog_rcv;    /* prot->backlog_rcv(): see each protocol's definition */

	sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;  /* default TTL */ /* tunable via /proc/sys/net/ipv4/ip_default_ttl */

	sk->protinfo.af_inet.mc_loop = 1;
	sk->protinfo.af_inet.mc_ttl = 1;
	sk->protinfo.af_inet.mc_index = 0;
	sk->protinfo.af_inet.mc_list = NULL;

#ifdef INET_REFCNT_DEBUG
	atomic_inc(&inet_sock_nr);
#endif

	if (sk->num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		sk->sport = htons(sk->num);     /* set the local port */
		/* Add to protocol hash chains. */
		sk->prot->hash(sk);
	}

	if (sk->prot->init) {
		int err = sk->prot->init(sk);   /* protocol-specific socket initialisation */
		if (err != 0) {
			inet_sock_release(sk);
			return(err);
		}
	}
	return(0);

free_and_badtype:
	sk_free(sk);                            /* release the sock memory */
	return -ESOCKTNOSUPPORT;
free_and_badperm:
	sk_free(sk);
	return -EPERM;
free_and_noproto:
	sk_free(sk);
	return -EPROTONOSUPPORT;
do_oom:
	return -ENOBUFS;
}

在net/core/sock.c

/* Generic initialisation shared by all protocol families. */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->receive_queue);  /* initialise the three queues: receive, write, error */
	skb_queue_head_init(&sk->write_queue);
	skb_queue_head_init(&sk->error_queue);

	init_timer(&sk->timer);                   /* initialise the timer */

	sk->allocation = GFP_KERNEL;
	sk->rcvbuf = sysctl_rmem_default;
	sk->sndbuf = sysctl_wmem_default;
	sk->state = TCP_CLOSE;
	sk->zapped = 1;
	sk->socket = sock;

	if(sock) {
		sk->type = sock->type;
		sk->sleep = &sock->wait;
		sock->sk = sk;
	} else
		sk->sleep = NULL;

	sk->dst_lock = RW_LOCK_UNLOCKED;
	sk->callback_lock = RW_LOCK_UNLOCKED;

	/* sock_def_wakeup(), sock_def_readable(),
	 * sock_def_write_space(), sock_def_error_report(),
	 * sock_def_destruct() are in net/core/sock.c */
	sk->state_change = sock_def_wakeup;
	sk->data_ready = sock_def_readable;
	sk->write_space = sock_def_write_space;
	sk->error_report = sock_def_error_report;
	sk->destruct = sock_def_destruct;

	sk->peercred.pid = 0;
	sk->peercred.uid = -1;
	sk->peercred.gid = -1;
	sk->rcvlowat = 1;
	sk->rcvtimeo = MAX_SCHEDULE_TIMEOUT;      /* set receive and send timeouts */
	sk->sndtimeo = MAX_SCHEDULE_TIMEOUT;

	atomic_set(&sk->refcnt, 1);
}

1.1 SOCK_STREAM的初始化 在net/ipv4/tcp_ipv4.c

/* TCP-specific initialisation, run from inet_create() via sk->prot->init. */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them. -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->state = TCP_CLOSE;

	sk->write_space = tcp_write_space;  /* tcp_write_space() is in net/ipv4/tcp.c */
	sk->use_write_queue = 1;

	sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;  /* ipv4_specific is in net/ipv4/tcp_ipv4.c */

	sk->sndbuf = sysctl_tcp_wmem[1];  /* set the send and receive buffer sizes */
	sk->rcvbuf = sysctl_tcp_rmem[1];  /* sysctl_tcp_* are in net/ipv4/tcp.c */

	atomic_inc(&tcp_sockets_allocated);  /* tcp_sockets_allocated counts the TCP sockets currently allocated */
	return 0;
}

SOCK_DGRAM无初始化

1.2 SOCK_RAW初始化 在net/ipv4/raw.c

/* RAW-specific initialisation: clear the ICMP filter for ICMP sockets. */
static int raw_init(struct sock *sk)
{
	struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4);
	if (sk->num == IPPROTO_ICMP)
		memset(&tp->filter, 0, sizeof(tp->filter));
	return 0;
}

2.Server

2.1 bind

/* Default bind for AF_INET sockets: validate the address, check port
 * privilege, record rcv_saddr/saddr and claim the local port. */
static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
	struct sock *sk=sock->sk;
	unsigned short snum;
	int chk_addr_ret;
	int err;

	/* If the socket has its own bind function then use it. (RAW) */
	if(sk->prot->bind)
		return sk->prot->bind(sk, uaddr, addr_len);  /* only SOCK_RAW defines its own bind */

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);  /* inet_addr_type returns the type of the address */ /* in net/ipv4/fib_frontend.c */

	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed. It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 * is temporarily down)
	 */
	if (sysctl_ip_nonlocal_bind == 0 &&
	    sk->protinfo.af_inet.freebind == 0 &&
	    addr->sin_addr.s_addr != INADDR_ANY &&
	    chk_addr_ret != RTN_LOCAL &&
	    chk_addr_ret != RTN_MULTICAST &&
	    chk_addr_ret != RTN_BROADCAST)
		return -EADDRNOTAVAIL;

	snum = ntohs(addr->sin_port);
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))  /* privilege is required to bind a port in 1-1024 */
		return -EACCES;

	/* We keep a pair of addresses. rcv_saddr is the one
	 * used by hash lookups, and saddr is used for transmit.
	 *
	 * In the BSD API these are the same except where it
	 * would be illegal to use them (multicast/broadcast) in
	 * which case the sending device address is used.
	 */
	lock_sock(sk);

	/* Check these errors (active socket, double bind). */
	err = -EINVAL;
	if ((sk->state != TCP_CLOSE) || (sk->num != 0))
		goto out;

	sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		sk->saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here. */
	if (sk->prot->get_port(sk, snum) != 0) {  /* get_port checks whether the port may be (re)used */
		sk->saddr = sk->rcv_saddr = 0;
		err = -EADDRINUSE;
		goto out;
	}

	if (sk->rcv_saddr)
		sk->userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)
		sk->userlocks |= SOCK_BINDPORT_LOCK;
	sk->sport = htons(sk->num);
	sk->daddr = 0;
	sk->dport = 0;
	sk_dst_reset(sk);
	err = 0;
out:
	release_sock(sk);
	return err;
}

SOCK_STREAM和SOCK_DGRAM用默认的bind

2.1.1 SOCK_RAW的bind 在net/ipv4/raw.c

/* RAW sockets bind an address only; no port handling is involved. */
static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
	int ret = -EINVAL;
	int chk_addr_ret;

	if (sk->state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
		goto out;
	chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);  /* inet_addr_type returns the type of the address */ /* in net/ipv4/fib_frontend.c */
	ret = -EADDRNOTAVAIL;
	if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
	    chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
		goto out;
	sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;  /* sk->rcv_saddr: the bound local address */ /* sk->saddr: the source address */
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		sk->saddr = 0;  /* Use device */  /* for multicast/broadcast addresses the source address is 0 */
	sk_dst_reset(sk);
	ret = 0;
out:
	return ret;
}

2.2 listen

2.2.1 SOCK_STREAM的listen 在net/ipv4/af_inet.c

/* Move a STREAM socket into the listening state (or just adjust backlog). */
int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char old_state;
	int err;

	lock_sock(sk);

	err = -EINVAL;
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
		goto out;

	old_state = sk->state;
	if (!((1<<old_state)&(TCPF_CLOSE|TCPF_LISTEN)))
		goto out;

	/* Really, if the socket is already in listen state
	 * we can only allow the backlog to be adjusted.
	 */
	if (old_state != TCP_LISTEN) {
		err = tcp_listen_start(sk);  /* the real TCP listen work */
		if (err)
			goto out;
	}
	sk->max_ack_backlog = backlog;
	err = 0;

out:
	release_sock(sk);
	return err;
}

tcp_listen_start在net/ipv4/tcp.h

/* Allocate the listen bookkeeping, claim the port and enter TCP_LISTEN. */
int tcp_listen_start(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_listen_opt *lopt;

	sk->max_ack_backlog = 0;
	sk->ack_backlog = 0;
	tp->accept_queue = tp->accept_queue_tail = NULL;
	tp->syn_wait_lock = RW_LOCK_UNLOCKED;
	tcp_delack_init(tp);  /* zero the delayed-ACK state */ /* include/net/tcp.h */

	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
	if (!lopt)
		return -ENOMEM;

	memset(lopt, 0, sizeof(struct tcp_listen_opt));
	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
		if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
			break;

	write_lock_bh(&tp->syn_wait_lock);
	tp->listen_opt = lopt;
	write_unlock_bh(&tp->syn_wait_lock);

	/* There is race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters to hash table only
	 * after validation is complete.
	 */
	sk->state = TCP_LISTEN;
	if (sk->prot->get_port(sk, sk->num) == 0) {  /* confirm the port is not already in use */
		sk->sport = htons(sk->num);          /* set the source port */
		sk_dst_reset(sk);
		sk->prot->hash(sk);                  /* add the socket to the hash table */
		return 0;
	}

	sk->state = TCP_CLOSE;
	write_lock_bh(&tp->syn_wait_lock);
	tp->listen_opt = NULL;
	write_unlock_bh(&tp->syn_wait_lock);
	kfree(lopt);
	return -EADDRINUSE;
}

SOCK_DGRAM 和 SOCK_RAW 不支持listen

2.3 accept

2.3.1 SOCK_STREAM的accept 在net/ipv4/af_inet.c

/* Accept an established connection and graft it onto newsock. */
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1 = sock->sk;
	struct sock *sk2;
	int err = -EINVAL;

	if((sk2 = sk1->prot->accept(sk1,flags,&err)) == NULL)
		goto do_err;

	lock_sock(sk2);

	BUG_TRAP((1<<sk2->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE));

	sock_graft(sk2, newsock);  /* graft sk2 onto newsock */ /* in include/net/sock.h */

	newsock->state = SS_CONNECTED;
	release_sock(sk2);
	return 0;

do_err:
	return err;
}

SOCK_DGRAM 和 SOCK_RAW 不支持 accept

2.3.1.1 TCP协议的accept 在net/ipv4/tcp.c

/* TCP accept: dequeue a completed connection, sleeping if necessary. */
struct sock *tcp_accept(struct sock *sk, int flags, int *err)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct open_request *req;
	struct sock *newsk;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->state != TCP_LISTEN)  /* check that the socket is in the listen state */
		goto out;

	/* Find already established connection */
	if (!tp->accept_queue) {  /* is a completed connection already queued? */
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);  /* determine whether we are in blocking mode */ /* in include/net/sock.h */

		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)  /* non-blocking mode: return immediately */
			goto out;

		error = wait_for_connect(sk, timeo);  /* sleep until a connection arrives */
		if (error)
			goto out;
	}

	req = tp->accept_queue;
	if ((tp->accept_queue = req->dl_next) == NULL)
		tp->accept_queue_tail = NULL;

	newsk = req->sk;
	tcp_acceptq_removed(sk);    /* decrement sk's pending-connection count */ /* in include/net/tcp.h */
	tcp_openreq_fastfree(req);  /* free the open_request */ /* in include/net/tcp.h */
	BUG_TRAP(newsk->state != TCP_SYN_RECV);
	release_sock(sk);
	return newsk;

out:
	release_sock(sk);
	*err = error;
	return NULL;
}

/* Called only when the socket is in blocking mode. */
/* in net/ipv4/tcp.c */
static int wait_for_connect(struct sock * sk, long timeo)
{
	DECLARE_WAITQUEUE(wait, current);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	add_wait_queue_exclusive(sk->sleep, &wait);
	for (;;) {
		current->state = TASK_INTERRUPTIBLE;
		release_sock(sk);
		if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
			timeo = schedule_timeout(timeo);  /* sleep for up to timeo */
		lock_sock(sk);
		err = 0;
		if (sk->tp_pinfo.af_tcp.accept_queue)  /* the accept queue is non-empty */ /* i.e. a connection has arrived */
			break;
		err = -EINVAL;
		if (sk->state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);
	return err;
}

3.Client

3.1 connect

3.1.1 SOCK_STREAM的connect 在net/ipv4/af_inet.c

/* STREAM connect: start the protocol connect and, in blocking mode,
 * wait until the three-way handshake completes or fails. */
int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
			int addr_len, int flags)
{
	struct sock *sk=sock->sk;
	int err;
	long timeo;

	lock_sock(sk);

	if (uaddr->sa_family == AF_UNSPEC) {
		err = sk->prot->disconnect(sk, flags);  /* tear the connection down */
		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
		goto out;
	}

	switch (sock->state) {
	default:
		err = -EINVAL;
		goto out;
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	case SS_CONNECTING:
		err = -EALREADY;
		/* Fall out of switch with err, set for this state */
		break;
	case SS_UNCONNECTED:
		err = -EISCONN;
		if (sk->state != TCP_CLOSE)
			goto out;

		err = -EAGAIN;
		if (sk->num == 0) {
			if (sk->prot->get_port(sk, 0) != 0)  /* check whether the port may be used */
				goto out;
			sk->sport = htons(sk->num);
		}

		err = sk->prot->connect(sk, uaddr, addr_len);  /* invoke the protocol's connect */
		if (err < 0)
			goto out;

		sock->state = SS_CONNECTING;  /* mark the socket as connecting */

		/* Just entered SS_CONNECTING state; the only
		 * difference is that return value in non-blocking
		 * case is EINPROGRESS, rather than EALREADY.
		 */
		err = -EINPROGRESS;
		break;
	}

	timeo = sock_sndtimeo(sk, flags&O_NONBLOCK);  /* determine whether we are in blocking mode */ /* in include/net/sock.h */

	if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {  /* handshake not finished yet */
		/* Error code is set above */
		if (!timeo || !inet_wait_for_connect(sk, timeo))  /* non-blocking mode returns immediately */ /* blocking mode sleeps in inet_wait_for_connect() */
			goto out;

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
	}

	/* Connection was closed by RST, timeout, ICMP error
	 * or another process disconnected us.
	 */
	if (sk->state == TCP_CLOSE)
		goto sock_error;

	/* sk->err may be not zero now, if RECVERR was ordered by user
	 * and error was received after socket entered established state.
	 * Hence, it is handled normally after connect() return successfully.
	 */

	sock->state = SS_CONNECTED;  /* mark the socket as connected */
	err = 0;
out:
	release_sock(sk);
	return err;

sock_error:
	err = sock_error(sk) ? : -ECONNABORTED;
	sock->state = SS_UNCONNECTED;
	if (sk->prot->disconnect(sk, flags))
		sock->state = SS_DISCONNECTING;
	goto out;
}

/* Called only when the socket is in blocking mode. */
/* in net/ipv4/af_inet.c */
static long inet_wait_for_connect(struct sock *sk, long timeo)
{
	DECLARE_WAITQUEUE(wait, current);

	__set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue(sk->sleep, &wait);

	/* Basic assumption: if someone sets sk->err, he _must_
	 * change state of the socket from TCP_SYN_*.
	 * Connect() does not allow to get error notifications
	 * without closing the socket.
	 */
	while ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
		release_sock(sk);
		timeo = schedule_timeout(timeo);  /* go to sleep */
		lock_sock(sk);
		if (signal_pending(current) || !timeo)
			break;
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(sk->sleep, &wait);
	return timeo;
}