socket建立连接 sys_connect


请尊重原创版权,转载注明出处。

    connect()函数调用过程

connect(fd, servaddr, addrlen);
-> SYSCALL_DEFINE3()
-> sock->ops->connect() == inet_stream_connect (sock->ops即inet_stream_ops)
-> tcp_v4_connect()
    -> inet_hash_connect()
        -> __inet_hash_connect()
            -> check_established()
                -> __inet_check_established()
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
        int, addrlen)
{
    struct socket *sock;
    struct sockaddr_storage address;
    int err, fput_needed;
    /* 找到文件描述符对应的BSD socket结构,在前面的socket调用中建立*/
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
        goto out;
    /* copy对端的地址到内核空间 */
    err = move_addr_to_kernel(uservaddr, addrlen, (struct sockaddr *)&address);
    if (err < 0)
        goto out_put;

    err =
        security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
    if (err)
        goto out_put;
    /* 调用该BSD socket对应的connect调用 */
    err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
                 sock->file->f_flags);
out_put:
    /* 释放文件的引用 */
    fput_light(sock->file, fput_needed);
out:
    return err;
}
/*
 *    Connect to a remote host. There is regrettably still a little
 *    TCP 'magic' in here.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
            int addr_len, int flags)
{
    struct sock *sk = sock->sk;
    int err;
    long timeo;

    lock_sock(sk);

    if (uaddr->sa_family == AF_UNSPEC) {
        err = sk->sk_prot->disconnect(sk, flags);
        sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
        goto out;
    }

    switch (sock->state) {
    default:
        err = -EINVAL;
        goto out;
    case SS_CONNECTED:     /* 该BSD socket已连接*/
        err = -EISCONN;
        goto out;
    case SS_CONNECTING:   /* 该BSD socket正在连接*/
        err = -EALREADY;
        /* Fall out of switch with err, set for this state */
        break;
    case SS_UNCONNECTED:
        err = -EISCONN;
        if (sk->sk_state != TCP_CLOSE)
            goto out;
            /* INET SOCKET 调用协议特有connect操作符 */
        err = sk->sk_prot->connect(sk, uaddr, addr_len);
        if (err < 0)
            goto out;
            /* 上面的调用完成后,连接并没有完成,*/
        sock->state = SS_CONNECTING;

        /* Just entered SS_CONNECTING state; the only
         * difference is that return value in non-blocking
         * case is EINPROGRESS, rather than EALREADY.
         */
        err = -EINPROGRESS;
        break;
    }
    /* 获取连接超时时间*/
    timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

    if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        /* Error code is set above 进入定时等待 */
        if (!timeo || !inet_wait_for_connect(sk, timeo))
            goto out;

        err = sock_intr_errno(timeo);
        if (signal_pending(current))
            goto out;
    }

    /* Connection was closed by RST, timeout, ICMP error
     * or another process disconnected us.
     */
    if (sk->sk_state == TCP_CLOSE)
        goto sock_error;

    /* sk->sk_err may be not zero now, if RECVERR was ordered by user
     * and error was received after socket entered established state.
     * Hence, it is handled normally after connect() return successfully.
     */

    sock->state = SS_CONNECTED;
    err = 0;
out:
    release_sock(sk);
    return err;

sock_error:
    err = sock_error(sk) ? : -ECONNABORTED;
    sock->state = SS_UNCONNECTED;
    if (sk->sk_prot->disconnect(sk, flags))
        sock->state = SS_DISCONNECTING;
    goto out;
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    struct inet_sock *inet = inet_sk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
    struct rtable *rt;
    __be32 daddr, nexthop;
    int tmp;
    int err;

    if (addr_len < sizeof(struct sockaddr_in))
        return -EINVAL;

    if (usin->sin_family != AF_INET)
        return -EAFNOSUPPORT;
    /* 开始准备路由 */
    nexthop = daddr = usin->sin_addr.s_addr;
    if (inet->opt && inet->opt->srr) {
        if (!daddr)
            return -EINVAL;
        nexthop = inet->opt->faddr;
    }
    /* 调用路由模块获取出口信息,这里不深入 */
    tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                   IPPROTO_TCP,
                   inet->sport, usin->sin_port, sk, 1);
    if (tmp < 0) {
        if (tmp == -ENETUNREACH)
            IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        return tmp;
    }
    /* 如果获取的路由是广播或多播域, 返回网络不可达,tcp不支持多播与广播 */
    if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
        ip_rt_put(rt);
        return -ENETUNREACH;
    }

    if (!inet->opt || !inet->opt->srr)
        daddr = rt->rt_dst;

    if (!inet->saddr)
        inet->saddr = rt->rt_src;
    inet->rcv_saddr = inet->saddr;

    if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
        /* Reset inherited state */
        tp->rx_opt.ts_recent      = 0;
        tp->rx_opt.ts_recent_stamp = 0;
        tp->write_seq         = 0;
    }

    if (tcp_death_row.sysctl_tw_recycle &&
        !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
        struct inet_peer *peer = rt_get_peer(rt);
        /*
         * VJ's idea. We save last timestamp seen from
         * the destination in peer table, when entering state
         * TIME-WAIT * and initialize rx_opt.ts_recent from it,
         * when trying new connection.
         */
        if (peer != NULL &&
            peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
            tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
            tp->rx_opt.ts_recent = peer->tcp_ts;
        }
    }

    inet->dport = usin->sin_port;
    inet->daddr = daddr;

    inet_csk(sk)->icsk_ext_hdr_len = 0;
    if (inet->opt)
        inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
    /* mss_clamp */
    tp->rx_opt.mss_clamp = 536;

    /* Socket identity is still unknown (sport may be zero).
     * However we set state to SYN-SENT and not releasing socket
     * lock select source port, enter ourselves into the hash tables and
     * complete initialization after this.
     */
    tcp_set_state(sk, TCP_SYN_SENT);
    err = inet_hash_connect(&tcp_death_row, sk);
    if (err)
        goto failure;

    err = ip_route_newports(&rt, IPPROTO_TCP,
                inet->sport, inet->dport, sk);
    if (err)
        goto failure;

    /* OK, now commit destination to socket.  */
    sk->sk_gso_type = SKB_GSO_TCPV4;
    sk_setup_caps(sk, &rt->u.dst);

    if (!tp->write_seq)
        tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                               inet->daddr,
                               inet->sport,
                               usin->sin_port);
    /* id是IP包头的id域 */
    inet->id = tp->write_seq ^ jiffies;

    err = tcp_connect(sk);
    rt = NULL;
    if (err)
        goto failure;

    return 0;

failure:
    /*
     * This unhashes the socket and releases the local port,
     * if necessary.
     */
    tcp_set_state(sk, TCP_CLOSE);
    ip_rt_put(rt);
    sk->sk_route_caps = 0;
    inet->dport = 0;
    return err;
}

    当snum==0时,表明此时源端口没有指定,此时会随机选择一个空闲端口作为此次连接 的源端口。low和high分别表示可用端口的下限和上限,remaining表示可用端口的数,注意这里的可用只是指端口 可以用作源端口,其中部分端口可能已经作为其它socket的端口号在使用了,所以要循环1~remaining, 直到查找到空闲的源端口。

    下面来看下对每个端口的检查,即//choose a valid port部分的代码。 这里要先了解下tcp的内核表组成,udp的表内核表udptable只是一张hash表,tcp的表则稍复杂, 它的名字是tcp_hashinfo,在tcp_init()中被初始化,这个数据结构定义如下(省略了不相关的数据):

struct inet_hashinfo {
    struct inet_ehash_bucket *ehash;
    ……
    struct inet_bind_hashbucket *bhash;
    ……
    struct inet_listen_hashbucket  listening_hash[INET_LHTABLE_SIZE]
                    ____cacheline_aligned_in_smp;
};

    从定义可以看出,tcp表又分成了三张表ehash, bhash, listening_hash,其中ehash, listening_hash对应于socket处在TCP的ESTABLISHED, LISTEN状态,bhash对应于socket已绑定了本地地址。 三者间并不互斥,如一个socket可同时在bhash和ehash中,由于TIME_WAIT是一个比较特殊的状态,所以ehash又分成了 chain和twchain,为TIME_WAIT的socket单独形成一张表。

    回到刚才的代码,现在还只是建立socket连接,使用的就应该是tcp表中的bhash。 首先取得内核tcp表的bind表 – bhash,查看是否已有socket占用:

    如果没有,则调用inet_bind_bucket_create()创建一个bind表项tb, 并插入到bind表中,跳转至goto ok代码段; 如果有,则跳转至goto ok代码段。

    进入ok代码段表明已找到合适的bind表项(无论是创建的还是查找到的),调用inet_bind_hash()赋值源端口inet_num。

    inet_hash_connect()函数只是对__inet_hash_connect()函数进行了简单的封装 。在__inet_hash_connect()中如果已绑定了端口号,并且是和其他传输控制块共享绑定的端口号, 则会调用check_established参数指向的函数来检查这个绑定的端口号是否可用,代码如下所示:

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        struct sock *sk, u32 port_offset,
        int (*check_established)(struct inet_timewait_death_row *,
        struct sock *, __u16, struct inet_timewait_sock **),
        void (*hash)(struct sock *sk))
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    const unsigned short snum = inet_sk(sk)->num;
    struct inet_bind_hashbucket *head;
    struct inet_bind_bucket *tb;
    int ret;
    struct net *net = sock_net(sk);

    if (!snum) {
        int i, remaining, low, high, port;
        static u32 hint;
        u32 offset = hint + port_offset;
        struct hlist_node *node;
        struct inet_timewait_sock *tw = NULL;

        inet_get_local_port_range(&low, &high);
        remaining = (high - low) + 1;

        local_bh_enable();
        for (i = 1; i <= remaining; i++) {
            port = low + (i + offset) % remaining;
            if (inet_is_reserved_local_port(port)
                continue;
            head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
            spin_lock(&head->lock);
            inet_bind_bucket_for_each(tb, node, &head->chain) {
                if (net_eq(ib_net(tb), net) && tb->port == port) {
                    if (tb->fastreuse >= 0)
                        goto next_port;
                    WARN_ON(hlist_empty(&tb->owners));
                    if (!check_established(death_row, sk, port, &tw))
                        goto ok;
                    goto next_port;
                }
            }

            tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port);
            if (!tb) {
                spin_unlock(&head->lock);
                break;
            }
            tb->fastreuse = -1;
            tb->fastreuseport = -1;
            goto ok;
        next_port:
            spin_unlock(&head->lock);
        }
        local_bh_enable();

        return -EADDRNOTAVAIL;

ok:
        hint += i;

        inet_bind_hash(sk, tb, port);
        if (sk_unhashed(sk)) {
            inet_sk(sk)->sport = htons(port);
            hash(sk);
        }
        spin_unlock(&head->lock);
        if (tw) {
            inet_twsk_deschedule(tw, death_row);
            inet_twsk_put(tw);
        }

        ret = 0;
        goto out;
    }

    head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
    tb  = inet_csk(sk)->icsk_bind_hash;
    spin_lock_bh(&head->lock);
    if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
        hash(sk);
        spin_unlock_bh(&head->lock);
        return 0;
    } else {
        spin_unlock(&head->lock);
        /* No definite answer... Walk to established hash table */
        ret = check_established(death_row, sk, snum, NULL);
out:
        local_bh_enable();
        return ret;
    }
}

    (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next)这个判断条件就是用来 判断是不是只有当前传输控制块在使用已绑定的端口,条件为false时,会执行else分支,检查是否可用。 这么看来,调用bind()成功并不意味着这个端口就真的可以用。

    check_established参数对应的函数是__inet_check_established(), 在inet_hash_connect()中可以看到。在上面的代码中我们还注意到调用check_established()时第三个参数为NULL, 这在后面的分析中会用到。

    __inet_check_established()函数中,会分别在TIME_WAIT传输控制块和除TIME_WIAT、 LISTEN状态外的传输控制块中查找是已绑定的端口是否已经使用,代码片段如下所示:

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
            struct sock *sk, __u16 lport,
            struct inet_timewait_sock **twp)
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    struct inet_sock *inet = inet_sk(sk);
    __be32 daddr = inet->rcv_saddr;
    __be32 saddr = inet->daddr;
    int dif = sk->sk_bound_dev_if;
    INET_ADDR_COOKIE(acookie, saddr, daddr)
    const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
    struct net *net = sock_net(sk);
    unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
    struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
    spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
    struct sock *sk2;
    const struct hlist_nulls_node *node;
    struct inet_timewait_sock *tw;

    spin_lock(lock);

    /* Check TIME-WAIT sockets first. */
    sk_nulls_for_each(sk2, node, &head->twchain) {
        tw = inet_twsk(sk2);


        if (INET_TW_MATCH(sk2, net, hash, acookie,
                saddr, daddr, ports, dif)) {
            if (twsk_unique(sk, sk2, twp))
                goto unique;
            else
                goto not_unique;
        }
    }
    tw = NULL;

    /* And established part... */
    sk_nulls_for_each(sk2, node, &head->chain) {
        if (INET_MATCH(sk2, net, hash, acookie,
                saddr, daddr, ports, dif))
            goto not_unique;
    }

unique:
    ......
    return 0;

not_unique:
    spin_unlock(lock);
    return -EADDRNOTAVAIL;
}

    如果是TCP套接字,twsk_uniqueue()中会调用tcp_twsk_uniqueue()来判断,返回true的条件如下所示:

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
    const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
    struct tcp_sock *tp = tcp_sk(sk);

    if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
            get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
        ......
        return 1;
    }

    return 0;
}
/*
 * Build a SYN and send it off.
 */
int tcp_connect(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *buff;
    /* 初始化连接对应的INET socket结构的参数,为连接做准备 */
    tcp_connect_init(sk);
    /* 获取一个skb,由于是syn包,没有数据,所以大小是MAX_TCP_HEADER的16位对齐 */
    buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
    if (unlikely(buff == NULL))
        return -ENOBUFS;

    /* Reserve space for headers. */
    skb_reserve(buff, MAX_TCP_HEADER);

    tp->snd_nxt = tp->write_seq;
    /* 设置skb相关参数 */
    tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
    /* 设置ECN */
    TCP_ECN_send_syn(sk, buff);

    /* Send it off. */
    /* 保存该数据包的发送时间*/
    TCP_SKB_CB(buff)->when = tcp_time_stamp;
    tp->retrans_stamp = TCP_SKB_CB(buff)->when;
    skb_header_release(buff);
    /* 加入发送队列,待确认后在丢弃*/
    __tcp_add_write_queue_tail(sk, buff);
    sk->sk_wmem_queued += buff->truesize;
    sk_mem_charge(sk, buff->truesize);
    tp->packets_out += tcp_skb_pcount(buff);
    tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);

    /* We change tp->snd_nxt after the tcp_transmit_skb() call
     * in order to make this packet get counted in tcpOutSegs.
     */
    tp->snd_nxt = tp->write_seq;
    tp->pushed_seq = tp->write_seq;
    TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

    /* Timer for repeating the SYN until an answer. */
    inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
    return 0;
}
/*
 * Do all connect socket setups that can be done AF independent.
 */
static void tcp_connect_init(struct sock *sk)
{
    struct dst_entry *dst = __sk_dst_get(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    __u8 rcv_wscale;

    /* We'll fix this up when we get a response from the other end.
     * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
     */
    tp->tcp_header_len = sizeof(struct tcphdr) +
        (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

#ifdef CONFIG_TCP_MD5SIG
    if (tp->af_specific->md5_lookup(sk, sk) != NULL)
        tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif

    /* If user gave his TCP_MAXSEG, record it to clamp */
    if (tp->rx_opt.user_mss)
        tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
    tp->max_window = 0;
    /* 初始化MTU probe*/
    tcp_mtup_init(sk);
    /* 设置mss */
    tcp_sync_mss(sk, dst_mtu(dst));

    if (!tp->window_clamp)
        tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
    tp->advmss = dst_metric(dst, RTAX_ADVMSS);
    if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
        tp->advmss = tp->rx_opt.user_mss;

    tcp_initialize_rcv_mss(sk);
    /* 根据接收空间大小初始化一个通告窗口 */
    tcp_select_initial_window(tcp_full_space(sk),
                  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                  &tp->rcv_wnd,
                  &tp->window_clamp,
                  sysctl_tcp_window_scaling,
                  &rcv_wscale);

    tp->rx_opt.rcv_wscale = rcv_wscale;
    tp->rcv_ssthresh = tp->rcv_wnd;

    sk->sk_err = 0;
    sock_reset_flag(sk, SOCK_DONE);
    tp->snd_wnd = 0;
    /* 更新一些滑动窗口的成员*/
    tcp_init_wl(tp, tp->write_seq, 0);
    tp->snd_una = tp->write_seq;
    tp->snd_sml = tp->write_seq;
    tp->snd_up = tp->write_seq;
    tp->rcv_nxt = 0;
    tp->rcv_wup = 0;
    tp->copied_seq = 0;

    inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
    inet_csk(sk)->icsk_retransmits = 0;
    tcp_clear_retrans(tp);
}

    skb发送后,connect并没有返回,因为此时连接还没有建立,tcp进入等待状态,此时回到前面的inet_stream_connect函数

    在发送syn后进入等待状态

static long inet_wait_for_connect(struct sock *sk, long timeo)
{
    DEFINE_WAIT(wait);
    /* sk_sleep 保存此INET SOCKET的等待队列 */
    prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

    /* Basic assumption: if someone sets sk->sk_err, he _must_
     * change state of the socket from TCP_SYN_*.
     * Connect() does not allow to get error notifications
     * without closing the socket.
     */
    /* 定时等待知道状态变化 */
    while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        release_sock(sk);
        timeo = schedule_timeout(timeo);
        lock_sock(sk);
        if (signal_pending(current) || !timeo)
            break;
        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
    }
    finish_wait(sk->sk_sleep, &wait);
    return timeo;
}
文档信息
--------------
* 版权声明:自由转载-非商用
* 转载: [参考]

参考 参考 参考