From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from lists.gentoo.org (pigeon.gentoo.org [208.92.234.80]) by finch.gentoo.org (Postfix) with ESMTP id E530813838B for ; Mon, 6 Oct 2014 11:39:56 +0000 (UTC) Received: from pigeon.gentoo.org (localhost [127.0.0.1]) by pigeon.gentoo.org (Postfix) with SMTP id 577ACE08D0; Mon, 6 Oct 2014 11:39:56 +0000 (UTC) Received: from smtp.gentoo.org (smtp.gentoo.org [140.211.166.183]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by pigeon.gentoo.org (Postfix) with ESMTPS id 67F1EE08D0 for ; Mon, 6 Oct 2014 11:39:55 +0000 (UTC) Received: from oystercatcher.gentoo.org (oystercatcher.gentoo.org [148.251.78.52]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by smtp.gentoo.org (Postfix) with ESMTPS id C982F33BDDA for ; Mon, 6 Oct 2014 11:39:53 +0000 (UTC) Received: from localhost.localdomain (localhost [127.0.0.1]) by oystercatcher.gentoo.org (Postfix) with ESMTP id 87B16721E for ; Mon, 6 Oct 2014 11:39:52 +0000 (UTC) From: "Mike Pagano" To: gentoo-commits@lists.gentoo.org Content-Transfer-Encoding: 8bit Content-type: text/plain; charset=UTF-8 Reply-To: gentoo-dev@lists.gentoo.org, "Mike Pagano" Message-ID: <1412595591.f2f011b9a8a9057b75a30940d240fd4aaeb7d9e3.mpagano@gentoo> Subject: [gentoo-commits] proj/linux-patches:3.16 commit in: / X-VCS-Repository: proj/linux-patches X-VCS-Files: 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch X-VCS-Directories: / X-VCS-Committer: mpagano X-VCS-Committer-Name: Mike Pagano X-VCS-Revision: f2f011b9a8a9057b75a30940d240fd4aaeb7d9e3 X-VCS-Branch: 3.16 Date: Mon, 6 Oct 2014 11:39:52 +0000 (UTC) Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-Id: Gentoo Linux mail X-BeenThere: gentoo-commits@lists.gentoo.org X-Archives-Salt: 3db5dd51-1e84-4df0-a57e-9510395c8d21 X-Archives-Hash: a2cca512defb9a97185bfbd025db7f03 commit: f2f011b9a8a9057b75a30940d240fd4aaeb7d9e3 Author: Mike Pagano gentoo org> AuthorDate: Mon Oct 6 11:39:51 2014 +0000 Commit: Mike Pagano gentoo org> CommitDate: Mon Oct 6 11:39:51 2014 +0000 URL: http://sources.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=f2f011b9 Remove dup. --- 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch | 19230 -------------------------- 1 file changed, 19230 deletions(-) diff --git a/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch b/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch deleted file mode 100644 index 3000da3..0000000 --- a/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch +++ /dev/null @@ -1,19230 +0,0 @@ -diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c -index 768a0fb67dd6..5a46d91a8df9 100644 ---- a/drivers/infiniband/hw/cxgb4/cm.c -+++ b/drivers/infiniband/hw/cxgb4/cm.c -@@ -3432,7 +3432,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos) - */ - memset(&tmp_opt, 0, sizeof(tmp_opt)); - tcp_clear_options(&tmp_opt); -- tcp_parse_options(skb, &tmp_opt, 0, NULL); -+ tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL); - - req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req)); - memset(req, 0, sizeof(*req)); -diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h -index 2faef339d8f2..d86c853ffaad 100644 ---- a/include/linux/ipv6.h -+++ b/include/linux/ipv6.h -@@ -256,16 +256,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) - return inet_sk(__sk)->pinet6; - } - --static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops) --{ -- struct request_sock *req = reqsk_alloc(ops); -- -- if (req) -- inet_rsk(req)->pktopts = NULL; -- -- return req; --} -- - static inline struct raw6_sock *raw6_sk(const struct sock *sk) - { - return (struct raw6_sock *)sk; -@@ -309,12 +299,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) - return NULL; - } - --static inline struct inet6_request_sock * -- inet6_rsk(const struct request_sock *rsk) --{ -- return NULL; --} -- - static inline struct raw6_sock *raw6_sk(const struct sock *sk) - { - return NULL; -diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index ec89301ada41..99ea4b0e3693 100644 ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -2784,8 +2784,10 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, - bool zero_okay, - __sum16 check) - { -- if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { -- skb->csum_valid = 1; -+ if (skb_csum_unnecessary(skb)) { -+ return false; -+ } else if (zero_okay && !check) { -+ skb->ip_summed = CHECKSUM_UNNECESSARY; - return false; - } - -diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index a0513210798f..7bc2e078d6ca 100644 ---- a/include/linux/tcp.h -+++ b/include/linux/tcp.h -@@ -53,7 +53,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb) - /* TCP Fast Open */ - #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ - #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ --#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */ -+#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */ - - /* TCP Fast Open Cookie as stored in memory */ - struct tcp_fastopen_cookie { -@@ -72,6 +72,51 @@ struct tcp_sack_block { - u32 end_seq; - }; - -+struct tcp_out_options { -+ u16 options; /* bit field of OPTION_* */ -+ u8 ws; /* window scale, 0 to disable */ -+ u8 num_sack_blocks;/* number of SACK blocks to include */ -+ u8 hash_size; /* bytes in hash_location */ -+ u16 mss; /* 0 to disable */ -+ __u8 *hash_location; /* temporary pointer, overloaded */ -+ __u32 tsval, tsecr; /* need to include OPTION_TS */ -+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ -+#ifdef CONFIG_MPTCP -+ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */ -+ u8 dss_csum:1, -+ add_addr_v4:1, -+ add_addr_v6:1; /* dss-checksum required? */ -+ -+ union { -+ struct { -+ __u64 sender_key; /* sender's key for mptcp */ -+ __u64 receiver_key; /* receiver's key for mptcp */ -+ } mp_capable; -+ -+ struct { -+ __u64 sender_truncated_mac; -+ __u32 sender_nonce; -+ /* random number of the sender */ -+ __u32 token; /* token for mptcp */ -+ u8 low_prio:1; -+ } mp_join_syns; -+ }; -+ -+ struct { -+ struct in_addr addr; -+ u8 addr_id; -+ } add_addr4; -+ -+ struct { -+ struct in6_addr addr; -+ u8 addr_id; -+ } add_addr6; -+ -+ u16 remove_addrs; /* list of address id */ -+ u8 addr_id; /* address id (mp_join or add_address) */ -+#endif /* CONFIG_MPTCP */ -+}; -+ - /*These are used to set the sack_ok field in struct tcp_options_received */ - #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ - #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/ -@@ -95,6 +140,9 @@ struct tcp_options_received { - u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ - }; - -+struct mptcp_cb; -+struct mptcp_tcp_sock; -+ - static inline void tcp_clear_options(struct tcp_options_received *rx_opt) - { - rx_opt->tstamp_ok = rx_opt->sack_ok = 0; -@@ -111,10 +159,7 @@ struct tcp_request_sock_ops; - - struct tcp_request_sock { - struct inet_request_sock req; --#ifdef CONFIG_TCP_MD5SIG -- /* Only used by TCP MD5 Signature so far. */ - const struct tcp_request_sock_ops *af_specific; --#endif - struct sock *listener; /* needed for TFO */ - u32 rcv_isn; - u32 snt_isn; -@@ -130,6 +175,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) - return (struct tcp_request_sock *)req; - } - -+struct tcp_md5sig_key; -+ - struct tcp_sock { - /* inet_connection_sock has to be the first member of tcp_sock */ - struct inet_connection_sock inet_conn; -@@ -326,6 +373,37 @@ struct tcp_sock { - * socket. Used to retransmit SYNACKs etc. - */ - struct request_sock *fastopen_rsk; -+ -+ /* MPTCP/TCP-specific callbacks */ -+ const struct tcp_sock_ops *ops; -+ -+ struct mptcp_cb *mpcb; -+ struct sock *meta_sk; -+ /* We keep these flags even if CONFIG_MPTCP is not checked, because -+ * it allows checking MPTCP capability just by checking the mpc flag, -+ * rather than adding ifdefs everywhere. -+ */ -+ u16 mpc:1, /* Other end is multipath capable */ -+ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */ -+ send_mp_fclose:1, -+ request_mptcp:1, /* Did we send out an MP_CAPABLE? -+ * (this speeds up mptcp_doit() in tcp_recvmsg) -+ */ -+ mptcp_enabled:1, /* Is MPTCP enabled from the application ? */ -+ pf:1, /* Potentially Failed state: when this flag is set, we -+ * stop using the subflow -+ */ -+ mp_killed:1, /* Killed with a tcp_done in mptcp? */ -+ was_meta_sk:1, /* This was a meta sk (in case of reuse) */ -+ is_master_sk, -+ close_it:1, /* Must close socket in mptcp_data_ready? */ -+ closing:1; -+ struct mptcp_tcp_sock *mptcp; -+#ifdef CONFIG_MPTCP -+ struct hlist_nulls_node tk_table; -+ u32 mptcp_loc_token; -+ u64 mptcp_loc_key; -+#endif /* CONFIG_MPTCP */ - }; - - enum tsq_flags { -@@ -337,6 +415,8 @@ enum tsq_flags { - TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call - * tcp_v{4|6}_mtu_reduced() - */ -+ MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */ -+ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */ - }; - - static inline struct tcp_sock *tcp_sk(const struct sock *sk) -@@ -355,6 +435,7 @@ struct tcp_timewait_sock { - #ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *tw_md5_key; - #endif -+ struct mptcp_tw *mptcp_tw; - }; - - static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) -diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h -index 74af137304be..83f63033897a 100644 ---- a/include/net/inet6_connection_sock.h -+++ b/include/net/inet6_connection_sock.h -@@ -27,6 +27,8 @@ int inet6_csk_bind_conflict(const struct sock *sk, - - struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6, - const struct request_sock *req); -+u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, -+ const u32 rnd, const u32 synq_hsize); - - struct request_sock *inet6_csk_search_req(const struct sock *sk, - struct request_sock ***prevp, -diff --git a/include/net/inet_common.h b/include/net/inet_common.h -index fe7994c48b75..780f229f46a8 100644 ---- a/include/net/inet_common.h -+++ b/include/net/inet_common.h -@@ -1,6 +1,8 @@ - #ifndef _INET_COMMON_H - #define _INET_COMMON_H - -+#include -+ - extern const struct proto_ops inet_stream_ops; - extern const struct proto_ops inet_dgram_ops; - -@@ -13,6 +15,8 @@ struct sock; - struct sockaddr; - struct socket; - -+int inet_create(struct net *net, struct socket *sock, int protocol, int kern); -+int inet6_create(struct net *net, struct socket *sock, int protocol, int kern); - int inet_release(struct socket *sock); - int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags); -diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index 7a4313887568..f62159e39839 100644 ---- a/include/net/inet_connection_sock.h -+++ b/include/net/inet_connection_sock.h -@@ -30,6 +30,7 @@ - - struct inet_bind_bucket; - struct tcp_congestion_ops; -+struct tcp_options_received; - - /* - * Pointers to address related TCP functions -@@ -243,6 +244,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, - - struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); - -+u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, -+ const u32 synq_hsize); -+ - struct request_sock *inet_csk_search_req(const struct sock *sk, - struct request_sock ***prevp, - const __be16 rport, -diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h -index b1edf17bec01..6a32d8d6b85e 100644 ---- a/include/net/inet_sock.h -+++ b/include/net/inet_sock.h -@@ -86,10 +86,14 @@ struct inet_request_sock { - wscale_ok : 1, - ecn_ok : 1, - acked : 1, -- no_srccheck: 1; -+ no_srccheck: 1, -+ mptcp_rqsk : 1, -+ saw_mpc : 1; - kmemcheck_bitfield_end(flags); -- struct ip_options_rcu *opt; -- struct sk_buff *pktopts; -+ union { -+ struct ip_options_rcu *opt; -+ struct sk_buff *pktopts; -+ }; - u32 ir_mark; - }; - -diff --git a/include/net/mptcp.h b/include/net/mptcp.h -new file mode 100644 -index 000000000000..712780fc39e4 ---- /dev/null -+++ b/include/net/mptcp.h -@@ -0,0 +1,1439 @@ -+/* -+ * MPTCP implementation -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#ifndef _MPTCP_H -+#define _MPTCP_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ #define ntohll(x) be64_to_cpu(x) -+ #define htonll(x) cpu_to_be64(x) -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ #define ntohll(x) (x) -+ #define htonll(x) (x) -+#endif -+ -+struct mptcp_loc4 { -+ u8 loc4_id; -+ u8 low_prio:1; -+ struct in_addr addr; -+}; -+ -+struct mptcp_rem4 { -+ u8 rem4_id; -+ __be16 port; -+ struct in_addr addr; -+}; -+ -+struct mptcp_loc6 { -+ u8 loc6_id; -+ u8 low_prio:1; -+ struct in6_addr addr; -+}; -+ -+struct mptcp_rem6 { -+ u8 rem6_id; -+ __be16 port; -+ struct in6_addr addr; -+}; -+ -+struct mptcp_request_sock { -+ struct tcp_request_sock req; -+ /* hlist-nulls entry to the hash-table. Depending on whether this is a -+ * a new MPTCP connection or an additional subflow, the request-socket -+ * is either in the mptcp_reqsk_tk_htb or mptcp_reqsk_htb. -+ */ -+ struct hlist_nulls_node hash_entry; -+ -+ union { -+ struct { -+ /* Only on initial subflows */ -+ u64 mptcp_loc_key; -+ u64 mptcp_rem_key; -+ u32 mptcp_loc_token; -+ }; -+ -+ struct { -+ /* Only on additional subflows */ -+ struct mptcp_cb *mptcp_mpcb; -+ u32 mptcp_rem_nonce; -+ u32 mptcp_loc_nonce; -+ u64 mptcp_hash_tmac; -+ }; -+ }; -+ -+ u8 loc_id; -+ u8 rem_id; /* Address-id in the MP_JOIN */ -+ u8 dss_csum:1, -+ is_sub:1, /* Is this a new subflow? */ -+ low_prio:1, /* Interface set to low-prio? */ -+ rcv_low_prio:1; -+}; -+ -+struct mptcp_options_received { -+ u16 saw_mpc:1, -+ dss_csum:1, -+ drop_me:1, -+ -+ is_mp_join:1, -+ join_ack:1, -+ -+ saw_low_prio:2, /* 0x1 - low-prio set for this subflow -+ * 0x2 - low-prio set for another subflow -+ */ -+ low_prio:1, -+ -+ saw_add_addr:2, /* Saw at least one add_addr option: -+ * 0x1: IPv4 - 0x2: IPv6 -+ */ -+ more_add_addr:1, /* Saw one more add-addr. */ -+ -+ saw_rem_addr:1, /* Saw at least one rem_addr option */ -+ more_rem_addr:1, /* Saw one more rem-addr. */ -+ -+ mp_fail:1, -+ mp_fclose:1; -+ u8 rem_id; /* Address-id in the MP_JOIN */ -+ u8 prio_addr_id; /* Address-id in the MP_PRIO */ -+ -+ const unsigned char *add_addr_ptr; /* Pointer to add-address option */ -+ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */ -+ -+ u32 data_ack; -+ u32 data_seq; -+ u16 data_len; -+ -+ u32 mptcp_rem_token;/* Remote token */ -+ -+ /* Key inside the option (from mp_capable or fast_close) */ -+ u64 mptcp_key; -+ -+ u32 mptcp_recv_nonce; -+ u64 mptcp_recv_tmac; -+ u8 mptcp_recv_mac[20]; -+}; -+ -+struct mptcp_tcp_sock { -+ struct tcp_sock *next; /* Next subflow socket */ -+ struct hlist_node cb_list; -+ struct mptcp_options_received rx_opt; -+ -+ /* Those three fields record the current mapping */ -+ u64 map_data_seq; -+ u32 map_subseq; -+ u16 map_data_len; -+ u16 slave_sk:1, -+ fully_established:1, -+ establish_increased:1, -+ second_packet:1, -+ attached:1, -+ send_mp_fail:1, -+ include_mpc:1, -+ mapping_present:1, -+ map_data_fin:1, -+ low_prio:1, /* use this socket as backup */ -+ rcv_low_prio:1, /* Peer sent low-prio option to us */ -+ send_mp_prio:1, /* Trigger to send mp_prio on this socket */ -+ pre_established:1; /* State between sending 3rd ACK and -+ * receiving the fourth ack of new subflows. -+ */ -+ -+ /* isn: needed to translate abs to relative subflow seqnums */ -+ u32 snt_isn; -+ u32 rcv_isn; -+ u8 path_index; -+ u8 loc_id; -+ u8 rem_id; -+ -+#define MPTCP_SCHED_SIZE 4 -+ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8); -+ -+ struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified -+ * skb in the ofo-queue. -+ */ -+ -+ int init_rcv_wnd; -+ u32 infinite_cutoff_seq; -+ struct delayed_work work; -+ u32 mptcp_loc_nonce; -+ struct tcp_sock *tp; /* Where is my daddy? */ -+ u32 last_end_data_seq; -+ -+ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */ -+ struct timer_list mptcp_ack_timer; -+ -+ /* HMAC of the third ack */ -+ char sender_mac[20]; -+}; -+ -+struct mptcp_tw { -+ struct list_head list; -+ u64 loc_key; -+ u64 rcv_nxt; -+ struct mptcp_cb __rcu *mpcb; -+ u8 meta_tw:1, -+ in_list:1; -+}; -+ -+#define MPTCP_PM_NAME_MAX 16 -+struct mptcp_pm_ops { -+ struct list_head list; -+ -+ /* Signal the creation of a new MPTCP-session. */ -+ void (*new_session)(const struct sock *meta_sk); -+ void (*release_sock)(struct sock *meta_sk); -+ void (*fully_established)(struct sock *meta_sk); -+ void (*new_remote_address)(struct sock *meta_sk); -+ int (*get_local_id)(sa_family_t family, union inet_addr *addr, -+ struct net *net, bool *low_prio); -+ void (*addr_signal)(struct sock *sk, unsigned *size, -+ struct tcp_out_options *opts, struct sk_buff *skb); -+ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr, -+ sa_family_t family, __be16 port, u8 id); -+ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id); -+ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr); -+ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr); -+ -+ char name[MPTCP_PM_NAME_MAX]; -+ struct module *owner; -+}; -+ -+#define MPTCP_SCHED_NAME_MAX 16 -+struct mptcp_sched_ops { -+ struct list_head list; -+ -+ struct sock * (*get_subflow)(struct sock *meta_sk, -+ struct sk_buff *skb, -+ bool zero_wnd_test); -+ struct sk_buff * (*next_segment)(struct sock *meta_sk, -+ int *reinject, -+ struct sock **subsk, -+ unsigned int *limit); -+ void (*init)(struct sock *sk); -+ -+ char name[MPTCP_SCHED_NAME_MAX]; -+ struct module *owner; -+}; -+ -+struct mptcp_cb { -+ /* list of sockets in this multipath connection */ -+ struct tcp_sock *connection_list; -+ /* list of sockets that need a call to release_cb */ -+ struct hlist_head callback_list; -+ -+ /* High-order bits of 64-bit sequence numbers */ -+ u32 snd_high_order[2]; -+ u32 rcv_high_order[2]; -+ -+ u16 send_infinite_mapping:1, -+ in_time_wait:1, -+ list_rcvd:1, /* XXX TO REMOVE */ -+ addr_signal:1, /* Path-manager wants us to call addr_signal */ -+ dss_csum:1, -+ server_side:1, -+ infinite_mapping_rcv:1, -+ infinite_mapping_snd:1, -+ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */ -+ passive_close:1, -+ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */ -+ rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */ -+ -+ /* socket count in this connection */ -+ u8 cnt_subflows; -+ u8 cnt_established; -+ -+ struct mptcp_sched_ops *sched_ops; -+ -+ struct sk_buff_head reinject_queue; -+ /* First cache-line boundary is here minus 8 bytes. But from the -+ * reinject-queue only the next and prev pointers are regularly -+ * accessed. Thus, the whole data-path is on a single cache-line. -+ */ -+ -+ u64 csum_cutoff_seq; -+ -+ /***** Start of fields, used for connection closure */ -+ spinlock_t tw_lock; -+ unsigned char mptw_state; -+ u8 dfin_path_index; -+ -+ struct list_head tw_list; -+ -+ /***** Start of fields, used for subflow establishment and closure */ -+ atomic_t mpcb_refcnt; -+ -+ /* Mutex needed, because otherwise mptcp_close will complain that the -+ * socket is owned by the user. -+ * E.g., mptcp_sub_close_wq is taking the meta-lock. -+ */ -+ struct mutex mpcb_mutex; -+ -+ /***** Start of fields, used for subflow establishment */ -+ struct sock *meta_sk; -+ -+ /* Master socket, also part of the connection_list, this -+ * socket is the one that the application sees. -+ */ -+ struct sock *master_sk; -+ -+ __u64 mptcp_loc_key; -+ __u64 mptcp_rem_key; -+ __u32 mptcp_loc_token; -+ __u32 mptcp_rem_token; -+ -+#define MPTCP_PM_SIZE 608 -+ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8); -+ struct mptcp_pm_ops *pm_ops; -+ -+ u32 path_index_bits; -+ /* Next pi to pick up in case a new path becomes available */ -+ u8 next_path_index; -+ -+ /* Original snd/rcvbuf of the initial subflow. -+ * Used for the new subflows on the server-side to allow correct -+ * autotuning -+ */ -+ int orig_sk_rcvbuf; -+ int orig_sk_sndbuf; -+ u32 orig_window_clamp; -+ -+ /* Timer for retransmitting SYN/ACK+MP_JOIN */ -+ struct timer_list synack_timer; -+}; -+ -+#define MPTCP_SUB_CAPABLE 0 -+#define MPTCP_SUB_LEN_CAPABLE_SYN 12 -+#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12 -+#define MPTCP_SUB_LEN_CAPABLE_ACK 20 -+#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20 -+ -+#define MPTCP_SUB_JOIN 1 -+#define MPTCP_SUB_LEN_JOIN_SYN 12 -+#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12 -+#define MPTCP_SUB_LEN_JOIN_SYNACK 16 -+#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16 -+#define MPTCP_SUB_LEN_JOIN_ACK 24 -+#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24 -+ -+#define MPTCP_SUB_DSS 2 -+#define MPTCP_SUB_LEN_DSS 4 -+#define MPTCP_SUB_LEN_DSS_ALIGN 4 -+ -+/* Lengths for seq and ack are the ones without the generic MPTCP-option header, -+ * as they are part of the DSS-option. -+ * To get the total length, just add the different options together. -+ */ -+#define MPTCP_SUB_LEN_SEQ 10 -+#define MPTCP_SUB_LEN_SEQ_CSUM 12 -+#define MPTCP_SUB_LEN_SEQ_ALIGN 12 -+ -+#define MPTCP_SUB_LEN_SEQ_64 14 -+#define MPTCP_SUB_LEN_SEQ_CSUM_64 16 -+#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16 -+ -+#define MPTCP_SUB_LEN_ACK 4 -+#define MPTCP_SUB_LEN_ACK_ALIGN 4 -+ -+#define MPTCP_SUB_LEN_ACK_64 8 -+#define MPTCP_SUB_LEN_ACK_64_ALIGN 8 -+ -+/* This is the "default" option-length we will send out most often. -+ * MPTCP DSS-header -+ * 32-bit data sequence number -+ * 32-bit data ack -+ * -+ * It is necessary to calculate the effective MSS we will be using when -+ * sending data. -+ */ -+#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \ -+ MPTCP_SUB_LEN_SEQ_ALIGN + \ -+ MPTCP_SUB_LEN_ACK_ALIGN) -+ -+#define MPTCP_SUB_ADD_ADDR 3 -+#define MPTCP_SUB_LEN_ADD_ADDR4 8 -+#define MPTCP_SUB_LEN_ADD_ADDR6 20 -+#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8 -+#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20 -+ -+#define MPTCP_SUB_REMOVE_ADDR 4 -+#define MPTCP_SUB_LEN_REMOVE_ADDR 4 -+ -+#define MPTCP_SUB_PRIO 5 -+#define MPTCP_SUB_LEN_PRIO 3 -+#define MPTCP_SUB_LEN_PRIO_ADDR 4 -+#define MPTCP_SUB_LEN_PRIO_ALIGN 4 -+ -+#define MPTCP_SUB_FAIL 6 -+#define MPTCP_SUB_LEN_FAIL 12 -+#define MPTCP_SUB_LEN_FAIL_ALIGN 12 -+ -+#define MPTCP_SUB_FCLOSE 7 -+#define MPTCP_SUB_LEN_FCLOSE 12 -+#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12 -+ -+ -+#define OPTION_MPTCP (1 << 5) -+ -+#ifdef CONFIG_MPTCP -+ -+/* Used for checking if the mptcp initialization has been successful */ -+extern bool mptcp_init_failed; -+ -+/* MPTCP options */ -+#define OPTION_TYPE_SYN (1 << 0) -+#define OPTION_TYPE_SYNACK (1 << 1) -+#define OPTION_TYPE_ACK (1 << 2) -+#define OPTION_MP_CAPABLE (1 << 3) -+#define OPTION_DATA_ACK (1 << 4) -+#define OPTION_ADD_ADDR (1 << 5) -+#define OPTION_MP_JOIN (1 << 6) -+#define OPTION_MP_FAIL (1 << 7) -+#define OPTION_MP_FCLOSE (1 << 8) -+#define OPTION_REMOVE_ADDR (1 << 9) -+#define OPTION_MP_PRIO (1 << 10) -+ -+/* MPTCP flags: both TX and RX */ -+#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */ -+#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */ -+#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */ -+/* MPTCP flags: RX only */ -+#define MPTCPHDR_ACK 0x08 -+#define MPTCPHDR_SEQ64_SET 0x10 /* Did we received a 64-bit seq number? */ -+#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */ -+#define MPTCPHDR_DSS_CSUM 0x40 -+#define MPTCPHDR_JOIN 0x80 -+/* MPTCP flags: TX only */ -+#define MPTCPHDR_INF 0x08 -+ -+struct mptcp_option { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 ver:4, -+ sub:4; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u8 sub:4, -+ ver:4; -+#else -+#error "Adjust your defines" -+#endif -+}; -+ -+struct mp_capable { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 ver:4, -+ sub:4; -+ __u8 h:1, -+ rsv:5, -+ b:1, -+ a:1; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u8 sub:4, -+ ver:4; -+ __u8 a:1, -+ b:1, -+ rsv:5, -+ h:1; -+#else -+#error "Adjust your defines" -+#endif -+ __u64 sender_key; -+ __u64 receiver_key; -+} __attribute__((__packed__)); -+ -+struct mp_join { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 b:1, -+ rsv:3, -+ sub:4; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u8 sub:4, -+ rsv:3, -+ b:1; -+#else -+#error "Adjust your defines" -+#endif -+ __u8 addr_id; -+ union { -+ struct { -+ u32 token; -+ u32 nonce; -+ } syn; -+ struct { -+ __u64 mac; -+ u32 nonce; -+ } synack; -+ struct { -+ __u8 mac[20]; -+ } ack; -+ } u; -+} __attribute__((__packed__)); -+ -+struct mp_dss { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u16 rsv1:4, -+ sub:4, -+ A:1, -+ a:1, -+ M:1, -+ m:1, -+ F:1, -+ rsv2:3; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u16 sub:4, -+ rsv1:4, -+ rsv2:3, -+ F:1, -+ m:1, -+ M:1, -+ a:1, -+ A:1; -+#else -+#error "Adjust your defines" -+#endif -+}; -+ -+struct mp_add_addr { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 ipver:4, -+ sub:4; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u8 sub:4, -+ ipver:4; -+#else -+#error "Adjust your defines" -+#endif -+ __u8 addr_id; -+ union { -+ struct { -+ struct in_addr addr; -+ __be16 port; -+ } v4; -+ struct { -+ struct in6_addr addr; -+ __be16 port; -+ } v6; -+ } u; -+} __attribute__((__packed__)); -+ -+struct mp_remove_addr { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 rsv:4, -+ sub:4; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u8 sub:4, -+ rsv:4; -+#else -+#error "Adjust your defines" -+#endif -+ /* list of addr_id */ -+ __u8 addrs_id; -+}; -+ -+struct mp_fail { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u16 rsv1:4, -+ sub:4, -+ rsv2:8; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u16 sub:4, -+ rsv1:4, -+ rsv2:8; -+#else -+#error "Adjust your defines" -+#endif -+ __be64 data_seq; -+} __attribute__((__packed__)); -+ -+struct mp_fclose { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u16 rsv1:4, -+ sub:4, -+ rsv2:8; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u16 sub:4, -+ rsv1:4, -+ rsv2:8; -+#else -+#error "Adjust your defines" -+#endif -+ __u64 key; -+} __attribute__((__packed__)); -+ -+struct mp_prio { -+ __u8 kind; -+ __u8 len; -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 b:1, -+ rsv:3, -+ sub:4; -+#elif defined(__BIG_ENDIAN_BITFIELD) -+ __u8 sub:4, -+ rsv:3, -+ b:1; -+#else -+#error "Adjust your defines" -+#endif -+ __u8 addr_id; -+} __attribute__((__packed__)); -+ -+static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum) -+{ -+ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2); -+} -+ -+#define MPTCP_APP 2 -+ -+extern int sysctl_mptcp_enabled; -+extern int sysctl_mptcp_checksum; -+extern int sysctl_mptcp_debug; -+extern int sysctl_mptcp_syn_retries; -+ -+extern struct workqueue_struct *mptcp_wq; -+ -+#define mptcp_debug(fmt, args...) \ -+ do { \ -+ if (unlikely(sysctl_mptcp_debug)) \ -+ pr_err(__FILE__ ": " fmt, ##args); \ -+ } while (0) -+ -+/* Iterates over all subflows */ -+#define mptcp_for_each_tp(mpcb, tp) \ -+ for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next) -+ -+#define mptcp_for_each_sk(mpcb, sk) \ -+ for ((sk) = (struct sock *)(mpcb)->connection_list; \ -+ sk; \ -+ sk = (struct sock *)tcp_sk(sk)->mptcp->next) -+ -+#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \ -+ for (__sk = (struct sock *)(__mpcb)->connection_list, \ -+ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \ -+ __sk; \ -+ __sk = __temp, \ -+ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL) -+ -+/* Iterates over all bit set to 1 in a bitset */ -+#define mptcp_for_each_bit_set(b, i) \ -+ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1) -+ -+#define mptcp_for_each_bit_unset(b, i) \ -+ mptcp_for_each_bit_set(~b, i) -+ -+extern struct lock_class_key meta_key; -+extern struct lock_class_key meta_slock_key; -+extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4]; -+ -+/* This is needed to ensure that two subsequent key/nonce-generation result in -+ * different keys/nonces if the IPs and ports are the same. -+ */ -+extern u32 mptcp_seed; -+ -+#define MPTCP_HASH_SIZE 1024 -+ -+extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; -+ -+/* This second hashtable is needed to retrieve request socks -+ * created as a result of a join request. While the SYN contains -+ * the token, the final ack does not, so we need a separate hashtable -+ * to retrieve the mpcb. -+ */ -+extern struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE]; -+extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */ -+ -+/* Lock, protecting the two hash-tables that hold the token. Namely, -+ * mptcp_reqsk_tk_htb and tk_hashtable -+ */ -+extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */ -+ -+/* Request-sockets can be hashed in the tk_htb for collision-detection or in -+ * the regular htb for join-connections. We need to define different NULLS -+ * values so that we can correctly detect a request-socket that has been -+ * recycled. See also c25eb3bfb9729. -+ */ -+#define MPTCP_REQSK_NULLS_BASE (1U << 29) -+ -+ -+void mptcp_data_ready(struct sock *sk); -+void mptcp_write_space(struct sock *sk); -+ -+void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb, -+ struct sock *sk); -+void mptcp_ofo_queue(struct sock *meta_sk); -+void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp); -+void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied); -+int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, -+ gfp_t flags); -+void mptcp_del_sock(struct sock *sk); -+void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk); -+void mptcp_reinject_data(struct sock *orig_sk, int clone_it); -+void mptcp_update_sndbuf(const struct tcp_sock *tp); -+void mptcp_send_fin(struct sock *meta_sk); -+void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority); -+bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, -+ int push_one, gfp_t gfp); -+void tcp_parse_mptcp_options(const struct sk_buff *skb, -+ struct mptcp_options_received *mopt); -+void mptcp_parse_options(const uint8_t *ptr, int opsize, -+ struct mptcp_options_received *mopt, -+ const struct sk_buff *skb); -+void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts, -+ unsigned *remaining); -+void mptcp_synack_options(struct request_sock *req, -+ struct tcp_out_options *opts, -+ unsigned *remaining); -+void mptcp_established_options(struct sock *sk, struct sk_buff *skb, -+ struct tcp_out_options *opts, unsigned *size); -+void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, -+ const struct tcp_out_options *opts, -+ struct sk_buff *skb); -+void mptcp_close(struct sock *meta_sk, long timeout); -+int mptcp_doit(struct sock *sk); -+int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window); -+int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req); -+int mptcp_check_req_master(struct sock *sk, struct sock *child, -+ struct request_sock *req, -+ struct request_sock **prev); -+struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child, -+ struct request_sock *req, -+ struct request_sock **prev, -+ const struct mptcp_options_received *mopt); -+u32 __mptcp_select_window(struct sock *sk); -+void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, -+ __u32 *window_clamp, int wscale_ok, -+ __u8 *rcv_wscale, __u32 init_rcv_wnd, -+ const struct sock *sk); -+unsigned int mptcp_current_mss(struct sock *meta_sk); -+int mptcp_select_size(const struct sock *meta_sk, bool sg); -+void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn); -+void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2, -+ u32 *hash_out); -+void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk); -+void mptcp_fin(struct sock *meta_sk); -+void mptcp_retransmit_timer(struct sock *meta_sk); -+int mptcp_write_wakeup(struct sock *meta_sk); -+void mptcp_sub_close_wq(struct work_struct *work); -+void mptcp_sub_close(struct sock *sk, unsigned long delay); -+struct sock *mptcp_select_ack_sock(const struct sock *meta_sk); -+void mptcp_fallback_meta_sk(struct sock *meta_sk); -+int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb); -+void mptcp_ack_handler(unsigned long); -+int mptcp_check_rtt(const struct tcp_sock *tp, int time); -+int mptcp_check_snd_buf(const struct tcp_sock *tp); -+int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, -+ const struct sk_buff *skb); -+void __init mptcp_init(void); -+int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len); -+void mptcp_destroy_sock(struct sock *sk); -+int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, -+ const struct sk_buff *skb, -+ const struct mptcp_options_received *mopt); -+unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now, -+ int large_allowed); -+int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw); -+void mptcp_twsk_destructor(struct tcp_timewait_sock *tw); -+void mptcp_time_wait(struct sock *sk, int state, int timeo); -+void mptcp_disconnect(struct sock *sk); -+bool mptcp_should_expand_sndbuf(const struct sock *sk); -+int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb); -+void mptcp_tsq_flags(struct sock *sk); -+void mptcp_tsq_sub_deferred(struct sock *meta_sk); -+struct mp_join *mptcp_find_join(const struct sk_buff *skb); -+void mptcp_hash_remove_bh(struct tcp_sock *meta_tp); -+void mptcp_hash_remove(struct tcp_sock *meta_tp); -+struct sock *mptcp_hash_find(const struct net *net, const u32 token); -+int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw); -+int mptcp_do_join_short(struct sk_buff *skb, -+ const struct mptcp_options_received *mopt, -+ struct net *net); -+void mptcp_reqsk_destructor(struct request_sock *req); -+void mptcp_reqsk_new_mptcp(struct request_sock *req, -+ const struct mptcp_options_received *mopt, -+ const struct sk_buff *skb); -+int mptcp_check_req(struct sk_buff *skb, struct net *net); -+void mptcp_connect_init(struct sock *sk); -+void mptcp_sub_force_close(struct sock *sk); -+int mptcp_sub_len_remove_addr_align(u16 bitfield); -+void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, -+ const struct sk_buff *skb); -+void mptcp_init_buffer_space(struct sock *sk); -+void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req, -+ struct sk_buff *skb); -+void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb); -+int mptcp_conn_request(struct sock *sk, struct sk_buff *skb); -+void mptcp_init_congestion_control(struct sock *sk); -+ -+/* MPTCP-path-manager registration/initialization functions */ -+int mptcp_register_path_manager(struct mptcp_pm_ops *pm); -+void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm); -+void mptcp_init_path_manager(struct mptcp_cb *mpcb); -+void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb); -+void mptcp_fallback_default(struct mptcp_cb *mpcb); -+void mptcp_get_default_path_manager(char *name); -+int mptcp_set_default_path_manager(const char *name); -+extern struct mptcp_pm_ops mptcp_pm_default; -+ -+/* MPTCP-scheduler registration/initialization functions */ -+int mptcp_register_scheduler(struct mptcp_sched_ops *sched); -+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); -+void mptcp_init_scheduler(struct mptcp_cb *mpcb); -+void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb); -+void mptcp_get_default_scheduler(char *name); -+int mptcp_set_default_scheduler(const char *name); -+extern struct mptcp_sched_ops mptcp_sched_default; -+ -+static inline void mptcp_reset_synack_timer(struct sock *meta_sk, -+ unsigned long len) -+{ -+ sk_reset_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer, -+ jiffies + len); -+} -+ -+static inline void mptcp_delete_synack_timer(struct sock *meta_sk) -+{ -+ sk_stop_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer); -+} -+ -+static inline bool is_mptcp_enabled(const struct sock *sk) -+{ -+ if (!sysctl_mptcp_enabled || mptcp_init_failed) -+ return false; -+ -+ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled) -+ return false; -+ -+ return true; -+} -+ -+static inline int mptcp_pi_to_flag(int pi) -+{ -+ return 1 << (pi - 1); -+} -+ -+static inline -+struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req) -+{ -+ return (struct mptcp_request_sock *)req; -+} -+ -+static inline -+struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req) -+{ -+ return (struct request_sock *)req; -+} -+ -+static inline bool mptcp_can_sendpage(struct sock *sk) -+{ -+ struct sock *sk_it; -+ -+ if (tcp_sk(sk)->mpcb->dss_csum) -+ return false; -+ -+ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) { -+ if (!(sk_it->sk_route_caps & NETIF_F_SG) || -+ !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM)) -+ return false; -+ } -+ -+ return true; -+} -+ -+static inline void mptcp_push_pending_frames(struct sock *meta_sk) -+{ -+ /* We check packets out and send-head here. TCP only checks the -+ * send-head. But, MPTCP also checks packets_out, as this is an -+ * indication that we might want to do opportunistic reinjection. -+ */ -+ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) { -+ struct tcp_sock *tp = tcp_sk(meta_sk); -+ -+ /* We don't care about the MSS, because it will be set in -+ * mptcp_write_xmit. -+ */ -+ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle); -+ } -+} -+ -+static inline void mptcp_send_reset(struct sock *sk) -+{ -+ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); -+ mptcp_sub_force_close(sk); -+} -+ -+static inline bool mptcp_is_data_seq(const struct sk_buff *skb) -+{ -+ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ; -+} -+ -+static inline bool mptcp_is_data_fin(const struct sk_buff *skb) -+{ -+ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN; -+} -+ -+/* Is it a data-fin while in infinite mapping mode? -+ * In infinite mode, a subflow-fin is in fact a data-fin. -+ */ -+static inline bool mptcp_is_data_fin2(const struct sk_buff *skb, -+ const struct tcp_sock *tp) -+{ -+ return mptcp_is_data_fin(skb) || -+ (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin); -+} -+ -+static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb) -+{ -+ u64 data_seq_high = (u32)(data_seq >> 32); -+ -+ if (mpcb->rcv_high_order[0] == data_seq_high) -+ return 0; -+ else if (mpcb->rcv_high_order[1] == data_seq_high) -+ return MPTCPHDR_SEQ64_INDEX; -+ else -+ return MPTCPHDR_SEQ64_OFO; -+} -+ -+/* Sets the data_seq and returns pointer to the in-skb field of the data_seq. -+ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits. -+ */ -+static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, -+ u32 *data_seq, -+ struct mptcp_cb *mpcb) -+{ -+ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off); -+ -+ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) { -+ u64 data_seq64 = get_unaligned_be64(ptr); -+ -+ if (mpcb) -+ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb); -+ -+ *data_seq = (u32)data_seq64; -+ ptr++; -+ } else { -+ *data_seq = get_unaligned_be32(ptr); -+ } -+ -+ return ptr; -+} -+ -+static inline struct sock *mptcp_meta_sk(const struct sock *sk) -+{ -+ return tcp_sk(sk)->meta_sk; -+} -+ -+static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) -+{ -+ return tcp_sk(tp->meta_sk); -+} -+ -+static inline int is_meta_tp(const struct tcp_sock *tp) -+{ -+ return tp->mpcb && mptcp_meta_tp(tp) == tp; -+} -+ -+static inline int is_meta_sk(const struct sock *sk) -+{ -+ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP && -+ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk; -+} -+ -+static inline int is_master_tp(const struct tcp_sock *tp) -+{ -+ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp)); -+} -+ -+static inline void mptcp_hash_request_remove(struct request_sock *req) -+{ -+ int in_softirq = 0; -+ -+ if (hlist_nulls_unhashed(&mptcp_rsk(req)->hash_entry)) -+ return; -+ -+ if (in_softirq()) { -+ spin_lock(&mptcp_reqsk_hlock); -+ in_softirq = 1; -+ } else { -+ spin_lock_bh(&mptcp_reqsk_hlock); -+ } -+ -+ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry); -+ -+ if (in_softirq) -+ spin_unlock(&mptcp_reqsk_hlock); -+ else -+ spin_unlock_bh(&mptcp_reqsk_hlock); -+} -+ -+static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt) -+{ -+ mopt->saw_mpc = 0; -+ mopt->dss_csum = 0; -+ mopt->drop_me = 0; -+ -+ mopt->is_mp_join = 0; -+ mopt->join_ack = 0; -+ -+ mopt->saw_low_prio = 0; -+ mopt->low_prio = 0; -+ -+ mopt->saw_add_addr = 0; -+ mopt->more_add_addr = 0; -+ -+ mopt->saw_rem_addr = 0; -+ mopt->more_rem_addr = 0; -+ -+ mopt->mp_fail = 0; -+ mopt->mp_fclose = 0; -+} -+ -+static inline void mptcp_reset_mopt(struct tcp_sock *tp) -+{ -+ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; -+ -+ mopt->saw_low_prio = 0; -+ mopt->saw_add_addr = 0; -+ mopt->more_add_addr = 0; -+ mopt->saw_rem_addr = 0; -+ mopt->more_rem_addr = 0; -+ mopt->join_ack = 0; -+ mopt->mp_fail = 0; -+ mopt->mp_fclose = 0; -+} -+ -+static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, -+ const struct mptcp_cb *mpcb) -+{ -+ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags & -+ MPTCPHDR_SEQ64_INDEX) ? 1 : 0]); -+} -+ -+static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, -+ u32 data_seq_32) -+{ -+ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32; -+} -+ -+static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp) -+{ -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, -+ meta_tp->rcv_nxt); -+} -+ -+static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc) -+{ -+ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) { -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; -+ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2; -+ } -+} -+ -+static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, -+ u32 old_rcv_nxt) -+{ -+ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) { -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2; -+ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1; -+ } -+} -+ -+static inline int mptcp_sk_can_send(const struct sock *sk) -+{ -+ return tcp_passive_fastopen(sk) || -+ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && -+ !tcp_sk(sk)->mptcp->pre_established); -+} -+ -+static inline int mptcp_sk_can_recv(const struct sock *sk) -+{ -+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2); -+} -+ -+static inline int mptcp_sk_can_send_ack(const struct sock *sk) -+{ -+ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | -+ TCPF_CLOSE | TCPF_LISTEN)) && -+ !tcp_sk(sk)->mptcp->pre_established; -+} -+ -+/* Only support GSO if all subflows supports it */ -+static inline bool mptcp_sk_can_gso(const struct sock *meta_sk) -+{ -+ struct sock *sk; -+ -+ if (tcp_sk(meta_sk)->mpcb->dss_csum) -+ return false; -+ -+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { -+ if (!mptcp_sk_can_send(sk)) -+ continue; -+ if (!sk_can_gso(sk)) -+ return false; -+ } -+ return true; -+} -+ -+static inline bool mptcp_can_sg(const struct sock *meta_sk) -+{ -+ struct sock *sk; -+ -+ if (tcp_sk(meta_sk)->mpcb->dss_csum) -+ return false; -+ -+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { -+ if (!mptcp_sk_can_send(sk)) -+ continue; -+ if (!(sk->sk_route_caps & NETIF_F_SG)) -+ return false; -+ } -+ return true; -+} -+ -+static inline void mptcp_set_rto(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sock *sk_it; -+ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk)); -+ __u32 max_rto = 0; -+ -+ /* We are in recovery-phase on the MPTCP-level. Do not update the -+ * RTO, because this would kill exponential backoff. -+ */ -+ if (micsk->icsk_retransmits) -+ return; -+ -+ mptcp_for_each_sk(tp->mpcb, sk_it) { -+ if (mptcp_sk_can_send(sk_it) && -+ inet_csk(sk_it)->icsk_rto > max_rto) -+ max_rto = inet_csk(sk_it)->icsk_rto; -+ } -+ if (max_rto) { -+ micsk->icsk_rto = max_rto << 1; -+ -+ /* A successfull rto-measurement - reset backoff counter */ -+ micsk->icsk_backoff = 0; -+ } -+} -+ -+static inline int mptcp_sysctl_syn_retries(void) -+{ -+ return sysctl_mptcp_syn_retries; -+} -+ -+static inline void mptcp_sub_close_passive(struct sock *sk) -+{ -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk); -+ -+ /* Only close, if the app did a send-shutdown (passive close), and we -+ * received the data-ack of the data-fin. -+ */ -+ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq) -+ mptcp_sub_close(sk, 0); -+} -+ -+static inline bool mptcp_fallback_infinite(struct sock *sk, int flag) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ /* If data has been acknowleged on the meta-level, fully_established -+ * will have been set before and thus we will not fall back to infinite -+ * mapping. -+ */ -+ if (likely(tp->mptcp->fully_established)) -+ return false; -+ -+ if (!(flag & MPTCP_FLAG_DATA_ACKED)) -+ return false; -+ -+ /* Don't fallback twice ;) */ -+ if (tp->mpcb->infinite_mapping_snd) -+ return false; -+ -+ pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n", -+ __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index, -+ &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr, -+ __builtin_return_address(0)); -+ if (!is_master_tp(tp)) -+ return true; -+ -+ tp->mpcb->infinite_mapping_snd = 1; -+ tp->mpcb->infinite_mapping_rcv = 1; -+ tp->mptcp->fully_established = 1; -+ -+ return false; -+} -+ -+/* Find the first index whose bit in the bit-field == 0 */ -+static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb) -+{ -+ u8 base = mpcb->next_path_index; -+ int i; -+ -+ /* Start at 1, because 0 is reserved for the meta-sk */ -+ mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) { -+ if (i + base < 1) -+ continue; -+ if (i + base >= sizeof(mpcb->path_index_bits) * 8) -+ break; -+ i += base; -+ mpcb->path_index_bits |= (1 << i); -+ mpcb->next_path_index = i + 1; -+ return i; -+ } -+ mptcp_for_each_bit_unset(mpcb->path_index_bits, i) { -+ if (i >= sizeof(mpcb->path_index_bits) * 8) -+ break; -+ if (i < 1) -+ continue; -+ mpcb->path_index_bits |= (1 << i); -+ mpcb->next_path_index = i + 1; -+ return i; -+ } -+ -+ return 0; -+} -+ -+static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk) -+{ -+ return sk->sk_family == AF_INET6 && -+ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED; -+} -+ -+/* TCP and MPTCP mpc flag-depending functions */ -+u16 mptcp_select_window(struct sock *sk); -+void mptcp_init_buffer_space(struct sock *sk); -+void mptcp_tcp_set_rto(struct sock *sk); -+ -+/* TCP and MPTCP flag-depending functions */ -+bool mptcp_prune_ofo_queue(struct sock *sk); -+ -+#else /* CONFIG_MPTCP */ -+#define mptcp_debug(fmt, args...) \ -+ do { \ -+ } while (0) -+ -+/* Without MPTCP, we just do one iteration -+ * over the only socket available. This assumes that -+ * the sk/tp arg is the socket in that case. -+ */ -+#define mptcp_for_each_sk(mpcb, sk) -+#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) -+ -+static inline bool mptcp_is_data_fin(const struct sk_buff *skb) -+{ -+ return false; -+} -+static inline bool mptcp_is_data_seq(const struct sk_buff *skb) -+{ -+ return false; -+} -+static inline struct sock *mptcp_meta_sk(const struct sock *sk) -+{ -+ return NULL; -+} -+static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) -+{ -+ return NULL; -+} -+static inline int is_meta_sk(const struct sock *sk) -+{ -+ return 0; -+} -+static inline int is_master_tp(const struct tcp_sock *tp) -+{ -+ return 0; -+} -+static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {} -+static inline void mptcp_del_sock(const struct sock *sk) {} -+static inline void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk) {} -+static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {} -+static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {} -+static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb, -+ const struct sock *sk) {} -+static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {} -+static inline void mptcp_set_rto(const struct sock *sk) {} -+static inline void mptcp_send_fin(const struct sock *meta_sk) {} -+static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize, -+ const struct mptcp_options_received *mopt, -+ const struct sk_buff *skb) {} -+static inline void mptcp_syn_options(const struct sock *sk, -+ struct tcp_out_options *opts, -+ unsigned *remaining) {} -+static inline void mptcp_synack_options(struct request_sock *req, -+ struct tcp_out_options *opts, -+ unsigned *remaining) {} -+ -+static inline void mptcp_established_options(struct sock *sk, -+ struct sk_buff *skb, -+ struct tcp_out_options *opts, -+ unsigned *size) {} -+static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, -+ const struct tcp_out_options *opts, -+ struct sk_buff *skb) {} -+static inline void mptcp_close(struct sock *meta_sk, long timeout) {} -+static inline int mptcp_doit(struct sock *sk) -+{ -+ return 0; -+} -+static inline int mptcp_check_req_fastopen(struct sock *child, -+ struct request_sock *req) -+{ -+ return 1; -+} -+static inline int mptcp_check_req_master(const struct sock *sk, -+ const struct sock *child, -+ struct request_sock *req, -+ struct request_sock **prev) -+{ -+ return 1; -+} -+static inline struct sock *mptcp_check_req_child(struct sock *sk, -+ struct sock *child, -+ struct request_sock *req, -+ struct request_sock **prev, -+ const struct mptcp_options_received *mopt) -+{ -+ return NULL; -+} -+static inline unsigned int mptcp_current_mss(struct sock *meta_sk) -+{ -+ return 0; -+} -+static inline int mptcp_select_size(const struct sock *meta_sk, bool sg) -+{ -+ return 0; -+} -+static inline void mptcp_sub_close_passive(struct sock *sk) {} -+static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag) -+{ -+ return false; -+} -+static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {} -+static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time) -+{ -+ return 0; -+} -+static inline int mptcp_check_snd_buf(const struct tcp_sock *tp) -+{ -+ return 0; -+} -+static inline int mptcp_sysctl_syn_retries(void) -+{ -+ return 0; -+} -+static inline void mptcp_send_reset(const struct sock *sk) {} -+static inline int mptcp_handle_options(struct sock *sk, -+ const struct tcphdr *th, -+ struct sk_buff *skb) -+{ -+ return 0; -+} -+static inline void mptcp_reset_mopt(struct tcp_sock *tp) {} -+static inline void __init mptcp_init(void) {} -+static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) -+{ -+ return 0; -+} -+static inline bool mptcp_sk_can_gso(const struct sock *sk) -+{ -+ return false; -+} -+static inline bool mptcp_can_sg(const struct sock *meta_sk) -+{ -+ return false; -+} -+static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, -+ u32 mss_now, int large_allowed) -+{ -+ return 0; -+} -+static inline void mptcp_destroy_sock(struct sock *sk) {} -+static inline int mptcp_rcv_synsent_state_process(struct sock *sk, -+ struct sock **skptr, -+ struct sk_buff *skb, -+ const struct mptcp_options_received *mopt) -+{ -+ return 0; -+} -+static inline bool mptcp_can_sendpage(struct sock *sk) -+{ -+ return false; -+} -+static inline int mptcp_init_tw_sock(struct sock *sk, -+ struct tcp_timewait_sock *tw) -+{ -+ return 0; -+} -+static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {} -+static inline void mptcp_disconnect(struct sock *sk) {} -+static inline void mptcp_tsq_flags(struct sock *sk) {} -+static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {} -+static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {} -+static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {} -+static inline void mptcp_reqsk_new_mptcp(struct request_sock *req, -+ const struct tcp_options_received *rx_opt, -+ const struct mptcp_options_received *mopt, -+ const struct sk_buff *skb) {} -+static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, -+ const struct sk_buff *skb) {} -+static inline void mptcp_delete_synack_timer(struct sock *meta_sk) {} -+#endif /* CONFIG_MPTCP */ -+ -+#endif /* _MPTCP_H */ -diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h -new file mode 100644 -index 000000000000..93ad97c77c5a ---- /dev/null -+++ b/include/net/mptcp_v4.h -@@ -0,0 +1,67 @@ -+/* -+ * MPTCP implementation -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#ifndef MPTCP_V4_H_ -+#define MPTCP_V4_H_ -+ -+ -+#include -+#include -+#include -+#include -+#include -+ -+extern struct request_sock_ops mptcp_request_sock_ops; -+extern const struct inet_connection_sock_af_ops mptcp_v4_specific; -+extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops; -+extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops; -+ -+#ifdef CONFIG_MPTCP -+ -+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb); -+struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr, -+ const __be32 laddr, const struct net *net); -+int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, -+ struct mptcp_rem4 *rem); -+int mptcp_pm_v4_init(void); -+void mptcp_pm_v4_undo(void); -+u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); -+u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); -+ -+#else -+ -+static inline int mptcp_v4_do_rcv(const struct sock *meta_sk, -+ const struct sk_buff *skb) -+{ -+ return 0; -+} -+ -+#endif /* CONFIG_MPTCP */ -+ -+#endif /* MPTCP_V4_H_ */ -diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h -new file mode 100644 -index 000000000000..49a4f30ccd4d ---- /dev/null -+++ b/include/net/mptcp_v6.h -@@ -0,0 +1,69 @@ -+/* -+ * MPTCP implementation -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Jaakko Korkeaniemi -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#ifndef _MPTCP_V6_H -+#define _MPTCP_V6_H -+ -+#include -+#include -+ -+#include -+ -+ -+#ifdef CONFIG_MPTCP -+extern const struct inet_connection_sock_af_ops mptcp_v6_mapped; -+extern const struct inet_connection_sock_af_ops mptcp_v6_specific; -+extern struct request_sock_ops mptcp6_request_sock_ops; -+extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops; -+extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops; -+ -+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb); -+struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr, -+ const struct in6_addr *laddr, const struct net *net); -+int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, -+ struct mptcp_rem6 *rem); -+int mptcp_pm_v6_init(void); -+void mptcp_pm_v6_undo(void); -+__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, -+ __be16 sport, __be16 dport); -+u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, -+ __be16 sport, __be16 dport); -+ -+#else /* CONFIG_MPTCP */ -+ -+#define mptcp_v6_mapped ipv6_mapped -+ -+static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) -+{ -+ return 0; -+} -+ -+#endif /* CONFIG_MPTCP */ -+ -+#endif /* _MPTCP_V6_H */ -diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h -index 361d26077196..bae95a11c531 100644 ---- a/include/net/net_namespace.h -+++ b/include/net/net_namespace.h -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -92,6 +93,9 @@ struct net { - #if IS_ENABLED(CONFIG_IPV6) - struct netns_ipv6 ipv6; - #endif -+#if IS_ENABLED(CONFIG_MPTCP) -+ struct netns_mptcp mptcp; -+#endif - #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) - struct netns_ieee802154_lowpan ieee802154_lowpan; - #endif -diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h -new file mode 100644 -index 000000000000..bad418b04cc8 ---- /dev/null -+++ b/include/net/netns/mptcp.h -@@ -0,0 +1,44 @@ -+/* -+ * MPTCP implementation - MPTCP namespace -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#ifndef __NETNS_MPTCP_H__ -+#define __NETNS_MPTCP_H__ -+ -+#include -+ -+enum { -+ MPTCP_PM_FULLMESH = 0, -+ MPTCP_PM_MAX -+}; -+ -+struct netns_mptcp { -+ void *path_managers[MPTCP_PM_MAX]; -+}; -+ -+#endif /* __NETNS_MPTCP_H__ */ -diff --git a/include/net/request_sock.h b/include/net/request_sock.h -index 7f830ff67f08..e79e87a8e1a6 100644 ---- a/include/net/request_sock.h -+++ b/include/net/request_sock.h -@@ -164,7 +164,7 @@ struct request_sock_queue { - }; - - int reqsk_queue_alloc(struct request_sock_queue *queue, -- unsigned int nr_table_entries); -+ unsigned int nr_table_entries, gfp_t flags); - - void __reqsk_queue_destroy(struct request_sock_queue *queue); - void reqsk_queue_destroy(struct request_sock_queue *queue); -diff --git a/include/net/sock.h b/include/net/sock.h -index 156350745700..0e23cae8861f 100644 ---- a/include/net/sock.h -+++ b/include/net/sock.h -@@ -901,6 +901,16 @@ void sk_clear_memalloc(struct sock *sk); - - int sk_wait_data(struct sock *sk, long *timeo); - -+/* START - needed for MPTCP */ -+struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family); -+void sock_lock_init(struct sock *sk); -+ -+extern struct lock_class_key af_callback_keys[AF_MAX]; -+extern char *const af_family_clock_key_strings[AF_MAX+1]; -+ -+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) -+/* END - needed for MPTCP */ -+ - struct request_sock_ops; - struct timewait_sock_ops; - struct inet_hashinfo; -diff --git a/include/net/tcp.h b/include/net/tcp.h -index 7286db80e8b8..ff92e74cd684 100644 ---- a/include/net/tcp.h -+++ b/include/net/tcp.h -@@ -177,6 +177,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); - #define TCPOPT_SACK 5 /* SACK Block */ - #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ - #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ -+#define TCPOPT_MPTCP 30 - #define TCPOPT_EXP 254 /* Experimental */ - /* Magic number to be after the option value for sharing TCP - * experimental options. See draft-ietf-tcpm-experimental-options-00.txt -@@ -229,6 +230,27 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); - #define TFO_SERVER_WO_SOCKOPT1 0x400 - #define TFO_SERVER_WO_SOCKOPT2 0x800 - -+/* Flags from tcp_input.c for tcp_ack */ -+#define FLAG_DATA 0x01 /* Incoming frame contained data. */ -+#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ -+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ -+#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ -+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ -+#define FLAG_DATA_SACKED 0x20 /* New SACK. */ -+#define FLAG_ECE 0x40 /* ECE in this ACK */ -+#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ -+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ -+#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ -+#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ -+#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ -+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ -+#define MPTCP_FLAG_DATA_ACKED 0x8000 -+ -+#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) -+#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) -+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) -+#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) -+ - extern struct inet_timewait_death_row tcp_death_row; - - /* sysctl variables for tcp */ -@@ -344,6 +366,107 @@ extern struct proto tcp_prot; - #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val) - #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) - -+/**** START - Exports needed for MPTCP ****/ -+extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; -+extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; -+ -+struct mptcp_options_received; -+ -+void tcp_enter_quickack_mode(struct sock *sk); -+int tcp_close_state(struct sock *sk); -+void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, -+ const struct sk_buff *skb); -+int tcp_xmit_probe_skb(struct sock *sk, int urgent); -+void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb); -+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, -+ gfp_t gfp_mask); -+unsigned int tcp_mss_split_point(const struct sock *sk, -+ const struct sk_buff *skb, -+ unsigned int mss_now, -+ unsigned int max_segs, -+ int nonagle); -+bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, -+ unsigned int cur_mss, int nonagle); -+bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, -+ unsigned int cur_mss); -+unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb); -+int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now); -+void __pskb_trim_head(struct sk_buff *skb, int len); -+void tcp_queue_skb(struct sock *sk, struct sk_buff *skb); -+void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags); -+void tcp_reset(struct sock *sk); -+bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, -+ const u32 ack_seq, const u32 nwin); -+bool tcp_urg_mode(const struct tcp_sock *tp); -+void tcp_ack_probe(struct sock *sk); -+void tcp_rearm_rto(struct sock *sk); -+int tcp_write_timeout(struct sock *sk); -+bool retransmits_timed_out(struct sock *sk, unsigned int boundary, -+ unsigned int timeout, bool syn_set); -+void tcp_write_err(struct sock *sk); -+void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr); -+void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now); -+ -+int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req); -+void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, -+ struct request_sock *req); -+__u32 tcp_v4_init_sequence(const struct sk_buff *skb); -+int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, -+ struct flowi *fl, -+ struct request_sock *req, -+ u16 queue_mapping, -+ struct tcp_fastopen_cookie *foc); -+void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb); -+struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb); -+struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb); -+void tcp_v4_reqsk_destructor(struct request_sock *req); -+ -+int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req); -+void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, -+ struct request_sock *req); -+__u32 tcp_v6_init_sequence(const struct sk_buff *skb); -+int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, -+ struct flowi *fl, struct request_sock *req, -+ u16 queue_mapping, struct tcp_fastopen_cookie *foc); -+void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); -+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -+int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -+void tcp_v6_destroy_sock(struct sock *sk); -+void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); -+void tcp_v6_hash(struct sock *sk); -+struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb); -+struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, -+ struct request_sock *req, -+ struct dst_entry *dst); -+void tcp_v6_reqsk_destructor(struct request_sock *req); -+ -+unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, -+ int large_allowed); -+u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb); -+ -+void skb_clone_fraglist(struct sk_buff *skb); -+void copy_skb_header(struct sk_buff *new, const struct sk_buff *old); -+ -+void inet_twsk_free(struct inet_timewait_sock *tw); -+int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb); -+/* These states need RST on ABORT according to RFC793 */ -+static inline bool tcp_need_reset(int state) -+{ -+ return (1 << state) & -+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | -+ TCPF_FIN_WAIT2 | TCPF_SYN_RECV); -+} -+ -+bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, -+ int hlen); -+int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, -+ bool *fragstolen); -+bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, -+ struct sk_buff *from, bool *fragstolen); -+/**** END - Exports needed for MPTCP ****/ -+ - void tcp_tasklet_init(void); - - void tcp_v4_err(struct sk_buff *skb, u32); -@@ -440,6 +563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, int *addr_len); - void tcp_parse_options(const struct sk_buff *skb, - struct tcp_options_received *opt_rx, -+ struct mptcp_options_received *mopt_rx, - int estab, struct tcp_fastopen_cookie *foc); - const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); - -@@ -493,14 +617,8 @@ static inline u32 tcp_cookie_time(void) - - u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, - u16 *mssp); --__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss); --#else --static inline __u32 cookie_v4_init_sequence(struct sock *sk, -- struct sk_buff *skb, -- __u16 *mss) --{ -- return 0; --} -+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, -+ __u16 *mss); - #endif - - __u32 cookie_init_timestamp(struct request_sock *req); -@@ -516,13 +634,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, - const struct tcphdr *th, u16 *mssp); - __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, - __u16 *mss); --#else --static inline __u32 cookie_v6_init_sequence(struct sock *sk, -- struct sk_buff *skb, -- __u16 *mss) --{ -- return 0; --} - #endif - /* tcp_output.c */ - -@@ -551,10 +662,17 @@ void tcp_send_delayed_ack(struct sock *sk); - void tcp_send_loss_probe(struct sock *sk); - bool tcp_schedule_loss_probe(struct sock *sk); - -+u16 tcp_select_window(struct sock *sk); -+bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, -+ int push_one, gfp_t gfp); -+ - /* tcp_input.c */ - void tcp_resume_early_retransmit(struct sock *sk); - void tcp_rearm_rto(struct sock *sk); - void tcp_reset(struct sock *sk); -+void tcp_set_rto(struct sock *sk); -+bool tcp_should_expand_sndbuf(const struct sock *sk); -+bool tcp_prune_ofo_queue(struct sock *sk); - - /* tcp_timer.c */ - void tcp_init_xmit_timers(struct sock *); -@@ -703,14 +821,27 @@ void tcp_send_window_probe(struct sock *sk); - */ - struct tcp_skb_cb { - union { -- struct inet_skb_parm h4; -+ union { -+ struct inet_skb_parm h4; - #if IS_ENABLED(CONFIG_IPV6) -- struct inet6_skb_parm h6; -+ struct inet6_skb_parm h6; - #endif -- } header; /* For incoming frames */ -+ } header; /* For incoming frames */ -+#ifdef CONFIG_MPTCP -+ union { /* For MPTCP outgoing frames */ -+ __u32 path_mask; /* paths that tried to send this skb */ -+ __u32 dss[6]; /* DSS options */ -+ }; -+#endif -+ }; - __u32 seq; /* Starting sequence number */ - __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u32 when; /* used to compute rtt's */ -+#ifdef CONFIG_MPTCP -+ __u8 mptcp_flags; /* flags for the MPTCP layer */ -+ __u8 dss_off; /* Number of 4-byte words until -+ * seq-number */ -+#endif - __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ - - __u8 sacked; /* State flags for SACK/FACK. */ -@@ -1075,7 +1206,8 @@ u32 tcp_default_init_rwnd(u32 mss); - /* Determine a window scaling and initial window to offer. */ - void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, - __u32 *window_clamp, int wscale_ok, -- __u8 *rcv_wscale, __u32 init_rcv_wnd); -+ __u8 *rcv_wscale, __u32 init_rcv_wnd, -+ const struct sock *sk); - - static inline int tcp_win_from_space(int space) - { -@@ -1084,15 +1216,34 @@ static inline int tcp_win_from_space(int space) - space - (space>>sysctl_tcp_adv_win_scale); - } - -+#ifdef CONFIG_MPTCP -+extern struct static_key mptcp_static_key; -+static inline bool mptcp(const struct tcp_sock *tp) -+{ -+ return static_key_false(&mptcp_static_key) && tp->mpc; -+} -+#else -+static inline bool mptcp(const struct tcp_sock *tp) -+{ -+ return 0; -+} -+#endif -+ - /* Note: caller must be prepared to deal with negative returns */ - static inline int tcp_space(const struct sock *sk) - { -+ if (mptcp(tcp_sk(sk))) -+ sk = tcp_sk(sk)->meta_sk; -+ - return tcp_win_from_space(sk->sk_rcvbuf - - atomic_read(&sk->sk_rmem_alloc)); - } - - static inline int tcp_full_space(const struct sock *sk) - { -+ if (mptcp(tcp_sk(sk))) -+ sk = tcp_sk(sk)->meta_sk; -+ - return tcp_win_from_space(sk->sk_rcvbuf); - } - -@@ -1115,6 +1266,8 @@ static inline void tcp_openreq_init(struct request_sock *req, - ireq->wscale_ok = rx_opt->wscale_ok; - ireq->acked = 0; - ireq->ecn_ok = 0; -+ ireq->mptcp_rqsk = 0; -+ ireq->saw_mpc = 0; - ireq->ir_rmt_port = tcp_hdr(skb)->source; - ireq->ir_num = ntohs(tcp_hdr(skb)->dest); - } -@@ -1585,6 +1738,11 @@ int tcp4_proc_init(void); - void tcp4_proc_exit(void); - #endif - -+int tcp_rtx_synack(struct sock *sk, struct request_sock *req); -+int tcp_conn_request(struct request_sock_ops *rsk_ops, -+ const struct tcp_request_sock_ops *af_ops, -+ struct sock *sk, struct sk_buff *skb); -+ - /* TCP af-specific functions */ - struct tcp_sock_af_ops { - #ifdef CONFIG_TCP_MD5SIG -@@ -1601,7 +1759,32 @@ struct tcp_sock_af_ops { - #endif - }; - -+/* TCP/MPTCP-specific functions */ -+struct tcp_sock_ops { -+ u32 (*__select_window)(struct sock *sk); -+ u16 (*select_window)(struct sock *sk); -+ void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd, -+ __u32 *window_clamp, int wscale_ok, -+ __u8 *rcv_wscale, __u32 init_rcv_wnd, -+ const struct sock *sk); -+ void (*init_buffer_space)(struct sock *sk); -+ void (*set_rto)(struct sock *sk); -+ bool (*should_expand_sndbuf)(const struct sock *sk); -+ void (*send_fin)(struct sock *sk); -+ bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle, -+ int push_one, gfp_t gfp); -+ void (*send_active_reset)(struct sock *sk, gfp_t priority); -+ int (*write_wakeup)(struct sock *sk); -+ bool (*prune_ofo_queue)(struct sock *sk); -+ void (*retransmit_timer)(struct sock *sk); -+ void (*time_wait)(struct sock *sk, int state, int timeo); -+ void (*cleanup_rbuf)(struct sock *sk, int copied); -+ void (*init_congestion_control)(struct sock *sk); -+}; -+extern const struct tcp_sock_ops tcp_specific; -+ - struct tcp_request_sock_ops { -+ u16 mss_clamp; - #ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk, - struct request_sock *req); -@@ -1611,8 +1794,39 @@ struct tcp_request_sock_ops { - const struct request_sock *req, - const struct sk_buff *skb); - #endif -+ int (*init_req)(struct request_sock *req, struct sock *sk, -+ struct sk_buff *skb); -+#ifdef CONFIG_SYN_COOKIES -+ __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb, -+ __u16 *mss); -+#endif -+ struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl, -+ const struct request_sock *req, -+ bool *strict); -+ __u32 (*init_seq)(const struct sk_buff *skb); -+ int (*send_synack)(struct sock *sk, struct dst_entry *dst, -+ struct flowi *fl, struct request_sock *req, -+ u16 queue_mapping, struct tcp_fastopen_cookie *foc); -+ void (*queue_hash_add)(struct sock *sk, struct request_sock *req, -+ const unsigned long timeout); - }; - -+#ifdef CONFIG_SYN_COOKIES -+static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, -+ struct sock *sk, struct sk_buff *skb, -+ __u16 *mss) -+{ -+ return ops->cookie_init_seq(sk, skb, mss); -+} -+#else -+static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, -+ struct sock *sk, struct sk_buff *skb, -+ __u16 *mss) -+{ -+ return 0; -+} -+#endif -+ - int tcpv4_offload_init(void); - - void tcp_v4_init(void); -diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h -index 9cf2394f0bcf..c2634b6ed854 100644 ---- a/include/uapi/linux/if.h -+++ b/include/uapi/linux/if.h -@@ -109,6 +109,9 @@ enum net_device_flags { - #define IFF_DORMANT IFF_DORMANT - #define IFF_ECHO IFF_ECHO - -+#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */ -+#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */ -+ - #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\ - IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT) - -diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h -index 3b9718328d8b..487475681d84 100644 ---- a/include/uapi/linux/tcp.h -+++ b/include/uapi/linux/tcp.h -@@ -112,6 +112,7 @@ enum { - #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ - #define TCP_TIMESTAMP 24 - #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ -+#define MPTCP_ENABLED 26 - - struct tcp_repair_opt { - __u32 opt_code; -diff --git a/net/Kconfig b/net/Kconfig -index d92afe4204d9..96b58593ad5e 100644 ---- a/net/Kconfig -+++ b/net/Kconfig -@@ -79,6 +79,7 @@ if INET - source "net/ipv4/Kconfig" - source "net/ipv6/Kconfig" - source "net/netlabel/Kconfig" -+source "net/mptcp/Kconfig" - - endif # if INET - -diff --git a/net/Makefile b/net/Makefile -index cbbbe6d657ca..244bac1435b1 100644 ---- a/net/Makefile -+++ b/net/Makefile -@@ -20,6 +20,7 @@ obj-$(CONFIG_INET) += ipv4/ - obj-$(CONFIG_XFRM) += xfrm/ - obj-$(CONFIG_UNIX) += unix/ - obj-$(CONFIG_NET) += ipv6/ -+obj-$(CONFIG_MPTCP) += mptcp/ - obj-$(CONFIG_PACKET) += packet/ - obj-$(CONFIG_NET_KEY) += key/ - obj-$(CONFIG_BRIDGE) += bridge/ -diff --git a/net/core/dev.c b/net/core/dev.c -index 367a586d0c8a..215d2757fbf6 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -5420,7 +5420,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) - - dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | - IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | -- IFF_AUTOMEDIA)) | -+ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) | - (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | - IFF_ALLMULTI)); - -diff --git a/net/core/request_sock.c b/net/core/request_sock.c -index 467f326126e0..909dfa13f499 100644 ---- a/net/core/request_sock.c -+++ b/net/core/request_sock.c -@@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256; - EXPORT_SYMBOL(sysctl_max_syn_backlog); - - int reqsk_queue_alloc(struct request_sock_queue *queue, -- unsigned int nr_table_entries) -+ unsigned int nr_table_entries, -+ gfp_t flags) - { - size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt; -@@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, - nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - lopt_size += nr_table_entries * sizeof(struct request_sock *); - if (lopt_size > PAGE_SIZE) -- lopt = vzalloc(lopt_size); -+ lopt = __vmalloc(lopt_size, -+ flags | __GFP_HIGHMEM | __GFP_ZERO, -+ PAGE_KERNEL); - else -- lopt = kzalloc(lopt_size, GFP_KERNEL); -+ lopt = kzalloc(lopt_size, flags); - if (lopt == NULL) - return -ENOMEM; - -diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index c1a33033cbe2..8abc5d60fbe3 100644 ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -472,7 +472,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb) - skb_drop_list(&skb_shinfo(skb)->frag_list); - } - --static void skb_clone_fraglist(struct sk_buff *skb) -+void skb_clone_fraglist(struct sk_buff *skb) - { - struct sk_buff *list; - -@@ -897,7 +897,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) - skb->inner_mac_header += off; - } - --static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) -+void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) - { - __copy_skb_header(new, old); - -diff --git a/net/core/sock.c b/net/core/sock.c -index 026e01f70274..359295523177 100644 ---- a/net/core/sock.c -+++ b/net/core/sock.c -@@ -136,6 +136,11 @@ - - #include - -+#ifdef CONFIG_MPTCP -+#include -+#include -+#endif -+ - #ifdef CONFIG_INET - #include - #endif -@@ -280,7 +285,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { - "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" - }; --static const char *const af_family_clock_key_strings[AF_MAX+1] = { -+char *const af_family_clock_key_strings[AF_MAX+1] = { - "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , - "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", - "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , -@@ -301,7 +306,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { - * sk_callback_lock locking rules are per-address-family, - * so split the lock classes by using a per-AF key: - */ --static struct lock_class_key af_callback_keys[AF_MAX]; -+struct lock_class_key af_callback_keys[AF_MAX]; - - /* Take into consideration the size of the struct sk_buff overhead in the - * determination of these values, since that is non-constant across -@@ -422,8 +427,6 @@ static void sock_warn_obsolete_bsdism(const char *name) - } - } - --#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) -- - static void sock_disable_timestamp(struct sock *sk, unsigned long flags) - { - if (sk->sk_flags & flags) { -@@ -1253,8 +1256,25 @@ lenout: - * - * (We also register the sk_lock with the lock validator.) - */ --static inline void sock_lock_init(struct sock *sk) --{ -+void sock_lock_init(struct sock *sk) -+{ -+#ifdef CONFIG_MPTCP -+ /* Reclassify the lock-class for subflows */ -+ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) -+ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) { -+ sock_lock_init_class_and_name(sk, "slock-AF_INET-MPTCP", -+ &meta_slock_key, -+ "sk_lock-AF_INET-MPTCP", -+ &meta_key); -+ -+ /* We don't yet have the mptcp-point. -+ * Thus we still need inet_sock_destruct -+ */ -+ sk->sk_destruct = inet_sock_destruct; -+ return; -+ } -+#endif -+ - sock_lock_init_class_and_name(sk, - af_family_slock_key_strings[sk->sk_family], - af_family_slock_keys + sk->sk_family, -@@ -1301,7 +1321,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) - } - EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); - --static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, -+struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, - int family) - { - struct sock *sk; -diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c -index 4db3c2a1679c..04cb17d4b0ce 100644 ---- a/net/dccp/ipv6.c -+++ b/net/dccp/ipv6.c -@@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) - goto drop; - -- req = inet6_reqsk_alloc(&dccp6_request_sock_ops); -+ req = inet_reqsk_alloc(&dccp6_request_sock_ops); - if (req == NULL) - goto drop; - -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 05c57f0fcabe..630434db0085 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -556,6 +556,30 @@ config TCP_CONG_ILLINOIS - For further details see: - http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html - -+config TCP_CONG_COUPLED -+ tristate "MPTCP COUPLED CONGESTION CONTROL" -+ depends on MPTCP -+ default n -+ ---help--- -+ MultiPath TCP Coupled Congestion Control -+ To enable it, just put 'coupled' in tcp_congestion_control -+ -+config TCP_CONG_OLIA -+ tristate "MPTCP Opportunistic Linked Increase" -+ depends on MPTCP -+ default n -+ ---help--- -+ MultiPath TCP Opportunistic Linked Increase Congestion Control -+ To enable it, just put 'olia' in tcp_congestion_control -+ -+config TCP_CONG_WVEGAS -+ tristate "MPTCP WVEGAS CONGESTION CONTROL" -+ depends on MPTCP -+ default n -+ ---help--- -+ wVegas congestion control for MPTCP -+ To enable it, just put 'wvegas' in tcp_congestion_control -+ - choice - prompt "Default TCP congestion control" - default DEFAULT_CUBIC -@@ -584,6 +608,15 @@ choice - config DEFAULT_WESTWOOD - bool "Westwood" if TCP_CONG_WESTWOOD=y - -+ config DEFAULT_COUPLED -+ bool "Coupled" if TCP_CONG_COUPLED=y -+ -+ config DEFAULT_OLIA -+ bool "Olia" if TCP_CONG_OLIA=y -+ -+ config DEFAULT_WVEGAS -+ bool "Wvegas" if TCP_CONG_WVEGAS=y -+ - config DEFAULT_RENO - bool "Reno" - -@@ -605,6 +638,8 @@ config DEFAULT_TCP_CONG - default "vegas" if DEFAULT_VEGAS - default "westwood" if DEFAULT_WESTWOOD - default "veno" if DEFAULT_VENO -+ default "coupled" if DEFAULT_COUPLED -+ default "wvegas" if DEFAULT_WVEGAS - default "reno" if DEFAULT_RENO - default "cubic" - -diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c -index d156b3c5f363..4afd6d8d9028 100644 ---- a/net/ipv4/af_inet.c -+++ b/net/ipv4/af_inet.c -@@ -104,6 +104,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -246,8 +247,7 @@ EXPORT_SYMBOL(inet_listen); - * Create an inet socket. - */ - --static int inet_create(struct net *net, struct socket *sock, int protocol, -- int kern) -+int inet_create(struct net *net, struct socket *sock, int protocol, int kern) - { - struct sock *sk; - struct inet_protosw *answer; -@@ -676,6 +676,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) - lock_sock(sk2); - - sock_rps_record_flow(sk2); -+ -+ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) { -+ struct sock *sk_it = sk2; -+ -+ mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it) -+ sock_rps_record_flow(sk_it); -+ -+ if (tcp_sk(sk2)->mpcb->master_sk) { -+ sk_it = tcp_sk(sk2)->mpcb->master_sk; -+ -+ write_lock_bh(&sk_it->sk_callback_lock); -+ sk_it->sk_wq = newsock->wq; -+ sk_it->sk_socket = newsock; -+ write_unlock_bh(&sk_it->sk_callback_lock); -+ } -+ } -+ - WARN_ON(!((1 << sk2->sk_state) & - (TCPF_ESTABLISHED | TCPF_SYN_RECV | - TCPF_CLOSE_WAIT | TCPF_CLOSE))); -@@ -1763,6 +1780,9 @@ static int __init inet_init(void) - - ip_init(); - -+ /* We must initialize MPTCP before TCP. */ -+ mptcp_init(); -+ - tcp_v4_init(); - - /* Setup TCP slab cache for open requests. */ -diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c -index 14d02ea905b6..7d734d8af19b 100644 ---- a/net/ipv4/inet_connection_sock.c -+++ b/net/ipv4/inet_connection_sock.c -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - - #ifdef INET_CSK_DEBUG - const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; -@@ -465,8 +466,8 @@ no_route: - } - EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); - --static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, -- const u32 rnd, const u32 synq_hsize) -+u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, -+ const u32 synq_hsize) - { - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); - } -@@ -647,7 +648,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, - - lopt->clock_hand = i; - -- if (lopt->qlen) -+ if (lopt->qlen && !is_meta_sk(parent)) - inet_csk_reset_keepalive_timer(parent, interval); - } - EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); -@@ -664,7 +665,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, - const struct request_sock *req, - const gfp_t priority) - { -- struct sock *newsk = sk_clone_lock(sk, priority); -+ struct sock *newsk; -+ -+ newsk = sk_clone_lock(sk, priority); - - if (newsk != NULL) { - struct inet_connection_sock *newicsk = inet_csk(newsk); -@@ -743,7 +746,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) - { - struct inet_sock *inet = inet_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); -- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); -+ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries, -+ GFP_KERNEL); - - if (rc != 0) - return rc; -@@ -801,9 +805,14 @@ void inet_csk_listen_stop(struct sock *sk) - - while ((req = acc_req) != NULL) { - struct sock *child = req->sk; -+ bool mutex_taken = false; - - acc_req = req->dl_next; - -+ if (is_meta_sk(child)) { -+ mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex); -+ mutex_taken = true; -+ } - local_bh_disable(); - bh_lock_sock(child); - WARN_ON(sock_owned_by_user(child)); -@@ -832,6 +841,8 @@ void inet_csk_listen_stop(struct sock *sk) - - bh_unlock_sock(child); - local_bh_enable(); -+ if (mutex_taken) -+ mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex); - sock_put(child); - - sk_acceptq_removed(sk); -diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c -index c86624b36a62..0ff3fe004d62 100644 ---- a/net/ipv4/syncookies.c -+++ b/net/ipv4/syncookies.c -@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, - } - EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); - --__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) -+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, -+ __u16 *mssp) - { - const struct iphdr *iph = ip_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); -@@ -284,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, - - /* check for timestamp cookie support */ - memset(&tcp_opt, 0, sizeof(tcp_opt)); -- tcp_parse_options(skb, &tcp_opt, 0, NULL); -+ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL); - - if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) - goto out; -@@ -355,10 +356,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, - /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); - -- tcp_select_initial_window(tcp_full_space(sk), req->mss, -- &req->rcv_wnd, &req->window_clamp, -- ireq->wscale_ok, &rcv_wscale, -- dst_metric(&rt->dst, RTAX_INITRWND)); -+ tp->ops->select_initial_window(tcp_full_space(sk), req->mss, -+ &req->rcv_wnd, &req->window_clamp, -+ ireq->wscale_ok, &rcv_wscale, -+ dst_metric(&rt->dst, RTAX_INITRWND), sk); - - ireq->rcv_wscale = rcv_wscale; - -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 9d2118e5fbc7..2cb89f886d45 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -271,6 +271,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -371,6 +372,24 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) - return period; - } - -+const struct tcp_sock_ops tcp_specific = { -+ .__select_window = __tcp_select_window, -+ .select_window = tcp_select_window, -+ .select_initial_window = tcp_select_initial_window, -+ .init_buffer_space = tcp_init_buffer_space, -+ .set_rto = tcp_set_rto, -+ .should_expand_sndbuf = tcp_should_expand_sndbuf, -+ .init_congestion_control = tcp_init_congestion_control, -+ .send_fin = tcp_send_fin, -+ .write_xmit = tcp_write_xmit, -+ .send_active_reset = tcp_send_active_reset, -+ .write_wakeup = tcp_write_wakeup, -+ .prune_ofo_queue = tcp_prune_ofo_queue, -+ .retransmit_timer = tcp_retransmit_timer, -+ .time_wait = tcp_time_wait, -+ .cleanup_rbuf = tcp_cleanup_rbuf, -+}; -+ - /* Address-family independent initialization for a tcp_sock. - * - * NOTE: A lot of things set to zero explicitly by call to -@@ -419,6 +438,8 @@ void tcp_init_sock(struct sock *sk) - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; - -+ tp->ops = &tcp_specific; -+ - local_bh_disable(); - sock_update_memcg(sk); - sk_sockets_allocated_inc(sk); -@@ -726,6 +747,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, - int ret; - - sock_rps_record_flow(sk); -+ -+#ifdef CONFIG_MPTCP -+ if (mptcp(tcp_sk(sk))) { -+ struct sock *sk_it; -+ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) -+ sock_rps_record_flow(sk_it); -+ } -+#endif - /* - * We can't seek on a socket input - */ -@@ -821,8 +850,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) - return NULL; - } - --static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, -- int large_allowed) -+unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) - { - struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal, old_size_goal; -@@ -872,8 +900,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) - { - int mss_now; - -- mss_now = tcp_current_mss(sk); -- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); -+ if (mptcp(tcp_sk(sk))) { -+ mss_now = mptcp_current_mss(sk); -+ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); -+ } else { -+ mss_now = tcp_current_mss(sk); -+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); -+ } - - return mss_now; - } -@@ -892,11 +925,32 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, - * is fully established. - */ - if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && -- !tcp_passive_fastopen(sk)) { -+ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ? -+ tp->mpcb->master_sk : sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_err; - } - -+ if (mptcp(tp)) { -+ struct sock *sk_it = sk; -+ -+ /* We must check this with socket-lock hold because we iterate -+ * over the subflows. -+ */ -+ if (!mptcp_can_sendpage(sk)) { -+ ssize_t ret; -+ -+ release_sock(sk); -+ ret = sock_no_sendpage(sk->sk_socket, page, offset, -+ size, flags); -+ lock_sock(sk); -+ return ret; -+ } -+ -+ mptcp_for_each_sk(tp->mpcb, sk_it) -+ sock_rps_record_flow(sk_it); -+ } -+ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - - mss_now = tcp_send_mss(sk, &size_goal, flags); -@@ -1001,8 +1055,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, - { - ssize_t res; - -- if (!(sk->sk_route_caps & NETIF_F_SG) || -- !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) -+ /* If MPTCP is enabled, we check it later after establishment */ -+ if (!mptcp(tcp_sk(sk)) && (!(sk->sk_route_caps & NETIF_F_SG) || -+ !(sk->sk_route_caps & NETIF_F_ALL_CSUM))) - return sock_no_sendpage(sk->sk_socket, page, offset, size, - flags); - -@@ -1018,6 +1073,9 @@ static inline int select_size(const struct sock *sk, bool sg) - const struct tcp_sock *tp = tcp_sk(sk); - int tmp = tp->mss_cache; - -+ if (mptcp(tp)) -+ return mptcp_select_size(sk, sg); -+ - if (sg) { - if (sk_can_gso(sk)) { - /* Small frames wont use a full page: -@@ -1100,11 +1158,18 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - * is fully established. - */ - if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && -- !tcp_passive_fastopen(sk)) { -+ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ? -+ tp->mpcb->master_sk : sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) - goto do_error; - } - -+ if (mptcp(tp)) { -+ struct sock *sk_it = sk; -+ mptcp_for_each_sk(tp->mpcb, sk_it) -+ sock_rps_record_flow(sk_it); -+ } -+ - if (unlikely(tp->repair)) { - if (tp->repair_queue == TCP_RECV_QUEUE) { - copied = tcp_send_rcvq(sk, msg, size); -@@ -1132,7 +1197,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto out_err; - -- sg = !!(sk->sk_route_caps & NETIF_F_SG); -+ if (mptcp(tp)) -+ sg = mptcp_can_sg(sk); -+ else -+ sg = !!(sk->sk_route_caps & NETIF_F_SG); - - while (--iovlen >= 0) { - size_t seglen = iov->iov_len; -@@ -1183,8 +1251,15 @@ new_segment: - - /* - * Check whether we can use HW checksum. -+ * -+ * If dss-csum is enabled, we do not do hw-csum. -+ * In case of non-mptcp we check the -+ * device-capabilities. -+ * In case of mptcp, hw-csum's will be handled -+ * later in mptcp_write_xmit. - */ -- if (sk->sk_route_caps & NETIF_F_ALL_CSUM) -+ if (((mptcp(tp) && !tp->mpcb->dss_csum) || !mptcp(tp)) && -+ (mptcp(tp) || sk->sk_route_caps & NETIF_F_ALL_CSUM)) - skb->ip_summed = CHECKSUM_PARTIAL; - - skb_entail(sk, skb); -@@ -1422,7 +1497,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) - - /* Optimize, __tcp_select_window() is not cheap. */ - if (2*rcv_window_now <= tp->window_clamp) { -- __u32 new_window = __tcp_select_window(sk); -+ __u32 new_window = tp->ops->__select_window(sk); - - /* Send ACK now, if this read freed lots of space - * in our buffer. Certainly, new_window is new window. -@@ -1587,7 +1662,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, - /* Clean up data we have read: This will do ACK frames. */ - if (copied > 0) { - tcp_recv_skb(sk, seq, &offset); -- tcp_cleanup_rbuf(sk, copied); -+ tp->ops->cleanup_rbuf(sk, copied); - } - return copied; - } -@@ -1623,6 +1698,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - - lock_sock(sk); - -+#ifdef CONFIG_MPTCP -+ if (mptcp(tp)) { -+ struct sock *sk_it; -+ mptcp_for_each_sk(tp->mpcb, sk_it) -+ sock_rps_record_flow(sk_it); -+ } -+#endif -+ - err = -ENOTCONN; - if (sk->sk_state == TCP_LISTEN) - goto out; -@@ -1761,7 +1844,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - } - } - -- tcp_cleanup_rbuf(sk, copied); -+ tp->ops->cleanup_rbuf(sk, copied); - - if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { - /* Install new reader */ -@@ -1813,7 +1896,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) { - tcp_service_net_dma(sk, true); -- tcp_cleanup_rbuf(sk, copied); -+ tp->ops->cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -@@ -1993,7 +2076,7 @@ skip_copy: - */ - - /* Clean up data we have read: This will do ACK frames. */ -- tcp_cleanup_rbuf(sk, copied); -+ tp->ops->cleanup_rbuf(sk, copied); - - release_sock(sk); - return copied; -@@ -2070,7 +2153,7 @@ static const unsigned char new_state[16] = { - /* TCP_CLOSING */ TCP_CLOSING, - }; - --static int tcp_close_state(struct sock *sk) -+int tcp_close_state(struct sock *sk) - { - int next = (int)new_state[sk->sk_state]; - int ns = next & TCP_STATE_MASK; -@@ -2100,7 +2183,7 @@ void tcp_shutdown(struct sock *sk, int how) - TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { - /* Clear out any half completed packets. FIN if needed. */ - if (tcp_close_state(sk)) -- tcp_send_fin(sk); -+ tcp_sk(sk)->ops->send_fin(sk); - } - } - EXPORT_SYMBOL(tcp_shutdown); -@@ -2125,6 +2208,11 @@ void tcp_close(struct sock *sk, long timeout) - int data_was_unread = 0; - int state; - -+ if (is_meta_sk(sk)) { -+ mptcp_close(sk, timeout); -+ return; -+ } -+ - lock_sock(sk); - sk->sk_shutdown = SHUTDOWN_MASK; - -@@ -2167,7 +2255,7 @@ void tcp_close(struct sock *sk, long timeout) - /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); - tcp_set_state(sk, TCP_CLOSE); -- tcp_send_active_reset(sk, sk->sk_allocation); -+ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation); - } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { - /* Check zero linger _after_ checking for unread data. */ - sk->sk_prot->disconnect(sk, 0); -@@ -2247,7 +2335,7 @@ adjudge_to_death: - struct tcp_sock *tp = tcp_sk(sk); - if (tp->linger2 < 0) { - tcp_set_state(sk, TCP_CLOSE); -- tcp_send_active_reset(sk, GFP_ATOMIC); -+ tp->ops->send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPABORTONLINGER); - } else { -@@ -2257,7 +2345,8 @@ adjudge_to_death: - inet_csk_reset_keepalive_timer(sk, - tmo - TCP_TIMEWAIT_LEN); - } else { -- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); -+ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2, -+ tmo); - goto out; - } - } -@@ -2266,7 +2355,7 @@ adjudge_to_death: - sk_mem_reclaim(sk); - if (tcp_check_oom(sk, 0)) { - tcp_set_state(sk, TCP_CLOSE); -- tcp_send_active_reset(sk, GFP_ATOMIC); -+ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPABORTONMEMORY); - } -@@ -2291,15 +2380,6 @@ out: - } - EXPORT_SYMBOL(tcp_close); - --/* These states need RST on ABORT according to RFC793 */ -- --static inline bool tcp_need_reset(int state) --{ -- return (1 << state) & -- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | -- TCPF_FIN_WAIT2 | TCPF_SYN_RECV); --} -- - int tcp_disconnect(struct sock *sk, int flags) - { - struct inet_sock *inet = inet_sk(sk); -@@ -2322,7 +2402,7 @@ int tcp_disconnect(struct sock *sk, int flags) - /* The last check adjusts for discrepancy of Linux wrt. RFC - * states - */ -- tcp_send_active_reset(sk, gfp_any()); -+ tp->ops->send_active_reset(sk, gfp_any()); - sk->sk_err = ECONNRESET; - } else if (old_state == TCP_SYN_SENT) - sk->sk_err = ECONNRESET; -@@ -2340,6 +2420,13 @@ int tcp_disconnect(struct sock *sk, int flags) - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); - -+ if (is_meta_sk(sk)) { -+ mptcp_disconnect(sk); -+ } else { -+ if (tp->inside_tk_table) -+ mptcp_hash_remove_bh(tp); -+ } -+ - sk->sk_shutdown = 0; - sock_reset_flag(sk, SOCK_DONE); - tp->srtt_us = 0; -@@ -2632,6 +2719,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, - break; - - case TCP_DEFER_ACCEPT: -+ /* An established MPTCP-connection (mptcp(tp) only returns true -+ * if the socket is established) should not use DEFER on new -+ * subflows. -+ */ -+ if (mptcp(tp)) -+ break; - /* Translate value in seconds to number of retransmits */ - icsk->icsk_accept_queue.rskq_defer_accept = - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, -@@ -2659,7 +2752,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - inet_csk_ack_scheduled(sk)) { - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; -- tcp_cleanup_rbuf(sk, 1); -+ tp->ops->cleanup_rbuf(sk, 1); - if (!(val & 1)) - icsk->icsk_ack.pingpong = 1; - } -@@ -2699,6 +2792,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level, - tp->notsent_lowat = val; - sk->sk_write_space(sk); - break; -+#ifdef CONFIG_MPTCP -+ case MPTCP_ENABLED: -+ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) { -+ if (val) -+ tp->mptcp_enabled = 1; -+ else -+ tp->mptcp_enabled = 0; -+ } else { -+ err = -EPERM; -+ } -+ break; -+#endif - default: - err = -ENOPROTOOPT; - break; -@@ -2931,6 +3036,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level, - case TCP_NOTSENT_LOWAT: - val = tp->notsent_lowat; - break; -+#ifdef CONFIG_MPTCP -+ case MPTCP_ENABLED: -+ val = tp->mptcp_enabled; -+ break; -+#endif - default: - return -ENOPROTOOPT; - } -@@ -3120,8 +3230,11 @@ void tcp_done(struct sock *sk) - if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); - -+ WARN_ON(sk->sk_state == TCP_CLOSE); - tcp_set_state(sk, TCP_CLOSE); -+ - tcp_clear_xmit_timers(sk); -+ - if (req != NULL) - reqsk_fastopen_remove(sk, req, false); - -diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c -index 9771563ab564..5c230d96c4c1 100644 ---- a/net/ipv4/tcp_fastopen.c -+++ b/net/ipv4/tcp_fastopen.c -@@ -7,6 +7,7 @@ - #include - #include - #include -+#include - - int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - -@@ -133,7 +134,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, - { - struct tcp_sock *tp; - struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; -- struct sock *child; -+ struct sock *child, *meta_sk; - - req->num_retrans = 0; - req->num_timeout = 0; -@@ -176,13 +177,6 @@ static bool tcp_fastopen_create_child(struct sock *sk, - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); - -- /* Now finish processing the fastopen child socket. */ -- inet_csk(child)->icsk_af_ops->rebuild_header(child); -- tcp_init_congestion_control(child); -- tcp_mtup_init(child); -- tcp_init_metrics(child); -- tcp_init_buffer_space(child); -- - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * -@@ -199,8 +193,24 @@ static bool tcp_fastopen_create_child(struct sock *sk, - tp->syn_data_acked = 1; - } - tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; -+ -+ meta_sk = child; -+ if (!mptcp_check_req_fastopen(meta_sk, req)) { -+ child = tcp_sk(meta_sk)->mpcb->master_sk; -+ tp = tcp_sk(child); -+ } -+ -+ /* Now finish processing the fastopen child socket. */ -+ inet_csk(child)->icsk_af_ops->rebuild_header(child); -+ tp->ops->init_congestion_control(child); -+ tcp_mtup_init(child); -+ tcp_init_metrics(child); -+ tp->ops->init_buffer_space(child); -+ - sk->sk_data_ready(sk); -- bh_unlock_sock(child); -+ if (mptcp(tcp_sk(child))) -+ bh_unlock_sock(child); -+ bh_unlock_sock(meta_sk); - sock_put(child); - WARN_ON(req->sk == NULL); - return true; -diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 40639c288dc2..3273bb69f387 100644 ---- a/net/ipv4/tcp_input.c -+++ b/net/ipv4/tcp_input.c -@@ -74,6 +74,9 @@ - #include - #include - #include -+#include -+#include -+#include - - int sysctl_tcp_timestamps __read_mostly = 1; - int sysctl_tcp_window_scaling __read_mostly = 1; -@@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly; - int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; - int sysctl_tcp_early_retrans __read_mostly = 3; - --#define FLAG_DATA 0x01 /* Incoming frame contained data. */ --#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ --#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ --#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ --#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ --#define FLAG_DATA_SACKED 0x20 /* New SACK. */ --#define FLAG_ECE 0x40 /* ECE in this ACK */ --#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ --#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ --#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ --#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ --#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ --#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ -- --#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) --#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) --#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) --#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) -- - #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) - #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) - -@@ -181,7 +165,7 @@ static void tcp_incr_quickack(struct sock *sk) - icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); - } - --static void tcp_enter_quickack_mode(struct sock *sk) -+void tcp_enter_quickack_mode(struct sock *sk) - { - struct inet_connection_sock *icsk = inet_csk(sk); - tcp_incr_quickack(sk); -@@ -283,8 +267,12 @@ static void tcp_sndbuf_expand(struct sock *sk) - per_mss = roundup_pow_of_two(per_mss) + - SKB_DATA_ALIGN(sizeof(struct sk_buff)); - -- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); -- nr_segs = max_t(u32, nr_segs, tp->reordering + 1); -+ if (mptcp(tp)) { -+ nr_segs = mptcp_check_snd_buf(tp); -+ } else { -+ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); -+ nr_segs = max_t(u32, nr_segs, tp->reordering + 1); -+ } - - /* Fast Recovery (RFC 5681 3.2) : - * Cubic needs 1.7 factor, rounded to 2 to include -@@ -292,8 +280,16 @@ static void tcp_sndbuf_expand(struct sock *sk) - */ - sndmem = 2 * nr_segs * per_mss; - -- if (sk->sk_sndbuf < sndmem) -+ /* MPTCP: after this sndmem is the new contribution of the -+ * current subflow to the aggregated sndbuf */ -+ if (sk->sk_sndbuf < sndmem) { -+ int old_sndbuf = sk->sk_sndbuf; - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); -+ /* MPTCP: ok, the subflow sndbuf has grown, reflect -+ * this in the aggregate buffer.*/ -+ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf) -+ mptcp_update_sndbuf(tp); -+ } - } - - /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) -@@ -342,10 +338,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) - static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) - { - struct tcp_sock *tp = tcp_sk(sk); -+ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); - - /* Check #1 */ -- if (tp->rcv_ssthresh < tp->window_clamp && -- (int)tp->rcv_ssthresh < tcp_space(sk) && -+ if (meta_tp->rcv_ssthresh < meta_tp->window_clamp && -+ (int)meta_tp->rcv_ssthresh < tcp_space(sk) && - !sk_under_memory_pressure(sk)) { - int incr; - -@@ -353,14 +351,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) - * will fit to rcvbuf in future. - */ - if (tcp_win_from_space(skb->truesize) <= skb->len) -- incr = 2 * tp->advmss; -+ incr = 2 * meta_tp->advmss; - else -- incr = __tcp_grow_window(sk, skb); -+ incr = __tcp_grow_window(meta_sk, skb); - - if (incr) { - incr = max_t(int, incr, 2 * skb->len); -- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, -- tp->window_clamp); -+ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr, -+ meta_tp->window_clamp); - inet_csk(sk)->icsk_ack.quick |= 1; - } - } -@@ -543,7 +541,10 @@ void tcp_rcv_space_adjust(struct sock *sk) - int copied; - - time = tcp_time_stamp - tp->rcvq_space.time; -- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) -+ if (mptcp(tp)) { -+ if (mptcp_check_rtt(tp, time)) -+ return; -+ } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) - return; - - /* Number of bytes copied to user in last RTT */ -@@ -761,7 +762,7 @@ static void tcp_update_pacing_rate(struct sock *sk) - /* Calculate rto without backoff. This is the second half of Van Jacobson's - * routine referred to above. - */ --static void tcp_set_rto(struct sock *sk) -+void tcp_set_rto(struct sock *sk) - { - const struct tcp_sock *tp = tcp_sk(sk); - /* Old crap is replaced with new one. 8) -@@ -1376,7 +1377,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, - int len; - int in_sack; - -- if (!sk_can_gso(sk)) -+ /* For MPTCP we cannot shift skb-data and remove one skb from the -+ * send-queue, because this will make us loose the DSS-option (which -+ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing. -+ */ -+ if (!sk_can_gso(sk) || mptcp(tp)) - goto fallback; - - /* Normally R but no L won't result in plain S */ -@@ -2915,7 +2920,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - return false; - - tcp_rtt_estimator(sk, seq_rtt_us); -- tcp_set_rto(sk); -+ tp->ops->set_rto(sk); - - /* RFC6298: only reset backoff on valid RTT measurement. */ - inet_csk(sk)->icsk_backoff = 0; -@@ -3000,7 +3005,7 @@ void tcp_resume_early_retransmit(struct sock *sk) - } - - /* If we get here, the whole TSO packet has not been acked. */ --static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) -+u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) - { - struct tcp_sock *tp = tcp_sk(sk); - u32 packets_acked; -@@ -3095,6 +3100,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - */ - if (!(scb->tcp_flags & TCPHDR_SYN)) { - flag |= FLAG_DATA_ACKED; -+ if (mptcp(tp) && mptcp_is_data_seq(skb)) -+ flag |= MPTCP_FLAG_DATA_ACKED; - } else { - flag |= FLAG_SYN_ACKED; - tp->retrans_stamp = 0; -@@ -3189,7 +3196,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - return flag; - } - --static void tcp_ack_probe(struct sock *sk) -+void tcp_ack_probe(struct sock *sk) - { - const struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); -@@ -3236,9 +3243,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) - /* Check that window update is acceptable. - * The function assumes that snd_una<=ack<=snd_next. - */ --static inline bool tcp_may_update_window(const struct tcp_sock *tp, -- const u32 ack, const u32 ack_seq, -- const u32 nwin) -+bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, -+ const u32 ack_seq, const u32 nwin) - { - return after(ack, tp->snd_una) || - after(ack_seq, tp->snd_wl1) || -@@ -3357,7 +3363,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) - } - - /* This routine deals with incoming acks, but not outgoing ones. */ --static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) -+static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) - { - struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); -@@ -3449,6 +3455,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - sack_rtt_us); - acked -= tp->packets_out; - -+ if (mptcp(tp)) { -+ if (mptcp_fallback_infinite(sk, flag)) { -+ pr_err("%s resetting flow\n", __func__); -+ mptcp_send_reset(sk); -+ goto invalid_ack; -+ } -+ -+ mptcp_clean_rtx_infinite(skb, sk); -+ } -+ - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); -@@ -3512,8 +3528,9 @@ old_ack: - * the fast version below fails. - */ - void tcp_parse_options(const struct sk_buff *skb, -- struct tcp_options_received *opt_rx, int estab, -- struct tcp_fastopen_cookie *foc) -+ struct tcp_options_received *opt_rx, -+ struct mptcp_options_received *mopt, -+ int estab, struct tcp_fastopen_cookie *foc) - { - const unsigned char *ptr; - const struct tcphdr *th = tcp_hdr(skb); -@@ -3596,6 +3613,9 @@ void tcp_parse_options(const struct sk_buff *skb, - */ - break; - #endif -+ case TCPOPT_MPTCP: -+ mptcp_parse_options(ptr - 2, opsize, mopt, skb); -+ break; - case TCPOPT_EXP: - /* Fast Open option shares code 254 using a - * 16 bits magic number. It's valid only in -@@ -3657,8 +3677,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, - if (tcp_parse_aligned_timestamp(tp, th)) - return true; - } -- -- tcp_parse_options(skb, &tp->rx_opt, 1, NULL); -+ tcp_parse_options(skb, &tp->rx_opt, mptcp(tp) ? &tp->mptcp->rx_opt : NULL, -+ 1, NULL); - if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tp->rx_opt.rcv_tsecr -= tp->tsoffset; - -@@ -3831,6 +3851,8 @@ static void tcp_fin(struct sock *sk) - dst = __sk_dst_get(sk); - if (!dst || !dst_metric(dst, RTAX_QUICKACK)) - inet_csk(sk)->icsk_ack.pingpong = 1; -+ if (mptcp(tp)) -+ mptcp_sub_close_passive(sk); - break; - - case TCP_CLOSE_WAIT: -@@ -3852,9 +3874,16 @@ static void tcp_fin(struct sock *sk) - tcp_set_state(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: -+ if (mptcp(tp)) { -+ /* The socket will get closed by mptcp_data_ready. -+ * We first have to process all data-sequences. -+ */ -+ tp->close_it = 1; -+ break; -+ } - /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_send_ack(sk); -- tcp_time_wait(sk, TCP_TIME_WAIT, 0); -+ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0); - break; - default: - /* Only TCP_LISTEN and TCP_CLOSE are left, in these -@@ -3876,6 +3905,10 @@ static void tcp_fin(struct sock *sk) - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - -+ /* Don't wake up MPTCP-subflows */ -+ if (mptcp(tp)) -+ return; -+ - /* Do not send POLL_HUP for half duplex close. */ - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) -@@ -4073,7 +4106,11 @@ static void tcp_ofo_queue(struct sock *sk) - tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); - } - -- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { -+ /* In case of MPTCP, the segment may be empty if it's a -+ * non-data DATA_FIN. (see beginning of tcp_data_queue) -+ */ -+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) && -+ !(mptcp(tp) && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) { - SOCK_DEBUG(sk, "ofo packet was already received\n"); - __skb_unlink(skb, &tp->out_of_order_queue); - __kfree_skb(skb); -@@ -4091,12 +4128,14 @@ static void tcp_ofo_queue(struct sock *sk) - } - } - --static bool tcp_prune_ofo_queue(struct sock *sk); - static int tcp_prune_queue(struct sock *sk); - - static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, - unsigned int size) - { -+ if (mptcp(tcp_sk(sk))) -+ sk = mptcp_meta_sk(sk); -+ - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, skb, size)) { - -@@ -4104,7 +4143,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, - return -1; - - if (!sk_rmem_schedule(sk, skb, size)) { -- if (!tcp_prune_ofo_queue(sk)) -+ if (!tcp_sk(sk)->ops->prune_ofo_queue(sk)) - return -1; - - if (!sk_rmem_schedule(sk, skb, size)) -@@ -4127,15 +4166,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, - * Better try to coalesce them right now to avoid future collapses. - * Returns true if caller should free @from instead of queueing it - */ --static bool tcp_try_coalesce(struct sock *sk, -- struct sk_buff *to, -- struct sk_buff *from, -- bool *fragstolen) -+bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, -+ bool *fragstolen) - { - int delta; - - *fragstolen = false; - -+ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk)) -+ return false; -+ - if (tcp_hdr(from)->fin) - return false; - -@@ -4225,7 +4265,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) - - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { -- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { -+ /* MPTCP allows non-data data-fin to be in the ofo-queue */ -+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) && -+ !(mptcp(tp) && end_seq == seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); -@@ -4263,6 +4305,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) - end_seq); - break; - } -+ /* MPTCP allows non-data data-fin to be in the ofo-queue */ -+ if (mptcp(tp) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq) -+ continue; - __skb_unlink(skb1, &tp->out_of_order_queue); - tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, - TCP_SKB_CB(skb1)->end_seq); -@@ -4280,8 +4325,8 @@ end: - } - } - --static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, -- bool *fragstolen) -+int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, -+ bool *fragstolen) - { - int eaten; - struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); -@@ -4343,7 +4388,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) - int eaten = -1; - bool fragstolen = false; - -- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) -+ /* If no data is present, but a data_fin is in the options, we still -+ * have to call mptcp_queue_skb later on. */ -+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && -+ !(mptcp(tp) && mptcp_is_data_fin(skb))) - goto drop; - - skb_dst_drop(skb); -@@ -4389,7 +4437,7 @@ queue_and_out: - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - } - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; -- if (skb->len) -+ if (skb->len || mptcp_is_data_fin(skb)) - tcp_event_data_recv(sk, skb); - if (th->fin) - tcp_fin(sk); -@@ -4411,7 +4459,11 @@ queue_and_out: - - if (eaten > 0) - kfree_skb_partial(skb, fragstolen); -- if (!sock_flag(sk, SOCK_DEAD)) -+ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp)) -+ /* MPTCP: we always have to call data_ready, because -+ * we may be about to receive a data-fin, which still -+ * must get queued. -+ */ - sk->sk_data_ready(sk); - return; - } -@@ -4463,6 +4515,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - next = skb_queue_next(list, skb); - - __skb_unlink(skb, list); -+ if (mptcp(tcp_sk(sk))) -+ mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb); - __kfree_skb(skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); - -@@ -4630,7 +4684,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk) - * Purge the out-of-order queue. - * Return true if queue was pruned. - */ --static bool tcp_prune_ofo_queue(struct sock *sk) -+bool tcp_prune_ofo_queue(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); - bool res = false; -@@ -4686,7 +4740,7 @@ static int tcp_prune_queue(struct sock *sk) - /* Collapsing did not help, destructive actions follow. - * This must not ever occur. */ - -- tcp_prune_ofo_queue(sk); -+ tp->ops->prune_ofo_queue(sk); - - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) - return 0; -@@ -4702,7 +4756,29 @@ static int tcp_prune_queue(struct sock *sk) - return -1; - } - --static bool tcp_should_expand_sndbuf(const struct sock *sk) -+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. -+ * As additional protections, we do not touch cwnd in retransmission phases, -+ * and if application hit its sndbuf limit recently. -+ */ -+void tcp_cwnd_application_limited(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && -+ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { -+ /* Limited by application or receiver window. */ -+ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); -+ u32 win_used = max(tp->snd_cwnd_used, init_win); -+ if (win_used < tp->snd_cwnd) { -+ tp->snd_ssthresh = tcp_current_ssthresh(sk); -+ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; -+ } -+ tp->snd_cwnd_used = 0; -+ } -+ tp->snd_cwnd_stamp = tcp_time_stamp; -+} -+ -+bool tcp_should_expand_sndbuf(const struct sock *sk) - { - const struct tcp_sock *tp = tcp_sk(sk); - -@@ -4737,7 +4813,7 @@ static void tcp_new_space(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); - -- if (tcp_should_expand_sndbuf(sk)) { -+ if (tp->ops->should_expand_sndbuf(sk)) { - tcp_sndbuf_expand(sk); - tp->snd_cwnd_stamp = tcp_time_stamp; - } -@@ -4749,8 +4825,9 @@ static void tcp_check_space(struct sock *sk) - { - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); -- if (sk->sk_socket && -- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) -+ if (mptcp(tcp_sk(sk)) || -+ (sk->sk_socket && -+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))) - tcp_new_space(sk); - } - } -@@ -4773,7 +4850,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) - /* ... and right edge of window advances far enough. - * (tcp_recvmsg() will send ACK otherwise). Or... - */ -- __tcp_select_window(sk) >= tp->rcv_wnd) || -+ tp->ops->__select_window(sk) >= tp->rcv_wnd) || - /* We ACK each frame or... */ - tcp_in_quickack_mode(sk) || - /* We have out of order data. */ -@@ -4875,6 +4952,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t - { - struct tcp_sock *tp = tcp_sk(sk); - -+ /* MPTCP urgent data is not yet supported */ -+ if (mptcp(tp)) -+ return; -+ - /* Check if we get a new urgent pointer - normally not. */ - if (th->urg) - tcp_check_urg(sk, th); -@@ -4942,8 +5023,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, - } - - #ifdef CONFIG_NET_DMA --static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, -- int hlen) -+bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) - { - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; -@@ -5052,9 +5132,15 @@ syn_challenge: - goto discard; - } - -+ /* If valid: post process the received MPTCP options. */ -+ if (mptcp(tp) && mptcp_handle_options(sk, th, skb)) -+ goto discard; -+ - return true; - - discard: -+ if (mptcp(tp)) -+ mptcp_reset_mopt(tp); - __kfree_skb(skb); - return false; - } -@@ -5106,6 +5192,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - - tp->rx_opt.saw_tstamp = 0; - -+ /* MPTCP: force slowpath. */ -+ if (mptcp(tp)) -+ goto slow_path; -+ - /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_prediction is to be made - * 'S' will always be tp->tcp_header_len >> 2 -@@ -5205,7 +5295,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); - } - if (copied_early) -- tcp_cleanup_rbuf(sk, skb->len); -+ tp->ops->cleanup_rbuf(sk, skb->len); - } - if (!eaten) { - if (tcp_checksum_complete_user(sk, skb)) -@@ -5313,14 +5403,14 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) - - tcp_init_metrics(sk); - -- tcp_init_congestion_control(sk); -+ tp->ops->init_congestion_control(sk); - - /* Prevent spurious tcp_cwnd_restart() on first data - * packet. - */ - tp->lsndtime = tcp_time_stamp; - -- tcp_init_buffer_space(sk); -+ tp->ops->init_buffer_space(sk); - - if (sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); -@@ -5350,7 +5440,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, - /* Get original SYNACK MSS value if user MSS sets mss_clamp */ - tcp_clear_options(&opt); - opt.user_mss = opt.mss_clamp = 0; -- tcp_parse_options(synack, &opt, 0, NULL); -+ tcp_parse_options(synack, &opt, NULL, 0, NULL); - mss = opt.mss_clamp; - } - -@@ -5365,7 +5455,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, - - tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); - -- if (data) { /* Retransmit unacked data in SYN */ -+ /* In mptcp case, we do not rely on "retransmit", but instead on -+ * "transmit", because if fastopen data is not acked, the retransmission -+ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen). -+ */ -+ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */ - tcp_for_write_queue_from(data, sk) { - if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data)) -@@ -5388,8 +5482,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_fastopen_cookie foc = { .len = -1 }; - int saved_clamp = tp->rx_opt.mss_clamp; -+ struct mptcp_options_received mopt; -+ mptcp_init_mp_opt(&mopt); - -- tcp_parse_options(skb, &tp->rx_opt, 0, &foc); -+ tcp_parse_options(skb, &tp->rx_opt, -+ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc); - if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tp->rx_opt.rcv_tsecr -= tp->tsoffset; - -@@ -5448,6 +5545,30 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_ack(sk, skb, FLAG_SLOWPATH); - -+ if (tp->request_mptcp || mptcp(tp)) { -+ int ret; -+ ret = mptcp_rcv_synsent_state_process(sk, &sk, -+ skb, &mopt); -+ -+ /* May have changed if we support MPTCP */ -+ tp = tcp_sk(sk); -+ icsk = inet_csk(sk); -+ -+ if (ret == 1) -+ goto reset_and_undo; -+ if (ret == 2) -+ goto discard; -+ } -+ -+ if (mptcp(tp) && !is_master_tp(tp)) { -+ /* Timer for repeating the ACK until an answer -+ * arrives. Used only when establishing an additional -+ * subflow inside of an MPTCP connection. -+ */ -+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, -+ jiffies + icsk->icsk_rto); -+ } -+ - /* Ok.. it's good. Set up sequence numbers and - * move to established. - */ -@@ -5474,6 +5595,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - tp->tcp_header_len = sizeof(struct tcphdr); - } - -+ if (mptcp(tp)) { -+ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; -+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; -+ } -+ - if (tcp_is_sack(tp) && sysctl_tcp_fack) - tcp_enable_fack(tp); - -@@ -5494,9 +5620,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - tcp_rcv_fastopen_synack(sk, skb, &foc)) - return -1; - -- if (sk->sk_write_pending || -+ /* With MPTCP we cannot send data on the third ack due to the -+ * lack of option-space to combine with an MP_CAPABLE. -+ */ -+ if (!mptcp(tp) && (sk->sk_write_pending || - icsk->icsk_accept_queue.rskq_defer_accept || -- icsk->icsk_ack.pingpong) { -+ icsk->icsk_ack.pingpong)) { - /* Save one ACK. Data will be ready after - * several ticks, if write_pending is set. - * -@@ -5536,6 +5665,7 @@ discard: - tcp_paws_reject(&tp->rx_opt, 0)) - goto discard_and_undo; - -+ /* TODO - check this here for MPTCP */ - if (th->syn) { - /* We see SYN without ACK. It is attempt of - * simultaneous connect with crossed SYNs. -@@ -5552,6 +5682,11 @@ discard: - tp->tcp_header_len = sizeof(struct tcphdr); - } - -+ if (mptcp(tp)) { -+ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; -+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; -+ } -+ - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; - -@@ -5610,6 +5745,7 @@ reset_and_undo: - - int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) -+ __releases(&sk->sk_lock.slock) - { - struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); -@@ -5661,6 +5797,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - - case TCP_SYN_SENT: - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); -+ if (is_meta_sk(sk)) { -+ sk = tcp_sk(sk)->mpcb->master_sk; -+ tp = tcp_sk(sk); -+ -+ /* Need to call it here, because it will announce new -+ * addresses, which can only be done after the third ack -+ * of the 3-way handshake. -+ */ -+ mptcp_update_metasocket(sk, tp->meta_sk); -+ } - if (queued >= 0) - return queued; - -@@ -5668,6 +5814,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - tcp_urg(sk, skb, th); - __kfree_skb(skb); - tcp_data_snd_check(sk); -+ if (mptcp(tp) && is_master_tp(tp)) -+ bh_unlock_sock(sk); - return 0; - } - -@@ -5706,11 +5854,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - synack_stamp = tp->lsndtime; - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); -- tcp_init_congestion_control(sk); -+ tp->ops->init_congestion_control(sk); - - tcp_mtup_init(sk); - tp->copied_seq = tp->rcv_nxt; -- tcp_init_buffer_space(sk); -+ tp->ops->init_buffer_space(sk); - } - smp_mb(); - tcp_set_state(sk, TCP_ESTABLISHED); -@@ -5730,6 +5878,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; -+ if (mptcp(tp)) -+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; - - if (req) { - /* Re-arm the timer because data may have been sent out. -@@ -5751,6 +5901,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - - tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); -+ /* Send an ACK when establishing a new -+ * MPTCP subflow, i.e. using an MP_JOIN -+ * subtype. -+ */ -+ if (mptcp(tp) && !is_master_tp(tp)) -+ tcp_send_ack(sk); - break; - - case TCP_FIN_WAIT1: { -@@ -5802,7 +5958,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); -- } else if (th->fin || sock_owned_by_user(sk)) { -+ } else if (th->fin || mptcp_is_data_fin(skb) || -+ sock_owned_by_user(sk)) { - /* Bad case. We could lose such FIN otherwise. - * It is not a big problem, but it looks confusing - * and not so rare event. We still can lose it now, -@@ -5811,7 +5968,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - */ - inet_csk_reset_keepalive_timer(sk, tmo); - } else { -- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); -+ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo); - goto discard; - } - break; -@@ -5819,7 +5976,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - - case TCP_CLOSING: - if (tp->snd_una == tp->write_seq) { -- tcp_time_wait(sk, TCP_TIME_WAIT, 0); -+ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0); - goto discard; - } - break; -@@ -5831,6 +5988,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - goto discard; - } - break; -+ case TCP_CLOSE: -+ if (tp->mp_killed) -+ goto discard; - } - - /* step 6: check the URG bit */ -@@ -5851,7 +6011,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - */ - if (sk->sk_shutdown & RCV_SHUTDOWN) { - if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && -- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { -+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && -+ !mptcp(tp)) { -+ /* In case of mptcp, the reset is handled by -+ * mptcp_rcv_state_process -+ */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - tcp_reset(sk); - return 1; -@@ -5877,3 +6041,154 @@ discard: - return 0; - } - EXPORT_SYMBOL(tcp_rcv_state_process); -+ -+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) -+{ -+ struct inet_request_sock *ireq = inet_rsk(req); -+ -+ if (family == AF_INET) -+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), -+ &ireq->ir_rmt_addr, port); -+#if IS_ENABLED(CONFIG_IPV6) -+ else if (family == AF_INET6) -+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), -+ &ireq->ir_v6_rmt_addr, port); -+#endif -+} -+ -+int tcp_conn_request(struct request_sock_ops *rsk_ops, -+ const struct tcp_request_sock_ops *af_ops, -+ struct sock *sk, struct sk_buff *skb) -+{ -+ struct tcp_options_received tmp_opt; -+ struct request_sock *req; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct dst_entry *dst = NULL; -+ __u32 isn = TCP_SKB_CB(skb)->when; -+ bool want_cookie = false, fastopen; -+ struct flowi fl; -+ struct tcp_fastopen_cookie foc = { .len = -1 }; -+ int err; -+ -+ -+ /* TW buckets are converted to open requests without -+ * limitations, they conserve resources and peer is -+ * evidently real one. -+ */ -+ if ((sysctl_tcp_syncookies == 2 || -+ inet_csk_reqsk_queue_is_full(sk)) && !isn) { -+ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); -+ if (!want_cookie) -+ goto drop; -+ } -+ -+ -+ /* Accept backlog is full. If we have already queued enough -+ * of warm entries in syn queue, drop request. It is better than -+ * clogging syn queue with openreqs with exponentially increasing -+ * timeout. -+ */ -+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { -+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -+ goto drop; -+ } -+ -+ req = inet_reqsk_alloc(rsk_ops); -+ if (!req) -+ goto drop; -+ -+ tcp_rsk(req)->af_specific = af_ops; -+ -+ tcp_clear_options(&tmp_opt); -+ tmp_opt.mss_clamp = af_ops->mss_clamp; -+ tmp_opt.user_mss = tp->rx_opt.user_mss; -+ tcp_parse_options(skb, &tmp_opt, NULL, 0, want_cookie ? NULL : &foc); -+ -+ if (want_cookie && !tmp_opt.saw_tstamp) -+ tcp_clear_options(&tmp_opt); -+ -+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; -+ tcp_openreq_init(req, &tmp_opt, skb); -+ -+ if (af_ops->init_req(req, sk, skb)) -+ goto drop_and_free; -+ -+ if (security_inet_conn_request(sk, skb, req)) -+ goto drop_and_free; -+ -+ if (!want_cookie || tmp_opt.tstamp_ok) -+ TCP_ECN_create_request(req, skb, sock_net(sk)); -+ -+ if (want_cookie) { -+ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); -+ req->cookie_ts = tmp_opt.tstamp_ok; -+ } else if (!isn) { -+ /* VJ's idea. We save last timestamp seen -+ * from the destination in peer table, when entering -+ * state TIME-WAIT, and check against it before -+ * accepting new connection request. -+ * -+ * If "isn" is not zero, this request hit alive -+ * timewait bucket, so that all the necessary checks -+ * are made in the function processing timewait state. -+ */ -+ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { -+ bool strict; -+ -+ dst = af_ops->route_req(sk, &fl, req, &strict); -+ if (dst && strict && -+ !tcp_peer_is_proven(req, dst, true)) { -+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); -+ goto drop_and_release; -+ } -+ } -+ /* Kill the following clause, if you dislike this way. */ -+ else if (!sysctl_tcp_syncookies && -+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < -+ (sysctl_max_syn_backlog >> 2)) && -+ !tcp_peer_is_proven(req, dst, false)) { -+ /* Without syncookies last quarter of -+ * backlog is filled with destinations, -+ * proven to be alive. -+ * It means that we continue to communicate -+ * to destinations, already remembered -+ * to the moment of synflood. -+ */ -+ pr_drop_req(req, ntohs(tcp_hdr(skb)->source), -+ rsk_ops->family); -+ goto drop_and_release; -+ } -+ -+ isn = af_ops->init_seq(skb); -+ } -+ if (!dst) { -+ dst = af_ops->route_req(sk, &fl, req, NULL); -+ if (!dst) -+ goto drop_and_free; -+ } -+ -+ tcp_rsk(req)->snt_isn = isn; -+ tcp_openreq_init_rwin(req, sk, dst); -+ fastopen = !want_cookie && -+ tcp_try_fastopen(sk, skb, req, &foc, dst); -+ err = af_ops->send_synack(sk, dst, &fl, req, -+ skb_get_queue_mapping(skb), &foc); -+ if (!fastopen) { -+ if (err || want_cookie) -+ goto drop_and_free; -+ -+ tcp_rsk(req)->listener = NULL; -+ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -+ } -+ -+ return 0; -+ -+drop_and_release: -+ dst_release(dst); -+drop_and_free: -+ reqsk_free(req); -+drop: -+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); -+ return 0; -+} -+EXPORT_SYMBOL(tcp_conn_request); -diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c -index 77cccda1ad0c..c77017f600f1 100644 ---- a/net/ipv4/tcp_ipv4.c -+++ b/net/ipv4/tcp_ipv4.c -@@ -67,6 +67,8 @@ - #include - #include - #include -+#include -+#include - #include - #include - #include -@@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, - struct inet_hashinfo tcp_hashinfo; - EXPORT_SYMBOL(tcp_hashinfo); - --static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) -+__u32 tcp_v4_init_sequence(const struct sk_buff *skb) - { - return secure_tcp_sequence_number(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, -@@ -334,7 +336,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - struct inet_sock *inet; - const int type = icmp_hdr(icmp_skb)->type; - const int code = icmp_hdr(icmp_skb)->code; -- struct sock *sk; -+ struct sock *sk, *meta_sk; - struct sk_buff *skb; - struct request_sock *fastopen; - __u32 seq, snd_una; -@@ -358,13 +360,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - return; - } - -- bh_lock_sock(sk); -+ tp = tcp_sk(sk); -+ if (mptcp(tp)) -+ meta_sk = mptcp_meta_sk(sk); -+ else -+ meta_sk = sk; -+ -+ bh_lock_sock(meta_sk); - /* If too many ICMPs get dropped on busy - * servers this needs to be solved differently. - * We do take care of PMTU discovery (RFC1191) special case : - * we can receive locally generated ICMP messages while socket is held. - */ -- if (sock_owned_by_user(sk)) { -+ if (sock_owned_by_user(meta_sk)) { - if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); - } -@@ -377,7 +385,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - } - - icsk = inet_csk(sk); -- tp = tcp_sk(sk); - seq = ntohl(th->seq); - /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; -@@ -411,11 +418,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - goto out; - - tp->mtu_info = info; -- if (!sock_owned_by_user(sk)) { -+ if (!sock_owned_by_user(meta_sk)) { - tcp_v4_mtu_reduced(sk); - } else { - if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) - sock_hold(sk); -+ if (mptcp(tp)) -+ mptcp_tsq_flags(sk); - } - goto out; - } -@@ -429,7 +438,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - !icsk->icsk_backoff || fastopen) - break; - -- if (sock_owned_by_user(sk)) -+ if (sock_owned_by_user(meta_sk)) - break; - - icsk->icsk_backoff--; -@@ -463,7 +472,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - switch (sk->sk_state) { - struct request_sock *req, **prev; - case TCP_LISTEN: -- if (sock_owned_by_user(sk)) -+ if (sock_owned_by_user(meta_sk)) - goto out; - - req = inet_csk_search_req(sk, &prev, th->dest, -@@ -499,7 +508,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - if (fastopen && fastopen->sk == NULL) - break; - -- if (!sock_owned_by_user(sk)) { -+ if (!sock_owned_by_user(meta_sk)) { - sk->sk_err = err; - - sk->sk_error_report(sk); -@@ -528,7 +537,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - */ - - inet = inet_sk(sk); -- if (!sock_owned_by_user(sk) && inet->recverr) { -+ if (!sock_owned_by_user(meta_sk) && inet->recverr) { - sk->sk_err = err; - sk->sk_error_report(sk); - } else { /* Only an error on timeout */ -@@ -536,7 +545,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) - } - - out: -- bh_unlock_sock(sk); -+ bh_unlock_sock(meta_sk); - sock_put(sk); - } - -@@ -578,7 +587,7 @@ EXPORT_SYMBOL(tcp_v4_send_check); - * Exception: precedence violation. We do not implement it in any case. - */ - --static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) -+void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) - { - const struct tcphdr *th = tcp_hdr(skb); - struct { -@@ -702,10 +711,10 @@ release_sk1: - outside socket context is ugly, certainly. What can I do? - */ - --static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, -+static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, - u32 win, u32 tsval, u32 tsecr, int oif, - struct tcp_md5sig_key *key, -- int reply_flags, u8 tos) -+ int reply_flags, u8 tos, int mptcp) - { - const struct tcphdr *th = tcp_hdr(skb); - struct { -@@ -714,6 +723,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, - #ifdef CONFIG_TCP_MD5SIG - + (TCPOLEN_MD5SIG_ALIGNED >> 2) - #endif -+#ifdef CONFIG_MPTCP -+ + ((MPTCP_SUB_LEN_DSS >> 2) + -+ (MPTCP_SUB_LEN_ACK >> 2)) -+#endif - ]; - } rep; - struct ip_reply_arg arg; -@@ -758,6 +771,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, - ip_hdr(skb)->daddr, &rep.th); - } - #endif -+#ifdef CONFIG_MPTCP -+ if (mptcp) { -+ int offset = (tsecr) ? 3 : 0; -+ /* Construction of 32-bit data_ack */ -+ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) | -+ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | -+ (0x20 << 8) | -+ (0x01)); -+ rep.opt[offset] = htonl(data_ack); -+ -+ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; -+ rep.th.doff = arg.iov[0].iov_len / 4; -+ } -+#endif /* CONFIG_MPTCP */ -+ - arg.flags = reply_flags; - arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, /* XXX */ -@@ -776,36 +804,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) - { - struct inet_timewait_sock *tw = inet_twsk(sk); - struct tcp_timewait_sock *tcptw = tcp_twsk(sk); -+ u32 data_ack = 0; -+ int mptcp = 0; -+ -+ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) { -+ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; -+ mptcp = 1; -+ } - - tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, -+ data_ack, - tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp + tcptw->tw_ts_offset, - tcptw->tw_ts_recent, - tw->tw_bound_dev_if, - tcp_twsk_md5_key(tcptw), - tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, -- tw->tw_tos -+ tw->tw_tos, mptcp - ); - - inet_twsk_put(tw); - } - --static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, -- struct request_sock *req) -+void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, -+ struct request_sock *req) - { - /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV - * sk->sk_state == TCP_SYN_RECV -> for Fast Open. - */ - tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? - tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, -- tcp_rsk(req)->rcv_nxt, req->rcv_wnd, -+ tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd, - tcp_time_stamp, - req->ts_recent, - 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, - AF_INET), - inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, -- ip_hdr(skb)->tos); -+ ip_hdr(skb)->tos, 0); - } - - /* -@@ -813,10 +849,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, - * This still operates on a request_sock only, not on a big - * socket. - */ --static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, -- struct request_sock *req, -- u16 queue_mapping, -- struct tcp_fastopen_cookie *foc) -+int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, -+ struct flowi *fl, -+ struct request_sock *req, -+ u16 queue_mapping, -+ struct tcp_fastopen_cookie *foc) - { - const struct inet_request_sock *ireq = inet_rsk(req); - struct flowi4 fl4; -@@ -844,21 +881,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, - return err; - } - --static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) --{ -- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); -- -- if (!res) { -- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); -- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); -- } -- return res; --} -- - /* - * IPv4 request_sock destructor. - */ --static void tcp_v4_reqsk_destructor(struct request_sock *req) -+void tcp_v4_reqsk_destructor(struct request_sock *req) - { - kfree(inet_rsk(req)->opt); - } -@@ -896,7 +922,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action); - /* - * Save and compile IPv4 options into the request_sock if needed. - */ --static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) -+struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) - { - const struct ip_options *opt = &(IPCB(skb)->opt); - struct ip_options_rcu *dopt = NULL; -@@ -1237,161 +1263,71 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) - - #endif - -+static int tcp_v4_init_req(struct request_sock *req, struct sock *sk, -+ struct sk_buff *skb) -+{ -+ struct inet_request_sock *ireq = inet_rsk(req); -+ -+ ireq->ir_loc_addr = ip_hdr(skb)->daddr; -+ ireq->ir_rmt_addr = ip_hdr(skb)->saddr; -+ ireq->no_srccheck = inet_sk(sk)->transparent; -+ ireq->opt = tcp_v4_save_options(skb); -+ ireq->ir_mark = inet_request_mark(sk, skb); -+ -+ return 0; -+} -+ -+static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, -+ const struct request_sock *req, -+ bool *strict) -+{ -+ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); -+ -+ if (strict) { -+ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) -+ *strict = true; -+ else -+ *strict = false; -+ } -+ -+ return dst; -+} -+ - struct request_sock_ops tcp_request_sock_ops __read_mostly = { - .family = PF_INET, - .obj_size = sizeof(struct tcp_request_sock), -- .rtx_syn_ack = tcp_v4_rtx_synack, -+ .rtx_syn_ack = tcp_rtx_synack, - .send_ack = tcp_v4_reqsk_send_ack, - .destructor = tcp_v4_reqsk_destructor, - .send_reset = tcp_v4_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, - }; - -+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { -+ .mss_clamp = TCP_MSS_DEFAULT, - #ifdef CONFIG_TCP_MD5SIG --static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { - .md5_lookup = tcp_v4_reqsk_md5_lookup, - .calc_md5_hash = tcp_v4_md5_hash_skb, --}; - #endif -+ .init_req = tcp_v4_init_req, -+#ifdef CONFIG_SYN_COOKIES -+ .cookie_init_seq = cookie_v4_init_sequence, -+#endif -+ .route_req = tcp_v4_route_req, -+ .init_seq = tcp_v4_init_sequence, -+ .send_synack = tcp_v4_send_synack, -+ .queue_hash_add = inet_csk_reqsk_queue_hash_add, -+}; - - int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) - { -- struct tcp_options_received tmp_opt; -- struct request_sock *req; -- struct inet_request_sock *ireq; -- struct tcp_sock *tp = tcp_sk(sk); -- struct dst_entry *dst = NULL; -- __be32 saddr = ip_hdr(skb)->saddr; -- __be32 daddr = ip_hdr(skb)->daddr; -- __u32 isn = TCP_SKB_CB(skb)->when; -- bool want_cookie = false, fastopen; -- struct flowi4 fl4; -- struct tcp_fastopen_cookie foc = { .len = -1 }; -- int err; -- - /* Never answer to SYNs send to broadcast or multicast */ - if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - goto drop; - -- /* TW buckets are converted to open requests without -- * limitations, they conserve resources and peer is -- * evidently real one. -- */ -- if ((sysctl_tcp_syncookies == 2 || -- inet_csk_reqsk_queue_is_full(sk)) && !isn) { -- want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); -- if (!want_cookie) -- goto drop; -- } -- -- /* Accept backlog is full. If we have already queued enough -- * of warm entries in syn queue, drop request. It is better than -- * clogging syn queue with openreqs with exponentially increasing -- * timeout. -- */ -- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { -- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -- goto drop; -- } -- -- req = inet_reqsk_alloc(&tcp_request_sock_ops); -- if (!req) -- goto drop; -- --#ifdef CONFIG_TCP_MD5SIG -- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; --#endif -- -- tcp_clear_options(&tmp_opt); -- tmp_opt.mss_clamp = TCP_MSS_DEFAULT; -- tmp_opt.user_mss = tp->rx_opt.user_mss; -- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); -- -- if (want_cookie && !tmp_opt.saw_tstamp) -- tcp_clear_options(&tmp_opt); -- -- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; -- tcp_openreq_init(req, &tmp_opt, skb); -+ return tcp_conn_request(&tcp_request_sock_ops, -+ &tcp_request_sock_ipv4_ops, sk, skb); - -- ireq = inet_rsk(req); -- ireq->ir_loc_addr = daddr; -- ireq->ir_rmt_addr = saddr; -- ireq->no_srccheck = inet_sk(sk)->transparent; -- ireq->opt = tcp_v4_save_options(skb); -- ireq->ir_mark = inet_request_mark(sk, skb); -- -- if (security_inet_conn_request(sk, skb, req)) -- goto drop_and_free; -- -- if (!want_cookie || tmp_opt.tstamp_ok) -- TCP_ECN_create_request(req, skb, sock_net(sk)); -- -- if (want_cookie) { -- isn = cookie_v4_init_sequence(sk, skb, &req->mss); -- req->cookie_ts = tmp_opt.tstamp_ok; -- } else if (!isn) { -- /* VJ's idea. We save last timestamp seen -- * from the destination in peer table, when entering -- * state TIME-WAIT, and check against it before -- * accepting new connection request. -- * -- * If "isn" is not zero, this request hit alive -- * timewait bucket, so that all the necessary checks -- * are made in the function processing timewait state. -- */ -- if (tmp_opt.saw_tstamp && -- tcp_death_row.sysctl_tw_recycle && -- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && -- fl4.daddr == saddr) { -- if (!tcp_peer_is_proven(req, dst, true)) { -- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); -- goto drop_and_release; -- } -- } -- /* Kill the following clause, if you dislike this way. */ -- else if (!sysctl_tcp_syncookies && -- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < -- (sysctl_max_syn_backlog >> 2)) && -- !tcp_peer_is_proven(req, dst, false)) { -- /* Without syncookies last quarter of -- * backlog is filled with destinations, -- * proven to be alive. -- * It means that we continue to communicate -- * to destinations, already remembered -- * to the moment of synflood. -- */ -- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), -- &saddr, ntohs(tcp_hdr(skb)->source)); -- goto drop_and_release; -- } -- -- isn = tcp_v4_init_sequence(skb); -- } -- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) -- goto drop_and_free; -- -- tcp_rsk(req)->snt_isn = isn; -- tcp_rsk(req)->snt_synack = tcp_time_stamp; -- tcp_openreq_init_rwin(req, sk, dst); -- fastopen = !want_cookie && -- tcp_try_fastopen(sk, skb, req, &foc, dst); -- err = tcp_v4_send_synack(sk, dst, req, -- skb_get_queue_mapping(skb), &foc); -- if (!fastopen) { -- if (err || want_cookie) -- goto drop_and_free; -- -- tcp_rsk(req)->snt_synack = tcp_time_stamp; -- tcp_rsk(req)->listener = NULL; -- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -- } -- -- return 0; -- --drop_and_release: -- dst_release(dst); --drop_and_free: -- reqsk_free(req); - drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - return 0; -@@ -1497,7 +1433,7 @@ put_and_exit: - } - EXPORT_SYMBOL(tcp_v4_syn_recv_sock); - --static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) -+struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) - { - struct tcphdr *th = tcp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); -@@ -1514,8 +1450,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { -+ /* Don't lock again the meta-sk. It has been locked -+ * before mptcp_v4_do_rcv. -+ */ -+ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk)) -+ bh_lock_sock(mptcp_meta_sk(nsk)); - bh_lock_sock(nsk); -+ - return nsk; -+ - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; -@@ -1550,6 +1493,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) - goto discard; - #endif - -+ if (is_meta_sk(sk)) -+ return mptcp_v4_do_rcv(sk, skb); -+ - if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - struct dst_entry *dst = sk->sk_rx_dst; - -@@ -1681,7 +1627,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) - } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { - wake_up_interruptible_sync_poll(sk_sleep(sk), - POLLIN | POLLRDNORM | POLLRDBAND); -- if (!inet_csk_ack_scheduled(sk)) -+ if (!inet_csk_ack_scheduled(sk) && !mptcp(tp)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * tcp_rto_min(sk)) / 4, - TCP_RTO_MAX); -@@ -1698,7 +1644,7 @@ int tcp_v4_rcv(struct sk_buff *skb) - { - const struct iphdr *iph; - const struct tcphdr *th; -- struct sock *sk; -+ struct sock *sk, *meta_sk = NULL; - int ret; - struct net *net = dev_net(skb->dev); - -@@ -1732,18 +1678,42 @@ int tcp_v4_rcv(struct sk_buff *skb) - TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + - skb->len - th->doff * 4); - TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); -+#ifdef CONFIG_MPTCP -+ TCP_SKB_CB(skb)->mptcp_flags = 0; -+ TCP_SKB_CB(skb)->dss_off = 0; -+#endif - TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); - TCP_SKB_CB(skb)->sacked = 0; - - sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); -- if (!sk) -- goto no_tcp_socket; - - process: -- if (sk->sk_state == TCP_TIME_WAIT) -+ if (sk && sk->sk_state == TCP_TIME_WAIT) - goto do_time_wait; - -+#ifdef CONFIG_MPTCP -+ if (!sk && th->syn && !th->ack) { -+ int ret = mptcp_lookup_join(skb, NULL); -+ -+ if (ret < 0) { -+ tcp_v4_send_reset(NULL, skb); -+ goto discard_it; -+ } else if (ret > 0) { -+ return 0; -+ } -+ } -+ -+ /* Is there a pending request sock for this segment ? */ -+ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) { -+ if (sk) -+ sock_put(sk); -+ return 0; -+ } -+#endif -+ if (!sk) -+ goto no_tcp_socket; -+ - if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); - goto discard_and_relse; -@@ -1759,11 +1729,21 @@ process: - sk_mark_napi_id(sk, skb); - skb->dev = NULL; - -- bh_lock_sock_nested(sk); -+ if (mptcp(tcp_sk(sk))) { -+ meta_sk = mptcp_meta_sk(sk); -+ -+ bh_lock_sock_nested(meta_sk); -+ if (sock_owned_by_user(meta_sk)) -+ skb->sk = sk; -+ } else { -+ meta_sk = sk; -+ bh_lock_sock_nested(sk); -+ } -+ - ret = 0; -- if (!sock_owned_by_user(sk)) { -+ if (!sock_owned_by_user(meta_sk)) { - #ifdef CONFIG_NET_DMA -- struct tcp_sock *tp = tcp_sk(sk); -+ struct tcp_sock *tp = tcp_sk(meta_sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) -@@ -1771,16 +1751,16 @@ process: - else - #endif - { -- if (!tcp_prequeue(sk, skb)) -+ if (!tcp_prequeue(meta_sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); - } -- } else if (unlikely(sk_add_backlog(sk, skb, -- sk->sk_rcvbuf + sk->sk_sndbuf))) { -- bh_unlock_sock(sk); -+ } else if (unlikely(sk_add_backlog(meta_sk, skb, -+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { -+ bh_unlock_sock(meta_sk); - NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); - goto discard_and_relse; - } -- bh_unlock_sock(sk); -+ bh_unlock_sock(meta_sk); - - sock_put(sk); - -@@ -1835,6 +1815,18 @@ do_time_wait: - sk = sk2; - goto process; - } -+#ifdef CONFIG_MPTCP -+ if (th->syn && !th->ack) { -+ int ret = mptcp_lookup_join(skb, inet_twsk(sk)); -+ -+ if (ret < 0) { -+ tcp_v4_send_reset(NULL, skb); -+ goto discard_it; -+ } else if (ret > 0) { -+ return 0; -+ } -+ } -+#endif - /* Fall through to ACK */ - } - case TCP_TW_ACK: -@@ -1900,7 +1892,12 @@ static int tcp_v4_init_sock(struct sock *sk) - - tcp_init_sock(sk); - -- icsk->icsk_af_ops = &ipv4_specific; -+#ifdef CONFIG_MPTCP -+ if (is_mptcp_enabled(sk)) -+ icsk->icsk_af_ops = &mptcp_v4_specific; -+ else -+#endif -+ icsk->icsk_af_ops = &ipv4_specific; - - #ifdef CONFIG_TCP_MD5SIG - tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; -@@ -1917,6 +1914,11 @@ void tcp_v4_destroy_sock(struct sock *sk) - - tcp_cleanup_congestion_control(sk); - -+ if (mptcp(tp)) -+ mptcp_destroy_sock(sk); -+ if (tp->inside_tk_table) -+ mptcp_hash_remove(tp); -+ - /* Cleanup up the write buffer. */ - tcp_write_queue_purge(sk); - -@@ -2481,6 +2483,19 @@ void tcp4_proc_exit(void) - } - #endif /* CONFIG_PROC_FS */ - -+#ifdef CONFIG_MPTCP -+static void tcp_v4_clear_sk(struct sock *sk, int size) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ /* we do not want to clear tk_table field, because of RCU lookups */ -+ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table)); -+ -+ size -= offsetof(struct tcp_sock, tk_table) + sizeof(tp->tk_table); -+ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size); -+} -+#endif -+ - struct proto tcp_prot = { - .name = "TCP", - .owner = THIS_MODULE, -@@ -2528,6 +2543,9 @@ struct proto tcp_prot = { - .destroy_cgroup = tcp_destroy_cgroup, - .proto_cgroup = tcp_proto_cgroup, - #endif -+#ifdef CONFIG_MPTCP -+ .clear_sk = tcp_v4_clear_sk, -+#endif - }; - EXPORT_SYMBOL(tcp_prot); - -diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index e68e0d4af6c9..ae6946857dff 100644 ---- a/net/ipv4/tcp_minisocks.c -+++ b/net/ipv4/tcp_minisocks.c -@@ -18,11 +18,13 @@ - * Jorge Cwik, - */ - -+#include - #include - #include - #include - #include - #include -+#include - #include - #include - #include -@@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - struct tcp_options_received tmp_opt; - struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - bool paws_reject = false; -+ struct mptcp_options_received mopt; - - tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { -- tcp_parse_options(skb, &tmp_opt, 0, NULL); -+ mptcp_init_mp_opt(&mopt); -+ -+ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); - - if (tmp_opt.saw_tstamp) { - tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; -@@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - paws_reject = tcp_paws_reject(&tmp_opt, th->rst); - } -+ -+ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) { -+ if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key) -+ goto kill_with_rst; -+ } - } - - if (tw->tw_substate == TCP_FIN_WAIT2) { -@@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - if (!th->ack || - !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || - TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { -+ /* If mptcp_is_data_fin() returns true, we are sure that -+ * mopt has been initialized - otherwise it would not -+ * be a DATA_FIN. -+ */ -+ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw && -+ mptcp_is_data_fin(skb) && -+ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && -+ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt) -+ return TCP_TW_ACK; -+ - inet_twsk_put(tw); - return TCP_TW_SUCCESS; - } -@@ -290,6 +310,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) - tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; - tcptw->tw_ts_offset = tp->tsoffset; - -+ if (mptcp(tp)) { -+ if (mptcp_init_tw_sock(sk, tcptw)) { -+ inet_twsk_free(tw); -+ goto exit; -+ } -+ } else { -+ tcptw->mptcp_tw = NULL; -+ } -+ - #if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); -@@ -347,15 +376,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); - } - -+exit: - tcp_update_metrics(sk); - tcp_done(sk); - } - - void tcp_twsk_destructor(struct sock *sk) - { --#ifdef CONFIG_TCP_MD5SIG - struct tcp_timewait_sock *twsk = tcp_twsk(sk); - -+ if (twsk->mptcp_tw) -+ mptcp_twsk_destructor(twsk); -+#ifdef CONFIG_TCP_MD5SIG - if (twsk->tw_md5_key) - kfree_rcu(twsk->tw_md5_key, rcu); - #endif -@@ -382,13 +414,14 @@ void tcp_openreq_init_rwin(struct request_sock *req, - req->window_clamp = tcp_full_space(sk); - - /* tcp_full_space because it is guaranteed to be the first packet */ -- tcp_select_initial_window(tcp_full_space(sk), -- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), -+ tp->ops->select_initial_window(tcp_full_space(sk), -+ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) - -+ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale, -- dst_metric(dst, RTAX_INITRWND)); -+ dst_metric(dst, RTAX_INITRWND), sk); - ireq->rcv_wscale = rcv_wscale; - } - EXPORT_SYMBOL(tcp_openreq_init_rwin); -@@ -499,6 +532,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, - newtp->rx_opt.ts_recent_stamp = 0; - newtp->tcp_header_len = sizeof(struct tcphdr); - } -+ if (ireq->saw_mpc) -+ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; - newtp->tsoffset = 0; - #ifdef CONFIG_TCP_MD5SIG - newtp->md5sig_info = NULL; /*XXX*/ -@@ -535,16 +570,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, - bool fastopen) - { - struct tcp_options_received tmp_opt; -+ struct mptcp_options_received mopt; - struct sock *child; - const struct tcphdr *th = tcp_hdr(skb); - __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); - bool paws_reject = false; - -- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); -+ BUG_ON(!mptcp(tcp_sk(sk)) && fastopen == (sk->sk_state == TCP_LISTEN)); - - tmp_opt.saw_tstamp = 0; -+ -+ mptcp_init_mp_opt(&mopt); -+ - if (th->doff > (sizeof(struct tcphdr)>>2)) { -- tcp_parse_options(skb, &tmp_opt, 0, NULL); -+ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); - - if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = req->ts_recent; -@@ -583,7 +622,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, - * - * Reset timer after retransmitting SYNACK, similar to - * the idea of fast retransmit in recovery. -+ * -+ * Fall back to TCP if MP_CAPABLE is not set. - */ -+ -+ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc) -+ inet_rsk(req)->saw_mpc = false; -+ -+ - if (!inet_rtx_syn_ack(sk, req)) - req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, - TCP_RTO_MAX) + jiffies; -@@ -718,9 +764,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, - * socket is created, wait for troubles. - */ - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); -+ - if (child == NULL) - goto listen_overflow; - -+ if (!is_meta_sk(sk)) { -+ int ret = mptcp_check_req_master(sk, child, req, prev); -+ if (ret < 0) -+ goto listen_overflow; -+ -+ /* MPTCP-supported */ -+ if (!ret) -+ return tcp_sk(child)->mpcb->master_sk; -+ } else { -+ return mptcp_check_req_child(sk, child, req, prev, &mopt); -+ } - inet_csk_reqsk_queue_unlink(sk, req, prev); - inet_csk_reqsk_queue_removed(sk, req); - -@@ -746,7 +804,17 @@ embryonic_reset: - tcp_reset(sk); - } - if (!fastopen) { -- inet_csk_reqsk_queue_drop(sk, req, prev); -+ if (is_meta_sk(sk)) { -+ /* We want to avoid stoping the keepalive-timer and so -+ * avoid ending up in inet_csk_reqsk_queue_removed ... -+ */ -+ inet_csk_reqsk_queue_unlink(sk, req, prev); -+ if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0) -+ mptcp_delete_synack_timer(sk); -+ reqsk_free(req); -+ } else { -+ inet_csk_reqsk_queue_drop(sk, req, prev); -+ } - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); - } - return NULL; -@@ -770,8 +838,9 @@ int tcp_child_process(struct sock *parent, struct sock *child, - { - int ret = 0; - int state = child->sk_state; -+ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child; - -- if (!sock_owned_by_user(child)) { -+ if (!sock_owned_by_user(meta_sk)) { - ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), - skb->len); - /* Wakeup parent, send SIGIO */ -@@ -782,10 +851,14 @@ int tcp_child_process(struct sock *parent, struct sock *child, - * in main socket hash table and lock on listening - * socket does not protect us more. - */ -- __sk_add_backlog(child, skb); -+ if (mptcp(tcp_sk(child))) -+ skb->sk = child; -+ __sk_add_backlog(meta_sk, skb); - } - -- bh_unlock_sock(child); -+ if (mptcp(tcp_sk(child))) -+ bh_unlock_sock(child); -+ bh_unlock_sock(meta_sk); - sock_put(child); - return ret; - } -diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 179b51e6bda3..efd31b6c5784 100644 ---- a/net/ipv4/tcp_output.c -+++ b/net/ipv4/tcp_output.c -@@ -36,6 +36,12 @@ - - #define pr_fmt(fmt) "TCP: " fmt - -+#include -+#include -+#if IS_ENABLED(CONFIG_IPV6) -+#include -+#endif -+#include - #include - - #include -@@ -68,11 +74,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; - unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; - EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); - --static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, -- int push_one, gfp_t gfp); -- - /* Account for new data that has been sent to the network. */ --static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) -+void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) - { - struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); -@@ -214,7 +217,7 @@ u32 tcp_default_init_rwnd(u32 mss) - void tcp_select_initial_window(int __space, __u32 mss, - __u32 *rcv_wnd, __u32 *window_clamp, - int wscale_ok, __u8 *rcv_wscale, -- __u32 init_rcv_wnd) -+ __u32 init_rcv_wnd, const struct sock *sk) - { - unsigned int space = (__space < 0 ? 0 : __space); - -@@ -269,12 +272,16 @@ EXPORT_SYMBOL(tcp_select_initial_window); - * value can be stuffed directly into th->window for an outgoing - * frame. - */ --static u16 tcp_select_window(struct sock *sk) -+u16 tcp_select_window(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); - u32 old_win = tp->rcv_wnd; -- u32 cur_win = tcp_receive_window(tp); -- u32 new_win = __tcp_select_window(sk); -+ /* The window must never shrink at the meta-level. At the subflow we -+ * have to allow this. Otherwise we may announce a window too large -+ * for the current meta-level sk_rcvbuf. -+ */ -+ u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp); -+ u32 new_win = tp->ops->__select_window(sk); - - /* Never shrink the offered window */ - if (new_win < cur_win) { -@@ -290,6 +297,7 @@ static u16 tcp_select_window(struct sock *sk) - LINUX_MIB_TCPWANTZEROWINDOWADV); - new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); - } -+ - tp->rcv_wnd = new_win; - tp->rcv_wup = tp->rcv_nxt; - -@@ -374,7 +382,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, - /* Constructs common control bits of non-data skb. If SYN/FIN is present, - * auto increment end seqno. - */ --static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) -+void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) - { - struct skb_shared_info *shinfo = skb_shinfo(skb); - -@@ -394,7 +402,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) - TCP_SKB_CB(skb)->end_seq = seq; - } - --static inline bool tcp_urg_mode(const struct tcp_sock *tp) -+bool tcp_urg_mode(const struct tcp_sock *tp) - { - return tp->snd_una != tp->snd_up; - } -@@ -404,17 +412,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) - #define OPTION_MD5 (1 << 2) - #define OPTION_WSCALE (1 << 3) - #define OPTION_FAST_OPEN_COOKIE (1 << 8) -- --struct tcp_out_options { -- u16 options; /* bit field of OPTION_* */ -- u16 mss; /* 0 to disable */ -- u8 ws; /* window scale, 0 to disable */ -- u8 num_sack_blocks; /* number of SACK blocks to include */ -- u8 hash_size; /* bytes in hash_location */ -- __u8 *hash_location; /* temporary pointer, overloaded */ -- __u32 tsval, tsecr; /* need to include OPTION_TS */ -- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ --}; -+/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */ - - /* Write previously computed TCP options to the packet. - * -@@ -430,7 +428,7 @@ struct tcp_out_options { - * (but it may well be that other scenarios fail similarly). - */ - static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, -- struct tcp_out_options *opts) -+ struct tcp_out_options *opts, struct sk_buff *skb) - { - u16 options = opts->options; /* mungable copy */ - -@@ -513,6 +511,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, - } - ptr += (foc->len + 3) >> 2; - } -+ -+ if (unlikely(OPTION_MPTCP & opts->options)) -+ mptcp_options_write(ptr, tp, opts, skb); - } - - /* Compute TCP options for SYN packets. This is not the final -@@ -564,6 +565,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, - if (unlikely(!(OPTION_TS & opts->options))) - remaining -= TCPOLEN_SACKPERM_ALIGNED; - } -+ if (tp->request_mptcp || mptcp(tp)) -+ mptcp_syn_options(sk, opts, &remaining); - - if (fastopen && fastopen->cookie.len >= 0) { - u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; -@@ -637,6 +640,9 @@ static unsigned int tcp_synack_options(struct sock *sk, - } - } - -+ if (ireq->saw_mpc) -+ mptcp_synack_options(req, opts, &remaining); -+ - return MAX_TCP_OPTION_SPACE - remaining; - } - -@@ -670,16 +676,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb - opts->tsecr = tp->rx_opt.ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; - } -+ if (mptcp(tp)) -+ mptcp_established_options(sk, skb, opts, &size); - - eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; - if (unlikely(eff_sacks)) { -- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; -- opts->num_sack_blocks = -- min_t(unsigned int, eff_sacks, -- (remaining - TCPOLEN_SACK_BASE_ALIGNED) / -- TCPOLEN_SACK_PERBLOCK); -- size += TCPOLEN_SACK_BASE_ALIGNED + -- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; -+ const unsigned remaining = MAX_TCP_OPTION_SPACE - size; -+ if (remaining < TCPOLEN_SACK_BASE_ALIGNED) -+ opts->num_sack_blocks = 0; -+ else -+ opts->num_sack_blocks = -+ min_t(unsigned int, eff_sacks, -+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) / -+ TCPOLEN_SACK_PERBLOCK); -+ if (opts->num_sack_blocks) -+ size += TCPOLEN_SACK_BASE_ALIGNED + -+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; - } - - return size; -@@ -711,8 +723,8 @@ static void tcp_tsq_handler(struct sock *sk) - if ((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | - TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) -- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, -- 0, GFP_ATOMIC); -+ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk), -+ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC); - } - /* - * One tasklet per cpu tries to send more skbs. -@@ -727,7 +739,7 @@ static void tcp_tasklet_func(unsigned long data) - unsigned long flags; - struct list_head *q, *n; - struct tcp_sock *tp; -- struct sock *sk; -+ struct sock *sk, *meta_sk; - - local_irq_save(flags); - list_splice_init(&tsq->head, &list); -@@ -738,15 +750,25 @@ static void tcp_tasklet_func(unsigned long data) - list_del(&tp->tsq_node); - - sk = (struct sock *)tp; -- bh_lock_sock(sk); -+ meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; -+ bh_lock_sock(meta_sk); - -- if (!sock_owned_by_user(sk)) { -+ if (!sock_owned_by_user(meta_sk)) { - tcp_tsq_handler(sk); -+ if (mptcp(tp)) -+ tcp_tsq_handler(meta_sk); - } else { -+ if (mptcp(tp) && sk->sk_state == TCP_CLOSE) -+ goto exit; -+ - /* defer the work to tcp_release_cb() */ - set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); -+ -+ if (mptcp(tp)) -+ mptcp_tsq_flags(sk); - } -- bh_unlock_sock(sk); -+exit: -+ bh_unlock_sock(meta_sk); - - clear_bit(TSQ_QUEUED, &tp->tsq_flags); - sk_free(sk); -@@ -756,7 +778,10 @@ static void tcp_tasklet_func(unsigned long data) - #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ - (1UL << TCP_WRITE_TIMER_DEFERRED) | \ - (1UL << TCP_DELACK_TIMER_DEFERRED) | \ -- (1UL << TCP_MTU_REDUCED_DEFERRED)) -+ (1UL << TCP_MTU_REDUCED_DEFERRED) | \ -+ (1UL << MPTCP_PATH_MANAGER) | \ -+ (1UL << MPTCP_SUB_DEFERRED)) -+ - /** - * tcp_release_cb - tcp release_sock() callback - * @sk: socket -@@ -803,6 +828,13 @@ void tcp_release_cb(struct sock *sk) - sk->sk_prot->mtu_reduced(sk); - __sock_put(sk); - } -+ if (flags & (1UL << MPTCP_PATH_MANAGER)) { -+ if (tcp_sk(sk)->mpcb->pm_ops->release_sock) -+ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk); -+ __sock_put(sk); -+ } -+ if (flags & (1UL << MPTCP_SUB_DEFERRED)) -+ mptcp_tsq_sub_deferred(sk); - } - EXPORT_SYMBOL(tcp_release_cb); - -@@ -862,8 +894,8 @@ void tcp_wfree(struct sk_buff *skb) - * We are working here with either a clone of the original - * SKB, or a fresh unique copy made by the retransmit engine. - */ --static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, -- gfp_t gfp_mask) -+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, -+ gfp_t gfp_mask) - { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet; -@@ -933,7 +965,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - */ - th->window = htons(min(tp->rcv_wnd, 65535U)); - } else { -- th->window = htons(tcp_select_window(sk)); -+ th->window = htons(tp->ops->select_window(sk)); - } - th->check = 0; - th->urg_ptr = 0; -@@ -949,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - } - } - -- tcp_options_write((__be32 *)(th + 1), tp, &opts); -+ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); - if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) - TCP_ECN_send(sk, skb, tcp_header_size); - -@@ -988,7 +1020,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, - * otherwise socket can stall. - */ --static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) -+void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) - { - struct tcp_sock *tp = tcp_sk(sk); - -@@ -1001,15 +1033,16 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) - } - - /* Initialize TSO segments for a packet. */ --static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, -- unsigned int mss_now) -+void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now) - { - struct skb_shared_info *shinfo = skb_shinfo(skb); - - /* Make sure we own this skb before messing gso_size/gso_segs */ - WARN_ON_ONCE(skb_cloned(skb)); - -- if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { -+ if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) || -+ (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) { - /* Avoid the costly divide in the normal - * non-TSO case. - */ -@@ -1041,7 +1074,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, - /* Pcount in the middle of the write queue got changed, we need to do various - * tweaks to fix counters - */ --static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) -+void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) - { - struct tcp_sock *tp = tcp_sk(sk); - -@@ -1164,7 +1197,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, - * eventually). The difference is that pulled data not copied, but - * immediately discarded. - */ --static void __pskb_trim_head(struct sk_buff *skb, int len) -+void __pskb_trim_head(struct sk_buff *skb, int len) - { - struct skb_shared_info *shinfo; - int i, k, eat; -@@ -1205,6 +1238,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) - /* Remove acked data from a packet in the transmit queue. */ - int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) - { -+ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && mptcp_is_data_seq(skb)) -+ return mptcp_trim_head(sk, skb, len); -+ - if (skb_unclone(skb, GFP_ATOMIC)) - return -ENOMEM; - -@@ -1222,6 +1258,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) - if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); - -+#ifdef CONFIG_MPTCP -+ /* Some data got acked - we assume that the seq-number reached the dest. -+ * Anyway, our MPTCP-option has been trimmed above - we lost it here. -+ * Only remove the SEQ if the call does not come from a meta retransmit. -+ */ -+ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk)) -+ TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ; -+#endif -+ - return 0; - } - -@@ -1379,6 +1424,7 @@ unsigned int tcp_current_mss(struct sock *sk) - - return mss_now; - } -+EXPORT_SYMBOL(tcp_current_mss); - - /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, -@@ -1446,8 +1492,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp) - * But we can avoid doing the divide again given we already have - * skb_pcount = skb->len / mss_now - */ --static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, -- const struct sk_buff *skb) -+void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, -+ const struct sk_buff *skb) - { - if (skb->len < tcp_skb_pcount(skb) * mss_now) - tp->snd_sml = TCP_SKB_CB(skb)->end_seq; -@@ -1468,11 +1514,11 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, - (!nonagle && tp->packets_out && tcp_minshall_check(tp))); - } - /* Returns the portion of skb which can be sent right away */ --static unsigned int tcp_mss_split_point(const struct sock *sk, -- const struct sk_buff *skb, -- unsigned int mss_now, -- unsigned int max_segs, -- int nonagle) -+unsigned int tcp_mss_split_point(const struct sock *sk, -+ const struct sk_buff *skb, -+ unsigned int mss_now, -+ unsigned int max_segs, -+ int nonagle) - { - const struct tcp_sock *tp = tcp_sk(sk); - u32 partial, needed, window, max_len; -@@ -1502,13 +1548,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, - /* Can at least one segment of SKB be sent right now, according to the - * congestion window rules? If so, return how many segments are allowed. - */ --static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, -- const struct sk_buff *skb) -+unsigned int tcp_cwnd_test(const struct tcp_sock *tp, -+ const struct sk_buff *skb) - { - u32 in_flight, cwnd; - - /* Don't be strict about the congestion window for the final FIN. */ -- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && -+ if (skb && -+ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tcp_skb_pcount(skb) == 1) - return 1; - -@@ -1524,8 +1571,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, - * This must be invoked the first time we consider transmitting - * SKB onto the wire. - */ --static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, -- unsigned int mss_now) -+int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now) - { - int tso_segs = tcp_skb_pcount(skb); - -@@ -1540,8 +1587,8 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, - /* Return true if the Nagle test allows this packet to be - * sent now. - */ --static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, -- unsigned int cur_mss, int nonagle) -+bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, -+ unsigned int cur_mss, int nonagle) - { - /* Nagle rule does not apply to frames, which sit in the middle of the - * write_queue (they have no chances to get new data). -@@ -1553,7 +1600,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf - return true; - - /* Don't use the nagle rule for urgent data (or for the final FIN). */ -- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) -+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || -+ mptcp_is_data_fin(skb)) - return true; - - if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) -@@ -1563,9 +1611,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf - } - - /* Does at least the first segment of SKB fit into the send window? */ --static bool tcp_snd_wnd_test(const struct tcp_sock *tp, -- const struct sk_buff *skb, -- unsigned int cur_mss) -+bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, -+ unsigned int cur_mss) - { - u32 end_seq = TCP_SKB_CB(skb)->end_seq; - -@@ -1676,7 +1723,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - u32 send_win, cong_win, limit, in_flight; - int win_divisor; - -- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) -+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) - goto send_now; - - if (icsk->icsk_ca_state != TCP_CA_Open) -@@ -1888,7 +1935,7 @@ static int tcp_mtu_probe(struct sock *sk) - * Returns true, if no segments are in flight and we have queued segments, - * but cannot send anything now because of SWS or another problem. - */ --static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, -+bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - int push_one, gfp_t gfp) - { - struct tcp_sock *tp = tcp_sk(sk); -@@ -1900,7 +1947,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - - sent_pkts = 0; - -- if (!push_one) { -+ /* pmtu not yet supported with MPTCP. Should be possible, by early -+ * exiting the loop inside tcp_mtu_probe, making sure that only one -+ * single DSS-mapping gets probed. -+ */ -+ if (!push_one && !mptcp(tp)) { - /* Do MTU probing. */ - result = tcp_mtu_probe(sk); - if (!result) { -@@ -2099,7 +2150,8 @@ void tcp_send_loss_probe(struct sock *sk) - int err = -1; - - if (tcp_send_head(sk) != NULL) { -- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); -+ err = tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2, -+ GFP_ATOMIC); - goto rearm_timer; - } - -@@ -2159,8 +2211,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, - if (unlikely(sk->sk_state == TCP_CLOSE)) - return; - -- if (tcp_write_xmit(sk, cur_mss, nonagle, 0, -- sk_gfp_atomic(sk, GFP_ATOMIC))) -+ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0, -+ sk_gfp_atomic(sk, GFP_ATOMIC))) - tcp_check_probe_timer(sk); - } - -@@ -2173,7 +2225,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) - - BUG_ON(!skb || skb->len < mss_now); - -- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); -+ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, -+ sk->sk_allocation); - } - - /* This function returns the amount that we can raise the -@@ -2386,6 +2439,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) - return; - -+ /* Currently not supported for MPTCP - but it should be possible */ -+ if (mptcp(tp)) -+ return; -+ - tcp_for_write_queue_from_safe(skb, tmp, sk) { - if (!tcp_can_collapse(sk, skb)) - break; -@@ -2843,7 +2900,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, - - /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(min(req->rcv_wnd, 65535U)); -- tcp_options_write((__be32 *)(th + 1), tp, &opts); -+ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); - th->doff = (tcp_header_size >> 2); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); - -@@ -2897,13 +2954,13 @@ static void tcp_connect_init(struct sock *sk) - (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) - tp->window_clamp = tcp_full_space(sk); - -- tcp_select_initial_window(tcp_full_space(sk), -- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), -- &tp->rcv_wnd, -- &tp->window_clamp, -- sysctl_tcp_window_scaling, -- &rcv_wscale, -- dst_metric(dst, RTAX_INITRWND)); -+ tp->ops->select_initial_window(tcp_full_space(sk), -+ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), -+ &tp->rcv_wnd, -+ &tp->window_clamp, -+ sysctl_tcp_window_scaling, -+ &rcv_wscale, -+ dst_metric(dst, RTAX_INITRWND), sk); - - tp->rx_opt.rcv_wscale = rcv_wscale; - tp->rcv_ssthresh = tp->rcv_wnd; -@@ -2927,6 +2984,36 @@ static void tcp_connect_init(struct sock *sk) - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; - inet_csk(sk)->icsk_retransmits = 0; - tcp_clear_retrans(tp); -+ -+#ifdef CONFIG_MPTCP -+ if (sysctl_mptcp_enabled && mptcp_doit(sk)) { -+ if (is_master_tp(tp)) { -+ tp->request_mptcp = 1; -+ mptcp_connect_init(sk); -+ } else if (tp->mptcp) { -+ struct inet_sock *inet = inet_sk(sk); -+ -+ tp->mptcp->snt_isn = tp->write_seq; -+ tp->mptcp->init_rcv_wnd = tp->rcv_wnd; -+ -+ /* Set nonce for new subflows */ -+ if (sk->sk_family == AF_INET) -+ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce( -+ inet->inet_saddr, -+ inet->inet_daddr, -+ inet->inet_sport, -+ inet->inet_dport); -+#if IS_ENABLED(CONFIG_IPV6) -+ else -+ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce( -+ inet6_sk(sk)->saddr.s6_addr32, -+ sk->sk_v6_daddr.s6_addr32, -+ inet->inet_sport, -+ inet->inet_dport); -+#endif -+ } -+ } -+#endif - } - - static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) -@@ -3176,6 +3263,7 @@ void tcp_send_ack(struct sock *sk) - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); - } -+EXPORT_SYMBOL(tcp_send_ack); - - /* This routine sends a packet with an out of date sequence - * number. It assumes the other end will try to ack it. -@@ -3188,7 +3276,7 @@ void tcp_send_ack(struct sock *sk) - * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is - * out-of-date with SND.UNA-1 to probe window. - */ --static int tcp_xmit_probe_skb(struct sock *sk, int urgent) -+int tcp_xmit_probe_skb(struct sock *sk, int urgent) - { - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; -@@ -3270,7 +3358,7 @@ void tcp_send_probe0(struct sock *sk) - struct tcp_sock *tp = tcp_sk(sk); - int err; - -- err = tcp_write_wakeup(sk); -+ err = tp->ops->write_wakeup(sk); - - if (tp->packets_out || !tcp_send_head(sk)) { - /* Cancel probe timer, if it is not required. */ -@@ -3301,3 +3389,18 @@ void tcp_send_probe0(struct sock *sk) - TCP_RTO_MAX); - } - } -+ -+int tcp_rtx_synack(struct sock *sk, struct request_sock *req) -+{ -+ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; -+ struct flowi fl; -+ int res; -+ -+ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); -+ if (!res) { -+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); -+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); -+ } -+ return res; -+} -+EXPORT_SYMBOL(tcp_rtx_synack); -diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 286227abed10..966b873cbf3e 100644 ---- a/net/ipv4/tcp_timer.c -+++ b/net/ipv4/tcp_timer.c -@@ -20,6 +20,7 @@ - - #include - #include -+#include - #include - - int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; -@@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; - int sysctl_tcp_orphan_retries __read_mostly; - int sysctl_tcp_thin_linear_timeouts __read_mostly; - --static void tcp_write_err(struct sock *sk) -+void tcp_write_err(struct sock *sk) - { - sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; - sk->sk_error_report(sk); -@@ -74,7 +75,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) - (!tp->snd_wnd && !tp->packets_out)) - do_reset = 1; - if (do_reset) -- tcp_send_active_reset(sk, GFP_ATOMIC); -+ tp->ops->send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); - return 1; -@@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) - * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if - * syn_set flag is set. - */ --static bool retransmits_timed_out(struct sock *sk, -- unsigned int boundary, -- unsigned int timeout, -- bool syn_set) -+bool retransmits_timed_out(struct sock *sk, unsigned int boundary, -+ unsigned int timeout, bool syn_set) - { - unsigned int linear_backoff_thresh, start_ts; - unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; -@@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk, - } - - /* A write timeout has occurred. Process the after effects. */ --static int tcp_write_timeout(struct sock *sk) -+int tcp_write_timeout(struct sock *sk) - { - struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); -@@ -171,6 +170,10 @@ static int tcp_write_timeout(struct sock *sk) - } - retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; - syn_set = true; -+ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */ -+ if (tcp_sk(sk)->request_mptcp && -+ icsk->icsk_retransmits >= mptcp_sysctl_syn_retries()) -+ tcp_sk(sk)->request_mptcp = 0; - } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { - /* Black hole detection */ -@@ -251,18 +254,22 @@ out: - static void tcp_delack_timer(unsigned long data) - { - struct sock *sk = (struct sock *)data; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; - -- bh_lock_sock(sk); -- if (!sock_owned_by_user(sk)) { -+ bh_lock_sock(meta_sk); -+ if (!sock_owned_by_user(meta_sk)) { - tcp_delack_timer_handler(sk); - } else { - inet_csk(sk)->icsk_ack.blocked = 1; -- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); -+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED); - /* deleguate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) - sock_hold(sk); -+ if (mptcp(tp)) -+ mptcp_tsq_flags(sk); - } -- bh_unlock_sock(sk); -+ bh_unlock_sock(meta_sk); - sock_put(sk); - } - -@@ -479,6 +486,10 @@ out_reset_timer: - __sk_dst_reset(sk); - - out:; -+ if (mptcp(tp)) { -+ mptcp_reinject_data(sk, 1); -+ mptcp_set_rto(sk); -+ } - } - - void tcp_write_timer_handler(struct sock *sk) -@@ -505,7 +516,7 @@ void tcp_write_timer_handler(struct sock *sk) - break; - case ICSK_TIME_RETRANS: - icsk->icsk_pending = 0; -- tcp_retransmit_timer(sk); -+ tcp_sk(sk)->ops->retransmit_timer(sk); - break; - case ICSK_TIME_PROBE0: - icsk->icsk_pending = 0; -@@ -520,16 +531,19 @@ out: - static void tcp_write_timer(unsigned long data) - { - struct sock *sk = (struct sock *)data; -+ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk; - -- bh_lock_sock(sk); -- if (!sock_owned_by_user(sk)) { -+ bh_lock_sock(meta_sk); -+ if (!sock_owned_by_user(meta_sk)) { - tcp_write_timer_handler(sk); - } else { - /* deleguate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) - sock_hold(sk); -+ if (mptcp(tcp_sk(sk))) -+ mptcp_tsq_flags(sk); - } -- bh_unlock_sock(sk); -+ bh_unlock_sock(meta_sk); - sock_put(sk); - } - -@@ -566,11 +580,12 @@ static void tcp_keepalive_timer (unsigned long data) - struct sock *sk = (struct sock *) data; - struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); -+ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; - u32 elapsed; - - /* Only process if socket is not in use. */ -- bh_lock_sock(sk); -- if (sock_owned_by_user(sk)) { -+ bh_lock_sock(meta_sk); -+ if (sock_owned_by_user(meta_sk)) { - /* Try again later. */ - inet_csk_reset_keepalive_timer (sk, HZ/20); - goto out; -@@ -581,16 +596,38 @@ static void tcp_keepalive_timer (unsigned long data) - goto out; - } - -+ if (tp->send_mp_fclose) { -+ /* MUST do this before tcp_write_timeout, because retrans_stamp -+ * may have been set to 0 in another part while we are -+ * retransmitting MP_FASTCLOSE. Then, we would crash, because -+ * retransmits_timed_out accesses the meta-write-queue. -+ * -+ * We make sure that the timestamp is != 0. -+ */ -+ if (!tp->retrans_stamp) -+ tp->retrans_stamp = tcp_time_stamp ? : 1; -+ -+ if (tcp_write_timeout(sk)) -+ goto out; -+ -+ tcp_send_ack(sk); -+ icsk->icsk_retransmits++; -+ -+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); -+ elapsed = icsk->icsk_rto; -+ goto resched; -+ } -+ - if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { - if (tp->linger2 >= 0) { - const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; - - if (tmo > 0) { -- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); -+ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo); - goto out; - } - } -- tcp_send_active_reset(sk, GFP_ATOMIC); -+ tp->ops->send_active_reset(sk, GFP_ATOMIC); - goto death; - } - -@@ -614,11 +651,11 @@ static void tcp_keepalive_timer (unsigned long data) - icsk->icsk_probes_out > 0) || - (icsk->icsk_user_timeout == 0 && - icsk->icsk_probes_out >= keepalive_probes(tp))) { -- tcp_send_active_reset(sk, GFP_ATOMIC); -+ tp->ops->send_active_reset(sk, GFP_ATOMIC); - tcp_write_err(sk); - goto out; - } -- if (tcp_write_wakeup(sk) <= 0) { -+ if (tp->ops->write_wakeup(sk) <= 0) { - icsk->icsk_probes_out++; - elapsed = keepalive_intvl_when(tp); - } else { -@@ -642,7 +679,7 @@ death: - tcp_done(sk); - - out: -- bh_unlock_sock(sk); -+ bh_unlock_sock(meta_sk); - sock_put(sk); - } - -diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c -index 5667b3003af9..7139c2973fd2 100644 ---- a/net/ipv6/addrconf.c -+++ b/net/ipv6/addrconf.c -@@ -760,6 +760,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) - - kfree_rcu(ifp, rcu); - } -+EXPORT_SYMBOL(inet6_ifa_finish_destroy); - - static void - ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) -diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c -index 7cb4392690dd..7057afbca4df 100644 ---- a/net/ipv6/af_inet6.c -+++ b/net/ipv6/af_inet6.c -@@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) - return (struct ipv6_pinfo *)(((u8 *)sk) + offset); - } - --static int inet6_create(struct net *net, struct socket *sock, int protocol, -- int kern) -+int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) - { - struct inet_sock *inet; - struct ipv6_pinfo *np; -diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c -index a245e5ddffbd..99c892b8992d 100644 ---- a/net/ipv6/inet6_connection_sock.c -+++ b/net/ipv6/inet6_connection_sock.c -@@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, - /* - * request_sock (formerly open request) hash tables. - */ --static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, -- const u32 rnd, const u32 synq_hsize) -+u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, -+ const u32 rnd, const u32 synq_hsize) - { - u32 c; - -diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c -index edb58aff4ae7..ea4d9fda0927 100644 ---- a/net/ipv6/ipv6_sockglue.c -+++ b/net/ipv6/ipv6_sockglue.c -@@ -48,6 +48,8 @@ - #include - #include - #include -+#include -+#include - #include - #include - #include -@@ -196,7 +198,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, - sock_prot_inuse_add(net, &tcp_prot, 1); - local_bh_enable(); - sk->sk_prot = &tcp_prot; -- icsk->icsk_af_ops = &ipv4_specific; -+#ifdef CONFIG_MPTCP -+ if (is_mptcp_enabled(sk)) -+ icsk->icsk_af_ops = &mptcp_v4_specific; -+ else -+#endif -+ icsk->icsk_af_ops = &ipv4_specific; - sk->sk_socket->ops = &inet_stream_ops; - sk->sk_family = PF_INET; - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); -diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c -index a822b880689b..b2b38869d795 100644 ---- a/net/ipv6/syncookies.c -+++ b/net/ipv6/syncookies.c -@@ -181,13 +181,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) - - /* check for timestamp cookie support */ - memset(&tcp_opt, 0, sizeof(tcp_opt)); -- tcp_parse_options(skb, &tcp_opt, 0, NULL); -+ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL); - - if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) - goto out; - - ret = NULL; -- req = inet6_reqsk_alloc(&tcp6_request_sock_ops); -+ req = inet_reqsk_alloc(&tcp6_request_sock_ops); - if (!req) - goto out; - -@@ -255,10 +255,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) - } - - req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); -- tcp_select_initial_window(tcp_full_space(sk), req->mss, -- &req->rcv_wnd, &req->window_clamp, -- ireq->wscale_ok, &rcv_wscale, -- dst_metric(dst, RTAX_INITRWND)); -+ tp->ops->select_initial_window(tcp_full_space(sk), req->mss, -+ &req->rcv_wnd, &req->window_clamp, -+ ireq->wscale_ok, &rcv_wscale, -+ dst_metric(dst, RTAX_INITRWND), sk); - - ireq->rcv_wscale = rcv_wscale; - -diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c -index 229239ad96b1..fda94d71666e 100644 ---- a/net/ipv6/tcp_ipv6.c -+++ b/net/ipv6/tcp_ipv6.c -@@ -63,6 +63,8 @@ - #include - #include - #include -+#include -+#include - #include - - #include -@@ -71,12 +73,6 @@ - #include - #include - --static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); --static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, -- struct request_sock *req); -- --static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -- - static const struct inet_connection_sock_af_ops ipv6_mapped; - static const struct inet_connection_sock_af_ops ipv6_specific; - #ifdef CONFIG_TCP_MD5SIG -@@ -90,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, - } - #endif - --static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) -+void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) - { - struct dst_entry *dst = skb_dst(skb); - const struct rt6_info *rt = (const struct rt6_info *)dst; -@@ -102,10 +98,11 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; - } - --static void tcp_v6_hash(struct sock *sk) -+void tcp_v6_hash(struct sock *sk) - { - if (sk->sk_state != TCP_CLOSE) { -- if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { -+ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped || -+ inet_csk(sk)->icsk_af_ops == &mptcp_v6_mapped) { - tcp_prot.hash(sk); - return; - } -@@ -115,7 +112,7 @@ static void tcp_v6_hash(struct sock *sk) - } - } - --static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) -+__u32 tcp_v6_init_sequence(const struct sk_buff *skb) - { - return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, -@@ -123,7 +120,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) - tcp_hdr(skb)->source); - } - --static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, -+int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) - { - struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; -@@ -215,7 +212,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, - sin.sin_port = usin->sin6_port; - sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - -- icsk->icsk_af_ops = &ipv6_mapped; -+#ifdef CONFIG_MPTCP -+ if (is_mptcp_enabled(sk)) -+ icsk->icsk_af_ops = &mptcp_v6_mapped; -+ else -+#endif -+ icsk->icsk_af_ops = &ipv6_mapped; - sk->sk_backlog_rcv = tcp_v4_do_rcv; - #ifdef CONFIG_TCP_MD5SIG - tp->af_specific = &tcp_sock_ipv6_mapped_specific; -@@ -225,7 +227,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, - - if (err) { - icsk->icsk_ext_hdr_len = exthdrlen; -- icsk->icsk_af_ops = &ipv6_specific; -+#ifdef CONFIG_MPTCP -+ if (is_mptcp_enabled(sk)) -+ icsk->icsk_af_ops = &mptcp_v6_specific; -+ else -+#endif -+ icsk->icsk_af_ops = &ipv6_specific; - sk->sk_backlog_rcv = tcp_v6_do_rcv; - #ifdef CONFIG_TCP_MD5SIG - tp->af_specific = &tcp_sock_ipv6_specific; -@@ -337,7 +344,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; - const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); - struct ipv6_pinfo *np; -- struct sock *sk; -+ struct sock *sk, *meta_sk; - int err; - struct tcp_sock *tp; - struct request_sock *fastopen; -@@ -358,8 +365,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - return; - } - -- bh_lock_sock(sk); -- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) -+ tp = tcp_sk(sk); -+ if (mptcp(tp)) -+ meta_sk = mptcp_meta_sk(sk); -+ else -+ meta_sk = sk; -+ -+ bh_lock_sock(meta_sk); -+ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == TCP_CLOSE) -@@ -370,7 +383,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - goto out; - } - -- tp = tcp_sk(sk); - seq = ntohl(th->seq); - /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; -@@ -403,11 +415,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - goto out; - - tp->mtu_info = ntohl(info); -- if (!sock_owned_by_user(sk)) -+ if (!sock_owned_by_user(meta_sk)) - tcp_v6_mtu_reduced(sk); -- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, -+ else { -+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, - &tp->tsq_flags)) -- sock_hold(sk); -+ sock_hold(sk); -+ if (mptcp(tp)) -+ mptcp_tsq_flags(sk); -+ } - goto out; - } - -@@ -417,7 +433,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - switch (sk->sk_state) { - struct request_sock *req, **prev; - case TCP_LISTEN: -- if (sock_owned_by_user(sk)) -+ if (sock_owned_by_user(meta_sk)) - goto out; - - req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, -@@ -447,7 +463,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - if (fastopen && fastopen->sk == NULL) - break; - -- if (!sock_owned_by_user(sk)) { -+ if (!sock_owned_by_user(meta_sk)) { - sk->sk_err = err; - sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - -@@ -457,26 +473,27 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - goto out; - } - -- if (!sock_owned_by_user(sk) && np->recverr) { -+ if (!sock_owned_by_user(meta_sk) && np->recverr) { - sk->sk_err = err; - sk->sk_error_report(sk); - } else - sk->sk_err_soft = err; - - out: -- bh_unlock_sock(sk); -+ bh_unlock_sock(meta_sk); - sock_put(sk); - } - - --static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, -- struct flowi6 *fl6, -- struct request_sock *req, -- u16 queue_mapping, -- struct tcp_fastopen_cookie *foc) -+int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, -+ struct flowi *fl, -+ struct request_sock *req, -+ u16 queue_mapping, -+ struct tcp_fastopen_cookie *foc) - { - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); -+ struct flowi6 *fl6 = &fl->u.ip6; - struct sk_buff *skb; - int err = -ENOMEM; - -@@ -497,18 +514,21 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, - skb_set_queue_mapping(skb, queue_mapping); - err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); - err = net_xmit_eval(err); -+ if (!tcp_rsk(req)->snt_synack && !err) -+ tcp_rsk(req)->snt_synack = tcp_time_stamp; - } - - done: - return err; - } - --static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) -+int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) - { -- struct flowi6 fl6; -+ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; -+ struct flowi fl; - int res; - -- res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); -+ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); - if (!res) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); -@@ -516,7 +536,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) - return res; - } - --static void tcp_v6_reqsk_destructor(struct request_sock *req) -+void tcp_v6_reqsk_destructor(struct request_sock *req) - { - kfree_skb(inet_rsk(req)->pktopts); - } -@@ -718,27 +738,74 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) - } - #endif - -+static int tcp_v6_init_req(struct request_sock *req, struct sock *sk, -+ struct sk_buff *skb) -+{ -+ struct inet_request_sock *ireq = inet_rsk(req); -+ struct ipv6_pinfo *np = inet6_sk(sk); -+ -+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; -+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; -+ -+ ireq->ir_iif = sk->sk_bound_dev_if; -+ ireq->ir_mark = inet_request_mark(sk, skb); -+ -+ /* So that link locals have meaning */ -+ if (!sk->sk_bound_dev_if && -+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) -+ ireq->ir_iif = inet6_iif(skb); -+ -+ if (!TCP_SKB_CB(skb)->when && -+ (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || -+ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || -+ np->rxopt.bits.rxohlim || np->repflow)) { -+ atomic_inc(&skb->users); -+ ireq->pktopts = skb; -+ } -+ -+ return 0; -+} -+ -+static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, -+ const struct request_sock *req, -+ bool *strict) -+{ -+ if (strict) -+ *strict = true; -+ return inet6_csk_route_req(sk, &fl->u.ip6, req); -+} -+ - struct request_sock_ops tcp6_request_sock_ops __read_mostly = { - .family = AF_INET6, - .obj_size = sizeof(struct tcp6_request_sock), -- .rtx_syn_ack = tcp_v6_rtx_synack, -+ .rtx_syn_ack = tcp_rtx_synack, - .send_ack = tcp_v6_reqsk_send_ack, - .destructor = tcp_v6_reqsk_destructor, - .send_reset = tcp_v6_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, - }; - -+const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { -+ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - -+ sizeof(struct ipv6hdr), - #ifdef CONFIG_TCP_MD5SIG --static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { - .md5_lookup = tcp_v6_reqsk_md5_lookup, - .calc_md5_hash = tcp_v6_md5_hash_skb, --}; - #endif -+ .init_req = tcp_v6_init_req, -+#ifdef CONFIG_SYN_COOKIES -+ .cookie_init_seq = cookie_v6_init_sequence, -+#endif -+ .route_req = tcp_v6_route_req, -+ .init_seq = tcp_v6_init_sequence, -+ .send_synack = tcp_v6_send_synack, -+ .queue_hash_add = inet6_csk_reqsk_queue_hash_add, -+}; - --static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, -- u32 tsval, u32 tsecr, int oif, -- struct tcp_md5sig_key *key, int rst, u8 tclass, -- u32 label) -+static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, -+ u32 data_ack, u32 win, u32 tsval, u32 tsecr, -+ int oif, struct tcp_md5sig_key *key, int rst, -+ u8 tclass, u32 label, int mptcp) - { - const struct tcphdr *th = tcp_hdr(skb); - struct tcphdr *t1; -@@ -756,7 +823,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, - if (key) - tot_len += TCPOLEN_MD5SIG_ALIGNED; - #endif -- -+#ifdef CONFIG_MPTCP -+ if (mptcp) -+ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; -+#endif - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, - GFP_ATOMIC); - if (buff == NULL) -@@ -794,6 +864,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, - tcp_v6_md5_hash_hdr((__u8 *)topt, key, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, t1); -+ topt += 4; -+ } -+#endif -+#ifdef CONFIG_MPTCP -+ if (mptcp) { -+ /* Construction of 32-bit data_ack */ -+ *topt++ = htonl((TCPOPT_MPTCP << 24) | -+ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | -+ (0x20 << 8) | -+ (0x01)); -+ *topt++ = htonl(data_ack); - } - #endif - -@@ -834,7 +915,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, - kfree_skb(buff); - } - --static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) -+void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) - { - const struct tcphdr *th = tcp_hdr(skb); - u32 seq = 0, ack_seq = 0; -@@ -891,7 +972,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) - (th->doff << 2); - - oif = sk ? sk->sk_bound_dev_if : 0; -- tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); -+ tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0); - - #ifdef CONFIG_TCP_MD5SIG - release_sk1: -@@ -902,45 +983,52 @@ release_sk1: - #endif - } - --static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, -+static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, - u32 win, u32 tsval, u32 tsecr, int oif, - struct tcp_md5sig_key *key, u8 tclass, -- u32 label) -+ u32 label, int mptcp) - { -- tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass, -- label); -+ tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, oif, -+ key, 0, tclass, label, mptcp); - } - - static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) - { - struct inet_timewait_sock *tw = inet_twsk(sk); - struct tcp_timewait_sock *tcptw = tcp_twsk(sk); -+ u32 data_ack = 0; -+ int mptcp = 0; - -+ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) { -+ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; -+ mptcp = 1; -+ } - tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, -+ data_ack, - tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp + tcptw->tw_ts_offset, - tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), -- tw->tw_tclass, (tw->tw_flowlabel << 12)); -+ tw->tw_tclass, (tw->tw_flowlabel << 12), mptcp); - - inet_twsk_put(tw); - } - --static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, -- struct request_sock *req) -+void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, -+ struct request_sock *req) - { - /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV - * sk->sk_state == TCP_SYN_RECV -> for Fast Open. - */ - tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? - tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, -- tcp_rsk(req)->rcv_nxt, -+ tcp_rsk(req)->rcv_nxt, 0, - req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), -- 0, 0); -+ 0, 0, 0); - } - - --static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) -+struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) - { - struct request_sock *req, **prev; - const struct tcphdr *th = tcp_hdr(skb); -@@ -959,7 +1047,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { -+ /* Don't lock again the meta-sk. It has been locked -+ * before mptcp_v6_do_rcv. -+ */ -+ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk)) -+ bh_lock_sock(mptcp_meta_sk(nsk)); - bh_lock_sock(nsk); -+ - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); -@@ -973,161 +1067,25 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) - return sk; - } - --/* FIXME: this is substantially similar to the ipv4 code. -- * Can some kind of merge be done? -- erics -- */ --static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) -+int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) - { -- struct tcp_options_received tmp_opt; -- struct request_sock *req; -- struct inet_request_sock *ireq; -- struct ipv6_pinfo *np = inet6_sk(sk); -- struct tcp_sock *tp = tcp_sk(sk); -- __u32 isn = TCP_SKB_CB(skb)->when; -- struct dst_entry *dst = NULL; -- struct tcp_fastopen_cookie foc = { .len = -1 }; -- bool want_cookie = false, fastopen; -- struct flowi6 fl6; -- int err; -- - if (skb->protocol == htons(ETH_P_IP)) - return tcp_v4_conn_request(sk, skb); - - if (!ipv6_unicast_destination(skb)) - goto drop; - -- if ((sysctl_tcp_syncookies == 2 || -- inet_csk_reqsk_queue_is_full(sk)) && !isn) { -- want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); -- if (!want_cookie) -- goto drop; -- } -- -- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { -- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -- goto drop; -- } -- -- req = inet6_reqsk_alloc(&tcp6_request_sock_ops); -- if (req == NULL) -- goto drop; -- --#ifdef CONFIG_TCP_MD5SIG -- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; --#endif -- -- tcp_clear_options(&tmp_opt); -- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); -- tmp_opt.user_mss = tp->rx_opt.user_mss; -- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); -- -- if (want_cookie && !tmp_opt.saw_tstamp) -- tcp_clear_options(&tmp_opt); -+ return tcp_conn_request(&tcp6_request_sock_ops, -+ &tcp_request_sock_ipv6_ops, sk, skb); - -- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; -- tcp_openreq_init(req, &tmp_opt, skb); -- -- ireq = inet_rsk(req); -- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; -- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; -- if (!want_cookie || tmp_opt.tstamp_ok) -- TCP_ECN_create_request(req, skb, sock_net(sk)); -- -- ireq->ir_iif = sk->sk_bound_dev_if; -- ireq->ir_mark = inet_request_mark(sk, skb); -- -- /* So that link locals have meaning */ -- if (!sk->sk_bound_dev_if && -- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) -- ireq->ir_iif = inet6_iif(skb); -- -- if (!isn) { -- if (ipv6_opt_accepted(sk, skb) || -- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || -- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || -- np->repflow) { -- atomic_inc(&skb->users); -- ireq->pktopts = skb; -- } -- -- if (want_cookie) { -- isn = cookie_v6_init_sequence(sk, skb, &req->mss); -- req->cookie_ts = tmp_opt.tstamp_ok; -- goto have_isn; -- } -- -- /* VJ's idea. We save last timestamp seen -- * from the destination in peer table, when entering -- * state TIME-WAIT, and check against it before -- * accepting new connection request. -- * -- * If "isn" is not zero, this request hit alive -- * timewait bucket, so that all the necessary checks -- * are made in the function processing timewait state. -- */ -- if (tmp_opt.saw_tstamp && -- tcp_death_row.sysctl_tw_recycle && -- (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { -- if (!tcp_peer_is_proven(req, dst, true)) { -- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); -- goto drop_and_release; -- } -- } -- /* Kill the following clause, if you dislike this way. */ -- else if (!sysctl_tcp_syncookies && -- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < -- (sysctl_max_syn_backlog >> 2)) && -- !tcp_peer_is_proven(req, dst, false)) { -- /* Without syncookies last quarter of -- * backlog is filled with destinations, -- * proven to be alive. -- * It means that we continue to communicate -- * to destinations, already remembered -- * to the moment of synflood. -- */ -- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", -- &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source)); -- goto drop_and_release; -- } -- -- isn = tcp_v6_init_sequence(skb); -- } --have_isn: -- -- if (security_inet_conn_request(sk, skb, req)) -- goto drop_and_release; -- -- if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) -- goto drop_and_free; -- -- tcp_rsk(req)->snt_isn = isn; -- tcp_rsk(req)->snt_synack = tcp_time_stamp; -- tcp_openreq_init_rwin(req, sk, dst); -- fastopen = !want_cookie && -- tcp_try_fastopen(sk, skb, req, &foc, dst); -- err = tcp_v6_send_synack(sk, dst, &fl6, req, -- skb_get_queue_mapping(skb), &foc); -- if (!fastopen) { -- if (err || want_cookie) -- goto drop_and_free; -- -- tcp_rsk(req)->listener = NULL; -- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -- } -- return 0; -- --drop_and_release: -- dst_release(dst); --drop_and_free: -- reqsk_free(req); - drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - return 0; /* don't send reset */ - } - --static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, -- struct request_sock *req, -- struct dst_entry *dst) -+struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, -+ struct request_sock *req, -+ struct dst_entry *dst) - { - struct inet_request_sock *ireq; - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); -@@ -1165,7 +1123,12 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, - - newsk->sk_v6_rcv_saddr = newnp->saddr; - -- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; -+#ifdef CONFIG_MPTCP -+ if (is_mptcp_enabled(newsk)) -+ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped; -+ else -+#endif -+ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; - newsk->sk_backlog_rcv = tcp_v4_do_rcv; - #ifdef CONFIG_TCP_MD5SIG - newtp->af_specific = &tcp_sock_ipv6_mapped_specific; -@@ -1329,7 +1292,7 @@ out: - * This is because we cannot sleep with the original spinlock - * held. - */ --static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) -+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) - { - struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp_sock *tp; -@@ -1351,6 +1314,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) - goto discard; - #endif - -+ if (is_meta_sk(sk)) -+ return mptcp_v6_do_rcv(sk, skb); -+ - if (sk_filter(sk, skb)) - goto discard; - -@@ -1472,7 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) - { - const struct tcphdr *th; - const struct ipv6hdr *hdr; -- struct sock *sk; -+ struct sock *sk, *meta_sk = NULL; - int ret; - struct net *net = dev_net(skb->dev); - -@@ -1503,18 +1469,43 @@ static int tcp_v6_rcv(struct sk_buff *skb) - TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + - skb->len - th->doff*4); - TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); -+#ifdef CONFIG_MPTCP -+ TCP_SKB_CB(skb)->mptcp_flags = 0; -+ TCP_SKB_CB(skb)->dss_off = 0; -+#endif - TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); - TCP_SKB_CB(skb)->sacked = 0; - - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); -- if (!sk) -- goto no_tcp_socket; - - process: -- if (sk->sk_state == TCP_TIME_WAIT) -+ if (sk && sk->sk_state == TCP_TIME_WAIT) - goto do_time_wait; - -+#ifdef CONFIG_MPTCP -+ if (!sk && th->syn && !th->ack) { -+ int ret = mptcp_lookup_join(skb, NULL); -+ -+ if (ret < 0) { -+ tcp_v6_send_reset(NULL, skb); -+ goto discard_it; -+ } else if (ret > 0) { -+ return 0; -+ } -+ } -+ -+ /* Is there a pending request sock for this segment ? */ -+ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) { -+ if (sk) -+ sock_put(sk); -+ return 0; -+ } -+#endif -+ -+ if (!sk) -+ goto no_tcp_socket; -+ - if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); - goto discard_and_relse; -@@ -1529,11 +1520,21 @@ process: - sk_mark_napi_id(sk, skb); - skb->dev = NULL; - -- bh_lock_sock_nested(sk); -+ if (mptcp(tcp_sk(sk))) { -+ meta_sk = mptcp_meta_sk(sk); -+ -+ bh_lock_sock_nested(meta_sk); -+ if (sock_owned_by_user(meta_sk)) -+ skb->sk = sk; -+ } else { -+ meta_sk = sk; -+ bh_lock_sock_nested(sk); -+ } -+ - ret = 0; -- if (!sock_owned_by_user(sk)) { -+ if (!sock_owned_by_user(meta_sk)) { - #ifdef CONFIG_NET_DMA -- struct tcp_sock *tp = tcp_sk(sk); -+ struct tcp_sock *tp = tcp_sk(meta_sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) -@@ -1541,16 +1542,17 @@ process: - else - #endif - { -- if (!tcp_prequeue(sk, skb)) -+ if (!tcp_prequeue(meta_sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); - } -- } else if (unlikely(sk_add_backlog(sk, skb, -- sk->sk_rcvbuf + sk->sk_sndbuf))) { -- bh_unlock_sock(sk); -+ } else if (unlikely(sk_add_backlog(meta_sk, skb, -+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { -+ bh_unlock_sock(meta_sk); - NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); - goto discard_and_relse; - } -- bh_unlock_sock(sk); -+ -+ bh_unlock_sock(meta_sk); - - sock_put(sk); - return ret ? -1 : 0; -@@ -1607,6 +1609,18 @@ do_time_wait: - sk = sk2; - goto process; - } -+#ifdef CONFIG_MPTCP -+ if (th->syn && !th->ack) { -+ int ret = mptcp_lookup_join(skb, inet_twsk(sk)); -+ -+ if (ret < 0) { -+ tcp_v6_send_reset(NULL, skb); -+ goto discard_it; -+ } else if (ret > 0) { -+ return 0; -+ } -+ } -+#endif - /* Fall through to ACK */ - } - case TCP_TW_ACK: -@@ -1657,7 +1671,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) - } - } - --static struct timewait_sock_ops tcp6_timewait_sock_ops = { -+struct timewait_sock_ops tcp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_unique = tcp_twsk_unique, - .twsk_destructor = tcp_twsk_destructor, -@@ -1730,7 +1744,12 @@ static int tcp_v6_init_sock(struct sock *sk) - - tcp_init_sock(sk); - -- icsk->icsk_af_ops = &ipv6_specific; -+#ifdef CONFIG_MPTCP -+ if (is_mptcp_enabled(sk)) -+ icsk->icsk_af_ops = &mptcp_v6_specific; -+ else -+#endif -+ icsk->icsk_af_ops = &ipv6_specific; - - #ifdef CONFIG_TCP_MD5SIG - tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; -@@ -1739,7 +1758,7 @@ static int tcp_v6_init_sock(struct sock *sk) - return 0; - } - --static void tcp_v6_destroy_sock(struct sock *sk) -+void tcp_v6_destroy_sock(struct sock *sk) - { - tcp_v4_destroy_sock(sk); - inet6_destroy_sock(sk); -@@ -1924,12 +1943,28 @@ void tcp6_proc_exit(struct net *net) - static void tcp_v6_clear_sk(struct sock *sk, int size) - { - struct inet_sock *inet = inet_sk(sk); -+#ifdef CONFIG_MPTCP -+ struct tcp_sock *tp = tcp_sk(sk); -+ /* size_tk_table goes from the end of tk_table to the end of sk */ -+ int size_tk_table = size - offsetof(struct tcp_sock, tk_table) - -+ sizeof(tp->tk_table); -+#endif - - /* we do not want to clear pinet6 field, because of RCU lookups */ - sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6)); - - size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); -+ -+#ifdef CONFIG_MPTCP -+ /* We zero out only from pinet6 to tk_table */ -+ size -= size_tk_table + sizeof(tp->tk_table); -+#endif - memset(&inet->pinet6 + 1, 0, size); -+ -+#ifdef CONFIG_MPTCP -+ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size_tk_table); -+#endif -+ - } - - struct proto tcpv6_prot = { -diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig -new file mode 100644 -index 000000000000..cdfc03adabf8 ---- /dev/null -+++ b/net/mptcp/Kconfig -@@ -0,0 +1,115 @@ -+# -+# MPTCP configuration -+# -+config MPTCP -+ bool "MPTCP protocol" -+ depends on (IPV6=y || IPV6=n) -+ ---help--- -+ This replaces the normal TCP stack with a Multipath TCP stack, -+ able to use several paths at once. -+ -+menuconfig MPTCP_PM_ADVANCED -+ bool "MPTCP: advanced path-manager control" -+ depends on MPTCP=y -+ ---help--- -+ Support for selection of different path-managers. You should choose 'Y' here, -+ because otherwise you will not actively create new MPTCP-subflows. -+ -+if MPTCP_PM_ADVANCED -+ -+config MPTCP_FULLMESH -+ tristate "MPTCP Full-Mesh Path-Manager" -+ depends on MPTCP=y -+ ---help--- -+ This path-management module will create a full-mesh among all IP-addresses. -+ -+config MPTCP_NDIFFPORTS -+ tristate "MPTCP ndiff-ports" -+ depends on MPTCP=y -+ ---help--- -+ This path-management module will create multiple subflows between the same -+ pair of IP-addresses, modifying the source-port. You can set the number -+ of subflows via the mptcp_ndiffports-sysctl. -+ -+config MPTCP_BINDER -+ tristate "MPTCP Binder" -+ depends on (MPTCP=y) -+ ---help--- -+ This path-management module works like ndiffports, and adds the sysctl -+ option to set the gateway (and/or path to) per each additional subflow -+ via Loose Source Routing (IPv4 only). -+ -+choice -+ prompt "Default MPTCP Path-Manager" -+ default DEFAULT -+ help -+ Select the Path-Manager of your choice -+ -+ config DEFAULT_FULLMESH -+ bool "Full mesh" if MPTCP_FULLMESH=y -+ -+ config DEFAULT_NDIFFPORTS -+ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y -+ -+ config DEFAULT_BINDER -+ bool "binder" if MPTCP_BINDER=y -+ -+ config DEFAULT_DUMMY -+ bool "Default" -+ -+endchoice -+ -+endif -+ -+config DEFAULT_MPTCP_PM -+ string -+ default "default" if DEFAULT_DUMMY -+ default "fullmesh" if DEFAULT_FULLMESH -+ default "ndiffports" if DEFAULT_NDIFFPORTS -+ default "binder" if DEFAULT_BINDER -+ default "default" -+ -+menuconfig MPTCP_SCHED_ADVANCED -+ bool "MPTCP: advanced scheduler control" -+ depends on MPTCP=y -+ ---help--- -+ Support for selection of different schedulers. You should choose 'Y' here, -+ if you want to choose a different scheduler than the default one. -+ -+if MPTCP_SCHED_ADVANCED -+ -+config MPTCP_ROUNDROBIN -+ tristate "MPTCP Round-Robin" -+ depends on (MPTCP=y) -+ ---help--- -+ This is a very simple round-robin scheduler. Probably has bad performance -+ but might be interesting for researchers. -+ -+choice -+ prompt "Default MPTCP Scheduler" -+ default DEFAULT -+ help -+ Select the Scheduler of your choice -+ -+ config DEFAULT_SCHEDULER -+ bool "Default" -+ ---help--- -+ This is the default scheduler, sending first on the subflow -+ with the lowest RTT. -+ -+ config DEFAULT_ROUNDROBIN -+ bool "Round-Robin" if MPTCP_ROUNDROBIN=y -+ ---help--- -+ This is the round-rob scheduler, sending in a round-robin -+ fashion.. -+ -+endchoice -+endif -+ -+config DEFAULT_MPTCP_SCHED -+ string -+ depends on (MPTCP=y) -+ default "default" if DEFAULT_SCHEDULER -+ default "roundrobin" if DEFAULT_ROUNDROBIN -+ default "default" -+ -diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile -new file mode 100644 -index 000000000000..35561a7012e3 ---- /dev/null -+++ b/net/mptcp/Makefile -@@ -0,0 +1,20 @@ -+# -+## Makefile for MultiPath TCP support code. -+# -+# -+ -+obj-$(CONFIG_MPTCP) += mptcp.o -+ -+mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \ -+ mptcp_output.o mptcp_input.o mptcp_sched.o -+ -+obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o -+obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o -+obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o -+obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o -+obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o -+obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o -+obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o -+ -+mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o -+ -diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c -new file mode 100644 -index 000000000000..95d8da560715 ---- /dev/null -+++ b/net/mptcp/mptcp_binder.c -@@ -0,0 +1,487 @@ -+#include -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define MPTCP_GW_MAX_LISTS 10 -+#define MPTCP_GW_LIST_MAX_LEN 6 -+#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \ -+ MPTCP_GW_MAX_LISTS) -+ -+struct mptcp_gw_list { -+ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN]; -+ u8 len[MPTCP_GW_MAX_LISTS]; -+}; -+ -+struct binder_priv { -+ /* Worker struct for subflow establishment */ -+ struct work_struct subflow_work; -+ -+ struct mptcp_cb *mpcb; -+ -+ /* Prevent multiple sub-sockets concurrently iterating over sockets */ -+ spinlock_t *flow_lock; -+}; -+ -+static struct mptcp_gw_list *mptcp_gws; -+static rwlock_t mptcp_gws_lock; -+ -+static int mptcp_binder_ndiffports __read_mostly = 1; -+ -+static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly; -+ -+static int mptcp_get_avail_list_ipv4(struct sock *sk) -+{ -+ int i, j, list_taken, opt_ret, opt_len; -+ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN]; -+ -+ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) { -+ if (mptcp_gws->len[i] == 0) -+ goto error; -+ -+ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i); -+ list_taken = 0; -+ -+ /* Loop through all sub-sockets in this connection */ -+ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk) { -+ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n"); -+ -+ /* Reset length and options buffer, then retrieve -+ * from socket -+ */ -+ opt_len = MAX_IPOPTLEN; -+ memset(opt, 0, MAX_IPOPTLEN); -+ opt_ret = ip_getsockopt(sk, IPPROTO_IP, -+ IP_OPTIONS, opt, &opt_len); -+ if (opt_ret < 0) { -+ mptcp_debug(KERN_ERR "%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n", -+ __func__, opt_ret); -+ goto error; -+ } -+ -+ /* If socket has no options, it has no stake in this list */ -+ if (opt_len <= 0) -+ continue; -+ -+ /* Iterate options buffer */ -+ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) { -+ if (*opt_ptr == IPOPT_LSRR) { -+ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n"); -+ goto sock_lsrr; -+ } -+ } -+ continue; -+ -+sock_lsrr: -+ /* Pointer to the 2nd to last address */ -+ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4; -+ -+ /* Addresses start 3 bytes after type offset */ -+ opt_ptr += 3; -+ j = 0; -+ -+ /* Different length lists cannot be the same */ -+ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i]) -+ continue; -+ -+ /* Iterate if we are still inside options list -+ * and sysctl list -+ */ -+ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) { -+ /* If there is a different address, this list must -+ * not be set on this socket -+ */ -+ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4)) -+ break; -+ -+ /* Jump 4 bytes to next address */ -+ opt_ptr += 4; -+ j++; -+ } -+ -+ /* Reached the end without a differing address, lists -+ * are therefore identical. -+ */ -+ if (j == mptcp_gws->len[i]) { -+ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n"); -+ list_taken = 1; -+ break; -+ } -+ } -+ -+ /* Free list found if not taken by a socket */ -+ if (!list_taken) { -+ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n"); -+ break; -+ } -+ } -+ -+ if (i >= MPTCP_GW_MAX_LISTS) -+ goto error; -+ -+ return i; -+error: -+ return -1; -+} -+ -+/* The list of addresses is parsed each time a new connection is opened, -+ * to make sure it's up to date. In case of error, all the lists are -+ * marked as unavailable and the subflow's fingerprint is set to 0. -+ */ -+static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr) -+{ -+ int i, j, ret; -+ unsigned char opt[MAX_IPOPTLEN] = {0}; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0]; -+ -+ /* Read lock: multiple sockets can read LSRR addresses at the same -+ * time, but writes are done in mutual exclusion. -+ * Spin lock: must search for free list for one socket at a time, or -+ * multiple sockets could take the same list. -+ */ -+ read_lock(&mptcp_gws_lock); -+ spin_lock(fmp->flow_lock); -+ -+ i = mptcp_get_avail_list_ipv4(sk); -+ -+ /* Execution enters here only if a free path is found. -+ */ -+ if (i >= 0) { -+ opt[0] = IPOPT_NOP; -+ opt[1] = IPOPT_LSRR; -+ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) * -+ (mptcp_gws->len[i] + 1) + 3; -+ opt[3] = IPOPT_MINOFF; -+ for (j = 0; j < mptcp_gws->len[i]; ++j) -+ memcpy(opt + 4 + -+ (j * sizeof(mptcp_gws->list[i][0].s_addr)), -+ &mptcp_gws->list[i][j].s_addr, -+ sizeof(mptcp_gws->list[i][0].s_addr)); -+ /* Final destination must be part of IP_OPTIONS parameter. */ -+ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr, -+ sizeof(addr.s_addr)); -+ -+ /* setsockopt must be inside the lock, otherwise another -+ * subflow could fail to see that we have taken a list. -+ */ -+ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, opt, -+ 4 + sizeof(mptcp_gws->list[i][0].s_addr) -+ * (mptcp_gws->len[i] + 1)); -+ -+ if (ret < 0) { -+ mptcp_debug(KERN_ERR "%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n", -+ __func__, ret); -+ } -+ } -+ -+ spin_unlock(fmp->flow_lock); -+ read_unlock(&mptcp_gws_lock); -+ -+ return; -+} -+ -+/* Parses gateways string for a list of paths to different -+ * gateways, and stores them for use with the Loose Source Routing (LSRR) -+ * socket option. Each list must have "," separated addresses, and the lists -+ * themselves must be separated by "-". Returns -1 in case one or more of the -+ * addresses is not a valid ipv4/6 address. -+ */ -+static int mptcp_parse_gateway_ipv4(char *gateways) -+{ -+ int i, j, k, ret; -+ char *tmp_string = NULL; -+ struct in_addr tmp_addr; -+ -+ tmp_string = kzalloc(16, GFP_KERNEL); -+ if (tmp_string == NULL) -+ return -ENOMEM; -+ -+ write_lock(&mptcp_gws_lock); -+ -+ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list)); -+ -+ /* A TMP string is used since inet_pton needs a null terminated string -+ * but we do not want to modify the sysctl for obvious reasons. -+ * i will iterate over the SYSCTL string, j will iterate over the -+ * temporary string where each IP is copied into, k will iterate over -+ * the IPs in each list. -+ */ -+ for (i = j = k = 0; -+ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS; -+ ++i) { -+ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') { -+ /* If the temp IP is empty and the current list is -+ * empty, we are done. -+ */ -+ if (j == 0 && mptcp_gws->len[k] == 0) -+ break; -+ -+ /* Terminate the temp IP string, then if it is -+ * non-empty parse the IP and copy it. -+ */ -+ tmp_string[j] = '\0'; -+ if (j > 0) { -+ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i); -+ -+ ret = in4_pton(tmp_string, strlen(tmp_string), -+ (u8 *)&tmp_addr.s_addr, '\0', -+ NULL); -+ -+ if (ret) { -+ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n", -+ ret, -+ &tmp_addr.s_addr); -+ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr, -+ &tmp_addr.s_addr, -+ sizeof(tmp_addr.s_addr)); -+ mptcp_gws->len[k]++; -+ j = 0; -+ tmp_string[j] = '\0'; -+ /* Since we can't impose a limit to -+ * what the user can input, make sure -+ * there are not too many IPs in the -+ * SYSCTL string. -+ */ -+ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) { -+ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n", -+ k, -+ MPTCP_GW_LIST_MAX_LEN); -+ goto error; -+ } -+ } else { -+ goto error; -+ } -+ } -+ -+ if (gateways[i] == '-' || gateways[i] == '\0') -+ ++k; -+ } else { -+ tmp_string[j] = gateways[i]; -+ ++j; -+ } -+ } -+ -+ /* Number of flows is number of gateway lists plus master flow */ -+ mptcp_binder_ndiffports = k+1; -+ -+ write_unlock(&mptcp_gws_lock); -+ kfree(tmp_string); -+ -+ return 0; -+ -+error: -+ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list)); -+ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN); -+ write_unlock(&mptcp_gws_lock); -+ kfree(tmp_string); -+ return -1; -+} -+ -+/** -+ * Create all new subflows, by doing calls to mptcp_initX_subsockets -+ * -+ * This function uses a goto next_subflow, to allow releasing the lock between -+ * new subflows and giving other processes a chance to do some work on the -+ * socket and potentially finishing the communication. -+ **/ -+static void create_subflow_worker(struct work_struct *work) -+{ -+ const struct binder_priv *pm_priv = container_of(work, -+ struct binder_priv, -+ subflow_work); -+ struct mptcp_cb *mpcb = pm_priv->mpcb; -+ struct sock *meta_sk = mpcb->meta_sk; -+ int iter = 0; -+ -+next_subflow: -+ if (iter) { -+ release_sock(meta_sk); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ -+ cond_resched(); -+ } -+ mutex_lock(&mpcb->mpcb_mutex); -+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); -+ -+ iter++; -+ -+ if (sock_flag(meta_sk, SOCK_DEAD)) -+ goto exit; -+ -+ if (mpcb->master_sk && -+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) -+ goto exit; -+ -+ if (mptcp_binder_ndiffports > iter && -+ mptcp_binder_ndiffports > mpcb->cnt_subflows) { -+ struct mptcp_loc4 loc; -+ struct mptcp_rem4 rem; -+ -+ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; -+ loc.loc4_id = 0; -+ loc.low_prio = 0; -+ -+ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; -+ rem.port = inet_sk(meta_sk)->inet_dport; -+ rem.rem4_id = 0; /* Default 0 */ -+ -+ mptcp_init4_subsockets(meta_sk, &loc, &rem); -+ -+ goto next_subflow; -+ } -+ -+exit: -+ release_sock(meta_sk); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ sock_put(meta_sk); -+} -+ -+static void binder_new_session(const struct sock *meta_sk) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0]; -+ static DEFINE_SPINLOCK(flow_lock); -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ if (meta_sk->sk_family == AF_INET6 && -+ !mptcp_v6_is_v4_mapped(meta_sk)) { -+ mptcp_fallback_default(mpcb); -+ return; -+ } -+#endif -+ -+ /* Initialize workqueue-struct */ -+ INIT_WORK(&fmp->subflow_work, create_subflow_worker); -+ fmp->mpcb = mpcb; -+ -+ fmp->flow_lock = &flow_lock; -+} -+ -+static void binder_create_subflows(struct sock *meta_sk) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0]; -+ -+ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv || -+ mpcb->send_infinite_mapping || -+ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) -+ return; -+ -+ if (!work_pending(&pm_priv->subflow_work)) { -+ sock_hold(meta_sk); -+ queue_work(mptcp_wq, &pm_priv->subflow_work); -+ } -+} -+ -+static int binder_get_local_id(sa_family_t family, union inet_addr *addr, -+ struct net *net, bool *low_prio) -+{ -+ return 0; -+} -+ -+/* Callback functions, executed when syctl mptcp.mptcp_gateways is updated. -+ * Inspired from proc_tcp_congestion_control(). -+ */ -+static int proc_mptcp_gateways(ctl_table *ctl, int write, -+ void __user *buffer, size_t *lenp, -+ loff_t *ppos) -+{ -+ int ret; -+ ctl_table tbl = { -+ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN, -+ }; -+ -+ if (write) { -+ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL); -+ if (tbl.data == NULL) -+ return -1; -+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); -+ if (ret == 0) { -+ ret = mptcp_parse_gateway_ipv4(tbl.data); -+ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN); -+ } -+ kfree(tbl.data); -+ } else { -+ ret = proc_dostring(ctl, write, buffer, lenp, ppos); -+ } -+ -+ -+ return ret; -+} -+ -+static struct mptcp_pm_ops binder __read_mostly = { -+ .new_session = binder_new_session, -+ .fully_established = binder_create_subflows, -+ .get_local_id = binder_get_local_id, -+ .init_subsocket_v4 = mptcp_v4_add_lsrr, -+ .name = "binder", -+ .owner = THIS_MODULE, -+}; -+ -+static struct ctl_table binder_table[] = { -+ { -+ .procname = "mptcp_binder_gateways", -+ .data = &sysctl_mptcp_binder_gateways, -+ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN, -+ .mode = 0644, -+ .proc_handler = &proc_mptcp_gateways -+ }, -+ { } -+}; -+ -+struct ctl_table_header *mptcp_sysctl_binder; -+ -+/* General initialization of MPTCP_PM */ -+static int __init binder_register(void) -+{ -+ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL); -+ if (!mptcp_gws) -+ return -ENOMEM; -+ -+ rwlock_init(&mptcp_gws_lock); -+ -+ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE); -+ -+ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp", -+ binder_table); -+ if (!mptcp_sysctl_binder) -+ goto sysctl_fail; -+ -+ if (mptcp_register_path_manager(&binder)) -+ goto pm_failed; -+ -+ return 0; -+ -+pm_failed: -+ unregister_net_sysctl_table(mptcp_sysctl_binder); -+sysctl_fail: -+ kfree(mptcp_gws); -+ -+ return -1; -+} -+ -+static void binder_unregister(void) -+{ -+ mptcp_unregister_path_manager(&binder); -+ unregister_net_sysctl_table(mptcp_sysctl_binder); -+ kfree(mptcp_gws); -+} -+ -+module_init(binder_register); -+module_exit(binder_unregister); -+ -+MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("BINDER MPTCP"); -+MODULE_VERSION("0.1"); -diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c -new file mode 100644 -index 000000000000..5d761164eb85 ---- /dev/null -+++ b/net/mptcp/mptcp_coupled.c -@@ -0,0 +1,270 @@ -+/* -+ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA) -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+#include -+#include -+ -+#include -+ -+/* Scaling is done in the numerator with alpha_scale_num and in the denominator -+ * with alpha_scale_den. -+ * -+ * To downscale, we just need to use alpha_scale. -+ * -+ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2) -+ */ -+static int alpha_scale_den = 10; -+static int alpha_scale_num = 32; -+static int alpha_scale = 12; -+ -+struct mptcp_ccc { -+ u64 alpha; -+ bool forced_update; -+}; -+ -+static inline int mptcp_ccc_sk_can_send(const struct sock *sk) -+{ -+ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; -+} -+ -+static inline u64 mptcp_get_alpha(const struct sock *meta_sk) -+{ -+ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha; -+} -+ -+static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha) -+{ -+ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha; -+} -+ -+static inline u64 mptcp_ccc_scale(u32 val, int scale) -+{ -+ return (u64) val << scale; -+} -+ -+static inline bool mptcp_get_forced(const struct sock *meta_sk) -+{ -+ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update; -+} -+ -+static inline void mptcp_set_forced(const struct sock *meta_sk, bool force) -+{ -+ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force; -+} -+ -+static void mptcp_ccc_recalc_alpha(const struct sock *sk) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; -+ const struct sock *sub_sk; -+ int best_cwnd = 0, best_rtt = 0, can_send = 0; -+ u64 max_numerator = 0, sum_denominator = 0, alpha = 1; -+ -+ if (!mpcb) -+ return; -+ -+ /* Only one subflow left - fall back to normal reno-behavior -+ * (set alpha to 1) -+ */ -+ if (mpcb->cnt_established <= 1) -+ goto exit; -+ -+ /* Do regular alpha-calculation for multiple subflows */ -+ -+ /* Find the max numerator of the alpha-calculation */ -+ mptcp_for_each_sk(mpcb, sub_sk) { -+ struct tcp_sock *sub_tp = tcp_sk(sub_sk); -+ u64 tmp; -+ -+ if (!mptcp_ccc_sk_can_send(sub_sk)) -+ continue; -+ -+ can_send++; -+ -+ /* We need to look for the path, that provides the max-value. -+ * Integer-overflow is not possible here, because -+ * tmp will be in u64. -+ */ -+ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd, -+ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us); -+ -+ if (tmp >= max_numerator) { -+ max_numerator = tmp; -+ best_cwnd = sub_tp->snd_cwnd; -+ best_rtt = sub_tp->srtt_us; -+ } -+ } -+ -+ /* No subflow is able to send - we don't care anymore */ -+ if (unlikely(!can_send)) -+ goto exit; -+ -+ /* Calculate the denominator */ -+ mptcp_for_each_sk(mpcb, sub_sk) { -+ struct tcp_sock *sub_tp = tcp_sk(sub_sk); -+ -+ if (!mptcp_ccc_sk_can_send(sub_sk)) -+ continue; -+ -+ sum_denominator += div_u64( -+ mptcp_ccc_scale(sub_tp->snd_cwnd, -+ alpha_scale_den) * best_rtt, -+ sub_tp->srtt_us); -+ } -+ sum_denominator *= sum_denominator; -+ if (unlikely(!sum_denominator)) { -+ pr_err("%s: sum_denominator == 0, cnt_established:%d\n", -+ __func__, mpcb->cnt_established); -+ mptcp_for_each_sk(mpcb, sub_sk) { -+ struct tcp_sock *sub_tp = tcp_sk(sub_sk); -+ pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u", -+ __func__, sub_tp->mptcp->path_index, -+ sub_sk->sk_state, sub_tp->srtt_us, -+ sub_tp->snd_cwnd); -+ } -+ } -+ -+ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator); -+ -+ if (unlikely(!alpha)) -+ alpha = 1; -+ -+exit: -+ mptcp_set_alpha(mptcp_meta_sk(sk), alpha); -+} -+ -+static void mptcp_ccc_init(struct sock *sk) -+{ -+ if (mptcp(tcp_sk(sk))) { -+ mptcp_set_forced(mptcp_meta_sk(sk), 0); -+ mptcp_set_alpha(mptcp_meta_sk(sk), 1); -+ } -+ /* If we do not mptcp, behave like reno: return */ -+} -+ -+static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event) -+{ -+ if (event == CA_EVENT_LOSS) -+ mptcp_ccc_recalc_alpha(sk); -+} -+ -+static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state) -+{ -+ if (!mptcp(tcp_sk(sk))) -+ return; -+ -+ mptcp_set_forced(mptcp_meta_sk(sk), 1); -+} -+ -+static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ const struct mptcp_cb *mpcb = tp->mpcb; -+ int snd_cwnd; -+ -+ if (!mptcp(tp)) { -+ tcp_reno_cong_avoid(sk, ack, acked); -+ return; -+ } -+ -+ if (!tcp_is_cwnd_limited(sk)) -+ return; -+ -+ if (tp->snd_cwnd <= tp->snd_ssthresh) { -+ /* In "safe" area, increase. */ -+ tcp_slow_start(tp, acked); -+ mptcp_ccc_recalc_alpha(sk); -+ return; -+ } -+ -+ if (mptcp_get_forced(mptcp_meta_sk(sk))) { -+ mptcp_ccc_recalc_alpha(sk); -+ mptcp_set_forced(mptcp_meta_sk(sk), 0); -+ } -+ -+ if (mpcb->cnt_established > 1) { -+ u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk)); -+ -+ /* This may happen, if at the initialization, the mpcb -+ * was not yet attached to the sock, and thus -+ * initializing alpha failed. -+ */ -+ if (unlikely(!alpha)) -+ alpha = 1; -+ -+ snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale), -+ alpha); -+ -+ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd) -+ * Thus, we select here the max value. -+ */ -+ if (snd_cwnd < tp->snd_cwnd) -+ snd_cwnd = tp->snd_cwnd; -+ } else { -+ snd_cwnd = tp->snd_cwnd; -+ } -+ -+ if (tp->snd_cwnd_cnt >= snd_cwnd) { -+ if (tp->snd_cwnd < tp->snd_cwnd_clamp) { -+ tp->snd_cwnd++; -+ mptcp_ccc_recalc_alpha(sk); -+ } -+ -+ tp->snd_cwnd_cnt = 0; -+ } else { -+ tp->snd_cwnd_cnt++; -+ } -+} -+ -+static struct tcp_congestion_ops mptcp_ccc = { -+ .init = mptcp_ccc_init, -+ .ssthresh = tcp_reno_ssthresh, -+ .cong_avoid = mptcp_ccc_cong_avoid, -+ .cwnd_event = mptcp_ccc_cwnd_event, -+ .set_state = mptcp_ccc_set_state, -+ .owner = THIS_MODULE, -+ .name = "lia", -+}; -+ -+static int __init mptcp_ccc_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE); -+ return tcp_register_congestion_control(&mptcp_ccc); -+} -+ -+static void __exit mptcp_ccc_unregister(void) -+{ -+ tcp_unregister_congestion_control(&mptcp_ccc); -+} -+ -+module_init(mptcp_ccc_register); -+module_exit(mptcp_ccc_unregister); -+ -+MODULE_AUTHOR("Christoph Paasch, Sébastien Barré"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM"); -+MODULE_VERSION("0.1"); -diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c -new file mode 100644 -index 000000000000..28dfa0479f5e ---- /dev/null -+++ b/net/mptcp/mptcp_ctrl.c -@@ -0,0 +1,2401 @@ -+/* -+ * MPTCP implementation - MPTCP-control -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#if IS_ENABLED(CONFIG_IPV6) -+#include -+#include -+#endif -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct kmem_cache *mptcp_sock_cache __read_mostly; -+static struct kmem_cache *mptcp_cb_cache __read_mostly; -+static struct kmem_cache *mptcp_tw_cache __read_mostly; -+ -+int sysctl_mptcp_enabled __read_mostly = 1; -+int sysctl_mptcp_checksum __read_mostly = 1; -+int sysctl_mptcp_debug __read_mostly; -+EXPORT_SYMBOL(sysctl_mptcp_debug); -+int sysctl_mptcp_syn_retries __read_mostly = 3; -+ -+bool mptcp_init_failed __read_mostly; -+ -+struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE; -+EXPORT_SYMBOL(mptcp_static_key); -+ -+static int proc_mptcp_path_manager(ctl_table *ctl, int write, -+ void __user *buffer, size_t *lenp, -+ loff_t *ppos) -+{ -+ char val[MPTCP_PM_NAME_MAX]; -+ ctl_table tbl = { -+ .data = val, -+ .maxlen = MPTCP_PM_NAME_MAX, -+ }; -+ int ret; -+ -+ mptcp_get_default_path_manager(val); -+ -+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); -+ if (write && ret == 0) -+ ret = mptcp_set_default_path_manager(val); -+ return ret; -+} -+ -+static int proc_mptcp_scheduler(ctl_table *ctl, int write, -+ void __user *buffer, size_t *lenp, -+ loff_t *ppos) -+{ -+ char val[MPTCP_SCHED_NAME_MAX]; -+ ctl_table tbl = { -+ .data = val, -+ .maxlen = MPTCP_SCHED_NAME_MAX, -+ }; -+ int ret; -+ -+ mptcp_get_default_scheduler(val); -+ -+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); -+ if (write && ret == 0) -+ ret = mptcp_set_default_scheduler(val); -+ return ret; -+} -+ -+static struct ctl_table mptcp_table[] = { -+ { -+ .procname = "mptcp_enabled", -+ .data = &sysctl_mptcp_enabled, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec -+ }, -+ { -+ .procname = "mptcp_checksum", -+ .data = &sysctl_mptcp_checksum, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec -+ }, -+ { -+ .procname = "mptcp_debug", -+ .data = &sysctl_mptcp_debug, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec -+ }, -+ { -+ .procname = "mptcp_syn_retries", -+ .data = &sysctl_mptcp_syn_retries, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec -+ }, -+ { -+ .procname = "mptcp_path_manager", -+ .mode = 0644, -+ .maxlen = MPTCP_PM_NAME_MAX, -+ .proc_handler = proc_mptcp_path_manager, -+ }, -+ { -+ .procname = "mptcp_scheduler", -+ .mode = 0644, -+ .maxlen = MPTCP_SCHED_NAME_MAX, -+ .proc_handler = proc_mptcp_scheduler, -+ }, -+ { } -+}; -+ -+static inline u32 mptcp_hash_tk(u32 token) -+{ -+ return token % MPTCP_HASH_SIZE; -+} -+ -+struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; -+EXPORT_SYMBOL(tk_hashtable); -+ -+/* This second hashtable is needed to retrieve request socks -+ * created as a result of a join request. While the SYN contains -+ * the token, the final ack does not, so we need a separate hashtable -+ * to retrieve the mpcb. -+ */ -+struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE]; -+spinlock_t mptcp_reqsk_hlock; /* hashtable protection */ -+ -+/* The following hash table is used to avoid collision of token */ -+static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE]; -+spinlock_t mptcp_tk_hashlock; /* hashtable protection */ -+ -+static bool mptcp_reqsk_find_tk(const u32 token) -+{ -+ const u32 hash = mptcp_hash_tk(token); -+ const struct mptcp_request_sock *mtreqsk; -+ const struct hlist_nulls_node *node; -+ -+begin: -+ hlist_nulls_for_each_entry_rcu(mtreqsk, node, -+ &mptcp_reqsk_tk_htb[hash], hash_entry) { -+ if (token == mtreqsk->mptcp_loc_token) -+ return true; -+ } -+ /* A request-socket is destroyed by RCU. So, it might have been recycled -+ * and put into another hash-table list. So, after the lookup we may -+ * end up in a different list. So, we may need to restart. -+ * -+ * See also the comment in __inet_lookup_established. -+ */ -+ if (get_nulls_value(node) != hash) -+ goto begin; -+ return false; -+} -+ -+static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token) -+{ -+ u32 hash = mptcp_hash_tk(token); -+ -+ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry, -+ &mptcp_reqsk_tk_htb[hash]); -+} -+ -+static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk) -+{ -+ rcu_read_lock(); -+ spin_lock(&mptcp_tk_hashlock); -+ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry); -+ spin_unlock(&mptcp_tk_hashlock); -+ rcu_read_unlock(); -+} -+ -+void mptcp_reqsk_destructor(struct request_sock *req) -+{ -+ if (!mptcp_rsk(req)->is_sub) { -+ if (in_softirq()) { -+ mptcp_reqsk_remove_tk(req); -+ } else { -+ rcu_read_lock_bh(); -+ spin_lock(&mptcp_tk_hashlock); -+ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry); -+ spin_unlock(&mptcp_tk_hashlock); -+ rcu_read_unlock_bh(); -+ } -+ } else { -+ mptcp_hash_request_remove(req); -+ } -+} -+ -+static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token) -+{ -+ u32 hash = mptcp_hash_tk(token); -+ hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]); -+ meta_tp->inside_tk_table = 1; -+} -+ -+static bool mptcp_find_token(u32 token) -+{ -+ const u32 hash = mptcp_hash_tk(token); -+ const struct tcp_sock *meta_tp; -+ const struct hlist_nulls_node *node; -+ -+begin: -+ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) { -+ if (token == meta_tp->mptcp_loc_token) -+ return true; -+ } -+ /* A TCP-socket is destroyed by RCU. So, it might have been recycled -+ * and put into another hash-table list. So, after the lookup we may -+ * end up in a different list. So, we may need to restart. -+ * -+ * See also the comment in __inet_lookup_established. -+ */ -+ if (get_nulls_value(node) != hash) -+ goto begin; -+ return false; -+} -+ -+static void mptcp_set_key_reqsk(struct request_sock *req, -+ const struct sk_buff *skb) -+{ -+ const struct inet_request_sock *ireq = inet_rsk(req); -+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); -+ -+ if (skb->protocol == htons(ETH_P_IP)) { -+ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr, -+ ip_hdr(skb)->daddr, -+ htons(ireq->ir_num), -+ ireq->ir_rmt_port); -+#if IS_ENABLED(CONFIG_IPV6) -+ } else { -+ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32, -+ ipv6_hdr(skb)->daddr.s6_addr32, -+ htons(ireq->ir_num), -+ ireq->ir_rmt_port); -+#endif -+ } -+ -+ mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); -+} -+ -+/* New MPTCP-connection request, prepare a new token for the meta-socket that -+ * will be created in mptcp_check_req_master(), and store the received token. -+ */ -+void mptcp_reqsk_new_mptcp(struct request_sock *req, -+ const struct mptcp_options_received *mopt, -+ const struct sk_buff *skb) -+{ -+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); -+ -+ inet_rsk(req)->saw_mpc = 1; -+ -+ rcu_read_lock(); -+ spin_lock(&mptcp_tk_hashlock); -+ do { -+ mptcp_set_key_reqsk(req, skb); -+ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || -+ mptcp_find_token(mtreq->mptcp_loc_token)); -+ -+ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token); -+ spin_unlock(&mptcp_tk_hashlock); -+ rcu_read_unlock(); -+ mtreq->mptcp_rem_key = mopt->mptcp_key; -+} -+ -+static void mptcp_set_key_sk(const struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ const struct inet_sock *isk = inet_sk(sk); -+ -+ if (sk->sk_family == AF_INET) -+ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr, -+ isk->inet_daddr, -+ isk->inet_sport, -+ isk->inet_dport); -+#if IS_ENABLED(CONFIG_IPV6) -+ else -+ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32, -+ sk->sk_v6_daddr.s6_addr32, -+ isk->inet_sport, -+ isk->inet_dport); -+#endif -+ -+ mptcp_key_sha1(tp->mptcp_loc_key, -+ &tp->mptcp_loc_token, NULL); -+} -+ -+void mptcp_connect_init(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ rcu_read_lock_bh(); -+ spin_lock(&mptcp_tk_hashlock); -+ do { -+ mptcp_set_key_sk(sk); -+ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) || -+ mptcp_find_token(tp->mptcp_loc_token)); -+ -+ __mptcp_hash_insert(tp, tp->mptcp_loc_token); -+ spin_unlock(&mptcp_tk_hashlock); -+ rcu_read_unlock_bh(); -+} -+ -+/** -+ * This function increments the refcount of the mpcb struct. -+ * It is the responsibility of the caller to decrement when releasing -+ * the structure. -+ */ -+struct sock *mptcp_hash_find(const struct net *net, const u32 token) -+{ -+ const u32 hash = mptcp_hash_tk(token); -+ const struct tcp_sock *meta_tp; -+ struct sock *meta_sk = NULL; -+ const struct hlist_nulls_node *node; -+ -+ rcu_read_lock(); -+begin: -+ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], -+ tk_table) { -+ meta_sk = (struct sock *)meta_tp; -+ if (token == meta_tp->mptcp_loc_token && -+ net_eq(net, sock_net(meta_sk))) { -+ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) -+ goto out; -+ if (unlikely(token != meta_tp->mptcp_loc_token || -+ !net_eq(net, sock_net(meta_sk)))) { -+ sock_gen_put(meta_sk); -+ goto begin; -+ } -+ goto found; -+ } -+ } -+ /* A TCP-socket is destroyed by RCU. So, it might have been recycled -+ * and put into another hash-table list. So, after the lookup we may -+ * end up in a different list. So, we may need to restart. -+ * -+ * See also the comment in __inet_lookup_established. -+ */ -+ if (get_nulls_value(node) != hash) -+ goto begin; -+out: -+ meta_sk = NULL; -+found: -+ rcu_read_unlock(); -+ return meta_sk; -+} -+ -+void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) -+{ -+ /* remove from the token hashtable */ -+ rcu_read_lock_bh(); -+ spin_lock(&mptcp_tk_hashlock); -+ hlist_nulls_del_init_rcu(&meta_tp->tk_table); -+ meta_tp->inside_tk_table = 0; -+ spin_unlock(&mptcp_tk_hashlock); -+ rcu_read_unlock_bh(); -+} -+ -+void mptcp_hash_remove(struct tcp_sock *meta_tp) -+{ -+ rcu_read_lock(); -+ spin_lock(&mptcp_tk_hashlock); -+ hlist_nulls_del_init_rcu(&meta_tp->tk_table); -+ meta_tp->inside_tk_table = 0; -+ spin_unlock(&mptcp_tk_hashlock); -+ rcu_read_unlock(); -+} -+ -+struct sock *mptcp_select_ack_sock(const struct sock *meta_sk) -+{ -+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct sock *sk, *rttsk = NULL, *lastsk = NULL; -+ u32 min_time = 0, last_active = 0; -+ -+ mptcp_for_each_sk(meta_tp->mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ u32 elapsed; -+ -+ if (!mptcp_sk_can_send_ack(sk) || tp->pf) -+ continue; -+ -+ elapsed = keepalive_time_elapsed(tp); -+ -+ /* We take the one with the lowest RTT within a reasonable -+ * (meta-RTO)-timeframe -+ */ -+ if (elapsed < inet_csk(meta_sk)->icsk_rto) { -+ if (!min_time || tp->srtt_us < min_time) { -+ min_time = tp->srtt_us; -+ rttsk = sk; -+ } -+ continue; -+ } -+ -+ /* Otherwise, we just take the most recent active */ -+ if (!rttsk && (!last_active || elapsed < last_active)) { -+ last_active = elapsed; -+ lastsk = sk; -+ } -+ } -+ -+ if (rttsk) -+ return rttsk; -+ -+ return lastsk; -+} -+EXPORT_SYMBOL(mptcp_select_ack_sock); -+ -+static void mptcp_sock_def_error_report(struct sock *sk) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; -+ -+ if (!sock_flag(sk, SOCK_DEAD)) -+ mptcp_sub_close(sk, 0); -+ -+ if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd || -+ mpcb->send_infinite_mapping) { -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ -+ meta_sk->sk_err = sk->sk_err; -+ meta_sk->sk_err_soft = sk->sk_err_soft; -+ -+ if (!sock_flag(meta_sk, SOCK_DEAD)) -+ meta_sk->sk_error_report(meta_sk); -+ -+ tcp_done(meta_sk); -+ } -+ -+ sk->sk_err = 0; -+ return; -+} -+ -+static void mptcp_mpcb_put(struct mptcp_cb *mpcb) -+{ -+ if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) { -+ mptcp_cleanup_path_manager(mpcb); -+ mptcp_cleanup_scheduler(mpcb); -+ kmem_cache_free(mptcp_cb_cache, mpcb); -+ } -+} -+ -+static void mptcp_sock_destruct(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ inet_sock_destruct(sk); -+ -+ if (!is_meta_sk(sk) && !tp->was_meta_sk) { -+ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list)); -+ -+ kmem_cache_free(mptcp_sock_cache, tp->mptcp); -+ tp->mptcp = NULL; -+ -+ /* Taken when mpcb pointer was set */ -+ sock_put(mptcp_meta_sk(sk)); -+ mptcp_mpcb_put(tp->mpcb); -+ } else { -+ struct mptcp_cb *mpcb = tp->mpcb; -+ struct mptcp_tw *mptw; -+ -+ /* The mpcb is disappearing - we can make the final -+ * update to the rcv_nxt of the time-wait-sock and remove -+ * its reference to the mpcb. -+ */ -+ spin_lock_bh(&mpcb->tw_lock); -+ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) { -+ list_del_rcu(&mptw->list); -+ mptw->in_list = 0; -+ mptcp_mpcb_put(mpcb); -+ rcu_assign_pointer(mptw->mpcb, NULL); -+ } -+ spin_unlock_bh(&mpcb->tw_lock); -+ -+ mptcp_mpcb_put(mpcb); -+ -+ mptcp_debug("%s destroying meta-sk\n", __func__); -+ } -+ -+ WARN_ON(!static_key_false(&mptcp_static_key)); -+ /* Must be the last call, because is_meta_sk() above still needs the -+ * static key -+ */ -+ static_key_slow_dec(&mptcp_static_key); -+} -+ -+void mptcp_destroy_sock(struct sock *sk) -+{ -+ if (is_meta_sk(sk)) { -+ struct sock *sk_it, *tmpsk; -+ -+ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue); -+ mptcp_purge_ofo_queue(tcp_sk(sk)); -+ -+ /* We have to close all remaining subflows. Normally, they -+ * should all be about to get closed. But, if the kernel is -+ * forcing a closure (e.g., tcp_write_err), the subflows might -+ * not have been closed properly (as we are waiting for the -+ * DATA_ACK of the DATA_FIN). -+ */ -+ mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) { -+ /* Already did call tcp_close - waiting for graceful -+ * closure, or if we are retransmitting fast-close on -+ * the subflow. The reset (or timeout) will kill the -+ * subflow.. -+ */ -+ if (tcp_sk(sk_it)->closing || -+ tcp_sk(sk_it)->send_mp_fclose) -+ continue; -+ -+ /* Allow the delayed work first to prevent time-wait state */ -+ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work)) -+ continue; -+ -+ mptcp_sub_close(sk_it, 0); -+ } -+ -+ mptcp_delete_synack_timer(sk); -+ } else { -+ mptcp_del_sock(sk); -+ } -+} -+ -+static void mptcp_set_state(struct sock *sk) -+{ -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ -+ /* Meta is not yet established - wake up the application */ -+ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) && -+ sk->sk_state == TCP_ESTABLISHED) { -+ tcp_set_state(meta_sk, TCP_ESTABLISHED); -+ -+ if (!sock_flag(meta_sk, SOCK_DEAD)) { -+ meta_sk->sk_state_change(meta_sk); -+ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT); -+ } -+ } -+ -+ if (sk->sk_state == TCP_ESTABLISHED) { -+ tcp_sk(sk)->mptcp->establish_increased = 1; -+ tcp_sk(sk)->mpcb->cnt_established++; -+ } -+} -+ -+void mptcp_init_congestion_control(struct sock *sk) -+{ -+ struct inet_connection_sock *icsk = inet_csk(sk); -+ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk)); -+ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops; -+ -+ /* The application didn't set the congestion control to use -+ * fallback to the default one. -+ */ -+ if (ca == &tcp_init_congestion_ops) -+ goto use_default; -+ -+ /* Use the same congestion control as set by the user. If the -+ * module is not available fallback to the default one. -+ */ -+ if (!try_module_get(ca->owner)) { -+ pr_warn("%s: fallback to the system default CC\n", __func__); -+ goto use_default; -+ } -+ -+ icsk->icsk_ca_ops = ca; -+ if (icsk->icsk_ca_ops->init) -+ icsk->icsk_ca_ops->init(sk); -+ -+ return; -+ -+use_default: -+ icsk->icsk_ca_ops = &tcp_init_congestion_ops; -+ tcp_init_congestion_control(sk); -+} -+ -+u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; -+u32 mptcp_seed = 0; -+ -+void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn) -+{ -+ u32 workspace[SHA_WORKSPACE_WORDS]; -+ u32 mptcp_hashed_key[SHA_DIGEST_WORDS]; -+ u8 input[64]; -+ int i; -+ -+ memset(workspace, 0, sizeof(workspace)); -+ -+ /* Initialize input with appropriate padding */ -+ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte -+ * is explicitly set too -+ */ -+ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */ -+ input[8] = 0x80; /* Padding: First bit after message = 1 */ -+ input[63] = 0x40; /* Padding: Length of the message = 64 bits */ -+ -+ sha_init(mptcp_hashed_key); -+ sha_transform(mptcp_hashed_key, input, workspace); -+ -+ for (i = 0; i < 5; i++) -+ mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]); -+ -+ if (token) -+ *token = mptcp_hashed_key[0]; -+ if (idsn) -+ *idsn = *((u64 *)&mptcp_hashed_key[3]); -+} -+ -+void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2, -+ u32 *hash_out) -+{ -+ u32 workspace[SHA_WORKSPACE_WORDS]; -+ u8 input[128]; /* 2 512-bit blocks */ -+ int i; -+ -+ memset(workspace, 0, sizeof(workspace)); -+ -+ /* Generate key xored with ipad */ -+ memset(input, 0x36, 64); -+ for (i = 0; i < 8; i++) -+ input[i] ^= key_1[i]; -+ for (i = 0; i < 8; i++) -+ input[i + 8] ^= key_2[i]; -+ -+ memcpy(&input[64], rand_1, 4); -+ memcpy(&input[68], rand_2, 4); -+ input[72] = 0x80; /* Padding: First bit after message = 1 */ -+ memset(&input[73], 0, 53); -+ -+ /* Padding: Length of the message = 512 + 64 bits */ -+ input[126] = 0x02; -+ input[127] = 0x40; -+ -+ sha_init(hash_out); -+ sha_transform(hash_out, input, workspace); -+ memset(workspace, 0, sizeof(workspace)); -+ -+ sha_transform(hash_out, &input[64], workspace); -+ memset(workspace, 0, sizeof(workspace)); -+ -+ for (i = 0; i < 5; i++) -+ hash_out[i] = cpu_to_be32(hash_out[i]); -+ -+ /* Prepare second part of hmac */ -+ memset(input, 0x5C, 64); -+ for (i = 0; i < 8; i++) -+ input[i] ^= key_1[i]; -+ for (i = 0; i < 8; i++) -+ input[i + 8] ^= key_2[i]; -+ -+ memcpy(&input[64], hash_out, 20); -+ input[84] = 0x80; -+ memset(&input[85], 0, 41); -+ -+ /* Padding: Length of the message = 512 + 160 bits */ -+ input[126] = 0x02; -+ input[127] = 0xA0; -+ -+ sha_init(hash_out); -+ sha_transform(hash_out, input, workspace); -+ memset(workspace, 0, sizeof(workspace)); -+ -+ sha_transform(hash_out, &input[64], workspace); -+ -+ for (i = 0; i < 5; i++) -+ hash_out[i] = cpu_to_be32(hash_out[i]); -+} -+ -+static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk) -+{ -+ /* Socket-options handled by sk_clone_lock while creating the meta-sk. -+ * ====== -+ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT, -+ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER, -+ * TCP_NODELAY, TCP_CORK -+ * -+ * Socket-options handled in this function here -+ * ====== -+ * TCP_DEFER_ACCEPT -+ * SO_KEEPALIVE -+ * -+ * Socket-options on the todo-list -+ * ====== -+ * SO_BINDTODEVICE - should probably prevent creation of new subsocks -+ * across other devices. - what about the api-draft? -+ * SO_DEBUG -+ * SO_REUSEADDR - probably we don't care about this -+ * SO_DONTROUTE, SO_BROADCAST -+ * SO_OOBINLINE -+ * SO_LINGER -+ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM -+ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM -+ * SO_RXQ_OVFL -+ * TCP_COOKIE_TRANSACTIONS -+ * TCP_MAXSEG -+ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this -+ * in mptcp_retransmit_timer. AND we need to check what is -+ * about the subsockets. -+ * TCP_LINGER2 -+ * TCP_WINDOW_CLAMP -+ * TCP_USER_TIMEOUT -+ * TCP_MD5SIG -+ * -+ * Socket-options of no concern for the meta-socket (but for the subsocket) -+ * ====== -+ * SO_PRIORITY -+ * SO_MARK -+ * TCP_CONGESTION -+ * TCP_SYNCNT -+ * TCP_QUICKACK -+ */ -+ -+ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */ -+ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0; -+ -+ /* Keepalives are handled entirely at the MPTCP-layer */ -+ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) { -+ inet_csk_reset_keepalive_timer(meta_sk, -+ keepalive_time_when(tcp_sk(meta_sk))); -+ sock_reset_flag(master_sk, SOCK_KEEPOPEN); -+ inet_csk_delete_keepalive_timer(master_sk); -+ } -+ -+ /* Do not propagate subflow-errors up to the MPTCP-layer */ -+ inet_sk(master_sk)->recverr = 0; -+} -+ -+static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk) -+{ -+ /* IP_TOS also goes to the subflow. */ -+ if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) { -+ inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos; -+ sub_sk->sk_priority = meta_sk->sk_priority; -+ sk_dst_reset(sub_sk); -+ } -+ -+ /* Inherit SO_REUSEADDR */ -+ sub_sk->sk_reuse = meta_sk->sk_reuse; -+ -+ /* Inherit snd/rcv-buffer locks */ -+ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; -+ -+ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */ -+ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH; -+ -+ /* Keepalives are handled entirely at the MPTCP-layer */ -+ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) { -+ sock_reset_flag(sub_sk, SOCK_KEEPOPEN); -+ inet_csk_delete_keepalive_timer(sub_sk); -+ } -+ -+ /* Do not propagate subflow-errors up to the MPTCP-layer */ -+ inet_sk(sub_sk)->recverr = 0; -+} -+ -+int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb) -+{ -+ /* skb-sk may be NULL if we receive a packet immediatly after the -+ * SYN/ACK + MP_CAPABLE. -+ */ -+ struct sock *sk = skb->sk ? skb->sk : meta_sk; -+ int ret = 0; -+ -+ skb->sk = NULL; -+ -+ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { -+ kfree_skb(skb); -+ return 0; -+ } -+ -+ if (sk->sk_family == AF_INET) -+ ret = tcp_v4_do_rcv(sk, skb); -+#if IS_ENABLED(CONFIG_IPV6) -+ else -+ ret = tcp_v6_do_rcv(sk, skb); -+#endif -+ -+ sock_put(sk); -+ return ret; -+} -+ -+struct lock_class_key meta_key; -+struct lock_class_key meta_slock_key; -+ -+static void mptcp_synack_timer_handler(unsigned long data) -+{ -+ struct sock *meta_sk = (struct sock *) data; -+ struct listen_sock *lopt = inet_csk(meta_sk)->icsk_accept_queue.listen_opt; -+ -+ /* Only process if socket is not in use. */ -+ bh_lock_sock(meta_sk); -+ -+ if (sock_owned_by_user(meta_sk)) { -+ /* Try again later. */ -+ mptcp_reset_synack_timer(meta_sk, HZ/20); -+ goto out; -+ } -+ -+ /* May happen if the queue got destructed in mptcp_close */ -+ if (!lopt) -+ goto out; -+ -+ inet_csk_reqsk_queue_prune(meta_sk, TCP_SYNQ_INTERVAL, -+ TCP_TIMEOUT_INIT, TCP_RTO_MAX); -+ -+ if (lopt->qlen) -+ mptcp_reset_synack_timer(meta_sk, TCP_SYNQ_INTERVAL); -+ -+out: -+ bh_unlock_sock(meta_sk); -+ sock_put(meta_sk); -+} -+ -+static const struct tcp_sock_ops mptcp_meta_specific = { -+ .__select_window = __mptcp_select_window, -+ .select_window = mptcp_select_window, -+ .select_initial_window = mptcp_select_initial_window, -+ .init_buffer_space = mptcp_init_buffer_space, -+ .set_rto = mptcp_tcp_set_rto, -+ .should_expand_sndbuf = mptcp_should_expand_sndbuf, -+ .init_congestion_control = mptcp_init_congestion_control, -+ .send_fin = mptcp_send_fin, -+ .write_xmit = mptcp_write_xmit, -+ .send_active_reset = mptcp_send_active_reset, -+ .write_wakeup = mptcp_write_wakeup, -+ .prune_ofo_queue = mptcp_prune_ofo_queue, -+ .retransmit_timer = mptcp_retransmit_timer, -+ .time_wait = mptcp_time_wait, -+ .cleanup_rbuf = mptcp_cleanup_rbuf, -+}; -+ -+static const struct tcp_sock_ops mptcp_sub_specific = { -+ .__select_window = __mptcp_select_window, -+ .select_window = mptcp_select_window, -+ .select_initial_window = mptcp_select_initial_window, -+ .init_buffer_space = mptcp_init_buffer_space, -+ .set_rto = mptcp_tcp_set_rto, -+ .should_expand_sndbuf = mptcp_should_expand_sndbuf, -+ .init_congestion_control = mptcp_init_congestion_control, -+ .send_fin = tcp_send_fin, -+ .write_xmit = tcp_write_xmit, -+ .send_active_reset = tcp_send_active_reset, -+ .write_wakeup = tcp_write_wakeup, -+ .prune_ofo_queue = tcp_prune_ofo_queue, -+ .retransmit_timer = tcp_retransmit_timer, -+ .time_wait = tcp_time_wait, -+ .cleanup_rbuf = tcp_cleanup_rbuf, -+}; -+ -+static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window) -+{ -+ struct mptcp_cb *mpcb; -+ struct sock *master_sk; -+ struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk); -+ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk); -+ u64 idsn; -+ -+ dst_release(meta_sk->sk_rx_dst); -+ meta_sk->sk_rx_dst = NULL; -+ /* This flag is set to announce sock_lock_init to -+ * reclassify the lock-class of the master socket. -+ */ -+ meta_tp->is_master_sk = 1; -+ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO); -+ meta_tp->is_master_sk = 0; -+ if (!master_sk) -+ return -ENOBUFS; -+ -+ master_tp = tcp_sk(master_sk); -+ master_icsk = inet_csk(master_sk); -+ -+ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC); -+ if (!mpcb) { -+ /* sk_free (and __sk_free) requirese wmem_alloc to be 1. -+ * All the rest is set to 0 thanks to __GFP_ZERO above. -+ */ -+ atomic_set(&master_sk->sk_wmem_alloc, 1); -+ sk_free(master_sk); -+ return -ENOBUFS; -+ } -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) { -+ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); -+ -+ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6; -+ -+ newnp = inet6_sk(master_sk); -+ memcpy(newnp, np, sizeof(struct ipv6_pinfo)); -+ -+ newnp->ipv6_mc_list = NULL; -+ newnp->ipv6_ac_list = NULL; -+ newnp->ipv6_fl_list = NULL; -+ newnp->opt = NULL; -+ newnp->pktoptions = NULL; -+ (void)xchg(&newnp->rxpmtu, NULL); -+ } else if (meta_sk->sk_family == AF_INET6) { -+ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); -+ -+ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6; -+ -+ newnp = inet6_sk(master_sk); -+ memcpy(newnp, np, sizeof(struct ipv6_pinfo)); -+ -+ newnp->hop_limit = -1; -+ newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS; -+ newnp->mc_loop = 1; -+ newnp->pmtudisc = IPV6_PMTUDISC_WANT; -+ newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only; -+ } -+#endif -+ -+ meta_tp->mptcp = NULL; -+ -+ /* Store the keys and generate the peer's token */ -+ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key; -+ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token; -+ -+ /* Generate Initial data-sequence-numbers */ -+ mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn); -+ idsn = ntohll(idsn) + 1; -+ mpcb->snd_high_order[0] = idsn >> 32; -+ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1; -+ -+ meta_tp->write_seq = (u32)idsn; -+ meta_tp->snd_sml = meta_tp->write_seq; -+ meta_tp->snd_una = meta_tp->write_seq; -+ meta_tp->snd_nxt = meta_tp->write_seq; -+ meta_tp->pushed_seq = meta_tp->write_seq; -+ meta_tp->snd_up = meta_tp->write_seq; -+ -+ mpcb->mptcp_rem_key = remote_key; -+ mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn); -+ idsn = ntohll(idsn) + 1; -+ mpcb->rcv_high_order[0] = idsn >> 32; -+ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1; -+ meta_tp->copied_seq = (u32) idsn; -+ meta_tp->rcv_nxt = (u32) idsn; -+ meta_tp->rcv_wup = (u32) idsn; -+ -+ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1; -+ meta_tp->snd_wnd = window; -+ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */ -+ -+ meta_tp->packets_out = 0; -+ meta_icsk->icsk_probes_out = 0; -+ -+ /* Set mptcp-pointers */ -+ master_tp->mpcb = mpcb; -+ master_tp->meta_sk = meta_sk; -+ meta_tp->mpcb = mpcb; -+ meta_tp->meta_sk = meta_sk; -+ mpcb->meta_sk = meta_sk; -+ mpcb->master_sk = master_sk; -+ -+ meta_tp->was_meta_sk = 0; -+ -+ /* Initialize the queues */ -+ skb_queue_head_init(&mpcb->reinject_queue); -+ skb_queue_head_init(&master_tp->out_of_order_queue); -+ tcp_prequeue_init(master_tp); -+ INIT_LIST_HEAD(&master_tp->tsq_node); -+ -+ master_tp->tsq_flags = 0; -+ -+ mutex_init(&mpcb->mpcb_mutex); -+ -+ /* Init the accept_queue structure, we support a queue of 32 pending -+ * connections, it does not need to be huge, since we only store here -+ * pending subflow creations. -+ */ -+ if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) { -+ inet_put_port(master_sk); -+ kmem_cache_free(mptcp_cb_cache, mpcb); -+ sk_free(master_sk); -+ return -ENOMEM; -+ } -+ -+ /* Redefine function-pointers as the meta-sk is now fully ready */ -+ static_key_slow_inc(&mptcp_static_key); -+ meta_tp->mpc = 1; -+ meta_tp->ops = &mptcp_meta_specific; -+ -+ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv; -+ meta_sk->sk_destruct = mptcp_sock_destruct; -+ -+ /* Meta-level retransmit timer */ -+ meta_icsk->icsk_rto *= 2; /* Double of initial - rto */ -+ -+ tcp_init_xmit_timers(master_sk); -+ /* Has been set for sending out the SYN */ -+ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS); -+ -+ if (!meta_tp->inside_tk_table) { -+ /* Adding the meta_tp in the token hashtable - coming from server-side */ -+ rcu_read_lock(); -+ spin_lock(&mptcp_tk_hashlock); -+ -+ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token); -+ -+ spin_unlock(&mptcp_tk_hashlock); -+ rcu_read_unlock(); -+ } -+ master_tp->inside_tk_table = 0; -+ -+ /* Init time-wait stuff */ -+ INIT_LIST_HEAD(&mpcb->tw_list); -+ spin_lock_init(&mpcb->tw_lock); -+ -+ INIT_HLIST_HEAD(&mpcb->callback_list); -+ -+ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk); -+ -+ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf; -+ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf; -+ mpcb->orig_window_clamp = meta_tp->window_clamp; -+ -+ /* The meta is directly linked - set refcnt to 1 */ -+ atomic_set(&mpcb->mpcb_refcnt, 1); -+ -+ mptcp_init_path_manager(mpcb); -+ mptcp_init_scheduler(mpcb); -+ -+ setup_timer(&mpcb->synack_timer, mptcp_synack_timer_handler, -+ (unsigned long)meta_sk); -+ -+ mptcp_debug("%s: created mpcb with token %#x\n", -+ __func__, mpcb->mptcp_loc_token); -+ -+ return 0; -+} -+ -+void mptcp_fallback_meta_sk(struct sock *meta_sk) -+{ -+ kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt); -+ kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb); -+} -+ -+int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, -+ gfp_t flags) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags); -+ if (!tp->mptcp) -+ return -ENOMEM; -+ -+ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb); -+ /* No more space for more subflows? */ -+ if (!tp->mptcp->path_index) { -+ kmem_cache_free(mptcp_sock_cache, tp->mptcp); -+ return -EPERM; -+ } -+ -+ INIT_HLIST_NODE(&tp->mptcp->cb_list); -+ -+ tp->mptcp->tp = tp; -+ tp->mpcb = mpcb; -+ tp->meta_sk = meta_sk; -+ -+ static_key_slow_inc(&mptcp_static_key); -+ tp->mpc = 1; -+ tp->ops = &mptcp_sub_specific; -+ -+ tp->mptcp->loc_id = loc_id; -+ tp->mptcp->rem_id = rem_id; -+ if (mpcb->sched_ops->init) -+ mpcb->sched_ops->init(sk); -+ -+ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be -+ * included in mptcp_del_sock(), because the mpcb must remain alive -+ * until the last subsocket is completely destroyed. -+ */ -+ sock_hold(meta_sk); -+ atomic_inc(&mpcb->mpcb_refcnt); -+ -+ tp->mptcp->next = mpcb->connection_list; -+ mpcb->connection_list = tp; -+ tp->mptcp->attached = 1; -+ -+ mpcb->cnt_subflows++; -+ atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc), -+ &meta_sk->sk_rmem_alloc); -+ -+ mptcp_sub_inherit_sockopts(meta_sk, sk); -+ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq); -+ -+ /* As we successfully allocated the mptcp_tcp_sock, we have to -+ * change the function-pointers here (for sk_destruct to work correctly) -+ */ -+ sk->sk_error_report = mptcp_sock_def_error_report; -+ sk->sk_data_ready = mptcp_data_ready; -+ sk->sk_write_space = mptcp_write_space; -+ sk->sk_state_change = mptcp_set_state; -+ sk->sk_destruct = mptcp_sock_destruct; -+ -+ if (sk->sk_family == AF_INET) -+ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n", -+ __func__ , mpcb->mptcp_loc_token, -+ tp->mptcp->path_index, -+ &((struct inet_sock *)tp)->inet_saddr, -+ ntohs(((struct inet_sock *)tp)->inet_sport), -+ &((struct inet_sock *)tp)->inet_daddr, -+ ntohs(((struct inet_sock *)tp)->inet_dport), -+ mpcb->cnt_subflows); -+#if IS_ENABLED(CONFIG_IPV6) -+ else -+ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n", -+ __func__ , mpcb->mptcp_loc_token, -+ tp->mptcp->path_index, &inet6_sk(sk)->saddr, -+ ntohs(((struct inet_sock *)tp)->inet_sport), -+ &sk->sk_v6_daddr, -+ ntohs(((struct inet_sock *)tp)->inet_dport), -+ mpcb->cnt_subflows); -+#endif -+ -+ return 0; -+} -+ -+void mptcp_del_sock(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk), *tp_prev; -+ struct mptcp_cb *mpcb; -+ -+ if (!tp->mptcp || !tp->mptcp->attached) -+ return; -+ -+ mpcb = tp->mpcb; -+ tp_prev = mpcb->connection_list; -+ -+ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n", -+ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index, -+ sk->sk_state, is_meta_sk(sk)); -+ -+ if (tp_prev == tp) { -+ mpcb->connection_list = tp->mptcp->next; -+ } else { -+ for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) { -+ if (tp_prev->mptcp->next == tp) { -+ tp_prev->mptcp->next = tp->mptcp->next; -+ break; -+ } -+ } -+ } -+ mpcb->cnt_subflows--; -+ if (tp->mptcp->establish_increased) -+ mpcb->cnt_established--; -+ -+ tp->mptcp->next = NULL; -+ tp->mptcp->attached = 0; -+ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index); -+ -+ if (!skb_queue_empty(&sk->sk_write_queue)) -+ mptcp_reinject_data(sk, 0); -+ -+ if (is_master_tp(tp)) -+ mpcb->master_sk = NULL; -+ else if (tp->mptcp->pre_established) -+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); -+ -+ rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL); -+} -+ -+/* Updates the metasocket ULID/port data, based on the given sock. -+ * The argument sock must be the sock accessible to the application. -+ * In this function, we update the meta socket info, based on the changes -+ * in the application socket (bind, address allocation, ...) -+ */ -+void mptcp_update_metasocket(struct sock *sk, const struct sock *meta_sk) -+{ -+ if (tcp_sk(sk)->mpcb->pm_ops->new_session) -+ tcp_sk(sk)->mpcb->pm_ops->new_session(meta_sk); -+ -+ tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio; -+} -+ -+/* Clean up the receive buffer for full frames taken by the user, -+ * then send an ACK if necessary. COPIED is the number of bytes -+ * tcp_recvmsg has given to the user so far, it speeds up the -+ * calculation of whether or not we must ACK for the sake of -+ * a window update. -+ */ -+void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct sock *sk; -+ __u32 rcv_window_now = 0; -+ -+ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) { -+ rcv_window_now = tcp_receive_window(meta_tp); -+ -+ if (2 * rcv_window_now > meta_tp->window_clamp) -+ rcv_window_now = 0; -+ } -+ -+ mptcp_for_each_sk(meta_tp->mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ const struct inet_connection_sock *icsk = inet_csk(sk); -+ -+ if (!mptcp_sk_can_send_ack(sk)) -+ continue; -+ -+ if (!inet_csk_ack_scheduled(sk)) -+ goto second_part; -+ /* Delayed ACKs frequently hit locked sockets during bulk -+ * receive. -+ */ -+ if (icsk->icsk_ack.blocked || -+ /* Once-per-two-segments ACK was not sent by tcp_input.c */ -+ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || -+ /* If this read emptied read buffer, we send ACK, if -+ * connection is not bidirectional, user drained -+ * receive buffer and there was a small segment -+ * in queue. -+ */ -+ (copied > 0 && -+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || -+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && -+ !icsk->icsk_ack.pingpong)) && -+ !atomic_read(&meta_sk->sk_rmem_alloc))) { -+ tcp_send_ack(sk); -+ continue; -+ } -+ -+second_part: -+ /* This here is the second part of tcp_cleanup_rbuf */ -+ if (rcv_window_now) { -+ __u32 new_window = tp->ops->__select_window(sk); -+ -+ /* Send ACK now, if this read freed lots of space -+ * in our buffer. Certainly, new_window is new window. -+ * We can advertise it now, if it is not less than -+ * current one. -+ * "Lots" means "at least twice" here. -+ */ -+ if (new_window && new_window >= 2 * rcv_window_now) -+ tcp_send_ack(sk); -+ } -+ } -+} -+ -+static int mptcp_sub_send_fin(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sk_buff *skb = tcp_write_queue_tail(sk); -+ int mss_now; -+ -+ /* Optimization, tack on the FIN if we have a queue of -+ * unsent frames. But be careful about outgoing SACKS -+ * and IP options. -+ */ -+ mss_now = tcp_current_mss(sk); -+ -+ if (tcp_send_head(sk) != NULL) { -+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; -+ TCP_SKB_CB(skb)->end_seq++; -+ tp->write_seq++; -+ } else { -+ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC); -+ if (!skb) -+ return 1; -+ -+ /* Reserve space for headers and prepare control bits. */ -+ skb_reserve(skb, MAX_TCP_HEADER); -+ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ -+ tcp_init_nondata_skb(skb, tp->write_seq, -+ TCPHDR_ACK | TCPHDR_FIN); -+ tcp_queue_skb(sk, skb); -+ } -+ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); -+ -+ return 0; -+} -+ -+void mptcp_sub_close_wq(struct work_struct *work) -+{ -+ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp; -+ struct sock *sk = (struct sock *)tp; -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ -+ mutex_lock(&tp->mpcb->mpcb_mutex); -+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); -+ -+ if (sock_flag(sk, SOCK_DEAD)) -+ goto exit; -+ -+ /* We come from tcp_disconnect. We are sure that meta_sk is set */ -+ if (!mptcp(tp)) { -+ tp->closing = 1; -+ sock_rps_reset_flow(sk); -+ tcp_close(sk, 0); -+ goto exit; -+ } -+ -+ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) { -+ tp->closing = 1; -+ sock_rps_reset_flow(sk); -+ tcp_close(sk, 0); -+ } else if (tcp_close_state(sk)) { -+ sk->sk_shutdown |= SEND_SHUTDOWN; -+ tcp_send_fin(sk); -+ } -+ -+exit: -+ release_sock(meta_sk); -+ mutex_unlock(&tp->mpcb->mpcb_mutex); -+ sock_put(sk); -+} -+ -+void mptcp_sub_close(struct sock *sk, unsigned long delay) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct delayed_work *work = &tcp_sk(sk)->mptcp->work; -+ -+ /* We are already closing - e.g., call from sock_def_error_report upon -+ * tcp_disconnect in tcp_close. -+ */ -+ if (tp->closing) -+ return; -+ -+ /* Work already scheduled ? */ -+ if (work_pending(&work->work)) { -+ /* Work present - who will be first ? */ -+ if (jiffies + delay > work->timer.expires) -+ return; -+ -+ /* Try canceling - if it fails, work will be executed soon */ -+ if (!cancel_delayed_work(work)) -+ return; -+ sock_put(sk); -+ } -+ -+ if (!delay) { -+ unsigned char old_state = sk->sk_state; -+ -+ /* If we are in user-context we can directly do the closing -+ * procedure. No need to schedule a work-queue. -+ */ -+ if (!in_softirq()) { -+ if (sock_flag(sk, SOCK_DEAD)) -+ return; -+ -+ if (!mptcp(tp)) { -+ tp->closing = 1; -+ sock_rps_reset_flow(sk); -+ tcp_close(sk, 0); -+ return; -+ } -+ -+ if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK || -+ sk->sk_state == TCP_CLOSE) { -+ tp->closing = 1; -+ sock_rps_reset_flow(sk); -+ tcp_close(sk, 0); -+ } else if (tcp_close_state(sk)) { -+ sk->sk_shutdown |= SEND_SHUTDOWN; -+ tcp_send_fin(sk); -+ } -+ -+ return; -+ } -+ -+ /* We directly send the FIN. Because it may take so a long time, -+ * untile the work-queue will get scheduled... -+ * -+ * If mptcp_sub_send_fin returns 1, it failed and thus we reset -+ * the old state so that tcp_close will finally send the fin -+ * in user-context. -+ */ -+ if (!sk->sk_err && old_state != TCP_CLOSE && -+ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) { -+ if (old_state == TCP_ESTABLISHED) -+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); -+ sk->sk_state = old_state; -+ } -+ } -+ -+ sock_hold(sk); -+ queue_delayed_work(mptcp_wq, work, delay); -+} -+ -+void mptcp_sub_force_close(struct sock *sk) -+{ -+ /* The below tcp_done may have freed the socket, if he is already dead. -+ * Thus, we are not allowed to access it afterwards. That's why -+ * we have to store the dead-state in this local variable. -+ */ -+ int sock_is_dead = sock_flag(sk, SOCK_DEAD); -+ -+ tcp_sk(sk)->mp_killed = 1; -+ -+ if (sk->sk_state != TCP_CLOSE) -+ tcp_done(sk); -+ -+ if (!sock_is_dead) -+ mptcp_sub_close(sk, 0); -+} -+EXPORT_SYMBOL(mptcp_sub_force_close); -+ -+/* Update the mpcb send window, based on the contributions -+ * of each subflow -+ */ -+void mptcp_update_sndbuf(const struct tcp_sock *tp) -+{ -+ struct sock *meta_sk = tp->meta_sk, *sk; -+ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf; -+ -+ mptcp_for_each_sk(tp->mpcb, sk) { -+ if (!mptcp_sk_can_send(sk)) -+ continue; -+ -+ new_sndbuf += sk->sk_sndbuf; -+ -+ if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) { -+ new_sndbuf = sysctl_tcp_wmem[2]; -+ break; -+ } -+ } -+ meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf); -+ -+ /* The subflow's call to sk_write_space in tcp_new_space ends up in -+ * mptcp_write_space. -+ * It has nothing to do with waking up the application. -+ * So, we do it here. -+ */ -+ if (old_sndbuf != meta_sk->sk_sndbuf) -+ meta_sk->sk_write_space(meta_sk); -+} -+ -+void mptcp_close(struct sock *meta_sk, long timeout) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct sock *sk_it, *tmpsk; -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ struct sk_buff *skb; -+ int data_was_unread = 0; -+ int state; -+ -+ mptcp_debug("%s: Close of meta_sk with tok %#x\n", -+ __func__, mpcb->mptcp_loc_token); -+ -+ mutex_lock(&mpcb->mpcb_mutex); -+ lock_sock(meta_sk); -+ -+ if (meta_tp->inside_tk_table) { -+ /* Detach the mpcb from the token hashtable */ -+ mptcp_hash_remove_bh(meta_tp); -+ reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue); -+ } -+ -+ meta_sk->sk_shutdown = SHUTDOWN_MASK; -+ /* We need to flush the recv. buffs. We do this only on the -+ * descriptor close, not protocol-sourced closes, because the -+ * reader process may not have drained the data yet! -+ */ -+ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) { -+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - -+ tcp_hdr(skb)->fin; -+ data_was_unread += len; -+ __kfree_skb(skb); -+ } -+ -+ sk_mem_reclaim(meta_sk); -+ -+ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ -+ if (meta_sk->sk_state == TCP_CLOSE) { -+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { -+ if (tcp_sk(sk_it)->send_mp_fclose) -+ continue; -+ mptcp_sub_close(sk_it, 0); -+ } -+ goto adjudge_to_death; -+ } -+ -+ if (data_was_unread) { -+ /* Unread data was tossed, zap the connection. */ -+ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE); -+ tcp_set_state(meta_sk, TCP_CLOSE); -+ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk, -+ meta_sk->sk_allocation); -+ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) { -+ /* Check zero linger _after_ checking for unread data. */ -+ meta_sk->sk_prot->disconnect(meta_sk, 0); -+ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); -+ } else if (tcp_close_state(meta_sk)) { -+ mptcp_send_fin(meta_sk); -+ } else if (meta_tp->snd_una == meta_tp->write_seq) { -+ /* The DATA_FIN has been sent and acknowledged -+ * (e.g., by sk_shutdown). Close all the other subflows -+ */ -+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { -+ unsigned long delay = 0; -+ /* If we are the passive closer, don't trigger -+ * subflow-fin until the subflow has been finned -+ * by the peer. - thus we add a delay -+ */ -+ if (mpcb->passive_close && -+ sk_it->sk_state == TCP_ESTABLISHED) -+ delay = inet_csk(sk_it)->icsk_rto << 3; -+ -+ mptcp_sub_close(sk_it, delay); -+ } -+ } -+ -+ sk_stream_wait_close(meta_sk, timeout); -+ -+adjudge_to_death: -+ state = meta_sk->sk_state; -+ sock_hold(meta_sk); -+ sock_orphan(meta_sk); -+ -+ /* socket will be freed after mptcp_close - we have to prevent -+ * access from the subflows. -+ */ -+ mptcp_for_each_sk(mpcb, sk_it) { -+ /* Similar to sock_orphan, but we don't set it DEAD, because -+ * the callbacks are still set and must be called. -+ */ -+ write_lock_bh(&sk_it->sk_callback_lock); -+ sk_set_socket(sk_it, NULL); -+ sk_it->sk_wq = NULL; -+ write_unlock_bh(&sk_it->sk_callback_lock); -+ } -+ -+ /* It is the last release_sock in its life. It will remove backlog. */ -+ release_sock(meta_sk); -+ -+ /* Now socket is owned by kernel and we acquire BH lock -+ * to finish close. No need to check for user refs. -+ */ -+ local_bh_disable(); -+ bh_lock_sock(meta_sk); -+ WARN_ON(sock_owned_by_user(meta_sk)); -+ -+ percpu_counter_inc(meta_sk->sk_prot->orphan_count); -+ -+ /* Have we already been destroyed by a softirq or backlog? */ -+ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE) -+ goto out; -+ -+ /* This is a (useful) BSD violating of the RFC. There is a -+ * problem with TCP as specified in that the other end could -+ * keep a socket open forever with no application left this end. -+ * We use a 3 minute timeout (about the same as BSD) then kill -+ * our end. If they send after that then tough - BUT: long enough -+ * that we won't make the old 4*rto = almost no time - whoops -+ * reset mistake. -+ * -+ * Nope, it was not mistake. It is really desired behaviour -+ * f.e. on http servers, when such sockets are useless, but -+ * consume significant resources. Let's do it with special -+ * linger2 option. --ANK -+ */ -+ -+ if (meta_sk->sk_state == TCP_FIN_WAIT2) { -+ if (meta_tp->linger2 < 0) { -+ tcp_set_state(meta_sk, TCP_CLOSE); -+ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC); -+ NET_INC_STATS_BH(sock_net(meta_sk), -+ LINUX_MIB_TCPABORTONLINGER); -+ } else { -+ const int tmo = tcp_fin_time(meta_sk); -+ -+ if (tmo > TCP_TIMEWAIT_LEN) { -+ inet_csk_reset_keepalive_timer(meta_sk, -+ tmo - TCP_TIMEWAIT_LEN); -+ } else { -+ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, -+ tmo); -+ goto out; -+ } -+ } -+ } -+ if (meta_sk->sk_state != TCP_CLOSE) { -+ sk_mem_reclaim(meta_sk); -+ if (tcp_too_many_orphans(meta_sk, 0)) { -+ if (net_ratelimit()) -+ pr_info("MPTCP: too many of orphaned sockets\n"); -+ tcp_set_state(meta_sk, TCP_CLOSE); -+ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC); -+ NET_INC_STATS_BH(sock_net(meta_sk), -+ LINUX_MIB_TCPABORTONMEMORY); -+ } -+ } -+ -+ -+ if (meta_sk->sk_state == TCP_CLOSE) -+ inet_csk_destroy_sock(meta_sk); -+ /* Otherwise, socket is reprieved until protocol close. */ -+ -+out: -+ bh_unlock_sock(meta_sk); -+ local_bh_enable(); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ sock_put(meta_sk); /* Taken by sock_hold */ -+} -+ -+void mptcp_disconnect(struct sock *sk) -+{ -+ struct sock *subsk, *tmpsk; -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ mptcp_delete_synack_timer(sk); -+ -+ __skb_queue_purge(&tp->mpcb->reinject_queue); -+ -+ if (tp->inside_tk_table) { -+ mptcp_hash_remove_bh(tp); -+ reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue); -+ } -+ -+ local_bh_disable(); -+ mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) { -+ /* The socket will get removed from the subsocket-list -+ * and made non-mptcp by setting mpc to 0. -+ * -+ * This is necessary, because tcp_disconnect assumes -+ * that the connection is completly dead afterwards. -+ * Thus we need to do a mptcp_del_sock. Due to this call -+ * we have to make it non-mptcp. -+ * -+ * We have to lock the socket, because we set mpc to 0. -+ * An incoming packet would take the subsocket's lock -+ * and go on into the receive-path. -+ * This would be a race. -+ */ -+ -+ bh_lock_sock(subsk); -+ mptcp_del_sock(subsk); -+ tcp_sk(subsk)->mpc = 0; -+ tcp_sk(subsk)->ops = &tcp_specific; -+ mptcp_sub_force_close(subsk); -+ bh_unlock_sock(subsk); -+ } -+ local_bh_enable(); -+ -+ tp->was_meta_sk = 1; -+ tp->mpc = 0; -+ tp->ops = &tcp_specific; -+} -+ -+ -+/* Returns 1 if we should enable MPTCP for that socket. */ -+int mptcp_doit(struct sock *sk) -+{ -+ /* Do not allow MPTCP enabling if the MPTCP initialization failed */ -+ if (mptcp_init_failed) -+ return 0; -+ -+ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled) -+ return 0; -+ -+ /* Socket may already be established (e.g., called from tcp_recvmsg) */ -+ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->request_mptcp) -+ return 1; -+ -+ /* Don't do mptcp over loopback */ -+ if (sk->sk_family == AF_INET && -+ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) || -+ ipv4_is_loopback(inet_sk(sk)->inet_saddr))) -+ return 0; -+#if IS_ENABLED(CONFIG_IPV6) -+ if (sk->sk_family == AF_INET6 && -+ (ipv6_addr_loopback(&sk->sk_v6_daddr) || -+ ipv6_addr_loopback(&inet6_sk(sk)->saddr))) -+ return 0; -+#endif -+ if (mptcp_v6_is_v4_mapped(sk) && -+ ipv4_is_loopback(inet_sk(sk)->inet_saddr)) -+ return 0; -+ -+#ifdef CONFIG_TCP_MD5SIG -+ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */ -+ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk)) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window) -+{ -+ struct tcp_sock *master_tp; -+ struct sock *master_sk; -+ -+ if (mptcp_alloc_mpcb(meta_sk, remote_key, window)) -+ goto err_alloc_mpcb; -+ -+ master_sk = tcp_sk(meta_sk)->mpcb->master_sk; -+ master_tp = tcp_sk(master_sk); -+ -+ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC)) -+ goto err_add_sock; -+ -+ if (__inet_inherit_port(meta_sk, master_sk) < 0) -+ goto err_add_sock; -+ -+ meta_sk->sk_prot->unhash(meta_sk); -+ -+ if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk)) -+ __inet_hash_nolisten(master_sk, NULL); -+#if IS_ENABLED(CONFIG_IPV6) -+ else -+ __inet6_hash(master_sk, NULL); -+#endif -+ -+ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd; -+ -+ return 0; -+ -+err_add_sock: -+ mptcp_fallback_meta_sk(meta_sk); -+ -+ inet_csk_prepare_forced_close(master_sk); -+ tcp_done(master_sk); -+ inet_csk_prepare_forced_close(meta_sk); -+ tcp_done(meta_sk); -+ -+err_alloc_mpcb: -+ return -ENOBUFS; -+} -+ -+static int __mptcp_check_req_master(struct sock *child, -+ struct request_sock *req) -+{ -+ struct tcp_sock *child_tp = tcp_sk(child); -+ struct sock *meta_sk = child; -+ struct mptcp_cb *mpcb; -+ struct mptcp_request_sock *mtreq; -+ -+ /* Never contained an MP_CAPABLE */ -+ if (!inet_rsk(req)->mptcp_rqsk) -+ return 1; -+ -+ if (!inet_rsk(req)->saw_mpc) { -+ /* Fallback to regular TCP, because we saw one SYN without -+ * MP_CAPABLE. In tcp_check_req we continue the regular path. -+ * But, the socket has been added to the reqsk_tk_htb, so we -+ * must still remove it. -+ */ -+ mptcp_reqsk_remove_tk(req); -+ return 1; -+ } -+ -+ /* Just set this values to pass them to mptcp_alloc_mpcb */ -+ mtreq = mptcp_rsk(req); -+ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key; -+ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token; -+ -+ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key, -+ child_tp->snd_wnd)) -+ return -ENOBUFS; -+ -+ child = tcp_sk(child)->mpcb->master_sk; -+ child_tp = tcp_sk(child); -+ mpcb = child_tp->mpcb; -+ -+ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; -+ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; -+ -+ mpcb->dss_csum = mtreq->dss_csum; -+ mpcb->server_side = 1; -+ -+ /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */ -+ mptcp_update_metasocket(child, meta_sk); -+ -+ /* Needs to be done here additionally, because when accepting a -+ * new connection we pass by __reqsk_free and not reqsk_free. -+ */ -+ mptcp_reqsk_remove_tk(req); -+ -+ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */ -+ sock_put(meta_sk); -+ -+ return 0; -+} -+ -+int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req) -+{ -+ struct sock *meta_sk = child, *master_sk; -+ struct sk_buff *skb; -+ u32 new_mapping; -+ int ret; -+ -+ ret = __mptcp_check_req_master(child, req); -+ if (ret) -+ return ret; -+ -+ master_sk = tcp_sk(meta_sk)->mpcb->master_sk; -+ -+ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have -+ * pre-MPTCP data in the receive queue. -+ */ -+ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt - -+ tcp_rsk(req)->rcv_isn - 1; -+ -+ /* Map subflow sequence number to data sequence numbers. We need to map -+ * these data to [IDSN - len - 1, IDSN[. -+ */ -+ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1; -+ -+ /* There should be only one skb: the SYN + data. */ -+ skb_queue_walk(&meta_sk->sk_receive_queue, skb) { -+ TCP_SKB_CB(skb)->seq += new_mapping; -+ TCP_SKB_CB(skb)->end_seq += new_mapping; -+ } -+ -+ /* With fastopen we change the semantics of the relative subflow -+ * sequence numbers to deal with middleboxes that could add/remove -+ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1 -+ * instead of the regular TCP ISN. -+ */ -+ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1; -+ -+ /* We need to update copied_seq of the master_sk to account for the -+ * already moved data to the meta receive queue. -+ */ -+ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt; -+ -+ /* Handled by the master_sk */ -+ tcp_sk(meta_sk)->fastopen_rsk = NULL; -+ -+ return 0; -+} -+ -+int mptcp_check_req_master(struct sock *sk, struct sock *child, -+ struct request_sock *req, -+ struct request_sock **prev) -+{ -+ struct sock *meta_sk = child; -+ int ret; -+ -+ ret = __mptcp_check_req_master(child, req); -+ if (ret) -+ return ret; -+ -+ inet_csk_reqsk_queue_unlink(sk, req, prev); -+ inet_csk_reqsk_queue_removed(sk, req); -+ inet_csk_reqsk_queue_add(sk, req, meta_sk); -+ -+ return 0; -+} -+ -+struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child, -+ struct request_sock *req, -+ struct request_sock **prev, -+ const struct mptcp_options_received *mopt) -+{ -+ struct tcp_sock *child_tp = tcp_sk(child); -+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ u8 hash_mac_check[20]; -+ -+ child_tp->inside_tk_table = 0; -+ -+ if (!mopt->join_ack) -+ goto teardown; -+ -+ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, -+ (u8 *)&mpcb->mptcp_loc_key, -+ (u8 *)&mtreq->mptcp_rem_nonce, -+ (u8 *)&mtreq->mptcp_loc_nonce, -+ (u32 *)hash_mac_check); -+ -+ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) -+ goto teardown; -+ -+ /* Point it to the same struct socket and wq as the meta_sk */ -+ sk_set_socket(child, meta_sk->sk_socket); -+ child->sk_wq = meta_sk->sk_wq; -+ -+ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) { -+ /* Has been inherited, but now child_tp->mptcp is NULL */ -+ child_tp->mpc = 0; -+ child_tp->ops = &tcp_specific; -+ -+ /* TODO when we support acking the third ack for new subflows, -+ * we should silently discard this third ack, by returning NULL. -+ * -+ * Maybe, at the retransmission we will have enough memory to -+ * fully add the socket to the meta-sk. -+ */ -+ goto teardown; -+ } -+ -+ /* The child is a clone of the meta socket, we must now reset -+ * some of the fields -+ */ -+ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio; -+ -+ /* We should allow proper increase of the snd/rcv-buffers. Thus, we -+ * use the original values instead of the bloated up ones from the -+ * clone. -+ */ -+ child->sk_sndbuf = mpcb->orig_sk_sndbuf; -+ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf; -+ -+ child_tp->mptcp->slave_sk = 1; -+ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; -+ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; -+ child_tp->mptcp->init_rcv_wnd = req->rcv_wnd; -+ -+ child_tp->tsq_flags = 0; -+ -+ /* Subflows do not use the accept queue, as they -+ * are attached immediately to the mpcb. -+ */ -+ inet_csk_reqsk_queue_unlink(meta_sk, req, prev); -+ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); -+ reqsk_free(req); -+ return child; -+ -+teardown: -+ /* Drop this request - sock creation failed. */ -+ inet_csk_reqsk_queue_unlink(meta_sk, req, prev); -+ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); -+ reqsk_free(req); -+ inet_csk_prepare_forced_close(child); -+ tcp_done(child); -+ return meta_sk; -+} -+ -+int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw) -+{ -+ struct mptcp_tw *mptw; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct mptcp_cb *mpcb = tp->mpcb; -+ -+ /* A subsocket in tw can only receive data. So, if we are in -+ * infinite-receive, then we should not reply with a data-ack or act -+ * upon general MPTCP-signaling. We prevent this by simply not creating -+ * the mptcp_tw_sock. -+ */ -+ if (mpcb->infinite_mapping_rcv) { -+ tw->mptcp_tw = NULL; -+ return 0; -+ } -+ -+ /* Alloc MPTCP-tw-sock */ -+ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC); -+ if (!mptw) -+ return -ENOBUFS; -+ -+ atomic_inc(&mpcb->mpcb_refcnt); -+ -+ tw->mptcp_tw = mptw; -+ mptw->loc_key = mpcb->mptcp_loc_key; -+ mptw->meta_tw = mpcb->in_time_wait; -+ if (mptw->meta_tw) { -+ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); -+ if (mpcb->mptw_state != TCP_TIME_WAIT) -+ mptw->rcv_nxt++; -+ } -+ rcu_assign_pointer(mptw->mpcb, mpcb); -+ -+ spin_lock(&mpcb->tw_lock); -+ list_add_rcu(&mptw->list, &tp->mpcb->tw_list); -+ mptw->in_list = 1; -+ spin_unlock(&mpcb->tw_lock); -+ -+ return 0; -+} -+ -+void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) -+{ -+ struct mptcp_cb *mpcb; -+ -+ rcu_read_lock(); -+ mpcb = rcu_dereference(tw->mptcp_tw->mpcb); -+ -+ /* If we are still holding a ref to the mpcb, we have to remove ourself -+ * from the list and drop the ref properly. -+ */ -+ if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) { -+ spin_lock(&mpcb->tw_lock); -+ if (tw->mptcp_tw->in_list) { -+ list_del_rcu(&tw->mptcp_tw->list); -+ tw->mptcp_tw->in_list = 0; -+ } -+ spin_unlock(&mpcb->tw_lock); -+ -+ /* Twice, because we increased it above */ -+ mptcp_mpcb_put(mpcb); -+ mptcp_mpcb_put(mpcb); -+ } -+ -+ rcu_read_unlock(); -+ -+ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw); -+} -+ -+/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a -+ * data-fin. -+ */ -+void mptcp_time_wait(struct sock *sk, int state, int timeo) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct mptcp_tw *mptw; -+ -+ /* Used for sockets that go into tw after the meta -+ * (see mptcp_init_tw_sock()) -+ */ -+ tp->mpcb->in_time_wait = 1; -+ tp->mpcb->mptw_state = state; -+ -+ /* Update the time-wait-sock's information */ -+ rcu_read_lock_bh(); -+ list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) { -+ mptw->meta_tw = 1; -+ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp); -+ -+ /* We want to ack a DATA_FIN, but are yet in FIN_WAIT_2 - -+ * pretend as if the DATA_FIN has already reached us, that way -+ * the checks in tcp_timewait_state_process will be good as the -+ * DATA_FIN comes in. -+ */ -+ if (state != TCP_TIME_WAIT) -+ mptw->rcv_nxt++; -+ } -+ rcu_read_unlock_bh(); -+ -+ tcp_done(sk); -+} -+ -+void mptcp_tsq_flags(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ -+ /* It will be handled as a regular deferred-call */ -+ if (is_meta_sk(sk)) -+ return; -+ -+ if (hlist_unhashed(&tp->mptcp->cb_list)) { -+ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list); -+ /* We need to hold it here, as the sock_hold is not assured -+ * by the release_sock as it is done in regular TCP. -+ * -+ * The subsocket may get inet_csk_destroy'd while it is inside -+ * the callback_list. -+ */ -+ sock_hold(sk); -+ } -+ -+ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags)) -+ sock_hold(meta_sk); -+} -+ -+void mptcp_tsq_sub_deferred(struct sock *meta_sk) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct mptcp_tcp_sock *mptcp; -+ struct hlist_node *tmp; -+ -+ BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk); -+ -+ __sock_put(meta_sk); -+ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) { -+ struct tcp_sock *tp = mptcp->tp; -+ struct sock *sk = (struct sock *)tp; -+ -+ hlist_del_init(&mptcp->cb_list); -+ sk->sk_prot->release_cb(sk); -+ /* Final sock_put (cfr. mptcp_tsq_flags */ -+ sock_put(sk); -+ } -+} -+ -+void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req, -+ struct sk_buff *skb) -+{ -+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); -+ struct mptcp_options_received mopt; -+ u8 mptcp_hash_mac[20]; -+ -+ mptcp_init_mp_opt(&mopt); -+ tcp_parse_mptcp_options(skb, &mopt); -+ -+ mtreq = mptcp_rsk(req); -+ mtreq->mptcp_mpcb = mpcb; -+ mtreq->is_sub = 1; -+ inet_rsk(req)->mptcp_rqsk = 1; -+ -+ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce; -+ -+ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, -+ (u8 *)&mpcb->mptcp_rem_key, -+ (u8 *)&mtreq->mptcp_loc_nonce, -+ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac); -+ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac; -+ -+ mtreq->rem_id = mopt.rem_id; -+ mtreq->rcv_low_prio = mopt.low_prio; -+ inet_rsk(req)->saw_mpc = 1; -+} -+ -+void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb) -+{ -+ struct mptcp_options_received mopt; -+ struct mptcp_request_sock *mreq = mptcp_rsk(req); -+ -+ mptcp_init_mp_opt(&mopt); -+ tcp_parse_mptcp_options(skb, &mopt); -+ -+ mreq->is_sub = 0; -+ inet_rsk(req)->mptcp_rqsk = 1; -+ mreq->dss_csum = mopt.dss_csum; -+ mreq->hash_entry.pprev = NULL; -+ -+ mptcp_reqsk_new_mptcp(req, &mopt, skb); -+} -+ -+int mptcp_conn_request(struct sock *sk, struct sk_buff *skb) -+{ -+ struct mptcp_options_received mopt; -+ const struct tcp_sock *tp = tcp_sk(sk); -+ __u32 isn = TCP_SKB_CB(skb)->when; -+ bool want_cookie = false; -+ -+ if ((sysctl_tcp_syncookies == 2 || -+ inet_csk_reqsk_queue_is_full(sk)) && !isn) { -+ want_cookie = tcp_syn_flood_action(sk, skb, -+ mptcp_request_sock_ops.slab_name); -+ if (!want_cookie) -+ goto drop; -+ } -+ -+ mptcp_init_mp_opt(&mopt); -+ tcp_parse_mptcp_options(skb, &mopt); -+ -+ if (mopt.is_mp_join) -+ return mptcp_do_join_short(skb, &mopt, sock_net(sk)); -+ if (mopt.drop_me) -+ goto drop; -+ -+ if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled) -+ mopt.saw_mpc = 0; -+ -+ if (skb->protocol == htons(ETH_P_IP)) { -+ if (mopt.saw_mpc && !want_cookie) { -+ if (skb_rtable(skb)->rt_flags & -+ (RTCF_BROADCAST | RTCF_MULTICAST)) -+ goto drop; -+ -+ return tcp_conn_request(&mptcp_request_sock_ops, -+ &mptcp_request_sock_ipv4_ops, -+ sk, skb); -+ } -+ -+ return tcp_v4_conn_request(sk, skb); -+#if IS_ENABLED(CONFIG_IPV6) -+ } else { -+ if (mopt.saw_mpc && !want_cookie) { -+ if (!ipv6_unicast_destination(skb)) -+ goto drop; -+ -+ return tcp_conn_request(&mptcp6_request_sock_ops, -+ &mptcp_request_sock_ipv6_ops, -+ sk, skb); -+ } -+ -+ return tcp_v6_conn_request(sk, skb); -+#endif -+ } -+drop: -+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); -+ return 0; -+} -+ -+struct workqueue_struct *mptcp_wq; -+EXPORT_SYMBOL(mptcp_wq); -+ -+/* Output /proc/net/mptcp */ -+static int mptcp_pm_seq_show(struct seq_file *seq, void *v) -+{ -+ struct tcp_sock *meta_tp; -+ const struct net *net = seq->private; -+ int i, n = 0; -+ -+ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode"); -+ seq_putc(seq, '\n'); -+ -+ for (i = 0; i < MPTCP_HASH_SIZE; i++) { -+ struct hlist_nulls_node *node; -+ rcu_read_lock_bh(); -+ hlist_nulls_for_each_entry_rcu(meta_tp, node, -+ &tk_hashtable[i], tk_table) { -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ struct sock *meta_sk = (struct sock *)meta_tp; -+ struct inet_sock *isk = inet_sk(meta_sk); -+ -+ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk))) -+ continue; -+ -+ if (capable(CAP_NET_ADMIN)) { -+ seq_printf(seq, "%4d: %04X %04X ", n++, -+ mpcb->mptcp_loc_token, -+ mpcb->mptcp_rem_token); -+ } else { -+ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1); -+ } -+ if (meta_sk->sk_family == AF_INET || -+ mptcp_v6_is_v4_mapped(meta_sk)) { -+ seq_printf(seq, " 0 %08X:%04X %08X:%04X ", -+ isk->inet_rcv_saddr, -+ ntohs(isk->inet_sport), -+ isk->inet_daddr, -+ ntohs(isk->inet_dport)); -+#if IS_ENABLED(CONFIG_IPV6) -+ } else if (meta_sk->sk_family == AF_INET6) { -+ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr; -+ struct in6_addr *dst = &meta_sk->sk_v6_daddr; -+ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X", -+ src->s6_addr32[0], src->s6_addr32[1], -+ src->s6_addr32[2], src->s6_addr32[3], -+ ntohs(isk->inet_sport), -+ dst->s6_addr32[0], dst->s6_addr32[1], -+ dst->s6_addr32[2], dst->s6_addr32[3], -+ ntohs(isk->inet_dport)); -+#endif -+ } -+ seq_printf(seq, " %02X %02X %08X:%08X %lu", -+ meta_sk->sk_state, mpcb->cnt_subflows, -+ meta_tp->write_seq - meta_tp->snd_una, -+ max_t(int, meta_tp->rcv_nxt - -+ meta_tp->copied_seq, 0), -+ sock_i_ino(meta_sk)); -+ seq_putc(seq, '\n'); -+ } -+ -+ rcu_read_unlock_bh(); -+ } -+ -+ return 0; -+} -+ -+static int mptcp_pm_seq_open(struct inode *inode, struct file *file) -+{ -+ return single_open_net(inode, file, mptcp_pm_seq_show); -+} -+ -+static const struct file_operations mptcp_pm_seq_fops = { -+ .owner = THIS_MODULE, -+ .open = mptcp_pm_seq_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = single_release_net, -+}; -+ -+static int mptcp_pm_init_net(struct net *net) -+{ -+ if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops)) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+static void mptcp_pm_exit_net(struct net *net) -+{ -+ remove_proc_entry("mptcp", net->proc_net); -+} -+ -+static struct pernet_operations mptcp_pm_proc_ops = { -+ .init = mptcp_pm_init_net, -+ .exit = mptcp_pm_exit_net, -+}; -+ -+/* General initialization of mptcp */ -+void __init mptcp_init(void) -+{ -+ int i; -+ struct ctl_table_header *mptcp_sysctl; -+ -+ mptcp_sock_cache = kmem_cache_create("mptcp_sock", -+ sizeof(struct mptcp_tcp_sock), -+ 0, SLAB_HWCACHE_ALIGN, -+ NULL); -+ if (!mptcp_sock_cache) -+ goto mptcp_sock_cache_failed; -+ -+ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb), -+ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, -+ NULL); -+ if (!mptcp_cb_cache) -+ goto mptcp_cb_cache_failed; -+ -+ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw), -+ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, -+ NULL); -+ if (!mptcp_tw_cache) -+ goto mptcp_tw_cache_failed; -+ -+ get_random_bytes(mptcp_secret, sizeof(mptcp_secret)); -+ -+ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); -+ if (!mptcp_wq) -+ goto alloc_workqueue_failed; -+ -+ for (i = 0; i < MPTCP_HASH_SIZE; i++) { -+ INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i); -+ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_htb[i], -+ i + MPTCP_REQSK_NULLS_BASE); -+ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i); -+ } -+ -+ spin_lock_init(&mptcp_reqsk_hlock); -+ spin_lock_init(&mptcp_tk_hashlock); -+ -+ if (register_pernet_subsys(&mptcp_pm_proc_ops)) -+ goto pernet_failed; -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ if (mptcp_pm_v6_init()) -+ goto mptcp_pm_v6_failed; -+#endif -+ if (mptcp_pm_v4_init()) -+ goto mptcp_pm_v4_failed; -+ -+ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table); -+ if (!mptcp_sysctl) -+ goto register_sysctl_failed; -+ -+ if (mptcp_register_path_manager(&mptcp_pm_default)) -+ goto register_pm_failed; -+ -+ if (mptcp_register_scheduler(&mptcp_sched_default)) -+ goto register_sched_failed; -+ -+ pr_info("MPTCP: Stable release v0.89.0-rc"); -+ -+ mptcp_init_failed = false; -+ -+ return; -+ -+register_sched_failed: -+ mptcp_unregister_path_manager(&mptcp_pm_default); -+register_pm_failed: -+ unregister_net_sysctl_table(mptcp_sysctl); -+register_sysctl_failed: -+ mptcp_pm_v4_undo(); -+mptcp_pm_v4_failed: -+#if IS_ENABLED(CONFIG_IPV6) -+ mptcp_pm_v6_undo(); -+mptcp_pm_v6_failed: -+#endif -+ unregister_pernet_subsys(&mptcp_pm_proc_ops); -+pernet_failed: -+ destroy_workqueue(mptcp_wq); -+alloc_workqueue_failed: -+ kmem_cache_destroy(mptcp_tw_cache); -+mptcp_tw_cache_failed: -+ kmem_cache_destroy(mptcp_cb_cache); -+mptcp_cb_cache_failed: -+ kmem_cache_destroy(mptcp_sock_cache); -+mptcp_sock_cache_failed: -+ mptcp_init_failed = true; -+} -diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c -new file mode 100644 -index 000000000000..3a54413ce25b ---- /dev/null -+++ b/net/mptcp/mptcp_fullmesh.c -@@ -0,0 +1,1722 @@ -+#include -+ -+#include -+#include -+ -+#if IS_ENABLED(CONFIG_IPV6) -+#include -+#include -+#endif -+ -+enum { -+ MPTCP_EVENT_ADD = 1, -+ MPTCP_EVENT_DEL, -+ MPTCP_EVENT_MOD, -+}; -+ -+#define MPTCP_SUBFLOW_RETRY_DELAY 1000 -+ -+/* Max number of local or remote addresses we can store. -+ * When changing, see the bitfield below in fullmesh_rem4/6. -+ */ -+#define MPTCP_MAX_ADDR 8 -+ -+struct fullmesh_rem4 { -+ u8 rem4_id; -+ u8 bitfield; -+ u8 retry_bitfield; -+ __be16 port; -+ struct in_addr addr; -+}; -+ -+struct fullmesh_rem6 { -+ u8 rem6_id; -+ u8 bitfield; -+ u8 retry_bitfield; -+ __be16 port; -+ struct in6_addr addr; -+}; -+ -+struct mptcp_loc_addr { -+ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR]; -+ u8 loc4_bits; -+ u8 next_v4_index; -+ -+ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR]; -+ u8 loc6_bits; -+ u8 next_v6_index; -+}; -+ -+struct mptcp_addr_event { -+ struct list_head list; -+ unsigned short family; -+ u8 code:7, -+ low_prio:1; -+ union inet_addr addr; -+}; -+ -+struct fullmesh_priv { -+ /* Worker struct for subflow establishment */ -+ struct work_struct subflow_work; -+ /* Delayed worker, when the routing-tables are not yet ready. */ -+ struct delayed_work subflow_retry_work; -+ -+ /* Remote addresses */ -+ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR]; -+ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR]; -+ -+ struct mptcp_cb *mpcb; -+ -+ u16 remove_addrs; /* Addresses to remove */ -+ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */ -+ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */ -+ -+ u8 add_addr; /* Are we sending an add_addr? */ -+ -+ u8 rem4_bits; -+ u8 rem6_bits; -+}; -+ -+struct mptcp_fm_ns { -+ struct mptcp_loc_addr __rcu *local; -+ spinlock_t local_lock; /* Protecting the above pointer */ -+ struct list_head events; -+ struct delayed_work address_worker; -+ -+ struct net *net; -+}; -+ -+static struct mptcp_pm_ops full_mesh __read_mostly; -+ -+static void full_mesh_create_subflows(struct sock *meta_sk); -+ -+static struct mptcp_fm_ns *fm_get_ns(const struct net *net) -+{ -+ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH]; -+} -+ -+static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb) -+{ -+ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; -+} -+ -+/* Find the first free index in the bitfield */ -+static int __mptcp_find_free_index(u8 bitfield, u8 base) -+{ -+ int i; -+ -+ /* There are anyways no free bits... */ -+ if (bitfield == 0xff) -+ goto exit; -+ -+ i = ffs(~(bitfield >> base)) - 1; -+ if (i < 0) -+ goto exit; -+ -+ /* No free bits when starting at base, try from 0 on */ -+ if (i + base >= sizeof(bitfield) * 8) -+ return __mptcp_find_free_index(bitfield, 0); -+ -+ return i + base; -+exit: -+ return -1; -+} -+ -+static int mptcp_find_free_index(u8 bitfield) -+{ -+ return __mptcp_find_free_index(bitfield, 0); -+} -+ -+static void mptcp_addv4_raddr(struct mptcp_cb *mpcb, -+ const struct in_addr *addr, -+ __be16 port, u8 id) -+{ -+ int i; -+ struct fullmesh_rem4 *rem4; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ -+ mptcp_for_each_bit_set(fmp->rem4_bits, i) { -+ rem4 = &fmp->remaddr4[i]; -+ -+ /* Address is already in the list --- continue */ -+ if (rem4->rem4_id == id && -+ rem4->addr.s_addr == addr->s_addr && rem4->port == port) -+ return; -+ -+ /* This may be the case, when the peer is behind a NAT. He is -+ * trying to JOIN, thus sending the JOIN with a certain ID. -+ * However the src_addr of the IP-packet has been changed. We -+ * update the addr in the list, because this is the address as -+ * OUR BOX sees it. -+ */ -+ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) { -+ /* update the address */ -+ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n", -+ __func__, &rem4->addr.s_addr, -+ &addr->s_addr, id); -+ rem4->addr.s_addr = addr->s_addr; -+ rem4->port = port; -+ mpcb->list_rcvd = 1; -+ return; -+ } -+ } -+ -+ i = mptcp_find_free_index(fmp->rem4_bits); -+ /* Do we have already the maximum number of local/remote addresses? */ -+ if (i < 0) { -+ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n", -+ __func__, MPTCP_MAX_ADDR, &addr->s_addr); -+ return; -+ } -+ -+ rem4 = &fmp->remaddr4[i]; -+ -+ /* Address is not known yet, store it */ -+ rem4->addr.s_addr = addr->s_addr; -+ rem4->port = port; -+ rem4->bitfield = 0; -+ rem4->retry_bitfield = 0; -+ rem4->rem4_id = id; -+ mpcb->list_rcvd = 1; -+ fmp->rem4_bits |= (1 << i); -+ -+ return; -+} -+ -+static void mptcp_addv6_raddr(struct mptcp_cb *mpcb, -+ const struct in6_addr *addr, -+ __be16 port, u8 id) -+{ -+ int i; -+ struct fullmesh_rem6 *rem6; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ -+ mptcp_for_each_bit_set(fmp->rem6_bits, i) { -+ rem6 = &fmp->remaddr6[i]; -+ -+ /* Address is already in the list --- continue */ -+ if (rem6->rem6_id == id && -+ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port) -+ return; -+ -+ /* This may be the case, when the peer is behind a NAT. He is -+ * trying to JOIN, thus sending the JOIN with a certain ID. -+ * However the src_addr of the IP-packet has been changed. We -+ * update the addr in the list, because this is the address as -+ * OUR BOX sees it. -+ */ -+ if (rem6->rem6_id == id) { -+ /* update the address */ -+ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n", -+ __func__, &rem6->addr, addr, id); -+ rem6->addr = *addr; -+ rem6->port = port; -+ mpcb->list_rcvd = 1; -+ return; -+ } -+ } -+ -+ i = mptcp_find_free_index(fmp->rem6_bits); -+ /* Do we have already the maximum number of local/remote addresses? */ -+ if (i < 0) { -+ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n", -+ __func__, MPTCP_MAX_ADDR, addr); -+ return; -+ } -+ -+ rem6 = &fmp->remaddr6[i]; -+ -+ /* Address is not known yet, store it */ -+ rem6->addr = *addr; -+ rem6->port = port; -+ rem6->bitfield = 0; -+ rem6->retry_bitfield = 0; -+ rem6->rem6_id = id; -+ mpcb->list_rcvd = 1; -+ fmp->rem6_bits |= (1 << i); -+ -+ return; -+} -+ -+static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id) -+{ -+ int i; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ -+ mptcp_for_each_bit_set(fmp->rem4_bits, i) { -+ if (fmp->remaddr4[i].rem4_id == id) { -+ /* remove address from bitfield */ -+ fmp->rem4_bits &= ~(1 << i); -+ -+ break; -+ } -+ } -+} -+ -+static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id) -+{ -+ int i; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ -+ mptcp_for_each_bit_set(fmp->rem6_bits, i) { -+ if (fmp->remaddr6[i].rem6_id == id) { -+ /* remove address from bitfield */ -+ fmp->rem6_bits &= ~(1 << i); -+ -+ break; -+ } -+ } -+} -+ -+/* Sets the bitfield of the remote-address field */ -+static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb, -+ const struct in_addr *addr, u8 index) -+{ -+ int i; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ -+ mptcp_for_each_bit_set(fmp->rem4_bits, i) { -+ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) { -+ fmp->remaddr4[i].bitfield |= (1 << index); -+ return; -+ } -+ } -+} -+ -+/* Sets the bitfield of the remote-address field */ -+static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb, -+ const struct in6_addr *addr, u8 index) -+{ -+ int i; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ -+ mptcp_for_each_bit_set(fmp->rem6_bits, i) { -+ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) { -+ fmp->remaddr6[i].bitfield |= (1 << index); -+ return; -+ } -+ } -+} -+ -+static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb, -+ const union inet_addr *addr, -+ sa_family_t family, u8 id) -+{ -+ if (family == AF_INET) -+ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id); -+ else -+ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id); -+} -+ -+static void retry_subflow_worker(struct work_struct *work) -+{ -+ struct delayed_work *delayed_work = container_of(work, -+ struct delayed_work, -+ work); -+ struct fullmesh_priv *fmp = container_of(delayed_work, -+ struct fullmesh_priv, -+ subflow_retry_work); -+ struct mptcp_cb *mpcb = fmp->mpcb; -+ struct sock *meta_sk = mpcb->meta_sk; -+ struct mptcp_loc_addr *mptcp_local; -+ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); -+ int iter = 0, i; -+ -+ /* We need a local (stable) copy of the address-list. Really, it is not -+ * such a big deal, if the address-list is not 100% up-to-date. -+ */ -+ rcu_read_lock_bh(); -+ mptcp_local = rcu_dereference_bh(fm_ns->local); -+ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); -+ rcu_read_unlock_bh(); -+ -+ if (!mptcp_local) -+ return; -+ -+next_subflow: -+ if (iter) { -+ release_sock(meta_sk); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ -+ cond_resched(); -+ } -+ mutex_lock(&mpcb->mpcb_mutex); -+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); -+ -+ iter++; -+ -+ if (sock_flag(meta_sk, SOCK_DEAD)) -+ goto exit; -+ -+ mptcp_for_each_bit_set(fmp->rem4_bits, i) { -+ struct fullmesh_rem4 *rem = &fmp->remaddr4[i]; -+ /* Do we need to retry establishing a subflow ? */ -+ if (rem->retry_bitfield) { -+ int i = mptcp_find_free_index(~rem->retry_bitfield); -+ struct mptcp_rem4 rem4; -+ -+ rem->bitfield |= (1 << i); -+ rem->retry_bitfield &= ~(1 << i); -+ -+ rem4.addr = rem->addr; -+ rem4.port = rem->port; -+ rem4.rem4_id = rem->rem4_id; -+ -+ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4); -+ goto next_subflow; -+ } -+ } -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ mptcp_for_each_bit_set(fmp->rem6_bits, i) { -+ struct fullmesh_rem6 *rem = &fmp->remaddr6[i]; -+ -+ /* Do we need to retry establishing a subflow ? */ -+ if (rem->retry_bitfield) { -+ int i = mptcp_find_free_index(~rem->retry_bitfield); -+ struct mptcp_rem6 rem6; -+ -+ rem->bitfield |= (1 << i); -+ rem->retry_bitfield &= ~(1 << i); -+ -+ rem6.addr = rem->addr; -+ rem6.port = rem->port; -+ rem6.rem6_id = rem->rem6_id; -+ -+ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6); -+ goto next_subflow; -+ } -+ } -+#endif -+ -+exit: -+ kfree(mptcp_local); -+ release_sock(meta_sk); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ sock_put(meta_sk); -+} -+ -+/** -+ * Create all new subflows, by doing calls to mptcp_initX_subsockets -+ * -+ * This function uses a goto next_subflow, to allow releasing the lock between -+ * new subflows and giving other processes a chance to do some work on the -+ * socket and potentially finishing the communication. -+ **/ -+static void create_subflow_worker(struct work_struct *work) -+{ -+ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv, -+ subflow_work); -+ struct mptcp_cb *mpcb = fmp->mpcb; -+ struct sock *meta_sk = mpcb->meta_sk; -+ struct mptcp_loc_addr *mptcp_local; -+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); -+ int iter = 0, retry = 0; -+ int i; -+ -+ /* We need a local (stable) copy of the address-list. Really, it is not -+ * such a big deal, if the address-list is not 100% up-to-date. -+ */ -+ rcu_read_lock_bh(); -+ mptcp_local = rcu_dereference_bh(fm_ns->local); -+ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); -+ rcu_read_unlock_bh(); -+ -+ if (!mptcp_local) -+ return; -+ -+next_subflow: -+ if (iter) { -+ release_sock(meta_sk); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ -+ cond_resched(); -+ } -+ mutex_lock(&mpcb->mpcb_mutex); -+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); -+ -+ iter++; -+ -+ if (sock_flag(meta_sk, SOCK_DEAD)) -+ goto exit; -+ -+ if (mpcb->master_sk && -+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) -+ goto exit; -+ -+ mptcp_for_each_bit_set(fmp->rem4_bits, i) { -+ struct fullmesh_rem4 *rem; -+ u8 remaining_bits; -+ -+ rem = &fmp->remaddr4[i]; -+ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits; -+ -+ /* Are there still combinations to handle? */ -+ if (remaining_bits) { -+ int i = mptcp_find_free_index(~remaining_bits); -+ struct mptcp_rem4 rem4; -+ -+ rem->bitfield |= (1 << i); -+ -+ rem4.addr = rem->addr; -+ rem4.port = rem->port; -+ rem4.rem4_id = rem->rem4_id; -+ -+ /* If a route is not yet available then retry once */ -+ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], -+ &rem4) == -ENETUNREACH) -+ retry = rem->retry_bitfield |= (1 << i); -+ goto next_subflow; -+ } -+ } -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ mptcp_for_each_bit_set(fmp->rem6_bits, i) { -+ struct fullmesh_rem6 *rem; -+ u8 remaining_bits; -+ -+ rem = &fmp->remaddr6[i]; -+ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits; -+ -+ /* Are there still combinations to handle? */ -+ if (remaining_bits) { -+ int i = mptcp_find_free_index(~remaining_bits); -+ struct mptcp_rem6 rem6; -+ -+ rem->bitfield |= (1 << i); -+ -+ rem6.addr = rem->addr; -+ rem6.port = rem->port; -+ rem6.rem6_id = rem->rem6_id; -+ -+ /* If a route is not yet available then retry once */ -+ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], -+ &rem6) == -ENETUNREACH) -+ retry = rem->retry_bitfield |= (1 << i); -+ goto next_subflow; -+ } -+ } -+#endif -+ -+ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) { -+ sock_hold(meta_sk); -+ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work, -+ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY)); -+ } -+ -+exit: -+ kfree(mptcp_local); -+ release_sock(meta_sk); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ sock_put(meta_sk); -+} -+ -+static void announce_remove_addr(u8 addr_id, struct sock *meta_sk) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ struct sock *sk = mptcp_select_ack_sock(meta_sk); -+ -+ fmp->remove_addrs |= (1 << addr_id); -+ mpcb->addr_signal = 1; -+ -+ if (sk) -+ tcp_send_ack(sk); -+} -+ -+static void update_addr_bitfields(struct sock *meta_sk, -+ const struct mptcp_loc_addr *mptcp_local) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ int i; -+ -+ /* The bits in announced_addrs_* always match with loc*_bits. So, a -+ * simply & operation unsets the correct bits, because these go from -+ * announced to non-announced -+ */ -+ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits; -+ -+ mptcp_for_each_bit_set(fmp->rem4_bits, i) { -+ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits; -+ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits; -+ } -+ -+ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits; -+ -+ mptcp_for_each_bit_set(fmp->rem6_bits, i) { -+ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits; -+ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits; -+ } -+} -+ -+static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local, -+ sa_family_t family, const union inet_addr *addr) -+{ -+ int i; -+ u8 loc_bits; -+ bool found = false; -+ -+ if (family == AF_INET) -+ loc_bits = mptcp_local->loc4_bits; -+ else -+ loc_bits = mptcp_local->loc6_bits; -+ -+ mptcp_for_each_bit_set(loc_bits, i) { -+ if (family == AF_INET && -+ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) { -+ found = true; -+ break; -+ } -+ if (family == AF_INET6 && -+ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr, -+ &addr->in6)) { -+ found = true; -+ break; -+ } -+ } -+ -+ if (!found) -+ return -1; -+ -+ return i; -+} -+ -+static void mptcp_address_worker(struct work_struct *work) -+{ -+ const struct delayed_work *delayed_work = container_of(work, -+ struct delayed_work, -+ work); -+ struct mptcp_fm_ns *fm_ns = container_of(delayed_work, -+ struct mptcp_fm_ns, -+ address_worker); -+ struct net *net = fm_ns->net; -+ struct mptcp_addr_event *event = NULL; -+ struct mptcp_loc_addr *mptcp_local, *old; -+ int i, id = -1; /* id is used in the socket-code on a delete-event */ -+ bool success; /* Used to indicate if we succeeded handling the event */ -+ -+next_event: -+ success = false; -+ kfree(event); -+ -+ /* First, let's dequeue an event from our event-list */ -+ rcu_read_lock_bh(); -+ spin_lock(&fm_ns->local_lock); -+ -+ event = list_first_entry_or_null(&fm_ns->events, -+ struct mptcp_addr_event, list); -+ if (!event) { -+ spin_unlock(&fm_ns->local_lock); -+ rcu_read_unlock_bh(); -+ return; -+ } -+ -+ list_del(&event->list); -+ -+ mptcp_local = rcu_dereference_bh(fm_ns->local); -+ -+ if (event->code == MPTCP_EVENT_DEL) { -+ id = mptcp_find_address(mptcp_local, event->family, &event->addr); -+ -+ /* Not in the list - so we don't care */ -+ if (id < 0) { -+ mptcp_debug("%s could not find id\n", __func__); -+ goto duno; -+ } -+ -+ old = mptcp_local; -+ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), -+ GFP_ATOMIC); -+ if (!mptcp_local) -+ goto duno; -+ -+ if (event->family == AF_INET) -+ mptcp_local->loc4_bits &= ~(1 << id); -+ else -+ mptcp_local->loc6_bits &= ~(1 << id); -+ -+ rcu_assign_pointer(fm_ns->local, mptcp_local); -+ kfree(old); -+ } else { -+ int i = mptcp_find_address(mptcp_local, event->family, &event->addr); -+ int j = i; -+ -+ if (j < 0) { -+ /* Not in the list, so we have to find an empty slot */ -+ if (event->family == AF_INET) -+ i = __mptcp_find_free_index(mptcp_local->loc4_bits, -+ mptcp_local->next_v4_index); -+ if (event->family == AF_INET6) -+ i = __mptcp_find_free_index(mptcp_local->loc6_bits, -+ mptcp_local->next_v6_index); -+ -+ if (i < 0) { -+ mptcp_debug("%s no more space\n", __func__); -+ goto duno; -+ } -+ -+ /* It might have been a MOD-event. */ -+ event->code = MPTCP_EVENT_ADD; -+ } else { -+ /* Let's check if anything changes */ -+ if (event->family == AF_INET && -+ event->low_prio == mptcp_local->locaddr4[i].low_prio) -+ goto duno; -+ -+ if (event->family == AF_INET6 && -+ event->low_prio == mptcp_local->locaddr6[i].low_prio) -+ goto duno; -+ } -+ -+ old = mptcp_local; -+ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), -+ GFP_ATOMIC); -+ if (!mptcp_local) -+ goto duno; -+ -+ if (event->family == AF_INET) { -+ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr; -+ mptcp_local->locaddr4[i].loc4_id = i + 1; -+ mptcp_local->locaddr4[i].low_prio = event->low_prio; -+ } else { -+ mptcp_local->locaddr6[i].addr = event->addr.in6; -+ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR; -+ mptcp_local->locaddr6[i].low_prio = event->low_prio; -+ } -+ -+ if (j < 0) { -+ if (event->family == AF_INET) { -+ mptcp_local->loc4_bits |= (1 << i); -+ mptcp_local->next_v4_index = i + 1; -+ } else { -+ mptcp_local->loc6_bits |= (1 << i); -+ mptcp_local->next_v6_index = i + 1; -+ } -+ } -+ -+ rcu_assign_pointer(fm_ns->local, mptcp_local); -+ kfree(old); -+ } -+ success = true; -+ -+duno: -+ spin_unlock(&fm_ns->local_lock); -+ rcu_read_unlock_bh(); -+ -+ if (!success) -+ goto next_event; -+ -+ /* Now we iterate over the MPTCP-sockets and apply the event. */ -+ for (i = 0; i < MPTCP_HASH_SIZE; i++) { -+ const struct hlist_nulls_node *node; -+ struct tcp_sock *meta_tp; -+ -+ rcu_read_lock_bh(); -+ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i], -+ tk_table) { -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ struct sock *meta_sk = (struct sock *)meta_tp, *sk; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ bool meta_v4 = meta_sk->sk_family == AF_INET; -+ -+ if (sock_net(meta_sk) != net) -+ continue; -+ -+ if (meta_v4) { -+ /* skip IPv6 events if meta is IPv4 */ -+ if (event->family == AF_INET6) -+ continue; -+ } -+ /* skip IPv4 events if IPV6_V6ONLY is set */ -+ else if (event->family == AF_INET && -+ inet6_sk(meta_sk)->ipv6only) -+ continue; -+ -+ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) -+ continue; -+ -+ bh_lock_sock(meta_sk); -+ -+ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) || -+ mpcb->infinite_mapping_snd || -+ mpcb->infinite_mapping_rcv || -+ mpcb->send_infinite_mapping) -+ goto next; -+ -+ /* May be that the pm has changed in-between */ -+ if (mpcb->pm_ops != &full_mesh) -+ goto next; -+ -+ if (sock_owned_by_user(meta_sk)) { -+ if (!test_and_set_bit(MPTCP_PATH_MANAGER, -+ &meta_tp->tsq_flags)) -+ sock_hold(meta_sk); -+ -+ goto next; -+ } -+ -+ if (event->code == MPTCP_EVENT_ADD) { -+ fmp->add_addr++; -+ mpcb->addr_signal = 1; -+ -+ sk = mptcp_select_ack_sock(meta_sk); -+ if (sk) -+ tcp_send_ack(sk); -+ -+ full_mesh_create_subflows(meta_sk); -+ } -+ -+ if (event->code == MPTCP_EVENT_DEL) { -+ struct sock *sk, *tmpsk; -+ struct mptcp_loc_addr *mptcp_local; -+ bool found = false; -+ -+ mptcp_local = rcu_dereference_bh(fm_ns->local); -+ -+ /* In any case, we need to update our bitfields */ -+ if (id >= 0) -+ update_addr_bitfields(meta_sk, mptcp_local); -+ -+ /* Look for the socket and remove him */ -+ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { -+ if ((event->family == AF_INET6 && -+ (sk->sk_family == AF_INET || -+ mptcp_v6_is_v4_mapped(sk))) || -+ (event->family == AF_INET && -+ (sk->sk_family == AF_INET6 && -+ !mptcp_v6_is_v4_mapped(sk)))) -+ continue; -+ -+ if (event->family == AF_INET && -+ (sk->sk_family == AF_INET || -+ mptcp_v6_is_v4_mapped(sk)) && -+ inet_sk(sk)->inet_saddr != event->addr.in.s_addr) -+ continue; -+ -+ if (event->family == AF_INET6 && -+ sk->sk_family == AF_INET6 && -+ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) -+ continue; -+ -+ /* Reinject, so that pf = 1 and so we -+ * won't select this one as the -+ * ack-sock. -+ */ -+ mptcp_reinject_data(sk, 0); -+ -+ /* We announce the removal of this id */ -+ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk); -+ -+ mptcp_sub_force_close(sk); -+ found = true; -+ } -+ -+ if (found) -+ goto next; -+ -+ /* The id may have been given by the event, -+ * matching on a local address. And it may not -+ * have matched on one of the above sockets, -+ * because the client never created a subflow. -+ * So, we have to finally remove it here. -+ */ -+ if (id > 0) -+ announce_remove_addr(id, meta_sk); -+ } -+ -+ if (event->code == MPTCP_EVENT_MOD) { -+ struct sock *sk; -+ -+ mptcp_for_each_sk(mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ if (event->family == AF_INET && -+ (sk->sk_family == AF_INET || -+ mptcp_v6_is_v4_mapped(sk)) && -+ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) { -+ if (event->low_prio != tp->mptcp->low_prio) { -+ tp->mptcp->send_mp_prio = 1; -+ tp->mptcp->low_prio = event->low_prio; -+ -+ tcp_send_ack(sk); -+ } -+ } -+ -+ if (event->family == AF_INET6 && -+ sk->sk_family == AF_INET6 && -+ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) { -+ if (event->low_prio != tp->mptcp->low_prio) { -+ tp->mptcp->send_mp_prio = 1; -+ tp->mptcp->low_prio = event->low_prio; -+ -+ tcp_send_ack(sk); -+ } -+ } -+ } -+ } -+next: -+ bh_unlock_sock(meta_sk); -+ sock_put(meta_sk); -+ } -+ rcu_read_unlock_bh(); -+ } -+ goto next_event; -+} -+ -+static struct mptcp_addr_event *lookup_similar_event(const struct net *net, -+ const struct mptcp_addr_event *event) -+{ -+ struct mptcp_addr_event *eventq; -+ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); -+ -+ list_for_each_entry(eventq, &fm_ns->events, list) { -+ if (eventq->family != event->family) -+ continue; -+ if (event->family == AF_INET) { -+ if (eventq->addr.in.s_addr == event->addr.in.s_addr) -+ return eventq; -+ } else { -+ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6)) -+ return eventq; -+ } -+ } -+ return NULL; -+} -+ -+/* We already hold the net-namespace MPTCP-lock */ -+static void add_pm_event(struct net *net, const struct mptcp_addr_event *event) -+{ -+ struct mptcp_addr_event *eventq = lookup_similar_event(net, event); -+ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); -+ -+ if (eventq) { -+ switch (event->code) { -+ case MPTCP_EVENT_DEL: -+ mptcp_debug("%s del old_code %u\n", __func__, eventq->code); -+ list_del(&eventq->list); -+ kfree(eventq); -+ break; -+ case MPTCP_EVENT_ADD: -+ mptcp_debug("%s add old_code %u\n", __func__, eventq->code); -+ eventq->low_prio = event->low_prio; -+ eventq->code = MPTCP_EVENT_ADD; -+ return; -+ case MPTCP_EVENT_MOD: -+ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code); -+ eventq->low_prio = event->low_prio; -+ eventq->code = MPTCP_EVENT_MOD; -+ return; -+ } -+ } -+ -+ /* OK, we have to add the new address to the wait queue */ -+ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC); -+ if (!eventq) -+ return; -+ -+ list_add_tail(&eventq->list, &fm_ns->events); -+ -+ /* Create work-queue */ -+ if (!delayed_work_pending(&fm_ns->address_worker)) -+ queue_delayed_work(mptcp_wq, &fm_ns->address_worker, -+ msecs_to_jiffies(500)); -+} -+ -+static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event, -+ struct net *net) -+{ -+ const struct net_device *netdev = ifa->ifa_dev->dev; -+ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); -+ struct mptcp_addr_event mpevent; -+ -+ if (ifa->ifa_scope > RT_SCOPE_LINK || -+ ipv4_is_loopback(ifa->ifa_local)) -+ return; -+ -+ spin_lock_bh(&fm_ns->local_lock); -+ -+ mpevent.family = AF_INET; -+ mpevent.addr.in.s_addr = ifa->ifa_local; -+ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0; -+ -+ if (event == NETDEV_DOWN || !netif_running(netdev) || -+ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP)) -+ mpevent.code = MPTCP_EVENT_DEL; -+ else if (event == NETDEV_UP) -+ mpevent.code = MPTCP_EVENT_ADD; -+ else if (event == NETDEV_CHANGE) -+ mpevent.code = MPTCP_EVENT_MOD; -+ -+ mptcp_debug("%s created event for %pI4, code %u prio %u\n", __func__, -+ &ifa->ifa_local, mpevent.code, mpevent.low_prio); -+ add_pm_event(net, &mpevent); -+ -+ spin_unlock_bh(&fm_ns->local_lock); -+ return; -+} -+ -+/* React on IPv4-addr add/rem-events */ -+static int mptcp_pm_inetaddr_event(struct notifier_block *this, -+ unsigned long event, void *ptr) -+{ -+ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; -+ struct net *net = dev_net(ifa->ifa_dev->dev); -+ -+ if (!(event == NETDEV_UP || event == NETDEV_DOWN || -+ event == NETDEV_CHANGE)) -+ return NOTIFY_DONE; -+ -+ addr4_event_handler(ifa, event, net); -+ -+ return NOTIFY_DONE; -+} -+ -+static struct notifier_block mptcp_pm_inetaddr_notifier = { -+ .notifier_call = mptcp_pm_inetaddr_event, -+}; -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ -+/* IPV6-related address/interface watchers */ -+struct mptcp_dad_data { -+ struct timer_list timer; -+ struct inet6_ifaddr *ifa; -+}; -+ -+static void dad_callback(unsigned long arg); -+static int inet6_addr_event(struct notifier_block *this, -+ unsigned long event, void *ptr); -+ -+static int ipv6_is_in_dad_state(const struct inet6_ifaddr *ifa) -+{ -+ return (ifa->flags & IFA_F_TENTATIVE) && -+ ifa->state == INET6_IFADDR_STATE_DAD; -+} -+ -+static void dad_init_timer(struct mptcp_dad_data *data, -+ struct inet6_ifaddr *ifa) -+{ -+ data->ifa = ifa; -+ data->timer.data = (unsigned long)data; -+ data->timer.function = dad_callback; -+ if (ifa->idev->cnf.rtr_solicit_delay) -+ data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay; -+ else -+ data->timer.expires = jiffies + (HZ/10); -+} -+ -+static void dad_callback(unsigned long arg) -+{ -+ struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg; -+ -+ if (ipv6_is_in_dad_state(data->ifa)) { -+ dad_init_timer(data, data->ifa); -+ add_timer(&data->timer); -+ } else { -+ inet6_addr_event(NULL, NETDEV_UP, data->ifa); -+ in6_ifa_put(data->ifa); -+ kfree(data); -+ } -+} -+ -+static inline void dad_setup_timer(struct inet6_ifaddr *ifa) -+{ -+ struct mptcp_dad_data *data; -+ -+ data = kmalloc(sizeof(*data), GFP_ATOMIC); -+ -+ if (!data) -+ return; -+ -+ init_timer(&data->timer); -+ dad_init_timer(data, ifa); -+ add_timer(&data->timer); -+ in6_ifa_hold(ifa); -+} -+ -+static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event, -+ struct net *net) -+{ -+ const struct net_device *netdev = ifa->idev->dev; -+ int addr_type = ipv6_addr_type(&ifa->addr); -+ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); -+ struct mptcp_addr_event mpevent; -+ -+ if (ifa->scope > RT_SCOPE_LINK || -+ addr_type == IPV6_ADDR_ANY || -+ (addr_type & IPV6_ADDR_LOOPBACK) || -+ (addr_type & IPV6_ADDR_LINKLOCAL)) -+ return; -+ -+ spin_lock_bh(&fm_ns->local_lock); -+ -+ mpevent.family = AF_INET6; -+ mpevent.addr.in6 = ifa->addr; -+ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0; -+ -+ if (event == NETDEV_DOWN || !netif_running(netdev) || -+ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP)) -+ mpevent.code = MPTCP_EVENT_DEL; -+ else if (event == NETDEV_UP) -+ mpevent.code = MPTCP_EVENT_ADD; -+ else if (event == NETDEV_CHANGE) -+ mpevent.code = MPTCP_EVENT_MOD; -+ -+ mptcp_debug("%s created event for %pI6, code %u prio %u\n", __func__, -+ &ifa->addr, mpevent.code, mpevent.low_prio); -+ add_pm_event(net, &mpevent); -+ -+ spin_unlock_bh(&fm_ns->local_lock); -+ return; -+} -+ -+/* React on IPv6-addr add/rem-events */ -+static int inet6_addr_event(struct notifier_block *this, unsigned long event, -+ void *ptr) -+{ -+ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr; -+ struct net *net = dev_net(ifa6->idev->dev); -+ -+ if (!(event == NETDEV_UP || event == NETDEV_DOWN || -+ event == NETDEV_CHANGE)) -+ return NOTIFY_DONE; -+ -+ if (ipv6_is_in_dad_state(ifa6)) -+ dad_setup_timer(ifa6); -+ else -+ addr6_event_handler(ifa6, event, net); -+ -+ return NOTIFY_DONE; -+} -+ -+static struct notifier_block inet6_addr_notifier = { -+ .notifier_call = inet6_addr_event, -+}; -+ -+#endif -+ -+/* React on ifup/down-events */ -+static int netdev_event(struct notifier_block *this, unsigned long event, -+ void *ptr) -+{ -+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr); -+ struct in_device *in_dev; -+#if IS_ENABLED(CONFIG_IPV6) -+ struct inet6_dev *in6_dev; -+#endif -+ -+ if (!(event == NETDEV_UP || event == NETDEV_DOWN || -+ event == NETDEV_CHANGE)) -+ return NOTIFY_DONE; -+ -+ rcu_read_lock(); -+ in_dev = __in_dev_get_rtnl(dev); -+ -+ if (in_dev) { -+ for_ifa(in_dev) { -+ mptcp_pm_inetaddr_event(NULL, event, ifa); -+ } endfor_ifa(in_dev); -+ } -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ in6_dev = __in6_dev_get(dev); -+ -+ if (in6_dev) { -+ struct inet6_ifaddr *ifa6; -+ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) -+ inet6_addr_event(NULL, event, ifa6); -+ } -+#endif -+ -+ rcu_read_unlock(); -+ return NOTIFY_DONE; -+} -+ -+static struct notifier_block mptcp_pm_netdev_notifier = { -+ .notifier_call = netdev_event, -+}; -+ -+static void full_mesh_add_raddr(struct mptcp_cb *mpcb, -+ const union inet_addr *addr, -+ sa_family_t family, __be16 port, u8 id) -+{ -+ if (family == AF_INET) -+ mptcp_addv4_raddr(mpcb, &addr->in, port, id); -+ else -+ mptcp_addv6_raddr(mpcb, &addr->in6, port, id); -+} -+ -+static void full_mesh_new_session(const struct sock *meta_sk) -+{ -+ struct mptcp_loc_addr *mptcp_local; -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); -+ int i, index; -+ union inet_addr saddr, daddr; -+ sa_family_t family; -+ bool meta_v4 = meta_sk->sk_family == AF_INET; -+ -+ /* Init local variables necessary for the rest */ -+ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) { -+ saddr.ip = inet_sk(meta_sk)->inet_saddr; -+ daddr.ip = inet_sk(meta_sk)->inet_daddr; -+ family = AF_INET; -+#if IS_ENABLED(CONFIG_IPV6) -+ } else { -+ saddr.in6 = inet6_sk(meta_sk)->saddr; -+ daddr.in6 = meta_sk->sk_v6_daddr; -+ family = AF_INET6; -+#endif -+ } -+ -+ rcu_read_lock(); -+ mptcp_local = rcu_dereference(fm_ns->local); -+ -+ index = mptcp_find_address(mptcp_local, family, &saddr); -+ if (index < 0) -+ goto fallback; -+ -+ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0); -+ mptcp_set_init_addr_bit(mpcb, &daddr, family, index); -+ -+ /* Initialize workqueue-struct */ -+ INIT_WORK(&fmp->subflow_work, create_subflow_worker); -+ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker); -+ fmp->mpcb = mpcb; -+ -+ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only) -+ goto skip_ipv4; -+ -+ /* Look for the address among the local addresses */ -+ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { -+ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr; -+ -+ /* We do not need to announce the initial subflow's address again */ -+ if (family == AF_INET && saddr.ip == ifa_address) -+ continue; -+ -+ fmp->add_addr++; -+ mpcb->addr_signal = 1; -+ } -+ -+skip_ipv4: -+#if IS_ENABLED(CONFIG_IPV6) -+ /* skip IPv6 addresses if meta-socket is IPv4 */ -+ if (meta_v4) -+ goto skip_ipv6; -+ -+ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { -+ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr; -+ -+ /* We do not need to announce the initial subflow's address again */ -+ if (family == AF_INET6 && ipv6_addr_equal(&saddr.in6, ifa6)) -+ continue; -+ -+ fmp->add_addr++; -+ mpcb->addr_signal = 1; -+ } -+ -+skip_ipv6: -+#endif -+ -+ rcu_read_unlock(); -+ -+ if (family == AF_INET) -+ fmp->announced_addrs_v4 |= (1 << index); -+ else -+ fmp->announced_addrs_v6 |= (1 << index); -+ -+ for (i = fmp->add_addr; i && fmp->add_addr; i--) -+ tcp_send_ack(mpcb->master_sk); -+ -+ return; -+ -+fallback: -+ rcu_read_unlock(); -+ mptcp_fallback_default(mpcb); -+ return; -+} -+ -+static void full_mesh_create_subflows(struct sock *meta_sk) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ -+ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv || -+ mpcb->send_infinite_mapping || -+ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) -+ return; -+ -+ if (mpcb->master_sk && -+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) -+ return; -+ -+ if (!work_pending(&fmp->subflow_work)) { -+ sock_hold(meta_sk); -+ queue_work(mptcp_wq, &fmp->subflow_work); -+ } -+} -+ -+/* Called upon release_sock, if the socket was owned by the user during -+ * a path-management event. -+ */ -+static void full_mesh_release_sock(struct sock *meta_sk) -+{ -+ struct mptcp_loc_addr *mptcp_local; -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); -+ struct sock *sk, *tmpsk; -+ bool meta_v4 = meta_sk->sk_family == AF_INET; -+ int i; -+ -+ rcu_read_lock(); -+ mptcp_local = rcu_dereference(fm_ns->local); -+ -+ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only) -+ goto skip_ipv4; -+ -+ /* First, detect modifications or additions */ -+ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { -+ struct in_addr ifa = mptcp_local->locaddr4[i].addr; -+ bool found = false; -+ -+ mptcp_for_each_sk(mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ if (sk->sk_family == AF_INET6 && -+ !mptcp_v6_is_v4_mapped(sk)) -+ continue; -+ -+ if (inet_sk(sk)->inet_saddr != ifa.s_addr) -+ continue; -+ -+ found = true; -+ -+ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) { -+ tp->mptcp->send_mp_prio = 1; -+ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio; -+ -+ tcp_send_ack(sk); -+ } -+ } -+ -+ if (!found) { -+ fmp->add_addr++; -+ mpcb->addr_signal = 1; -+ -+ sk = mptcp_select_ack_sock(meta_sk); -+ if (sk) -+ tcp_send_ack(sk); -+ full_mesh_create_subflows(meta_sk); -+ } -+ } -+ -+skip_ipv4: -+#if IS_ENABLED(CONFIG_IPV6) -+ /* skip IPv6 addresses if meta-socket is IPv4 */ -+ if (meta_v4) -+ goto removal; -+ -+ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { -+ struct in6_addr ifa = mptcp_local->locaddr6[i].addr; -+ bool found = false; -+ -+ mptcp_for_each_sk(mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ if (sk->sk_family == AF_INET || -+ mptcp_v6_is_v4_mapped(sk)) -+ continue; -+ -+ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa)) -+ continue; -+ -+ found = true; -+ -+ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) { -+ tp->mptcp->send_mp_prio = 1; -+ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio; -+ -+ tcp_send_ack(sk); -+ } -+ } -+ -+ if (!found) { -+ fmp->add_addr++; -+ mpcb->addr_signal = 1; -+ -+ sk = mptcp_select_ack_sock(meta_sk); -+ if (sk) -+ tcp_send_ack(sk); -+ full_mesh_create_subflows(meta_sk); -+ } -+ } -+ -+removal: -+#endif -+ -+ /* Now, detect address-removals */ -+ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { -+ bool shall_remove = true; -+ -+ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { -+ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { -+ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) { -+ shall_remove = false; -+ break; -+ } -+ } -+ } else { -+ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { -+ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) { -+ shall_remove = false; -+ break; -+ } -+ } -+ } -+ -+ if (shall_remove) { -+ /* Reinject, so that pf = 1 and so we -+ * won't select this one as the -+ * ack-sock. -+ */ -+ mptcp_reinject_data(sk, 0); -+ -+ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, -+ meta_sk); -+ -+ mptcp_sub_force_close(sk); -+ } -+ } -+ -+ /* Just call it optimistically. It actually cannot do any harm */ -+ update_addr_bitfields(meta_sk, mptcp_local); -+ -+ rcu_read_unlock(); -+} -+ -+static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr, -+ struct net *net, bool *low_prio) -+{ -+ struct mptcp_loc_addr *mptcp_local; -+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net); -+ int index, id = -1; -+ -+ /* Handle the backup-flows */ -+ rcu_read_lock(); -+ mptcp_local = rcu_dereference(fm_ns->local); -+ -+ index = mptcp_find_address(mptcp_local, family, addr); -+ -+ if (index != -1) { -+ if (family == AF_INET) { -+ id = mptcp_local->locaddr4[index].loc4_id; -+ *low_prio = mptcp_local->locaddr4[index].low_prio; -+ } else { -+ id = mptcp_local->locaddr6[index].loc6_id; -+ *low_prio = mptcp_local->locaddr6[index].low_prio; -+ } -+ } -+ -+ -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static void full_mesh_addr_signal(struct sock *sk, unsigned *size, -+ struct tcp_out_options *opts, -+ struct sk_buff *skb) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ struct mptcp_cb *mpcb = tp->mpcb; -+ struct sock *meta_sk = mpcb->meta_sk; -+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); -+ struct mptcp_loc_addr *mptcp_local; -+ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk)); -+ int remove_addr_len; -+ u8 unannouncedv4 = 0, unannouncedv6 = 0; -+ bool meta_v4 = meta_sk->sk_family == AF_INET; -+ -+ mpcb->addr_signal = 0; -+ -+ if (likely(!fmp->add_addr)) -+ goto remove_addr; -+ -+ rcu_read_lock(); -+ mptcp_local = rcu_dereference(fm_ns->local); -+ -+ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only) -+ goto skip_ipv4; -+ -+ /* IPv4 */ -+ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits; -+ if (unannouncedv4 && -+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) { -+ int ind = mptcp_find_free_index(~unannouncedv4); -+ -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_ADD_ADDR; -+ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id; -+ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr; -+ opts->add_addr_v4 = 1; -+ -+ if (skb) { -+ fmp->announced_addrs_v4 |= (1 << ind); -+ fmp->add_addr--; -+ } -+ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; -+ } -+ -+ if (meta_v4) -+ goto skip_ipv6; -+ -+skip_ipv4: -+ /* IPv6 */ -+ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits; -+ if (unannouncedv6 && -+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) { -+ int ind = mptcp_find_free_index(~unannouncedv6); -+ -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_ADD_ADDR; -+ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id; -+ opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr; -+ opts->add_addr_v6 = 1; -+ -+ if (skb) { -+ fmp->announced_addrs_v6 |= (1 << ind); -+ fmp->add_addr--; -+ } -+ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; -+ } -+ -+skip_ipv6: -+ rcu_read_unlock(); -+ -+ if (!unannouncedv4 && !unannouncedv6 && skb) -+ fmp->add_addr--; -+ -+remove_addr: -+ if (likely(!fmp->remove_addrs)) -+ goto exit; -+ -+ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs); -+ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len) -+ goto exit; -+ -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_REMOVE_ADDR; -+ opts->remove_addrs = fmp->remove_addrs; -+ *size += remove_addr_len; -+ if (skb) -+ fmp->remove_addrs = 0; -+ -+exit: -+ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs); -+} -+ -+static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id) -+{ -+ mptcp_v4_rem_raddress(mpcb, rem_id); -+ mptcp_v6_rem_raddress(mpcb, rem_id); -+} -+ -+/* Output /proc/net/mptcp_fullmesh */ -+static int mptcp_fm_seq_show(struct seq_file *seq, void *v) -+{ -+ const struct net *net = seq->private; -+ struct mptcp_loc_addr *mptcp_local; -+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net); -+ int i; -+ -+ seq_printf(seq, "Index, Address-ID, Backup, IP-address\n"); -+ -+ rcu_read_lock_bh(); -+ mptcp_local = rcu_dereference(fm_ns->local); -+ -+ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index); -+ -+ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { -+ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i]; -+ -+ seq_printf(seq, "%u, %u, %u, %pI4\n", i, loc4->loc4_id, -+ loc4->low_prio, &loc4->addr); -+ } -+ -+ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index); -+ -+ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { -+ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i]; -+ -+ seq_printf(seq, "%u, %u, %u, %pI6\n", i, loc6->loc6_id, -+ loc6->low_prio, &loc6->addr); -+ } -+ rcu_read_unlock_bh(); -+ -+ return 0; -+} -+ -+static int mptcp_fm_seq_open(struct inode *inode, struct file *file) -+{ -+ return single_open_net(inode, file, mptcp_fm_seq_show); -+} -+ -+static const struct file_operations mptcp_fm_seq_fops = { -+ .owner = THIS_MODULE, -+ .open = mptcp_fm_seq_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = single_release_net, -+}; -+ -+static int mptcp_fm_init_net(struct net *net) -+{ -+ struct mptcp_loc_addr *mptcp_local; -+ struct mptcp_fm_ns *fm_ns; -+ int err = 0; -+ -+ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL); -+ if (!fm_ns) -+ return -ENOBUFS; -+ -+ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL); -+ if (!mptcp_local) { -+ err = -ENOBUFS; -+ goto err_mptcp_local; -+ } -+ -+ if (!proc_create("mptcp_fullmesh", S_IRUGO, net->proc_net, -+ &mptcp_fm_seq_fops)) { -+ err = -ENOMEM; -+ goto err_seq_fops; -+ } -+ -+ mptcp_local->next_v4_index = 1; -+ -+ rcu_assign_pointer(fm_ns->local, mptcp_local); -+ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker); -+ INIT_LIST_HEAD(&fm_ns->events); -+ spin_lock_init(&fm_ns->local_lock); -+ fm_ns->net = net; -+ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns; -+ -+ return 0; -+err_seq_fops: -+ kfree(mptcp_local); -+err_mptcp_local: -+ kfree(fm_ns); -+ return err; -+} -+ -+static void mptcp_fm_exit_net(struct net *net) -+{ -+ struct mptcp_addr_event *eventq, *tmp; -+ struct mptcp_fm_ns *fm_ns; -+ struct mptcp_loc_addr *mptcp_local; -+ -+ fm_ns = fm_get_ns(net); -+ cancel_delayed_work_sync(&fm_ns->address_worker); -+ -+ rcu_read_lock_bh(); -+ -+ mptcp_local = rcu_dereference_bh(fm_ns->local); -+ kfree(mptcp_local); -+ -+ spin_lock(&fm_ns->local_lock); -+ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) { -+ list_del(&eventq->list); -+ kfree(eventq); -+ } -+ spin_unlock(&fm_ns->local_lock); -+ -+ rcu_read_unlock_bh(); -+ -+ remove_proc_entry("mptcp_fullmesh", net->proc_net); -+ -+ kfree(fm_ns); -+} -+ -+static struct pernet_operations full_mesh_net_ops = { -+ .init = mptcp_fm_init_net, -+ .exit = mptcp_fm_exit_net, -+}; -+ -+static struct mptcp_pm_ops full_mesh __read_mostly = { -+ .new_session = full_mesh_new_session, -+ .release_sock = full_mesh_release_sock, -+ .fully_established = full_mesh_create_subflows, -+ .new_remote_address = full_mesh_create_subflows, -+ .get_local_id = full_mesh_get_local_id, -+ .addr_signal = full_mesh_addr_signal, -+ .add_raddr = full_mesh_add_raddr, -+ .rem_raddr = full_mesh_rem_raddr, -+ .name = "fullmesh", -+ .owner = THIS_MODULE, -+}; -+ -+/* General initialization of MPTCP_PM */ -+static int __init full_mesh_register(void) -+{ -+ int ret; -+ -+ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE); -+ -+ ret = register_pernet_subsys(&full_mesh_net_ops); -+ if (ret) -+ goto out; -+ -+ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); -+ if (ret) -+ goto err_reg_inetaddr; -+ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier); -+ if (ret) -+ goto err_reg_netdev; -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ ret = register_inet6addr_notifier(&inet6_addr_notifier); -+ if (ret) -+ goto err_reg_inet6addr; -+#endif -+ -+ ret = mptcp_register_path_manager(&full_mesh); -+ if (ret) -+ goto err_reg_pm; -+ -+out: -+ return ret; -+ -+ -+err_reg_pm: -+#if IS_ENABLED(CONFIG_IPV6) -+ unregister_inet6addr_notifier(&inet6_addr_notifier); -+err_reg_inet6addr: -+#endif -+ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); -+err_reg_netdev: -+ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); -+err_reg_inetaddr: -+ unregister_pernet_subsys(&full_mesh_net_ops); -+ goto out; -+} -+ -+static void full_mesh_unregister(void) -+{ -+#if IS_ENABLED(CONFIG_IPV6) -+ unregister_inet6addr_notifier(&inet6_addr_notifier); -+#endif -+ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); -+ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); -+ unregister_pernet_subsys(&full_mesh_net_ops); -+ mptcp_unregister_path_manager(&full_mesh); -+} -+ -+module_init(full_mesh_register); -+module_exit(full_mesh_unregister); -+ -+MODULE_AUTHOR("Christoph Paasch"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("Full-Mesh MPTCP"); -+MODULE_VERSION("0.88"); -diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c -new file mode 100644 -index 000000000000..43704ccb639e ---- /dev/null -+++ b/net/mptcp/mptcp_input.c -@@ -0,0 +1,2405 @@ -+/* -+ * MPTCP implementation - Sending side -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+ -+#include -+ -+/* is seq1 < seq2 ? */ -+static inline bool before64(const u64 seq1, const u64 seq2) -+{ -+ return (s64)(seq1 - seq2) < 0; -+} -+ -+/* is seq1 > seq2 ? */ -+#define after64(seq1, seq2) before64(seq2, seq1) -+ -+static inline void mptcp_become_fully_estab(struct sock *sk) -+{ -+ tcp_sk(sk)->mptcp->fully_established = 1; -+ -+ if (is_master_tp(tcp_sk(sk)) && -+ tcp_sk(sk)->mpcb->pm_ops->fully_established) -+ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk)); -+} -+ -+/* Similar to tcp_tso_acked without any memory accounting */ -+static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk, -+ struct sk_buff *skb) -+{ -+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ u32 packets_acked, len; -+ -+ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)); -+ -+ packets_acked = tcp_skb_pcount(skb); -+ -+ if (skb_unclone(skb, GFP_ATOMIC)) -+ return 0; -+ -+ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq; -+ __pskb_trim_head(skb, len); -+ -+ TCP_SKB_CB(skb)->seq += len; -+ skb->ip_summed = CHECKSUM_PARTIAL; -+ skb->truesize -= len; -+ -+ /* Any change of skb->len requires recalculation of tso factor. */ -+ if (tcp_skb_pcount(skb) > 1) -+ tcp_set_skb_tso_segs(meta_sk, skb, tcp_skb_mss(skb)); -+ packets_acked -= tcp_skb_pcount(skb); -+ -+ if (packets_acked) { -+ BUG_ON(tcp_skb_pcount(skb) == 0); -+ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); -+ } -+ -+ return packets_acked; -+} -+ -+/** -+ * Cleans the meta-socket retransmission queue and the reinject-queue. -+ * @sk must be the metasocket. -+ */ -+static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una) -+{ -+ struct sk_buff *skb, *tmp; -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ bool acked = false; -+ u32 acked_pcount; -+ -+ while ((skb = tcp_write_queue_head(meta_sk)) && -+ skb != tcp_send_head(meta_sk)) { -+ bool fully_acked = true; -+ -+ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { -+ if (tcp_skb_pcount(skb) == 1 || -+ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) -+ break; -+ -+ acked_pcount = tcp_tso_acked(meta_sk, skb); -+ if (!acked_pcount) -+ break; -+ -+ fully_acked = false; -+ } else { -+ acked_pcount = tcp_skb_pcount(skb); -+ } -+ -+ acked = true; -+ meta_tp->packets_out -= acked_pcount; -+ meta_tp->retrans_stamp = 0; -+ -+ if (!fully_acked) -+ break; -+ -+ tcp_unlink_write_queue(skb, meta_sk); -+ -+ if (mptcp_is_data_fin(skb)) { -+ struct sock *sk_it; -+ -+ /* DATA_FIN has been acknowledged - now we can close -+ * the subflows -+ */ -+ mptcp_for_each_sk(mpcb, sk_it) { -+ unsigned long delay = 0; -+ -+ /* If we are the passive closer, don't trigger -+ * subflow-fin until the subflow has been finned -+ * by the peer - thus we add a delay. -+ */ -+ if (mpcb->passive_close && -+ sk_it->sk_state == TCP_ESTABLISHED) -+ delay = inet_csk(sk_it)->icsk_rto << 3; -+ -+ mptcp_sub_close(sk_it, delay); -+ } -+ } -+ sk_wmem_free_skb(meta_sk, skb); -+ } -+ /* Remove acknowledged data from the reinject queue */ -+ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) { -+ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { -+ if (tcp_skb_pcount(skb) == 1 || -+ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) -+ break; -+ -+ mptcp_tso_acked_reinject(meta_sk, skb); -+ break; -+ } -+ -+ __skb_unlink(skb, &mpcb->reinject_queue); -+ __kfree_skb(skb); -+ } -+ -+ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una))) -+ meta_tp->snd_up = meta_tp->snd_una; -+ -+ if (acked) { -+ tcp_rearm_rto(meta_sk); -+ /* Normally this is done in tcp_try_undo_loss - but MPTCP -+ * does not call this function. -+ */ -+ inet_csk(meta_sk)->icsk_retransmits = 0; -+ } -+} -+ -+/* Inspired by tcp_rcv_state_process */ -+static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk, -+ const struct sk_buff *skb, u32 data_seq, -+ u16 data_len) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); -+ const struct tcphdr *th = tcp_hdr(skb); -+ -+ /* State-machine handling if FIN has been enqueued and he has -+ * been acked (snd_una == write_seq) - it's important that this -+ * here is after sk_wmem_free_skb because otherwise -+ * sk_forward_alloc is wrong upon inet_csk_destroy_sock() -+ */ -+ switch (meta_sk->sk_state) { -+ case TCP_FIN_WAIT1: { -+ struct dst_entry *dst; -+ int tmo; -+ -+ if (meta_tp->snd_una != meta_tp->write_seq) -+ break; -+ -+ tcp_set_state(meta_sk, TCP_FIN_WAIT2); -+ meta_sk->sk_shutdown |= SEND_SHUTDOWN; -+ -+ dst = __sk_dst_get(sk); -+ if (dst) -+ dst_confirm(dst); -+ -+ if (!sock_flag(meta_sk, SOCK_DEAD)) { -+ /* Wake up lingering close() */ -+ meta_sk->sk_state_change(meta_sk); -+ break; -+ } -+ -+ if (meta_tp->linger2 < 0 || -+ (data_len && -+ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0), -+ meta_tp->rcv_nxt))) { -+ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); -+ tcp_done(meta_sk); -+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); -+ return 1; -+ } -+ -+ tmo = tcp_fin_time(meta_sk); -+ if (tmo > TCP_TIMEWAIT_LEN) { -+ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN); -+ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) { -+ /* Bad case. We could lose such FIN otherwise. -+ * It is not a big problem, but it looks confusing -+ * and not so rare event. We still can lose it now, -+ * if it spins in bh_lock_sock(), but it is really -+ * marginal case. -+ */ -+ inet_csk_reset_keepalive_timer(meta_sk, tmo); -+ } else { -+ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo); -+ } -+ break; -+ } -+ case TCP_CLOSING: -+ case TCP_LAST_ACK: -+ if (meta_tp->snd_una == meta_tp->write_seq) { -+ tcp_done(meta_sk); -+ return 1; -+ } -+ break; -+ } -+ -+ /* step 7: process the segment text */ -+ switch (meta_sk->sk_state) { -+ case TCP_FIN_WAIT1: -+ case TCP_FIN_WAIT2: -+ /* RFC 793 says to queue data in these states, -+ * RFC 1122 says we MUST send a reset. -+ * BSD 4.4 also does reset. -+ */ -+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { -+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && -+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && -+ !mptcp_is_data_fin2(skb, tp)) { -+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); -+ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); -+ tcp_reset(meta_sk); -+ return 1; -+ } -+ } -+ break; -+ } -+ -+ return 0; -+} -+ -+/** -+ * @return: -+ * i) 1: Everything's fine. -+ * ii) -1: A reset has been sent on the subflow - csum-failure -+ * iii) 0: csum-failure but no reset sent, because it's the last subflow. -+ * Last packet should not be destroyed by the caller because it has -+ * been done here. -+ */ -+static int mptcp_verif_dss_csum(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sk_buff *tmp, *tmp1, *last = NULL; -+ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */ -+ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0; -+ int iter = 0; -+ -+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) { -+ unsigned int csum_len; -+ -+ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq)) -+ /* Mapping ends in the middle of the packet - -+ * csum only these bytes -+ */ -+ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq; -+ else -+ csum_len = tmp->len; -+ -+ offset = 0; -+ if (overflowed) { -+ char first_word[4]; -+ first_word[0] = 0; -+ first_word[1] = 0; -+ first_word[2] = 0; -+ first_word[3] = *(tmp->data); -+ csum_tcp = csum_partial(first_word, 4, csum_tcp); -+ offset = 1; -+ csum_len--; -+ overflowed = 0; -+ } -+ -+ csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp); -+ -+ /* Was it on an odd-length? Then we have to merge the next byte -+ * correctly (see above) -+ */ -+ if (csum_len != (csum_len & (~1))) -+ overflowed = 1; -+ -+ if (mptcp_is_data_seq(tmp) && !dss_csum_added) { -+ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32)); -+ -+ /* If a 64-bit dss is present, we increase the offset -+ * by 4 bytes, as the high-order 64-bits will be added -+ * in the final csum_partial-call. -+ */ -+ u32 offset = skb_transport_offset(tmp) + -+ TCP_SKB_CB(tmp)->dss_off; -+ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET) -+ offset += 4; -+ -+ csum_tcp = skb_checksum(tmp, offset, -+ MPTCP_SUB_LEN_SEQ_CSUM, -+ csum_tcp); -+ -+ csum_tcp = csum_partial(&data_seq, -+ sizeof(data_seq), csum_tcp); -+ -+ dss_csum_added = 1; /* Just do it once */ -+ } -+ last = tmp; -+ iter++; -+ -+ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) && -+ !before(TCP_SKB_CB(tmp1)->seq, -+ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) -+ break; -+ } -+ -+ /* Now, checksum must be 0 */ -+ if (unlikely(csum_fold(csum_tcp))) { -+ pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n", -+ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq, -+ dss_csum_added, overflowed, iter); -+ -+ tp->mptcp->send_mp_fail = 1; -+ -+ /* map_data_seq is the data-seq number of the -+ * mapping we are currently checking -+ */ -+ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq; -+ -+ if (tp->mpcb->cnt_subflows > 1) { -+ mptcp_send_reset(sk); -+ ans = -1; -+ } else { -+ tp->mpcb->send_infinite_mapping = 1; -+ -+ /* Need to purge the rcv-queue as it's no more valid */ -+ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { -+ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq; -+ kfree_skb(tmp); -+ } -+ -+ ans = 0; -+ } -+ } -+ -+ return ans; -+} -+ -+static inline void mptcp_prepare_skb(struct sk_buff *skb, -+ const struct sock *sk) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); -+ u32 inc = 0; -+ -+ /* If skb is the end of this mapping (end is always at mapping-boundary -+ * thanks to the splitting/trimming), then we need to increase -+ * data-end-seq by 1 if this here is a data-fin. -+ * -+ * We need to do -1 because end_seq includes the subflow-FIN. -+ */ -+ if (tp->mptcp->map_data_fin && -+ (tcb->end_seq - (tcp_hdr(skb)->fin ? 1 : 0)) == -+ (tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { -+ inc = 1; -+ -+ /* We manually set the fin-flag if it is a data-fin. For easy -+ * processing in tcp_recvmsg. -+ */ -+ tcp_hdr(skb)->fin = 1; -+ } else { -+ /* We may have a subflow-fin with data but without data-fin */ -+ tcp_hdr(skb)->fin = 0; -+ } -+ -+ /* Adapt data-seq's to the packet itself. We kinda transform the -+ * dss-mapping to a per-packet granularity. This is necessary to -+ * correctly handle overlapping mappings coming from different -+ * subflows. Otherwise it would be a complete mess. -+ */ -+ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq; -+ tcb->end_seq = tcb->seq + skb->len + inc; -+} -+ -+/** -+ * @return: 1 if the segment has been eaten and can be suppressed, -+ * otherwise 0. -+ */ -+static inline int mptcp_direct_copy(const struct sk_buff *skb, -+ struct sock *meta_sk) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len); -+ int eaten = 0; -+ -+ __set_current_state(TASK_RUNNING); -+ -+ local_bh_enable(); -+ if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) { -+ meta_tp->ucopy.len -= chunk; -+ meta_tp->copied_seq += chunk; -+ eaten = (chunk == skb->len); -+ tcp_rcv_space_adjust(meta_sk); -+ } -+ local_bh_disable(); -+ return eaten; -+} -+ -+static inline void mptcp_reset_mapping(struct tcp_sock *tp) -+{ -+ tp->mptcp->map_data_len = 0; -+ tp->mptcp->map_data_seq = 0; -+ tp->mptcp->map_subseq = 0; -+ tp->mptcp->map_data_fin = 0; -+ tp->mptcp->mapping_present = 0; -+} -+ -+/* The DSS-mapping received on the sk only covers the second half of the skb -+ * (cut at seq). We trim the head from the skb. -+ * Data will be freed upon kfree(). -+ * -+ * Inspired by tcp_trim_head(). -+ */ -+static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq) -+{ -+ int len = seq - TCP_SKB_CB(skb)->seq; -+ u32 new_seq = TCP_SKB_CB(skb)->seq + len; -+ -+ if (len < skb_headlen(skb)) -+ __skb_pull(skb, len); -+ else -+ __pskb_trim_head(skb, len - skb_headlen(skb)); -+ -+ TCP_SKB_CB(skb)->seq = new_seq; -+ -+ skb->truesize -= len; -+ atomic_sub(len, &sk->sk_rmem_alloc); -+ sk_mem_uncharge(sk, len); -+} -+ -+/* The DSS-mapping received on the sk only covers the first half of the skb -+ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue -+ * as further packets may resolve the mapping of the second half of data. -+ * -+ * Inspired by tcp_fragment(). -+ */ -+static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq) -+{ -+ struct sk_buff *buff; -+ int nsize; -+ int nlen, len; -+ -+ len = seq - TCP_SKB_CB(skb)->seq; -+ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len; -+ if (nsize < 0) -+ nsize = 0; -+ -+ /* Get a new skb... force flag on. */ -+ buff = alloc_skb(nsize, GFP_ATOMIC); -+ if (buff == NULL) -+ return -ENOMEM; -+ -+ skb_reserve(buff, tcp_sk(sk)->tcp_header_len); -+ skb_reset_transport_header(buff); -+ -+ tcp_hdr(buff)->fin = tcp_hdr(skb)->fin; -+ tcp_hdr(skb)->fin = 0; -+ -+ /* We absolutly need to call skb_set_owner_r before refreshing the -+ * truesize of buff, otherwise the moved data will account twice. -+ */ -+ skb_set_owner_r(buff, sk); -+ nlen = skb->len - len - nsize; -+ buff->truesize += nlen; -+ skb->truesize -= nlen; -+ -+ /* Correct the sequence numbers. */ -+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; -+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; -+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; -+ -+ skb_split(skb, buff, len); -+ -+ __skb_queue_after(&sk->sk_receive_queue, skb, buff); -+ -+ return 0; -+} -+ -+/* @return: 0 everything is fine. Just continue processing -+ * 1 subflow is broken stop everything -+ * -1 this packet was broken - continue with the next one. -+ */ -+static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */ -+ if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) && -+ !tp->mpcb->infinite_mapping_rcv) { -+ /* Remove a pure subflow-fin from the queue and increase -+ * copied_seq. -+ */ -+ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; -+ __skb_unlink(skb, &sk->sk_receive_queue); -+ __kfree_skb(skb); -+ return -1; -+ } -+ -+ /* If we are not yet fully established and do not know the mapping for -+ * this segment, this path has to fallback to infinite or be torn down. -+ */ -+ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) && -+ !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) { -+ pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n", -+ __func__, tp->mpcb->mptcp_loc_token, -+ tp->mptcp->path_index, __builtin_return_address(0), -+ TCP_SKB_CB(skb)->seq); -+ -+ if (!is_master_tp(tp)) { -+ mptcp_send_reset(sk); -+ return 1; -+ } -+ -+ tp->mpcb->infinite_mapping_snd = 1; -+ tp->mpcb->infinite_mapping_rcv = 1; -+ /* We do a seamless fallback and should not send a inf.mapping. */ -+ tp->mpcb->send_infinite_mapping = 0; -+ tp->mptcp->fully_established = 1; -+ } -+ -+ /* Receiver-side becomes fully established when a whole rcv-window has -+ * been received without the need to fallback due to the previous -+ * condition. -+ */ -+ if (!tp->mptcp->fully_established) { -+ tp->mptcp->init_rcv_wnd -= skb->len; -+ if (tp->mptcp->init_rcv_wnd < 0) -+ mptcp_become_fully_estab(sk); -+ } -+ -+ return 0; -+} -+ -+/* @return: 0 everything is fine. Just continue processing -+ * 1 subflow is broken stop everything -+ * -1 this packet was broken - continue with the next one. -+ */ -+static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); -+ struct mptcp_cb *mpcb = tp->mpcb; -+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); -+ u32 *ptr; -+ u32 data_seq, sub_seq, data_len, tcp_end_seq; -+ -+ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be -+ * in-order at the data-level. Thus data-seq-numbers can be inferred -+ * from what is expected at the data-level. -+ */ -+ if (mpcb->infinite_mapping_rcv) { -+ tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp); -+ tp->mptcp->map_subseq = tcb->seq; -+ tp->mptcp->map_data_len = skb->len; -+ tp->mptcp->map_data_fin = tcp_hdr(skb)->fin; -+ tp->mptcp->mapping_present = 1; -+ return 0; -+ } -+ -+ /* No mapping here? Exit - it is either already set or still on its way */ -+ if (!mptcp_is_data_seq(skb)) { -+ /* Too many packets without a mapping - this subflow is broken */ -+ if (!tp->mptcp->mapping_present && -+ tp->rcv_nxt - tp->copied_seq > 65536) { -+ mptcp_send_reset(sk); -+ return 1; -+ } -+ -+ return 0; -+ } -+ -+ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb); -+ ptr++; -+ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn; -+ ptr++; -+ data_len = get_unaligned_be16(ptr); -+ -+ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed. -+ * The draft sets it to 0, but we really would like to have the -+ * real value, to have an easy handling afterwards here in this -+ * function. -+ */ -+ if (mptcp_is_data_fin(skb) && skb->len == 0) -+ sub_seq = TCP_SKB_CB(skb)->seq; -+ -+ /* If there is already a mapping - we check if it maps with the current -+ * one. If not - we reset. -+ */ -+ if (tp->mptcp->mapping_present && -+ (data_seq != (u32)tp->mptcp->map_data_seq || -+ sub_seq != tp->mptcp->map_subseq || -+ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin || -+ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) { -+ /* Mapping in packet is different from what we want */ -+ pr_err("%s Mappings do not match!\n", __func__); -+ pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n", -+ __func__, data_seq, (u32)tp->mptcp->map_data_seq, -+ sub_seq, tp->mptcp->map_subseq, data_len, -+ tp->mptcp->map_data_len, mptcp_is_data_fin(skb), -+ tp->mptcp->map_data_fin); -+ mptcp_send_reset(sk); -+ return 1; -+ } -+ -+ /* If the previous check was good, the current mapping is valid and we exit. */ -+ if (tp->mptcp->mapping_present) -+ return 0; -+ -+ /* Mapping not yet set on this subflow - we set it here! */ -+ -+ if (!data_len) { -+ mpcb->infinite_mapping_rcv = 1; -+ tp->mptcp->fully_established = 1; -+ /* We need to repeat mp_fail's until the sender felt -+ * back to infinite-mapping - here we stop repeating it. -+ */ -+ tp->mptcp->send_mp_fail = 0; -+ -+ /* We have to fixup data_len - it must be the same as skb->len */ -+ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0); -+ sub_seq = tcb->seq; -+ -+ /* TODO kill all other subflows than this one */ -+ /* data_seq and so on are set correctly */ -+ -+ /* At this point, the meta-ofo-queue has to be emptied, -+ * as the following data is guaranteed to be in-order at -+ * the data and subflow-level -+ */ -+ mptcp_purge_ofo_queue(meta_tp); -+ } -+ -+ /* We are sending mp-fail's and thus are in fallback mode. -+ * Ignore packets which do not announce the fallback and still -+ * want to provide a mapping. -+ */ -+ if (tp->mptcp->send_mp_fail) { -+ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; -+ __skb_unlink(skb, &sk->sk_receive_queue); -+ __kfree_skb(skb); -+ return -1; -+ } -+ -+ /* FIN increased the mapping-length by 1 */ -+ if (mptcp_is_data_fin(skb)) -+ data_len--; -+ -+ /* Subflow-sequences of packet must be -+ * (at least partially) be part of the DSS-mapping's -+ * subflow-sequence-space. -+ * -+ * Basically the mapping is not valid, if either of the -+ * following conditions is true: -+ * -+ * 1. It's not a data_fin and -+ * MPTCP-sub_seq >= TCP-end_seq -+ * -+ * 2. It's a data_fin and TCP-end_seq > TCP-seq and -+ * MPTCP-sub_seq >= TCP-end_seq -+ * -+ * The previous two can be merged into: -+ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq -+ * Because if it's not a data-fin, TCP-end_seq > TCP-seq -+ * -+ * 3. It's a data_fin and skb->len == 0 and -+ * MPTCP-sub_seq > TCP-end_seq -+ * -+ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and -+ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq -+ * -+ * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq) -+ */ -+ -+ /* subflow-fin is not part of the mapping - ignore it here ! */ -+ tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin; -+ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) || -+ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) || -+ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) || -+ before(sub_seq, tp->copied_seq)) { -+ /* Subflow-sequences of packet is different from what is in the -+ * packet's dss-mapping. The peer is misbehaving - reset -+ */ -+ pr_err("%s Packet's mapping does not map to the DSS sub_seq %u " -+ "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u" -+ "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb), -+ skb->len, data_len, tp->copied_seq); -+ mptcp_send_reset(sk); -+ return 1; -+ } -+ -+ /* Does the DSS had 64-bit seqnum's ? */ -+ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) { -+ /* Wrapped around? */ -+ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) { -+ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq); -+ } else { -+ /* Else, access the default high-order bits */ -+ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq); -+ } -+ } else { -+ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq); -+ -+ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) { -+ /* We make sure that the data_seq is invalid. -+ * It will be dropped later. -+ */ -+ tp->mptcp->map_data_seq += 0xFFFFFFFF; -+ tp->mptcp->map_data_seq += 0xFFFFFFFF; -+ } -+ } -+ -+ tp->mptcp->map_data_len = data_len; -+ tp->mptcp->map_subseq = sub_seq; -+ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0; -+ tp->mptcp->mapping_present = 1; -+ -+ return 0; -+} -+ -+/* Similar to tcp_sequence(...) */ -+static inline bool mptcp_sequence(const struct tcp_sock *meta_tp, -+ u64 data_seq, u64 end_data_seq) -+{ -+ const struct mptcp_cb *mpcb = meta_tp->mpcb; -+ u64 rcv_wup64; -+ -+ /* Wrap-around? */ -+ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) { -+ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) | -+ meta_tp->rcv_wup; -+ } else { -+ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, -+ meta_tp->rcv_wup); -+ } -+ -+ return !before64(end_data_seq, rcv_wup64) && -+ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp)); -+} -+ -+/* @return: 0 everything is fine. Just continue processing -+ * -1 this packet was broken - continue with the next one. -+ */ -+static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sk_buff *tmp, *tmp1; -+ u32 tcp_end_seq; -+ -+ if (!tp->mptcp->mapping_present) -+ return 0; -+ -+ /* either, the new skb gave us the mapping and the first segment -+ * in the sub-rcv-queue has to be trimmed ... -+ */ -+ tmp = skb_peek(&sk->sk_receive_queue); -+ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) && -+ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq)) -+ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq); -+ -+ /* ... or the new skb (tail) has to be split at the end. */ -+ tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0); -+ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { -+ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len; -+ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */ -+ /* TODO : maybe handle this here better. -+ * We now just force meta-retransmission. -+ */ -+ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; -+ __skb_unlink(skb, &sk->sk_receive_queue); -+ __kfree_skb(skb); -+ return -1; -+ } -+ } -+ -+ /* Now, remove old sk_buff's from the receive-queue. -+ * This may happen if the mapping has been lost for these segments and -+ * the next mapping has already been received. -+ */ -+ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) { -+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { -+ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq)) -+ break; -+ -+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; -+ __skb_unlink(tmp1, &sk->sk_receive_queue); -+ -+ /* Impossible that we could free skb here, because his -+ * mapping is known to be valid from previous checks -+ */ -+ __kfree_skb(tmp1); -+ } -+ } -+ -+ return 0; -+} -+ -+/* @return: 0 everything is fine. Just continue processing -+ * 1 subflow is broken stop everything -+ * -1 this mapping has been put in the meta-receive-queue -+ * -2 this mapping has been eaten by the application -+ */ -+static int mptcp_queue_skb(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ struct mptcp_cb *mpcb = tp->mpcb; -+ struct sk_buff *tmp, *tmp1; -+ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp); -+ bool data_queued = false; -+ -+ /* Have we not yet received the full mapping? */ -+ if (!tp->mptcp->mapping_present || -+ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) -+ return 0; -+ -+ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq -+ * OR -+ * This mapping is out of window -+ */ -+ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) || -+ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq, -+ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) { -+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { -+ __skb_unlink(tmp1, &sk->sk_receive_queue); -+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; -+ __kfree_skb(tmp1); -+ -+ if (!skb_queue_empty(&sk->sk_receive_queue) && -+ !before(TCP_SKB_CB(tmp)->seq, -+ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) -+ break; -+ } -+ -+ mptcp_reset_mapping(tp); -+ -+ return -1; -+ } -+ -+ /* Record it, because we want to send our data_fin on the same path */ -+ if (tp->mptcp->map_data_fin) { -+ mpcb->dfin_path_index = tp->mptcp->path_index; -+ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN); -+ } -+ -+ /* Verify the checksum */ -+ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) { -+ int ret = mptcp_verif_dss_csum(sk); -+ -+ if (ret <= 0) { -+ mptcp_reset_mapping(tp); -+ return 1; -+ } -+ } -+ -+ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) { -+ /* Seg's have to go to the meta-ofo-queue */ -+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { -+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; -+ mptcp_prepare_skb(tmp1, sk); -+ __skb_unlink(tmp1, &sk->sk_receive_queue); -+ /* MUST be done here, because fragstolen may be true later. -+ * Then, kfree_skb_partial will not account the memory. -+ */ -+ skb_orphan(tmp1); -+ -+ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */ -+ mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk); -+ else -+ __kfree_skb(tmp1); -+ -+ if (!skb_queue_empty(&sk->sk_receive_queue) && -+ !before(TCP_SKB_CB(tmp)->seq, -+ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) -+ break; -+ } -+ tcp_enter_quickack_mode(sk); -+ } else { -+ /* Ready for the meta-rcv-queue */ -+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { -+ int eaten = 0; -+ const bool copied_early = false; -+ bool fragstolen = false; -+ u32 old_rcv_nxt = meta_tp->rcv_nxt; -+ -+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; -+ mptcp_prepare_skb(tmp1, sk); -+ __skb_unlink(tmp1, &sk->sk_receive_queue); -+ /* MUST be done here, because fragstolen may be true. -+ * Then, kfree_skb_partial will not account the memory. -+ */ -+ skb_orphan(tmp1); -+ -+ /* This segment has already been received */ -+ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) { -+ __kfree_skb(tmp1); -+ goto next; -+ } -+ -+#ifdef CONFIG_NET_DMA -+ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt && -+ meta_tp->ucopy.task == current && -+ meta_tp->copied_seq == meta_tp->rcv_nxt && -+ tmp1->len <= meta_tp->ucopy.len && -+ sock_owned_by_user(meta_sk) && -+ tcp_dma_try_early_copy(meta_sk, tmp1, 0)) { -+ copied_early = true; -+ eaten = 1; -+ } -+#endif -+ -+ /* Is direct copy possible ? */ -+ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt && -+ meta_tp->ucopy.task == current && -+ meta_tp->copied_seq == meta_tp->rcv_nxt && -+ meta_tp->ucopy.len && sock_owned_by_user(meta_sk) && -+ !copied_early) -+ eaten = mptcp_direct_copy(tmp1, meta_sk); -+ -+ if (mpcb->in_time_wait) /* In time-wait, do not receive data */ -+ eaten = 1; -+ -+ if (!eaten) -+ eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen); -+ -+ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq; -+ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); -+ -+#ifdef CONFIG_NET_DMA -+ if (copied_early) -+ meta_tp->cleanup_rbuf(meta_sk, tmp1->len); -+#endif -+ -+ if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait) -+ mptcp_fin(meta_sk); -+ -+ /* Check if this fills a gap in the ofo queue */ -+ if (!skb_queue_empty(&meta_tp->out_of_order_queue)) -+ mptcp_ofo_queue(meta_sk); -+ -+#ifdef CONFIG_NET_DMA -+ if (copied_early) -+ __skb_queue_tail(&meta_sk->sk_async_wait_queue, -+ tmp1); -+ else -+#endif -+ if (eaten) -+ kfree_skb_partial(tmp1, fragstolen); -+ -+ data_queued = true; -+next: -+ if (!skb_queue_empty(&sk->sk_receive_queue) && -+ !before(TCP_SKB_CB(tmp)->seq, -+ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) -+ break; -+ } -+ } -+ -+ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp; -+ mptcp_reset_mapping(tp); -+ -+ return data_queued ? -1 : -2; -+} -+ -+void mptcp_data_ready(struct sock *sk) -+{ -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ struct sk_buff *skb, *tmp; -+ int queued = 0; -+ -+ /* restart before the check, because mptcp_fin might have changed the -+ * state. -+ */ -+restart: -+ /* If the meta cannot receive data, there is no point in pushing data. -+ * If we are in time-wait, we may still be waiting for the final FIN. -+ * So, we should proceed with the processing. -+ */ -+ if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) { -+ skb_queue_purge(&sk->sk_receive_queue); -+ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt; -+ goto exit; -+ } -+ -+ /* Iterate over all segments, detect their mapping (if we don't have -+ * one yet), validate them and push everything one level higher. -+ */ -+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { -+ int ret; -+ /* Pre-validation - e.g., early fallback */ -+ ret = mptcp_prevalidate_skb(sk, skb); -+ if (ret < 0) -+ goto restart; -+ else if (ret > 0) -+ break; -+ -+ /* Set the current mapping */ -+ ret = mptcp_detect_mapping(sk, skb); -+ if (ret < 0) -+ goto restart; -+ else if (ret > 0) -+ break; -+ -+ /* Validation */ -+ if (mptcp_validate_mapping(sk, skb) < 0) -+ goto restart; -+ -+ /* Push a level higher */ -+ ret = mptcp_queue_skb(sk); -+ if (ret < 0) { -+ if (ret == -1) -+ queued = ret; -+ goto restart; -+ } else if (ret == 0) { -+ continue; -+ } else { /* ret == 1 */ -+ break; -+ } -+ } -+ -+exit: -+ if (tcp_sk(sk)->close_it) { -+ tcp_send_ack(sk); -+ tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0); -+ } -+ -+ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD)) -+ meta_sk->sk_data_ready(meta_sk); -+} -+ -+ -+int mptcp_check_req(struct sk_buff *skb, struct net *net) -+{ -+ const struct tcphdr *th = tcp_hdr(skb); -+ struct sock *meta_sk = NULL; -+ -+ /* MPTCP structures not initialized */ -+ if (mptcp_init_failed) -+ return 0; -+ -+ if (skb->protocol == htons(ETH_P_IP)) -+ meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr, -+ ip_hdr(skb)->daddr, net); -+#if IS_ENABLED(CONFIG_IPV6) -+ else /* IPv6 */ -+ meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr, -+ &ipv6_hdr(skb)->daddr, net); -+#endif /* CONFIG_IPV6 */ -+ -+ if (!meta_sk) -+ return 0; -+ -+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN; -+ -+ bh_lock_sock_nested(meta_sk); -+ if (sock_owned_by_user(meta_sk)) { -+ skb->sk = meta_sk; -+ if (unlikely(sk_add_backlog(meta_sk, skb, -+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { -+ bh_unlock_sock(meta_sk); -+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); -+ sock_put(meta_sk); /* Taken by mptcp_search_req */ -+ kfree_skb(skb); -+ return 1; -+ } -+ } else if (skb->protocol == htons(ETH_P_IP)) { -+ tcp_v4_do_rcv(meta_sk, skb); -+#if IS_ENABLED(CONFIG_IPV6) -+ } else { /* IPv6 */ -+ tcp_v6_do_rcv(meta_sk, skb); -+#endif /* CONFIG_IPV6 */ -+ } -+ bh_unlock_sock(meta_sk); -+ sock_put(meta_sk); /* Taken by mptcp_vX_search_req */ -+ return 1; -+} -+ -+struct mp_join *mptcp_find_join(const struct sk_buff *skb) -+{ -+ const struct tcphdr *th = tcp_hdr(skb); -+ unsigned char *ptr; -+ int length = (th->doff * 4) - sizeof(struct tcphdr); -+ -+ /* Jump through the options to check whether JOIN is there */ -+ ptr = (unsigned char *)(th + 1); -+ while (length > 0) { -+ int opcode = *ptr++; -+ int opsize; -+ -+ switch (opcode) { -+ case TCPOPT_EOL: -+ return NULL; -+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ -+ length--; -+ continue; -+ default: -+ opsize = *ptr++; -+ if (opsize < 2) /* "silly options" */ -+ return NULL; -+ if (opsize > length) -+ return NULL; /* don't parse partial options */ -+ if (opcode == TCPOPT_MPTCP && -+ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) { -+ return (struct mp_join *)(ptr - 2); -+ } -+ ptr += opsize - 2; -+ length -= opsize; -+ } -+ } -+ return NULL; -+} -+ -+int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw) -+{ -+ const struct mptcp_cb *mpcb; -+ struct sock *meta_sk; -+ u32 token; -+ bool meta_v4; -+ struct mp_join *join_opt = mptcp_find_join(skb); -+ if (!join_opt) -+ return 0; -+ -+ /* MPTCP structures were not initialized, so return error */ -+ if (mptcp_init_failed) -+ return -1; -+ -+ token = join_opt->u.syn.token; -+ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token); -+ if (!meta_sk) { -+ mptcp_debug("%s:mpcb not found:%x\n", __func__, token); -+ return -1; -+ } -+ -+ meta_v4 = meta_sk->sk_family == AF_INET; -+ if (meta_v4) { -+ if (skb->protocol == htons(ETH_P_IPV6)) { -+ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n"); -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ return -1; -+ } -+ } else if (skb->protocol == htons(ETH_P_IP) && -+ inet6_sk(meta_sk)->ipv6only) { -+ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n"); -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ return -1; -+ } -+ -+ mpcb = tcp_sk(meta_sk)->mpcb; -+ if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) { -+ /* We are in fallback-mode on the reception-side - -+ * no new subflows! -+ */ -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ return -1; -+ } -+ -+ /* Coming from time-wait-sock processing in tcp_v4_rcv. -+ * We have to deschedule it before continuing, because otherwise -+ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req. -+ */ -+ if (tw) { -+ inet_twsk_deschedule(tw, &tcp_death_row); -+ inet_twsk_put(tw); -+ } -+ -+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN; -+ /* OK, this is a new syn/join, let's create a new open request and -+ * send syn+ack -+ */ -+ bh_lock_sock_nested(meta_sk); -+ if (sock_owned_by_user(meta_sk)) { -+ skb->sk = meta_sk; -+ if (unlikely(sk_add_backlog(meta_sk, skb, -+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { -+ bh_unlock_sock(meta_sk); -+ NET_INC_STATS_BH(sock_net(meta_sk), -+ LINUX_MIB_TCPBACKLOGDROP); -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ kfree_skb(skb); -+ return 1; -+ } -+ } else if (skb->protocol == htons(ETH_P_IP)) { -+ tcp_v4_do_rcv(meta_sk, skb); -+#if IS_ENABLED(CONFIG_IPV6) -+ } else { -+ tcp_v6_do_rcv(meta_sk, skb); -+#endif /* CONFIG_IPV6 */ -+ } -+ bh_unlock_sock(meta_sk); -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ return 1; -+} -+ -+int mptcp_do_join_short(struct sk_buff *skb, -+ const struct mptcp_options_received *mopt, -+ struct net *net) -+{ -+ struct sock *meta_sk; -+ u32 token; -+ bool meta_v4; -+ -+ token = mopt->mptcp_rem_token; -+ meta_sk = mptcp_hash_find(net, token); -+ if (!meta_sk) { -+ mptcp_debug("%s:mpcb not found:%x\n", __func__, token); -+ return -1; -+ } -+ -+ meta_v4 = meta_sk->sk_family == AF_INET; -+ if (meta_v4) { -+ if (skb->protocol == htons(ETH_P_IPV6)) { -+ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n"); -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ return -1; -+ } -+ } else if (skb->protocol == htons(ETH_P_IP) && -+ inet6_sk(meta_sk)->ipv6only) { -+ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n"); -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ return -1; -+ } -+ -+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN; -+ -+ /* OK, this is a new syn/join, let's create a new open request and -+ * send syn+ack -+ */ -+ bh_lock_sock(meta_sk); -+ -+ /* This check is also done in mptcp_vX_do_rcv. But, there we cannot -+ * call tcp_vX_send_reset, because we hold already two socket-locks. -+ * (the listener and the meta from above) -+ * -+ * And the send-reset will try to take yet another one (ip_send_reply). -+ * Thus, we propagate the reset up to tcp_rcv_state_process. -+ */ -+ if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv || -+ tcp_sk(meta_sk)->mpcb->send_infinite_mapping || -+ meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) { -+ bh_unlock_sock(meta_sk); -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ return -1; -+ } -+ -+ if (sock_owned_by_user(meta_sk)) { -+ skb->sk = meta_sk; -+ if (unlikely(sk_add_backlog(meta_sk, skb, -+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) -+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); -+ else -+ /* Must make sure that upper layers won't free the -+ * skb if it is added to the backlog-queue. -+ */ -+ skb_get(skb); -+ } else { -+ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as -+ * the skb will finally be freed by tcp_v4_do_rcv (where we are -+ * coming from) -+ */ -+ skb_get(skb); -+ if (skb->protocol == htons(ETH_P_IP)) { -+ tcp_v4_do_rcv(meta_sk, skb); -+#if IS_ENABLED(CONFIG_IPV6) -+ } else { /* IPv6 */ -+ tcp_v6_do_rcv(meta_sk, skb); -+#endif /* CONFIG_IPV6 */ -+ } -+ } -+ -+ bh_unlock_sock(meta_sk); -+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ -+ return 0; -+} -+ -+/** -+ * Equivalent of tcp_fin() for MPTCP -+ * Can be called only when the FIN is validly part -+ * of the data seqnum space. Not before when we get holes. -+ */ -+void mptcp_fin(struct sock *meta_sk) -+{ -+ struct sock *sk = NULL, *sk_it; -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ -+ mptcp_for_each_sk(mpcb, sk_it) { -+ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) { -+ sk = sk_it; -+ break; -+ } -+ } -+ -+ if (!sk || sk->sk_state == TCP_CLOSE) -+ sk = mptcp_select_ack_sock(meta_sk); -+ -+ inet_csk_schedule_ack(sk); -+ -+ meta_sk->sk_shutdown |= RCV_SHUTDOWN; -+ sock_set_flag(meta_sk, SOCK_DONE); -+ -+ switch (meta_sk->sk_state) { -+ case TCP_SYN_RECV: -+ case TCP_ESTABLISHED: -+ /* Move to CLOSE_WAIT */ -+ tcp_set_state(meta_sk, TCP_CLOSE_WAIT); -+ inet_csk(sk)->icsk_ack.pingpong = 1; -+ break; -+ -+ case TCP_CLOSE_WAIT: -+ case TCP_CLOSING: -+ /* Received a retransmission of the FIN, do -+ * nothing. -+ */ -+ break; -+ case TCP_LAST_ACK: -+ /* RFC793: Remain in the LAST-ACK state. */ -+ break; -+ -+ case TCP_FIN_WAIT1: -+ /* This case occurs when a simultaneous close -+ * happens, we must ack the received FIN and -+ * enter the CLOSING state. -+ */ -+ tcp_send_ack(sk); -+ tcp_set_state(meta_sk, TCP_CLOSING); -+ break; -+ case TCP_FIN_WAIT2: -+ /* Received a FIN -- send ACK and enter TIME_WAIT. */ -+ tcp_send_ack(sk); -+ meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0); -+ break; -+ default: -+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these -+ * cases we should never reach this piece of code. -+ */ -+ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__, -+ meta_sk->sk_state); -+ break; -+ } -+ -+ /* It _is_ possible, that we have something out-of-order _after_ FIN. -+ * Probably, we should reset in this case. For now drop them. -+ */ -+ mptcp_purge_ofo_queue(meta_tp); -+ sk_mem_reclaim(meta_sk); -+ -+ if (!sock_flag(meta_sk, SOCK_DEAD)) { -+ meta_sk->sk_state_change(meta_sk); -+ -+ /* Do not send POLL_HUP for half duplex close. */ -+ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || -+ meta_sk->sk_state == TCP_CLOSE) -+ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP); -+ else -+ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN); -+ } -+ -+ return; -+} -+ -+static void mptcp_xmit_retransmit_queue(struct sock *meta_sk) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct sk_buff *skb; -+ -+ if (!meta_tp->packets_out) -+ return; -+ -+ tcp_for_write_queue(skb, meta_sk) { -+ if (skb == tcp_send_head(meta_sk)) -+ break; -+ -+ if (mptcp_retransmit_skb(meta_sk, skb)) -+ return; -+ -+ if (skb == tcp_write_queue_head(meta_sk)) -+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, -+ inet_csk(meta_sk)->icsk_rto, -+ TCP_RTO_MAX); -+ } -+} -+ -+/* Handle the DATA_ACK */ -+static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb) -+{ -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); -+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); -+ u32 prior_snd_una = meta_tp->snd_una; -+ int prior_packets; -+ u32 nwin, data_ack, data_seq; -+ u16 data_len = 0; -+ -+ /* A valid packet came in - subflow is operational again */ -+ tp->pf = 0; -+ -+ /* Even if there is no data-ack, we stop retransmitting. -+ * Except if this is a SYN/ACK. Then it is just a retransmission -+ */ -+ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) { -+ tp->mptcp->pre_established = 0; -+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); -+ } -+ -+ /* If we are in infinite mapping mode, rx_opt.data_ack has been -+ * set by mptcp_clean_rtx_infinite. -+ */ -+ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd) -+ goto exit; -+ -+ data_ack = tp->mptcp->rx_opt.data_ack; -+ -+ if (unlikely(!tp->mptcp->fully_established) && -+ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq) -+ /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1) -+ * includes a data-ack, we are fully established -+ */ -+ mptcp_become_fully_estab(sk); -+ -+ /* Get the data_seq */ -+ if (mptcp_is_data_seq(skb)) { -+ data_seq = tp->mptcp->rx_opt.data_seq; -+ data_len = tp->mptcp->rx_opt.data_len; -+ } else { -+ data_seq = meta_tp->snd_wl1; -+ } -+ -+ /* If the ack is older than previous acks -+ * then we can probably ignore it. -+ */ -+ if (before(data_ack, prior_snd_una)) -+ goto exit; -+ -+ /* If the ack includes data we haven't sent yet, discard -+ * this segment (RFC793 Section 3.9). -+ */ -+ if (after(data_ack, meta_tp->snd_nxt)) -+ goto exit; -+ -+ /*** Now, update the window - inspired by tcp_ack_update_window ***/ -+ nwin = ntohs(tcp_hdr(skb)->window); -+ -+ if (likely(!tcp_hdr(skb)->syn)) -+ nwin <<= tp->rx_opt.snd_wscale; -+ -+ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) { -+ tcp_update_wl(meta_tp, data_seq); -+ -+ /* Draft v09, Section 3.3.5: -+ * [...] It should only update its local receive window values -+ * when the largest sequence number allowed (i.e. DATA_ACK + -+ * receive window) increases. [...] -+ */ -+ if (meta_tp->snd_wnd != nwin && -+ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) { -+ meta_tp->snd_wnd = nwin; -+ -+ if (nwin > meta_tp->max_window) -+ meta_tp->max_window = nwin; -+ } -+ } -+ /*** Done, update the window ***/ -+ -+ /* We passed data and got it acked, remove any soft error -+ * log. Something worked... -+ */ -+ sk->sk_err_soft = 0; -+ inet_csk(meta_sk)->icsk_probes_out = 0; -+ meta_tp->rcv_tstamp = tcp_time_stamp; -+ prior_packets = meta_tp->packets_out; -+ if (!prior_packets) -+ goto no_queue; -+ -+ meta_tp->snd_una = data_ack; -+ -+ mptcp_clean_rtx_queue(meta_sk, prior_snd_una); -+ -+ /* We are in loss-state, and something got acked, retransmit the whole -+ * queue now! -+ */ -+ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss && -+ after(data_ack, prior_snd_una)) { -+ mptcp_xmit_retransmit_queue(meta_sk); -+ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open; -+ } -+ -+ /* Simplified version of tcp_new_space, because the snd-buffer -+ * is handled by all the subflows. -+ */ -+ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) { -+ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK); -+ if (meta_sk->sk_socket && -+ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) -+ meta_sk->sk_write_space(meta_sk); -+ } -+ -+ if (meta_sk->sk_state != TCP_ESTABLISHED && -+ mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len)) -+ return; -+ -+exit: -+ mptcp_push_pending_frames(meta_sk); -+ -+ return; -+ -+no_queue: -+ if (tcp_send_head(meta_sk)) -+ tcp_ack_probe(meta_sk); -+ -+ mptcp_push_pending_frames(meta_sk); -+ -+ return; -+} -+ -+void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk)); -+ -+ if (!tp->mpcb->infinite_mapping_snd) -+ return; -+ -+ /* The difference between both write_seq's represents the offset between -+ * data-sequence and subflow-sequence. As we are infinite, this must -+ * match. -+ * -+ * Thus, from this difference we can infer the meta snd_una. -+ */ -+ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt + -+ tp->snd_una; -+ -+ mptcp_data_ack(sk, skb); -+} -+ -+/**** static functions used by mptcp_parse_options */ -+ -+static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id) -+{ -+ struct sock *sk_it, *tmpsk; -+ -+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { -+ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) { -+ mptcp_reinject_data(sk_it, 0); -+ sk_it->sk_err = ECONNRESET; -+ if (tcp_need_reset(sk_it->sk_state)) -+ tcp_sk(sk_it)->ops->send_active_reset(sk_it, -+ GFP_ATOMIC); -+ mptcp_sub_force_close(sk_it); -+ } -+ } -+} -+ -+void mptcp_parse_options(const uint8_t *ptr, int opsize, -+ struct mptcp_options_received *mopt, -+ const struct sk_buff *skb) -+{ -+ const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr; -+ -+ /* If the socket is mp-capable we would have a mopt. */ -+ if (!mopt) -+ return; -+ -+ switch (mp_opt->sub) { -+ case MPTCP_SUB_CAPABLE: -+ { -+ const struct mp_capable *mpcapable = (struct mp_capable *)ptr; -+ -+ if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN && -+ opsize != MPTCP_SUB_LEN_CAPABLE_ACK) { -+ mptcp_debug("%s: mp_capable: bad option size %d\n", -+ __func__, opsize); -+ break; -+ } -+ -+ if (!sysctl_mptcp_enabled) -+ break; -+ -+ /* We only support MPTCP version 0 */ -+ if (mpcapable->ver != 0) -+ break; -+ -+ /* MPTCP-RFC 6824: -+ * "If receiving a message with the 'B' flag set to 1, and this -+ * is not understood, then this SYN MUST be silently ignored; -+ */ -+ if (mpcapable->b) { -+ mopt->drop_me = 1; -+ break; -+ } -+ -+ /* MPTCP-RFC 6824: -+ * "An implementation that only supports this method MUST set -+ * bit "H" to 1, and bits "C" through "G" to 0." -+ */ -+ if (!mpcapable->h) -+ break; -+ -+ mopt->saw_mpc = 1; -+ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a; -+ -+ if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN) -+ mopt->mptcp_key = mpcapable->sender_key; -+ -+ break; -+ } -+ case MPTCP_SUB_JOIN: -+ { -+ const struct mp_join *mpjoin = (struct mp_join *)ptr; -+ -+ if (opsize != MPTCP_SUB_LEN_JOIN_SYN && -+ opsize != MPTCP_SUB_LEN_JOIN_SYNACK && -+ opsize != MPTCP_SUB_LEN_JOIN_ACK) { -+ mptcp_debug("%s: mp_join: bad option size %d\n", -+ __func__, opsize); -+ break; -+ } -+ -+ /* saw_mpc must be set, because in tcp_check_req we assume that -+ * it is set to support falling back to reg. TCP if a rexmitted -+ * SYN has no MP_CAPABLE or MP_JOIN -+ */ -+ switch (opsize) { -+ case MPTCP_SUB_LEN_JOIN_SYN: -+ mopt->is_mp_join = 1; -+ mopt->saw_mpc = 1; -+ mopt->low_prio = mpjoin->b; -+ mopt->rem_id = mpjoin->addr_id; -+ mopt->mptcp_rem_token = mpjoin->u.syn.token; -+ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce; -+ break; -+ case MPTCP_SUB_LEN_JOIN_SYNACK: -+ mopt->saw_mpc = 1; -+ mopt->low_prio = mpjoin->b; -+ mopt->rem_id = mpjoin->addr_id; -+ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac; -+ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce; -+ break; -+ case MPTCP_SUB_LEN_JOIN_ACK: -+ mopt->saw_mpc = 1; -+ mopt->join_ack = 1; -+ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20); -+ break; -+ } -+ break; -+ } -+ case MPTCP_SUB_DSS: -+ { -+ const struct mp_dss *mdss = (struct mp_dss *)ptr; -+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); -+ -+ /* We check opsize for the csum and non-csum case. We do this, -+ * because the draft says that the csum SHOULD be ignored if -+ * it has not been negotiated in the MP_CAPABLE but still is -+ * present in the data. -+ * -+ * It will get ignored later in mptcp_queue_skb. -+ */ -+ if (opsize != mptcp_sub_len_dss(mdss, 0) && -+ opsize != mptcp_sub_len_dss(mdss, 1)) { -+ mptcp_debug("%s: mp_dss: bad option size %d\n", -+ __func__, opsize); -+ break; -+ } -+ -+ ptr += 4; -+ -+ if (mdss->A) { -+ tcb->mptcp_flags |= MPTCPHDR_ACK; -+ -+ if (mdss->a) { -+ mopt->data_ack = (u32) get_unaligned_be64(ptr); -+ ptr += MPTCP_SUB_LEN_ACK_64; -+ } else { -+ mopt->data_ack = get_unaligned_be32(ptr); -+ ptr += MPTCP_SUB_LEN_ACK; -+ } -+ } -+ -+ tcb->dss_off = (ptr - skb_transport_header(skb)); -+ -+ if (mdss->M) { -+ if (mdss->m) { -+ u64 data_seq64 = get_unaligned_be64(ptr); -+ -+ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET; -+ mopt->data_seq = (u32) data_seq64; -+ -+ ptr += 12; /* 64-bit dseq + subseq */ -+ } else { -+ mopt->data_seq = get_unaligned_be32(ptr); -+ ptr += 8; /* 32-bit dseq + subseq */ -+ } -+ mopt->data_len = get_unaligned_be16(ptr); -+ -+ tcb->mptcp_flags |= MPTCPHDR_SEQ; -+ -+ /* Is a check-sum present? */ -+ if (opsize == mptcp_sub_len_dss(mdss, 1)) -+ tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM; -+ -+ /* DATA_FIN only possible with DSS-mapping */ -+ if (mdss->F) -+ tcb->mptcp_flags |= MPTCPHDR_FIN; -+ } -+ -+ break; -+ } -+ case MPTCP_SUB_ADD_ADDR: -+ { -+#if IS_ENABLED(CONFIG_IPV6) -+ const struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; -+ -+ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 && -+ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) || -+ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 && -+ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) { -+#else -+ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 && -+ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) { -+#endif /* CONFIG_IPV6 */ -+ mptcp_debug("%s: mp_add_addr: bad option size %d\n", -+ __func__, opsize); -+ break; -+ } -+ -+ /* We have to manually parse the options if we got two of them. */ -+ if (mopt->saw_add_addr) { -+ mopt->more_add_addr = 1; -+ break; -+ } -+ mopt->saw_add_addr = 1; -+ mopt->add_addr_ptr = ptr; -+ break; -+ } -+ case MPTCP_SUB_REMOVE_ADDR: -+ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) { -+ mptcp_debug("%s: mp_remove_addr: bad option size %d\n", -+ __func__, opsize); -+ break; -+ } -+ -+ if (mopt->saw_rem_addr) { -+ mopt->more_rem_addr = 1; -+ break; -+ } -+ mopt->saw_rem_addr = 1; -+ mopt->rem_addr_ptr = ptr; -+ break; -+ case MPTCP_SUB_PRIO: -+ { -+ const struct mp_prio *mpprio = (struct mp_prio *)ptr; -+ -+ if (opsize != MPTCP_SUB_LEN_PRIO && -+ opsize != MPTCP_SUB_LEN_PRIO_ADDR) { -+ mptcp_debug("%s: mp_prio: bad option size %d\n", -+ __func__, opsize); -+ break; -+ } -+ -+ mopt->saw_low_prio = 1; -+ mopt->low_prio = mpprio->b; -+ -+ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) { -+ mopt->saw_low_prio = 2; -+ mopt->prio_addr_id = mpprio->addr_id; -+ } -+ break; -+ } -+ case MPTCP_SUB_FAIL: -+ if (opsize != MPTCP_SUB_LEN_FAIL) { -+ mptcp_debug("%s: mp_fail: bad option size %d\n", -+ __func__, opsize); -+ break; -+ } -+ mopt->mp_fail = 1; -+ break; -+ case MPTCP_SUB_FCLOSE: -+ if (opsize != MPTCP_SUB_LEN_FCLOSE) { -+ mptcp_debug("%s: mp_fclose: bad option size %d\n", -+ __func__, opsize); -+ break; -+ } -+ -+ mopt->mp_fclose = 1; -+ mopt->mptcp_key = ((struct mp_fclose *)ptr)->key; -+ -+ break; -+ default: -+ mptcp_debug("%s: Received unkown subtype: %d\n", -+ __func__, mp_opt->sub); -+ break; -+ } -+} -+ -+/** Parse only MPTCP options */ -+void tcp_parse_mptcp_options(const struct sk_buff *skb, -+ struct mptcp_options_received *mopt) -+{ -+ const struct tcphdr *th = tcp_hdr(skb); -+ int length = (th->doff * 4) - sizeof(struct tcphdr); -+ const unsigned char *ptr = (const unsigned char *)(th + 1); -+ -+ while (length > 0) { -+ int opcode = *ptr++; -+ int opsize; -+ -+ switch (opcode) { -+ case TCPOPT_EOL: -+ return; -+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ -+ length--; -+ continue; -+ default: -+ opsize = *ptr++; -+ if (opsize < 2) /* "silly options" */ -+ return; -+ if (opsize > length) -+ return; /* don't parse partial options */ -+ if (opcode == TCPOPT_MPTCP) -+ mptcp_parse_options(ptr - 2, opsize, mopt, skb); -+ } -+ ptr += opsize - 2; -+ length -= opsize; -+ } -+} -+ -+int mptcp_check_rtt(const struct tcp_sock *tp, int time) -+{ -+ struct mptcp_cb *mpcb = tp->mpcb; -+ struct sock *sk; -+ u32 rtt_max = 0; -+ -+ /* In MPTCP, we take the max delay across all flows, -+ * in order to take into account meta-reordering buffers. -+ */ -+ mptcp_for_each_sk(mpcb, sk) { -+ if (!mptcp_sk_can_recv(sk)) -+ continue; -+ -+ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt) -+ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt; -+ } -+ if (time < (rtt_max >> 3) || !rtt_max) -+ return 1; -+ -+ return 0; -+} -+ -+static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk) -+{ -+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; -+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; -+ __be16 port = 0; -+ union inet_addr addr; -+ sa_family_t family; -+ -+ if (mpadd->ipver == 4) { -+ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2) -+ port = mpadd->u.v4.port; -+ family = AF_INET; -+ addr.in = mpadd->u.v4.addr; -+#if IS_ENABLED(CONFIG_IPV6) -+ } else if (mpadd->ipver == 6) { -+ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2) -+ port = mpadd->u.v6.port; -+ family = AF_INET6; -+ addr.in6 = mpadd->u.v6.addr; -+#endif /* CONFIG_IPV6 */ -+ } else { -+ return; -+ } -+ -+ if (mpcb->pm_ops->add_raddr) -+ mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id); -+} -+ -+static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk) -+{ -+ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; -+ int i; -+ u8 rem_id; -+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; -+ -+ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) { -+ rem_id = (&mprem->addrs_id)[i]; -+ -+ if (mpcb->pm_ops->rem_raddr) -+ mpcb->pm_ops->rem_raddr(mpcb, rem_id); -+ mptcp_send_reset_rem_id(mpcb, rem_id); -+ } -+} -+ -+static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk) -+{ -+ struct tcphdr *th = tcp_hdr(skb); -+ unsigned char *ptr; -+ int length = (th->doff * 4) - sizeof(struct tcphdr); -+ -+ /* Jump through the options to check whether ADD_ADDR is there */ -+ ptr = (unsigned char *)(th + 1); -+ while (length > 0) { -+ int opcode = *ptr++; -+ int opsize; -+ -+ switch (opcode) { -+ case TCPOPT_EOL: -+ return; -+ case TCPOPT_NOP: -+ length--; -+ continue; -+ default: -+ opsize = *ptr++; -+ if (opsize < 2) -+ return; -+ if (opsize > length) -+ return; /* don't parse partial options */ -+ if (opcode == TCPOPT_MPTCP && -+ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) { -+#if IS_ENABLED(CONFIG_IPV6) -+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; -+ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 && -+ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) || -+ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 && -+ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) -+#else -+ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 && -+ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) -+#endif /* CONFIG_IPV6 */ -+ goto cont; -+ -+ mptcp_handle_add_addr(ptr, sk); -+ } -+ if (opcode == TCPOPT_MPTCP && -+ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) { -+ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) -+ goto cont; -+ -+ mptcp_handle_rem_addr(ptr, sk); -+ } -+cont: -+ ptr += opsize - 2; -+ length -= opsize; -+ } -+ } -+ return; -+} -+ -+static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th) -+{ -+ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp; -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; -+ -+ if (unlikely(mptcp->rx_opt.mp_fail)) { -+ mptcp->rx_opt.mp_fail = 0; -+ -+ if (!th->rst && !mpcb->infinite_mapping_snd) { -+ struct sock *sk_it; -+ -+ mpcb->send_infinite_mapping = 1; -+ /* We resend everything that has not been acknowledged */ -+ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk); -+ -+ /* We artificially restart the whole send-queue. Thus, -+ * it is as if no packets are in flight -+ */ -+ tcp_sk(meta_sk)->packets_out = 0; -+ -+ /* If the snd_nxt already wrapped around, we have to -+ * undo the wrapping, as we are restarting from snd_una -+ * on. -+ */ -+ if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) { -+ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2; -+ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; -+ } -+ tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una; -+ -+ /* Trigger a sending on the meta. */ -+ mptcp_push_pending_frames(meta_sk); -+ -+ mptcp_for_each_sk(mpcb, sk_it) { -+ if (sk != sk_it) -+ mptcp_sub_force_close(sk_it); -+ } -+ } -+ -+ return 0; -+ } -+ -+ if (unlikely(mptcp->rx_opt.mp_fclose)) { -+ struct sock *sk_it, *tmpsk; -+ -+ mptcp->rx_opt.mp_fclose = 0; -+ if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key) -+ return 0; -+ -+ if (tcp_need_reset(sk->sk_state)) -+ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); -+ -+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) -+ mptcp_sub_force_close(sk_it); -+ -+ tcp_reset(meta_sk); -+ -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static inline void mptcp_path_array_check(struct sock *meta_sk) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ -+ if (unlikely(mpcb->list_rcvd)) { -+ mpcb->list_rcvd = 0; -+ if (mpcb->pm_ops->new_remote_address) -+ mpcb->pm_ops->new_remote_address(meta_sk); -+ } -+} -+ -+int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, -+ const struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; -+ -+ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd) -+ return 0; -+ -+ if (mptcp_mp_fail_rcvd(sk, th)) -+ return 1; -+ -+ /* RFC 6824, Section 3.3: -+ * If a checksum is not present when its use has been negotiated, the -+ * receiver MUST close the subflow with a RST as it is considered broken. -+ */ -+ if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum && -+ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) { -+ if (tcp_need_reset(sk->sk_state)) -+ tp->ops->send_active_reset(sk, GFP_ATOMIC); -+ -+ mptcp_sub_force_close(sk); -+ return 1; -+ } -+ -+ /* We have to acknowledge retransmissions of the third -+ * ack. -+ */ -+ if (mopt->join_ack) { -+ tcp_send_delayed_ack(sk); -+ mopt->join_ack = 0; -+ } -+ -+ if (mopt->saw_add_addr || mopt->saw_rem_addr) { -+ if (mopt->more_add_addr || mopt->more_rem_addr) { -+ mptcp_parse_addropt(skb, sk); -+ } else { -+ if (mopt->saw_add_addr) -+ mptcp_handle_add_addr(mopt->add_addr_ptr, sk); -+ if (mopt->saw_rem_addr) -+ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk); -+ } -+ -+ mopt->more_add_addr = 0; -+ mopt->saw_add_addr = 0; -+ mopt->more_rem_addr = 0; -+ mopt->saw_rem_addr = 0; -+ } -+ if (mopt->saw_low_prio) { -+ if (mopt->saw_low_prio == 1) { -+ tp->mptcp->rcv_low_prio = mopt->low_prio; -+ } else { -+ struct sock *sk_it; -+ mptcp_for_each_sk(tp->mpcb, sk_it) { -+ struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp; -+ if (mptcp->rem_id == mopt->prio_addr_id) -+ mptcp->rcv_low_prio = mopt->low_prio; -+ } -+ } -+ mopt->saw_low_prio = 0; -+ } -+ -+ mptcp_data_ack(sk, skb); -+ -+ mptcp_path_array_check(mptcp_meta_sk(sk)); -+ /* Socket may have been mp_killed by a REMOVE_ADDR */ -+ if (tp->mp_killed) -+ return 1; -+ -+ return 0; -+} -+ -+/* In case of fastopen, some data can already be in the write queue. -+ * We need to update the sequence number of the segments as they -+ * were initially TCP sequence numbers. -+ */ -+static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk); -+ struct sk_buff *skb; -+ u32 new_mapping = meta_tp->write_seq - master_tp->snd_una; -+ -+ /* There should only be one skb in write queue: the data not -+ * acknowledged in the SYN+ACK. In this case, we need to map -+ * this data to data sequence numbers. -+ */ -+ skb_queue_walk(&meta_sk->sk_write_queue, skb) { -+ /* If the server only acknowledges partially the data sent in -+ * the SYN, we need to trim the acknowledged part because -+ * we don't want to retransmit this already received data. -+ * When we reach this point, tcp_ack() has already cleaned up -+ * fully acked segments. However, tcp trims partially acked -+ * segments only when retransmitting. Since MPTCP comes into -+ * play only now, we will fake an initial transmit, and -+ * retransmit_skb() will not be called. The following fragment -+ * comes from __tcp_retransmit_skb(). -+ */ -+ if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) { -+ BUG_ON(before(TCP_SKB_CB(skb)->end_seq, -+ master_tp->snd_una)); -+ /* tcp_trim_head can only returns ENOMEM if skb is -+ * cloned. It is not the case here (see -+ * tcp_send_syn_data). -+ */ -+ BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una - -+ TCP_SKB_CB(skb)->seq)); -+ } -+ -+ TCP_SKB_CB(skb)->seq += new_mapping; -+ TCP_SKB_CB(skb)->end_seq += new_mapping; -+ } -+ -+ /* We can advance write_seq by the number of bytes unacknowledged -+ * and that were mapped in the previous loop. -+ */ -+ meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una; -+ -+ /* The packets from the master_sk will be entailed to it later -+ * Until that time, its write queue is empty, and -+ * write_seq must align with snd_una -+ */ -+ master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una; -+ master_tp->packets_out = 0; -+ -+ /* Although these data have been sent already over the subsk, -+ * They have never been sent over the meta_sk, so we rewind -+ * the send_head so that tcp considers it as an initial send -+ * (instead of retransmit). -+ */ -+ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk); -+} -+ -+/* The skptr is needed, because if we become MPTCP-capable, we have to switch -+ * from meta-socket to master-socket. -+ * -+ * @return: 1 - we want to reset this connection -+ * 2 - we want to discard the received syn/ack -+ * 0 - everything is fine - continue -+ */ -+int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, -+ const struct sk_buff *skb, -+ const struct mptcp_options_received *mopt) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ if (mptcp(tp)) { -+ u8 hash_mac_check[20]; -+ struct mptcp_cb *mpcb = tp->mpcb; -+ -+ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, -+ (u8 *)&mpcb->mptcp_loc_key, -+ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, -+ (u8 *)&tp->mptcp->mptcp_loc_nonce, -+ (u32 *)hash_mac_check); -+ if (memcmp(hash_mac_check, -+ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) { -+ mptcp_sub_force_close(sk); -+ return 1; -+ } -+ -+ /* Set this flag in order to postpone data sending -+ * until the 4th ack arrives. -+ */ -+ tp->mptcp->pre_established = 1; -+ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio; -+ -+ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, -+ (u8 *)&mpcb->mptcp_rem_key, -+ (u8 *)&tp->mptcp->mptcp_loc_nonce, -+ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, -+ (u32 *)&tp->mptcp->sender_mac[0]); -+ -+ } else if (mopt->saw_mpc) { -+ struct sock *meta_sk = sk; -+ -+ if (mptcp_create_master_sk(sk, mopt->mptcp_key, -+ ntohs(tcp_hdr(skb)->window))) -+ return 2; -+ -+ sk = tcp_sk(sk)->mpcb->master_sk; -+ *skptr = sk; -+ tp = tcp_sk(sk); -+ -+ /* If fastopen was used data might be in the send queue. We -+ * need to update their sequence number to MPTCP-level seqno. -+ * Note that it can happen in rare cases that fastopen_req is -+ * NULL and syn_data is 0 but fastopen indeed occurred and -+ * data has been queued in the write queue (but not sent). -+ * Example of such rare cases: connect is non-blocking and -+ * TFO is configured to work without cookies. -+ */ -+ if (!skb_queue_empty(&meta_sk->sk_write_queue)) -+ mptcp_rcv_synsent_fastopen(meta_sk); -+ -+ /* -1, because the SYN consumed 1 byte. In case of TFO, we -+ * start the subflow-sequence number as if the data of the SYN -+ * is not part of any mapping. -+ */ -+ tp->mptcp->snt_isn = tp->snd_una - 1; -+ tp->mpcb->dss_csum = mopt->dss_csum; -+ tp->mptcp->include_mpc = 1; -+ -+ /* Ensure that fastopen is handled at the meta-level. */ -+ tp->fastopen_req = NULL; -+ -+ sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket); -+ sk->sk_wq = mptcp_meta_sk(sk)->sk_wq; -+ -+ /* hold in sk_clone_lock due to initialization to 2 */ -+ sock_put(sk); -+ } else { -+ tp->request_mptcp = 0; -+ -+ if (tp->inside_tk_table) -+ mptcp_hash_remove(tp); -+ } -+ -+ if (mptcp(tp)) -+ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq; -+ -+ return 0; -+} -+ -+bool mptcp_should_expand_sndbuf(const struct sock *sk) -+{ -+ const struct sock *sk_it; -+ const struct sock *meta_sk = mptcp_meta_sk(sk); -+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ int cnt_backups = 0; -+ int backup_available = 0; -+ -+ /* We circumvent this check in tcp_check_space, because we want to -+ * always call sk_write_space. So, we reproduce the check here. -+ */ -+ if (!meta_sk->sk_socket || -+ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) -+ return false; -+ -+ /* If the user specified a specific send buffer setting, do -+ * not modify it. -+ */ -+ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) -+ return false; -+ -+ /* If we are under global TCP memory pressure, do not expand. */ -+ if (sk_under_memory_pressure(meta_sk)) -+ return false; -+ -+ /* If we are under soft global TCP memory pressure, do not expand. */ -+ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0)) -+ return false; -+ -+ -+ /* For MPTCP we look for a subsocket that could send data. -+ * If we found one, then we update the send-buffer. -+ */ -+ mptcp_for_each_sk(meta_tp->mpcb, sk_it) { -+ struct tcp_sock *tp_it = tcp_sk(sk_it); -+ -+ if (!mptcp_sk_can_send(sk_it)) -+ continue; -+ -+ /* Backup-flows have to be counted - if there is no other -+ * subflow we take the backup-flow into account. -+ */ -+ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) -+ cnt_backups++; -+ -+ if (tp_it->packets_out < tp_it->snd_cwnd) { -+ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) { -+ backup_available = 1; -+ continue; -+ } -+ return true; -+ } -+ } -+ -+ /* Backup-flow is available for sending - update send-buffer */ -+ if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available) -+ return true; -+ return false; -+} -+ -+void mptcp_init_buffer_space(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ int space; -+ -+ tcp_init_buffer_space(sk); -+ -+ if (is_master_tp(tp)) { -+ meta_tp->rcvq_space.space = meta_tp->rcv_wnd; -+ meta_tp->rcvq_space.time = tcp_time_stamp; -+ meta_tp->rcvq_space.seq = meta_tp->copied_seq; -+ -+ /* If there is only one subflow, we just use regular TCP -+ * autotuning. User-locks are handled already by -+ * tcp_init_buffer_space -+ */ -+ meta_tp->window_clamp = tp->window_clamp; -+ meta_tp->rcv_ssthresh = tp->rcv_ssthresh; -+ meta_sk->sk_rcvbuf = sk->sk_rcvbuf; -+ meta_sk->sk_sndbuf = sk->sk_sndbuf; -+ -+ return; -+ } -+ -+ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK) -+ goto snd_buf; -+ -+ /* Adding a new subflow to the rcv-buffer space. We make a simple -+ * addition, to give some space to allow traffic on the new subflow. -+ * Autotuning will increase it further later on. -+ */ -+ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]); -+ if (space > meta_sk->sk_rcvbuf) { -+ meta_tp->window_clamp += tp->window_clamp; -+ meta_tp->rcv_ssthresh += tp->rcv_ssthresh; -+ meta_sk->sk_rcvbuf = space; -+ } -+ -+snd_buf: -+ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) -+ return; -+ -+ /* Adding a new subflow to the send-buffer space. We make a simple -+ * addition, to give some space to allow traffic on the new subflow. -+ * Autotuning will increase it further later on. -+ */ -+ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]); -+ if (space > meta_sk->sk_sndbuf) { -+ meta_sk->sk_sndbuf = space; -+ meta_sk->sk_write_space(meta_sk); -+ } -+} -+ -+void mptcp_tcp_set_rto(struct sock *sk) -+{ -+ tcp_set_rto(sk); -+ mptcp_set_rto(sk); -+} -diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c -new file mode 100644 -index 000000000000..1183d1305d35 ---- /dev/null -+++ b/net/mptcp/mptcp_ipv4.c -@@ -0,0 +1,483 @@ -+/* -+ * MPTCP implementation - IPv4-specific functions -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) -+{ -+ u32 hash[MD5_DIGEST_WORDS]; -+ -+ hash[0] = (__force u32)saddr; -+ hash[1] = (__force u32)daddr; -+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport; -+ hash[3] = mptcp_seed++; -+ -+ md5_transform(hash, mptcp_secret); -+ -+ return hash[0]; -+} -+ -+u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) -+{ -+ u32 hash[MD5_DIGEST_WORDS]; -+ -+ hash[0] = (__force u32)saddr; -+ hash[1] = (__force u32)daddr; -+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport; -+ hash[3] = mptcp_seed++; -+ -+ md5_transform(hash, mptcp_secret); -+ -+ return *((u64 *)hash); -+} -+ -+ -+static void mptcp_v4_reqsk_destructor(struct request_sock *req) -+{ -+ mptcp_reqsk_destructor(req); -+ -+ tcp_v4_reqsk_destructor(req); -+} -+ -+static int mptcp_v4_init_req(struct request_sock *req, struct sock *sk, -+ struct sk_buff *skb) -+{ -+ tcp_request_sock_ipv4_ops.init_req(req, sk, skb); -+ mptcp_reqsk_init(req, skb); -+ -+ return 0; -+} -+ -+static int mptcp_v4_join_init_req(struct request_sock *req, struct sock *sk, -+ struct sk_buff *skb) -+{ -+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); -+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; -+ union inet_addr addr; -+ int loc_id; -+ bool low_prio = false; -+ -+ /* We need to do this as early as possible. Because, if we fail later -+ * (e.g., get_local_id), then reqsk_free tries to remove the -+ * request-socket from the htb in mptcp_hash_request_remove as pprev -+ * may be different from NULL. -+ */ -+ mtreq->hash_entry.pprev = NULL; -+ -+ tcp_request_sock_ipv4_ops.init_req(req, sk, skb); -+ -+ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr, -+ ip_hdr(skb)->daddr, -+ tcp_hdr(skb)->source, -+ tcp_hdr(skb)->dest); -+ addr.ip = inet_rsk(req)->ir_loc_addr; -+ loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(sk), &low_prio); -+ if (loc_id == -1) -+ return -1; -+ mtreq->loc_id = loc_id; -+ mtreq->low_prio = low_prio; -+ -+ mptcp_join_reqsk_init(mpcb, req, skb); -+ -+ return 0; -+} -+ -+/* Similar to tcp_request_sock_ops */ -+struct request_sock_ops mptcp_request_sock_ops __read_mostly = { -+ .family = PF_INET, -+ .obj_size = sizeof(struct mptcp_request_sock), -+ .rtx_syn_ack = tcp_rtx_synack, -+ .send_ack = tcp_v4_reqsk_send_ack, -+ .destructor = mptcp_v4_reqsk_destructor, -+ .send_reset = tcp_v4_send_reset, -+ .syn_ack_timeout = tcp_syn_ack_timeout, -+}; -+ -+static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk, -+ struct request_sock *req, -+ const unsigned long timeout) -+{ -+ const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, -+ inet_rsk(req)->ir_rmt_port, -+ 0, MPTCP_HASH_SIZE); -+ /* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not -+ * want to reset the keepalive-timer (responsible for retransmitting -+ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot -+ * overload the keepalive timer. Also, it's not a big deal, because the -+ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So, -+ * if the third ACK gets lost, the client will handle the retransmission -+ * anyways. If our SYN/ACK gets lost, the client will retransmit the -+ * SYN. -+ */ -+ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); -+ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt; -+ const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, -+ inet_rsk(req)->ir_rmt_port, -+ lopt->hash_rnd, lopt->nr_table_entries); -+ -+ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout); -+ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0) -+ mptcp_reset_synack_timer(meta_sk, timeout); -+ -+ rcu_read_lock(); -+ spin_lock(&mptcp_reqsk_hlock); -+ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]); -+ spin_unlock(&mptcp_reqsk_hlock); -+ rcu_read_unlock(); -+} -+ -+/* Similar to tcp_v4_conn_request */ -+static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb) -+{ -+ return tcp_conn_request(&mptcp_request_sock_ops, -+ &mptcp_join_request_sock_ipv4_ops, -+ meta_sk, skb); -+} -+ -+/* We only process join requests here. (either the SYN or the final ACK) */ -+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct sock *child, *rsk = NULL; -+ int ret; -+ -+ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) { -+ struct tcphdr *th = tcp_hdr(skb); -+ const struct iphdr *iph = ip_hdr(skb); -+ struct sock *sk; -+ -+ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo, -+ iph->saddr, th->source, iph->daddr, -+ th->dest, inet_iif(skb)); -+ -+ if (!sk) { -+ kfree_skb(skb); -+ return 0; -+ } -+ if (is_meta_sk(sk)) { -+ WARN("%s Did not find a sub-sk - did found the meta!\n", __func__); -+ kfree_skb(skb); -+ sock_put(sk); -+ return 0; -+ } -+ -+ if (sk->sk_state == TCP_TIME_WAIT) { -+ inet_twsk_put(inet_twsk(sk)); -+ kfree_skb(skb); -+ return 0; -+ } -+ -+ ret = tcp_v4_do_rcv(sk, skb); -+ sock_put(sk); -+ -+ return ret; -+ } -+ TCP_SKB_CB(skb)->mptcp_flags = 0; -+ -+ /* Has been removed from the tk-table. Thus, no new subflows. -+ * -+ * Check for close-state is necessary, because we may have been closed -+ * without passing by mptcp_close(). -+ * -+ * When falling back, no new subflows are allowed either. -+ */ -+ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table || -+ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) -+ goto reset_and_discard; -+ -+ child = tcp_v4_hnd_req(meta_sk, skb); -+ -+ if (!child) -+ goto discard; -+ -+ if (child != meta_sk) { -+ sock_rps_save_rxhash(child, skb); -+ /* We don't call tcp_child_process here, because we hold -+ * already the meta-sk-lock and are sure that it is not owned -+ * by the user. -+ */ -+ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len); -+ bh_unlock_sock(child); -+ sock_put(child); -+ if (ret) { -+ rsk = child; -+ goto reset_and_discard; -+ } -+ } else { -+ if (tcp_hdr(skb)->syn) { -+ mptcp_v4_join_request(meta_sk, skb); -+ goto discard; -+ } -+ goto reset_and_discard; -+ } -+ return 0; -+ -+reset_and_discard: -+ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) { -+ const struct tcphdr *th = tcp_hdr(skb); -+ const struct iphdr *iph = ip_hdr(skb); -+ struct request_sock **prev, *req; -+ /* If we end up here, it means we should not have matched on the -+ * request-socket. But, because the request-sock queue is only -+ * destroyed in mptcp_close, the socket may actually already be -+ * in close-state (e.g., through shutdown()) while still having -+ * pending request sockets. -+ */ -+ req = inet_csk_search_req(meta_sk, &prev, th->source, -+ iph->saddr, iph->daddr); -+ if (req) { -+ inet_csk_reqsk_queue_unlink(meta_sk, req, prev); -+ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, -+ req); -+ reqsk_free(req); -+ } -+ } -+ -+ tcp_v4_send_reset(rsk, skb); -+discard: -+ kfree_skb(skb); -+ return 0; -+} -+ -+/* After this, the ref count of the meta_sk associated with the request_sock -+ * is incremented. Thus it is the responsibility of the caller -+ * to call sock_put() when the reference is not needed anymore. -+ */ -+struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr, -+ const __be32 laddr, const struct net *net) -+{ -+ const struct mptcp_request_sock *mtreq; -+ struct sock *meta_sk = NULL; -+ const struct hlist_nulls_node *node; -+ const u32 hash = inet_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE); -+ -+ rcu_read_lock(); -+begin: -+ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash], -+ hash_entry) { -+ struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq)); -+ meta_sk = mtreq->mptcp_mpcb->meta_sk; -+ -+ if (ireq->ir_rmt_port == rport && -+ ireq->ir_rmt_addr == raddr && -+ ireq->ir_loc_addr == laddr && -+ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET && -+ net_eq(net, sock_net(meta_sk))) -+ goto found; -+ meta_sk = NULL; -+ } -+ /* A request-socket is destroyed by RCU. So, it might have been recycled -+ * and put into another hash-table list. So, after the lookup we may -+ * end up in a different list. So, we may need to restart. -+ * -+ * See also the comment in __inet_lookup_established. -+ */ -+ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE) -+ goto begin; -+ -+found: -+ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) -+ meta_sk = NULL; -+ rcu_read_unlock(); -+ -+ return meta_sk; -+} -+ -+/* Create a new IPv4 subflow. -+ * -+ * We are in user-context and meta-sock-lock is hold. -+ */ -+int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, -+ struct mptcp_rem4 *rem) -+{ -+ struct tcp_sock *tp; -+ struct sock *sk; -+ struct sockaddr_in loc_in, rem_in; -+ struct socket sock; -+ int ret; -+ -+ /** First, create and prepare the new socket */ -+ -+ sock.type = meta_sk->sk_socket->type; -+ sock.state = SS_UNCONNECTED; -+ sock.wq = meta_sk->sk_socket->wq; -+ sock.file = meta_sk->sk_socket->file; -+ sock.ops = NULL; -+ -+ ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1); -+ if (unlikely(ret < 0)) { -+ mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret); -+ return ret; -+ } -+ -+ sk = sock.sk; -+ tp = tcp_sk(sk); -+ -+ /* All subsockets need the MPTCP-lock-class */ -+ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP"); -+ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0); -+ -+ if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL)) -+ goto error; -+ -+ tp->mptcp->slave_sk = 1; -+ tp->mptcp->low_prio = loc->low_prio; -+ -+ /* Initializing the timer for an MPTCP subflow */ -+ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk); -+ -+ /** Then, connect the socket to the peer */ -+ loc_in.sin_family = AF_INET; -+ rem_in.sin_family = AF_INET; -+ loc_in.sin_port = 0; -+ if (rem->port) -+ rem_in.sin_port = rem->port; -+ else -+ rem_in.sin_port = inet_sk(meta_sk)->inet_dport; -+ loc_in.sin_addr = loc->addr; -+ rem_in.sin_addr = rem->addr; -+ -+ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in)); -+ if (ret < 0) { -+ mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n", -+ __func__, ret); -+ goto error; -+ } -+ -+ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n", -+ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, -+ tp->mptcp->path_index, &loc_in.sin_addr, -+ ntohs(loc_in.sin_port), &rem_in.sin_addr, -+ ntohs(rem_in.sin_port)); -+ -+ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4) -+ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr); -+ -+ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in, -+ sizeof(struct sockaddr_in), O_NONBLOCK); -+ if (ret < 0 && ret != -EINPROGRESS) { -+ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n", -+ __func__, ret); -+ goto error; -+ } -+ -+ sk_set_socket(sk, meta_sk->sk_socket); -+ sk->sk_wq = meta_sk->sk_wq; -+ -+ return 0; -+ -+error: -+ /* May happen if mptcp_add_sock fails first */ -+ if (!mptcp(tp)) { -+ tcp_close(sk, 0); -+ } else { -+ local_bh_disable(); -+ mptcp_sub_force_close(sk); -+ local_bh_enable(); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(mptcp_init4_subsockets); -+ -+const struct inet_connection_sock_af_ops mptcp_v4_specific = { -+ .queue_xmit = ip_queue_xmit, -+ .send_check = tcp_v4_send_check, -+ .rebuild_header = inet_sk_rebuild_header, -+ .sk_rx_dst_set = inet_sk_rx_dst_set, -+ .conn_request = mptcp_conn_request, -+ .syn_recv_sock = tcp_v4_syn_recv_sock, -+ .net_header_len = sizeof(struct iphdr), -+ .setsockopt = ip_setsockopt, -+ .getsockopt = ip_getsockopt, -+ .addr2sockaddr = inet_csk_addr2sockaddr, -+ .sockaddr_len = sizeof(struct sockaddr_in), -+ .bind_conflict = inet_csk_bind_conflict, -+#ifdef CONFIG_COMPAT -+ .compat_setsockopt = compat_ip_setsockopt, -+ .compat_getsockopt = compat_ip_getsockopt, -+#endif -+}; -+ -+struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops; -+struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops; -+ -+/* General initialization of IPv4 for MPTCP */ -+int mptcp_pm_v4_init(void) -+{ -+ int ret = 0; -+ struct request_sock_ops *ops = &mptcp_request_sock_ops; -+ -+ mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; -+ mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req; -+ -+ mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; -+ mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req; -+ mptcp_join_request_sock_ipv4_ops.queue_hash_add = mptcp_v4_reqsk_queue_hash_add; -+ -+ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP"); -+ if (ops->slab_name == NULL) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, -+ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, -+ NULL); -+ -+ if (ops->slab == NULL) { -+ ret = -ENOMEM; -+ goto err_reqsk_create; -+ } -+ -+out: -+ return ret; -+ -+err_reqsk_create: -+ kfree(ops->slab_name); -+ ops->slab_name = NULL; -+ goto out; -+} -+ -+void mptcp_pm_v4_undo(void) -+{ -+ kmem_cache_destroy(mptcp_request_sock_ops.slab); -+ kfree(mptcp_request_sock_ops.slab_name); -+} -diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c -new file mode 100644 -index 000000000000..1036973aa855 ---- /dev/null -+++ b/net/mptcp/mptcp_ipv6.c -@@ -0,0 +1,518 @@ -+/* -+ * MPTCP implementation - IPv6-specific functions -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer: -+ * Jaakko Korkeaniemi -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, -+ __be16 sport, __be16 dport) -+{ -+ u32 secret[MD5_MESSAGE_BYTES / 4]; -+ u32 hash[MD5_DIGEST_WORDS]; -+ u32 i; -+ -+ memcpy(hash, saddr, 16); -+ for (i = 0; i < 4; i++) -+ secret[i] = mptcp_secret[i] + (__force u32)daddr[i]; -+ secret[4] = mptcp_secret[4] + -+ (((__force u16)sport << 16) + (__force u16)dport); -+ secret[5] = mptcp_seed++; -+ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++) -+ secret[i] = mptcp_secret[i]; -+ -+ md5_transform(hash, secret); -+ -+ return hash[0]; -+} -+ -+u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, -+ __be16 sport, __be16 dport) -+{ -+ u32 secret[MD5_MESSAGE_BYTES / 4]; -+ u32 hash[MD5_DIGEST_WORDS]; -+ u32 i; -+ -+ memcpy(hash, saddr, 16); -+ for (i = 0; i < 4; i++) -+ secret[i] = mptcp_secret[i] + (__force u32)daddr[i]; -+ secret[4] = mptcp_secret[4] + -+ (((__force u16)sport << 16) + (__force u16)dport); -+ secret[5] = mptcp_seed++; -+ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++) -+ secret[i] = mptcp_secret[i]; -+ -+ md5_transform(hash, secret); -+ -+ return *((u64 *)hash); -+} -+ -+static void mptcp_v6_reqsk_destructor(struct request_sock *req) -+{ -+ mptcp_reqsk_destructor(req); -+ -+ tcp_v6_reqsk_destructor(req); -+} -+ -+static int mptcp_v6_init_req(struct request_sock *req, struct sock *sk, -+ struct sk_buff *skb) -+{ -+ tcp_request_sock_ipv6_ops.init_req(req, sk, skb); -+ mptcp_reqsk_init(req, skb); -+ -+ return 0; -+} -+ -+static int mptcp_v6_join_init_req(struct request_sock *req, struct sock *sk, -+ struct sk_buff *skb) -+{ -+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); -+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; -+ union inet_addr addr; -+ int loc_id; -+ bool low_prio = false; -+ -+ /* We need to do this as early as possible. Because, if we fail later -+ * (e.g., get_local_id), then reqsk_free tries to remove the -+ * request-socket from the htb in mptcp_hash_request_remove as pprev -+ * may be different from NULL. -+ */ -+ mtreq->hash_entry.pprev = NULL; -+ -+ tcp_request_sock_ipv6_ops.init_req(req, sk, skb); -+ -+ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32, -+ ipv6_hdr(skb)->daddr.s6_addr32, -+ tcp_hdr(skb)->source, -+ tcp_hdr(skb)->dest); -+ addr.in6 = inet_rsk(req)->ir_v6_loc_addr; -+ loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(sk), &low_prio); -+ if (loc_id == -1) -+ return -1; -+ mtreq->loc_id = loc_id; -+ mtreq->low_prio = low_prio; -+ -+ mptcp_join_reqsk_init(mpcb, req, skb); -+ -+ return 0; -+} -+ -+/* Similar to tcp6_request_sock_ops */ -+struct request_sock_ops mptcp6_request_sock_ops __read_mostly = { -+ .family = AF_INET6, -+ .obj_size = sizeof(struct mptcp_request_sock), -+ .rtx_syn_ack = tcp_v6_rtx_synack, -+ .send_ack = tcp_v6_reqsk_send_ack, -+ .destructor = mptcp_v6_reqsk_destructor, -+ .send_reset = tcp_v6_send_reset, -+ .syn_ack_timeout = tcp_syn_ack_timeout, -+}; -+ -+static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk, -+ struct request_sock *req, -+ const unsigned long timeout) -+{ -+ const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr, -+ inet_rsk(req)->ir_rmt_port, -+ 0, MPTCP_HASH_SIZE); -+ /* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not -+ * want to reset the keepalive-timer (responsible for retransmitting -+ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot -+ * overload the keepalive timer. Also, it's not a big deal, because the -+ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So, -+ * if the third ACK gets lost, the client will handle the retransmission -+ * anyways. If our SYN/ACK gets lost, the client will retransmit the -+ * SYN. -+ */ -+ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); -+ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt; -+ const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr, -+ inet_rsk(req)->ir_rmt_port, -+ lopt->hash_rnd, lopt->nr_table_entries); -+ -+ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout); -+ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0) -+ mptcp_reset_synack_timer(meta_sk, timeout); -+ -+ rcu_read_lock(); -+ spin_lock(&mptcp_reqsk_hlock); -+ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]); -+ spin_unlock(&mptcp_reqsk_hlock); -+ rcu_read_unlock(); -+} -+ -+static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb) -+{ -+ return tcp_conn_request(&mptcp6_request_sock_ops, -+ &mptcp_join_request_sock_ipv6_ops, -+ meta_sk, skb); -+} -+ -+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct sock *child, *rsk = NULL; -+ int ret; -+ -+ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) { -+ struct tcphdr *th = tcp_hdr(skb); -+ const struct ipv6hdr *ip6h = ipv6_hdr(skb); -+ struct sock *sk; -+ -+ sk = __inet6_lookup_established(sock_net(meta_sk), -+ &tcp_hashinfo, -+ &ip6h->saddr, th->source, -+ &ip6h->daddr, ntohs(th->dest), -+ inet6_iif(skb)); -+ -+ if (!sk) { -+ kfree_skb(skb); -+ return 0; -+ } -+ if (is_meta_sk(sk)) { -+ WARN("%s Did not find a sub-sk!\n", __func__); -+ kfree_skb(skb); -+ sock_put(sk); -+ return 0; -+ } -+ -+ if (sk->sk_state == TCP_TIME_WAIT) { -+ inet_twsk_put(inet_twsk(sk)); -+ kfree_skb(skb); -+ return 0; -+ } -+ -+ ret = tcp_v6_do_rcv(sk, skb); -+ sock_put(sk); -+ -+ return ret; -+ } -+ TCP_SKB_CB(skb)->mptcp_flags = 0; -+ -+ /* Has been removed from the tk-table. Thus, no new subflows. -+ * -+ * Check for close-state is necessary, because we may have been closed -+ * without passing by mptcp_close(). -+ * -+ * When falling back, no new subflows are allowed either. -+ */ -+ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table || -+ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) -+ goto reset_and_discard; -+ -+ child = tcp_v6_hnd_req(meta_sk, skb); -+ -+ if (!child) -+ goto discard; -+ -+ if (child != meta_sk) { -+ sock_rps_save_rxhash(child, skb); -+ /* We don't call tcp_child_process here, because we hold -+ * already the meta-sk-lock and are sure that it is not owned -+ * by the user. -+ */ -+ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len); -+ bh_unlock_sock(child); -+ sock_put(child); -+ if (ret) { -+ rsk = child; -+ goto reset_and_discard; -+ } -+ } else { -+ if (tcp_hdr(skb)->syn) { -+ mptcp_v6_join_request(meta_sk, skb); -+ goto discard; -+ } -+ goto reset_and_discard; -+ } -+ return 0; -+ -+reset_and_discard: -+ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) { -+ const struct tcphdr *th = tcp_hdr(skb); -+ struct request_sock **prev, *req; -+ /* If we end up here, it means we should not have matched on the -+ * request-socket. But, because the request-sock queue is only -+ * destroyed in mptcp_close, the socket may actually already be -+ * in close-state (e.g., through shutdown()) while still having -+ * pending request sockets. -+ */ -+ req = inet6_csk_search_req(meta_sk, &prev, th->source, -+ &ipv6_hdr(skb)->saddr, -+ &ipv6_hdr(skb)->daddr, inet6_iif(skb)); -+ if (req) { -+ inet_csk_reqsk_queue_unlink(meta_sk, req, prev); -+ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, -+ req); -+ reqsk_free(req); -+ } -+ } -+ -+ tcp_v6_send_reset(rsk, skb); -+discard: -+ kfree_skb(skb); -+ return 0; -+} -+ -+/* After this, the ref count of the meta_sk associated with the request_sock -+ * is incremented. Thus it is the responsibility of the caller -+ * to call sock_put() when the reference is not needed anymore. -+ */ -+struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr, -+ const struct in6_addr *laddr, const struct net *net) -+{ -+ const struct mptcp_request_sock *mtreq; -+ struct sock *meta_sk = NULL; -+ const struct hlist_nulls_node *node; -+ const u32 hash = inet6_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE); -+ -+ rcu_read_lock(); -+begin: -+ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash], -+ hash_entry) { -+ struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq)); -+ meta_sk = mtreq->mptcp_mpcb->meta_sk; -+ -+ if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport && -+ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 && -+ ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) && -+ ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) && -+ net_eq(net, sock_net(meta_sk))) -+ goto found; -+ meta_sk = NULL; -+ } -+ /* A request-socket is destroyed by RCU. So, it might have been recycled -+ * and put into another hash-table list. So, after the lookup we may -+ * end up in a different list. So, we may need to restart. -+ * -+ * See also the comment in __inet_lookup_established. -+ */ -+ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE) -+ goto begin; -+ -+found: -+ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) -+ meta_sk = NULL; -+ rcu_read_unlock(); -+ -+ return meta_sk; -+} -+ -+/* Create a new IPv6 subflow. -+ * -+ * We are in user-context and meta-sock-lock is hold. -+ */ -+int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, -+ struct mptcp_rem6 *rem) -+{ -+ struct tcp_sock *tp; -+ struct sock *sk; -+ struct sockaddr_in6 loc_in, rem_in; -+ struct socket sock; -+ int ret; -+ -+ /** First, create and prepare the new socket */ -+ -+ sock.type = meta_sk->sk_socket->type; -+ sock.state = SS_UNCONNECTED; -+ sock.wq = meta_sk->sk_socket->wq; -+ sock.file = meta_sk->sk_socket->file; -+ sock.ops = NULL; -+ -+ ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1); -+ if (unlikely(ret < 0)) { -+ mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret); -+ return ret; -+ } -+ -+ sk = sock.sk; -+ tp = tcp_sk(sk); -+ -+ /* All subsockets need the MPTCP-lock-class */ -+ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP"); -+ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0); -+ -+ if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL)) -+ goto error; -+ -+ tp->mptcp->slave_sk = 1; -+ tp->mptcp->low_prio = loc->low_prio; -+ -+ /* Initializing the timer for an MPTCP subflow */ -+ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk); -+ -+ /** Then, connect the socket to the peer */ -+ loc_in.sin6_family = AF_INET6; -+ rem_in.sin6_family = AF_INET6; -+ loc_in.sin6_port = 0; -+ if (rem->port) -+ rem_in.sin6_port = rem->port; -+ else -+ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport; -+ loc_in.sin6_addr = loc->addr; -+ rem_in.sin6_addr = rem->addr; -+ -+ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in6)); -+ if (ret < 0) { -+ mptcp_debug("%s: MPTCP subsocket bind()failed, error %d\n", -+ __func__, ret); -+ goto error; -+ } -+ -+ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n", -+ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, -+ tp->mptcp->path_index, &loc_in.sin6_addr, -+ ntohs(loc_in.sin6_port), &rem_in.sin6_addr, -+ ntohs(rem_in.sin6_port)); -+ -+ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6) -+ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr); -+ -+ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in, -+ sizeof(struct sockaddr_in6), O_NONBLOCK); -+ if (ret < 0 && ret != -EINPROGRESS) { -+ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n", -+ __func__, ret); -+ goto error; -+ } -+ -+ sk_set_socket(sk, meta_sk->sk_socket); -+ sk->sk_wq = meta_sk->sk_wq; -+ -+ return 0; -+ -+error: -+ /* May happen if mptcp_add_sock fails first */ -+ if (!mptcp(tp)) { -+ tcp_close(sk, 0); -+ } else { -+ local_bh_disable(); -+ mptcp_sub_force_close(sk); -+ local_bh_enable(); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(mptcp_init6_subsockets); -+ -+const struct inet_connection_sock_af_ops mptcp_v6_specific = { -+ .queue_xmit = inet6_csk_xmit, -+ .send_check = tcp_v6_send_check, -+ .rebuild_header = inet6_sk_rebuild_header, -+ .sk_rx_dst_set = inet6_sk_rx_dst_set, -+ .conn_request = mptcp_conn_request, -+ .syn_recv_sock = tcp_v6_syn_recv_sock, -+ .net_header_len = sizeof(struct ipv6hdr), -+ .net_frag_header_len = sizeof(struct frag_hdr), -+ .setsockopt = ipv6_setsockopt, -+ .getsockopt = ipv6_getsockopt, -+ .addr2sockaddr = inet6_csk_addr2sockaddr, -+ .sockaddr_len = sizeof(struct sockaddr_in6), -+ .bind_conflict = inet6_csk_bind_conflict, -+#ifdef CONFIG_COMPAT -+ .compat_setsockopt = compat_ipv6_setsockopt, -+ .compat_getsockopt = compat_ipv6_getsockopt, -+#endif -+}; -+ -+const struct inet_connection_sock_af_ops mptcp_v6_mapped = { -+ .queue_xmit = ip_queue_xmit, -+ .send_check = tcp_v4_send_check, -+ .rebuild_header = inet_sk_rebuild_header, -+ .sk_rx_dst_set = inet_sk_rx_dst_set, -+ .conn_request = mptcp_conn_request, -+ .syn_recv_sock = tcp_v6_syn_recv_sock, -+ .net_header_len = sizeof(struct iphdr), -+ .setsockopt = ipv6_setsockopt, -+ .getsockopt = ipv6_getsockopt, -+ .addr2sockaddr = inet6_csk_addr2sockaddr, -+ .sockaddr_len = sizeof(struct sockaddr_in6), -+ .bind_conflict = inet6_csk_bind_conflict, -+#ifdef CONFIG_COMPAT -+ .compat_setsockopt = compat_ipv6_setsockopt, -+ .compat_getsockopt = compat_ipv6_getsockopt, -+#endif -+}; -+ -+struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops; -+struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops; -+ -+int mptcp_pm_v6_init(void) -+{ -+ int ret = 0; -+ struct request_sock_ops *ops = &mptcp6_request_sock_ops; -+ -+ mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; -+ mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req; -+ -+ mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; -+ mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req; -+ mptcp_join_request_sock_ipv6_ops.queue_hash_add = mptcp_v6_reqsk_queue_hash_add; -+ -+ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6"); -+ if (ops->slab_name == NULL) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, -+ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, -+ NULL); -+ -+ if (ops->slab == NULL) { -+ ret = -ENOMEM; -+ goto err_reqsk_create; -+ } -+ -+out: -+ return ret; -+ -+err_reqsk_create: -+ kfree(ops->slab_name); -+ ops->slab_name = NULL; -+ goto out; -+} -+ -+void mptcp_pm_v6_undo(void) -+{ -+ kmem_cache_destroy(mptcp6_request_sock_ops.slab); -+ kfree(mptcp6_request_sock_ops.slab_name); -+} -diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c -new file mode 100644 -index 000000000000..6f5087983175 ---- /dev/null -+++ b/net/mptcp/mptcp_ndiffports.c -@@ -0,0 +1,161 @@ -+#include -+ -+#include -+#include -+ -+#if IS_ENABLED(CONFIG_IPV6) -+#include -+#endif -+ -+struct ndiffports_priv { -+ /* Worker struct for subflow establishment */ -+ struct work_struct subflow_work; -+ -+ struct mptcp_cb *mpcb; -+}; -+ -+static int num_subflows __read_mostly = 2; -+module_param(num_subflows, int, 0644); -+MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection"); -+ -+/** -+ * Create all new subflows, by doing calls to mptcp_initX_subsockets -+ * -+ * This function uses a goto next_subflow, to allow releasing the lock between -+ * new subflows and giving other processes a chance to do some work on the -+ * socket and potentially finishing the communication. -+ **/ -+static void create_subflow_worker(struct work_struct *work) -+{ -+ const struct ndiffports_priv *pm_priv = container_of(work, -+ struct ndiffports_priv, -+ subflow_work); -+ struct mptcp_cb *mpcb = pm_priv->mpcb; -+ struct sock *meta_sk = mpcb->meta_sk; -+ int iter = 0; -+ -+next_subflow: -+ if (iter) { -+ release_sock(meta_sk); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ -+ cond_resched(); -+ } -+ mutex_lock(&mpcb->mpcb_mutex); -+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); -+ -+ iter++; -+ -+ if (sock_flag(meta_sk, SOCK_DEAD)) -+ goto exit; -+ -+ if (mpcb->master_sk && -+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) -+ goto exit; -+ -+ if (num_subflows > iter && num_subflows > mpcb->cnt_subflows) { -+ if (meta_sk->sk_family == AF_INET || -+ mptcp_v6_is_v4_mapped(meta_sk)) { -+ struct mptcp_loc4 loc; -+ struct mptcp_rem4 rem; -+ -+ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; -+ loc.loc4_id = 0; -+ loc.low_prio = 0; -+ -+ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; -+ rem.port = inet_sk(meta_sk)->inet_dport; -+ rem.rem4_id = 0; /* Default 0 */ -+ -+ mptcp_init4_subsockets(meta_sk, &loc, &rem); -+ } else { -+#if IS_ENABLED(CONFIG_IPV6) -+ struct mptcp_loc6 loc; -+ struct mptcp_rem6 rem; -+ -+ loc.addr = inet6_sk(meta_sk)->saddr; -+ loc.loc6_id = 0; -+ loc.low_prio = 0; -+ -+ rem.addr = meta_sk->sk_v6_daddr; -+ rem.port = inet_sk(meta_sk)->inet_dport; -+ rem.rem6_id = 0; /* Default 0 */ -+ -+ mptcp_init6_subsockets(meta_sk, &loc, &rem); -+#endif -+ } -+ goto next_subflow; -+ } -+ -+exit: -+ release_sock(meta_sk); -+ mutex_unlock(&mpcb->mpcb_mutex); -+ sock_put(meta_sk); -+} -+ -+static void ndiffports_new_session(const struct sock *meta_sk) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0]; -+ -+ /* Initialize workqueue-struct */ -+ INIT_WORK(&fmp->subflow_work, create_subflow_worker); -+ fmp->mpcb = mpcb; -+} -+ -+static void ndiffports_create_subflows(struct sock *meta_sk) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0]; -+ -+ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv || -+ mpcb->send_infinite_mapping || -+ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) -+ return; -+ -+ if (!work_pending(&pm_priv->subflow_work)) { -+ sock_hold(meta_sk); -+ queue_work(mptcp_wq, &pm_priv->subflow_work); -+ } -+} -+ -+static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr, -+ struct net *net, bool *low_prio) -+{ -+ return 0; -+} -+ -+static struct mptcp_pm_ops ndiffports __read_mostly = { -+ .new_session = ndiffports_new_session, -+ .fully_established = ndiffports_create_subflows, -+ .get_local_id = ndiffports_get_local_id, -+ .name = "ndiffports", -+ .owner = THIS_MODULE, -+}; -+ -+/* General initialization of MPTCP_PM */ -+static int __init ndiffports_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE); -+ -+ if (mptcp_register_path_manager(&ndiffports)) -+ goto exit; -+ -+ return 0; -+ -+exit: -+ return -1; -+} -+ -+static void ndiffports_unregister(void) -+{ -+ mptcp_unregister_path_manager(&ndiffports); -+} -+ -+module_init(ndiffports_register); -+module_exit(ndiffports_unregister); -+ -+MODULE_AUTHOR("Christoph Paasch"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("NDIFF-PORTS MPTCP"); -+MODULE_VERSION("0.88"); -diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c -new file mode 100644 -index 000000000000..ec4e98622637 ---- /dev/null -+++ b/net/mptcp/mptcp_ofo_queue.c -@@ -0,0 +1,295 @@ -+/* -+ * MPTCP implementation - Fast algorithm for MPTCP meta-reordering -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#include -+#include -+#include -+#include -+ -+void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, -+ const struct sk_buff *skb) -+{ -+ struct tcp_sock *tp; -+ -+ mptcp_for_each_tp(mpcb, tp) { -+ if (tp->mptcp->shortcut_ofoqueue == skb) { -+ tp->mptcp->shortcut_ofoqueue = NULL; -+ return; -+ } -+ } -+} -+ -+/* Does 'skb' fits after 'here' in the queue 'head' ? -+ * If yes, we queue it and return 1 -+ */ -+static int mptcp_ofo_queue_after(struct sk_buff_head *head, -+ struct sk_buff *skb, struct sk_buff *here, -+ const struct tcp_sock *tp) -+{ -+ struct sock *meta_sk = tp->meta_sk; -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ u32 seq = TCP_SKB_CB(skb)->seq; -+ u32 end_seq = TCP_SKB_CB(skb)->end_seq; -+ -+ /* We want to queue skb after here, thus seq >= end_seq */ -+ if (before(seq, TCP_SKB_CB(here)->end_seq)) -+ return 0; -+ -+ if (seq == TCP_SKB_CB(here)->end_seq) { -+ bool fragstolen = false; -+ -+ if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) { -+ __skb_queue_after(&meta_tp->out_of_order_queue, here, skb); -+ return 1; -+ } else { -+ kfree_skb_partial(skb, fragstolen); -+ return -1; -+ } -+ } -+ -+ /* If here is the last one, we can always queue it */ -+ if (skb_queue_is_last(head, here)) { -+ __skb_queue_after(head, here, skb); -+ return 1; -+ } else { -+ struct sk_buff *skb1 = skb_queue_next(head, here); -+ /* It's not the last one, but does it fits between 'here' and -+ * the one after 'here' ? Thus, does end_seq <= after_here->seq -+ */ -+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) { -+ __skb_queue_after(head, here, skb); -+ return 1; -+ } -+ } -+ -+ return 0; -+} -+ -+static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb, -+ struct sk_buff_head *head, struct tcp_sock *tp) -+{ -+ struct sock *meta_sk = tp->meta_sk; -+ struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk); -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ struct sk_buff *skb1, *best_shortcut = NULL; -+ u32 seq = TCP_SKB_CB(skb)->seq; -+ u32 end_seq = TCP_SKB_CB(skb)->end_seq; -+ u32 distance = 0xffffffff; -+ -+ /* First, check the tp's shortcut */ -+ if (!shortcut) { -+ if (skb_queue_empty(head)) { -+ __skb_queue_head(head, skb); -+ goto end; -+ } -+ } else { -+ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp); -+ /* Does the tp's shortcut is a hit? If yes, we insert. */ -+ -+ if (ret) { -+ skb = (ret > 0) ? skb : NULL; -+ goto end; -+ } -+ } -+ -+ /* Check the shortcuts of the other subsockets. */ -+ mptcp_for_each_tp(mpcb, tp_it) { -+ shortcut = tp_it->mptcp->shortcut_ofoqueue; -+ /* Can we queue it here? If yes, do so! */ -+ if (shortcut) { -+ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp); -+ -+ if (ret) { -+ skb = (ret > 0) ? skb : NULL; -+ goto end; -+ } -+ } -+ -+ /* Could not queue it, check if we are close. -+ * We are looking for a shortcut, close enough to seq to -+ * set skb1 prematurely and thus improve the subsequent lookup, -+ * which tries to find a skb1 so that skb1->seq <= seq. -+ * -+ * So, here we only take shortcuts, whose shortcut->seq > seq, -+ * and minimize the distance between shortcut->seq and seq and -+ * set best_shortcut to this one with the minimal distance. -+ * -+ * That way, the subsequent while-loop is shortest. -+ */ -+ if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) { -+ /* Are we closer than the current best shortcut? */ -+ if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) { -+ distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq); -+ best_shortcut = shortcut; -+ } -+ } -+ } -+ -+ if (best_shortcut) -+ skb1 = best_shortcut; -+ else -+ skb1 = skb_peek_tail(head); -+ -+ if (seq == TCP_SKB_CB(skb1)->end_seq) { -+ bool fragstolen = false; -+ -+ if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) { -+ __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb); -+ } else { -+ kfree_skb_partial(skb, fragstolen); -+ skb = NULL; -+ } -+ -+ goto end; -+ } -+ -+ /* Find the insertion point, starting from best_shortcut if available. -+ * -+ * Inspired from tcp_data_queue_ofo. -+ */ -+ while (1) { -+ /* skb1->seq <= seq */ -+ if (!after(TCP_SKB_CB(skb1)->seq, seq)) -+ break; -+ if (skb_queue_is_first(head, skb1)) { -+ skb1 = NULL; -+ break; -+ } -+ skb1 = skb_queue_prev(head, skb1); -+ } -+ -+ /* Do skb overlap to previous one? */ -+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { -+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { -+ /* All the bits are present. */ -+ __kfree_skb(skb); -+ skb = NULL; -+ goto end; -+ } -+ if (seq == TCP_SKB_CB(skb1)->seq) { -+ if (skb_queue_is_first(head, skb1)) -+ skb1 = NULL; -+ else -+ skb1 = skb_queue_prev(head, skb1); -+ } -+ } -+ if (!skb1) -+ __skb_queue_head(head, skb); -+ else -+ __skb_queue_after(head, skb1, skb); -+ -+ /* And clean segments covered by new one as whole. */ -+ while (!skb_queue_is_last(head, skb)) { -+ skb1 = skb_queue_next(head, skb); -+ -+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) -+ break; -+ -+ __skb_unlink(skb1, head); -+ mptcp_remove_shortcuts(mpcb, skb1); -+ __kfree_skb(skb1); -+ } -+ -+end: -+ if (skb) { -+ skb_set_owner_r(skb, meta_sk); -+ tp->mptcp->shortcut_ofoqueue = skb; -+ } -+ -+ return; -+} -+ -+/** -+ * @sk: the subflow that received this skb. -+ */ -+void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb, -+ struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ try_shortcut(tp->mptcp->shortcut_ofoqueue, skb, -+ &tcp_sk(meta_sk)->out_of_order_queue, tp); -+} -+ -+bool mptcp_prune_ofo_queue(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ bool res = false; -+ -+ if (!skb_queue_empty(&tp->out_of_order_queue)) { -+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); -+ mptcp_purge_ofo_queue(tp); -+ -+ /* No sack at the mptcp-level */ -+ sk_mem_reclaim(sk); -+ res = true; -+ } -+ -+ return res; -+} -+ -+void mptcp_ofo_queue(struct sock *meta_sk) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct sk_buff *skb; -+ -+ while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) { -+ u32 old_rcv_nxt = meta_tp->rcv_nxt; -+ if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt)) -+ break; -+ -+ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) { -+ __skb_unlink(skb, &meta_tp->out_of_order_queue); -+ mptcp_remove_shortcuts(meta_tp->mpcb, skb); -+ __kfree_skb(skb); -+ continue; -+ } -+ -+ __skb_unlink(skb, &meta_tp->out_of_order_queue); -+ mptcp_remove_shortcuts(meta_tp->mpcb, skb); -+ -+ __skb_queue_tail(&meta_sk->sk_receive_queue, skb); -+ meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; -+ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); -+ -+ if (tcp_hdr(skb)->fin) -+ mptcp_fin(meta_sk); -+ } -+} -+ -+void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) -+{ -+ struct sk_buff_head *head = &meta_tp->out_of_order_queue; -+ struct sk_buff *skb, *tmp; -+ -+ skb_queue_walk_safe(head, skb, tmp) { -+ __skb_unlink(skb, head); -+ mptcp_remove_shortcuts(meta_tp->mpcb, skb); -+ kfree_skb(skb); -+ } -+} -diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c -new file mode 100644 -index 000000000000..53f5c43bb488 ---- /dev/null -+++ b/net/mptcp/mptcp_olia.c -@@ -0,0 +1,311 @@ -+/* -+ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL: -+ * -+ * Algorithm design: -+ * Ramin Khalili -+ * Nicolas Gast -+ * Jean-Yves Le Boudec -+ * -+ * Implementation: -+ * Ramin Khalili -+ * -+ * Ported to the official MPTCP-kernel: -+ * Christoph Paasch -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+ -+#include -+#include -+ -+#include -+ -+static int scale = 10; -+ -+struct mptcp_olia { -+ u32 mptcp_loss1; -+ u32 mptcp_loss2; -+ u32 mptcp_loss3; -+ int epsilon_num; -+ u32 epsilon_den; -+ int mptcp_snd_cwnd_cnt; -+}; -+ -+static inline int mptcp_olia_sk_can_send(const struct sock *sk) -+{ -+ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; -+} -+ -+static inline u64 mptcp_olia_scale(u64 val, int scale) -+{ -+ return (u64) val << scale; -+} -+ -+/* take care of artificially inflate (see RFC5681) -+ * of cwnd during fast-retransmit phase -+ */ -+static u32 mptcp_get_crt_cwnd(struct sock *sk) -+{ -+ const struct inet_connection_sock *icsk = inet_csk(sk); -+ -+ if (icsk->icsk_ca_state == TCP_CA_Recovery) -+ return tcp_sk(sk)->snd_ssthresh; -+ else -+ return tcp_sk(sk)->snd_cwnd; -+} -+ -+/* return the dominator of the first term of the increasing term */ -+static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt) -+{ -+ struct sock *sk; -+ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */ -+ -+ mptcp_for_each_sk(mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ u64 scaled_num; -+ u32 tmp_cwnd; -+ -+ if (!mptcp_olia_sk_can_send(sk)) -+ continue; -+ -+ tmp_cwnd = mptcp_get_crt_cwnd(sk); -+ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt; -+ rate += div_u64(scaled_num , tp->srtt_us); -+ } -+ rate *= rate; -+ return rate; -+} -+ -+/* find the maximum cwnd, used to find set M */ -+static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb) -+{ -+ struct sock *sk; -+ u32 best_cwnd = 0; -+ -+ mptcp_for_each_sk(mpcb, sk) { -+ u32 tmp_cwnd; -+ -+ if (!mptcp_olia_sk_can_send(sk)) -+ continue; -+ -+ tmp_cwnd = mptcp_get_crt_cwnd(sk); -+ if (tmp_cwnd > best_cwnd) -+ best_cwnd = tmp_cwnd; -+ } -+ return best_cwnd; -+} -+ -+static void mptcp_get_epsilon(const struct mptcp_cb *mpcb) -+{ -+ struct mptcp_olia *ca; -+ struct tcp_sock *tp; -+ struct sock *sk; -+ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1; -+ u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd; -+ u8 M = 0, B_not_M = 0; -+ -+ /* TODO - integrate this in the following loop - we just want to iterate once */ -+ -+ max_cwnd = mptcp_get_max_cwnd(mpcb); -+ -+ /* find the best path */ -+ mptcp_for_each_sk(mpcb, sk) { -+ tp = tcp_sk(sk); -+ ca = inet_csk_ca(sk); -+ -+ if (!mptcp_olia_sk_can_send(sk)) -+ continue; -+ -+ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; -+ /* TODO - check here and rename variables */ -+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, -+ ca->mptcp_loss2 - ca->mptcp_loss1); -+ -+ tmp_cwnd = mptcp_get_crt_cwnd(sk); -+ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) { -+ best_rtt = tmp_rtt; -+ best_int = tmp_int; -+ best_cwnd = tmp_cwnd; -+ } -+ } -+ -+ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */ -+ /* find the size of M and B_not_M */ -+ mptcp_for_each_sk(mpcb, sk) { -+ tp = tcp_sk(sk); -+ ca = inet_csk_ca(sk); -+ -+ if (!mptcp_olia_sk_can_send(sk)) -+ continue; -+ -+ tmp_cwnd = mptcp_get_crt_cwnd(sk); -+ if (tmp_cwnd == max_cwnd) { -+ M++; -+ } else { -+ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; -+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, -+ ca->mptcp_loss2 - ca->mptcp_loss1); -+ -+ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) -+ B_not_M++; -+ } -+ } -+ -+ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */ -+ mptcp_for_each_sk(mpcb, sk) { -+ tp = tcp_sk(sk); -+ ca = inet_csk_ca(sk); -+ -+ if (!mptcp_olia_sk_can_send(sk)) -+ continue; -+ -+ if (B_not_M == 0) { -+ ca->epsilon_num = 0; -+ ca->epsilon_den = 1; -+ } else { -+ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; -+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, -+ ca->mptcp_loss2 - ca->mptcp_loss1); -+ tmp_cwnd = mptcp_get_crt_cwnd(sk); -+ -+ if (tmp_cwnd < max_cwnd && -+ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) { -+ ca->epsilon_num = 1; -+ ca->epsilon_den = mpcb->cnt_established * B_not_M; -+ } else if (tmp_cwnd == max_cwnd) { -+ ca->epsilon_num = -1; -+ ca->epsilon_den = mpcb->cnt_established * M; -+ } else { -+ ca->epsilon_num = 0; -+ ca->epsilon_den = 1; -+ } -+ } -+ } -+} -+ -+/* setting the initial values */ -+static void mptcp_olia_init(struct sock *sk) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ struct mptcp_olia *ca = inet_csk_ca(sk); -+ -+ if (mptcp(tp)) { -+ ca->mptcp_loss1 = tp->snd_una; -+ ca->mptcp_loss2 = tp->snd_una; -+ ca->mptcp_loss3 = tp->snd_una; -+ ca->mptcp_snd_cwnd_cnt = 0; -+ ca->epsilon_num = 0; -+ ca->epsilon_den = 1; -+ } -+} -+ -+/* updating inter-loss distance and ssthresh */ -+static void mptcp_olia_set_state(struct sock *sk, u8 new_state) -+{ -+ if (!mptcp(tcp_sk(sk))) -+ return; -+ -+ if (new_state == TCP_CA_Loss || -+ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) { -+ struct mptcp_olia *ca = inet_csk_ca(sk); -+ -+ if (ca->mptcp_loss3 != ca->mptcp_loss2 && -+ !inet_csk(sk)->icsk_retransmits) { -+ ca->mptcp_loss1 = ca->mptcp_loss2; -+ ca->mptcp_loss2 = ca->mptcp_loss3; -+ } -+ } -+} -+ -+/* main algorithm */ -+static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct mptcp_olia *ca = inet_csk_ca(sk); -+ const struct mptcp_cb *mpcb = tp->mpcb; -+ -+ u64 inc_num, inc_den, rate, cwnd_scaled; -+ -+ if (!mptcp(tp)) { -+ tcp_reno_cong_avoid(sk, ack, acked); -+ return; -+ } -+ -+ ca->mptcp_loss3 = tp->snd_una; -+ -+ if (!tcp_is_cwnd_limited(sk)) -+ return; -+ -+ /* slow start if it is in the safe area */ -+ if (tp->snd_cwnd <= tp->snd_ssthresh) { -+ tcp_slow_start(tp, acked); -+ return; -+ } -+ -+ mptcp_get_epsilon(mpcb); -+ rate = mptcp_get_rate(mpcb, tp->srtt_us); -+ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale); -+ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1; -+ -+ /* calculate the increasing term, scaling is used to reduce the rounding effect */ -+ if (ca->epsilon_num == -1) { -+ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) { -+ inc_num = rate - ca->epsilon_den * -+ cwnd_scaled * cwnd_scaled; -+ ca->mptcp_snd_cwnd_cnt -= div64_u64( -+ mptcp_olia_scale(inc_num , scale) , inc_den); -+ } else { -+ inc_num = ca->epsilon_den * -+ cwnd_scaled * cwnd_scaled - rate; -+ ca->mptcp_snd_cwnd_cnt += div64_u64( -+ mptcp_olia_scale(inc_num , scale) , inc_den); -+ } -+ } else { -+ inc_num = ca->epsilon_num * rate + -+ ca->epsilon_den * cwnd_scaled * cwnd_scaled; -+ ca->mptcp_snd_cwnd_cnt += div64_u64( -+ mptcp_olia_scale(inc_num , scale) , inc_den); -+ } -+ -+ -+ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) { -+ if (tp->snd_cwnd < tp->snd_cwnd_clamp) -+ tp->snd_cwnd++; -+ ca->mptcp_snd_cwnd_cnt = 0; -+ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) { -+ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1); -+ ca->mptcp_snd_cwnd_cnt = 0; -+ } -+} -+ -+static struct tcp_congestion_ops mptcp_olia = { -+ .init = mptcp_olia_init, -+ .ssthresh = tcp_reno_ssthresh, -+ .cong_avoid = mptcp_olia_cong_avoid, -+ .set_state = mptcp_olia_set_state, -+ .owner = THIS_MODULE, -+ .name = "olia", -+}; -+ -+static int __init mptcp_olia_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE); -+ return tcp_register_congestion_control(&mptcp_olia); -+} -+ -+static void __exit mptcp_olia_unregister(void) -+{ -+ tcp_unregister_congestion_control(&mptcp_olia); -+} -+ -+module_init(mptcp_olia_register); -+module_exit(mptcp_olia_unregister); -+ -+MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL"); -+MODULE_VERSION("0.1"); -diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c -new file mode 100644 -index 000000000000..400ea254c078 ---- /dev/null -+++ b/net/mptcp/mptcp_output.c -@@ -0,0 +1,1743 @@ -+/* -+ * MPTCP implementation - Sending side -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+ -+static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN + -+ MPTCP_SUB_LEN_ACK_ALIGN + -+ MPTCP_SUB_LEN_SEQ_ALIGN; -+ -+static inline int mptcp_sub_len_remove_addr(u16 bitfield) -+{ -+ unsigned int c; -+ for (c = 0; bitfield; c++) -+ bitfield &= bitfield - 1; -+ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1; -+} -+ -+int mptcp_sub_len_remove_addr_align(u16 bitfield) -+{ -+ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4); -+} -+EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align); -+ -+/* get the data-seq and end-data-seq and store them again in the -+ * tcp_skb_cb -+ */ -+static int mptcp_reconstruct_mapping(struct sk_buff *skb) -+{ -+ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss; -+ u32 *p32; -+ u16 *p16; -+ -+ if (!mpdss->M) -+ return 1; -+ -+ /* Move the pointer to the data-seq */ -+ p32 = (u32 *)mpdss; -+ p32++; -+ if (mpdss->A) { -+ p32++; -+ if (mpdss->a) -+ p32++; -+ } -+ -+ TCP_SKB_CB(skb)->seq = ntohl(*p32); -+ -+ /* Get the data_len to calculate the end_data_seq */ -+ p32++; -+ p32++; -+ p16 = (u16 *)p32; -+ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq; -+ -+ return 0; -+} -+ -+static void mptcp_find_and_set_pathmask(const struct sock *meta_sk, struct sk_buff *skb) -+{ -+ struct sk_buff *skb_it; -+ -+ skb_it = tcp_write_queue_head(meta_sk); -+ -+ tcp_for_write_queue_from(skb_it, meta_sk) { -+ if (skb_it == tcp_send_head(meta_sk)) -+ break; -+ -+ if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) { -+ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask; -+ break; -+ } -+ } -+} -+ -+/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are -+ * coming from the meta-retransmit-timer -+ */ -+static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk, -+ struct sock *sk, int clone_it) -+{ -+ struct sk_buff *skb, *skb1; -+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ u32 seq, end_seq; -+ -+ if (clone_it) { -+ /* pskb_copy is necessary here, because the TCP/IP-headers -+ * will be changed when it's going to be reinjected on another -+ * subflow. -+ */ -+ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC); -+ } else { -+ __skb_unlink(orig_skb, &sk->sk_write_queue); -+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); -+ sk->sk_wmem_queued -= orig_skb->truesize; -+ sk_mem_uncharge(sk, orig_skb->truesize); -+ skb = orig_skb; -+ } -+ if (unlikely(!skb)) -+ return; -+ -+ if (sk && mptcp_reconstruct_mapping(skb)) { -+ __kfree_skb(skb); -+ return; -+ } -+ -+ skb->sk = meta_sk; -+ -+ /* If it reached already the destination, we don't have to reinject it */ -+ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { -+ __kfree_skb(skb); -+ return; -+ } -+ -+ /* Only reinject segments that are fully covered by the mapping */ -+ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) != -+ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { -+ u32 seq = TCP_SKB_CB(skb)->seq; -+ u32 end_seq = TCP_SKB_CB(skb)->end_seq; -+ -+ __kfree_skb(skb); -+ -+ /* Ok, now we have to look for the full mapping in the meta -+ * send-queue :S -+ */ -+ tcp_for_write_queue(skb, meta_sk) { -+ /* Not yet at the mapping? */ -+ if (before(TCP_SKB_CB(skb)->seq, seq)) -+ continue; -+ /* We have passed by the mapping */ -+ if (after(TCP_SKB_CB(skb)->end_seq, end_seq)) -+ return; -+ -+ __mptcp_reinject_data(skb, meta_sk, NULL, 1); -+ } -+ return; -+ } -+ -+ /* Segment goes back to the MPTCP-layer. So, we need to zero the -+ * path_mask/dss. -+ */ -+ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len); -+ -+ /* We need to find out the path-mask from the meta-write-queue -+ * to properly select a subflow. -+ */ -+ mptcp_find_and_set_pathmask(meta_sk, skb); -+ -+ /* If it's empty, just add */ -+ if (skb_queue_empty(&mpcb->reinject_queue)) { -+ skb_queue_head(&mpcb->reinject_queue, skb); -+ return; -+ } -+ -+ /* Find place to insert skb - or even we can 'drop' it, as the -+ * data is already covered by other skb's in the reinject-queue. -+ * -+ * This is inspired by code from tcp_data_queue. -+ */ -+ -+ skb1 = skb_peek_tail(&mpcb->reinject_queue); -+ seq = TCP_SKB_CB(skb)->seq; -+ while (1) { -+ if (!after(TCP_SKB_CB(skb1)->seq, seq)) -+ break; -+ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) { -+ skb1 = NULL; -+ break; -+ } -+ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); -+ } -+ -+ /* Do skb overlap to previous one? */ -+ end_seq = TCP_SKB_CB(skb)->end_seq; -+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { -+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { -+ /* All the bits are present. Don't reinject */ -+ __kfree_skb(skb); -+ return; -+ } -+ if (seq == TCP_SKB_CB(skb1)->seq) { -+ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) -+ skb1 = NULL; -+ else -+ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); -+ } -+ } -+ if (!skb1) -+ __skb_queue_head(&mpcb->reinject_queue, skb); -+ else -+ __skb_queue_after(&mpcb->reinject_queue, skb1, skb); -+ -+ /* And clean segments covered by new one as whole. */ -+ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) { -+ skb1 = skb_queue_next(&mpcb->reinject_queue, skb); -+ -+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) -+ break; -+ -+ __skb_unlink(skb1, &mpcb->reinject_queue); -+ __kfree_skb(skb1); -+ } -+ return; -+} -+ -+/* Inserts data into the reinject queue */ -+void mptcp_reinject_data(struct sock *sk, int clone_it) -+{ -+ struct sk_buff *skb_it, *tmp; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct sock *meta_sk = tp->meta_sk; -+ -+ /* It has already been closed - there is really no point in reinjecting */ -+ if (meta_sk->sk_state == TCP_CLOSE) -+ return; -+ -+ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) { -+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); -+ /* Subflow syn's and fin's are not reinjected. -+ * -+ * As well as empty subflow-fins with a data-fin. -+ * They are reinjected below (without the subflow-fin-flag) -+ */ -+ if (tcb->tcp_flags & TCPHDR_SYN || -+ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || -+ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) -+ continue; -+ -+ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it); -+ } -+ -+ skb_it = tcp_write_queue_tail(meta_sk); -+ /* If sk has sent the empty data-fin, we have to reinject it too. */ -+ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 && -+ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) { -+ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1); -+ } -+ -+ mptcp_push_pending_frames(meta_sk); -+ -+ tp->pf = 1; -+} -+EXPORT_SYMBOL(mptcp_reinject_data); -+ -+static void mptcp_combine_dfin(const struct sk_buff *skb, const struct sock *meta_sk, -+ struct sock *subsk) -+{ -+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ struct sock *sk_it; -+ int all_empty = 1, all_acked; -+ -+ /* In infinite mapping we always try to combine */ -+ if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) { -+ subsk->sk_shutdown |= SEND_SHUTDOWN; -+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; -+ return; -+ } -+ -+ /* Don't combine, if they didn't combine - otherwise we end up in -+ * TIME_WAIT, even if our app is smart enough to avoid it -+ */ -+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { -+ if (!mpcb->dfin_combined) -+ return; -+ } -+ -+ /* If no other subflow has data to send, we can combine */ -+ mptcp_for_each_sk(mpcb, sk_it) { -+ if (!mptcp_sk_can_send(sk_it)) -+ continue; -+ -+ if (!tcp_write_queue_empty(sk_it)) -+ all_empty = 0; -+ } -+ -+ /* If all data has been DATA_ACKed, we can combine. -+ * -1, because the data_fin consumed one byte -+ */ -+ all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1)); -+ -+ if ((all_empty || all_acked) && tcp_close_state(subsk)) { -+ subsk->sk_shutdown |= SEND_SHUTDOWN; -+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; -+ } -+} -+ -+static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb, -+ __be32 *ptr) -+{ -+ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); -+ __be32 *start = ptr; -+ __u16 data_len; -+ -+ *ptr++ = htonl(tcb->seq); /* data_seq */ -+ -+ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */ -+ if (mptcp_is_data_fin(skb) && skb->len == 0) -+ *ptr++ = 0; /* subseq */ -+ else -+ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */ -+ -+ if (tcb->mptcp_flags & MPTCPHDR_INF) -+ data_len = 0; -+ else -+ data_len = tcb->end_seq - tcb->seq; -+ -+ if (tp->mpcb->dss_csum && data_len) { -+ __be16 *p16 = (__be16 *)ptr; -+ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb); -+ __wsum csum; -+ -+ *ptr = htonl(((data_len) << 16) | -+ (TCPOPT_EOL << 8) | -+ (TCPOPT_EOL)); -+ csum = csum_partial(ptr - 2, 12, skb->csum); -+ p16++; -+ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum)); -+ } else { -+ *ptr++ = htonl(((data_len) << 16) | -+ (TCPOPT_NOP << 8) | -+ (TCPOPT_NOP)); -+ } -+ -+ return ptr - start; -+} -+ -+static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb, -+ __be32 *ptr) -+{ -+ struct mp_dss *mdss = (struct mp_dss *)ptr; -+ __be32 *start = ptr; -+ -+ mdss->kind = TCPOPT_MPTCP; -+ mdss->sub = MPTCP_SUB_DSS; -+ mdss->rsv1 = 0; -+ mdss->rsv2 = 0; -+ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0; -+ mdss->m = 0; -+ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0; -+ mdss->a = 0; -+ mdss->A = 1; -+ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); -+ ptr++; -+ -+ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt); -+ -+ return ptr - start; -+} -+ -+/* RFC6824 states that once a particular subflow mapping has been sent -+ * out it must never be changed. However, packets may be split while -+ * they are in the retransmission queue (due to SACK or ACKs) and that -+ * arguably means that we would change the mapping (e.g. it splits it, -+ * our sends out a subset of the initial mapping). -+ * -+ * Furthermore, the skb checksum is not always preserved across splits -+ * (e.g. mptcp_fragment) which would mean that we need to recompute -+ * the DSS checksum in this case. -+ * -+ * To avoid this we save the initial DSS mapping which allows us to -+ * send the same DSS mapping even for fragmented retransmits. -+ */ -+static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb) -+{ -+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); -+ __be32 *ptr = (__be32 *)tcb->dss; -+ -+ tcb->mptcp_flags |= MPTCPHDR_SEQ; -+ -+ ptr += mptcp_write_dss_data_ack(tp, skb, ptr); -+ ptr += mptcp_write_dss_mapping(tp, skb, ptr); -+} -+ -+/* Write the saved DSS mapping to the header */ -+static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb, -+ __be32 *ptr) -+{ -+ __be32 *start = ptr; -+ -+ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len); -+ -+ /* update the data_ack */ -+ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt); -+ -+ /* dss is in a union with inet_skb_parm and -+ * the IP layer expects zeroed IPCB fields. -+ */ -+ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len); -+ -+ return mptcp_dss_len/sizeof(*ptr); -+} -+ -+static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ const struct sock *meta_sk = mptcp_meta_sk(sk); -+ const struct mptcp_cb *mpcb = tp->mpcb; -+ struct tcp_skb_cb *tcb; -+ struct sk_buff *subskb = NULL; -+ -+ if (!reinject) -+ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? -+ MPTCPHDR_SEQ64_INDEX : 0); -+ -+ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC); -+ if (!subskb) -+ return false; -+ -+ /* At the subflow-level we need to call again tcp_init_tso_segs. We -+ * force this, by setting gso_segs to 0. It has been set to 1 prior to -+ * the call to mptcp_skb_entail. -+ */ -+ skb_shinfo(subskb)->gso_segs = 0; -+ -+ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index); -+ -+ if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) && -+ skb->ip_summed == CHECKSUM_PARTIAL) { -+ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0); -+ subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE; -+ } -+ -+ tcb = TCP_SKB_CB(subskb); -+ -+ if (tp->mpcb->send_infinite_mapping && -+ !tp->mpcb->infinite_mapping_snd && -+ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) { -+ tp->mptcp->fully_established = 1; -+ tp->mpcb->infinite_mapping_snd = 1; -+ tp->mptcp->infinite_cutoff_seq = tp->write_seq; -+ tcb->mptcp_flags |= MPTCPHDR_INF; -+ } -+ -+ if (mptcp_is_data_fin(subskb)) -+ mptcp_combine_dfin(subskb, meta_sk, sk); -+ -+ mptcp_save_dss_data_seq(tp, subskb); -+ -+ tcb->seq = tp->write_seq; -+ tcb->sacked = 0; /* reset the sacked field: from the point of view -+ * of this subflow, we are sending a brand new -+ * segment -+ */ -+ /* Take into account seg len */ -+ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0); -+ tcb->end_seq = tp->write_seq; -+ -+ /* If it's a non-payload DATA_FIN (also no subflow-fin), the -+ * segment is not part of the subflow but on a meta-only-level. -+ */ -+ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) { -+ tcp_add_write_queue_tail(sk, subskb); -+ sk->sk_wmem_queued += subskb->truesize; -+ sk_mem_charge(sk, subskb->truesize); -+ } else { -+ int err; -+ -+ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as -+ * skb->len = 0 will force tso_segs to 1. -+ */ -+ tcp_init_tso_segs(sk, subskb, 1); -+ /* Empty data-fins are sent immediatly on the subflow */ -+ TCP_SKB_CB(subskb)->when = tcp_time_stamp; -+ err = tcp_transmit_skb(sk, subskb, 1, GFP_ATOMIC); -+ -+ /* It has not been queued, we can free it now. */ -+ kfree_skb(subskb); -+ -+ if (err) -+ return false; -+ } -+ -+ if (!tp->mptcp->fully_established) { -+ tp->mptcp->second_packet = 1; -+ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq; -+ } -+ -+ return true; -+} -+ -+/* Fragment an skb and update the mptcp meta-data. Due to reinject, we -+ * might need to undo some operations done by tcp_fragment. -+ */ -+static int mptcp_fragment(struct sock *meta_sk, struct sk_buff *skb, u32 len, -+ gfp_t gfp, int reinject) -+{ -+ int ret, diff, old_factor; -+ struct sk_buff *buff; -+ u8 flags; -+ -+ if (skb_headlen(skb) < len) -+ diff = skb->len - len; -+ else -+ diff = skb->data_len; -+ old_factor = tcp_skb_pcount(skb); -+ -+ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb. -+ * At the MPTCP-level we do not care about the absolute value. All we -+ * care about is that it is set to 1 for accurate packets_out -+ * accounting. -+ */ -+ ret = tcp_fragment(meta_sk, skb, len, UINT_MAX, gfp); -+ if (ret) -+ return ret; -+ -+ buff = skb->next; -+ -+ flags = TCP_SKB_CB(skb)->mptcp_flags; -+ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); -+ TCP_SKB_CB(buff)->mptcp_flags = flags; -+ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask; -+ -+ /* If reinject == 1, the buff will be added to the reinject -+ * queue, which is currently not part of memory accounting. So -+ * undo the changes done by tcp_fragment and update the -+ * reinject queue. Also, undo changes to the packet counters. -+ */ -+ if (reinject == 1) { -+ int undo = buff->truesize - diff; -+ meta_sk->sk_wmem_queued -= undo; -+ sk_mem_uncharge(meta_sk, undo); -+ -+ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++; -+ meta_sk->sk_write_queue.qlen--; -+ -+ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { -+ undo = old_factor - tcp_skb_pcount(skb) - -+ tcp_skb_pcount(buff); -+ if (undo) -+ tcp_adjust_pcount(meta_sk, skb, -undo); -+ } -+ } -+ -+ return 0; -+} -+ -+/* Inspired by tcp_write_wakeup */ -+int mptcp_write_wakeup(struct sock *meta_sk) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct sk_buff *skb; -+ struct sock *sk_it; -+ int ans = 0; -+ -+ if (meta_sk->sk_state == TCP_CLOSE) -+ return -1; -+ -+ skb = tcp_send_head(meta_sk); -+ if (skb && -+ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) { -+ unsigned int mss; -+ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq; -+ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true); -+ struct tcp_sock *subtp; -+ if (!subsk) -+ goto window_probe; -+ subtp = tcp_sk(subsk); -+ mss = tcp_current_mss(subsk); -+ -+ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq, -+ tcp_wnd_end(subtp) - subtp->write_seq); -+ -+ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) -+ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; -+ -+ /* We are probing the opening of a window -+ * but the window size is != 0 -+ * must have been a result SWS avoidance ( sender ) -+ */ -+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || -+ skb->len > mss) { -+ seg_size = min(seg_size, mss); -+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; -+ if (mptcp_fragment(meta_sk, skb, seg_size, -+ GFP_ATOMIC, 0)) -+ return -1; -+ } else if (!tcp_skb_pcount(skb)) { -+ /* see mptcp_write_xmit on why we use UINT_MAX */ -+ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX); -+ } -+ -+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; -+ if (!mptcp_skb_entail(subsk, skb, 0)) -+ return -1; -+ TCP_SKB_CB(skb)->when = tcp_time_stamp; -+ -+ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq - -+ TCP_SKB_CB(skb)->seq); -+ tcp_event_new_data_sent(meta_sk, skb); -+ -+ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH); -+ -+ return 0; -+ } else { -+window_probe: -+ if (between(meta_tp->snd_up, meta_tp->snd_una + 1, -+ meta_tp->snd_una + 0xFFFF)) { -+ mptcp_for_each_sk(meta_tp->mpcb, sk_it) { -+ if (mptcp_sk_can_send_ack(sk_it)) -+ tcp_xmit_probe_skb(sk_it, 1); -+ } -+ } -+ -+ /* At least one of the tcp_xmit_probe_skb's has to succeed */ -+ mptcp_for_each_sk(meta_tp->mpcb, sk_it) { -+ int ret; -+ -+ if (!mptcp_sk_can_send_ack(sk_it)) -+ continue; -+ -+ ret = tcp_xmit_probe_skb(sk_it, 0); -+ if (unlikely(ret > 0)) -+ ans = ret; -+ } -+ return ans; -+ } -+} -+ -+bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle, -+ int push_one, gfp_t gfp) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp; -+ struct sock *subsk = NULL; -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ struct sk_buff *skb; -+ unsigned int sent_pkts; -+ int reinject = 0; -+ unsigned int sublimit; -+ -+ sent_pkts = 0; -+ -+ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk, -+ &sublimit))) { -+ unsigned int limit; -+ -+ subtp = tcp_sk(subsk); -+ mss_now = tcp_current_mss(subsk); -+ -+ if (reinject == 1) { -+ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { -+ /* Segment already reached the peer, take the next one */ -+ __skb_unlink(skb, &mpcb->reinject_queue); -+ __kfree_skb(skb); -+ continue; -+ } -+ } -+ -+ /* If the segment was cloned (e.g. a meta retransmission), -+ * the header must be expanded/copied so that there is no -+ * corruption of TSO information. -+ */ -+ if (skb_unclone(skb, GFP_ATOMIC)) -+ break; -+ -+ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) -+ break; -+ -+ /* Force tso_segs to 1 by using UINT_MAX. -+ * We actually don't care about the exact number of segments -+ * emitted on the subflow. We need just to set tso_segs, because -+ * we still need an accurate packets_out count in -+ * tcp_event_new_data_sent. -+ */ -+ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX); -+ -+ /* Check for nagle, irregardless of tso_segs. If the segment is -+ * actually larger than mss_now (TSO segment), then -+ * tcp_nagle_check will have partial == false and always trigger -+ * the transmission. -+ * tcp_write_xmit has a TSO-level nagle check which is not -+ * subject to the MPTCP-level. It is based on the properties of -+ * the subflow, not the MPTCP-level. -+ */ -+ if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now, -+ (tcp_skb_is_last(meta_sk, skb) ? -+ nonagle : TCP_NAGLE_PUSH)))) -+ break; -+ -+ limit = mss_now; -+ /* skb->len > mss_now is the equivalent of tso_segs > 1 in -+ * tcp_write_xmit. Otherwise split-point would return 0. -+ */ -+ if (skb->len > mss_now && !tcp_urg_mode(meta_tp)) -+ /* We limit the size of the skb so that it fits into the -+ * window. Call tcp_mss_split_point to avoid duplicating -+ * code. -+ * We really only care about fitting the skb into the -+ * window. That's why we use UINT_MAX. If the skb does -+ * not fit into the cwnd_quota or the NIC's max-segs -+ * limitation, it will be split by the subflow's -+ * tcp_write_xmit which does the appropriate call to -+ * tcp_mss_split_point. -+ */ -+ limit = tcp_mss_split_point(meta_sk, skb, mss_now, -+ UINT_MAX / mss_now, -+ nonagle); -+ -+ if (sublimit) -+ limit = min(limit, sublimit); -+ -+ if (skb->len > limit && -+ unlikely(mptcp_fragment(meta_sk, skb, limit, gfp, reinject))) -+ break; -+ -+ if (!mptcp_skb_entail(subsk, skb, reinject)) -+ break; -+ /* Nagle is handled at the MPTCP-layer, so -+ * always push on the subflow -+ */ -+ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH); -+ TCP_SKB_CB(skb)->when = tcp_time_stamp; -+ -+ if (!reinject) { -+ mptcp_check_sndseq_wrap(meta_tp, -+ TCP_SKB_CB(skb)->end_seq - -+ TCP_SKB_CB(skb)->seq); -+ tcp_event_new_data_sent(meta_sk, skb); -+ } -+ -+ tcp_minshall_update(meta_tp, mss_now, skb); -+ sent_pkts += tcp_skb_pcount(skb); -+ -+ if (reinject > 0) { -+ __skb_unlink(skb, &mpcb->reinject_queue); -+ kfree_skb(skb); -+ } -+ -+ if (push_one) -+ break; -+ } -+ -+ return !meta_tp->packets_out && tcp_send_head(meta_sk); -+} -+ -+void mptcp_write_space(struct sock *sk) -+{ -+ mptcp_push_pending_frames(mptcp_meta_sk(sk)); -+} -+ -+u32 __mptcp_select_window(struct sock *sk) -+{ -+ struct inet_connection_sock *icsk = inet_csk(sk); -+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); -+ int mss, free_space, full_space, window; -+ -+ /* MSS for the peer's data. Previous versions used mss_clamp -+ * here. I don't know if the value based on our guesses -+ * of peer's MSS is better for the performance. It's more correct -+ * but may be worse for the performance because of rcv_mss -+ * fluctuations. --SAW 1998/11/1 -+ */ -+ mss = icsk->icsk_ack.rcv_mss; -+ free_space = tcp_space(sk); -+ full_space = min_t(int, meta_tp->window_clamp, -+ tcp_full_space(sk)); -+ -+ if (mss > full_space) -+ mss = full_space; -+ -+ if (free_space < (full_space >> 1)) { -+ icsk->icsk_ack.quick = 0; -+ -+ if (tcp_memory_pressure) -+ /* TODO this has to be adapted when we support different -+ * MSS's among the subflows. -+ */ -+ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh, -+ 4U * meta_tp->advmss); -+ -+ if (free_space < mss) -+ return 0; -+ } -+ -+ if (free_space > meta_tp->rcv_ssthresh) -+ free_space = meta_tp->rcv_ssthresh; -+ -+ /* Don't do rounding if we are using window scaling, since the -+ * scaled window will not line up with the MSS boundary anyway. -+ */ -+ window = meta_tp->rcv_wnd; -+ if (tp->rx_opt.rcv_wscale) { -+ window = free_space; -+ -+ /* Advertise enough space so that it won't get scaled away. -+ * Import case: prevent zero window announcement if -+ * 1< mss. -+ */ -+ if (((window >> tp->rx_opt.rcv_wscale) << tp-> -+ rx_opt.rcv_wscale) != window) -+ window = (((window >> tp->rx_opt.rcv_wscale) + 1) -+ << tp->rx_opt.rcv_wscale); -+ } else { -+ /* Get the largest window that is a nice multiple of mss. -+ * Window clamp already applied above. -+ * If our current window offering is within 1 mss of the -+ * free space we just keep it. This prevents the divide -+ * and multiply from happening most of the time. -+ * We also don't do any window rounding when the free space -+ * is too small. -+ */ -+ if (window <= free_space - mss || window > free_space) -+ window = (free_space / mss) * mss; -+ else if (mss == full_space && -+ free_space > window + (full_space >> 1)) -+ window = free_space; -+ } -+ -+ return window; -+} -+ -+void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts, -+ unsigned *remaining) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ -+ opts->options |= OPTION_MPTCP; -+ if (is_master_tp(tp)) { -+ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN; -+ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; -+ opts->mp_capable.sender_key = tp->mptcp_loc_key; -+ opts->dss_csum = !!sysctl_mptcp_checksum; -+ } else { -+ const struct mptcp_cb *mpcb = tp->mpcb; -+ -+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN; -+ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN; -+ opts->mp_join_syns.token = mpcb->mptcp_rem_token; -+ opts->mp_join_syns.low_prio = tp->mptcp->low_prio; -+ opts->addr_id = tp->mptcp->loc_id; -+ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce; -+ } -+} -+ -+void mptcp_synack_options(struct request_sock *req, -+ struct tcp_out_options *opts, unsigned *remaining) -+{ -+ struct mptcp_request_sock *mtreq; -+ mtreq = mptcp_rsk(req); -+ -+ opts->options |= OPTION_MPTCP; -+ /* MPCB not yet set - thus it's a new MPTCP-session */ -+ if (!mtreq->is_sub) { -+ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK; -+ opts->mp_capable.sender_key = mtreq->mptcp_loc_key; -+ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum; -+ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; -+ } else { -+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK; -+ opts->mp_join_syns.sender_truncated_mac = -+ mtreq->mptcp_hash_tmac; -+ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce; -+ opts->mp_join_syns.low_prio = mtreq->low_prio; -+ opts->addr_id = mtreq->loc_id; -+ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN; -+ } -+} -+ -+void mptcp_established_options(struct sock *sk, struct sk_buff *skb, -+ struct tcp_out_options *opts, unsigned *size) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct mptcp_cb *mpcb = tp->mpcb; -+ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; -+ -+ /* We are coming from tcp_current_mss with the meta_sk as an argument. -+ * It does not make sense to check for the options, because when the -+ * segment gets sent, another subflow will be chosen. -+ */ -+ if (!skb && is_meta_sk(sk)) -+ return; -+ -+ /* In fallback mp_fail-mode, we have to repeat it until the fallback -+ * has been done by the sender -+ */ -+ if (unlikely(tp->mptcp->send_mp_fail)) { -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_MP_FAIL; -+ *size += MPTCP_SUB_LEN_FAIL; -+ return; -+ } -+ -+ if (unlikely(tp->send_mp_fclose)) { -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_MP_FCLOSE; -+ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; -+ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN; -+ return; -+ } -+ -+ /* 1. If we are the sender of the infinite-mapping, we need the -+ * MPTCPHDR_INF-flag, because a retransmission of the -+ * infinite-announcment still needs the mptcp-option. -+ * -+ * We need infinite_cutoff_seq, because retransmissions from before -+ * the infinite-cutoff-moment still need the MPTCP-signalling to stay -+ * consistent. -+ * -+ * 2. If we are the receiver of the infinite-mapping, we always skip -+ * mptcp-options, because acknowledgments from before the -+ * infinite-mapping point have already been sent out. -+ * -+ * I know, the whole infinite-mapping stuff is ugly... -+ * -+ * TODO: Handle wrapped data-sequence numbers -+ * (even if it's very unlikely) -+ */ -+ if (unlikely(mpcb->infinite_mapping_snd) && -+ ((mpcb->send_infinite_mapping && tcb && -+ mptcp_is_data_seq(skb) && -+ !(tcb->mptcp_flags & MPTCPHDR_INF) && -+ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) || -+ !mpcb->send_infinite_mapping)) -+ return; -+ -+ if (unlikely(tp->mptcp->include_mpc)) { -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_MP_CAPABLE | -+ OPTION_TYPE_ACK; -+ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN; -+ opts->mp_capable.sender_key = mpcb->mptcp_loc_key; -+ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; -+ opts->dss_csum = mpcb->dss_csum; -+ -+ if (skb) -+ tp->mptcp->include_mpc = 0; -+ } -+ if (unlikely(tp->mptcp->pre_established)) { -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK; -+ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN; -+ } -+ -+ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) { -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_DATA_ACK; -+ /* If !skb, we come from tcp_current_mss and thus we always -+ * assume that the DSS-option will be set for the data-packet. -+ */ -+ if (skb && !mptcp_is_data_seq(skb)) { -+ *size += MPTCP_SUB_LEN_ACK_ALIGN; -+ } else { -+ /* Doesn't matter, if csum included or not. It will be -+ * either 10 or 12, and thus aligned = 12 -+ */ -+ *size += MPTCP_SUB_LEN_ACK_ALIGN + -+ MPTCP_SUB_LEN_SEQ_ALIGN; -+ } -+ -+ *size += MPTCP_SUB_LEN_DSS_ALIGN; -+ } -+ -+ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal) -+ mpcb->pm_ops->addr_signal(sk, size, opts, skb); -+ -+ if (unlikely(tp->mptcp->send_mp_prio) && -+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) { -+ opts->options |= OPTION_MPTCP; -+ opts->mptcp_options |= OPTION_MP_PRIO; -+ if (skb) -+ tp->mptcp->send_mp_prio = 0; -+ *size += MPTCP_SUB_LEN_PRIO_ALIGN; -+ } -+ -+ return; -+} -+ -+u16 mptcp_select_window(struct sock *sk) -+{ -+ u16 new_win = tcp_select_window(sk); -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct tcp_sock *meta_tp = mptcp_meta_tp(tp); -+ -+ meta_tp->rcv_wnd = tp->rcv_wnd; -+ meta_tp->rcv_wup = meta_tp->rcv_nxt; -+ -+ return new_win; -+} -+ -+void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, -+ const struct tcp_out_options *opts, -+ struct sk_buff *skb) -+{ -+ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) { -+ struct mp_capable *mpc = (struct mp_capable *)ptr; -+ -+ mpc->kind = TCPOPT_MPTCP; -+ -+ if ((OPTION_TYPE_SYN & opts->mptcp_options) || -+ (OPTION_TYPE_SYNACK & opts->mptcp_options)) { -+ mpc->sender_key = opts->mp_capable.sender_key; -+ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN; -+ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; -+ } else if (OPTION_TYPE_ACK & opts->mptcp_options) { -+ mpc->sender_key = opts->mp_capable.sender_key; -+ mpc->receiver_key = opts->mp_capable.receiver_key; -+ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK; -+ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2; -+ } -+ -+ mpc->sub = MPTCP_SUB_CAPABLE; -+ mpc->ver = 0; -+ mpc->a = opts->dss_csum; -+ mpc->b = 0; -+ mpc->rsv = 0; -+ mpc->h = 1; -+ } -+ -+ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) { -+ struct mp_join *mpj = (struct mp_join *)ptr; -+ -+ mpj->kind = TCPOPT_MPTCP; -+ mpj->sub = MPTCP_SUB_JOIN; -+ mpj->rsv = 0; -+ -+ if (OPTION_TYPE_SYN & opts->mptcp_options) { -+ mpj->len = MPTCP_SUB_LEN_JOIN_SYN; -+ mpj->u.syn.token = opts->mp_join_syns.token; -+ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce; -+ mpj->b = opts->mp_join_syns.low_prio; -+ mpj->addr_id = opts->addr_id; -+ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2; -+ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) { -+ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK; -+ mpj->u.synack.mac = -+ opts->mp_join_syns.sender_truncated_mac; -+ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce; -+ mpj->b = opts->mp_join_syns.low_prio; -+ mpj->addr_id = opts->addr_id; -+ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2; -+ } else if (OPTION_TYPE_ACK & opts->mptcp_options) { -+ mpj->len = MPTCP_SUB_LEN_JOIN_ACK; -+ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */ -+ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20); -+ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2; -+ } -+ } -+ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) { -+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; -+ -+ mpadd->kind = TCPOPT_MPTCP; -+ if (opts->add_addr_v4) { -+ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4; -+ mpadd->sub = MPTCP_SUB_ADD_ADDR; -+ mpadd->ipver = 4; -+ mpadd->addr_id = opts->add_addr4.addr_id; -+ mpadd->u.v4.addr = opts->add_addr4.addr; -+ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2; -+ } else if (opts->add_addr_v6) { -+ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6; -+ mpadd->sub = MPTCP_SUB_ADD_ADDR; -+ mpadd->ipver = 6; -+ mpadd->addr_id = opts->add_addr6.addr_id; -+ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr, -+ sizeof(mpadd->u.v6.addr)); -+ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2; -+ } -+ } -+ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) { -+ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; -+ u8 *addrs_id; -+ int id, len, len_align; -+ -+ len = mptcp_sub_len_remove_addr(opts->remove_addrs); -+ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs); -+ -+ mprem->kind = TCPOPT_MPTCP; -+ mprem->len = len; -+ mprem->sub = MPTCP_SUB_REMOVE_ADDR; -+ mprem->rsv = 0; -+ addrs_id = &mprem->addrs_id; -+ -+ mptcp_for_each_bit_set(opts->remove_addrs, id) -+ *(addrs_id++) = id; -+ -+ /* Fill the rest with NOP's */ -+ if (len_align > len) { -+ int i; -+ for (i = 0; i < len_align - len; i++) -+ *(addrs_id++) = TCPOPT_NOP; -+ } -+ -+ ptr += len_align >> 2; -+ } -+ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) { -+ struct mp_fail *mpfail = (struct mp_fail *)ptr; -+ -+ mpfail->kind = TCPOPT_MPTCP; -+ mpfail->len = MPTCP_SUB_LEN_FAIL; -+ mpfail->sub = MPTCP_SUB_FAIL; -+ mpfail->rsv1 = 0; -+ mpfail->rsv2 = 0; -+ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq); -+ -+ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2; -+ } -+ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) { -+ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr; -+ -+ mpfclose->kind = TCPOPT_MPTCP; -+ mpfclose->len = MPTCP_SUB_LEN_FCLOSE; -+ mpfclose->sub = MPTCP_SUB_FCLOSE; -+ mpfclose->rsv1 = 0; -+ mpfclose->rsv2 = 0; -+ mpfclose->key = opts->mp_capable.receiver_key; -+ -+ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2; -+ } -+ -+ if (OPTION_DATA_ACK & opts->mptcp_options) { -+ if (!mptcp_is_data_seq(skb)) -+ ptr += mptcp_write_dss_data_ack(tp, skb, ptr); -+ else -+ ptr += mptcp_write_dss_data_seq(tp, skb, ptr); -+ } -+ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) { -+ struct mp_prio *mpprio = (struct mp_prio *)ptr; -+ -+ mpprio->kind = TCPOPT_MPTCP; -+ mpprio->len = MPTCP_SUB_LEN_PRIO; -+ mpprio->sub = MPTCP_SUB_PRIO; -+ mpprio->rsv = 0; -+ mpprio->b = tp->mptcp->low_prio; -+ mpprio->addr_id = TCPOPT_NOP; -+ -+ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2; -+ } -+} -+ -+/* Sends the datafin */ -+void mptcp_send_fin(struct sock *meta_sk) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct sk_buff *skb = tcp_write_queue_tail(meta_sk); -+ int mss_now; -+ -+ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) -+ meta_tp->mpcb->passive_close = 1; -+ -+ /* Optimization, tack on the FIN if we have a queue of -+ * unsent frames. But be careful about outgoing SACKS -+ * and IP options. -+ */ -+ mss_now = mptcp_current_mss(meta_sk); -+ -+ if (tcp_send_head(meta_sk) != NULL) { -+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; -+ TCP_SKB_CB(skb)->end_seq++; -+ meta_tp->write_seq++; -+ } else { -+ /* Socket is locked, keep trying until memory is available. */ -+ for (;;) { -+ skb = alloc_skb_fclone(MAX_TCP_HEADER, -+ meta_sk->sk_allocation); -+ if (skb) -+ break; -+ yield(); -+ } -+ /* Reserve space for headers and prepare control bits. */ -+ skb_reserve(skb, MAX_TCP_HEADER); -+ -+ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK); -+ TCP_SKB_CB(skb)->end_seq++; -+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; -+ tcp_queue_skb(meta_sk, skb); -+ } -+ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF); -+} -+ -+void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ struct sock *sk = NULL, *sk_it = NULL, *tmpsk; -+ -+ if (!mpcb->cnt_subflows) -+ return; -+ -+ WARN_ON(meta_tp->send_mp_fclose); -+ -+ /* First - select a socket */ -+ sk = mptcp_select_ack_sock(meta_sk); -+ -+ /* May happen if no subflow is in an appropriate state */ -+ if (!sk) -+ return; -+ -+ /* We are in infinite mode - just send a reset */ -+ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) { -+ sk->sk_err = ECONNRESET; -+ if (tcp_need_reset(sk->sk_state)) -+ tcp_send_active_reset(sk, priority); -+ mptcp_sub_force_close(sk); -+ return; -+ } -+ -+ -+ tcp_sk(sk)->send_mp_fclose = 1; -+ /** Reset all other subflows */ -+ -+ /* tcp_done must be handled with bh disabled */ -+ if (!in_serving_softirq()) -+ local_bh_disable(); -+ -+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { -+ if (tcp_sk(sk_it)->send_mp_fclose) -+ continue; -+ -+ sk_it->sk_err = ECONNRESET; -+ if (tcp_need_reset(sk_it->sk_state)) -+ tcp_send_active_reset(sk_it, GFP_ATOMIC); -+ mptcp_sub_force_close(sk_it); -+ } -+ -+ if (!in_serving_softirq()) -+ local_bh_enable(); -+ -+ tcp_send_ack(sk); -+ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto); -+ -+ meta_tp->send_mp_fclose = 1; -+} -+ -+static void mptcp_ack_retransmit_timer(struct sock *sk) -+{ -+ struct sk_buff *skb; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct inet_connection_sock *icsk = inet_csk(sk); -+ -+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) -+ goto out; /* Routing failure or similar */ -+ -+ if (!tp->retrans_stamp) -+ tp->retrans_stamp = tcp_time_stamp ? : 1; -+ -+ if (tcp_write_timeout(sk)) { -+ tp->mptcp->pre_established = 0; -+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); -+ tp->ops->send_active_reset(sk, GFP_ATOMIC); -+ goto out; -+ } -+ -+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); -+ if (skb == NULL) { -+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, -+ jiffies + icsk->icsk_rto); -+ return; -+ } -+ -+ /* Reserve space for headers and prepare control bits */ -+ skb_reserve(skb, MAX_TCP_HEADER); -+ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK); -+ -+ TCP_SKB_CB(skb)->when = tcp_time_stamp; -+ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) { -+ /* Retransmission failed because of local congestion, -+ * do not backoff. -+ */ -+ if (!icsk->icsk_retransmits) -+ icsk->icsk_retransmits = 1; -+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, -+ jiffies + icsk->icsk_rto); -+ return; -+ } -+ -+ -+ icsk->icsk_retransmits++; -+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); -+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, -+ jiffies + icsk->icsk_rto); -+ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) -+ __sk_dst_reset(sk); -+ -+out:; -+} -+ -+void mptcp_ack_handler(unsigned long data) -+{ -+ struct sock *sk = (struct sock *)data; -+ struct sock *meta_sk = mptcp_meta_sk(sk); -+ -+ bh_lock_sock(meta_sk); -+ if (sock_owned_by_user(meta_sk)) { -+ /* Try again later */ -+ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer, -+ jiffies + (HZ / 20)); -+ goto out_unlock; -+ } -+ -+ if (sk->sk_state == TCP_CLOSE) -+ goto out_unlock; -+ if (!tcp_sk(sk)->mptcp->pre_established) -+ goto out_unlock; -+ -+ mptcp_ack_retransmit_timer(sk); -+ -+ sk_mem_reclaim(sk); -+ -+out_unlock: -+ bh_unlock_sock(meta_sk); -+ sock_put(sk); -+} -+ -+/* Similar to tcp_retransmit_skb -+ * -+ * The diff is that we handle the retransmission-stats (retrans_stamp) at the -+ * meta-level. -+ */ -+int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct sock *subsk; -+ unsigned int limit, mss_now; -+ int err = -1; -+ -+ /* Do not sent more than we queued. 1/4 is reserved for possible -+ * copying overhead: fragmentation, tunneling, mangling etc. -+ * -+ * This is a meta-retransmission thus we check on the meta-socket. -+ */ -+ if (atomic_read(&meta_sk->sk_wmem_alloc) > -+ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) { -+ return -EAGAIN; -+ } -+ -+ /* We need to make sure that the retransmitted segment can be sent on a -+ * subflow right now. If it is too big, it needs to be fragmented. -+ */ -+ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false); -+ if (!subsk) { -+ /* We want to increase icsk_retransmits, thus return 0, so that -+ * mptcp_retransmit_timer enters the desired branch. -+ */ -+ err = 0; -+ goto failed; -+ } -+ mss_now = tcp_current_mss(subsk); -+ -+ /* If the segment was cloned (e.g. a meta retransmission), the header -+ * must be expanded/copied so that there is no corruption of TSO -+ * information. -+ */ -+ if (skb_unclone(skb, GFP_ATOMIC)) { -+ err = -ENOMEM; -+ goto failed; -+ } -+ -+ /* Must have been set by mptcp_write_xmit before */ -+ BUG_ON(!tcp_skb_pcount(skb)); -+ -+ limit = mss_now; -+ /* skb->len > mss_now is the equivalent of tso_segs > 1 in -+ * tcp_write_xmit. Otherwise split-point would return 0. -+ */ -+ if (skb->len > mss_now && !tcp_urg_mode(meta_tp)) -+ limit = tcp_mss_split_point(meta_sk, skb, mss_now, -+ UINT_MAX / mss_now, -+ TCP_NAGLE_OFF); -+ -+ if (skb->len > limit && -+ unlikely(mptcp_fragment(meta_sk, skb, limit, -+ GFP_ATOMIC, 0))) -+ goto failed; -+ -+ if (!mptcp_skb_entail(subsk, skb, -1)) -+ goto failed; -+ TCP_SKB_CB(skb)->when = tcp_time_stamp; -+ -+ /* Update global TCP statistics. */ -+ TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS); -+ -+ /* Diff to tcp_retransmit_skb */ -+ -+ /* Save stamp of the first retransmit. */ -+ if (!meta_tp->retrans_stamp) -+ meta_tp->retrans_stamp = TCP_SKB_CB(skb)->when; -+ -+ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH); -+ -+ return 0; -+ -+failed: -+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL); -+ return err; -+} -+ -+/* Similar to tcp_retransmit_timer -+ * -+ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message -+ * and that we don't have an srtt estimation at the meta-level. -+ */ -+void mptcp_retransmit_timer(struct sock *meta_sk) -+{ -+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); -+ struct mptcp_cb *mpcb = meta_tp->mpcb; -+ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); -+ int err; -+ -+ /* In fallback, retransmission is handled at the subflow-level */ -+ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd || -+ mpcb->send_infinite_mapping) -+ return; -+ -+ WARN_ON(tcp_write_queue_empty(meta_sk)); -+ -+ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) && -+ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { -+ /* Receiver dastardly shrinks window. Our retransmits -+ * become zero probes, but we should not timeout this -+ * connection. If the socket is an orphan, time it out, -+ * we cannot allow such beasts to hang infinitely. -+ */ -+ struct inet_sock *meta_inet = inet_sk(meta_sk); -+ if (meta_sk->sk_family == AF_INET) { -+ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", -+ &meta_inet->inet_daddr, -+ ntohs(meta_inet->inet_dport), -+ meta_inet->inet_num, meta_tp->snd_una, -+ meta_tp->snd_nxt); -+ } -+#if IS_ENABLED(CONFIG_IPV6) -+ else if (meta_sk->sk_family == AF_INET6) { -+ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", -+ &meta_sk->sk_v6_daddr, -+ ntohs(meta_inet->inet_dport), -+ meta_inet->inet_num, meta_tp->snd_una, -+ meta_tp->snd_nxt); -+ } -+#endif -+ if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) { -+ tcp_write_err(meta_sk); -+ return; -+ } -+ -+ mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); -+ goto out_reset_timer; -+ } -+ -+ if (tcp_write_timeout(meta_sk)) -+ return; -+ -+ if (meta_icsk->icsk_retransmits == 0) -+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS); -+ -+ meta_icsk->icsk_ca_state = TCP_CA_Loss; -+ -+ err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); -+ if (err > 0) { -+ /* Retransmission failed because of local congestion, -+ * do not backoff. -+ */ -+ if (!meta_icsk->icsk_retransmits) -+ meta_icsk->icsk_retransmits = 1; -+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, -+ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), -+ TCP_RTO_MAX); -+ return; -+ } -+ -+ /* Increase the timeout each time we retransmit. Note that -+ * we do not increase the rtt estimate. rto is initialized -+ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests -+ * that doubling rto each time is the least we can get away with. -+ * In KA9Q, Karn uses this for the first few times, and then -+ * goes to quadratic. netBSD doubles, but only goes up to *64, -+ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is -+ * defined in the protocol as the maximum possible RTT. I guess -+ * we'll have to use something other than TCP to talk to the -+ * University of Mars. -+ * -+ * PAWS allows us longer timeouts and large windows, so once -+ * implemented ftp to mars will work nicely. We will have to fix -+ * the 120 second clamps though! -+ */ -+ meta_icsk->icsk_backoff++; -+ meta_icsk->icsk_retransmits++; -+ -+out_reset_timer: -+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is -+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this -+ * might be increased if the stream oscillates between thin and thick, -+ * thus the old value might already be too high compared to the value -+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without -+ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating -+ * exponential backoff behaviour to avoid continue hammering -+ * linear-timeout retransmissions into a black hole -+ */ -+ if (meta_sk->sk_state == TCP_ESTABLISHED && -+ (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && -+ tcp_stream_is_thin(meta_tp) && -+ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { -+ meta_icsk->icsk_backoff = 0; -+ /* We cannot do the same as in tcp_write_timer because the -+ * srtt is not set here. -+ */ -+ mptcp_set_rto(meta_sk); -+ } else { -+ /* Use normal (exponential) backoff */ -+ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX); -+ } -+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX); -+ -+ return; -+} -+ -+/* Modify values to an mptcp-level for the initial window of new subflows */ -+void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, -+ __u32 *window_clamp, int wscale_ok, -+ __u8 *rcv_wscale, __u32 init_rcv_wnd, -+ const struct sock *sk) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; -+ -+ *window_clamp = mpcb->orig_window_clamp; -+ __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf); -+ -+ tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp, -+ wscale_ok, rcv_wscale, init_rcv_wnd, sk); -+} -+ -+static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss, -+ unsigned int (*mss_cb)(struct sock *sk)) -+{ -+ struct sock *sk; -+ u64 rate = 0; -+ -+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ int this_mss; -+ u64 this_rate; -+ -+ if (!mptcp_sk_can_send(sk)) -+ continue; -+ -+ /* Do not consider subflows without a RTT estimation yet -+ * otherwise this_rate >>> rate. -+ */ -+ if (unlikely(!tp->srtt_us)) -+ continue; -+ -+ this_mss = mss_cb(sk); -+ -+ /* If this_mss is smaller than mss, it means that a segment will -+ * be splitted in two (or more) when pushed on this subflow. If -+ * you consider that mss = 1428 and this_mss = 1420 then two -+ * segments will be generated: a 1420-byte and 8-byte segment. -+ * The latter will introduce a large overhead as for a single -+ * data segment 2 slots will be used in the congestion window. -+ * Therefore reducing by ~2 the potential throughput of this -+ * subflow. Indeed, 1428 will be send while 2840 could have been -+ * sent if mss == 1420 reducing the throughput by 2840 / 1428. -+ * -+ * The following algorithm take into account this overhead -+ * when computing the potential throughput that MPTCP can -+ * achieve when generating mss-byte segments. -+ * -+ * The formulae is the following: -+ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub} -+ * Where ratio is computed as follows: -+ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub} -+ * -+ * ratio gives the reduction factor of the theoretical -+ * throughput a subflow can achieve if MPTCP uses a specific -+ * MSS value. -+ */ -+ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) * -+ max(tp->snd_cwnd, tp->packets_out), -+ (u64)tp->srtt_us * -+ DIV_ROUND_UP(mss, this_mss) * this_mss); -+ rate += this_rate; -+ } -+ -+ return rate; -+} -+ -+static unsigned int __mptcp_current_mss(const struct sock *meta_sk, -+ unsigned int (*mss_cb)(struct sock *sk)) -+{ -+ unsigned int mss = 0; -+ u64 rate = 0; -+ struct sock *sk; -+ -+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { -+ int this_mss; -+ u64 this_rate; -+ -+ if (!mptcp_sk_can_send(sk)) -+ continue; -+ -+ this_mss = mss_cb(sk); -+ -+ /* Same mss values will produce the same throughput. */ -+ if (this_mss == mss) -+ continue; -+ -+ /* See whether using this mss value can theoretically improve -+ * the performances. -+ */ -+ this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb); -+ if (this_rate >= rate) { -+ mss = this_mss; -+ rate = this_rate; -+ } -+ } -+ -+ return mss; -+} -+ -+unsigned int mptcp_current_mss(struct sock *meta_sk) -+{ -+ unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss); -+ -+ /* If no subflow is available, we take a default-mss from the -+ * meta-socket. -+ */ -+ return !mss ? tcp_current_mss(meta_sk) : mss; -+} -+ -+static unsigned int mptcp_select_size_mss(struct sock *sk) -+{ -+ return tcp_sk(sk)->mss_cache; -+} -+ -+int mptcp_select_size(const struct sock *meta_sk, bool sg) -+{ -+ unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss); -+ -+ if (sg) { -+ if (mptcp_sk_can_gso(meta_sk)) { -+ mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); -+ } else { -+ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); -+ -+ if (mss >= pgbreak && -+ mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) -+ mss = pgbreak; -+ } -+ } -+ -+ return !mss ? tcp_sk(meta_sk)->mss_cache : mss; -+} -+ -+int mptcp_check_snd_buf(const struct tcp_sock *tp) -+{ -+ const struct sock *sk; -+ u32 rtt_max = tp->srtt_us; -+ u64 bw_est; -+ -+ if (!tp->srtt_us) -+ return tp->reordering + 1; -+ -+ mptcp_for_each_sk(tp->mpcb, sk) { -+ if (!mptcp_sk_can_send(sk)) -+ continue; -+ -+ if (rtt_max < tcp_sk(sk)->srtt_us) -+ rtt_max = tcp_sk(sk)->srtt_us; -+ } -+ -+ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16, -+ (u64)tp->srtt_us); -+ -+ return max_t(unsigned int, (u32)(bw_est >> 16), -+ tp->reordering + 1); -+} -+ -+unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now, -+ int large_allowed) -+{ -+ struct sock *sk; -+ u32 xmit_size_goal = 0; -+ -+ if (large_allowed && mptcp_sk_can_gso(meta_sk)) { -+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { -+ int this_size_goal; -+ -+ if (!mptcp_sk_can_send(sk)) -+ continue; -+ -+ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1); -+ if (this_size_goal > xmit_size_goal) -+ xmit_size_goal = this_size_goal; -+ } -+ } -+ -+ return max(xmit_size_goal, mss_now); -+} -+ -+/* Similar to tcp_trim_head - but we correctly copy the DSS-option */ -+int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) -+{ -+ if (skb_cloned(skb)) { -+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) -+ return -ENOMEM; -+ } -+ -+ __pskb_trim_head(skb, len); -+ -+ TCP_SKB_CB(skb)->seq += len; -+ skb->ip_summed = CHECKSUM_PARTIAL; -+ -+ skb->truesize -= len; -+ sk->sk_wmem_queued -= len; -+ sk_mem_uncharge(sk, len); -+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); -+ -+ /* Any change of skb->len requires recalculation of tso factor. */ -+ if (tcp_skb_pcount(skb) > 1) -+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); -+ -+ return 0; -+} -diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c -new file mode 100644 -index 000000000000..9542f950729f ---- /dev/null -+++ b/net/mptcp/mptcp_pm.c -@@ -0,0 +1,169 @@ -+/* -+ * MPTCP implementation - MPTCP-subflow-management -+ * -+ * Initial Design & Implementation: -+ * Sébastien Barré -+ * -+ * Current Maintainer & Author: -+ * Christoph Paasch -+ * -+ * Additional authors: -+ * Jaakko Korkeaniemi -+ * Gregory Detal -+ * Fabien Duchêne -+ * Andreas Seelinger -+ * Lavkesh Lahngir -+ * Andreas Ripke -+ * Vlad Dogaru -+ * Octavian Purdila -+ * John Ronan -+ * Catalin Nicutar -+ * Brandon Heller -+ * -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+ -+#include -+#include -+ -+static DEFINE_SPINLOCK(mptcp_pm_list_lock); -+static LIST_HEAD(mptcp_pm_list); -+ -+static int mptcp_default_id(sa_family_t family, union inet_addr *addr, -+ struct net *net, bool *low_prio) -+{ -+ return 0; -+} -+ -+struct mptcp_pm_ops mptcp_pm_default = { -+ .get_local_id = mptcp_default_id, /* We do not care */ -+ .name = "default", -+ .owner = THIS_MODULE, -+}; -+ -+static struct mptcp_pm_ops *mptcp_pm_find(const char *name) -+{ -+ struct mptcp_pm_ops *e; -+ -+ list_for_each_entry_rcu(e, &mptcp_pm_list, list) { -+ if (strcmp(e->name, name) == 0) -+ return e; -+ } -+ -+ return NULL; -+} -+ -+int mptcp_register_path_manager(struct mptcp_pm_ops *pm) -+{ -+ int ret = 0; -+ -+ if (!pm->get_local_id) -+ return -EINVAL; -+ -+ spin_lock(&mptcp_pm_list_lock); -+ if (mptcp_pm_find(pm->name)) { -+ pr_notice("%s already registered\n", pm->name); -+ ret = -EEXIST; -+ } else { -+ list_add_tail_rcu(&pm->list, &mptcp_pm_list); -+ pr_info("%s registered\n", pm->name); -+ } -+ spin_unlock(&mptcp_pm_list_lock); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(mptcp_register_path_manager); -+ -+void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm) -+{ -+ spin_lock(&mptcp_pm_list_lock); -+ list_del_rcu(&pm->list); -+ spin_unlock(&mptcp_pm_list_lock); -+} -+EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager); -+ -+void mptcp_get_default_path_manager(char *name) -+{ -+ struct mptcp_pm_ops *pm; -+ -+ BUG_ON(list_empty(&mptcp_pm_list)); -+ -+ rcu_read_lock(); -+ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list); -+ strncpy(name, pm->name, MPTCP_PM_NAME_MAX); -+ rcu_read_unlock(); -+} -+ -+int mptcp_set_default_path_manager(const char *name) -+{ -+ struct mptcp_pm_ops *pm; -+ int ret = -ENOENT; -+ -+ spin_lock(&mptcp_pm_list_lock); -+ pm = mptcp_pm_find(name); -+#ifdef CONFIG_MODULES -+ if (!pm && capable(CAP_NET_ADMIN)) { -+ spin_unlock(&mptcp_pm_list_lock); -+ -+ request_module("mptcp_%s", name); -+ spin_lock(&mptcp_pm_list_lock); -+ pm = mptcp_pm_find(name); -+ } -+#endif -+ -+ if (pm) { -+ list_move(&pm->list, &mptcp_pm_list); -+ ret = 0; -+ } else { -+ pr_info("%s is not available\n", name); -+ } -+ spin_unlock(&mptcp_pm_list_lock); -+ -+ return ret; -+} -+ -+void mptcp_init_path_manager(struct mptcp_cb *mpcb) -+{ -+ struct mptcp_pm_ops *pm; -+ -+ rcu_read_lock(); -+ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) { -+ if (try_module_get(pm->owner)) { -+ mpcb->pm_ops = pm; -+ break; -+ } -+ } -+ rcu_read_unlock(); -+} -+ -+/* Manage refcounts on socket close. */ -+void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb) -+{ -+ module_put(mpcb->pm_ops->owner); -+} -+ -+/* Fallback to the default path-manager. */ -+void mptcp_fallback_default(struct mptcp_cb *mpcb) -+{ -+ struct mptcp_pm_ops *pm; -+ -+ mptcp_cleanup_path_manager(mpcb); -+ pm = mptcp_pm_find("default"); -+ -+ /* Cannot fail - it's the default module */ -+ try_module_get(pm->owner); -+ mpcb->pm_ops = pm; -+} -+EXPORT_SYMBOL_GPL(mptcp_fallback_default); -+ -+/* Set default value from kernel configuration at bootup */ -+static int __init mptcp_path_manager_default(void) -+{ -+ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM); -+} -+late_initcall(mptcp_path_manager_default); -diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c -new file mode 100644 -index 000000000000..93278f684069 ---- /dev/null -+++ b/net/mptcp/mptcp_rr.c -@@ -0,0 +1,301 @@ -+/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ -+ -+#include -+#include -+ -+static unsigned char num_segments __read_mostly = 1; -+module_param(num_segments, byte, 0644); -+MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst"); -+ -+static bool cwnd_limited __read_mostly = 1; -+module_param(cwnd_limited, bool, 0644); -+MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows"); -+ -+struct rrsched_priv { -+ unsigned char quota; -+}; -+ -+static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp) -+{ -+ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0]; -+} -+ -+/* If the sub-socket sk available to send the skb? */ -+static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb, -+ bool zero_wnd_test, bool cwnd_test) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ unsigned int space, in_flight; -+ -+ /* Set of states for which we are allowed to send data */ -+ if (!mptcp_sk_can_send(sk)) -+ return false; -+ -+ /* We do not send data on this subflow unless it is -+ * fully established, i.e. the 4th ack has been received. -+ */ -+ if (tp->mptcp->pre_established) -+ return false; -+ -+ if (tp->pf) -+ return false; -+ -+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { -+ /* If SACK is disabled, and we got a loss, TCP does not exit -+ * the loss-state until something above high_seq has been acked. -+ * (see tcp_try_undo_recovery) -+ * -+ * high_seq is the snd_nxt at the moment of the RTO. As soon -+ * as we have an RTO, we won't push data on the subflow. -+ * Thus, snd_una can never go beyond high_seq. -+ */ -+ if (!tcp_is_reno(tp)) -+ return false; -+ else if (tp->snd_una != tp->high_seq) -+ return false; -+ } -+ -+ if (!tp->mptcp->fully_established) { -+ /* Make sure that we send in-order data */ -+ if (skb && tp->mptcp->second_packet && -+ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) -+ return false; -+ } -+ -+ if (!cwnd_test) -+ goto zero_wnd_test; -+ -+ in_flight = tcp_packets_in_flight(tp); -+ /* Not even a single spot in the cwnd */ -+ if (in_flight >= tp->snd_cwnd) -+ return false; -+ -+ /* Now, check if what is queued in the subflow's send-queue -+ * already fills the cwnd. -+ */ -+ space = (tp->snd_cwnd - in_flight) * tp->mss_cache; -+ -+ if (tp->write_seq - tp->snd_nxt > space) -+ return false; -+ -+zero_wnd_test: -+ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) -+ return false; -+ -+ return true; -+} -+ -+/* Are we not allowed to reinject this skb on tp? */ -+static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) -+{ -+ /* If the skb has already been enqueued in this sk, try to find -+ * another one. -+ */ -+ return skb && -+ /* Has the skb already been enqueued into this subsocket? */ -+ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; -+} -+ -+/* We just look for any subflow that is available */ -+static struct sock *rr_get_available_subflow(struct sock *meta_sk, -+ struct sk_buff *skb, -+ bool zero_wnd_test) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct sock *sk, *bestsk = NULL, *backupsk = NULL; -+ -+ /* Answer data_fin on same subflow!!! */ -+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && -+ skb && mptcp_is_data_fin(skb)) { -+ mptcp_for_each_sk(mpcb, sk) { -+ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && -+ mptcp_rr_is_available(sk, skb, zero_wnd_test, true)) -+ return sk; -+ } -+ } -+ -+ /* First, find the best subflow */ -+ mptcp_for_each_sk(mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true)) -+ continue; -+ -+ if (mptcp_rr_dont_reinject_skb(tp, skb)) { -+ backupsk = sk; -+ continue; -+ } -+ -+ bestsk = sk; -+ } -+ -+ if (bestsk) { -+ sk = bestsk; -+ } else if (backupsk) { -+ /* It has been sent on all subflows once - let's give it a -+ * chance again by restarting its pathmask. -+ */ -+ if (skb) -+ TCP_SKB_CB(skb)->path_mask = 0; -+ sk = backupsk; -+ } -+ -+ return sk; -+} -+ -+/* Returns the next segment to be sent from the mptcp meta-queue. -+ * (chooses the reinject queue if any segment is waiting in it, otherwise, -+ * chooses the normal write queue). -+ * Sets *@reinject to 1 if the returned segment comes from the -+ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, -+ * and sets it to -1 if it is a meta-level retransmission to optimize the -+ * receive-buffer. -+ */ -+static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct sk_buff *skb = NULL; -+ -+ *reinject = 0; -+ -+ /* If we are in fallback-mode, just take from the meta-send-queue */ -+ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) -+ return tcp_send_head(meta_sk); -+ -+ skb = skb_peek(&mpcb->reinject_queue); -+ -+ if (skb) -+ *reinject = 1; -+ else -+ skb = tcp_send_head(meta_sk); -+ return skb; -+} -+ -+static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk, -+ int *reinject, -+ struct sock **subsk, -+ unsigned int *limit) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct sock *sk_it, *choose_sk = NULL; -+ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject); -+ unsigned char split = num_segments; -+ unsigned char iter = 0, full_subs = 0; -+ -+ /* As we set it, we have to reset it as well. */ -+ *limit = 0; -+ -+ if (!skb) -+ return NULL; -+ -+ if (*reinject) { -+ *subsk = rr_get_available_subflow(meta_sk, skb, false); -+ if (!*subsk) -+ return NULL; -+ -+ return skb; -+ } -+ -+retry: -+ -+ /* First, we look for a subflow who is currently being used */ -+ mptcp_for_each_sk(mpcb, sk_it) { -+ struct tcp_sock *tp_it = tcp_sk(sk_it); -+ struct rrsched_priv *rsp = rrsched_get_priv(tp_it); -+ -+ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited)) -+ continue; -+ -+ iter++; -+ -+ /* Is this subflow currently being used? */ -+ if (rsp->quota > 0 && rsp->quota < num_segments) { -+ split = num_segments - rsp->quota; -+ choose_sk = sk_it; -+ goto found; -+ } -+ -+ /* Or, it's totally unused */ -+ if (!rsp->quota) { -+ split = num_segments; -+ choose_sk = sk_it; -+ } -+ -+ /* Or, it must then be fully used */ -+ if (rsp->quota == num_segments) -+ full_subs++; -+ } -+ -+ /* All considered subflows have a full quota, and we considered at -+ * least one. -+ */ -+ if (iter && iter == full_subs) { -+ /* So, we restart this round by setting quota to 0 and retry -+ * to find a subflow. -+ */ -+ mptcp_for_each_sk(mpcb, sk_it) { -+ struct tcp_sock *tp_it = tcp_sk(sk_it); -+ struct rrsched_priv *rsp = rrsched_get_priv(tp_it); -+ -+ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited)) -+ continue; -+ -+ rsp->quota = 0; -+ } -+ -+ goto retry; -+ } -+ -+found: -+ if (choose_sk) { -+ unsigned int mss_now; -+ struct tcp_sock *choose_tp = tcp_sk(choose_sk); -+ struct rrsched_priv *rsp = rrsched_get_priv(choose_tp); -+ -+ if (!mptcp_rr_is_available(choose_sk, skb, false, true)) -+ return NULL; -+ -+ *subsk = choose_sk; -+ mss_now = tcp_current_mss(*subsk); -+ *limit = split * mss_now; -+ -+ if (skb->len > mss_now) -+ rsp->quota += DIV_ROUND_UP(skb->len, mss_now); -+ else -+ rsp->quota++; -+ -+ return skb; -+ } -+ -+ return NULL; -+} -+ -+static struct mptcp_sched_ops mptcp_sched_rr = { -+ .get_subflow = rr_get_available_subflow, -+ .next_segment = mptcp_rr_next_segment, -+ .name = "roundrobin", -+ .owner = THIS_MODULE, -+}; -+ -+static int __init rr_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE); -+ -+ if (mptcp_register_scheduler(&mptcp_sched_rr)) -+ return -1; -+ -+ return 0; -+} -+ -+static void rr_unregister(void) -+{ -+ mptcp_unregister_scheduler(&mptcp_sched_rr); -+} -+ -+module_init(rr_register); -+module_exit(rr_unregister); -+ -+MODULE_AUTHOR("Christoph Paasch"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("ROUNDROBIN MPTCP"); -+MODULE_VERSION("0.89"); -diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c -new file mode 100644 -index 000000000000..6c7ff4eceac1 ---- /dev/null -+++ b/net/mptcp/mptcp_sched.c -@@ -0,0 +1,493 @@ -+/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ -+ -+#include -+#include -+ -+static DEFINE_SPINLOCK(mptcp_sched_list_lock); -+static LIST_HEAD(mptcp_sched_list); -+ -+struct defsched_priv { -+ u32 last_rbuf_opti; -+}; -+ -+static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp) -+{ -+ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0]; -+} -+ -+/* If the sub-socket sk available to send the skb? */ -+static bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb, -+ bool zero_wnd_test) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ unsigned int mss_now, space, in_flight; -+ -+ /* Set of states for which we are allowed to send data */ -+ if (!mptcp_sk_can_send(sk)) -+ return false; -+ -+ /* We do not send data on this subflow unless it is -+ * fully established, i.e. the 4th ack has been received. -+ */ -+ if (tp->mptcp->pre_established) -+ return false; -+ -+ if (tp->pf) -+ return false; -+ -+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { -+ /* If SACK is disabled, and we got a loss, TCP does not exit -+ * the loss-state until something above high_seq has been acked. -+ * (see tcp_try_undo_recovery) -+ * -+ * high_seq is the snd_nxt at the moment of the RTO. As soon -+ * as we have an RTO, we won't push data on the subflow. -+ * Thus, snd_una can never go beyond high_seq. -+ */ -+ if (!tcp_is_reno(tp)) -+ return false; -+ else if (tp->snd_una != tp->high_seq) -+ return false; -+ } -+ -+ if (!tp->mptcp->fully_established) { -+ /* Make sure that we send in-order data */ -+ if (skb && tp->mptcp->second_packet && -+ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) -+ return false; -+ } -+ -+ /* If TSQ is already throttling us, do not send on this subflow. When -+ * TSQ gets cleared the subflow becomes eligible again. -+ */ -+ if (test_bit(TSQ_THROTTLED, &tp->tsq_flags)) -+ return false; -+ -+ in_flight = tcp_packets_in_flight(tp); -+ /* Not even a single spot in the cwnd */ -+ if (in_flight >= tp->snd_cwnd) -+ return false; -+ -+ /* Now, check if what is queued in the subflow's send-queue -+ * already fills the cwnd. -+ */ -+ space = (tp->snd_cwnd - in_flight) * tp->mss_cache; -+ -+ if (tp->write_seq - tp->snd_nxt > space) -+ return false; -+ -+ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) -+ return false; -+ -+ mss_now = tcp_current_mss(sk); -+ -+ /* Don't send on this subflow if we bypass the allowed send-window at -+ * the per-subflow level. Similar to tcp_snd_wnd_test, but manually -+ * calculated end_seq (because here at this point end_seq is still at -+ * the meta-level). -+ */ -+ if (skb && !zero_wnd_test && -+ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp))) -+ return false; -+ -+ return true; -+} -+ -+/* Are we not allowed to reinject this skb on tp? */ -+static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) -+{ -+ /* If the skb has already been enqueued in this sk, try to find -+ * another one. -+ */ -+ return skb && -+ /* Has the skb already been enqueued into this subsocket? */ -+ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; -+} -+ -+/* This is the scheduler. This function decides on which flow to send -+ * a given MSS. If all subflows are found to be busy, NULL is returned -+ * The flow is selected based on the shortest RTT. -+ * If all paths have full cong windows, we simply return NULL. -+ * -+ * Additionally, this function is aware of the backup-subflows. -+ */ -+static struct sock *get_available_subflow(struct sock *meta_sk, -+ struct sk_buff *skb, -+ bool zero_wnd_test) -+{ -+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL; -+ u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff; -+ int cnt_backups = 0; -+ -+ /* if there is only one subflow, bypass the scheduling function */ -+ if (mpcb->cnt_subflows == 1) { -+ bestsk = (struct sock *)mpcb->connection_list; -+ if (!mptcp_is_available(bestsk, skb, zero_wnd_test)) -+ bestsk = NULL; -+ return bestsk; -+ } -+ -+ /* Answer data_fin on same subflow!!! */ -+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && -+ skb && mptcp_is_data_fin(skb)) { -+ mptcp_for_each_sk(mpcb, sk) { -+ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && -+ mptcp_is_available(sk, skb, zero_wnd_test)) -+ return sk; -+ } -+ } -+ -+ /* First, find the best subflow */ -+ mptcp_for_each_sk(mpcb, sk) { -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) -+ cnt_backups++; -+ -+ if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && -+ tp->srtt_us < lowprio_min_time_to_peer) { -+ if (!mptcp_is_available(sk, skb, zero_wnd_test)) -+ continue; -+ -+ if (mptcp_dont_reinject_skb(tp, skb)) { -+ backupsk = sk; -+ continue; -+ } -+ -+ lowprio_min_time_to_peer = tp->srtt_us; -+ lowpriosk = sk; -+ } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && -+ tp->srtt_us < min_time_to_peer) { -+ if (!mptcp_is_available(sk, skb, zero_wnd_test)) -+ continue; -+ -+ if (mptcp_dont_reinject_skb(tp, skb)) { -+ backupsk = sk; -+ continue; -+ } -+ -+ min_time_to_peer = tp->srtt_us; -+ bestsk = sk; -+ } -+ } -+ -+ if (mpcb->cnt_established == cnt_backups && lowpriosk) { -+ sk = lowpriosk; -+ } else if (bestsk) { -+ sk = bestsk; -+ } else if (backupsk) { -+ /* It has been sent on all subflows once - let's give it a -+ * chance again by restarting its pathmask. -+ */ -+ if (skb) -+ TCP_SKB_CB(skb)->path_mask = 0; -+ sk = backupsk; -+ } -+ -+ return sk; -+} -+ -+static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal) -+{ -+ struct sock *meta_sk; -+ const struct tcp_sock *tp = tcp_sk(sk); -+ struct tcp_sock *tp_it; -+ struct sk_buff *skb_head; -+ struct defsched_priv *dsp = defsched_get_priv(tp); -+ -+ if (tp->mpcb->cnt_subflows == 1) -+ return NULL; -+ -+ meta_sk = mptcp_meta_sk(sk); -+ skb_head = tcp_write_queue_head(meta_sk); -+ -+ if (!skb_head || skb_head == tcp_send_head(meta_sk)) -+ return NULL; -+ -+ /* If penalization is optional (coming from mptcp_next_segment() and -+ * We are not send-buffer-limited we do not penalize. The retransmission -+ * is just an optimization to fix the idle-time due to the delay before -+ * we wake up the application. -+ */ -+ if (!penal && sk_stream_memory_free(meta_sk)) -+ goto retrans; -+ -+ /* Only penalize again after an RTT has elapsed */ -+ if (tcp_time_stamp - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3)) -+ goto retrans; -+ -+ /* Half the cwnd of the slow flow */ -+ mptcp_for_each_tp(tp->mpcb, tp_it) { -+ if (tp_it != tp && -+ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { -+ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { -+ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); -+ if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH) -+ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); -+ -+ dsp->last_rbuf_opti = tcp_time_stamp; -+ } -+ break; -+ } -+ } -+ -+retrans: -+ -+ /* Segment not yet injected into this path? Take it!!! */ -+ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { -+ bool do_retrans = false; -+ mptcp_for_each_tp(tp->mpcb, tp_it) { -+ if (tp_it != tp && -+ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { -+ if (tp_it->snd_cwnd <= 4) { -+ do_retrans = true; -+ break; -+ } -+ -+ if (4 * tp->srtt_us >= tp_it->srtt_us) { -+ do_retrans = false; -+ break; -+ } else { -+ do_retrans = true; -+ } -+ } -+ } -+ -+ if (do_retrans && mptcp_is_available(sk, skb_head, false)) -+ return skb_head; -+ } -+ return NULL; -+} -+ -+/* Returns the next segment to be sent from the mptcp meta-queue. -+ * (chooses the reinject queue if any segment is waiting in it, otherwise, -+ * chooses the normal write queue). -+ * Sets *@reinject to 1 if the returned segment comes from the -+ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, -+ * and sets it to -1 if it is a meta-level retransmission to optimize the -+ * receive-buffer. -+ */ -+static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject) -+{ -+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; -+ struct sk_buff *skb = NULL; -+ -+ *reinject = 0; -+ -+ /* If we are in fallback-mode, just take from the meta-send-queue */ -+ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) -+ return tcp_send_head(meta_sk); -+ -+ skb = skb_peek(&mpcb->reinject_queue); -+ -+ if (skb) { -+ *reinject = 1; -+ } else { -+ skb = tcp_send_head(meta_sk); -+ -+ if (!skb && meta_sk->sk_socket && -+ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) && -+ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) { -+ struct sock *subsk = get_available_subflow(meta_sk, NULL, -+ false); -+ if (!subsk) -+ return NULL; -+ -+ skb = mptcp_rcv_buf_optimization(subsk, 0); -+ if (skb) -+ *reinject = -1; -+ } -+ } -+ return skb; -+} -+ -+static struct sk_buff *mptcp_next_segment(struct sock *meta_sk, -+ int *reinject, -+ struct sock **subsk, -+ unsigned int *limit) -+{ -+ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject); -+ unsigned int mss_now; -+ struct tcp_sock *subtp; -+ u16 gso_max_segs; -+ u32 max_len, max_segs, window, needed; -+ -+ /* As we set it, we have to reset it as well. */ -+ *limit = 0; -+ -+ if (!skb) -+ return NULL; -+ -+ *subsk = get_available_subflow(meta_sk, skb, false); -+ if (!*subsk) -+ return NULL; -+ -+ subtp = tcp_sk(*subsk); -+ mss_now = tcp_current_mss(*subsk); -+ -+ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) { -+ skb = mptcp_rcv_buf_optimization(*subsk, 1); -+ if (skb) -+ *reinject = -1; -+ else -+ return NULL; -+ } -+ -+ /* No splitting required, as we will only send one single segment */ -+ if (skb->len <= mss_now) -+ return skb; -+ -+ /* The following is similar to tcp_mss_split_point, but -+ * we do not care about nagle, because we will anyways -+ * use TCP_NAGLE_PUSH, which overrides this. -+ * -+ * So, we first limit according to the cwnd/gso-size and then according -+ * to the subflow's window. -+ */ -+ -+ gso_max_segs = (*subsk)->sk_gso_max_segs; -+ if (!gso_max_segs) /* No gso supported on the subflow's NIC */ -+ gso_max_segs = 1; -+ max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs); -+ if (!max_segs) -+ return NULL; -+ -+ max_len = mss_now * max_segs; -+ window = tcp_wnd_end(subtp) - subtp->write_seq; -+ -+ needed = min(skb->len, window); -+ if (max_len <= skb->len) -+ /* Take max_win, which is actually the cwnd/gso-size */ -+ *limit = max_len; -+ else -+ /* Or, take the window */ -+ *limit = needed; -+ -+ return skb; -+} -+ -+static void defsched_init(struct sock *sk) -+{ -+ struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk)); -+ -+ dsp->last_rbuf_opti = tcp_time_stamp; -+} -+ -+struct mptcp_sched_ops mptcp_sched_default = { -+ .get_subflow = get_available_subflow, -+ .next_segment = mptcp_next_segment, -+ .init = defsched_init, -+ .name = "default", -+ .owner = THIS_MODULE, -+}; -+ -+static struct mptcp_sched_ops *mptcp_sched_find(const char *name) -+{ -+ struct mptcp_sched_ops *e; -+ -+ list_for_each_entry_rcu(e, &mptcp_sched_list, list) { -+ if (strcmp(e->name, name) == 0) -+ return e; -+ } -+ -+ return NULL; -+} -+ -+int mptcp_register_scheduler(struct mptcp_sched_ops *sched) -+{ -+ int ret = 0; -+ -+ if (!sched->get_subflow || !sched->next_segment) -+ return -EINVAL; -+ -+ spin_lock(&mptcp_sched_list_lock); -+ if (mptcp_sched_find(sched->name)) { -+ pr_notice("%s already registered\n", sched->name); -+ ret = -EEXIST; -+ } else { -+ list_add_tail_rcu(&sched->list, &mptcp_sched_list); -+ pr_info("%s registered\n", sched->name); -+ } -+ spin_unlock(&mptcp_sched_list_lock); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(mptcp_register_scheduler); -+ -+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) -+{ -+ spin_lock(&mptcp_sched_list_lock); -+ list_del_rcu(&sched->list); -+ spin_unlock(&mptcp_sched_list_lock); -+} -+EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler); -+ -+void mptcp_get_default_scheduler(char *name) -+{ -+ struct mptcp_sched_ops *sched; -+ -+ BUG_ON(list_empty(&mptcp_sched_list)); -+ -+ rcu_read_lock(); -+ sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list); -+ strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX); -+ rcu_read_unlock(); -+} -+ -+int mptcp_set_default_scheduler(const char *name) -+{ -+ struct mptcp_sched_ops *sched; -+ int ret = -ENOENT; -+ -+ spin_lock(&mptcp_sched_list_lock); -+ sched = mptcp_sched_find(name); -+#ifdef CONFIG_MODULES -+ if (!sched && capable(CAP_NET_ADMIN)) { -+ spin_unlock(&mptcp_sched_list_lock); -+ -+ request_module("mptcp_%s", name); -+ spin_lock(&mptcp_sched_list_lock); -+ sched = mptcp_sched_find(name); -+ } -+#endif -+ -+ if (sched) { -+ list_move(&sched->list, &mptcp_sched_list); -+ ret = 0; -+ } else { -+ pr_info("%s is not available\n", name); -+ } -+ spin_unlock(&mptcp_sched_list_lock); -+ -+ return ret; -+} -+ -+void mptcp_init_scheduler(struct mptcp_cb *mpcb) -+{ -+ struct mptcp_sched_ops *sched; -+ -+ rcu_read_lock(); -+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { -+ if (try_module_get(sched->owner)) { -+ mpcb->sched_ops = sched; -+ break; -+ } -+ } -+ rcu_read_unlock(); -+} -+ -+/* Manage refcounts on socket close. */ -+void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb) -+{ -+ module_put(mpcb->sched_ops->owner); -+} -+ -+/* Set default value from kernel configuration at bootup */ -+static int __init mptcp_scheduler_default(void) -+{ -+ BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE); -+ -+ return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED); -+} -+late_initcall(mptcp_scheduler_default); -diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c -new file mode 100644 -index 000000000000..29ca1d868d17 ---- /dev/null -+++ b/net/mptcp/mptcp_wvegas.c -@@ -0,0 +1,268 @@ -+/* -+ * MPTCP implementation - WEIGHTED VEGAS -+ * -+ * Algorithm design: -+ * Yu Cao -+ * Mingwei Xu -+ * Xiaoming Fu -+ * -+ * Implementation: -+ * Yu Cao -+ * Enhuan Dong -+ * -+ * Ported to the official MPTCP-kernel: -+ * Christoph Paasch -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+static int initial_alpha = 2; -+static int total_alpha = 10; -+static int gamma = 1; -+ -+module_param(initial_alpha, int, 0644); -+MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows"); -+module_param(total_alpha, int, 0644); -+MODULE_PARM_DESC(total_alpha, "total alpha for all subflows"); -+module_param(gamma, int, 0644); -+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); -+ -+#define MPTCP_WVEGAS_SCALE 16 -+ -+/* wVegas variables */ -+struct wvegas { -+ u32 beg_snd_nxt; /* right edge during last RTT */ -+ u8 doing_wvegas_now;/* if true, do wvegas for this RTT */ -+ -+ u16 cnt_rtt; /* # of RTTs measured within last RTT */ -+ u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */ -+ u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */ -+ -+ u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */ -+ u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */ -+ int alpha; /* alpha for each subflows */ -+ -+ u32 queue_delay; /* queue delay*/ -+}; -+ -+ -+static inline u64 mptcp_wvegas_scale(u32 val, int scale) -+{ -+ return (u64) val << scale; -+} -+ -+static void wvegas_enable(const struct sock *sk) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ struct wvegas *wvegas = inet_csk_ca(sk); -+ -+ wvegas->doing_wvegas_now = 1; -+ -+ wvegas->beg_snd_nxt = tp->snd_nxt; -+ -+ wvegas->cnt_rtt = 0; -+ wvegas->sampled_rtt = 0; -+ -+ wvegas->instant_rate = 0; -+ wvegas->alpha = initial_alpha; -+ wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE); -+ -+ wvegas->queue_delay = 0; -+} -+ -+static inline void wvegas_disable(const struct sock *sk) -+{ -+ struct wvegas *wvegas = inet_csk_ca(sk); -+ -+ wvegas->doing_wvegas_now = 0; -+} -+ -+static void mptcp_wvegas_init(struct sock *sk) -+{ -+ struct wvegas *wvegas = inet_csk_ca(sk); -+ -+ wvegas->base_rtt = 0x7fffffff; -+ wvegas_enable(sk); -+} -+ -+static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us) -+{ -+ return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us); -+} -+ -+static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) -+{ -+ struct wvegas *wvegas = inet_csk_ca(sk); -+ u32 vrtt; -+ -+ if (rtt_us < 0) -+ return; -+ -+ vrtt = rtt_us + 1; -+ -+ if (vrtt < wvegas->base_rtt) -+ wvegas->base_rtt = vrtt; -+ -+ wvegas->sampled_rtt += vrtt; -+ wvegas->cnt_rtt++; -+} -+ -+static void mptcp_wvegas_state(struct sock *sk, u8 ca_state) -+{ -+ if (ca_state == TCP_CA_Open) -+ wvegas_enable(sk); -+ else -+ wvegas_disable(sk); -+} -+ -+static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) -+{ -+ if (event == CA_EVENT_CWND_RESTART) { -+ mptcp_wvegas_init(sk); -+ } else if (event == CA_EVENT_LOSS) { -+ struct wvegas *wvegas = inet_csk_ca(sk); -+ wvegas->instant_rate = 0; -+ } -+} -+ -+static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp) -+{ -+ return min(tp->snd_ssthresh, tp->snd_cwnd - 1); -+} -+ -+static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk) -+{ -+ u64 total_rate = 0; -+ struct sock *sub_sk; -+ const struct wvegas *wvegas = inet_csk_ca(sk); -+ -+ if (!mpcb) -+ return wvegas->weight; -+ -+ -+ mptcp_for_each_sk(mpcb, sub_sk) { -+ struct wvegas *sub_wvegas = inet_csk_ca(sub_sk); -+ -+ /* sampled_rtt is initialized by 0 */ -+ if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0)) -+ total_rate += sub_wvegas->instant_rate; -+ } -+ -+ if (total_rate && wvegas->instant_rate) -+ return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate); -+ else -+ return wvegas->weight; -+} -+ -+static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct wvegas *wvegas = inet_csk_ca(sk); -+ -+ if (!wvegas->doing_wvegas_now) { -+ tcp_reno_cong_avoid(sk, ack, acked); -+ return; -+ } -+ -+ if (after(ack, wvegas->beg_snd_nxt)) { -+ wvegas->beg_snd_nxt = tp->snd_nxt; -+ -+ if (wvegas->cnt_rtt <= 2) { -+ tcp_reno_cong_avoid(sk, ack, acked); -+ } else { -+ u32 rtt, diff, q_delay; -+ u64 target_cwnd; -+ -+ rtt = wvegas->sampled_rtt / wvegas->cnt_rtt; -+ target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt); -+ -+ diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt); -+ -+ if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { -+ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); -+ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp); -+ -+ } else if (tp->snd_cwnd <= tp->snd_ssthresh) { -+ tcp_slow_start(tp, acked); -+ } else { -+ if (diff >= wvegas->alpha) { -+ wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt); -+ wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk); -+ wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE)); -+ } -+ if (diff > wvegas->alpha) { -+ tp->snd_cwnd--; -+ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp); -+ } else if (diff < wvegas->alpha) { -+ tp->snd_cwnd++; -+ } -+ -+ /* Try to drain link queue if needed*/ -+ q_delay = rtt - wvegas->base_rtt; -+ if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay)) -+ wvegas->queue_delay = q_delay; -+ -+ if (q_delay >= 2 * wvegas->queue_delay) { -+ u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt); -+ tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE; -+ wvegas->queue_delay = 0; -+ } -+ } -+ -+ if (tp->snd_cwnd < 2) -+ tp->snd_cwnd = 2; -+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp) -+ tp->snd_cwnd = tp->snd_cwnd_clamp; -+ -+ tp->snd_ssthresh = tcp_current_ssthresh(sk); -+ } -+ -+ wvegas->cnt_rtt = 0; -+ wvegas->sampled_rtt = 0; -+ } -+ /* Use normal slow start */ -+ else if (tp->snd_cwnd <= tp->snd_ssthresh) -+ tcp_slow_start(tp, acked); -+} -+ -+ -+static struct tcp_congestion_ops mptcp_wvegas __read_mostly = { -+ .init = mptcp_wvegas_init, -+ .ssthresh = tcp_reno_ssthresh, -+ .cong_avoid = mptcp_wvegas_cong_avoid, -+ .pkts_acked = mptcp_wvegas_pkts_acked, -+ .set_state = mptcp_wvegas_state, -+ .cwnd_event = mptcp_wvegas_cwnd_event, -+ -+ .owner = THIS_MODULE, -+ .name = "wvegas", -+}; -+ -+static int __init mptcp_wvegas_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE); -+ tcp_register_congestion_control(&mptcp_wvegas); -+ return 0; -+} -+ -+static void __exit mptcp_wvegas_unregister(void) -+{ -+ tcp_unregister_congestion_control(&mptcp_wvegas); -+} -+ -+module_init(mptcp_wvegas_register); -+module_exit(mptcp_wvegas_unregister); -+ -+MODULE_AUTHOR("Yu Cao, Enhuan Dong"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("MPTCP wVegas"); -+MODULE_VERSION("0.1");