From: "Mike Pagano" <mpagano@gentoo.org>
To: gentoo-commits@lists.gentoo.org
Subject: [gentoo-commits] proj/linux-patches:3.16 commit in: /
Date: Mon, 6 Oct 2014 11:38:44 +0000 (UTC) [thread overview]
Message-ID: <1412595522.f2ea3e49d07e5b148c974633ec003ba2382f1189.mpagano@gentoo> (raw)
commit: f2ea3e49d07e5b148c974633ec003ba2382f1189
Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Mon Oct 6 11:38:42 2014 +0000
Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Mon Oct 6 11:38:42 2014 +0000
URL: http://sources.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=f2ea3e49
Move multipath to experimental.
---
5010_multipath-tcp-v3.16-872d7f6c6f4e.patch | 19230 ++++++++++++++++++++++++++
1 file changed, 19230 insertions(+)
diff --git a/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch b/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
new file mode 100644
index 0000000..3000da3
--- /dev/null
+++ b/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
@@ -0,0 +1,19230 @@
+diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
+index 768a0fb67dd6..5a46d91a8df9 100644
+--- a/drivers/infiniband/hw/cxgb4/cm.c
++++ b/drivers/infiniband/hw/cxgb4/cm.c
+@@ -3432,7 +3432,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
+ */
+ memset(&tmp_opt, 0, sizeof(tmp_opt));
+ tcp_clear_options(&tmp_opt);
+- tcp_parse_options(skb, &tmp_opt, 0, NULL);
++ tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
+
+ req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
+ memset(req, 0, sizeof(*req));
+diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
+index 2faef339d8f2..d86c853ffaad 100644
+--- a/include/linux/ipv6.h
++++ b/include/linux/ipv6.h
+@@ -256,16 +256,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
+ return inet_sk(__sk)->pinet6;
+ }
+
+-static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
+-{
+- struct request_sock *req = reqsk_alloc(ops);
+-
+- if (req)
+- inet_rsk(req)->pktopts = NULL;
+-
+- return req;
+-}
+-
+ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
+ {
+ return (struct raw6_sock *)sk;
+@@ -309,12 +299,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
+ return NULL;
+ }
+
+-static inline struct inet6_request_sock *
+- inet6_rsk(const struct request_sock *rsk)
+-{
+- return NULL;
+-}
+-
+ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
+ {
+ return NULL;
+diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
+index ec89301ada41..99ea4b0e3693 100644
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2784,8 +2784,10 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
+ bool zero_okay,
+ __sum16 check)
+ {
+- if (skb_csum_unnecessary(skb) || (zero_okay && !check)) {
+- skb->csum_valid = 1;
++ if (skb_csum_unnecessary(skb)) {
++ return false;
++ } else if (zero_okay && !check) {
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ return false;
+ }
+
+diff --git a/include/linux/tcp.h b/include/linux/tcp.h
+index a0513210798f..7bc2e078d6ca 100644
+--- a/include/linux/tcp.h
++++ b/include/linux/tcp.h
+@@ -53,7 +53,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
+ /* TCP Fast Open */
+ #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
+ #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
+-#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
++#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */
+
+ /* TCP Fast Open Cookie as stored in memory */
+ struct tcp_fastopen_cookie {
+@@ -72,6 +72,51 @@ struct tcp_sack_block {
+ u32 end_seq;
+ };
+
++struct tcp_out_options {
++ u16 options; /* bit field of OPTION_* */
++ u8 ws; /* window scale, 0 to disable */
++ u8 num_sack_blocks;/* number of SACK blocks to include */
++ u8 hash_size; /* bytes in hash_location */
++ u16 mss; /* 0 to disable */
++ __u8 *hash_location; /* temporary pointer, overloaded */
++ __u32 tsval, tsecr; /* need to include OPTION_TS */
++ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
++#ifdef CONFIG_MPTCP
++ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
++ u8 dss_csum:1,
++ add_addr_v4:1,
++ add_addr_v6:1; /* dss-checksum required? */
++
++ union {
++ struct {
++ __u64 sender_key; /* sender's key for mptcp */
++ __u64 receiver_key; /* receiver's key for mptcp */
++ } mp_capable;
++
++ struct {
++ __u64 sender_truncated_mac;
++ __u32 sender_nonce;
++ /* random number of the sender */
++ __u32 token; /* token for mptcp */
++ u8 low_prio:1;
++ } mp_join_syns;
++ };
++
++ struct {
++ struct in_addr addr;
++ u8 addr_id;
++ } add_addr4;
++
++ struct {
++ struct in6_addr addr;
++ u8 addr_id;
++ } add_addr6;
++
++ u16 remove_addrs; /* list of address id */
++ u8 addr_id; /* address id (mp_join or add_address) */
++#endif /* CONFIG_MPTCP */
++};
++
+ /*These are used to set the sack_ok field in struct tcp_options_received */
+ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
+ #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
+@@ -95,6 +140,9 @@ struct tcp_options_received {
+ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
+ };
+
++struct mptcp_cb;
++struct mptcp_tcp_sock;
++
+ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
+ {
+ rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
+@@ -111,10 +159,7 @@ struct tcp_request_sock_ops;
+
+ struct tcp_request_sock {
+ struct inet_request_sock req;
+-#ifdef CONFIG_TCP_MD5SIG
+- /* Only used by TCP MD5 Signature so far. */
+ const struct tcp_request_sock_ops *af_specific;
+-#endif
+ struct sock *listener; /* needed for TFO */
+ u32 rcv_isn;
+ u32 snt_isn;
+@@ -130,6 +175,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
+ return (struct tcp_request_sock *)req;
+ }
+
++struct tcp_md5sig_key;
++
+ struct tcp_sock {
+ /* inet_connection_sock has to be the first member of tcp_sock */
+ struct inet_connection_sock inet_conn;
+@@ -326,6 +373,37 @@ struct tcp_sock {
+ * socket. Used to retransmit SYNACKs etc.
+ */
+ struct request_sock *fastopen_rsk;
++
++ /* MPTCP/TCP-specific callbacks */
++ const struct tcp_sock_ops *ops;
++
++ struct mptcp_cb *mpcb;
++ struct sock *meta_sk;
++ /* We keep these flags even if CONFIG_MPTCP is not checked, because
++ * it allows checking MPTCP capability just by checking the mpc flag,
++ * rather than adding ifdefs everywhere.
++ */
++ u16 mpc:1, /* Other end is multipath capable */
++ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
++ send_mp_fclose:1,
++ request_mptcp:1, /* Did we send out an MP_CAPABLE?
++ * (this speeds up mptcp_doit() in tcp_recvmsg)
++ */
++ mptcp_enabled:1, /* Is MPTCP enabled from the application ? */
++ pf:1, /* Potentially Failed state: when this flag is set, we
++ * stop using the subflow
++ */
++ mp_killed:1, /* Killed with a tcp_done in mptcp? */
++ was_meta_sk:1, /* This was a meta sk (in case of reuse) */
++ is_master_sk,
++ close_it:1, /* Must close socket in mptcp_data_ready? */
++ closing:1;
++ struct mptcp_tcp_sock *mptcp;
++#ifdef CONFIG_MPTCP
++ struct hlist_nulls_node tk_table;
++ u32 mptcp_loc_token;
++ u64 mptcp_loc_key;
++#endif /* CONFIG_MPTCP */
+ };
+
+ enum tsq_flags {
+@@ -337,6 +415,8 @@ enum tsq_flags {
+ TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
+ * tcp_v{4|6}_mtu_reduced()
+ */
++ MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */
++ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
+ };
+
+ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+@@ -355,6 +435,7 @@ struct tcp_timewait_sock {
+ #ifdef CONFIG_TCP_MD5SIG
+ struct tcp_md5sig_key *tw_md5_key;
+ #endif
++ struct mptcp_tw *mptcp_tw;
+ };
+
+ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
+diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
+index 74af137304be..83f63033897a 100644
+--- a/include/net/inet6_connection_sock.h
++++ b/include/net/inet6_connection_sock.h
+@@ -27,6 +27,8 @@ int inet6_csk_bind_conflict(const struct sock *sk,
+
+ struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
+ const struct request_sock *req);
++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
++ const u32 rnd, const u32 synq_hsize);
+
+ struct request_sock *inet6_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+diff --git a/include/net/inet_common.h b/include/net/inet_common.h
+index fe7994c48b75..780f229f46a8 100644
+--- a/include/net/inet_common.h
++++ b/include/net/inet_common.h
+@@ -1,6 +1,8 @@
+ #ifndef _INET_COMMON_H
+ #define _INET_COMMON_H
+
++#include <net/sock.h>
++
+ extern const struct proto_ops inet_stream_ops;
+ extern const struct proto_ops inet_dgram_ops;
+
+@@ -13,6 +15,8 @@ struct sock;
+ struct sockaddr;
+ struct socket;
+
++int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
+ int inet_release(struct socket *sock);
+ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags);
+diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
+index 7a4313887568..f62159e39839 100644
+--- a/include/net/inet_connection_sock.h
++++ b/include/net/inet_connection_sock.h
+@@ -30,6 +30,7 @@
+
+ struct inet_bind_bucket;
+ struct tcp_congestion_ops;
++struct tcp_options_received;
+
+ /*
+ * Pointers to address related TCP functions
+@@ -243,6 +244,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
+
+ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
+
++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
++ const u32 synq_hsize);
++
+ struct request_sock *inet_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+ const __be16 rport,
+diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
+index b1edf17bec01..6a32d8d6b85e 100644
+--- a/include/net/inet_sock.h
++++ b/include/net/inet_sock.h
+@@ -86,10 +86,14 @@ struct inet_request_sock {
+ wscale_ok : 1,
+ ecn_ok : 1,
+ acked : 1,
+- no_srccheck: 1;
++ no_srccheck: 1,
++ mptcp_rqsk : 1,
++ saw_mpc : 1;
+ kmemcheck_bitfield_end(flags);
+- struct ip_options_rcu *opt;
+- struct sk_buff *pktopts;
++ union {
++ struct ip_options_rcu *opt;
++ struct sk_buff *pktopts;
++ };
+ u32 ir_mark;
+ };
+
+diff --git a/include/net/mptcp.h b/include/net/mptcp.h
+new file mode 100644
+index 000000000000..712780fc39e4
+--- /dev/null
++++ b/include/net/mptcp.h
+@@ -0,0 +1,1439 @@
++/*
++ * MPTCP implementation
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#ifndef _MPTCP_H
++#define _MPTCP_H
++
++#include <linux/inetdevice.h>
++#include <linux/ipv6.h>
++#include <linux/list.h>
++#include <linux/net.h>
++#include <linux/netpoll.h>
++#include <linux/skbuff.h>
++#include <linux/socket.h>
++#include <linux/tcp.h>
++#include <linux/kernel.h>
++
++#include <asm/byteorder.h>
++#include <asm/unaligned.h>
++#include <crypto/hash.h>
++#include <net/tcp.h>
++
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ #define ntohll(x) be64_to_cpu(x)
++ #define htonll(x) cpu_to_be64(x)
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ #define ntohll(x) (x)
++ #define htonll(x) (x)
++#endif
++
++struct mptcp_loc4 {
++ u8 loc4_id;
++ u8 low_prio:1;
++ struct in_addr addr;
++};
++
++struct mptcp_rem4 {
++ u8 rem4_id;
++ __be16 port;
++ struct in_addr addr;
++};
++
++struct mptcp_loc6 {
++ u8 loc6_id;
++ u8 low_prio:1;
++ struct in6_addr addr;
++};
++
++struct mptcp_rem6 {
++ u8 rem6_id;
++ __be16 port;
++ struct in6_addr addr;
++};
++
++struct mptcp_request_sock {
++ struct tcp_request_sock req;
++ /* hlist-nulls entry to the hash-table. Depending on whether this is a
++ * a new MPTCP connection or an additional subflow, the request-socket
++ * is either in the mptcp_reqsk_tk_htb or mptcp_reqsk_htb.
++ */
++ struct hlist_nulls_node hash_entry;
++
++ union {
++ struct {
++ /* Only on initial subflows */
++ u64 mptcp_loc_key;
++ u64 mptcp_rem_key;
++ u32 mptcp_loc_token;
++ };
++
++ struct {
++ /* Only on additional subflows */
++ struct mptcp_cb *mptcp_mpcb;
++ u32 mptcp_rem_nonce;
++ u32 mptcp_loc_nonce;
++ u64 mptcp_hash_tmac;
++ };
++ };
++
++ u8 loc_id;
++ u8 rem_id; /* Address-id in the MP_JOIN */
++ u8 dss_csum:1,
++ is_sub:1, /* Is this a new subflow? */
++ low_prio:1, /* Interface set to low-prio? */
++ rcv_low_prio:1;
++};
++
++struct mptcp_options_received {
++ u16 saw_mpc:1,
++ dss_csum:1,
++ drop_me:1,
++
++ is_mp_join:1,
++ join_ack:1,
++
++ saw_low_prio:2, /* 0x1 - low-prio set for this subflow
++ * 0x2 - low-prio set for another subflow
++ */
++ low_prio:1,
++
++ saw_add_addr:2, /* Saw at least one add_addr option:
++ * 0x1: IPv4 - 0x2: IPv6
++ */
++ more_add_addr:1, /* Saw one more add-addr. */
++
++ saw_rem_addr:1, /* Saw at least one rem_addr option */
++ more_rem_addr:1, /* Saw one more rem-addr. */
++
++ mp_fail:1,
++ mp_fclose:1;
++ u8 rem_id; /* Address-id in the MP_JOIN */
++ u8 prio_addr_id; /* Address-id in the MP_PRIO */
++
++ const unsigned char *add_addr_ptr; /* Pointer to add-address option */
++ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
++
++ u32 data_ack;
++ u32 data_seq;
++ u16 data_len;
++
++ u32 mptcp_rem_token;/* Remote token */
++
++ /* Key inside the option (from mp_capable or fast_close) */
++ u64 mptcp_key;
++
++ u32 mptcp_recv_nonce;
++ u64 mptcp_recv_tmac;
++ u8 mptcp_recv_mac[20];
++};
++
++struct mptcp_tcp_sock {
++ struct tcp_sock *next; /* Next subflow socket */
++ struct hlist_node cb_list;
++ struct mptcp_options_received rx_opt;
++
++ /* Those three fields record the current mapping */
++ u64 map_data_seq;
++ u32 map_subseq;
++ u16 map_data_len;
++ u16 slave_sk:1,
++ fully_established:1,
++ establish_increased:1,
++ second_packet:1,
++ attached:1,
++ send_mp_fail:1,
++ include_mpc:1,
++ mapping_present:1,
++ map_data_fin:1,
++ low_prio:1, /* use this socket as backup */
++ rcv_low_prio:1, /* Peer sent low-prio option to us */
++ send_mp_prio:1, /* Trigger to send mp_prio on this socket */
++ pre_established:1; /* State between sending 3rd ACK and
++ * receiving the fourth ack of new subflows.
++ */
++
++ /* isn: needed to translate abs to relative subflow seqnums */
++ u32 snt_isn;
++ u32 rcv_isn;
++ u8 path_index;
++ u8 loc_id;
++ u8 rem_id;
++
++#define MPTCP_SCHED_SIZE 4
++ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8);
++
++ struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified
++ * skb in the ofo-queue.
++ */
++
++ int init_rcv_wnd;
++ u32 infinite_cutoff_seq;
++ struct delayed_work work;
++ u32 mptcp_loc_nonce;
++ struct tcp_sock *tp; /* Where is my daddy? */
++ u32 last_end_data_seq;
++
++ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
++ struct timer_list mptcp_ack_timer;
++
++ /* HMAC of the third ack */
++ char sender_mac[20];
++};
++
++struct mptcp_tw {
++ struct list_head list;
++ u64 loc_key;
++ u64 rcv_nxt;
++ struct mptcp_cb __rcu *mpcb;
++ u8 meta_tw:1,
++ in_list:1;
++};
++
++#define MPTCP_PM_NAME_MAX 16
++struct mptcp_pm_ops {
++ struct list_head list;
++
++ /* Signal the creation of a new MPTCP-session. */
++ void (*new_session)(const struct sock *meta_sk);
++ void (*release_sock)(struct sock *meta_sk);
++ void (*fully_established)(struct sock *meta_sk);
++ void (*new_remote_address)(struct sock *meta_sk);
++ int (*get_local_id)(sa_family_t family, union inet_addr *addr,
++ struct net *net, bool *low_prio);
++ void (*addr_signal)(struct sock *sk, unsigned *size,
++ struct tcp_out_options *opts, struct sk_buff *skb);
++ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr,
++ sa_family_t family, __be16 port, u8 id);
++ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id);
++ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr);
++ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr);
++
++ char name[MPTCP_PM_NAME_MAX];
++ struct module *owner;
++};
++
++#define MPTCP_SCHED_NAME_MAX 16
++struct mptcp_sched_ops {
++ struct list_head list;
++
++ struct sock * (*get_subflow)(struct sock *meta_sk,
++ struct sk_buff *skb,
++ bool zero_wnd_test);
++ struct sk_buff * (*next_segment)(struct sock *meta_sk,
++ int *reinject,
++ struct sock **subsk,
++ unsigned int *limit);
++ void (*init)(struct sock *sk);
++
++ char name[MPTCP_SCHED_NAME_MAX];
++ struct module *owner;
++};
++
++struct mptcp_cb {
++ /* list of sockets in this multipath connection */
++ struct tcp_sock *connection_list;
++ /* list of sockets that need a call to release_cb */
++ struct hlist_head callback_list;
++
++ /* High-order bits of 64-bit sequence numbers */
++ u32 snd_high_order[2];
++ u32 rcv_high_order[2];
++
++ u16 send_infinite_mapping:1,
++ in_time_wait:1,
++ list_rcvd:1, /* XXX TO REMOVE */
++ addr_signal:1, /* Path-manager wants us to call addr_signal */
++ dss_csum:1,
++ server_side:1,
++ infinite_mapping_rcv:1,
++ infinite_mapping_snd:1,
++ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
++ passive_close:1,
++ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
++ rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
++
++ /* socket count in this connection */
++ u8 cnt_subflows;
++ u8 cnt_established;
++
++ struct mptcp_sched_ops *sched_ops;
++
++ struct sk_buff_head reinject_queue;
++ /* First cache-line boundary is here minus 8 bytes. But from the
++ * reinject-queue only the next and prev pointers are regularly
++ * accessed. Thus, the whole data-path is on a single cache-line.
++ */
++
++ u64 csum_cutoff_seq;
++
++ /***** Start of fields, used for connection closure */
++ spinlock_t tw_lock;
++ unsigned char mptw_state;
++ u8 dfin_path_index;
++
++ struct list_head tw_list;
++
++ /***** Start of fields, used for subflow establishment and closure */
++ atomic_t mpcb_refcnt;
++
++ /* Mutex needed, because otherwise mptcp_close will complain that the
++ * socket is owned by the user.
++ * E.g., mptcp_sub_close_wq is taking the meta-lock.
++ */
++ struct mutex mpcb_mutex;
++
++ /***** Start of fields, used for subflow establishment */
++ struct sock *meta_sk;
++
++ /* Master socket, also part of the connection_list, this
++ * socket is the one that the application sees.
++ */
++ struct sock *master_sk;
++
++ __u64 mptcp_loc_key;
++ __u64 mptcp_rem_key;
++ __u32 mptcp_loc_token;
++ __u32 mptcp_rem_token;
++
++#define MPTCP_PM_SIZE 608
++ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
++ struct mptcp_pm_ops *pm_ops;
++
++ u32 path_index_bits;
++ /* Next pi to pick up in case a new path becomes available */
++ u8 next_path_index;
++
++ /* Original snd/rcvbuf of the initial subflow.
++ * Used for the new subflows on the server-side to allow correct
++ * autotuning
++ */
++ int orig_sk_rcvbuf;
++ int orig_sk_sndbuf;
++ u32 orig_window_clamp;
++
++ /* Timer for retransmitting SYN/ACK+MP_JOIN */
++ struct timer_list synack_timer;
++};
++
++#define MPTCP_SUB_CAPABLE 0
++#define MPTCP_SUB_LEN_CAPABLE_SYN 12
++#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
++#define MPTCP_SUB_LEN_CAPABLE_ACK 20
++#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
++
++#define MPTCP_SUB_JOIN 1
++#define MPTCP_SUB_LEN_JOIN_SYN 12
++#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
++#define MPTCP_SUB_LEN_JOIN_SYNACK 16
++#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
++#define MPTCP_SUB_LEN_JOIN_ACK 24
++#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
++
++#define MPTCP_SUB_DSS 2
++#define MPTCP_SUB_LEN_DSS 4
++#define MPTCP_SUB_LEN_DSS_ALIGN 4
++
++/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
++ * as they are part of the DSS-option.
++ * To get the total length, just add the different options together.
++ */
++#define MPTCP_SUB_LEN_SEQ 10
++#define MPTCP_SUB_LEN_SEQ_CSUM 12
++#define MPTCP_SUB_LEN_SEQ_ALIGN 12
++
++#define MPTCP_SUB_LEN_SEQ_64 14
++#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
++#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
++
++#define MPTCP_SUB_LEN_ACK 4
++#define MPTCP_SUB_LEN_ACK_ALIGN 4
++
++#define MPTCP_SUB_LEN_ACK_64 8
++#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
++
++/* This is the "default" option-length we will send out most often.
++ * MPTCP DSS-header
++ * 32-bit data sequence number
++ * 32-bit data ack
++ *
++ * It is necessary to calculate the effective MSS we will be using when
++ * sending data.
++ */
++#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
++ MPTCP_SUB_LEN_SEQ_ALIGN + \
++ MPTCP_SUB_LEN_ACK_ALIGN)
++
++#define MPTCP_SUB_ADD_ADDR 3
++#define MPTCP_SUB_LEN_ADD_ADDR4 8
++#define MPTCP_SUB_LEN_ADD_ADDR6 20
++#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
++#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
++
++#define MPTCP_SUB_REMOVE_ADDR 4
++#define MPTCP_SUB_LEN_REMOVE_ADDR 4
++
++#define MPTCP_SUB_PRIO 5
++#define MPTCP_SUB_LEN_PRIO 3
++#define MPTCP_SUB_LEN_PRIO_ADDR 4
++#define MPTCP_SUB_LEN_PRIO_ALIGN 4
++
++#define MPTCP_SUB_FAIL 6
++#define MPTCP_SUB_LEN_FAIL 12
++#define MPTCP_SUB_LEN_FAIL_ALIGN 12
++
++#define MPTCP_SUB_FCLOSE 7
++#define MPTCP_SUB_LEN_FCLOSE 12
++#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
++
++
++#define OPTION_MPTCP (1 << 5)
++
++#ifdef CONFIG_MPTCP
++
++/* Used for checking if the mptcp initialization has been successful */
++extern bool mptcp_init_failed;
++
++/* MPTCP options */
++#define OPTION_TYPE_SYN (1 << 0)
++#define OPTION_TYPE_SYNACK (1 << 1)
++#define OPTION_TYPE_ACK (1 << 2)
++#define OPTION_MP_CAPABLE (1 << 3)
++#define OPTION_DATA_ACK (1 << 4)
++#define OPTION_ADD_ADDR (1 << 5)
++#define OPTION_MP_JOIN (1 << 6)
++#define OPTION_MP_FAIL (1 << 7)
++#define OPTION_MP_FCLOSE (1 << 8)
++#define OPTION_REMOVE_ADDR (1 << 9)
++#define OPTION_MP_PRIO (1 << 10)
++
++/* MPTCP flags: both TX and RX */
++#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */
++#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */
++#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */
++/* MPTCP flags: RX only */
++#define MPTCPHDR_ACK 0x08
++#define MPTCPHDR_SEQ64_SET 0x10 /* Did we received a 64-bit seq number? */
++#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
++#define MPTCPHDR_DSS_CSUM 0x40
++#define MPTCPHDR_JOIN 0x80
++/* MPTCP flags: TX only */
++#define MPTCPHDR_INF 0x08
++
++struct mptcp_option {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 ver:4,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ ver:4;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++};
++
++struct mp_capable {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 ver:4,
++ sub:4;
++ __u8 h:1,
++ rsv:5,
++ b:1,
++ a:1;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ ver:4;
++ __u8 a:1,
++ b:1,
++ rsv:5,
++ h:1;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u64 sender_key;
++ __u64 receiver_key;
++} __attribute__((__packed__));
++
++struct mp_join {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 b:1,
++ rsv:3,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ rsv:3,
++ b:1;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u8 addr_id;
++ union {
++ struct {
++ u32 token;
++ u32 nonce;
++ } syn;
++ struct {
++ __u64 mac;
++ u32 nonce;
++ } synack;
++ struct {
++ __u8 mac[20];
++ } ack;
++ } u;
++} __attribute__((__packed__));
++
++struct mp_dss {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u16 rsv1:4,
++ sub:4,
++ A:1,
++ a:1,
++ M:1,
++ m:1,
++ F:1,
++ rsv2:3;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u16 sub:4,
++ rsv1:4,
++ rsv2:3,
++ F:1,
++ m:1,
++ M:1,
++ a:1,
++ A:1;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++};
++
++struct mp_add_addr {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 ipver:4,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ ipver:4;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u8 addr_id;
++ union {
++ struct {
++ struct in_addr addr;
++ __be16 port;
++ } v4;
++ struct {
++ struct in6_addr addr;
++ __be16 port;
++ } v6;
++ } u;
++} __attribute__((__packed__));
++
++struct mp_remove_addr {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 rsv:4,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ rsv:4;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ /* list of addr_id */
++ __u8 addrs_id;
++};
++
++struct mp_fail {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u16 rsv1:4,
++ sub:4,
++ rsv2:8;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u16 sub:4,
++ rsv1:4,
++ rsv2:8;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __be64 data_seq;
++} __attribute__((__packed__));
++
++struct mp_fclose {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u16 rsv1:4,
++ sub:4,
++ rsv2:8;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u16 sub:4,
++ rsv1:4,
++ rsv2:8;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u64 key;
++} __attribute__((__packed__));
++
++struct mp_prio {
++ __u8 kind;
++ __u8 len;
++#if defined(__LITTLE_ENDIAN_BITFIELD)
++ __u8 b:1,
++ rsv:3,
++ sub:4;
++#elif defined(__BIG_ENDIAN_BITFIELD)
++ __u8 sub:4,
++ rsv:3,
++ b:1;
++#else
++#error "Adjust your <asm/byteorder.h> defines"
++#endif
++ __u8 addr_id;
++} __attribute__((__packed__));
++
++static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum)
++{
++ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
++}
++
++#define MPTCP_APP 2
++
++extern int sysctl_mptcp_enabled;
++extern int sysctl_mptcp_checksum;
++extern int sysctl_mptcp_debug;
++extern int sysctl_mptcp_syn_retries;
++
++extern struct workqueue_struct *mptcp_wq;
++
++#define mptcp_debug(fmt, args...) \
++ do { \
++ if (unlikely(sysctl_mptcp_debug)) \
++ pr_err(__FILE__ ": " fmt, ##args); \
++ } while (0)
++
++/* Iterates over all subflows */
++#define mptcp_for_each_tp(mpcb, tp) \
++ for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
++
++#define mptcp_for_each_sk(mpcb, sk) \
++ for ((sk) = (struct sock *)(mpcb)->connection_list; \
++ sk; \
++ sk = (struct sock *)tcp_sk(sk)->mptcp->next)
++
++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \
++ for (__sk = (struct sock *)(__mpcb)->connection_list, \
++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
++ __sk; \
++ __sk = __temp, \
++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
++
++/* Iterates over all bit set to 1 in a bitset */
++#define mptcp_for_each_bit_set(b, i) \
++ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
++
++#define mptcp_for_each_bit_unset(b, i) \
++ mptcp_for_each_bit_set(~b, i)
++
++extern struct lock_class_key meta_key;
++extern struct lock_class_key meta_slock_key;
++extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
++
++/* This is needed to ensure that two subsequent key/nonce-generation result in
++ * different keys/nonces if the IPs and ports are the same.
++ */
++extern u32 mptcp_seed;
++
++#define MPTCP_HASH_SIZE 1024
++
++extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
++
++/* This second hashtable is needed to retrieve request socks
++ * created as a result of a join request. While the SYN contains
++ * the token, the final ack does not, so we need a separate hashtable
++ * to retrieve the mpcb.
++ */
++extern struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
++extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
++
++/* Lock, protecting the two hash-tables that hold the token. Namely,
++ * mptcp_reqsk_tk_htb and tk_hashtable
++ */
++extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */
++
++/* Request-sockets can be hashed in the tk_htb for collision-detection or in
++ * the regular htb for join-connections. We need to define different NULLS
++ * values so that we can correctly detect a request-socket that has been
++ * recycled. See also c25eb3bfb9729.
++ */
++#define MPTCP_REQSK_NULLS_BASE (1U << 29)
++
++
++void mptcp_data_ready(struct sock *sk);
++void mptcp_write_space(struct sock *sk);
++
++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
++ struct sock *sk);
++void mptcp_ofo_queue(struct sock *meta_sk);
++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
++ gfp_t flags);
++void mptcp_del_sock(struct sock *sk);
++void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk);
++void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
++void mptcp_update_sndbuf(const struct tcp_sock *tp);
++void mptcp_send_fin(struct sock *meta_sk);
++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
++bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
++ int push_one, gfp_t gfp);
++void tcp_parse_mptcp_options(const struct sk_buff *skb,
++ struct mptcp_options_received *mopt);
++void mptcp_parse_options(const uint8_t *ptr, int opsize,
++ struct mptcp_options_received *mopt,
++ const struct sk_buff *skb);
++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
++ unsigned *remaining);
++void mptcp_synack_options(struct request_sock *req,
++ struct tcp_out_options *opts,
++ unsigned *remaining);
++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
++ struct tcp_out_options *opts, unsigned *size);
++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
++ const struct tcp_out_options *opts,
++ struct sk_buff *skb);
++void mptcp_close(struct sock *meta_sk, long timeout);
++int mptcp_doit(struct sock *sk);
++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
++int mptcp_check_req_master(struct sock *sk, struct sock *child,
++ struct request_sock *req,
++ struct request_sock **prev);
++struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
++ struct request_sock *req,
++ struct request_sock **prev,
++ const struct mptcp_options_received *mopt);
++u32 __mptcp_select_window(struct sock *sk);
++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
++ __u32 *window_clamp, int wscale_ok,
++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
++ const struct sock *sk);
++unsigned int mptcp_current_mss(struct sock *meta_sk);
++int mptcp_select_size(const struct sock *meta_sk, bool sg);
++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
++ u32 *hash_out);
++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
++void mptcp_fin(struct sock *meta_sk);
++void mptcp_retransmit_timer(struct sock *meta_sk);
++int mptcp_write_wakeup(struct sock *meta_sk);
++void mptcp_sub_close_wq(struct work_struct *work);
++void mptcp_sub_close(struct sock *sk, unsigned long delay);
++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
++void mptcp_fallback_meta_sk(struct sock *meta_sk);
++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
++void mptcp_ack_handler(unsigned long);
++int mptcp_check_rtt(const struct tcp_sock *tp, int time);
++int mptcp_check_snd_buf(const struct tcp_sock *tp);
++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
++ const struct sk_buff *skb);
++void __init mptcp_init(void);
++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
++void mptcp_destroy_sock(struct sock *sk);
++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
++ const struct sk_buff *skb,
++ const struct mptcp_options_received *mopt);
++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
++ int large_allowed);
++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw);
++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
++void mptcp_time_wait(struct sock *sk, int state, int timeo);
++void mptcp_disconnect(struct sock *sk);
++bool mptcp_should_expand_sndbuf(const struct sock *sk);
++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
++void mptcp_tsq_flags(struct sock *sk);
++void mptcp_tsq_sub_deferred(struct sock *meta_sk);
++struct mp_join *mptcp_find_join(const struct sk_buff *skb);
++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
++void mptcp_hash_remove(struct tcp_sock *meta_tp);
++struct sock *mptcp_hash_find(const struct net *net, const u32 token);
++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
++int mptcp_do_join_short(struct sk_buff *skb,
++ const struct mptcp_options_received *mopt,
++ struct net *net);
++void mptcp_reqsk_destructor(struct request_sock *req);
++void mptcp_reqsk_new_mptcp(struct request_sock *req,
++ const struct mptcp_options_received *mopt,
++ const struct sk_buff *skb);
++int mptcp_check_req(struct sk_buff *skb, struct net *net);
++void mptcp_connect_init(struct sock *sk);
++void mptcp_sub_force_close(struct sock *sk);
++int mptcp_sub_len_remove_addr_align(u16 bitfield);
++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
++ const struct sk_buff *skb);
++void mptcp_init_buffer_space(struct sock *sk);
++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
++ struct sk_buff *skb);
++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb);
++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb);
++void mptcp_init_congestion_control(struct sock *sk);
++
++/* MPTCP-path-manager registration/initialization functions */
++int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
++void mptcp_init_path_manager(struct mptcp_cb *mpcb);
++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
++void mptcp_fallback_default(struct mptcp_cb *mpcb);
++void mptcp_get_default_path_manager(char *name);
++int mptcp_set_default_path_manager(const char *name);
++extern struct mptcp_pm_ops mptcp_pm_default;
++
++/* MPTCP-scheduler registration/initialization functions */
++int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
++void mptcp_init_scheduler(struct mptcp_cb *mpcb);
++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb);
++void mptcp_get_default_scheduler(char *name);
++int mptcp_set_default_scheduler(const char *name);
++extern struct mptcp_sched_ops mptcp_sched_default;
++
++static inline void mptcp_reset_synack_timer(struct sock *meta_sk,
++ unsigned long len)
++{
++ sk_reset_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer,
++ jiffies + len);
++}
++
++static inline void mptcp_delete_synack_timer(struct sock *meta_sk)
++{
++ sk_stop_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer);
++}
++
++static inline bool is_mptcp_enabled(const struct sock *sk)
++{
++ if (!sysctl_mptcp_enabled || mptcp_init_failed)
++ return false;
++
++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
++ return false;
++
++ return true;
++}
++
++static inline int mptcp_pi_to_flag(int pi)
++{
++ return 1 << (pi - 1);
++}
++
++static inline
++struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
++{
++ return (struct mptcp_request_sock *)req;
++}
++
++static inline
++struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
++{
++ return (struct request_sock *)req;
++}
++
++static inline bool mptcp_can_sendpage(struct sock *sk)
++{
++ struct sock *sk_it;
++
++ if (tcp_sk(sk)->mpcb->dss_csum)
++ return false;
++
++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
++ if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
++ !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
++ return false;
++ }
++
++ return true;
++}
++
++static inline void mptcp_push_pending_frames(struct sock *meta_sk)
++{
++ /* We check packets out and send-head here. TCP only checks the
++ * send-head. But, MPTCP also checks packets_out, as this is an
++ * indication that we might want to do opportunistic reinjection.
++ */
++ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) {
++ struct tcp_sock *tp = tcp_sk(meta_sk);
++
++ /* We don't care about the MSS, because it will be set in
++ * mptcp_write_xmit.
++ */
++ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
++ }
++}
++
++static inline void mptcp_send_reset(struct sock *sk)
++{
++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
++ mptcp_sub_force_close(sk);
++}
++
++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
++{
++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
++}
++
++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
++{
++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN;
++}
++
++/* Is it a data-fin while in infinite mapping mode?
++ * In infinite mode, a subflow-fin is in fact a data-fin.
++ */
++static inline bool mptcp_is_data_fin2(const struct sk_buff *skb,
++ const struct tcp_sock *tp)
++{
++ return mptcp_is_data_fin(skb) ||
++ (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
++}
++
++static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
++{
++ u64 data_seq_high = (u32)(data_seq >> 32);
++
++ if (mpcb->rcv_high_order[0] == data_seq_high)
++ return 0;
++ else if (mpcb->rcv_high_order[1] == data_seq_high)
++ return MPTCPHDR_SEQ64_INDEX;
++ else
++ return MPTCPHDR_SEQ64_OFO;
++}
++
++/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
++ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
++ */
++static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
++ u32 *data_seq,
++ struct mptcp_cb *mpcb)
++{
++ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
++
++ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
++ u64 data_seq64 = get_unaligned_be64(ptr);
++
++ if (mpcb)
++ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
++
++ *data_seq = (u32)data_seq64;
++ ptr++;
++ } else {
++ *data_seq = get_unaligned_be32(ptr);
++ }
++
++ return ptr;
++}
++
++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
++{
++ return tcp_sk(sk)->meta_sk;
++}
++
++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
++{
++ return tcp_sk(tp->meta_sk);
++}
++
++static inline int is_meta_tp(const struct tcp_sock *tp)
++{
++ return tp->mpcb && mptcp_meta_tp(tp) == tp;
++}
++
++static inline int is_meta_sk(const struct sock *sk)
++{
++ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
++ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
++}
++
++static inline int is_master_tp(const struct tcp_sock *tp)
++{
++ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
++}
++
++static inline void mptcp_hash_request_remove(struct request_sock *req)
++{
++ int in_softirq = 0;
++
++ if (hlist_nulls_unhashed(&mptcp_rsk(req)->hash_entry))
++ return;
++
++ if (in_softirq()) {
++ spin_lock(&mptcp_reqsk_hlock);
++ in_softirq = 1;
++ } else {
++ spin_lock_bh(&mptcp_reqsk_hlock);
++ }
++
++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
++
++ if (in_softirq)
++ spin_unlock(&mptcp_reqsk_hlock);
++ else
++ spin_unlock_bh(&mptcp_reqsk_hlock);
++}
++
++static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
++{
++ mopt->saw_mpc = 0;
++ mopt->dss_csum = 0;
++ mopt->drop_me = 0;
++
++ mopt->is_mp_join = 0;
++ mopt->join_ack = 0;
++
++ mopt->saw_low_prio = 0;
++ mopt->low_prio = 0;
++
++ mopt->saw_add_addr = 0;
++ mopt->more_add_addr = 0;
++
++ mopt->saw_rem_addr = 0;
++ mopt->more_rem_addr = 0;
++
++ mopt->mp_fail = 0;
++ mopt->mp_fclose = 0;
++}
++
++static inline void mptcp_reset_mopt(struct tcp_sock *tp)
++{
++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
++
++ mopt->saw_low_prio = 0;
++ mopt->saw_add_addr = 0;
++ mopt->more_add_addr = 0;
++ mopt->saw_rem_addr = 0;
++ mopt->more_rem_addr = 0;
++ mopt->join_ack = 0;
++ mopt->mp_fail = 0;
++ mopt->mp_fclose = 0;
++}
++
++static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
++ const struct mptcp_cb *mpcb)
++{
++ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
++ MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
++}
++
++static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
++ u32 data_seq_32)
++{
++ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
++}
++
++static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
++{
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
++ meta_tp->rcv_nxt);
++}
++
++static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
++{
++ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
++ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
++ }
++}
++
++static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
++ u32 old_rcv_nxt)
++{
++ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
++ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
++ }
++}
++
++static inline int mptcp_sk_can_send(const struct sock *sk)
++{
++ return tcp_passive_fastopen(sk) ||
++ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
++ !tcp_sk(sk)->mptcp->pre_established);
++}
++
++static inline int mptcp_sk_can_recv(const struct sock *sk)
++{
++ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
++}
++
++static inline int mptcp_sk_can_send_ack(const struct sock *sk)
++{
++ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
++ TCPF_CLOSE | TCPF_LISTEN)) &&
++ !tcp_sk(sk)->mptcp->pre_established;
++}
++
++/* Only support GSO if all subflows supports it */
++static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
++{
++ struct sock *sk;
++
++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
++ return false;
++
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
++ if (!mptcp_sk_can_send(sk))
++ continue;
++ if (!sk_can_gso(sk))
++ return false;
++ }
++ return true;
++}
++
++static inline bool mptcp_can_sg(const struct sock *meta_sk)
++{
++ struct sock *sk;
++
++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
++ return false;
++
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
++ if (!mptcp_sk_can_send(sk))
++ continue;
++ if (!(sk->sk_route_caps & NETIF_F_SG))
++ return false;
++ }
++ return true;
++}
++
++static inline void mptcp_set_rto(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *sk_it;
++ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
++ __u32 max_rto = 0;
++
++ /* We are in recovery-phase on the MPTCP-level. Do not update the
++ * RTO, because this would kill exponential backoff.
++ */
++ if (micsk->icsk_retransmits)
++ return;
++
++ mptcp_for_each_sk(tp->mpcb, sk_it) {
++ if (mptcp_sk_can_send(sk_it) &&
++ inet_csk(sk_it)->icsk_rto > max_rto)
++ max_rto = inet_csk(sk_it)->icsk_rto;
++ }
++ if (max_rto) {
++ micsk->icsk_rto = max_rto << 1;
++
++ /* A successfull rto-measurement - reset backoff counter */
++ micsk->icsk_backoff = 0;
++ }
++}
++
++static inline int mptcp_sysctl_syn_retries(void)
++{
++ return sysctl_mptcp_syn_retries;
++}
++
++static inline void mptcp_sub_close_passive(struct sock *sk)
++{
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
++
++ /* Only close, if the app did a send-shutdown (passive close), and we
++ * received the data-ack of the data-fin.
++ */
++ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
++ mptcp_sub_close(sk, 0);
++}
++
++static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ /* If data has been acknowleged on the meta-level, fully_established
++ * will have been set before and thus we will not fall back to infinite
++ * mapping.
++ */
++ if (likely(tp->mptcp->fully_established))
++ return false;
++
++ if (!(flag & MPTCP_FLAG_DATA_ACKED))
++ return false;
++
++ /* Don't fallback twice ;) */
++ if (tp->mpcb->infinite_mapping_snd)
++ return false;
++
++ pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
++ __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
++ &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
++ __builtin_return_address(0));
++ if (!is_master_tp(tp))
++ return true;
++
++ tp->mpcb->infinite_mapping_snd = 1;
++ tp->mpcb->infinite_mapping_rcv = 1;
++ tp->mptcp->fully_established = 1;
++
++ return false;
++}
++
++/* Find the first index whose bit in the bit-field == 0 */
++static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
++{
++ u8 base = mpcb->next_path_index;
++ int i;
++
++ /* Start at 1, because 0 is reserved for the meta-sk */
++ mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
++ if (i + base < 1)
++ continue;
++ if (i + base >= sizeof(mpcb->path_index_bits) * 8)
++ break;
++ i += base;
++ mpcb->path_index_bits |= (1 << i);
++ mpcb->next_path_index = i + 1;
++ return i;
++ }
++ mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
++ if (i >= sizeof(mpcb->path_index_bits) * 8)
++ break;
++ if (i < 1)
++ continue;
++ mpcb->path_index_bits |= (1 << i);
++ mpcb->next_path_index = i + 1;
++ return i;
++ }
++
++ return 0;
++}
++
++static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk)
++{
++ return sk->sk_family == AF_INET6 &&
++ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
++}
++
++/* TCP and MPTCP mpc flag-depending functions */
++u16 mptcp_select_window(struct sock *sk);
++void mptcp_init_buffer_space(struct sock *sk);
++void mptcp_tcp_set_rto(struct sock *sk);
++
++/* TCP and MPTCP flag-depending functions */
++bool mptcp_prune_ofo_queue(struct sock *sk);
++
++#else /* CONFIG_MPTCP */
++#define mptcp_debug(fmt, args...) \
++ do { \
++ } while (0)
++
++/* Without MPTCP, we just do one iteration
++ * over the only socket available. This assumes that
++ * the sk/tp arg is the socket in that case.
++ */
++#define mptcp_for_each_sk(mpcb, sk)
++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
++
++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
++{
++ return false;
++}
++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
++{
++ return false;
++}
++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
++{
++ return NULL;
++}
++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
++{
++ return NULL;
++}
++static inline int is_meta_sk(const struct sock *sk)
++{
++ return 0;
++}
++static inline int is_master_tp(const struct tcp_sock *tp)
++{
++ return 0;
++}
++static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {}
++static inline void mptcp_del_sock(const struct sock *sk) {}
++static inline void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk) {}
++static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
++static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {}
++static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
++ const struct sock *sk) {}
++static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
++static inline void mptcp_set_rto(const struct sock *sk) {}
++static inline void mptcp_send_fin(const struct sock *meta_sk) {}
++static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
++ const struct mptcp_options_received *mopt,
++ const struct sk_buff *skb) {}
++static inline void mptcp_syn_options(const struct sock *sk,
++ struct tcp_out_options *opts,
++ unsigned *remaining) {}
++static inline void mptcp_synack_options(struct request_sock *req,
++ struct tcp_out_options *opts,
++ unsigned *remaining) {}
++
++static inline void mptcp_established_options(struct sock *sk,
++ struct sk_buff *skb,
++ struct tcp_out_options *opts,
++ unsigned *size) {}
++static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
++ const struct tcp_out_options *opts,
++ struct sk_buff *skb) {}
++static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
++static inline int mptcp_doit(struct sock *sk)
++{
++ return 0;
++}
++static inline int mptcp_check_req_fastopen(struct sock *child,
++ struct request_sock *req)
++{
++ return 1;
++}
++static inline int mptcp_check_req_master(const struct sock *sk,
++ const struct sock *child,
++ struct request_sock *req,
++ struct request_sock **prev)
++{
++ return 1;
++}
++static inline struct sock *mptcp_check_req_child(struct sock *sk,
++ struct sock *child,
++ struct request_sock *req,
++ struct request_sock **prev,
++ const struct mptcp_options_received *mopt)
++{
++ return NULL;
++}
++static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
++{
++ return 0;
++}
++static inline int mptcp_select_size(const struct sock *meta_sk, bool sg)
++{
++ return 0;
++}
++static inline void mptcp_sub_close_passive(struct sock *sk) {}
++static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
++{
++ return false;
++}
++static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
++static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time)
++{
++ return 0;
++}
++static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
++{
++ return 0;
++}
++static inline int mptcp_sysctl_syn_retries(void)
++{
++ return 0;
++}
++static inline void mptcp_send_reset(const struct sock *sk) {}
++static inline int mptcp_handle_options(struct sock *sk,
++ const struct tcphdr *th,
++ struct sk_buff *skb)
++{
++ return 0;
++}
++static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
++static inline void __init mptcp_init(void) {}
++static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
++{
++ return 0;
++}
++static inline bool mptcp_sk_can_gso(const struct sock *sk)
++{
++ return false;
++}
++static inline bool mptcp_can_sg(const struct sock *meta_sk)
++{
++ return false;
++}
++static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk,
++ u32 mss_now, int large_allowed)
++{
++ return 0;
++}
++static inline void mptcp_destroy_sock(struct sock *sk) {}
++static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
++ struct sock **skptr,
++ struct sk_buff *skb,
++ const struct mptcp_options_received *mopt)
++{
++ return 0;
++}
++static inline bool mptcp_can_sendpage(struct sock *sk)
++{
++ return false;
++}
++static inline int mptcp_init_tw_sock(struct sock *sk,
++ struct tcp_timewait_sock *tw)
++{
++ return 0;
++}
++static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
++static inline void mptcp_disconnect(struct sock *sk) {}
++static inline void mptcp_tsq_flags(struct sock *sk) {}
++static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
++static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
++static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {}
++static inline void mptcp_reqsk_new_mptcp(struct request_sock *req,
++ const struct tcp_options_received *rx_opt,
++ const struct mptcp_options_received *mopt,
++ const struct sk_buff *skb) {}
++static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
++ const struct sk_buff *skb) {}
++static inline void mptcp_delete_synack_timer(struct sock *meta_sk) {}
++#endif /* CONFIG_MPTCP */
++
++#endif /* _MPTCP_H */
+diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
+new file mode 100644
+index 000000000000..93ad97c77c5a
+--- /dev/null
++++ b/include/net/mptcp_v4.h
+@@ -0,0 +1,67 @@
++/*
++ * MPTCP implementation
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#ifndef MPTCP_V4_H_
++#define MPTCP_V4_H_
++
++
++#include <linux/in.h>
++#include <linux/skbuff.h>
++#include <net/mptcp.h>
++#include <net/request_sock.h>
++#include <net/sock.h>
++
++extern struct request_sock_ops mptcp_request_sock_ops;
++extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
++extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
++
++#ifdef CONFIG_MPTCP
++
++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
++ const __be32 laddr, const struct net *net);
++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
++ struct mptcp_rem4 *rem);
++int mptcp_pm_v4_init(void);
++void mptcp_pm_v4_undo(void);
++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
++
++#else
++
++static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
++ const struct sk_buff *skb)
++{
++ return 0;
++}
++
++#endif /* CONFIG_MPTCP */
++
++#endif /* MPTCP_V4_H_ */
+diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
+new file mode 100644
+index 000000000000..49a4f30ccd4d
+--- /dev/null
++++ b/include/net/mptcp_v6.h
+@@ -0,0 +1,69 @@
++/*
++ * MPTCP implementation
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#ifndef _MPTCP_V6_H
++#define _MPTCP_V6_H
++
++#include <linux/in6.h>
++#include <net/if_inet6.h>
++
++#include <net/mptcp.h>
++
++
++#ifdef CONFIG_MPTCP
++extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
++extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
++extern struct request_sock_ops mptcp6_request_sock_ops;
++extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
++
++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
++ const struct in6_addr *laddr, const struct net *net);
++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
++ struct mptcp_rem6 *rem);
++int mptcp_pm_v6_init(void);
++void mptcp_pm_v6_undo(void);
++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
++ __be16 sport, __be16 dport);
++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
++ __be16 sport, __be16 dport);
++
++#else /* CONFIG_MPTCP */
++
++#define mptcp_v6_mapped ipv6_mapped
++
++static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
++{
++ return 0;
++}
++
++#endif /* CONFIG_MPTCP */
++
++#endif /* _MPTCP_V6_H */
+diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
+index 361d26077196..bae95a11c531 100644
+--- a/include/net/net_namespace.h
++++ b/include/net/net_namespace.h
+@@ -16,6 +16,7 @@
+ #include <net/netns/packet.h>
+ #include <net/netns/ipv4.h>
+ #include <net/netns/ipv6.h>
++#include <net/netns/mptcp.h>
+ #include <net/netns/ieee802154_6lowpan.h>
+ #include <net/netns/sctp.h>
+ #include <net/netns/dccp.h>
+@@ -92,6 +93,9 @@ struct net {
+ #if IS_ENABLED(CONFIG_IPV6)
+ struct netns_ipv6 ipv6;
+ #endif
++#if IS_ENABLED(CONFIG_MPTCP)
++ struct netns_mptcp mptcp;
++#endif
+ #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+ struct netns_ieee802154_lowpan ieee802154_lowpan;
+ #endif
+diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
+new file mode 100644
+index 000000000000..bad418b04cc8
+--- /dev/null
++++ b/include/net/netns/mptcp.h
+@@ -0,0 +1,44 @@
++/*
++ * MPTCP implementation - MPTCP namespace
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#ifndef __NETNS_MPTCP_H__
++#define __NETNS_MPTCP_H__
++
++#include <linux/compiler.h>
++
++enum {
++ MPTCP_PM_FULLMESH = 0,
++ MPTCP_PM_MAX
++};
++
++struct netns_mptcp {
++ void *path_managers[MPTCP_PM_MAX];
++};
++
++#endif /* __NETNS_MPTCP_H__ */
+diff --git a/include/net/request_sock.h b/include/net/request_sock.h
+index 7f830ff67f08..e79e87a8e1a6 100644
+--- a/include/net/request_sock.h
++++ b/include/net/request_sock.h
+@@ -164,7 +164,7 @@ struct request_sock_queue {
+ };
+
+ int reqsk_queue_alloc(struct request_sock_queue *queue,
+- unsigned int nr_table_entries);
++ unsigned int nr_table_entries, gfp_t flags);
+
+ void __reqsk_queue_destroy(struct request_sock_queue *queue);
+ void reqsk_queue_destroy(struct request_sock_queue *queue);
+diff --git a/include/net/sock.h b/include/net/sock.h
+index 156350745700..0e23cae8861f 100644
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -901,6 +901,16 @@ void sk_clear_memalloc(struct sock *sk);
+
+ int sk_wait_data(struct sock *sk, long *timeo);
+
++/* START - needed for MPTCP */
++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family);
++void sock_lock_init(struct sock *sk);
++
++extern struct lock_class_key af_callback_keys[AF_MAX];
++extern char *const af_family_clock_key_strings[AF_MAX+1];
++
++#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
++/* END - needed for MPTCP */
++
+ struct request_sock_ops;
+ struct timewait_sock_ops;
+ struct inet_hashinfo;
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index 7286db80e8b8..ff92e74cd684 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -177,6 +177,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
+ #define TCPOPT_SACK 5 /* SACK Block */
+ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
+ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
++#define TCPOPT_MPTCP 30
+ #define TCPOPT_EXP 254 /* Experimental */
+ /* Magic number to be after the option value for sharing TCP
+ * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
+@@ -229,6 +230,27 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
+ #define TFO_SERVER_WO_SOCKOPT1 0x400
+ #define TFO_SERVER_WO_SOCKOPT2 0x800
+
++/* Flags from tcp_input.c for tcp_ack */
++#define FLAG_DATA 0x01 /* Incoming frame contained data. */
++#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
++#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
++#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
++#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
++#define FLAG_DATA_SACKED 0x20 /* New SACK. */
++#define FLAG_ECE 0x40 /* ECE in this ACK */
++#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
++#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
++#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
++#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
++#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
++#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
++#define MPTCP_FLAG_DATA_ACKED 0x8000
++
++#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
++#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
++#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
++#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
++
+ extern struct inet_timewait_death_row tcp_death_row;
+
+ /* sysctl variables for tcp */
+@@ -344,6 +366,107 @@ extern struct proto tcp_prot;
+ #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
+ #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
+
++/**** START - Exports needed for MPTCP ****/
++extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
++extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
++
++struct mptcp_options_received;
++
++void tcp_enter_quickack_mode(struct sock *sk);
++int tcp_close_state(struct sock *sk);
++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
++ const struct sk_buff *skb);
++int tcp_xmit_probe_skb(struct sock *sk, int urgent);
++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
++ gfp_t gfp_mask);
++unsigned int tcp_mss_split_point(const struct sock *sk,
++ const struct sk_buff *skb,
++ unsigned int mss_now,
++ unsigned int max_segs,
++ int nonagle);
++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
++ unsigned int cur_mss, int nonagle);
++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
++ unsigned int cur_mss);
++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
++ unsigned int mss_now);
++void __pskb_trim_head(struct sk_buff *skb, int len);
++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
++void tcp_reset(struct sock *sk);
++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
++ const u32 ack_seq, const u32 nwin);
++bool tcp_urg_mode(const struct tcp_sock *tp);
++void tcp_ack_probe(struct sock *sk);
++void tcp_rearm_rto(struct sock *sk);
++int tcp_write_timeout(struct sock *sk);
++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
++ unsigned int timeout, bool syn_set);
++void tcp_write_err(struct sock *sk);
++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
++ unsigned int mss_now);
++
++int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
++ struct request_sock *req);
++__u32 tcp_v4_init_sequence(const struct sk_buff *skb);
++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
++ struct flowi *fl,
++ struct request_sock *req,
++ u16 queue_mapping,
++ struct tcp_fastopen_cookie *foc);
++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
++void tcp_v4_reqsk_destructor(struct request_sock *req);
++
++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
++ struct request_sock *req);
++__u32 tcp_v6_init_sequence(const struct sk_buff *skb);
++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
++ struct flowi *fl, struct request_sock *req,
++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
++void tcp_v6_destroy_sock(struct sock *sk);
++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
++void tcp_v6_hash(struct sock *sk);
++struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb);
++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
++ struct request_sock *req,
++ struct dst_entry *dst);
++void tcp_v6_reqsk_destructor(struct request_sock *req);
++
++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
++ int large_allowed);
++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
++
++void skb_clone_fraglist(struct sk_buff *skb);
++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
++
++void inet_twsk_free(struct inet_timewait_sock *tw);
++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
++/* These states need RST on ABORT according to RFC793 */
++static inline bool tcp_need_reset(int state)
++{
++ return (1 << state) &
++ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
++ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
++}
++
++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
++ int hlen);
++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
++ bool *fragstolen);
++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
++ struct sk_buff *from, bool *fragstolen);
++/**** END - Exports needed for MPTCP ****/
++
+ void tcp_tasklet_init(void);
+
+ void tcp_v4_err(struct sk_buff *skb, u32);
+@@ -440,6 +563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int nonblock, int flags, int *addr_len);
+ void tcp_parse_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx,
++ struct mptcp_options_received *mopt_rx,
+ int estab, struct tcp_fastopen_cookie *foc);
+ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
+
+@@ -493,14 +617,8 @@ static inline u32 tcp_cookie_time(void)
+
+ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ u16 *mssp);
+-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
+-#else
+-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
+- struct sk_buff *skb,
+- __u16 *mss)
+-{
+- return 0;
+-}
++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
++ __u16 *mss);
+ #endif
+
+ __u32 cookie_init_timestamp(struct request_sock *req);
+@@ -516,13 +634,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
+ const struct tcphdr *th, u16 *mssp);
+ __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
+ __u16 *mss);
+-#else
+-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
+- struct sk_buff *skb,
+- __u16 *mss)
+-{
+- return 0;
+-}
+ #endif
+ /* tcp_output.c */
+
+@@ -551,10 +662,17 @@ void tcp_send_delayed_ack(struct sock *sk);
+ void tcp_send_loss_probe(struct sock *sk);
+ bool tcp_schedule_loss_probe(struct sock *sk);
+
++u16 tcp_select_window(struct sock *sk);
++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
++ int push_one, gfp_t gfp);
++
+ /* tcp_input.c */
+ void tcp_resume_early_retransmit(struct sock *sk);
+ void tcp_rearm_rto(struct sock *sk);
+ void tcp_reset(struct sock *sk);
++void tcp_set_rto(struct sock *sk);
++bool tcp_should_expand_sndbuf(const struct sock *sk);
++bool tcp_prune_ofo_queue(struct sock *sk);
+
+ /* tcp_timer.c */
+ void tcp_init_xmit_timers(struct sock *);
+@@ -703,14 +821,27 @@ void tcp_send_window_probe(struct sock *sk);
+ */
+ struct tcp_skb_cb {
+ union {
+- struct inet_skb_parm h4;
++ union {
++ struct inet_skb_parm h4;
+ #if IS_ENABLED(CONFIG_IPV6)
+- struct inet6_skb_parm h6;
++ struct inet6_skb_parm h6;
+ #endif
+- } header; /* For incoming frames */
++ } header; /* For incoming frames */
++#ifdef CONFIG_MPTCP
++ union { /* For MPTCP outgoing frames */
++ __u32 path_mask; /* paths that tried to send this skb */
++ __u32 dss[6]; /* DSS options */
++ };
++#endif
++ };
+ __u32 seq; /* Starting sequence number */
+ __u32 end_seq; /* SEQ + FIN + SYN + datalen */
+ __u32 when; /* used to compute rtt's */
++#ifdef CONFIG_MPTCP
++ __u8 mptcp_flags; /* flags for the MPTCP layer */
++ __u8 dss_off; /* Number of 4-byte words until
++ * seq-number */
++#endif
+ __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
+
+ __u8 sacked; /* State flags for SACK/FACK. */
+@@ -1075,7 +1206,8 @@ u32 tcp_default_init_rwnd(u32 mss);
+ /* Determine a window scaling and initial window to offer. */
+ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
+ __u32 *window_clamp, int wscale_ok,
+- __u8 *rcv_wscale, __u32 init_rcv_wnd);
++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
++ const struct sock *sk);
+
+ static inline int tcp_win_from_space(int space)
+ {
+@@ -1084,15 +1216,34 @@ static inline int tcp_win_from_space(int space)
+ space - (space>>sysctl_tcp_adv_win_scale);
+ }
+
++#ifdef CONFIG_MPTCP
++extern struct static_key mptcp_static_key;
++static inline bool mptcp(const struct tcp_sock *tp)
++{
++ return static_key_false(&mptcp_static_key) && tp->mpc;
++}
++#else
++static inline bool mptcp(const struct tcp_sock *tp)
++{
++ return 0;
++}
++#endif
++
+ /* Note: caller must be prepared to deal with negative returns */
+ static inline int tcp_space(const struct sock *sk)
+ {
++ if (mptcp(tcp_sk(sk)))
++ sk = tcp_sk(sk)->meta_sk;
++
+ return tcp_win_from_space(sk->sk_rcvbuf -
+ atomic_read(&sk->sk_rmem_alloc));
+ }
+
+ static inline int tcp_full_space(const struct sock *sk)
+ {
++ if (mptcp(tcp_sk(sk)))
++ sk = tcp_sk(sk)->meta_sk;
++
+ return tcp_win_from_space(sk->sk_rcvbuf);
+ }
+
+@@ -1115,6 +1266,8 @@ static inline void tcp_openreq_init(struct request_sock *req,
+ ireq->wscale_ok = rx_opt->wscale_ok;
+ ireq->acked = 0;
+ ireq->ecn_ok = 0;
++ ireq->mptcp_rqsk = 0;
++ ireq->saw_mpc = 0;
+ ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ }
+@@ -1585,6 +1738,11 @@ int tcp4_proc_init(void);
+ void tcp4_proc_exit(void);
+ #endif
+
++int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
++int tcp_conn_request(struct request_sock_ops *rsk_ops,
++ const struct tcp_request_sock_ops *af_ops,
++ struct sock *sk, struct sk_buff *skb);
++
+ /* TCP af-specific functions */
+ struct tcp_sock_af_ops {
+ #ifdef CONFIG_TCP_MD5SIG
+@@ -1601,7 +1759,32 @@ struct tcp_sock_af_ops {
+ #endif
+ };
+
++/* TCP/MPTCP-specific functions */
++struct tcp_sock_ops {
++ u32 (*__select_window)(struct sock *sk);
++ u16 (*select_window)(struct sock *sk);
++ void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
++ __u32 *window_clamp, int wscale_ok,
++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
++ const struct sock *sk);
++ void (*init_buffer_space)(struct sock *sk);
++ void (*set_rto)(struct sock *sk);
++ bool (*should_expand_sndbuf)(const struct sock *sk);
++ void (*send_fin)(struct sock *sk);
++ bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
++ int push_one, gfp_t gfp);
++ void (*send_active_reset)(struct sock *sk, gfp_t priority);
++ int (*write_wakeup)(struct sock *sk);
++ bool (*prune_ofo_queue)(struct sock *sk);
++ void (*retransmit_timer)(struct sock *sk);
++ void (*time_wait)(struct sock *sk, int state, int timeo);
++ void (*cleanup_rbuf)(struct sock *sk, int copied);
++ void (*init_congestion_control)(struct sock *sk);
++};
++extern const struct tcp_sock_ops tcp_specific;
++
+ struct tcp_request_sock_ops {
++ u16 mss_clamp;
+ #ifdef CONFIG_TCP_MD5SIG
+ struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk,
+ struct request_sock *req);
+@@ -1611,8 +1794,39 @@ struct tcp_request_sock_ops {
+ const struct request_sock *req,
+ const struct sk_buff *skb);
+ #endif
++ int (*init_req)(struct request_sock *req, struct sock *sk,
++ struct sk_buff *skb);
++#ifdef CONFIG_SYN_COOKIES
++ __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
++ __u16 *mss);
++#endif
++ struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
++ const struct request_sock *req,
++ bool *strict);
++ __u32 (*init_seq)(const struct sk_buff *skb);
++ int (*send_synack)(struct sock *sk, struct dst_entry *dst,
++ struct flowi *fl, struct request_sock *req,
++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
++ void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
++ const unsigned long timeout);
+ };
+
++#ifdef CONFIG_SYN_COOKIES
++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
++ struct sock *sk, struct sk_buff *skb,
++ __u16 *mss)
++{
++ return ops->cookie_init_seq(sk, skb, mss);
++}
++#else
++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
++ struct sock *sk, struct sk_buff *skb,
++ __u16 *mss)
++{
++ return 0;
++}
++#endif
++
+ int tcpv4_offload_init(void);
+
+ void tcp_v4_init(void);
+diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
+index 9cf2394f0bcf..c2634b6ed854 100644
+--- a/include/uapi/linux/if.h
++++ b/include/uapi/linux/if.h
+@@ -109,6 +109,9 @@ enum net_device_flags {
+ #define IFF_DORMANT IFF_DORMANT
+ #define IFF_ECHO IFF_ECHO
+
++#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
++#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
++
+ #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
+ IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
+
+diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
+index 3b9718328d8b..487475681d84 100644
+--- a/include/uapi/linux/tcp.h
++++ b/include/uapi/linux/tcp.h
+@@ -112,6 +112,7 @@ enum {
+ #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
+ #define TCP_TIMESTAMP 24
+ #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
++#define MPTCP_ENABLED 26
+
+ struct tcp_repair_opt {
+ __u32 opt_code;
+diff --git a/net/Kconfig b/net/Kconfig
+index d92afe4204d9..96b58593ad5e 100644
+--- a/net/Kconfig
++++ b/net/Kconfig
+@@ -79,6 +79,7 @@ if INET
+ source "net/ipv4/Kconfig"
+ source "net/ipv6/Kconfig"
+ source "net/netlabel/Kconfig"
++source "net/mptcp/Kconfig"
+
+ endif # if INET
+
+diff --git a/net/Makefile b/net/Makefile
+index cbbbe6d657ca..244bac1435b1 100644
+--- a/net/Makefile
++++ b/net/Makefile
+@@ -20,6 +20,7 @@ obj-$(CONFIG_INET) += ipv4/
+ obj-$(CONFIG_XFRM) += xfrm/
+ obj-$(CONFIG_UNIX) += unix/
+ obj-$(CONFIG_NET) += ipv6/
++obj-$(CONFIG_MPTCP) += mptcp/
+ obj-$(CONFIG_PACKET) += packet/
+ obj-$(CONFIG_NET_KEY) += key/
+ obj-$(CONFIG_BRIDGE) += bridge/
+diff --git a/net/core/dev.c b/net/core/dev.c
+index 367a586d0c8a..215d2757fbf6 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -5420,7 +5420,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
+
+ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
+ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
+- IFF_AUTOMEDIA)) |
++ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
+ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
+ IFF_ALLMULTI));
+
+diff --git a/net/core/request_sock.c b/net/core/request_sock.c
+index 467f326126e0..909dfa13f499 100644
+--- a/net/core/request_sock.c
++++ b/net/core/request_sock.c
+@@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256;
+ EXPORT_SYMBOL(sysctl_max_syn_backlog);
+
+ int reqsk_queue_alloc(struct request_sock_queue *queue,
+- unsigned int nr_table_entries)
++ unsigned int nr_table_entries,
++ gfp_t flags)
+ {
+ size_t lopt_size = sizeof(struct listen_sock);
+ struct listen_sock *lopt;
+@@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
+ nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+ lopt_size += nr_table_entries * sizeof(struct request_sock *);
+ if (lopt_size > PAGE_SIZE)
+- lopt = vzalloc(lopt_size);
++ lopt = __vmalloc(lopt_size,
++ flags | __GFP_HIGHMEM | __GFP_ZERO,
++ PAGE_KERNEL);
+ else
+- lopt = kzalloc(lopt_size, GFP_KERNEL);
++ lopt = kzalloc(lopt_size, flags);
+ if (lopt == NULL)
+ return -ENOMEM;
+
+diff --git a/net/core/skbuff.c b/net/core/skbuff.c
+index c1a33033cbe2..8abc5d60fbe3 100644
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -472,7 +472,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
+ skb_drop_list(&skb_shinfo(skb)->frag_list);
+ }
+
+-static void skb_clone_fraglist(struct sk_buff *skb)
++void skb_clone_fraglist(struct sk_buff *skb)
+ {
+ struct sk_buff *list;
+
+@@ -897,7 +897,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
+ skb->inner_mac_header += off;
+ }
+
+-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+ {
+ __copy_skb_header(new, old);
+
+diff --git a/net/core/sock.c b/net/core/sock.c
+index 026e01f70274..359295523177 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -136,6 +136,11 @@
+
+ #include <trace/events/sock.h>
+
++#ifdef CONFIG_MPTCP
++#include <net/mptcp.h>
++#include <net/inet_common.h>
++#endif
++
+ #ifdef CONFIG_INET
+ #include <net/tcp.h>
+ #endif
+@@ -280,7 +285,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
+ "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
+ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
+ };
+-static const char *const af_family_clock_key_strings[AF_MAX+1] = {
++char *const af_family_clock_key_strings[AF_MAX+1] = {
+ "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
+ "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
+ "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
+@@ -301,7 +306,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
+ * sk_callback_lock locking rules are per-address-family,
+ * so split the lock classes by using a per-AF key:
+ */
+-static struct lock_class_key af_callback_keys[AF_MAX];
++struct lock_class_key af_callback_keys[AF_MAX];
+
+ /* Take into consideration the size of the struct sk_buff overhead in the
+ * determination of these values, since that is non-constant across
+@@ -422,8 +427,6 @@ static void sock_warn_obsolete_bsdism(const char *name)
+ }
+ }
+
+-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+-
+ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
+ {
+ if (sk->sk_flags & flags) {
+@@ -1253,8 +1256,25 @@ lenout:
+ *
+ * (We also register the sk_lock with the lock validator.)
+ */
+-static inline void sock_lock_init(struct sock *sk)
+-{
++void sock_lock_init(struct sock *sk)
++{
++#ifdef CONFIG_MPTCP
++ /* Reclassify the lock-class for subflows */
++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
++ sock_lock_init_class_and_name(sk, "slock-AF_INET-MPTCP",
++ &meta_slock_key,
++ "sk_lock-AF_INET-MPTCP",
++ &meta_key);
++
++ /* We don't yet have the mptcp-point.
++ * Thus we still need inet_sock_destruct
++ */
++ sk->sk_destruct = inet_sock_destruct;
++ return;
++ }
++#endif
++
+ sock_lock_init_class_and_name(sk,
+ af_family_slock_key_strings[sk->sk_family],
+ af_family_slock_keys + sk->sk_family,
+@@ -1301,7 +1321,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+ }
+ EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
+
+-static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+ int family)
+ {
+ struct sock *sk;
+diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
+index 4db3c2a1679c..04cb17d4b0ce 100644
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ goto drop;
+
+- req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
++ req = inet_reqsk_alloc(&dccp6_request_sock_ops);
+ if (req == NULL)
+ goto drop;
+
+diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
+index 05c57f0fcabe..630434db0085 100644
+--- a/net/ipv4/Kconfig
++++ b/net/ipv4/Kconfig
+@@ -556,6 +556,30 @@ config TCP_CONG_ILLINOIS
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
++config TCP_CONG_COUPLED
++ tristate "MPTCP COUPLED CONGESTION CONTROL"
++ depends on MPTCP
++ default n
++ ---help---
++ MultiPath TCP Coupled Congestion Control
++ To enable it, just put 'coupled' in tcp_congestion_control
++
++config TCP_CONG_OLIA
++ tristate "MPTCP Opportunistic Linked Increase"
++ depends on MPTCP
++ default n
++ ---help---
++ MultiPath TCP Opportunistic Linked Increase Congestion Control
++ To enable it, just put 'olia' in tcp_congestion_control
++
++config TCP_CONG_WVEGAS
++ tristate "MPTCP WVEGAS CONGESTION CONTROL"
++ depends on MPTCP
++ default n
++ ---help---
++ wVegas congestion control for MPTCP
++ To enable it, just put 'wvegas' in tcp_congestion_control
++
+ choice
+ prompt "Default TCP congestion control"
+ default DEFAULT_CUBIC
+@@ -584,6 +608,15 @@ choice
+ config DEFAULT_WESTWOOD
+ bool "Westwood" if TCP_CONG_WESTWOOD=y
+
++ config DEFAULT_COUPLED
++ bool "Coupled" if TCP_CONG_COUPLED=y
++
++ config DEFAULT_OLIA
++ bool "Olia" if TCP_CONG_OLIA=y
++
++ config DEFAULT_WVEGAS
++ bool "Wvegas" if TCP_CONG_WVEGAS=y
++
+ config DEFAULT_RENO
+ bool "Reno"
+
+@@ -605,6 +638,8 @@ config DEFAULT_TCP_CONG
+ default "vegas" if DEFAULT_VEGAS
+ default "westwood" if DEFAULT_WESTWOOD
+ default "veno" if DEFAULT_VENO
++ default "coupled" if DEFAULT_COUPLED
++ default "wvegas" if DEFAULT_WVEGAS
+ default "reno" if DEFAULT_RENO
+ default "cubic"
+
+diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
+index d156b3c5f363..4afd6d8d9028 100644
+--- a/net/ipv4/af_inet.c
++++ b/net/ipv4/af_inet.c
+@@ -104,6 +104,7 @@
+ #include <net/ip_fib.h>
+ #include <net/inet_connection_sock.h>
+ #include <net/tcp.h>
++#include <net/mptcp.h>
+ #include <net/udp.h>
+ #include <net/udplite.h>
+ #include <net/ping.h>
+@@ -246,8 +247,7 @@ EXPORT_SYMBOL(inet_listen);
+ * Create an inet socket.
+ */
+
+-static int inet_create(struct net *net, struct socket *sock, int protocol,
+- int kern)
++int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
+ {
+ struct sock *sk;
+ struct inet_protosw *answer;
+@@ -676,6 +676,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+ lock_sock(sk2);
+
+ sock_rps_record_flow(sk2);
++
++ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) {
++ struct sock *sk_it = sk2;
++
++ mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++
++ if (tcp_sk(sk2)->mpcb->master_sk) {
++ sk_it = tcp_sk(sk2)->mpcb->master_sk;
++
++ write_lock_bh(&sk_it->sk_callback_lock);
++ sk_it->sk_wq = newsock->wq;
++ sk_it->sk_socket = newsock;
++ write_unlock_bh(&sk_it->sk_callback_lock);
++ }
++ }
++
+ WARN_ON(!((1 << sk2->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+ TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+@@ -1763,6 +1780,9 @@ static int __init inet_init(void)
+
+ ip_init();
+
++ /* We must initialize MPTCP before TCP. */
++ mptcp_init();
++
+ tcp_v4_init();
+
+ /* Setup TCP slab cache for open requests. */
+diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
+index 14d02ea905b6..7d734d8af19b 100644
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -23,6 +23,7 @@
+ #include <net/route.h>
+ #include <net/tcp_states.h>
+ #include <net/xfrm.h>
++#include <net/mptcp.h>
+
+ #ifdef INET_CSK_DEBUG
+ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+@@ -465,8 +466,8 @@ no_route:
+ }
+ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
+
+-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
+- const u32 rnd, const u32 synq_hsize)
++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
++ const u32 synq_hsize)
+ {
+ return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
+ }
+@@ -647,7 +648,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
+
+ lopt->clock_hand = i;
+
+- if (lopt->qlen)
++ if (lopt->qlen && !is_meta_sk(parent))
+ inet_csk_reset_keepalive_timer(parent, interval);
+ }
+ EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+@@ -664,7 +665,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
+ const struct request_sock *req,
+ const gfp_t priority)
+ {
+- struct sock *newsk = sk_clone_lock(sk, priority);
++ struct sock *newsk;
++
++ newsk = sk_clone_lock(sk, priority);
+
+ if (newsk != NULL) {
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
+@@ -743,7 +746,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+ {
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
++ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries,
++ GFP_KERNEL);
+
+ if (rc != 0)
+ return rc;
+@@ -801,9 +805,14 @@ void inet_csk_listen_stop(struct sock *sk)
+
+ while ((req = acc_req) != NULL) {
+ struct sock *child = req->sk;
++ bool mutex_taken = false;
+
+ acc_req = req->dl_next;
+
++ if (is_meta_sk(child)) {
++ mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex);
++ mutex_taken = true;
++ }
+ local_bh_disable();
+ bh_lock_sock(child);
+ WARN_ON(sock_owned_by_user(child));
+@@ -832,6 +841,8 @@ void inet_csk_listen_stop(struct sock *sk)
+
+ bh_unlock_sock(child);
+ local_bh_enable();
++ if (mutex_taken)
++ mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex);
+ sock_put(child);
+
+ sk_acceptq_removed(sk);
+diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
+index c86624b36a62..0ff3fe004d62 100644
+--- a/net/ipv4/syncookies.c
++++ b/net/ipv4/syncookies.c
+@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ }
+ EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
+
+-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
++ __u16 *mssp)
+ {
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+@@ -284,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
+
+ /* check for timestamp cookie support */
+ memset(&tcp_opt, 0, sizeof(tcp_opt));
+- tcp_parse_options(skb, &tcp_opt, 0, NULL);
++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
+
+ if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
+ goto out;
+@@ -355,10 +356,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
+ /* Try to redo what tcp_v4_send_synack did. */
+ req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+
+- tcp_select_initial_window(tcp_full_space(sk), req->mss,
+- &req->rcv_wnd, &req->window_clamp,
+- ireq->wscale_ok, &rcv_wscale,
+- dst_metric(&rt->dst, RTAX_INITRWND));
++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
++ &req->rcv_wnd, &req->window_clamp,
++ ireq->wscale_ok, &rcv_wscale,
++ dst_metric(&rt->dst, RTAX_INITRWND), sk);
+
+ ireq->rcv_wscale = rcv_wscale;
+
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index 9d2118e5fbc7..2cb89f886d45 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -271,6 +271,7 @@
+
+ #include <net/icmp.h>
+ #include <net/inet_common.h>
++#include <net/mptcp.h>
+ #include <net/tcp.h>
+ #include <net/xfrm.h>
+ #include <net/ip.h>
+@@ -371,6 +372,24 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+ return period;
+ }
+
++const struct tcp_sock_ops tcp_specific = {
++ .__select_window = __tcp_select_window,
++ .select_window = tcp_select_window,
++ .select_initial_window = tcp_select_initial_window,
++ .init_buffer_space = tcp_init_buffer_space,
++ .set_rto = tcp_set_rto,
++ .should_expand_sndbuf = tcp_should_expand_sndbuf,
++ .init_congestion_control = tcp_init_congestion_control,
++ .send_fin = tcp_send_fin,
++ .write_xmit = tcp_write_xmit,
++ .send_active_reset = tcp_send_active_reset,
++ .write_wakeup = tcp_write_wakeup,
++ .prune_ofo_queue = tcp_prune_ofo_queue,
++ .retransmit_timer = tcp_retransmit_timer,
++ .time_wait = tcp_time_wait,
++ .cleanup_rbuf = tcp_cleanup_rbuf,
++};
++
+ /* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+@@ -419,6 +438,8 @@ void tcp_init_sock(struct sock *sk)
+ sk->sk_sndbuf = sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
++ tp->ops = &tcp_specific;
++
+ local_bh_disable();
+ sock_update_memcg(sk);
+ sk_sockets_allocated_inc(sk);
+@@ -726,6 +747,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+ int ret;
+
+ sock_rps_record_flow(sk);
++
++#ifdef CONFIG_MPTCP
++ if (mptcp(tcp_sk(sk))) {
++ struct sock *sk_it;
++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++ }
++#endif
+ /*
+ * We can't seek on a socket input
+ */
+@@ -821,8 +850,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+ return NULL;
+ }
+
+-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+- int large_allowed)
++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 xmit_size_goal, old_size_goal;
+@@ -872,8 +900,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+ {
+ int mss_now;
+
+- mss_now = tcp_current_mss(sk);
+- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
++ if (mptcp(tcp_sk(sk))) {
++ mss_now = mptcp_current_mss(sk);
++ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
++ } else {
++ mss_now = tcp_current_mss(sk);
++ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
++ }
+
+ return mss_now;
+ }
+@@ -892,11 +925,32 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+- !tcp_passive_fastopen(sk)) {
++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
++ tp->mpcb->master_sk : sk)) {
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_err;
+ }
+
++ if (mptcp(tp)) {
++ struct sock *sk_it = sk;
++
++ /* We must check this with socket-lock hold because we iterate
++ * over the subflows.
++ */
++ if (!mptcp_can_sendpage(sk)) {
++ ssize_t ret;
++
++ release_sock(sk);
++ ret = sock_no_sendpage(sk->sk_socket, page, offset,
++ size, flags);
++ lock_sock(sk);
++ return ret;
++ }
++
++ mptcp_for_each_sk(tp->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++ }
++
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+@@ -1001,8 +1055,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ {
+ ssize_t res;
+
+- if (!(sk->sk_route_caps & NETIF_F_SG) ||
+- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
++ /* If MPTCP is enabled, we check it later after establishment */
++ if (!mptcp(tcp_sk(sk)) && (!(sk->sk_route_caps & NETIF_F_SG) ||
++ !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
+ return sock_no_sendpage(sk->sk_socket, page, offset, size,
+ flags);
+
+@@ -1018,6 +1073,9 @@ static inline int select_size(const struct sock *sk, bool sg)
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int tmp = tp->mss_cache;
+
++ if (mptcp(tp))
++ return mptcp_select_size(sk, sg);
++
+ if (sg) {
+ if (sk_can_gso(sk)) {
+ /* Small frames wont use a full page:
+@@ -1100,11 +1158,18 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+- !tcp_passive_fastopen(sk)) {
++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
++ tp->mpcb->master_sk : sk)) {
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto do_error;
+ }
+
++ if (mptcp(tp)) {
++ struct sock *sk_it = sk;
++ mptcp_for_each_sk(tp->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++ }
++
+ if (unlikely(tp->repair)) {
+ if (tp->repair_queue == TCP_RECV_QUEUE) {
+ copied = tcp_send_rcvq(sk, msg, size);
+@@ -1132,7 +1197,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto out_err;
+
+- sg = !!(sk->sk_route_caps & NETIF_F_SG);
++ if (mptcp(tp))
++ sg = mptcp_can_sg(sk);
++ else
++ sg = !!(sk->sk_route_caps & NETIF_F_SG);
+
+ while (--iovlen >= 0) {
+ size_t seglen = iov->iov_len;
+@@ -1183,8 +1251,15 @@ new_segment:
+
+ /*
+ * Check whether we can use HW checksum.
++ *
++ * If dss-csum is enabled, we do not do hw-csum.
++ * In case of non-mptcp we check the
++ * device-capabilities.
++ * In case of mptcp, hw-csum's will be handled
++ * later in mptcp_write_xmit.
+ */
+- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
++ if (((mptcp(tp) && !tp->mpcb->dss_csum) || !mptcp(tp)) &&
++ (mptcp(tp) || sk->sk_route_caps & NETIF_F_ALL_CSUM))
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_entail(sk, skb);
+@@ -1422,7 +1497,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
+
+ /* Optimize, __tcp_select_window() is not cheap. */
+ if (2*rcv_window_now <= tp->window_clamp) {
+- __u32 new_window = __tcp_select_window(sk);
++ __u32 new_window = tp->ops->__select_window(sk);
+
+ /* Send ACK now, if this read freed lots of space
+ * in our buffer. Certainly, new_window is new window.
+@@ -1587,7 +1662,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ /* Clean up data we have read: This will do ACK frames. */
+ if (copied > 0) {
+ tcp_recv_skb(sk, seq, &offset);
+- tcp_cleanup_rbuf(sk, copied);
++ tp->ops->cleanup_rbuf(sk, copied);
+ }
+ return copied;
+ }
+@@ -1623,6 +1698,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+
+ lock_sock(sk);
+
++#ifdef CONFIG_MPTCP
++ if (mptcp(tp)) {
++ struct sock *sk_it;
++ mptcp_for_each_sk(tp->mpcb, sk_it)
++ sock_rps_record_flow(sk_it);
++ }
++#endif
++
+ err = -ENOTCONN;
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+@@ -1761,7 +1844,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ }
+ }
+
+- tcp_cleanup_rbuf(sk, copied);
++ tp->ops->cleanup_rbuf(sk, copied);
+
+ if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
+ /* Install new reader */
+@@ -1813,7 +1896,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ if (tp->rcv_wnd == 0 &&
+ !skb_queue_empty(&sk->sk_async_wait_queue)) {
+ tcp_service_net_dma(sk, true);
+- tcp_cleanup_rbuf(sk, copied);
++ tp->ops->cleanup_rbuf(sk, copied);
+ } else
+ dma_async_issue_pending(tp->ucopy.dma_chan);
+ }
+@@ -1993,7 +2076,7 @@ skip_copy:
+ */
+
+ /* Clean up data we have read: This will do ACK frames. */
+- tcp_cleanup_rbuf(sk, copied);
++ tp->ops->cleanup_rbuf(sk, copied);
+
+ release_sock(sk);
+ return copied;
+@@ -2070,7 +2153,7 @@ static const unsigned char new_state[16] = {
+ /* TCP_CLOSING */ TCP_CLOSING,
+ };
+
+-static int tcp_close_state(struct sock *sk)
++int tcp_close_state(struct sock *sk)
+ {
+ int next = (int)new_state[sk->sk_state];
+ int ns = next & TCP_STATE_MASK;
+@@ -2100,7 +2183,7 @@ void tcp_shutdown(struct sock *sk, int how)
+ TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+ /* Clear out any half completed packets. FIN if needed. */
+ if (tcp_close_state(sk))
+- tcp_send_fin(sk);
++ tcp_sk(sk)->ops->send_fin(sk);
+ }
+ }
+ EXPORT_SYMBOL(tcp_shutdown);
+@@ -2125,6 +2208,11 @@ void tcp_close(struct sock *sk, long timeout)
+ int data_was_unread = 0;
+ int state;
+
++ if (is_meta_sk(sk)) {
++ mptcp_close(sk, timeout);
++ return;
++ }
++
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+@@ -2167,7 +2255,7 @@ void tcp_close(struct sock *sk, long timeout)
+ /* Unread data was tossed, zap the connection. */
+ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
+ tcp_set_state(sk, TCP_CLOSE);
+- tcp_send_active_reset(sk, sk->sk_allocation);
++ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
+ } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->sk_prot->disconnect(sk, 0);
+@@ -2247,7 +2335,7 @@ adjudge_to_death:
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tp->linger2 < 0) {
+ tcp_set_state(sk, TCP_CLOSE);
+- tcp_send_active_reset(sk, GFP_ATOMIC);
++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPABORTONLINGER);
+ } else {
+@@ -2257,7 +2345,8 @@ adjudge_to_death:
+ inet_csk_reset_keepalive_timer(sk,
+ tmo - TCP_TIMEWAIT_LEN);
+ } else {
+- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
++ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
++ tmo);
+ goto out;
+ }
+ }
+@@ -2266,7 +2355,7 @@ adjudge_to_death:
+ sk_mem_reclaim(sk);
+ if (tcp_check_oom(sk, 0)) {
+ tcp_set_state(sk, TCP_CLOSE);
+- tcp_send_active_reset(sk, GFP_ATOMIC);
++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPABORTONMEMORY);
+ }
+@@ -2291,15 +2380,6 @@ out:
+ }
+ EXPORT_SYMBOL(tcp_close);
+
+-/* These states need RST on ABORT according to RFC793 */
+-
+-static inline bool tcp_need_reset(int state)
+-{
+- return (1 << state) &
+- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
+- TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
+-}
+-
+ int tcp_disconnect(struct sock *sk, int flags)
+ {
+ struct inet_sock *inet = inet_sk(sk);
+@@ -2322,7 +2402,7 @@ int tcp_disconnect(struct sock *sk, int flags)
+ /* The last check adjusts for discrepancy of Linux wrt. RFC
+ * states
+ */
+- tcp_send_active_reset(sk, gfp_any());
++ tp->ops->send_active_reset(sk, gfp_any());
+ sk->sk_err = ECONNRESET;
+ } else if (old_state == TCP_SYN_SENT)
+ sk->sk_err = ECONNRESET;
+@@ -2340,6 +2420,13 @@ int tcp_disconnect(struct sock *sk, int flags)
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
++ if (is_meta_sk(sk)) {
++ mptcp_disconnect(sk);
++ } else {
++ if (tp->inside_tk_table)
++ mptcp_hash_remove_bh(tp);
++ }
++
+ sk->sk_shutdown = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+ tp->srtt_us = 0;
+@@ -2632,6 +2719,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
+ break;
+
+ case TCP_DEFER_ACCEPT:
++ /* An established MPTCP-connection (mptcp(tp) only returns true
++ * if the socket is established) should not use DEFER on new
++ * subflows.
++ */
++ if (mptcp(tp))
++ break;
+ /* Translate value in seconds to number of retransmits */
+ icsk->icsk_accept_queue.rskq_defer_accept =
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+@@ -2659,7 +2752,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+ inet_csk_ack_scheduled(sk)) {
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+- tcp_cleanup_rbuf(sk, 1);
++ tp->ops->cleanup_rbuf(sk, 1);
+ if (!(val & 1))
+ icsk->icsk_ack.pingpong = 1;
+ }
+@@ -2699,6 +2792,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
+ tp->notsent_lowat = val;
+ sk->sk_write_space(sk);
+ break;
++#ifdef CONFIG_MPTCP
++ case MPTCP_ENABLED:
++ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) {
++ if (val)
++ tp->mptcp_enabled = 1;
++ else
++ tp->mptcp_enabled = 0;
++ } else {
++ err = -EPERM;
++ }
++ break;
++#endif
+ default:
+ err = -ENOPROTOOPT;
+ break;
+@@ -2931,6 +3036,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
+ case TCP_NOTSENT_LOWAT:
+ val = tp->notsent_lowat;
+ break;
++#ifdef CONFIG_MPTCP
++ case MPTCP_ENABLED:
++ val = tp->mptcp_enabled;
++ break;
++#endif
+ default:
+ return -ENOPROTOOPT;
+ }
+@@ -3120,8 +3230,11 @@ void tcp_done(struct sock *sk)
+ if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+
++ WARN_ON(sk->sk_state == TCP_CLOSE);
+ tcp_set_state(sk, TCP_CLOSE);
++
+ tcp_clear_xmit_timers(sk);
++
+ if (req != NULL)
+ reqsk_fastopen_remove(sk, req, false);
+
+diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
+index 9771563ab564..5c230d96c4c1 100644
+--- a/net/ipv4/tcp_fastopen.c
++++ b/net/ipv4/tcp_fastopen.c
+@@ -7,6 +7,7 @@
+ #include <linux/rculist.h>
+ #include <net/inetpeer.h>
+ #include <net/tcp.h>
++#include <net/mptcp.h>
+
+ int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
+
+@@ -133,7 +134,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
+ {
+ struct tcp_sock *tp;
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+- struct sock *child;
++ struct sock *child, *meta_sk;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+@@ -176,13 +177,6 @@ static bool tcp_fastopen_create_child(struct sock *sk,
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, child);
+
+- /* Now finish processing the fastopen child socket. */
+- inet_csk(child)->icsk_af_ops->rebuild_header(child);
+- tcp_init_congestion_control(child);
+- tcp_mtup_init(child);
+- tcp_init_metrics(child);
+- tcp_init_buffer_space(child);
+-
+ /* Queue the data carried in the SYN packet. We need to first
+ * bump skb's refcnt because the caller will attempt to free it.
+ *
+@@ -199,8 +193,24 @@ static bool tcp_fastopen_create_child(struct sock *sk,
+ tp->syn_data_acked = 1;
+ }
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
++
++ meta_sk = child;
++ if (!mptcp_check_req_fastopen(meta_sk, req)) {
++ child = tcp_sk(meta_sk)->mpcb->master_sk;
++ tp = tcp_sk(child);
++ }
++
++ /* Now finish processing the fastopen child socket. */
++ inet_csk(child)->icsk_af_ops->rebuild_header(child);
++ tp->ops->init_congestion_control(child);
++ tcp_mtup_init(child);
++ tcp_init_metrics(child);
++ tp->ops->init_buffer_space(child);
++
+ sk->sk_data_ready(sk);
+- bh_unlock_sock(child);
++ if (mptcp(tcp_sk(child)))
++ bh_unlock_sock(child);
++ bh_unlock_sock(meta_sk);
+ sock_put(child);
+ WARN_ON(req->sk == NULL);
+ return true;
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 40639c288dc2..3273bb69f387 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -74,6 +74,9 @@
+ #include <linux/ipsec.h>
+ #include <asm/unaligned.h>
+ #include <net/netdma.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++#include <net/mptcp_v6.h>
+
+ int sysctl_tcp_timestamps __read_mostly = 1;
+ int sysctl_tcp_window_scaling __read_mostly = 1;
+@@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly;
+ int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
+ int sysctl_tcp_early_retrans __read_mostly = 3;
+
+-#define FLAG_DATA 0x01 /* Incoming frame contained data. */
+-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
+-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
+-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
+-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
+-#define FLAG_DATA_SACKED 0x20 /* New SACK. */
+-#define FLAG_ECE 0x40 /* ECE in this ACK */
+-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
+-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
+-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
+-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
+-
+-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
+-
+ #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+ #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+@@ -181,7 +165,7 @@ static void tcp_incr_quickack(struct sock *sk)
+ icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ }
+
+-static void tcp_enter_quickack_mode(struct sock *sk)
++void tcp_enter_quickack_mode(struct sock *sk)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_incr_quickack(sk);
+@@ -283,8 +267,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
+ per_mss = roundup_pow_of_two(per_mss) +
+ SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+- nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
++ if (mptcp(tp)) {
++ nr_segs = mptcp_check_snd_buf(tp);
++ } else {
++ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
++ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
++ }
+
+ /* Fast Recovery (RFC 5681 3.2) :
+ * Cubic needs 1.7 factor, rounded to 2 to include
+@@ -292,8 +280,16 @@ static void tcp_sndbuf_expand(struct sock *sk)
+ */
+ sndmem = 2 * nr_segs * per_mss;
+
+- if (sk->sk_sndbuf < sndmem)
++ /* MPTCP: after this sndmem is the new contribution of the
++ * current subflow to the aggregated sndbuf */
++ if (sk->sk_sndbuf < sndmem) {
++ int old_sndbuf = sk->sk_sndbuf;
+ sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
++ /* MPTCP: ok, the subflow sndbuf has grown, reflect
++ * this in the aggregate buffer. */
++ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
++ mptcp_update_sndbuf(tp);
++ }
+ }
+
+ /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
+@@ -342,10 +338,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
+ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+
+ /* Check #1 */
+- if (tp->rcv_ssthresh < tp->window_clamp &&
+- (int)tp->rcv_ssthresh < tcp_space(sk) &&
++ if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
++ (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
+ !sk_under_memory_pressure(sk)) {
+ int incr;
+
+@@ -353,14 +351,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+ * will fit to rcvbuf in future.
+ */
+ if (tcp_win_from_space(skb->truesize) <= skb->len)
+- incr = 2 * tp->advmss;
++ incr = 2 * meta_tp->advmss;
+ else
+- incr = __tcp_grow_window(sk, skb);
++ incr = __tcp_grow_window(meta_sk, skb);
+
+ if (incr) {
+ incr = max_t(int, incr, 2 * skb->len);
+- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
+- tp->window_clamp);
++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
++ meta_tp->window_clamp);
+ inet_csk(sk)->icsk_ack.quick |= 1;
+ }
+ }
+@@ -543,7 +541,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
+ int copied;
+
+ time = tcp_time_stamp - tp->rcvq_space.time;
+- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
++ if (mptcp(tp)) {
++ if (mptcp_check_rtt(tp, time))
++ return;
++ } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
+ return;
+
+ /* Number of bytes copied to user in last RTT */
+@@ -761,7 +762,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
+ /* Calculate rto without backoff. This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+-static void tcp_set_rto(struct sock *sk)
++void tcp_set_rto(struct sock *sk)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+ /* Old crap is replaced with new one. 8)
+@@ -1376,7 +1377,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+ int len;
+ int in_sack;
+
+- if (!sk_can_gso(sk))
++ /* For MPTCP we cannot shift skb-data and remove one skb from the
++ * send-queue, because this will make us lose the DSS-option (which
++ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
++ */
++ if (!sk_can_gso(sk) || mptcp(tp))
+ goto fallback;
+
+ /* Normally R but no L won't result in plain S */
+@@ -2915,7 +2920,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+ return false;
+
+ tcp_rtt_estimator(sk, seq_rtt_us);
+- tcp_set_rto(sk);
++ tp->ops->set_rto(sk);
+
+ /* RFC6298: only reset backoff on valid RTT measurement. */
+ inet_csk(sk)->icsk_backoff = 0;
+@@ -3000,7 +3005,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
+ }
+
+ /* If we get here, the whole TSO packet has not been acked. */
+-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 packets_acked;
+@@ -3095,6 +3100,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
+ */
+ if (!(scb->tcp_flags & TCPHDR_SYN)) {
+ flag |= FLAG_DATA_ACKED;
++ if (mptcp(tp) && mptcp_is_data_seq(skb))
++ flag |= MPTCP_FLAG_DATA_ACKED;
+ } else {
+ flag |= FLAG_SYN_ACKED;
+ tp->retrans_stamp = 0;
+@@ -3189,7 +3196,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
+ return flag;
+ }
+
+-static void tcp_ack_probe(struct sock *sk)
++void tcp_ack_probe(struct sock *sk)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+@@ -3236,9 +3243,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+ /* Check that window update is acceptable.
+ * The function assumes that snd_una<=ack<=snd_next.
+ */
+-static inline bool tcp_may_update_window(const struct tcp_sock *tp,
+- const u32 ack, const u32 ack_seq,
+- const u32 nwin)
++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
++ const u32 ack_seq, const u32 nwin)
+ {
+ return after(ack, tp->snd_una) ||
+ after(ack_seq, tp->snd_wl1) ||
+@@ -3357,7 +3363,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+ }
+
+ /* This routine deals with incoming acks, but not outgoing ones. */
+-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
++static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+@@ -3449,6 +3455,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+ sack_rtt_us);
+ acked -= tp->packets_out;
+
++ if (mptcp(tp)) {
++ if (mptcp_fallback_infinite(sk, flag)) {
++ pr_err("%s resetting flow\n", __func__);
++ mptcp_send_reset(sk);
++ goto invalid_ack;
++ }
++
++ mptcp_clean_rtx_infinite(skb, sk);
++ }
++
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
+@@ -3512,8 +3528,9 @@ old_ack:
+ * the fast version below fails.
+ */
+ void tcp_parse_options(const struct sk_buff *skb,
+- struct tcp_options_received *opt_rx, int estab,
+- struct tcp_fastopen_cookie *foc)
++ struct tcp_options_received *opt_rx,
++ struct mptcp_options_received *mopt,
++ int estab, struct tcp_fastopen_cookie *foc)
+ {
+ const unsigned char *ptr;
+ const struct tcphdr *th = tcp_hdr(skb);
+@@ -3596,6 +3613,9 @@ void tcp_parse_options(const struct sk_buff *skb,
+ */
+ break;
+ #endif
++ case TCPOPT_MPTCP:
++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
++ break;
+ case TCPOPT_EXP:
+ /* Fast Open option shares code 254 using a
+ * 16 bits magic number. It's valid only in
+@@ -3657,8 +3677,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
+ if (tcp_parse_aligned_timestamp(tp, th))
+ return true;
+ }
+-
+- tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
++ tcp_parse_options(skb, &tp->rx_opt, mptcp(tp) ? &tp->mptcp->rx_opt : NULL,
++ 1, NULL);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+@@ -3831,6 +3851,8 @@ static void tcp_fin(struct sock *sk)
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst_metric(dst, RTAX_QUICKACK))
+ inet_csk(sk)->icsk_ack.pingpong = 1;
++ if (mptcp(tp))
++ mptcp_sub_close_passive(sk);
+ break;
+
+ case TCP_CLOSE_WAIT:
+@@ -3852,9 +3874,16 @@ static void tcp_fin(struct sock *sk)
+ tcp_set_state(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
++ if (mptcp(tp)) {
++ /* The socket will get closed by mptcp_data_ready.
++ * We first have to process all data-sequences.
++ */
++ tp->close_it = 1;
++ break;
++ }
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
+ tcp_send_ack(sk);
+- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
+ break;
+ default:
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+@@ -3876,6 +3905,10 @@ static void tcp_fin(struct sock *sk)
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+
++ /* Don't wake up MPTCP-subflows */
++ if (mptcp(tp))
++ return;
++
+ /* Do not send POLL_HUP for half duplex close. */
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+@@ -4073,7 +4106,11 @@ static void tcp_ofo_queue(struct sock *sk)
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
+ }
+
+- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
++ /* In case of MPTCP, the segment may be empty if it's a
++ * non-data DATA_FIN. (see beginning of tcp_data_queue)
++ */
++ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
++ !(mptcp(tp) && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
+ SOCK_DEBUG(sk, "ofo packet was already received\n");
+ __skb_unlink(skb, &tp->out_of_order_queue);
+ __kfree_skb(skb);
+@@ -4091,12 +4128,14 @@ static void tcp_ofo_queue(struct sock *sk)
+ }
+ }
+
+-static bool tcp_prune_ofo_queue(struct sock *sk);
+ static int tcp_prune_queue(struct sock *sk);
+
+ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ unsigned int size)
+ {
++ if (mptcp(tcp_sk(sk)))
++ sk = mptcp_meta_sk(sk);
++
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ !sk_rmem_schedule(sk, skb, size)) {
+
+@@ -4104,7 +4143,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ return -1;
+
+ if (!sk_rmem_schedule(sk, skb, size)) {
+- if (!tcp_prune_ofo_queue(sk))
++ if (!tcp_sk(sk)->ops->prune_ofo_queue(sk))
+ return -1;
+
+ if (!sk_rmem_schedule(sk, skb, size))
+@@ -4127,15 +4166,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+-static bool tcp_try_coalesce(struct sock *sk,
+- struct sk_buff *to,
+- struct sk_buff *from,
+- bool *fragstolen)
++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
++ bool *fragstolen)
+ {
+ int delta;
+
+ *fragstolen = false;
+
++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
++ return false;
++
+ if (tcp_hdr(from)->fin)
+ return false;
+
+@@ -4225,7 +4265,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+
+ /* Do skb overlap to previous one? */
+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
++ !(mptcp(tp) && end_seq == seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb);
+@@ -4263,6 +4305,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ end_seq);
+ break;
+ }
++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
++ if (mptcp(tp) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
++ continue;
+ __skb_unlink(skb1, &tp->out_of_order_queue);
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+@@ -4280,8 +4325,8 @@ end:
+ }
+ }
+
+-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+- bool *fragstolen)
++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
++ bool *fragstolen)
+ {
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+@@ -4343,7 +4388,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+ int eaten = -1;
+ bool fragstolen = false;
+
+- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
++ /* If no data is present, but a data_fin is in the options, we still
++ * have to call mptcp_queue_skb later on. */
++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
++ !(mptcp(tp) && mptcp_is_data_fin(skb)))
+ goto drop;
+
+ skb_dst_drop(skb);
+@@ -4389,7 +4437,7 @@ queue_and_out:
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
+ }
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+- if (skb->len)
++ if (skb->len || mptcp_is_data_fin(skb))
+ tcp_event_data_recv(sk, skb);
+ if (th->fin)
+ tcp_fin(sk);
+@@ -4411,7 +4459,11 @@ queue_and_out:
+
+ if (eaten > 0)
+ kfree_skb_partial(skb, fragstolen);
+- if (!sock_flag(sk, SOCK_DEAD))
++ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp))
++ /* MPTCP: we always have to call data_ready, because
++ * we may be about to receive a data-fin, which still
++ * must get queued.
++ */
+ sk->sk_data_ready(sk);
+ return;
+ }
+@@ -4463,6 +4515,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+ next = skb_queue_next(list, skb);
+
+ __skb_unlink(skb, list);
++ if (mptcp(tcp_sk(sk)))
++ mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+
+@@ -4630,7 +4684,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ * Purge the out-of-order queue.
+ * Return true if queue was pruned.
+ */
+-static bool tcp_prune_ofo_queue(struct sock *sk)
++bool tcp_prune_ofo_queue(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool res = false;
+@@ -4686,7 +4740,7 @@ static int tcp_prune_queue(struct sock *sk)
+ /* Collapsing did not help, destructive actions follow.
+ * This must not ever occur. */
+
+- tcp_prune_ofo_queue(sk);
++ tp->ops->prune_ofo_queue(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+@@ -4702,7 +4756,29 @@ static int tcp_prune_queue(struct sock *sk)
+ return -1;
+ }
+
+-static bool tcp_should_expand_sndbuf(const struct sock *sk)
++/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
++ * As additional protections, we do not touch cwnd in retransmission phases,
++ * and if application hit its sndbuf limit recently.
++ */
++void tcp_cwnd_application_limited(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
++ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
++ /* Limited by application or receiver window. */
++ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
++ u32 win_used = max(tp->snd_cwnd_used, init_win);
++ if (win_used < tp->snd_cwnd) {
++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
++ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
++ }
++ tp->snd_cwnd_used = 0;
++ }
++ tp->snd_cwnd_stamp = tcp_time_stamp;
++}
++
++bool tcp_should_expand_sndbuf(const struct sock *sk)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+@@ -4737,7 +4813,7 @@ static void tcp_new_space(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+- if (tcp_should_expand_sndbuf(sk)) {
++ if (tp->ops->should_expand_sndbuf(sk)) {
+ tcp_sndbuf_expand(sk);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
+@@ -4749,8 +4825,9 @@ static void tcp_check_space(struct sock *sk)
+ {
+ if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
+ sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+- if (sk->sk_socket &&
+- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
++ if (mptcp(tcp_sk(sk)) ||
++ (sk->sk_socket &&
++ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
+ tcp_new_space(sk);
+ }
+ }
+@@ -4773,7 +4850,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+ /* ... and right edge of window advances far enough.
+ * (tcp_recvmsg() will send ACK otherwise). Or...
+ */
+- __tcp_select_window(sk) >= tp->rcv_wnd) ||
++ tp->ops->__select_window(sk) >= tp->rcv_wnd) ||
+ /* We ACK each frame or... */
+ tcp_in_quickack_mode(sk) ||
+ /* We have out of order data. */
+@@ -4875,6 +4952,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+
++ /* MPTCP urgent data is not yet supported */
++ if (mptcp(tp))
++ return;
++
+ /* Check if we get a new urgent pointer - normally not. */
+ if (th->urg)
+ tcp_check_urg(sk, th);
+@@ -4942,8 +5023,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
+ }
+
+ #ifdef CONFIG_NET_DMA
+-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
+- int hlen)
++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ int chunk = skb->len - hlen;
+@@ -5052,9 +5132,15 @@ syn_challenge:
+ goto discard;
+ }
+
++ /* If valid: post process the received MPTCP options. */
++ if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
++ goto discard;
++
+ return true;
+
+ discard:
++ if (mptcp(tp))
++ mptcp_reset_mopt(tp);
+ __kfree_skb(skb);
+ return false;
+ }
+@@ -5106,6 +5192,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+
+ tp->rx_opt.saw_tstamp = 0;
+
++ /* MPTCP: force slowpath. */
++ if (mptcp(tp))
++ goto slow_path;
++
+ /* pred_flags is 0xS?10 << 16 + snd_wnd
+ * if header_prediction is to be made
+ * 'S' will always be tp->tcp_header_len >> 2
+@@ -5205,7 +5295,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ }
+ if (copied_early)
+- tcp_cleanup_rbuf(sk, skb->len);
++ tp->ops->cleanup_rbuf(sk, skb->len);
+ }
+ if (!eaten) {
+ if (tcp_checksum_complete_user(sk, skb))
+@@ -5313,14 +5403,14 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+
+ tcp_init_metrics(sk);
+
+- tcp_init_congestion_control(sk);
++ tp->ops->init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+- tcp_init_buffer_space(sk);
++ tp->ops->init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+@@ -5350,7 +5440,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+ /* Get original SYNACK MSS value if user MSS sets mss_clamp */
+ tcp_clear_options(&opt);
+ opt.user_mss = opt.mss_clamp = 0;
+- tcp_parse_options(synack, &opt, 0, NULL);
++ tcp_parse_options(synack, &opt, NULL, 0, NULL);
+ mss = opt.mss_clamp;
+ }
+
+@@ -5365,7 +5455,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
+
+- if (data) { /* Retransmit unacked data in SYN */
++ /* In mptcp case, we do not rely on "retransmit", but instead on
++ * "transmit", because if fastopen data is not acked, the retransmission
++ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
++ */
++ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */
+ tcp_for_write_queue_from(data, sk) {
+ if (data == tcp_send_head(sk) ||
+ __tcp_retransmit_skb(sk, data))
+@@ -5388,8 +5482,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int saved_clamp = tp->rx_opt.mss_clamp;
++ struct mptcp_options_received mopt;
++ mptcp_init_mp_opt(&mopt);
+
+- tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
++ tcp_parse_options(skb, &tp->rx_opt,
++ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+@@ -5448,6 +5545,30 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_ack(sk, skb, FLAG_SLOWPATH);
+
++ if (tp->request_mptcp || mptcp(tp)) {
++ int ret;
++ ret = mptcp_rcv_synsent_state_process(sk, &sk,
++ skb, &mopt);
++
++ /* May have changed if we support MPTCP */
++ tp = tcp_sk(sk);
++ icsk = inet_csk(sk);
++
++ if (ret == 1)
++ goto reset_and_undo;
++ if (ret == 2)
++ goto discard;
++ }
++
++ if (mptcp(tp) && !is_master_tp(tp)) {
++ /* Timer for repeating the ACK until an answer
++ * arrives. Used only when establishing an additional
++ * subflow inside of an MPTCP connection.
++ */
++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
++ jiffies + icsk->icsk_rto);
++ }
++
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
+ */
+@@ -5474,6 +5595,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
++ if (mptcp(tp)) {
++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
++ }
++
+ if (tcp_is_sack(tp) && sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+
+@@ -5494,9 +5620,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ tcp_rcv_fastopen_synack(sk, skb, &foc))
+ return -1;
+
+- if (sk->sk_write_pending ||
++ /* With MPTCP we cannot send data on the third ack due to the
++ * lack of option-space to combine with an MP_CAPABLE.
++ */
++ if (!mptcp(tp) && (sk->sk_write_pending ||
+ icsk->icsk_accept_queue.rskq_defer_accept ||
+- icsk->icsk_ack.pingpong) {
++ icsk->icsk_ack.pingpong)) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+@@ -5536,6 +5665,7 @@ discard:
+ tcp_paws_reject(&tp->rx_opt, 0))
+ goto discard_and_undo;
+
++ /* TODO - check this here for MPTCP */
+ if (th->syn) {
+ /* We see SYN without ACK. It is attempt of
+ * simultaneous connect with crossed SYNs.
+@@ -5552,6 +5682,11 @@ discard:
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
++ if (mptcp(tp)) {
++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
++ }
++
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+@@ -5610,6 +5745,7 @@ reset_and_undo:
+
+ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
++ __releases(&sk->sk_lock.slock)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+@@ -5661,6 +5797,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+
+ case TCP_SYN_SENT:
+ queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
++ if (is_meta_sk(sk)) {
++ sk = tcp_sk(sk)->mpcb->master_sk;
++ tp = tcp_sk(sk);
++
++ /* Need to call it here, because it will announce new
++ * addresses, which can only be done after the third ack
++ * of the 3-way handshake.
++ */
++ mptcp_update_metasocket(sk, tp->meta_sk);
++ }
+ if (queued >= 0)
+ return queued;
+
+@@ -5668,6 +5814,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ tcp_urg(sk, skb, th);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
++ if (mptcp(tp) && is_master_tp(tp))
++ bh_unlock_sock(sk);
+ return 0;
+ }
+
+@@ -5706,11 +5854,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ synack_stamp = tp->lsndtime;
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+- tcp_init_congestion_control(sk);
++ tp->ops->init_congestion_control(sk);
+
+ tcp_mtup_init(sk);
+ tp->copied_seq = tp->rcv_nxt;
+- tcp_init_buffer_space(sk);
++ tp->ops->init_buffer_space(sk);
+ }
+ smp_mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+@@ -5730,6 +5878,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
++ if (mptcp(tp))
++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
+
+ if (req) {
+ /* Re-arm the timer because data may have been sent out.
+@@ -5751,6 +5901,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+
+ tcp_initialize_rcv_mss(sk);
+ tcp_fast_path_on(tp);
++ /* Send an ACK when establishing a new
++ * MPTCP subflow, i.e. using an MP_JOIN
++ * subtype.
++ */
++ if (mptcp(tp) && !is_master_tp(tp))
++ tcp_send_ack(sk);
+ break;
+
+ case TCP_FIN_WAIT1: {
+@@ -5802,7 +5958,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ tmo = tcp_fin_time(sk);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+- } else if (th->fin || sock_owned_by_user(sk)) {
++ } else if (th->fin || mptcp_is_data_fin(skb) ||
++ sock_owned_by_user(sk)) {
+ /* Bad case. We could lose such FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and not so rare event. We still can lose it now,
+@@ -5811,7 +5968,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ */
+ inet_csk_reset_keepalive_timer(sk, tmo);
+ } else {
+- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto discard;
+ }
+ break;
+@@ -5819,7 +5976,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
+ goto discard;
+ }
+ break;
+@@ -5831,6 +5988,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ goto discard;
+ }
+ break;
++ case TCP_CLOSE:
++ if (tp->mp_killed)
++ goto discard;
+ }
+
+ /* step 6: check the URG bit */
+@@ -5851,7 +6011,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ */
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
++ !mptcp(tp)) {
++ /* In case of mptcp, the reset is handled by
++ * mptcp_rcv_state_process
++ */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ tcp_reset(sk);
+ return 1;
+@@ -5877,3 +6041,154 @@ discard:
+ return 0;
+ }
+ EXPORT_SYMBOL(tcp_rcv_state_process);
++
++static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
++{
++ struct inet_request_sock *ireq = inet_rsk(req);
++
++ if (family == AF_INET)
++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
++ &ireq->ir_rmt_addr, port);
++#if IS_ENABLED(CONFIG_IPV6)
++ else if (family == AF_INET6)
++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
++ &ireq->ir_v6_rmt_addr, port);
++#endif
++}
++
++int tcp_conn_request(struct request_sock_ops *rsk_ops,
++ const struct tcp_request_sock_ops *af_ops,
++ struct sock *sk, struct sk_buff *skb)
++{
++ struct tcp_options_received tmp_opt;
++ struct request_sock *req;
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct dst_entry *dst = NULL;
++ __u32 isn = TCP_SKB_CB(skb)->when;
++ bool want_cookie = false, fastopen;
++ struct flowi fl;
++ struct tcp_fastopen_cookie foc = { .len = -1 };
++ int err;
++
++
++ /* TW buckets are converted to open requests without
++ * limitations, they conserve resources and peer is
++ * evidently real one.
++ */
++ if ((sysctl_tcp_syncookies == 2 ||
++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
++ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
++ if (!want_cookie)
++ goto drop;
++ }
++
++
++ /* Accept backlog is full. If we have already queued enough
++ * of warm entries in syn queue, drop request. It is better than
++ * clogging syn queue with openreqs with exponentially increasing
++ * timeout.
++ */
++ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
++ goto drop;
++ }
++
++ req = inet_reqsk_alloc(rsk_ops);
++ if (!req)
++ goto drop;
++
++ tcp_rsk(req)->af_specific = af_ops;
++
++ tcp_clear_options(&tmp_opt);
++ tmp_opt.mss_clamp = af_ops->mss_clamp;
++ tmp_opt.user_mss = tp->rx_opt.user_mss;
++ tcp_parse_options(skb, &tmp_opt, NULL, 0, want_cookie ? NULL : &foc);
++
++ if (want_cookie && !tmp_opt.saw_tstamp)
++ tcp_clear_options(&tmp_opt);
++
++ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
++ tcp_openreq_init(req, &tmp_opt, skb);
++
++ if (af_ops->init_req(req, sk, skb))
++ goto drop_and_free;
++
++ if (security_inet_conn_request(sk, skb, req))
++ goto drop_and_free;
++
++ if (!want_cookie || tmp_opt.tstamp_ok)
++ TCP_ECN_create_request(req, skb, sock_net(sk));
++
++ if (want_cookie) {
++ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
++ req->cookie_ts = tmp_opt.tstamp_ok;
++ } else if (!isn) {
++ /* VJ's idea. We save last timestamp seen
++ * from the destination in peer table, when entering
++ * state TIME-WAIT, and check against it before
++ * accepting new connection request.
++ *
++ * If "isn" is not zero, this request hit alive
++ * timewait bucket, so that all the necessary checks
++ * are made in the function processing timewait state.
++ */
++ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
++ bool strict;
++
++ dst = af_ops->route_req(sk, &fl, req, &strict);
++ if (dst && strict &&
++ !tcp_peer_is_proven(req, dst, true)) {
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
++ goto drop_and_release;
++ }
++ }
++ /* Kill the following clause, if you dislike this way. */
++ else if (!sysctl_tcp_syncookies &&
++ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
++ (sysctl_max_syn_backlog >> 2)) &&
++ !tcp_peer_is_proven(req, dst, false)) {
++ /* Without syncookies last quarter of
++ * backlog is filled with destinations,
++ * proven to be alive.
++ * It means that we continue to communicate
++ * to destinations, already remembered
++ * to the moment of synflood.
++ */
++ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
++ rsk_ops->family);
++ goto drop_and_release;
++ }
++
++ isn = af_ops->init_seq(skb);
++ }
++ if (!dst) {
++ dst = af_ops->route_req(sk, &fl, req, NULL);
++ if (!dst)
++ goto drop_and_free;
++ }
++
++ tcp_rsk(req)->snt_isn = isn;
++ tcp_openreq_init_rwin(req, sk, dst);
++ fastopen = !want_cookie &&
++ tcp_try_fastopen(sk, skb, req, &foc, dst);
++ err = af_ops->send_synack(sk, dst, &fl, req,
++ skb_get_queue_mapping(skb), &foc);
++ if (!fastopen) {
++ if (err || want_cookie)
++ goto drop_and_free;
++
++ tcp_rsk(req)->listener = NULL;
++ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
++ }
++
++ return 0;
++
++drop_and_release:
++ dst_release(dst);
++drop_and_free:
++ reqsk_free(req);
++drop:
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
++ return 0;
++}
++EXPORT_SYMBOL(tcp_conn_request);
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index 77cccda1ad0c..c77017f600f1 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -67,6 +67,8 @@
+ #include <net/icmp.h>
+ #include <net/inet_hashtables.h>
+ #include <net/tcp.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
+ #include <net/transp_v6.h>
+ #include <net/ipv6.h>
+ #include <net/inet_common.h>
+@@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ struct inet_hashinfo tcp_hashinfo;
+ EXPORT_SYMBOL(tcp_hashinfo);
+
+-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
++__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+ {
+ return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+@@ -334,7 +336,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ struct inet_sock *inet;
+ const int type = icmp_hdr(icmp_skb)->type;
+ const int code = icmp_hdr(icmp_skb)->code;
+- struct sock *sk;
++ struct sock *sk, *meta_sk;
+ struct sk_buff *skb;
+ struct request_sock *fastopen;
+ __u32 seq, snd_una;
+@@ -358,13 +360,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ return;
+ }
+
+- bh_lock_sock(sk);
++ tp = tcp_sk(sk);
++ if (mptcp(tp))
++ meta_sk = mptcp_meta_sk(sk);
++ else
++ meta_sk = sk;
++
++ bh_lock_sock(meta_sk);
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ * We do take care of PMTU discovery (RFC1191) special case :
+ * we can receive locally generated ICMP messages while socket is held.
+ */
+- if (sock_owned_by_user(sk)) {
++ if (sock_owned_by_user(meta_sk)) {
+ if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ }
+@@ -377,7 +385,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ }
+
+ icsk = inet_csk(sk);
+- tp = tcp_sk(sk);
+ seq = ntohl(th->seq);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = tp->fastopen_rsk;
+@@ -411,11 +418,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ goto out;
+
+ tp->mtu_info = info;
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ tcp_v4_mtu_reduced(sk);
+ } else {
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ sock_hold(sk);
++ if (mptcp(tp))
++ mptcp_tsq_flags(sk);
+ }
+ goto out;
+ }
+@@ -429,7 +438,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ !icsk->icsk_backoff || fastopen)
+ break;
+
+- if (sock_owned_by_user(sk))
++ if (sock_owned_by_user(meta_sk))
+ break;
+
+ icsk->icsk_backoff--;
+@@ -463,7 +472,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ switch (sk->sk_state) {
+ struct request_sock *req, **prev;
+ case TCP_LISTEN:
+- if (sock_owned_by_user(sk))
++ if (sock_owned_by_user(meta_sk))
+ goto out;
+
+ req = inet_csk_search_req(sk, &prev, th->dest,
+@@ -499,7 +508,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ if (fastopen && fastopen->sk == NULL)
+ break;
+
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ sk->sk_err = err;
+
+ sk->sk_error_report(sk);
+@@ -528,7 +537,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ */
+
+ inet = inet_sk(sk);
+- if (!sock_owned_by_user(sk) && inet->recverr) {
++ if (!sock_owned_by_user(meta_sk) && inet->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else { /* Only an error on timeout */
+@@ -536,7 +545,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+ }
+
+ out:
+- bh_unlock_sock(sk);
++ bh_unlock_sock(meta_sk);
+ sock_put(sk);
+ }
+
+@@ -578,7 +587,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
+ * Exception: precedence violation. We do not implement it in any case.
+ */
+
+-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+ {
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct {
+@@ -702,10 +711,10 @@ release_sk1:
+ outside socket context is ugly, certainly. What can I do?
+ */
+
+-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
++static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
+ u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_md5sig_key *key,
+- int reply_flags, u8 tos)
++ int reply_flags, u8 tos, int mptcp)
+ {
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct {
+@@ -714,6 +723,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+ #ifdef CONFIG_TCP_MD5SIG
+ + (TCPOLEN_MD5SIG_ALIGNED >> 2)
+ #endif
++#ifdef CONFIG_MPTCP
++ + ((MPTCP_SUB_LEN_DSS >> 2) +
++ (MPTCP_SUB_LEN_ACK >> 2))
++#endif
+ ];
+ } rep;
+ struct ip_reply_arg arg;
+@@ -758,6 +771,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+ ip_hdr(skb)->daddr, &rep.th);
+ }
+ #endif
++#ifdef CONFIG_MPTCP
++ if (mptcp) {
++ int offset = (tsecr) ? 3 : 0;
++ /* Construction of 32-bit data_ack */
++ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
++ (0x20 << 8) |
++ (0x01));
++ rep.opt[offset] = htonl(data_ack);
++
++ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
++ rep.th.doff = arg.iov[0].iov_len / 4;
++ }
++#endif /* CONFIG_MPTCP */
++
+ arg.flags = reply_flags;
+ arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr, /* XXX */
+@@ -776,36 +804,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+ {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
++ u32 data_ack = 0;
++ int mptcp = 0;
++
++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
++ mptcp = 1;
++ }
+
+ tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
++ data_ack,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
+ tcptw->tw_ts_recent,
+ tw->tw_bound_dev_if,
+ tcp_twsk_md5_key(tcptw),
+ tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
+- tw->tw_tos
++ tw->tw_tos, mptcp
+ );
+
+ inet_twsk_put(tw);
+ }
+
+-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+- struct request_sock *req)
++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
++ struct request_sock *req)
+ {
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
++ tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
+ tcp_time_stamp,
+ req->ts_recent,
+ 0,
+ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
+ AF_INET),
+ inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
+- ip_hdr(skb)->tos);
++ ip_hdr(skb)->tos, 0);
+ }
+
+ /*
+@@ -813,10 +849,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ * This still operates on a request_sock only, not on a big
+ * socket.
+ */
+-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+- struct request_sock *req,
+- u16 queue_mapping,
+- struct tcp_fastopen_cookie *foc)
++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
++ struct flowi *fl,
++ struct request_sock *req,
++ u16 queue_mapping,
++ struct tcp_fastopen_cookie *foc)
+ {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct flowi4 fl4;
+@@ -844,21 +881,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+ return err;
+ }
+
+-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
+-{
+- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
+-
+- if (!res) {
+- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+- }
+- return res;
+-}
+-
+ /*
+ * IPv4 request_sock destructor.
+ */
+-static void tcp_v4_reqsk_destructor(struct request_sock *req)
++void tcp_v4_reqsk_destructor(struct request_sock *req)
+ {
+ kfree(inet_rsk(req)->opt);
+ }
+@@ -896,7 +922,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
+ /*
+ * Save and compile IPv4 options into the request_sock if needed.
+ */
+-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
+ {
+ const struct ip_options *opt = &(IPCB(skb)->opt);
+ struct ip_options_rcu *dopt = NULL;
+@@ -1237,161 +1263,71 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+
+ #endif
+
++static int tcp_v4_init_req(struct request_sock *req, struct sock *sk,
++ struct sk_buff *skb)
++{
++ struct inet_request_sock *ireq = inet_rsk(req);
++
++ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
++ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
++ ireq->no_srccheck = inet_sk(sk)->transparent;
++ ireq->opt = tcp_v4_save_options(skb);
++ ireq->ir_mark = inet_request_mark(sk, skb);
++
++ return 0;
++}
++
++static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
++ const struct request_sock *req,
++ bool *strict)
++{
++ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
++
++ if (strict) {
++ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
++ *strict = true;
++ else
++ *strict = false;
++ }
++
++ return dst;
++}
++
+ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
+ .family = PF_INET,
+ .obj_size = sizeof(struct tcp_request_sock),
+- .rtx_syn_ack = tcp_v4_rtx_synack,
++ .rtx_syn_ack = tcp_rtx_synack,
+ .send_ack = tcp_v4_reqsk_send_ack,
+ .destructor = tcp_v4_reqsk_destructor,
+ .send_reset = tcp_v4_send_reset,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
+ };
+
++const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
++ .mss_clamp = TCP_MSS_DEFAULT,
+ #ifdef CONFIG_TCP_MD5SIG
+-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+ .md5_lookup = tcp_v4_reqsk_md5_lookup,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
+-};
+ #endif
++ .init_req = tcp_v4_init_req,
++#ifdef CONFIG_SYN_COOKIES
++ .cookie_init_seq = cookie_v4_init_sequence,
++#endif
++ .route_req = tcp_v4_route_req,
++ .init_seq = tcp_v4_init_sequence,
++ .send_synack = tcp_v4_send_synack,
++ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
++};
+
+ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+ {
+- struct tcp_options_received tmp_opt;
+- struct request_sock *req;
+- struct inet_request_sock *ireq;
+- struct tcp_sock *tp = tcp_sk(sk);
+- struct dst_entry *dst = NULL;
+- __be32 saddr = ip_hdr(skb)->saddr;
+- __be32 daddr = ip_hdr(skb)->daddr;
+- __u32 isn = TCP_SKB_CB(skb)->when;
+- bool want_cookie = false, fastopen;
+- struct flowi4 fl4;
+- struct tcp_fastopen_cookie foc = { .len = -1 };
+- int err;
+-
+ /* Never answer to SYNs send to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+- /* TW buckets are converted to open requests without
+- * limitations, they conserve resources and peer is
+- * evidently real one.
+- */
+- if ((sysctl_tcp_syncookies == 2 ||
+- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
+- if (!want_cookie)
+- goto drop;
+- }
+-
+- /* Accept backlog is full. If we have already queued enough
+- * of warm entries in syn queue, drop request. It is better than
+- * clogging syn queue with openreqs with exponentially increasing
+- * timeout.
+- */
+- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+- goto drop;
+- }
+-
+- req = inet_reqsk_alloc(&tcp_request_sock_ops);
+- if (!req)
+- goto drop;
+-
+-#ifdef CONFIG_TCP_MD5SIG
+- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
+-#endif
+-
+- tcp_clear_options(&tmp_opt);
+- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
+- tmp_opt.user_mss = tp->rx_opt.user_mss;
+- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+-
+- if (want_cookie && !tmp_opt.saw_tstamp)
+- tcp_clear_options(&tmp_opt);
+-
+- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+- tcp_openreq_init(req, &tmp_opt, skb);
++ return tcp_conn_request(&tcp_request_sock_ops,
++ &tcp_request_sock_ipv4_ops, sk, skb);
+
+- ireq = inet_rsk(req);
+- ireq->ir_loc_addr = daddr;
+- ireq->ir_rmt_addr = saddr;
+- ireq->no_srccheck = inet_sk(sk)->transparent;
+- ireq->opt = tcp_v4_save_options(skb);
+- ireq->ir_mark = inet_request_mark(sk, skb);
+-
+- if (security_inet_conn_request(sk, skb, req))
+- goto drop_and_free;
+-
+- if (!want_cookie || tmp_opt.tstamp_ok)
+- TCP_ECN_create_request(req, skb, sock_net(sk));
+-
+- if (want_cookie) {
+- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+- req->cookie_ts = tmp_opt.tstamp_ok;
+- } else if (!isn) {
+- /* VJ's idea. We save last timestamp seen
+- * from the destination in peer table, when entering
+- * state TIME-WAIT, and check against it before
+- * accepting new connection request.
+- *
+- * If "isn" is not zero, this request hit alive
+- * timewait bucket, so that all the necessary checks
+- * are made in the function processing timewait state.
+- */
+- if (tmp_opt.saw_tstamp &&
+- tcp_death_row.sysctl_tw_recycle &&
+- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
+- fl4.daddr == saddr) {
+- if (!tcp_peer_is_proven(req, dst, true)) {
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+- goto drop_and_release;
+- }
+- }
+- /* Kill the following clause, if you dislike this way. */
+- else if (!sysctl_tcp_syncookies &&
+- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+- (sysctl_max_syn_backlog >> 2)) &&
+- !tcp_peer_is_proven(req, dst, false)) {
+- /* Without syncookies last quarter of
+- * backlog is filled with destinations,
+- * proven to be alive.
+- * It means that we continue to communicate
+- * to destinations, already remembered
+- * to the moment of synflood.
+- */
+- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
+- &saddr, ntohs(tcp_hdr(skb)->source));
+- goto drop_and_release;
+- }
+-
+- isn = tcp_v4_init_sequence(skb);
+- }
+- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+- goto drop_and_free;
+-
+- tcp_rsk(req)->snt_isn = isn;
+- tcp_rsk(req)->snt_synack = tcp_time_stamp;
+- tcp_openreq_init_rwin(req, sk, dst);
+- fastopen = !want_cookie &&
+- tcp_try_fastopen(sk, skb, req, &foc, dst);
+- err = tcp_v4_send_synack(sk, dst, req,
+- skb_get_queue_mapping(skb), &foc);
+- if (!fastopen) {
+- if (err || want_cookie)
+- goto drop_and_free;
+-
+- tcp_rsk(req)->snt_synack = tcp_time_stamp;
+- tcp_rsk(req)->listener = NULL;
+- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+- }
+-
+- return 0;
+-
+-drop_and_release:
+- dst_release(dst);
+-drop_and_free:
+- reqsk_free(req);
+ drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0;
+@@ -1497,7 +1433,7 @@ put_and_exit:
+ }
+ EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+
+-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+ {
+ struct tcphdr *th = tcp_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
+@@ -1514,8 +1450,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+
+ if (nsk) {
+ if (nsk->sk_state != TCP_TIME_WAIT) {
++ /* Don't lock again the meta-sk. It has been locked
++ * before mptcp_v4_do_rcv.
++ */
++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
++ bh_lock_sock(mptcp_meta_sk(nsk));
+ bh_lock_sock(nsk);
++
+ return nsk;
++
+ }
+ inet_twsk_put(inet_twsk(nsk));
+ return NULL;
+@@ -1550,6 +1493,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+ goto discard;
+ #endif
+
++ if (is_meta_sk(sk))
++ return mptcp_v4_do_rcv(sk, skb);
++
+ if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ struct dst_entry *dst = sk->sk_rx_dst;
+
+@@ -1681,7 +1627,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+ } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
+ wake_up_interruptible_sync_poll(sk_sleep(sk),
+ POLLIN | POLLRDNORM | POLLRDBAND);
+- if (!inet_csk_ack_scheduled(sk))
++ if (!inet_csk_ack_scheduled(sk) && !mptcp(tp))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ (3 * tcp_rto_min(sk)) / 4,
+ TCP_RTO_MAX);
+@@ -1698,7 +1644,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
+ {
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+- struct sock *sk;
++ struct sock *sk, *meta_sk = NULL;
+ int ret;
+ struct net *net = dev_net(skb->dev);
+
+@@ -1732,18 +1678,42 @@ int tcp_v4_rcv(struct sk_buff *skb)
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff * 4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
++#ifdef CONFIG_MPTCP
++ TCP_SKB_CB(skb)->mptcp_flags = 0;
++ TCP_SKB_CB(skb)->dss_off = 0;
++#endif
+ TCP_SKB_CB(skb)->when = 0;
+ TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+- if (!sk)
+- goto no_tcp_socket;
+
+ process:
+- if (sk->sk_state == TCP_TIME_WAIT)
++ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
++#ifdef CONFIG_MPTCP
++ if (!sk && th->syn && !th->ack) {
++ int ret = mptcp_lookup_join(skb, NULL);
++
++ if (ret < 0) {
++ tcp_v4_send_reset(NULL, skb);
++ goto discard_it;
++ } else if (ret > 0) {
++ return 0;
++ }
++ }
++
++ /* Is there a pending request sock for this segment ? */
++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
++ if (sk)
++ sock_put(sk);
++ return 0;
++ }
++#endif
++ if (!sk)
++ goto no_tcp_socket;
++
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+@@ -1759,11 +1729,21 @@ process:
+ sk_mark_napi_id(sk, skb);
+ skb->dev = NULL;
+
+- bh_lock_sock_nested(sk);
++ if (mptcp(tcp_sk(sk))) {
++ meta_sk = mptcp_meta_sk(sk);
++
++ bh_lock_sock_nested(meta_sk);
++ if (sock_owned_by_user(meta_sk))
++ skb->sk = sk;
++ } else {
++ meta_sk = sk;
++ bh_lock_sock_nested(sk);
++ }
++
+ ret = 0;
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ #ifdef CONFIG_NET_DMA
+- struct tcp_sock *tp = tcp_sk(sk);
++ struct tcp_sock *tp = tcp_sk(meta_sk);
+ if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+ tp->ucopy.dma_chan = net_dma_find_channel();
+ if (tp->ucopy.dma_chan)
+@@ -1771,16 +1751,16 @@ process:
+ else
+ #endif
+ {
+- if (!tcp_prequeue(sk, skb))
++ if (!tcp_prequeue(meta_sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ }
+- } else if (unlikely(sk_add_backlog(sk, skb,
+- sk->sk_rcvbuf + sk->sk_sndbuf))) {
+- bh_unlock_sock(sk);
++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
++ bh_unlock_sock(meta_sk);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ goto discard_and_relse;
+ }
+- bh_unlock_sock(sk);
++ bh_unlock_sock(meta_sk);
+
+ sock_put(sk);
+
+@@ -1835,6 +1815,18 @@ do_time_wait:
+ sk = sk2;
+ goto process;
+ }
++#ifdef CONFIG_MPTCP
++ if (th->syn && !th->ack) {
++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
++
++ if (ret < 0) {
++ tcp_v4_send_reset(NULL, skb);
++ goto discard_it;
++ } else if (ret > 0) {
++ return 0;
++ }
++ }
++#endif
+ /* Fall through to ACK */
+ }
+ case TCP_TW_ACK:
+@@ -1900,7 +1892,12 @@ static int tcp_v4_init_sock(struct sock *sk)
+
+ tcp_init_sock(sk);
+
+- icsk->icsk_af_ops = &ipv4_specific;
++#ifdef CONFIG_MPTCP
++ if (is_mptcp_enabled(sk))
++ icsk->icsk_af_ops = &mptcp_v4_specific;
++ else
++#endif
++ icsk->icsk_af_ops = &ipv4_specific;
+
+ #ifdef CONFIG_TCP_MD5SIG
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+@@ -1917,6 +1914,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
+
+ tcp_cleanup_congestion_control(sk);
+
++ if (mptcp(tp))
++ mptcp_destroy_sock(sk);
++ if (tp->inside_tk_table)
++ mptcp_hash_remove(tp);
++
+ /* Cleanup up the write buffer. */
+ tcp_write_queue_purge(sk);
+
+@@ -2481,6 +2483,19 @@ void tcp4_proc_exit(void)
+ }
+ #endif /* CONFIG_PROC_FS */
+
++#ifdef CONFIG_MPTCP
++static void tcp_v4_clear_sk(struct sock *sk, int size)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ /* we do not want to clear tk_table field, because of RCU lookups */
++ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table));
++
++ size -= offsetof(struct tcp_sock, tk_table) + sizeof(tp->tk_table);
++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size);
++}
++#endif
++
+ struct proto tcp_prot = {
+ .name = "TCP",
+ .owner = THIS_MODULE,
+@@ -2528,6 +2543,9 @@ struct proto tcp_prot = {
+ .destroy_cgroup = tcp_destroy_cgroup,
+ .proto_cgroup = tcp_proto_cgroup,
+ #endif
++#ifdef CONFIG_MPTCP
++ .clear_sk = tcp_v4_clear_sk,
++#endif
+ };
+ EXPORT_SYMBOL(tcp_prot);
+
+diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
+index e68e0d4af6c9..ae6946857dff 100644
+--- a/net/ipv4/tcp_minisocks.c
++++ b/net/ipv4/tcp_minisocks.c
+@@ -18,11 +18,13 @@
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
++#include <linux/kconfig.h>
+ #include <linux/mm.h>
+ #include <linux/module.h>
+ #include <linux/slab.h>
+ #include <linux/sysctl.h>
+ #include <linux/workqueue.h>
++#include <net/mptcp.h>
+ #include <net/tcp.h>
+ #include <net/inet_common.h>
+ #include <net/xfrm.h>
+@@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ struct tcp_options_received tmp_opt;
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ bool paws_reject = false;
++ struct mptcp_options_received mopt;
+
+ tmp_opt.saw_tstamp = 0;
+ if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
+- tcp_parse_options(skb, &tmp_opt, 0, NULL);
++ mptcp_init_mp_opt(&mopt);
++
++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
+@@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+ }
++
++ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
++ if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
++ goto kill_with_rst;
++ }
+ }
+
+ if (tw->tw_substate == TCP_FIN_WAIT2) {
+@@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ if (!th->ack ||
+ !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
++ /* If mptcp_is_data_fin() returns true, we are sure that
++ * mopt has been initialized - otherwise it would not
++ * be a DATA_FIN.
++ */
++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
++ mptcp_is_data_fin(skb) &&
++ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
++ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
++ return TCP_TW_ACK;
++
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+@@ -290,6 +310,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
+ tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ tcptw->tw_ts_offset = tp->tsoffset;
+
++ if (mptcp(tp)) {
++ if (mptcp_init_tw_sock(sk, tcptw)) {
++ inet_twsk_free(tw);
++ goto exit;
++ }
++ } else {
++ tcptw->mptcp_tw = NULL;
++ }
++
+ #if IS_ENABLED(CONFIG_IPV6)
+ if (tw->tw_family == PF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+@@ -347,15 +376,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ }
+
++exit:
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+ }
+
+ void tcp_twsk_destructor(struct sock *sk)
+ {
+-#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
++ if (twsk->mptcp_tw)
++ mptcp_twsk_destructor(twsk);
++#ifdef CONFIG_TCP_MD5SIG
+ if (twsk->tw_md5_key)
+ kfree_rcu(twsk->tw_md5_key, rcu);
+ #endif
+@@ -382,13 +414,14 @@ void tcp_openreq_init_rwin(struct request_sock *req,
+ req->window_clamp = tcp_full_space(sk);
+
+ /* tcp_full_space because it is guaranteed to be the first packet */
+- tcp_select_initial_window(tcp_full_space(sk),
+- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
++ tp->ops->select_initial_window(tcp_full_space(sk),
++ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
++ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
+ &req->rcv_wnd,
+ &req->window_clamp,
+ ireq->wscale_ok,
+ &rcv_wscale,
+- dst_metric(dst, RTAX_INITRWND));
++ dst_metric(dst, RTAX_INITRWND), sk);
+ ireq->rcv_wscale = rcv_wscale;
+ }
+ EXPORT_SYMBOL(tcp_openreq_init_rwin);
+@@ -499,6 +532,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
+ newtp->rx_opt.ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
++ if (ireq->saw_mpc)
++ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
+ newtp->tsoffset = 0;
+ #ifdef CONFIG_TCP_MD5SIG
+ newtp->md5sig_info = NULL; /*XXX*/
+@@ -535,16 +570,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ bool fastopen)
+ {
+ struct tcp_options_received tmp_opt;
++ struct mptcp_options_received mopt;
+ struct sock *child;
+ const struct tcphdr *th = tcp_hdr(skb);
+ __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ bool paws_reject = false;
+
+- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
++ BUG_ON(!mptcp(tcp_sk(sk)) && fastopen == (sk->sk_state == TCP_LISTEN));
+
+ tmp_opt.saw_tstamp = 0;
++
++ mptcp_init_mp_opt(&mopt);
++
+ if (th->doff > (sizeof(struct tcphdr)>>2)) {
+- tcp_parse_options(skb, &tmp_opt, 0, NULL);
++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.ts_recent = req->ts_recent;
+@@ -583,7 +622,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ *
+ * Reset timer after retransmitting SYNACK, similar to
+ * the idea of fast retransmit in recovery.
++ *
++ * Fall back to TCP if MP_CAPABLE is not set.
+ */
++
++ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
++ inet_rsk(req)->saw_mpc = false;
++
++
+ if (!inet_rtx_syn_ack(sk, req))
+ req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
+ TCP_RTO_MAX) + jiffies;
+@@ -718,9 +764,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ * socket is created, wait for troubles.
+ */
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
++
+ if (child == NULL)
+ goto listen_overflow;
+
++ if (!is_meta_sk(sk)) {
++ int ret = mptcp_check_req_master(sk, child, req, prev);
++ if (ret < 0)
++ goto listen_overflow;
++
++ /* MPTCP-supported */
++ if (!ret)
++ return tcp_sk(child)->mpcb->master_sk;
++ } else {
++ return mptcp_check_req_child(sk, child, req, prev, &mopt);
++ }
+ inet_csk_reqsk_queue_unlink(sk, req, prev);
+ inet_csk_reqsk_queue_removed(sk, req);
+
+@@ -746,7 +804,17 @@ embryonic_reset:
+ tcp_reset(sk);
+ }
+ if (!fastopen) {
+- inet_csk_reqsk_queue_drop(sk, req, prev);
++ if (is_meta_sk(sk)) {
++ /* We want to avoid stoping the keepalive-timer and so
++ * avoid ending up in inet_csk_reqsk_queue_removed ...
++ */
++ inet_csk_reqsk_queue_unlink(sk, req, prev);
++ if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
++ mptcp_delete_synack_timer(sk);
++ reqsk_free(req);
++ } else {
++ inet_csk_reqsk_queue_drop(sk, req, prev);
++ }
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ }
+ return NULL;
+@@ -770,8 +838,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
+ {
+ int ret = 0;
+ int state = child->sk_state;
++ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
+
+- if (!sock_owned_by_user(child)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
+ skb->len);
+ /* Wakeup parent, send SIGIO */
+@@ -782,10 +851,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
+ * in main socket hash table and lock on listening
+ * socket does not protect us more.
+ */
+- __sk_add_backlog(child, skb);
++ if (mptcp(tcp_sk(child)))
++ skb->sk = child;
++ __sk_add_backlog(meta_sk, skb);
+ }
+
+- bh_unlock_sock(child);
++ if (mptcp(tcp_sk(child)))
++ bh_unlock_sock(child);
++ bh_unlock_sock(meta_sk);
+ sock_put(child);
+ return ret;
+ }
+diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
+index 179b51e6bda3..efd31b6c5784 100644
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -36,6 +36,12 @@
+
+ #define pr_fmt(fmt) "TCP: " fmt
+
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++#if IS_ENABLED(CONFIG_IPV6)
++#include <net/mptcp_v6.h>
++#endif
++#include <net/ipv6.h>
+ #include <net/tcp.h>
+
+ #include <linux/compiler.h>
+@@ -68,11 +74,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+ unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
+ EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+
+-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+- int push_one, gfp_t gfp);
+-
+ /* Account for new data that has been sent to the network. */
+-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+@@ -214,7 +217,7 @@ u32 tcp_default_init_rwnd(u32 mss)
+ void tcp_select_initial_window(int __space, __u32 mss,
+ __u32 *rcv_wnd, __u32 *window_clamp,
+ int wscale_ok, __u8 *rcv_wscale,
+- __u32 init_rcv_wnd)
++ __u32 init_rcv_wnd, const struct sock *sk)
+ {
+ unsigned int space = (__space < 0 ? 0 : __space);
+
+@@ -269,12 +272,16 @@ EXPORT_SYMBOL(tcp_select_initial_window);
+ * value can be stuffed directly into th->window for an outgoing
+ * frame.
+ */
+-static u16 tcp_select_window(struct sock *sk)
++u16 tcp_select_window(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 old_win = tp->rcv_wnd;
+- u32 cur_win = tcp_receive_window(tp);
+- u32 new_win = __tcp_select_window(sk);
++ /* The window must never shrink at the meta-level. At the subflow we
++ * have to allow this. Otherwise we may announce a window too large
++ * for the current meta-level sk_rcvbuf.
++ */
++ u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp);
++ u32 new_win = tp->ops->__select_window(sk);
+
+ /* Never shrink the offered window */
+ if (new_win < cur_win) {
+@@ -290,6 +297,7 @@ static u16 tcp_select_window(struct sock *sk)
+ LINUX_MIB_TCPWANTZEROWINDOWADV);
+ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+ }
++
+ tp->rcv_wnd = new_win;
+ tp->rcv_wup = tp->rcv_nxt;
+
+@@ -374,7 +382,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
+ /* Constructs common control bits of non-data skb. If SYN/FIN is present,
+ * auto increment end seqno.
+ */
+-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+ {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+@@ -394,7 +402,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+ TCP_SKB_CB(skb)->end_seq = seq;
+ }
+
+-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
++bool tcp_urg_mode(const struct tcp_sock *tp)
+ {
+ return tp->snd_una != tp->snd_up;
+ }
+@@ -404,17 +412,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
+ #define OPTION_MD5 (1 << 2)
+ #define OPTION_WSCALE (1 << 3)
+ #define OPTION_FAST_OPEN_COOKIE (1 << 8)
+-
+-struct tcp_out_options {
+- u16 options; /* bit field of OPTION_* */
+- u16 mss; /* 0 to disable */
+- u8 ws; /* window scale, 0 to disable */
+- u8 num_sack_blocks; /* number of SACK blocks to include */
+- u8 hash_size; /* bytes in hash_location */
+- __u8 *hash_location; /* temporary pointer, overloaded */
+- __u32 tsval, tsecr; /* need to include OPTION_TS */
+- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+-};
++/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
+
+ /* Write previously computed TCP options to the packet.
+ *
+@@ -430,7 +428,7 @@ struct tcp_out_options {
+ * (but it may well be that other scenarios fail similarly).
+ */
+ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+- struct tcp_out_options *opts)
++ struct tcp_out_options *opts, struct sk_buff *skb)
+ {
+ u16 options = opts->options; /* mungable copy */
+
+@@ -513,6 +511,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+ }
+ ptr += (foc->len + 3) >> 2;
+ }
++
++ if (unlikely(OPTION_MPTCP & opts->options))
++ mptcp_options_write(ptr, tp, opts, skb);
+ }
+
+ /* Compute TCP options for SYN packets. This is not the final
+@@ -564,6 +565,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+ if (unlikely(!(OPTION_TS & opts->options)))
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
+ }
++ if (tp->request_mptcp || mptcp(tp))
++ mptcp_syn_options(sk, opts, &remaining);
+
+ if (fastopen && fastopen->cookie.len >= 0) {
+ u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+@@ -637,6 +640,9 @@ static unsigned int tcp_synack_options(struct sock *sk,
+ }
+ }
+
++ if (ireq->saw_mpc)
++ mptcp_synack_options(req, opts, &remaining);
++
+ return MAX_TCP_OPTION_SPACE - remaining;
+ }
+
+@@ -670,16 +676,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
+ opts->tsecr = tp->rx_opt.ts_recent;
+ size += TCPOLEN_TSTAMP_ALIGNED;
+ }
++ if (mptcp(tp))
++ mptcp_established_options(sk, skb, opts, &size);
+
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+ if (unlikely(eff_sacks)) {
+- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+- opts->num_sack_blocks =
+- min_t(unsigned int, eff_sacks,
+- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+- TCPOLEN_SACK_PERBLOCK);
+- size += TCPOLEN_SACK_BASE_ALIGNED +
+- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
++ const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
++ if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
++ opts->num_sack_blocks = 0;
++ else
++ opts->num_sack_blocks =
++ min_t(unsigned int, eff_sacks,
++ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
++ TCPOLEN_SACK_PERBLOCK);
++ if (opts->num_sack_blocks)
++ size += TCPOLEN_SACK_BASE_ALIGNED +
++ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ }
+
+ return size;
+@@ -711,8 +723,8 @@ static void tcp_tsq_handler(struct sock *sk)
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
+- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+- 0, GFP_ATOMIC);
++ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk),
++ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC);
+ }
+ /*
+ * One tasklet per cpu tries to send more skbs.
+@@ -727,7 +739,7 @@ static void tcp_tasklet_func(unsigned long data)
+ unsigned long flags;
+ struct list_head *q, *n;
+ struct tcp_sock *tp;
+- struct sock *sk;
++ struct sock *sk, *meta_sk;
+
+ local_irq_save(flags);
+ list_splice_init(&tsq->head, &list);
+@@ -738,15 +750,25 @@ static void tcp_tasklet_func(unsigned long data)
+ list_del(&tp->tsq_node);
+
+ sk = (struct sock *)tp;
+- bh_lock_sock(sk);
++ meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
++ bh_lock_sock(meta_sk);
+
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ tcp_tsq_handler(sk);
++ if (mptcp(tp))
++ tcp_tsq_handler(meta_sk);
+ } else {
++ if (mptcp(tp) && sk->sk_state == TCP_CLOSE)
++ goto exit;
++
+ /* defer the work to tcp_release_cb() */
+ set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
++
++ if (mptcp(tp))
++ mptcp_tsq_flags(sk);
+ }
+- bh_unlock_sock(sk);
++exit:
++ bh_unlock_sock(meta_sk);
+
+ clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+ sk_free(sk);
+@@ -756,7 +778,10 @@ static void tcp_tasklet_func(unsigned long data)
+ #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
+ (1UL << TCP_WRITE_TIMER_DEFERRED) | \
+ (1UL << TCP_DELACK_TIMER_DEFERRED) | \
+- (1UL << TCP_MTU_REDUCED_DEFERRED))
++ (1UL << TCP_MTU_REDUCED_DEFERRED) | \
++ (1UL << MPTCP_PATH_MANAGER) | \
++ (1UL << MPTCP_SUB_DEFERRED))
++
+ /**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+@@ -803,6 +828,13 @@ void tcp_release_cb(struct sock *sk)
+ sk->sk_prot->mtu_reduced(sk);
+ __sock_put(sk);
+ }
++ if (flags & (1UL << MPTCP_PATH_MANAGER)) {
++ if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
++ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
++ __sock_put(sk);
++ }
++ if (flags & (1UL << MPTCP_SUB_DEFERRED))
++ mptcp_tsq_sub_deferred(sk);
+ }
+ EXPORT_SYMBOL(tcp_release_cb);
+
+@@ -862,8 +894,8 @@ void tcp_wfree(struct sk_buff *skb)
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ */
+-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+- gfp_t gfp_mask)
++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
++ gfp_t gfp_mask)
+ {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet;
+@@ -933,7 +965,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ */
+ th->window = htons(min(tp->rcv_wnd, 65535U));
+ } else {
+- th->window = htons(tcp_select_window(sk));
++ th->window = htons(tp->ops->select_window(sk));
+ }
+ th->check = 0;
+ th->urg_ptr = 0;
+@@ -949,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ }
+ }
+
+- tcp_options_write((__be32 *)(th + 1), tp, &opts);
++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
+ if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
+ TCP_ECN_send(sk, skb, tcp_header_size);
+
+@@ -988,7 +1020,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
+ * otherwise socket can stall.
+ */
+-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+@@ -1001,15 +1033,16 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+ }
+
+ /* Initialize TSO segments for a packet. */
+-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
+- unsigned int mss_now)
++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
++ unsigned int mss_now)
+ {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Make sure we own this skb before messing gso_size/gso_segs */
+ WARN_ON_ONCE(skb_cloned(skb));
+
+- if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
++ if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) ||
++ (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) {
+ /* Avoid the costly divide in the normal
+ * non-TSO case.
+ */
+@@ -1041,7 +1074,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
+ /* Pcount in the middle of the write queue got changed, we need to do various
+ * tweaks to fix counters
+ */
+-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+@@ -1164,7 +1197,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+ * eventually). The difference is that pulled data not copied, but
+ * immediately discarded.
+ */
+-static void __pskb_trim_head(struct sk_buff *skb, int len)
++void __pskb_trim_head(struct sk_buff *skb, int len)
+ {
+ struct skb_shared_info *shinfo;
+ int i, k, eat;
+@@ -1205,6 +1238,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
+ /* Remove acked data from a packet in the transmit queue. */
+ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+ {
++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && mptcp_is_data_seq(skb))
++ return mptcp_trim_head(sk, skb, len);
++
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
+
+@@ -1222,6 +1258,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+ if (tcp_skb_pcount(skb) > 1)
+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
+
++#ifdef CONFIG_MPTCP
++ /* Some data got acked - we assume that the seq-number reached the dest.
++ * Anyway, our MPTCP-option has been trimmed above - we lost it here.
++ * Only remove the SEQ if the call does not come from a meta retransmit.
++ */
++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
++ TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ;
++#endif
++
+ return 0;
+ }
+
+@@ -1379,6 +1424,7 @@ unsigned int tcp_current_mss(struct sock *sk)
+
+ return mss_now;
+ }
++EXPORT_SYMBOL(tcp_current_mss);
+
+ /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+@@ -1446,8 +1492,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
+ * But we can avoid doing the divide again given we already have
+ * skb_pcount = skb->len / mss_now
+ */
+-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+- const struct sk_buff *skb)
++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
++ const struct sk_buff *skb)
+ {
+ if (skb->len < tcp_skb_pcount(skb) * mss_now)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+@@ -1468,11 +1514,11 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+ }
+ /* Returns the portion of skb which can be sent right away */
+-static unsigned int tcp_mss_split_point(const struct sock *sk,
+- const struct sk_buff *skb,
+- unsigned int mss_now,
+- unsigned int max_segs,
+- int nonagle)
++unsigned int tcp_mss_split_point(const struct sock *sk,
++ const struct sk_buff *skb,
++ unsigned int mss_now,
++ unsigned int max_segs,
++ int nonagle)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 partial, needed, window, max_len;
+@@ -1502,13 +1548,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
+ /* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules? If so, return how many segments are allowed.
+ */
+-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
+- const struct sk_buff *skb)
++unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
++ const struct sk_buff *skb)
+ {
+ u32 in_flight, cwnd;
+
+ /* Don't be strict about the congestion window for the final FIN. */
+- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
++ if (skb &&
++ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+ tcp_skb_pcount(skb) == 1)
+ return 1;
+
+@@ -1524,8 +1571,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
+ * This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
+- unsigned int mss_now)
++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
++ unsigned int mss_now)
+ {
+ int tso_segs = tcp_skb_pcount(skb);
+
+@@ -1540,8 +1587,8 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
+ /* Return true if the Nagle test allows this packet to be
+ * sent now.
+ */
+-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+- unsigned int cur_mss, int nonagle)
++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
++ unsigned int cur_mss, int nonagle)
+ {
+ /* Nagle rule does not apply to frames, which sit in the middle of the
+ * write_queue (they have no chances to get new data).
+@@ -1553,7 +1600,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
+ return true;
+
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
++ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
++ mptcp_is_data_fin(skb))
+ return true;
+
+ if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
+@@ -1563,9 +1611,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
+ }
+
+ /* Does at least the first segment of SKB fit into the send window? */
+-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
+- const struct sk_buff *skb,
+- unsigned int cur_mss)
++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
++ unsigned int cur_mss)
+ {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+@@ -1676,7 +1723,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+ u32 send_win, cong_win, limit, in_flight;
+ int win_divisor;
+
+- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
++ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
+ goto send_now;
+
+ if (icsk->icsk_ca_state != TCP_CA_Open)
+@@ -1888,7 +1935,7 @@ static int tcp_mtu_probe(struct sock *sk)
+ * Returns true, if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
+ */
+-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+@@ -1900,7 +1947,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+
+ sent_pkts = 0;
+
+- if (!push_one) {
++ /* pmtu not yet supported with MPTCP. Should be possible, by early
++ * exiting the loop inside tcp_mtu_probe, making sure that only one
++ * single DSS-mapping gets probed.
++ */
++ if (!push_one && !mptcp(tp)) {
+ /* Do MTU probing. */
+ result = tcp_mtu_probe(sk);
+ if (!result) {
+@@ -2099,7 +2150,8 @@ void tcp_send_loss_probe(struct sock *sk)
+ int err = -1;
+
+ if (tcp_send_head(sk) != NULL) {
+- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
++ err = tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2,
++ GFP_ATOMIC);
+ goto rearm_timer;
+ }
+
+@@ -2159,8 +2211,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
+ if (unlikely(sk->sk_state == TCP_CLOSE))
+ return;
+
+- if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+- sk_gfp_atomic(sk, GFP_ATOMIC)))
++ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0,
++ sk_gfp_atomic(sk, GFP_ATOMIC)))
+ tcp_check_probe_timer(sk);
+ }
+
+@@ -2173,7 +2225,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
+
+ BUG_ON(!skb || skb->len < mss_now);
+
+- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
++ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1,
++ sk->sk_allocation);
+ }
+
+ /* This function returns the amount that we can raise the
+@@ -2386,6 +2439,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ return;
+
++ /* Currently not supported for MPTCP - but it should be possible */
++ if (mptcp(tp))
++ return;
++
+ tcp_for_write_queue_from_safe(skb, tmp, sk) {
+ if (!tcp_can_collapse(sk, skb))
+ break;
+@@ -2843,7 +2900,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
+ th->window = htons(min(req->rcv_wnd, 65535U));
+- tcp_options_write((__be32 *)(th + 1), tp, &opts);
++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
+ th->doff = (tcp_header_size >> 2);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
+
+@@ -2897,13 +2954,13 @@ static void tcp_connect_init(struct sock *sk)
+ (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+ tp->window_clamp = tcp_full_space(sk);
+
+- tcp_select_initial_window(tcp_full_space(sk),
+- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+- &tp->rcv_wnd,
+- &tp->window_clamp,
+- sysctl_tcp_window_scaling,
+- &rcv_wscale,
+- dst_metric(dst, RTAX_INITRWND));
++ tp->ops->select_initial_window(tcp_full_space(sk),
++ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
++ &tp->rcv_wnd,
++ &tp->window_clamp,
++ sysctl_tcp_window_scaling,
++ &rcv_wscale,
++ dst_metric(dst, RTAX_INITRWND), sk);
+
+ tp->rx_opt.rcv_wscale = rcv_wscale;
+ tp->rcv_ssthresh = tp->rcv_wnd;
+@@ -2927,6 +2984,36 @@ static void tcp_connect_init(struct sock *sk)
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_retransmits = 0;
+ tcp_clear_retrans(tp);
++
++#ifdef CONFIG_MPTCP
++ if (sysctl_mptcp_enabled && mptcp_doit(sk)) {
++ if (is_master_tp(tp)) {
++ tp->request_mptcp = 1;
++ mptcp_connect_init(sk);
++ } else if (tp->mptcp) {
++ struct inet_sock *inet = inet_sk(sk);
++
++ tp->mptcp->snt_isn = tp->write_seq;
++ tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
++
++ /* Set nonce for new subflows */
++ if (sk->sk_family == AF_INET)
++ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
++ inet->inet_saddr,
++ inet->inet_daddr,
++ inet->inet_sport,
++ inet->inet_dport);
++#if IS_ENABLED(CONFIG_IPV6)
++ else
++ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
++ inet6_sk(sk)->saddr.s6_addr32,
++ sk->sk_v6_daddr.s6_addr32,
++ inet->inet_sport,
++ inet->inet_dport);
++#endif
++ }
++ }
++#endif
+ }
+
+ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+@@ -3176,6 +3263,7 @@ void tcp_send_ack(struct sock *sk)
+ TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
+ }
++EXPORT_SYMBOL(tcp_send_ack);
+
+ /* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+@@ -3188,7 +3276,7 @@ void tcp_send_ack(struct sock *sk)
+ * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
+ * out-of-date with SND.UNA-1 to probe window.
+ */
+-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
++int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+@@ -3270,7 +3358,7 @@ void tcp_send_probe0(struct sock *sk)
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err;
+
+- err = tcp_write_wakeup(sk);
++ err = tp->ops->write_wakeup(sk);
+
+ if (tp->packets_out || !tcp_send_head(sk)) {
+ /* Cancel probe timer, if it is not required. */
+@@ -3301,3 +3389,18 @@ void tcp_send_probe0(struct sock *sk)
+ TCP_RTO_MAX);
+ }
+ }
++
++int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
++{
++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
++ struct flowi fl;
++ int res;
++
++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
++ if (!res) {
++ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
++ }
++ return res;
++}
++EXPORT_SYMBOL(tcp_rtx_synack);
+diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
+index 286227abed10..966b873cbf3e 100644
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -20,6 +20,7 @@
+
+ #include <linux/module.h>
+ #include <linux/gfp.h>
++#include <net/mptcp.h>
+ #include <net/tcp.h>
+
+ int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
+@@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
+ int sysctl_tcp_orphan_retries __read_mostly;
+ int sysctl_tcp_thin_linear_timeouts __read_mostly;
+
+-static void tcp_write_err(struct sock *sk)
++void tcp_write_err(struct sock *sk)
+ {
+ sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
+ sk->sk_error_report(sk);
+@@ -74,7 +75,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
+ (!tp->snd_wnd && !tp->packets_out))
+ do_reset = 1;
+ if (do_reset)
+- tcp_send_active_reset(sk, GFP_ATOMIC);
++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+ return 1;
+@@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
+ * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
+ * syn_set flag is set.
+ */
+-static bool retransmits_timed_out(struct sock *sk,
+- unsigned int boundary,
+- unsigned int timeout,
+- bool syn_set)
++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
++ unsigned int timeout, bool syn_set)
+ {
+ unsigned int linear_backoff_thresh, start_ts;
+ unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
+@@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk,
+ }
+
+ /* A write timeout has occurred. Process the after effects. */
+-static int tcp_write_timeout(struct sock *sk)
++int tcp_write_timeout(struct sock *sk)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+@@ -171,6 +170,10 @@ static int tcp_write_timeout(struct sock *sk)
+ }
+ retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ syn_set = true;
++ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
++ if (tcp_sk(sk)->request_mptcp &&
++ icsk->icsk_retransmits >= mptcp_sysctl_syn_retries())
++ tcp_sk(sk)->request_mptcp = 0;
+ } else {
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+ /* Black hole detection */
+@@ -251,18 +254,22 @@ out:
+ static void tcp_delack_timer(unsigned long data)
+ {
+ struct sock *sk = (struct sock *)data;
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
+
+- bh_lock_sock(sk);
+- if (!sock_owned_by_user(sk)) {
++ bh_lock_sock(meta_sk);
++ if (!sock_owned_by_user(meta_sk)) {
+ tcp_delack_timer_handler(sk);
+ } else {
+ inet_csk(sk)->icsk_ack.blocked = 1;
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
+ /* deleguate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ sock_hold(sk);
++ if (mptcp(tp))
++ mptcp_tsq_flags(sk);
+ }
+- bh_unlock_sock(sk);
++ bh_unlock_sock(meta_sk);
+ sock_put(sk);
+ }
+
+@@ -479,6 +486,10 @@ out_reset_timer:
+ __sk_dst_reset(sk);
+
+ out:;
++ if (mptcp(tp)) {
++ mptcp_reinject_data(sk, 1);
++ mptcp_set_rto(sk);
++ }
+ }
+
+ void tcp_write_timer_handler(struct sock *sk)
+@@ -505,7 +516,7 @@ void tcp_write_timer_handler(struct sock *sk)
+ break;
+ case ICSK_TIME_RETRANS:
+ icsk->icsk_pending = 0;
+- tcp_retransmit_timer(sk);
++ tcp_sk(sk)->ops->retransmit_timer(sk);
+ break;
+ case ICSK_TIME_PROBE0:
+ icsk->icsk_pending = 0;
+@@ -520,16 +531,19 @@ out:
+ static void tcp_write_timer(unsigned long data)
+ {
+ struct sock *sk = (struct sock *)data;
++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
+
+- bh_lock_sock(sk);
+- if (!sock_owned_by_user(sk)) {
++ bh_lock_sock(meta_sk);
++ if (!sock_owned_by_user(meta_sk)) {
+ tcp_write_timer_handler(sk);
+ } else {
+ /* deleguate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ sock_hold(sk);
++ if (mptcp(tcp_sk(sk)))
++ mptcp_tsq_flags(sk);
+ }
+- bh_unlock_sock(sk);
++ bh_unlock_sock(meta_sk);
+ sock_put(sk);
+ }
+
+@@ -566,11 +580,12 @@ static void tcp_keepalive_timer (unsigned long data)
+ struct sock *sk = (struct sock *) data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
+ u32 elapsed;
+
+ /* Only process if socket is not in use. */
+- bh_lock_sock(sk);
+- if (sock_owned_by_user(sk)) {
++ bh_lock_sock(meta_sk);
++ if (sock_owned_by_user(meta_sk)) {
+ /* Try again later. */
+ inet_csk_reset_keepalive_timer (sk, HZ/20);
+ goto out;
+@@ -581,16 +596,38 @@ static void tcp_keepalive_timer (unsigned long data)
+ goto out;
+ }
+
++ if (tp->send_mp_fclose) {
++ /* MUST do this before tcp_write_timeout, because retrans_stamp
++ * may have been set to 0 in another part while we are
++ * retransmitting MP_FASTCLOSE. Then, we would crash, because
++ * retransmits_timed_out accesses the meta-write-queue.
++ *
++ * We make sure that the timestamp is != 0.
++ */
++ if (!tp->retrans_stamp)
++ tp->retrans_stamp = tcp_time_stamp ? : 1;
++
++ if (tcp_write_timeout(sk))
++ goto out;
++
++ tcp_send_ack(sk);
++ icsk->icsk_retransmits++;
++
++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
++ elapsed = icsk->icsk_rto;
++ goto resched;
++ }
++
+ if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+ if (tp->linger2 >= 0) {
+ const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
+
+ if (tmo > 0) {
+- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+- tcp_send_active_reset(sk, GFP_ATOMIC);
++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
+ goto death;
+ }
+
+@@ -614,11 +651,11 @@ static void tcp_keepalive_timer (unsigned long data)
+ icsk->icsk_probes_out > 0) ||
+ (icsk->icsk_user_timeout == 0 &&
+ icsk->icsk_probes_out >= keepalive_probes(tp))) {
+- tcp_send_active_reset(sk, GFP_ATOMIC);
++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
+ tcp_write_err(sk);
+ goto out;
+ }
+- if (tcp_write_wakeup(sk) <= 0) {
++ if (tp->ops->write_wakeup(sk) <= 0) {
+ icsk->icsk_probes_out++;
+ elapsed = keepalive_intvl_when(tp);
+ } else {
+@@ -642,7 +679,7 @@ death:
+ tcp_done(sk);
+
+ out:
+- bh_unlock_sock(sk);
++ bh_unlock_sock(meta_sk);
+ sock_put(sk);
+ }
+
+diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
+index 5667b3003af9..7139c2973fd2 100644
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -760,6 +760,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
+
+ kfree_rcu(ifp, rcu);
+ }
++EXPORT_SYMBOL(inet6_ifa_finish_destroy);
+
+ static void
+ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
+diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
+index 7cb4392690dd..7057afbca4df 100644
+--- a/net/ipv6/af_inet6.c
++++ b/net/ipv6/af_inet6.c
+@@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
+ return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
+ }
+
+-static int inet6_create(struct net *net, struct socket *sock, int protocol,
+- int kern)
++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
+ {
+ struct inet_sock *inet;
+ struct ipv6_pinfo *np;
+diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
+index a245e5ddffbd..99c892b8992d 100644
+--- a/net/ipv6/inet6_connection_sock.c
++++ b/net/ipv6/inet6_connection_sock.c
+@@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
+ /*
+ * request_sock (formerly open request) hash tables.
+ */
+-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
+- const u32 rnd, const u32 synq_hsize)
++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
++ const u32 rnd, const u32 synq_hsize)
+ {
+ u32 c;
+
+diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
+index edb58aff4ae7..ea4d9fda0927 100644
+--- a/net/ipv6/ipv6_sockglue.c
++++ b/net/ipv6/ipv6_sockglue.c
+@@ -48,6 +48,8 @@
+ #include <net/addrconf.h>
+ #include <net/inet_common.h>
+ #include <net/tcp.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
+ #include <net/udp.h>
+ #include <net/udplite.h>
+ #include <net/xfrm.h>
+@@ -196,7 +198,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
+ sock_prot_inuse_add(net, &tcp_prot, 1);
+ local_bh_enable();
+ sk->sk_prot = &tcp_prot;
+- icsk->icsk_af_ops = &ipv4_specific;
++#ifdef CONFIG_MPTCP
++ if (is_mptcp_enabled(sk))
++ icsk->icsk_af_ops = &mptcp_v4_specific;
++ else
++#endif
++ icsk->icsk_af_ops = &ipv4_specific;
+ sk->sk_socket->ops = &inet_stream_ops;
+ sk->sk_family = PF_INET;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
+index a822b880689b..b2b38869d795 100644
+--- a/net/ipv6/syncookies.c
++++ b/net/ipv6/syncookies.c
+@@ -181,13 +181,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+
+ /* check for timestamp cookie support */
+ memset(&tcp_opt, 0, sizeof(tcp_opt));
+- tcp_parse_options(skb, &tcp_opt, 0, NULL);
++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
+
+ if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
+ goto out;
+
+ ret = NULL;
+- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
++ req = inet_reqsk_alloc(&tcp6_request_sock_ops);
+ if (!req)
+ goto out;
+
+@@ -255,10 +255,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+ }
+
+ req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
+- tcp_select_initial_window(tcp_full_space(sk), req->mss,
+- &req->rcv_wnd, &req->window_clamp,
+- ireq->wscale_ok, &rcv_wscale,
+- dst_metric(dst, RTAX_INITRWND));
++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
++ &req->rcv_wnd, &req->window_clamp,
++ ireq->wscale_ok, &rcv_wscale,
++ dst_metric(dst, RTAX_INITRWND), sk);
+
+ ireq->rcv_wscale = rcv_wscale;
+
+diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
+index 229239ad96b1..fda94d71666e 100644
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -63,6 +63,8 @@
+ #include <net/inet_common.h>
+ #include <net/secure_seq.h>
+ #include <net/tcp_memcontrol.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v6.h>
+ #include <net/busy_poll.h>
+
+ #include <linux/proc_fs.h>
+@@ -71,12 +73,6 @@
+ #include <linux/crypto.h>
+ #include <linux/scatterlist.h>
+
+-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
+-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+- struct request_sock *req);
+-
+-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+-
+ static const struct inet_connection_sock_af_ops ipv6_mapped;
+ static const struct inet_connection_sock_af_ops ipv6_specific;
+ #ifdef CONFIG_TCP_MD5SIG
+@@ -90,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
+ }
+ #endif
+
+-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+ {
+ struct dst_entry *dst = skb_dst(skb);
+ const struct rt6_info *rt = (const struct rt6_info *)dst;
+@@ -102,10 +98,11 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+ inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
+ }
+
+-static void tcp_v6_hash(struct sock *sk)
++void tcp_v6_hash(struct sock *sk)
+ {
+ if (sk->sk_state != TCP_CLOSE) {
+- if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
++ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped ||
++ inet_csk(sk)->icsk_af_ops == &mptcp_v6_mapped) {
+ tcp_prot.hash(sk);
+ return;
+ }
+@@ -115,7 +112,7 @@ static void tcp_v6_hash(struct sock *sk)
+ }
+ }
+
+-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
++__u32 tcp_v6_init_sequence(const struct sk_buff *skb)
+ {
+ return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32,
+@@ -123,7 +120,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
+ tcp_hdr(skb)->source);
+ }
+
+-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+ {
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+@@ -215,7 +212,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+ sin.sin_port = usin->sin6_port;
+ sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+- icsk->icsk_af_ops = &ipv6_mapped;
++#ifdef CONFIG_MPTCP
++ if (is_mptcp_enabled(sk))
++ icsk->icsk_af_ops = &mptcp_v6_mapped;
++ else
++#endif
++ icsk->icsk_af_ops = &ipv6_mapped;
+ sk->sk_backlog_rcv = tcp_v4_do_rcv;
+ #ifdef CONFIG_TCP_MD5SIG
+ tp->af_specific = &tcp_sock_ipv6_mapped_specific;
+@@ -225,7 +227,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+
+ if (err) {
+ icsk->icsk_ext_hdr_len = exthdrlen;
+- icsk->icsk_af_ops = &ipv6_specific;
++#ifdef CONFIG_MPTCP
++ if (is_mptcp_enabled(sk))
++ icsk->icsk_af_ops = &mptcp_v6_specific;
++ else
++#endif
++ icsk->icsk_af_ops = &ipv6_specific;
+ sk->sk_backlog_rcv = tcp_v6_do_rcv;
+ #ifdef CONFIG_TCP_MD5SIG
+ tp->af_specific = &tcp_sock_ipv6_specific;
+@@ -337,7 +344,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
+ const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
+ struct ipv6_pinfo *np;
+- struct sock *sk;
++ struct sock *sk, *meta_sk;
+ int err;
+ struct tcp_sock *tp;
+ struct request_sock *fastopen;
+@@ -358,8 +365,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ return;
+ }
+
+- bh_lock_sock(sk);
+- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
++ tp = tcp_sk(sk);
++ if (mptcp(tp))
++ meta_sk = mptcp_meta_sk(sk);
++ else
++ meta_sk = sk;
++
++ bh_lock_sock(meta_sk);
++ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+ if (sk->sk_state == TCP_CLOSE)
+@@ -370,7 +383,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ goto out;
+ }
+
+- tp = tcp_sk(sk);
+ seq = ntohl(th->seq);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = tp->fastopen_rsk;
+@@ -403,11 +415,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ goto out;
+
+ tp->mtu_info = ntohl(info);
+- if (!sock_owned_by_user(sk))
++ if (!sock_owned_by_user(meta_sk))
+ tcp_v6_mtu_reduced(sk);
+- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
++ else {
++ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
+ &tp->tsq_flags))
+- sock_hold(sk);
++ sock_hold(sk);
++ if (mptcp(tp))
++ mptcp_tsq_flags(sk);
++ }
+ goto out;
+ }
+
+@@ -417,7 +433,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ switch (sk->sk_state) {
+ struct request_sock *req, **prev;
+ case TCP_LISTEN:
+- if (sock_owned_by_user(sk))
++ if (sock_owned_by_user(meta_sk))
+ goto out;
+
+ req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
+@@ -447,7 +463,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ if (fastopen && fastopen->sk == NULL)
+ break;
+
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
+
+@@ -457,26 +473,27 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ goto out;
+ }
+
+- if (!sock_owned_by_user(sk) && np->recverr) {
++ if (!sock_owned_by_user(meta_sk) && np->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else
+ sk->sk_err_soft = err;
+
+ out:
+- bh_unlock_sock(sk);
++ bh_unlock_sock(meta_sk);
+ sock_put(sk);
+ }
+
+
+-static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
+- struct flowi6 *fl6,
+- struct request_sock *req,
+- u16 queue_mapping,
+- struct tcp_fastopen_cookie *foc)
++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
++ struct flowi *fl,
++ struct request_sock *req,
++ u16 queue_mapping,
++ struct tcp_fastopen_cookie *foc)
+ {
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct ipv6_pinfo *np = inet6_sk(sk);
++ struct flowi6 *fl6 = &fl->u.ip6;
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+@@ -497,18 +514,21 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
+ skb_set_queue_mapping(skb, queue_mapping);
+ err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
+ err = net_xmit_eval(err);
++ if (!tcp_rsk(req)->snt_synack && !err)
++ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ }
+
+ done:
+ return err;
+ }
+
+-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
+ {
+- struct flowi6 fl6;
++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
++ struct flowi fl;
+ int res;
+
+- res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL);
++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ if (!res) {
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+@@ -516,7 +536,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
+ return res;
+ }
+
+-static void tcp_v6_reqsk_destructor(struct request_sock *req)
++void tcp_v6_reqsk_destructor(struct request_sock *req)
+ {
+ kfree_skb(inet_rsk(req)->pktopts);
+ }
+@@ -718,27 +738,74 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+ }
+ #endif
+
++static int tcp_v6_init_req(struct request_sock *req, struct sock *sk,
++ struct sk_buff *skb)
++{
++ struct inet_request_sock *ireq = inet_rsk(req);
++ struct ipv6_pinfo *np = inet6_sk(sk);
++
++ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
++ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
++
++ ireq->ir_iif = sk->sk_bound_dev_if;
++ ireq->ir_mark = inet_request_mark(sk, skb);
++
++ /* So that link locals have meaning */
++ if (!sk->sk_bound_dev_if &&
++ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
++ ireq->ir_iif = inet6_iif(skb);
++
++ if (!TCP_SKB_CB(skb)->when &&
++ (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo ||
++ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
++ np->rxopt.bits.rxohlim || np->repflow)) {
++ atomic_inc(&skb->users);
++ ireq->pktopts = skb;
++ }
++
++ return 0;
++}
++
++static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
++ const struct request_sock *req,
++ bool *strict)
++{
++ if (strict)
++ *strict = true;
++ return inet6_csk_route_req(sk, &fl->u.ip6, req);
++}
++
+ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
+ .family = AF_INET6,
+ .obj_size = sizeof(struct tcp6_request_sock),
+- .rtx_syn_ack = tcp_v6_rtx_synack,
++ .rtx_syn_ack = tcp_rtx_synack,
+ .send_ack = tcp_v6_reqsk_send_ack,
+ .destructor = tcp_v6_reqsk_destructor,
+ .send_reset = tcp_v6_send_reset,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
+ };
+
++const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
++ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
++ sizeof(struct ipv6hdr),
+ #ifdef CONFIG_TCP_MD5SIG
+-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+ .md5_lookup = tcp_v6_reqsk_md5_lookup,
+ .calc_md5_hash = tcp_v6_md5_hash_skb,
+-};
+ #endif
++ .init_req = tcp_v6_init_req,
++#ifdef CONFIG_SYN_COOKIES
++ .cookie_init_seq = cookie_v6_init_sequence,
++#endif
++ .route_req = tcp_v6_route_req,
++ .init_seq = tcp_v6_init_sequence,
++ .send_synack = tcp_v6_send_synack,
++ .queue_hash_add = inet6_csk_reqsk_queue_hash_add,
++};
+
+-static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
+- u32 tsval, u32 tsecr, int oif,
+- struct tcp_md5sig_key *key, int rst, u8 tclass,
+- u32 label)
++static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack,
++ u32 data_ack, u32 win, u32 tsval, u32 tsecr,
++ int oif, struct tcp_md5sig_key *key, int rst,
++ u8 tclass, u32 label, int mptcp)
+ {
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcphdr *t1;
+@@ -756,7 +823,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
+ if (key)
+ tot_len += TCPOLEN_MD5SIG_ALIGNED;
+ #endif
+-
++#ifdef CONFIG_MPTCP
++ if (mptcp)
++ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
++#endif
+ buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
+ GFP_ATOMIC);
+ if (buff == NULL)
+@@ -794,6 +864,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
+ tcp_v6_md5_hash_hdr((__u8 *)topt, key,
+ &ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr, t1);
++ topt += 4;
++ }
++#endif
++#ifdef CONFIG_MPTCP
++ if (mptcp) {
++ /* Construction of 32-bit data_ack */
++ *topt++ = htonl((TCPOPT_MPTCP << 24) |
++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
++ (0x20 << 8) |
++ (0x01));
++ *topt++ = htonl(data_ack);
+ }
+ #endif
+
+@@ -834,7 +915,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
+ kfree_skb(buff);
+ }
+
+-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
+ {
+ const struct tcphdr *th = tcp_hdr(skb);
+ u32 seq = 0, ack_seq = 0;
+@@ -891,7 +972,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
+ (th->doff << 2);
+
+ oif = sk ? sk->sk_bound_dev_if : 0;
+- tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
++ tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0);
+
+ #ifdef CONFIG_TCP_MD5SIG
+ release_sk1:
+@@ -902,45 +983,52 @@ release_sk1:
+ #endif
+ }
+
+-static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
++static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
+ u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_md5sig_key *key, u8 tclass,
+- u32 label)
++ u32 label, int mptcp)
+ {
+- tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass,
+- label);
++ tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, oif,
++ key, 0, tclass, label, mptcp);
+ }
+
+ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
+ {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
++ u32 data_ack = 0;
++ int mptcp = 0;
+
++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
++ mptcp = 1;
++ }
+ tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
++ data_ack,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
+ tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
+- tw->tw_tclass, (tw->tw_flowlabel << 12));
++ tw->tw_tclass, (tw->tw_flowlabel << 12), mptcp);
+
+ inet_twsk_put(tw);
+ }
+
+-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+- struct request_sock *req)
++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
++ struct request_sock *req)
+ {
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+- tcp_rsk(req)->rcv_nxt,
++ tcp_rsk(req)->rcv_nxt, 0,
+ req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
+ tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
+- 0, 0);
++ 0, 0, 0);
+ }
+
+
+-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
+ {
+ struct request_sock *req, **prev;
+ const struct tcphdr *th = tcp_hdr(skb);
+@@ -959,7 +1047,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
+
+ if (nsk) {
+ if (nsk->sk_state != TCP_TIME_WAIT) {
++ /* Don't lock again the meta-sk. It has been locked
++ * before mptcp_v6_do_rcv.
++ */
++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
++ bh_lock_sock(mptcp_meta_sk(nsk));
+ bh_lock_sock(nsk);
++
+ return nsk;
+ }
+ inet_twsk_put(inet_twsk(nsk));
+@@ -973,161 +1067,25 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
+ return sk;
+ }
+
+-/* FIXME: this is substantially similar to the ipv4 code.
+- * Can some kind of merge be done? -- erics
+- */
+-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+ {
+- struct tcp_options_received tmp_opt;
+- struct request_sock *req;
+- struct inet_request_sock *ireq;
+- struct ipv6_pinfo *np = inet6_sk(sk);
+- struct tcp_sock *tp = tcp_sk(sk);
+- __u32 isn = TCP_SKB_CB(skb)->when;
+- struct dst_entry *dst = NULL;
+- struct tcp_fastopen_cookie foc = { .len = -1 };
+- bool want_cookie = false, fastopen;
+- struct flowi6 fl6;
+- int err;
+-
+ if (skb->protocol == htons(ETH_P_IP))
+ return tcp_v4_conn_request(sk, skb);
+
+ if (!ipv6_unicast_destination(skb))
+ goto drop;
+
+- if ((sysctl_tcp_syncookies == 2 ||
+- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+- want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6");
+- if (!want_cookie)
+- goto drop;
+- }
+-
+- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+- goto drop;
+- }
+-
+- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
+- if (req == NULL)
+- goto drop;
+-
+-#ifdef CONFIG_TCP_MD5SIG
+- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
+-#endif
+-
+- tcp_clear_options(&tmp_opt);
+- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
+- tmp_opt.user_mss = tp->rx_opt.user_mss;
+- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+-
+- if (want_cookie && !tmp_opt.saw_tstamp)
+- tcp_clear_options(&tmp_opt);
++ return tcp_conn_request(&tcp6_request_sock_ops,
++ &tcp_request_sock_ipv6_ops, sk, skb);
+
+- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+- tcp_openreq_init(req, &tmp_opt, skb);
+-
+- ireq = inet_rsk(req);
+- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+- if (!want_cookie || tmp_opt.tstamp_ok)
+- TCP_ECN_create_request(req, skb, sock_net(sk));
+-
+- ireq->ir_iif = sk->sk_bound_dev_if;
+- ireq->ir_mark = inet_request_mark(sk, skb);
+-
+- /* So that link locals have meaning */
+- if (!sk->sk_bound_dev_if &&
+- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+- ireq->ir_iif = inet6_iif(skb);
+-
+- if (!isn) {
+- if (ipv6_opt_accepted(sk, skb) ||
+- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim ||
+- np->repflow) {
+- atomic_inc(&skb->users);
+- ireq->pktopts = skb;
+- }
+-
+- if (want_cookie) {
+- isn = cookie_v6_init_sequence(sk, skb, &req->mss);
+- req->cookie_ts = tmp_opt.tstamp_ok;
+- goto have_isn;
+- }
+-
+- /* VJ's idea. We save last timestamp seen
+- * from the destination in peer table, when entering
+- * state TIME-WAIT, and check against it before
+- * accepting new connection request.
+- *
+- * If "isn" is not zero, this request hit alive
+- * timewait bucket, so that all the necessary checks
+- * are made in the function processing timewait state.
+- */
+- if (tmp_opt.saw_tstamp &&
+- tcp_death_row.sysctl_tw_recycle &&
+- (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
+- if (!tcp_peer_is_proven(req, dst, true)) {
+- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+- goto drop_and_release;
+- }
+- }
+- /* Kill the following clause, if you dislike this way. */
+- else if (!sysctl_tcp_syncookies &&
+- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+- (sysctl_max_syn_backlog >> 2)) &&
+- !tcp_peer_is_proven(req, dst, false)) {
+- /* Without syncookies last quarter of
+- * backlog is filled with destinations,
+- * proven to be alive.
+- * It means that we continue to communicate
+- * to destinations, already remembered
+- * to the moment of synflood.
+- */
+- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
+- &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source));
+- goto drop_and_release;
+- }
+-
+- isn = tcp_v6_init_sequence(skb);
+- }
+-have_isn:
+-
+- if (security_inet_conn_request(sk, skb, req))
+- goto drop_and_release;
+-
+- if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL)
+- goto drop_and_free;
+-
+- tcp_rsk(req)->snt_isn = isn;
+- tcp_rsk(req)->snt_synack = tcp_time_stamp;
+- tcp_openreq_init_rwin(req, sk, dst);
+- fastopen = !want_cookie &&
+- tcp_try_fastopen(sk, skb, req, &foc, dst);
+- err = tcp_v6_send_synack(sk, dst, &fl6, req,
+- skb_get_queue_mapping(skb), &foc);
+- if (!fastopen) {
+- if (err || want_cookie)
+- goto drop_and_free;
+-
+- tcp_rsk(req)->listener = NULL;
+- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+- }
+- return 0;
+-
+-drop_and_release:
+- dst_release(dst);
+-drop_and_free:
+- reqsk_free(req);
+ drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0; /* don't send reset */
+ }
+
+-static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+- struct request_sock *req,
+- struct dst_entry *dst)
++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
++ struct request_sock *req,
++ struct dst_entry *dst)
+ {
+ struct inet_request_sock *ireq;
+ struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+@@ -1165,7 +1123,12 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+
+ newsk->sk_v6_rcv_saddr = newnp->saddr;
+
+- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
++#ifdef CONFIG_MPTCP
++ if (is_mptcp_enabled(newsk))
++ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped;
++ else
++#endif
++ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+ newsk->sk_backlog_rcv = tcp_v4_do_rcv;
+ #ifdef CONFIG_TCP_MD5SIG
+ newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
+@@ -1329,7 +1292,7 @@ out:
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+ {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp_sock *tp;
+@@ -1351,6 +1314,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+ goto discard;
+ #endif
+
++ if (is_meta_sk(sk))
++ return mptcp_v6_do_rcv(sk, skb);
++
+ if (sk_filter(sk, skb))
+ goto discard;
+
+@@ -1472,7 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
+ {
+ const struct tcphdr *th;
+ const struct ipv6hdr *hdr;
+- struct sock *sk;
++ struct sock *sk, *meta_sk = NULL;
+ int ret;
+ struct net *net = dev_net(skb->dev);
+
+@@ -1503,18 +1469,43 @@ static int tcp_v6_rcv(struct sk_buff *skb)
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff*4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
++#ifdef CONFIG_MPTCP
++ TCP_SKB_CB(skb)->mptcp_flags = 0;
++ TCP_SKB_CB(skb)->dss_off = 0;
++#endif
+ TCP_SKB_CB(skb)->when = 0;
+ TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+- if (!sk)
+- goto no_tcp_socket;
+
+ process:
+- if (sk->sk_state == TCP_TIME_WAIT)
++ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
++#ifdef CONFIG_MPTCP
++ if (!sk && th->syn && !th->ack) {
++ int ret = mptcp_lookup_join(skb, NULL);
++
++ if (ret < 0) {
++ tcp_v6_send_reset(NULL, skb);
++ goto discard_it;
++ } else if (ret > 0) {
++ return 0;
++ }
++ }
++
++ /* Is there a pending request sock for this segment ? */
++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
++ if (sk)
++ sock_put(sk);
++ return 0;
++ }
++#endif
++
++ if (!sk)
++ goto no_tcp_socket;
++
+ if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+@@ -1529,11 +1520,21 @@ process:
+ sk_mark_napi_id(sk, skb);
+ skb->dev = NULL;
+
+- bh_lock_sock_nested(sk);
++ if (mptcp(tcp_sk(sk))) {
++ meta_sk = mptcp_meta_sk(sk);
++
++ bh_lock_sock_nested(meta_sk);
++ if (sock_owned_by_user(meta_sk))
++ skb->sk = sk;
++ } else {
++ meta_sk = sk;
++ bh_lock_sock_nested(sk);
++ }
++
+ ret = 0;
+- if (!sock_owned_by_user(sk)) {
++ if (!sock_owned_by_user(meta_sk)) {
+ #ifdef CONFIG_NET_DMA
+- struct tcp_sock *tp = tcp_sk(sk);
++ struct tcp_sock *tp = tcp_sk(meta_sk);
+ if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+ tp->ucopy.dma_chan = net_dma_find_channel();
+ if (tp->ucopy.dma_chan)
+@@ -1541,16 +1542,17 @@ process:
+ else
+ #endif
+ {
+- if (!tcp_prequeue(sk, skb))
++ if (!tcp_prequeue(meta_sk, skb))
+ ret = tcp_v6_do_rcv(sk, skb);
+ }
+- } else if (unlikely(sk_add_backlog(sk, skb,
+- sk->sk_rcvbuf + sk->sk_sndbuf))) {
+- bh_unlock_sock(sk);
++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
++ bh_unlock_sock(meta_sk);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ goto discard_and_relse;
+ }
+- bh_unlock_sock(sk);
++
++ bh_unlock_sock(meta_sk);
+
+ sock_put(sk);
+ return ret ? -1 : 0;
+@@ -1607,6 +1609,18 @@ do_time_wait:
+ sk = sk2;
+ goto process;
+ }
++#ifdef CONFIG_MPTCP
++ if (th->syn && !th->ack) {
++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
++
++ if (ret < 0) {
++ tcp_v6_send_reset(NULL, skb);
++ goto discard_it;
++ } else if (ret > 0) {
++ return 0;
++ }
++ }
++#endif
+ /* Fall through to ACK */
+ }
+ case TCP_TW_ACK:
+@@ -1657,7 +1671,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
+ }
+ }
+
+-static struct timewait_sock_ops tcp6_timewait_sock_ops = {
++struct timewait_sock_ops tcp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
+ .twsk_unique = tcp_twsk_unique,
+ .twsk_destructor = tcp_twsk_destructor,
+@@ -1730,7 +1744,12 @@ static int tcp_v6_init_sock(struct sock *sk)
+
+ tcp_init_sock(sk);
+
+- icsk->icsk_af_ops = &ipv6_specific;
++#ifdef CONFIG_MPTCP
++ if (is_mptcp_enabled(sk))
++ icsk->icsk_af_ops = &mptcp_v6_specific;
++ else
++#endif
++ icsk->icsk_af_ops = &ipv6_specific;
+
+ #ifdef CONFIG_TCP_MD5SIG
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
+@@ -1739,7 +1758,7 @@ static int tcp_v6_init_sock(struct sock *sk)
+ return 0;
+ }
+
+-static void tcp_v6_destroy_sock(struct sock *sk)
++void tcp_v6_destroy_sock(struct sock *sk)
+ {
+ tcp_v4_destroy_sock(sk);
+ inet6_destroy_sock(sk);
+@@ -1924,12 +1943,28 @@ void tcp6_proc_exit(struct net *net)
+ static void tcp_v6_clear_sk(struct sock *sk, int size)
+ {
+ struct inet_sock *inet = inet_sk(sk);
++#ifdef CONFIG_MPTCP
++ struct tcp_sock *tp = tcp_sk(sk);
++ /* size_tk_table goes from the end of tk_table to the end of sk */
++ int size_tk_table = size - offsetof(struct tcp_sock, tk_table) -
++ sizeof(tp->tk_table);
++#endif
+
+ /* we do not want to clear pinet6 field, because of RCU lookups */
+ sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
+
+ size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
++
++#ifdef CONFIG_MPTCP
++ /* We zero out only from pinet6 to tk_table */
++ size -= size_tk_table + sizeof(tp->tk_table);
++#endif
+ memset(&inet->pinet6 + 1, 0, size);
++
++#ifdef CONFIG_MPTCP
++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size_tk_table);
++#endif
++
+ }
+
+ struct proto tcpv6_prot = {
+diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
+new file mode 100644
+index 000000000000..cdfc03adabf8
+--- /dev/null
++++ b/net/mptcp/Kconfig
+@@ -0,0 +1,115 @@
++#
++# MPTCP configuration
++#
++config MPTCP
++ bool "MPTCP protocol"
++ depends on (IPV6=y || IPV6=n)
++ ---help---
++ This replaces the normal TCP stack with a Multipath TCP stack,
++ able to use several paths at once.
++
++menuconfig MPTCP_PM_ADVANCED
++ bool "MPTCP: advanced path-manager control"
++ depends on MPTCP=y
++ ---help---
++ Support for selection of different path-managers. You should choose 'Y' here,
++ because otherwise you will not actively create new MPTCP-subflows.
++
++if MPTCP_PM_ADVANCED
++
++config MPTCP_FULLMESH
++ tristate "MPTCP Full-Mesh Path-Manager"
++ depends on MPTCP=y
++ ---help---
++ This path-management module will create a full-mesh among all IP-addresses.
++
++config MPTCP_NDIFFPORTS
++ tristate "MPTCP ndiff-ports"
++ depends on MPTCP=y
++ ---help---
++ This path-management module will create multiple subflows between the same
++ pair of IP-addresses, modifying the source-port. You can set the number
++ of subflows via the mptcp_ndiffports-sysctl.
++
++config MPTCP_BINDER
++ tristate "MPTCP Binder"
++ depends on (MPTCP=y)
++ ---help---
++ This path-management module works like ndiffports, and adds the sysctl
++ option to set the gateway (and/or path to) per each additional subflow
++ via Loose Source Routing (IPv4 only).
++
++choice
++ prompt "Default MPTCP Path-Manager"
++	default DEFAULT_DUMMY
++ help
++ Select the Path-Manager of your choice
++
++ config DEFAULT_FULLMESH
++ bool "Full mesh" if MPTCP_FULLMESH=y
++
++ config DEFAULT_NDIFFPORTS
++ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
++
++ config DEFAULT_BINDER
++ bool "binder" if MPTCP_BINDER=y
++
++ config DEFAULT_DUMMY
++ bool "Default"
++
++endchoice
++
++endif
++
++config DEFAULT_MPTCP_PM
++ string
++ default "default" if DEFAULT_DUMMY
++ default "fullmesh" if DEFAULT_FULLMESH
++ default "ndiffports" if DEFAULT_NDIFFPORTS
++ default "binder" if DEFAULT_BINDER
++ default "default"
++
++menuconfig MPTCP_SCHED_ADVANCED
++ bool "MPTCP: advanced scheduler control"
++ depends on MPTCP=y
++ ---help---
++ Support for selection of different schedulers. You should choose 'Y' here,
++ if you want to choose a different scheduler than the default one.
++
++if MPTCP_SCHED_ADVANCED
++
++config MPTCP_ROUNDROBIN
++ tristate "MPTCP Round-Robin"
++ depends on (MPTCP=y)
++ ---help---
++ This is a very simple round-robin scheduler. Probably has bad performance
++ but might be interesting for researchers.
++
++choice
++ prompt "Default MPTCP Scheduler"
++ default DEFAULT
++ help
++ Select the Scheduler of your choice
++
++ config DEFAULT_SCHEDULER
++ bool "Default"
++ ---help---
++ This is the default scheduler, sending first on the subflow
++ with the lowest RTT.
++
++ config DEFAULT_ROUNDROBIN
++ bool "Round-Robin" if MPTCP_ROUNDROBIN=y
++ ---help---
++	  This is the round-robin scheduler, sending in a round-robin
++	  fashion.
++
++endchoice
++endif
++
++config DEFAULT_MPTCP_SCHED
++ string
++ depends on (MPTCP=y)
++ default "default" if DEFAULT_SCHEDULER
++ default "roundrobin" if DEFAULT_ROUNDROBIN
++ default "default"
++
+diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
+new file mode 100644
+index 000000000000..35561a7012e3
+--- /dev/null
++++ b/net/mptcp/Makefile
+@@ -0,0 +1,20 @@
++#
++## Makefile for MultiPath TCP support code.
++#
++#
++
++obj-$(CONFIG_MPTCP) += mptcp.o
++
++mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \
++ mptcp_output.o mptcp_input.o mptcp_sched.o
++
++obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o
++obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
++obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
++obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
++obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
++obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
++obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
++
++mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
++
+diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c
+new file mode 100644
+index 000000000000..95d8da560715
+--- /dev/null
++++ b/net/mptcp/mptcp_binder.c
+@@ -0,0 +1,487 @@
++#include <linux/module.h>
++
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++
++#include <linux/route.h>
++#include <linux/inet.h>
++#include <linux/mroute.h>
++#include <linux/spinlock_types.h>
++#include <net/inet_ecn.h>
++#include <net/route.h>
++#include <net/xfrm.h>
++#include <net/compat.h>
++#include <linux/slab.h>
++
++#define MPTCP_GW_MAX_LISTS 10
++#define MPTCP_GW_LIST_MAX_LEN 6
++#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \
++ MPTCP_GW_MAX_LISTS)
++
++struct mptcp_gw_list {
++ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN];
++ u8 len[MPTCP_GW_MAX_LISTS];
++};
++
++struct binder_priv {
++ /* Worker struct for subflow establishment */
++ struct work_struct subflow_work;
++
++ struct mptcp_cb *mpcb;
++
++ /* Prevent multiple sub-sockets concurrently iterating over sockets */
++ spinlock_t *flow_lock;
++};
++
++static struct mptcp_gw_list *mptcp_gws;
++static rwlock_t mptcp_gws_lock;
++
++static int mptcp_binder_ndiffports __read_mostly = 1;
++
++static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly;
++
++static int mptcp_get_avail_list_ipv4(struct sock *sk)
++{
++ int i, j, list_taken, opt_ret, opt_len;
++ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN];
++
++ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) {
++ if (mptcp_gws->len[i] == 0)
++ goto error;
++
++ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i);
++ list_taken = 0;
++
++ /* Loop through all sub-sockets in this connection */
++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk) {
++ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n");
++
++ /* Reset length and options buffer, then retrieve
++ * from socket
++ */
++ opt_len = MAX_IPOPTLEN;
++ memset(opt, 0, MAX_IPOPTLEN);
++ opt_ret = ip_getsockopt(sk, IPPROTO_IP,
++ IP_OPTIONS, opt, &opt_len);
++ if (opt_ret < 0) {
++ mptcp_debug(KERN_ERR "%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n",
++ __func__, opt_ret);
++ goto error;
++ }
++
++ /* If socket has no options, it has no stake in this list */
++ if (opt_len <= 0)
++ continue;
++
++ /* Iterate options buffer */
++ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) {
++ if (*opt_ptr == IPOPT_LSRR) {
++ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n");
++ goto sock_lsrr;
++ }
++ }
++ continue;
++
++sock_lsrr:
++ /* Pointer to the 2nd to last address */
++ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4;
++
++ /* Addresses start 3 bytes after type offset */
++ opt_ptr += 3;
++ j = 0;
++
++ /* Different length lists cannot be the same */
++ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i])
++ continue;
++
++ /* Iterate if we are still inside options list
++ * and sysctl list
++ */
++ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) {
++ /* If there is a different address, this list must
++ * not be set on this socket
++ */
++ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4))
++ break;
++
++ /* Jump 4 bytes to next address */
++ opt_ptr += 4;
++ j++;
++ }
++
++ /* Reached the end without a differing address, lists
++ * are therefore identical.
++ */
++ if (j == mptcp_gws->len[i]) {
++ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n");
++ list_taken = 1;
++ break;
++ }
++ }
++
++ /* Free list found if not taken by a socket */
++ if (!list_taken) {
++ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n");
++ break;
++ }
++ }
++
++ if (i >= MPTCP_GW_MAX_LISTS)
++ goto error;
++
++ return i;
++error:
++ return -1;
++}
++
++/* The list of addresses is parsed each time a new connection is opened,
++ * to make sure it's up to date. In case of error, all the lists are
++ * marked as unavailable and the subflow's fingerprint is set to 0.
++ */
++static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr)
++{
++ int i, j, ret;
++ unsigned char opt[MAX_IPOPTLEN] = {0};
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0];
++
++ /* Read lock: multiple sockets can read LSRR addresses at the same
++ * time, but writes are done in mutual exclusion.
++ * Spin lock: must search for free list for one socket at a time, or
++ * multiple sockets could take the same list.
++ */
++ read_lock(&mptcp_gws_lock);
++ spin_lock(fmp->flow_lock);
++
++ i = mptcp_get_avail_list_ipv4(sk);
++
++ /* Execution enters here only if a free path is found.
++ */
++ if (i >= 0) {
++ opt[0] = IPOPT_NOP;
++ opt[1] = IPOPT_LSRR;
++ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) *
++ (mptcp_gws->len[i] + 1) + 3;
++ opt[3] = IPOPT_MINOFF;
++ for (j = 0; j < mptcp_gws->len[i]; ++j)
++ memcpy(opt + 4 +
++ (j * sizeof(mptcp_gws->list[i][0].s_addr)),
++ &mptcp_gws->list[i][j].s_addr,
++ sizeof(mptcp_gws->list[i][0].s_addr));
++ /* Final destination must be part of IP_OPTIONS parameter. */
++ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr,
++ sizeof(addr.s_addr));
++
++ /* setsockopt must be inside the lock, otherwise another
++ * subflow could fail to see that we have taken a list.
++ */
++ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, opt,
++ 4 + sizeof(mptcp_gws->list[i][0].s_addr)
++ * (mptcp_gws->len[i] + 1));
++
++ if (ret < 0) {
++ mptcp_debug(KERN_ERR "%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n",
++ __func__, ret);
++ }
++ }
++
++ spin_unlock(fmp->flow_lock);
++ read_unlock(&mptcp_gws_lock);
++
++ return;
++}
++
++/* Parses gateways string for a list of paths to different
++ * gateways, and stores them for use with the Loose Source Routing (LSRR)
++ * socket option. Each list must have "," separated addresses, and the lists
++ * themselves must be separated by "-". Returns -1 in case one or more of the
++ * addresses is not a valid ipv4/6 address.
++ */
++static int mptcp_parse_gateway_ipv4(char *gateways)
++{
++ int i, j, k, ret;
++ char *tmp_string = NULL;
++ struct in_addr tmp_addr;
++
++ tmp_string = kzalloc(16, GFP_KERNEL);
++ if (tmp_string == NULL)
++ return -ENOMEM;
++
++ write_lock(&mptcp_gws_lock);
++
++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
++
++ /* A TMP string is used since inet_pton needs a null terminated string
++ * but we do not want to modify the sysctl for obvious reasons.
++ * i will iterate over the SYSCTL string, j will iterate over the
++ * temporary string where each IP is copied into, k will iterate over
++ * the IPs in each list.
++ */
++ for (i = j = k = 0;
++ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS;
++ ++i) {
++ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') {
++ /* If the temp IP is empty and the current list is
++ * empty, we are done.
++ */
++ if (j == 0 && mptcp_gws->len[k] == 0)
++ break;
++
++ /* Terminate the temp IP string, then if it is
++ * non-empty parse the IP and copy it.
++ */
++ tmp_string[j] = '\0';
++ if (j > 0) {
++ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i);
++
++ ret = in4_pton(tmp_string, strlen(tmp_string),
++ (u8 *)&tmp_addr.s_addr, '\0',
++ NULL);
++
++ if (ret) {
++ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n",
++ ret,
++ &tmp_addr.s_addr);
++ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr,
++ &tmp_addr.s_addr,
++ sizeof(tmp_addr.s_addr));
++ mptcp_gws->len[k]++;
++ j = 0;
++ tmp_string[j] = '\0';
++ /* Since we can't impose a limit to
++ * what the user can input, make sure
++ * there are not too many IPs in the
++ * SYSCTL string.
++ */
++ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) {
++ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n",
++ k,
++ MPTCP_GW_LIST_MAX_LEN);
++ goto error;
++ }
++ } else {
++ goto error;
++ }
++ }
++
++ if (gateways[i] == '-' || gateways[i] == '\0')
++ ++k;
++ } else {
++ tmp_string[j] = gateways[i];
++ ++j;
++ }
++ }
++
++ /* Number of flows is number of gateway lists plus master flow */
++ mptcp_binder_ndiffports = k+1;
++
++ write_unlock(&mptcp_gws_lock);
++ kfree(tmp_string);
++
++ return 0;
++
++error:
++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
++ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN);
++ write_unlock(&mptcp_gws_lock);
++ kfree(tmp_string);
++ return -1;
++}
++
++/**
++ * Create all new subflows, by doing calls to mptcp_initX_subsockets
++ *
++ * This function uses a goto next_subflow, to allow releasing the lock between
++ * new subflows and giving other processes a chance to do some work on the
++ * socket and potentially finishing the communication.
++ **/
++static void create_subflow_worker(struct work_struct *work)
++{
++ const struct binder_priv *pm_priv = container_of(work,
++ struct binder_priv,
++ subflow_work);
++ struct mptcp_cb *mpcb = pm_priv->mpcb;
++ struct sock *meta_sk = mpcb->meta_sk;
++ int iter = 0;
++
++next_subflow:
++ if (iter) {
++ release_sock(meta_sk);
++ mutex_unlock(&mpcb->mpcb_mutex);
++
++ cond_resched();
++ }
++ mutex_lock(&mpcb->mpcb_mutex);
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
++
++ iter++;
++
++ if (sock_flag(meta_sk, SOCK_DEAD))
++ goto exit;
++
++ if (mpcb->master_sk &&
++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
++ goto exit;
++
++ if (mptcp_binder_ndiffports > iter &&
++ mptcp_binder_ndiffports > mpcb->cnt_subflows) {
++ struct mptcp_loc4 loc;
++ struct mptcp_rem4 rem;
++
++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
++ loc.loc4_id = 0;
++ loc.low_prio = 0;
++
++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
++ rem.port = inet_sk(meta_sk)->inet_dport;
++ rem.rem4_id = 0; /* Default 0 */
++
++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
++
++ goto next_subflow;
++ }
++
++exit:
++ release_sock(meta_sk);
++ mutex_unlock(&mpcb->mpcb_mutex);
++ sock_put(meta_sk);
++}
++
++static void binder_new_session(const struct sock *meta_sk)
++{
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0];
++ static DEFINE_SPINLOCK(flow_lock);
++
++#if IS_ENABLED(CONFIG_IPV6)
++ if (meta_sk->sk_family == AF_INET6 &&
++ !mptcp_v6_is_v4_mapped(meta_sk)) {
++ mptcp_fallback_default(mpcb);
++ return;
++ }
++#endif
++
++ /* Initialize workqueue-struct */
++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
++ fmp->mpcb = mpcb;
++
++ fmp->flow_lock = &flow_lock;
++}
++
++static void binder_create_subflows(struct sock *meta_sk)
++{
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0];
++
++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
++ mpcb->send_infinite_mapping ||
++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
++ return;
++
++ if (!work_pending(&pm_priv->subflow_work)) {
++ sock_hold(meta_sk);
++ queue_work(mptcp_wq, &pm_priv->subflow_work);
++ }
++}
++
++static int binder_get_local_id(sa_family_t family, union inet_addr *addr,
++ struct net *net, bool *low_prio)
++{
++ return 0;
++}
++
++/* Callback functions, executed when sysctl mptcp.mptcp_gateways is updated.
++ * Inspired from proc_tcp_congestion_control().
++ */
++static int proc_mptcp_gateways(ctl_table *ctl, int write,
++ void __user *buffer, size_t *lenp,
++ loff_t *ppos)
++{
++ int ret;
++ ctl_table tbl = {
++ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN,
++ };
++
++ if (write) {
++ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL);
++ if (tbl.data == NULL)
++ return -1;
++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
++ if (ret == 0) {
++ ret = mptcp_parse_gateway_ipv4(tbl.data);
++ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN);
++ }
++ kfree(tbl.data);
++ } else {
++ ret = proc_dostring(ctl, write, buffer, lenp, ppos);
++ }
++
++
++ return ret;
++}
++
++static struct mptcp_pm_ops binder __read_mostly = {
++ .new_session = binder_new_session,
++ .fully_established = binder_create_subflows,
++ .get_local_id = binder_get_local_id,
++ .init_subsocket_v4 = mptcp_v4_add_lsrr,
++ .name = "binder",
++ .owner = THIS_MODULE,
++};
++
++static struct ctl_table binder_table[] = {
++ {
++ .procname = "mptcp_binder_gateways",
++ .data = &sysctl_mptcp_binder_gateways,
++ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN,
++ .mode = 0644,
++ .proc_handler = &proc_mptcp_gateways
++ },
++ { }
++};
++
++struct ctl_table_header *mptcp_sysctl_binder;
++
++/* General initialization of MPTCP_PM */
++static int __init binder_register(void)
++{
++ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL);
++ if (!mptcp_gws)
++ return -ENOMEM;
++
++ rwlock_init(&mptcp_gws_lock);
++
++ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE);
++
++ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp",
++ binder_table);
++ if (!mptcp_sysctl_binder)
++ goto sysctl_fail;
++
++ if (mptcp_register_path_manager(&binder))
++ goto pm_failed;
++
++ return 0;
++
++pm_failed:
++ unregister_net_sysctl_table(mptcp_sysctl_binder);
++sysctl_fail:
++ kfree(mptcp_gws);
++
++ return -1;
++}
++
++static void binder_unregister(void)
++{
++ mptcp_unregister_path_manager(&binder);
++ unregister_net_sysctl_table(mptcp_sysctl_binder);
++ kfree(mptcp_gws);
++}
++
++module_init(binder_register);
++module_exit(binder_unregister);
++
++MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("BINDER MPTCP");
++MODULE_VERSION("0.1");
+diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c
+new file mode 100644
+index 000000000000..5d761164eb85
+--- /dev/null
++++ b/net/mptcp/mptcp_coupled.c
+@@ -0,0 +1,270 @@
++/*
++ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA)
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++#include <net/tcp.h>
++#include <net/mptcp.h>
++
++#include <linux/module.h>
++
++/* Scaling is done in the numerator with alpha_scale_num and in the denominator
++ * with alpha_scale_den.
++ *
++ * To downscale, we just need to use alpha_scale.
++ *
++ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
++ */
++static int alpha_scale_den = 10;
++static int alpha_scale_num = 32;
++static int alpha_scale = 12;
++
++struct mptcp_ccc {
++ u64 alpha;
++ bool forced_update;
++};
++
++static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
++{
++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
++}
++
++static inline u64 mptcp_get_alpha(const struct sock *meta_sk)
++{
++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha;
++}
++
++static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha)
++{
++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha;
++}
++
++static inline u64 mptcp_ccc_scale(u32 val, int scale)
++{
++ return (u64) val << scale;
++}
++
++static inline bool mptcp_get_forced(const struct sock *meta_sk)
++{
++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update;
++}
++
++static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
++{
++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force;
++}
++
++static void mptcp_ccc_recalc_alpha(const struct sock *sk)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++ const struct sock *sub_sk;
++ int best_cwnd = 0, best_rtt = 0, can_send = 0;
++ u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
++
++ if (!mpcb)
++ return;
++
++ /* Only one subflow left - fall back to normal reno-behavior
++ * (set alpha to 1)
++ */
++ if (mpcb->cnt_established <= 1)
++ goto exit;
++
++ /* Do regular alpha-calculation for multiple subflows */
++
++ /* Find the max numerator of the alpha-calculation */
++ mptcp_for_each_sk(mpcb, sub_sk) {
++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
++ u64 tmp;
++
++ if (!mptcp_ccc_sk_can_send(sub_sk))
++ continue;
++
++ can_send++;
++
++ /* We need to look for the path, that provides the max-value.
++ * Integer-overflow is not possible here, because
++ * tmp will be in u64.
++ */
++ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
++ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us);
++
++ if (tmp >= max_numerator) {
++ max_numerator = tmp;
++ best_cwnd = sub_tp->snd_cwnd;
++ best_rtt = sub_tp->srtt_us;
++ }
++ }
++
++ /* No subflow is able to send - we don't care anymore */
++ if (unlikely(!can_send))
++ goto exit;
++
++ /* Calculate the denominator */
++ mptcp_for_each_sk(mpcb, sub_sk) {
++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
++
++ if (!mptcp_ccc_sk_can_send(sub_sk))
++ continue;
++
++ sum_denominator += div_u64(
++ mptcp_ccc_scale(sub_tp->snd_cwnd,
++ alpha_scale_den) * best_rtt,
++ sub_tp->srtt_us);
++ }
++ sum_denominator *= sum_denominator;
++ if (unlikely(!sum_denominator)) {
++ pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
++ __func__, mpcb->cnt_established);
++ mptcp_for_each_sk(mpcb, sub_sk) {
++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
++ pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u",
++ __func__, sub_tp->mptcp->path_index,
++ sub_sk->sk_state, sub_tp->srtt_us,
++ sub_tp->snd_cwnd);
++ }
++ }
++
++ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
++
++ if (unlikely(!alpha))
++ alpha = 1;
++
++exit:
++ mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
++}
++
++static void mptcp_ccc_init(struct sock *sk)
++{
++ if (mptcp(tcp_sk(sk))) {
++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
++ mptcp_set_alpha(mptcp_meta_sk(sk), 1);
++ }
++ /* If we do not mptcp, behave like reno: return */
++}
++
++static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
++{
++ if (event == CA_EVENT_LOSS)
++ mptcp_ccc_recalc_alpha(sk);
++}
++
++static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
++{
++ if (!mptcp(tcp_sk(sk)))
++ return;
++
++ mptcp_set_forced(mptcp_meta_sk(sk), 1);
++}
++
++static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ const struct mptcp_cb *mpcb = tp->mpcb;
++ int snd_cwnd;
++
++ if (!mptcp(tp)) {
++ tcp_reno_cong_avoid(sk, ack, acked);
++ return;
++ }
++
++ if (!tcp_is_cwnd_limited(sk))
++ return;
++
++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
++ /* In "safe" area, increase. */
++ tcp_slow_start(tp, acked);
++ mptcp_ccc_recalc_alpha(sk);
++ return;
++ }
++
++ if (mptcp_get_forced(mptcp_meta_sk(sk))) {
++ mptcp_ccc_recalc_alpha(sk);
++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
++ }
++
++ if (mpcb->cnt_established > 1) {
++ u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
++
++ /* This may happen, if at the initialization, the mpcb
++ * was not yet attached to the sock, and thus
++ * initializing alpha failed.
++ */
++ if (unlikely(!alpha))
++ alpha = 1;
++
++ snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale),
++ alpha);
++
++ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
++ * Thus, we select here the max value.
++ */
++ if (snd_cwnd < tp->snd_cwnd)
++ snd_cwnd = tp->snd_cwnd;
++ } else {
++ snd_cwnd = tp->snd_cwnd;
++ }
++
++ if (tp->snd_cwnd_cnt >= snd_cwnd) {
++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
++ tp->snd_cwnd++;
++ mptcp_ccc_recalc_alpha(sk);
++ }
++
++ tp->snd_cwnd_cnt = 0;
++ } else {
++ tp->snd_cwnd_cnt++;
++ }
++}
++
++static struct tcp_congestion_ops mptcp_ccc = {
++ .init = mptcp_ccc_init,
++ .ssthresh = tcp_reno_ssthresh,
++ .cong_avoid = mptcp_ccc_cong_avoid,
++ .cwnd_event = mptcp_ccc_cwnd_event,
++ .set_state = mptcp_ccc_set_state,
++ .owner = THIS_MODULE,
++ .name = "lia",
++};
++
++static int __init mptcp_ccc_register(void)
++{
++ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
++ return tcp_register_congestion_control(&mptcp_ccc);
++}
++
++static void __exit mptcp_ccc_unregister(void)
++{
++ tcp_unregister_congestion_control(&mptcp_ccc);
++}
++
++module_init(mptcp_ccc_register);
++module_exit(mptcp_ccc_unregister);
++
++MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM");
++MODULE_VERSION("0.1");
+diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
+new file mode 100644
+index 000000000000..28dfa0479f5e
+--- /dev/null
++++ b/net/mptcp/mptcp_ctrl.c
+@@ -0,0 +1,2401 @@
++/*
++ * MPTCP implementation - MPTCP-control
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <net/inet_common.h>
++#include <net/inet6_hashtables.h>
++#include <net/ipv6.h>
++#include <net/ip6_checksum.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++#if IS_ENABLED(CONFIG_IPV6)
++#include <net/ip6_route.h>
++#include <net/mptcp_v6.h>
++#endif
++#include <net/sock.h>
++#include <net/tcp.h>
++#include <net/tcp_states.h>
++#include <net/transp_v6.h>
++#include <net/xfrm.h>
++
++#include <linux/cryptohash.h>
++#include <linux/kconfig.h>
++#include <linux/module.h>
++#include <linux/netpoll.h>
++#include <linux/list.h>
++#include <linux/jhash.h>
++#include <linux/tcp.h>
++#include <linux/net.h>
++#include <linux/in.h>
++#include <linux/random.h>
++#include <linux/inetdevice.h>
++#include <linux/workqueue.h>
++#include <linux/atomic.h>
++#include <linux/sysctl.h>
++
++static struct kmem_cache *mptcp_sock_cache __read_mostly;
++static struct kmem_cache *mptcp_cb_cache __read_mostly;
++static struct kmem_cache *mptcp_tw_cache __read_mostly;
++
++int sysctl_mptcp_enabled __read_mostly = 1;
++int sysctl_mptcp_checksum __read_mostly = 1;
++int sysctl_mptcp_debug __read_mostly;
++EXPORT_SYMBOL(sysctl_mptcp_debug);
++int sysctl_mptcp_syn_retries __read_mostly = 3;
++
++bool mptcp_init_failed __read_mostly;
++
++struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE;
++EXPORT_SYMBOL(mptcp_static_key);
++
++static int proc_mptcp_path_manager(ctl_table *ctl, int write,
++ void __user *buffer, size_t *lenp,
++ loff_t *ppos)
++{
++ char val[MPTCP_PM_NAME_MAX];
++ ctl_table tbl = {
++ .data = val,
++ .maxlen = MPTCP_PM_NAME_MAX,
++ };
++ int ret;
++
++ mptcp_get_default_path_manager(val);
++
++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
++ if (write && ret == 0)
++ ret = mptcp_set_default_path_manager(val);
++ return ret;
++}
++
++static int proc_mptcp_scheduler(ctl_table *ctl, int write,
++ void __user *buffer, size_t *lenp,
++ loff_t *ppos)
++{
++ char val[MPTCP_SCHED_NAME_MAX];
++ ctl_table tbl = {
++ .data = val,
++ .maxlen = MPTCP_SCHED_NAME_MAX,
++ };
++ int ret;
++
++ mptcp_get_default_scheduler(val);
++
++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
++ if (write && ret == 0)
++ ret = mptcp_set_default_scheduler(val);
++ return ret;
++}
++
++static struct ctl_table mptcp_table[] = {
++ {
++ .procname = "mptcp_enabled",
++ .data = &sysctl_mptcp_enabled,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
++ {
++ .procname = "mptcp_checksum",
++ .data = &sysctl_mptcp_checksum,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
++ {
++ .procname = "mptcp_debug",
++ .data = &sysctl_mptcp_debug,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
++ {
++ .procname = "mptcp_syn_retries",
++ .data = &sysctl_mptcp_syn_retries,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
++ {
++ .procname = "mptcp_path_manager",
++ .mode = 0644,
++ .maxlen = MPTCP_PM_NAME_MAX,
++ .proc_handler = proc_mptcp_path_manager,
++ },
++ {
++ .procname = "mptcp_scheduler",
++ .mode = 0644,
++ .maxlen = MPTCP_SCHED_NAME_MAX,
++ .proc_handler = proc_mptcp_scheduler,
++ },
++ { }
++};
++
++static inline u32 mptcp_hash_tk(u32 token)
++{
++ return token % MPTCP_HASH_SIZE;
++}
++
++struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
++EXPORT_SYMBOL(tk_hashtable);
++
++/* This second hashtable is needed to retrieve request socks
++ * created as a result of a join request. While the SYN contains
++ * the token, the final ack does not, so we need a separate hashtable
++ * to retrieve the mpcb.
++ */
++struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
++spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
++
++/* The following hash table is used to avoid collision of token */
++static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
++spinlock_t mptcp_tk_hashlock; /* hashtable protection */
++
++static bool mptcp_reqsk_find_tk(const u32 token)
++{
++ const u32 hash = mptcp_hash_tk(token);
++ const struct mptcp_request_sock *mtreqsk;
++ const struct hlist_nulls_node *node;
++
++begin:
++ hlist_nulls_for_each_entry_rcu(mtreqsk, node,
++ &mptcp_reqsk_tk_htb[hash], hash_entry) {
++ if (token == mtreqsk->mptcp_loc_token)
++ return true;
++ }
++ /* A request-socket is destroyed by RCU. So, it might have been recycled
++ * and put into another hash-table list. So, after the lookup we may
++ * end up in a different list. So, we may need to restart.
++ *
++ * See also the comment in __inet_lookup_established.
++ */
++ if (get_nulls_value(node) != hash)
++ goto begin;
++ return false;
++}
++
++static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token)
++{
++ u32 hash = mptcp_hash_tk(token);
++
++ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry,
++ &mptcp_reqsk_tk_htb[hash]);
++}
++
++static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk)
++{
++ rcu_read_lock();
++ spin_lock(&mptcp_tk_hashlock);
++ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry);
++ spin_unlock(&mptcp_tk_hashlock);
++ rcu_read_unlock();
++}
++
++void mptcp_reqsk_destructor(struct request_sock *req)
++{
++ if (!mptcp_rsk(req)->is_sub) {
++ if (in_softirq()) {
++ mptcp_reqsk_remove_tk(req);
++ } else {
++ rcu_read_lock_bh();
++ spin_lock(&mptcp_tk_hashlock);
++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
++ spin_unlock(&mptcp_tk_hashlock);
++ rcu_read_unlock_bh();
++ }
++ } else {
++ mptcp_hash_request_remove(req);
++ }
++}
++
++static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token)
++{
++ u32 hash = mptcp_hash_tk(token);
++ hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
++ meta_tp->inside_tk_table = 1;
++}
++
++static bool mptcp_find_token(u32 token)
++{
++ const u32 hash = mptcp_hash_tk(token);
++ const struct tcp_sock *meta_tp;
++ const struct hlist_nulls_node *node;
++
++begin:
++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
++ if (token == meta_tp->mptcp_loc_token)
++ return true;
++ }
++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
++ * and put into another hash-table list. So, after the lookup we may
++ * end up in a different list. So, we may need to restart.
++ *
++ * See also the comment in __inet_lookup_established.
++ */
++ if (get_nulls_value(node) != hash)
++ goto begin;
++ return false;
++}
++
++static void mptcp_set_key_reqsk(struct request_sock *req,
++ const struct sk_buff *skb)
++{
++ const struct inet_request_sock *ireq = inet_rsk(req);
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
++
++ if (skb->protocol == htons(ETH_P_IP)) {
++ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
++ ip_hdr(skb)->daddr,
++ htons(ireq->ir_num),
++ ireq->ir_rmt_port);
++#if IS_ENABLED(CONFIG_IPV6)
++ } else {
++ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
++ ipv6_hdr(skb)->daddr.s6_addr32,
++ htons(ireq->ir_num),
++ ireq->ir_rmt_port);
++#endif
++ }
++
++ mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
++}
++
++/* New MPTCP-connection request, prepare a new token for the meta-socket that
++ * will be created in mptcp_check_req_master(), and store the received token.
++ */
++void mptcp_reqsk_new_mptcp(struct request_sock *req,
++ const struct mptcp_options_received *mopt,
++ const struct sk_buff *skb)
++{
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
++
++ inet_rsk(req)->saw_mpc = 1;
++
++ rcu_read_lock();
++ spin_lock(&mptcp_tk_hashlock);
++ do {
++ mptcp_set_key_reqsk(req, skb);
++ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
++ mptcp_find_token(mtreq->mptcp_loc_token));
++
++ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
++ spin_unlock(&mptcp_tk_hashlock);
++ rcu_read_unlock();
++ mtreq->mptcp_rem_key = mopt->mptcp_key;
++}
++
++static void mptcp_set_key_sk(const struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ const struct inet_sock *isk = inet_sk(sk);
++
++ if (sk->sk_family == AF_INET)
++ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
++ isk->inet_daddr,
++ isk->inet_sport,
++ isk->inet_dport);
++#if IS_ENABLED(CONFIG_IPV6)
++ else
++ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
++ sk->sk_v6_daddr.s6_addr32,
++ isk->inet_sport,
++ isk->inet_dport);
++#endif
++
++ mptcp_key_sha1(tp->mptcp_loc_key,
++ &tp->mptcp_loc_token, NULL);
++}
++
++void mptcp_connect_init(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ rcu_read_lock_bh();
++ spin_lock(&mptcp_tk_hashlock);
++ do {
++ mptcp_set_key_sk(sk);
++ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
++ mptcp_find_token(tp->mptcp_loc_token));
++
++ __mptcp_hash_insert(tp, tp->mptcp_loc_token);
++ spin_unlock(&mptcp_tk_hashlock);
++ rcu_read_unlock_bh();
++}
++
++/**
++ * This function increments the refcount of the mpcb struct.
++ * It is the responsibility of the caller to decrement when releasing
++ * the structure.
++ */
++struct sock *mptcp_hash_find(const struct net *net, const u32 token)
++{
++ const u32 hash = mptcp_hash_tk(token);
++ const struct tcp_sock *meta_tp;
++ struct sock *meta_sk = NULL;
++ const struct hlist_nulls_node *node;
++
++ rcu_read_lock();
++begin:
++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
++ tk_table) {
++ meta_sk = (struct sock *)meta_tp;
++ if (token == meta_tp->mptcp_loc_token &&
++ net_eq(net, sock_net(meta_sk))) {
++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
++ goto out;
++ if (unlikely(token != meta_tp->mptcp_loc_token ||
++ !net_eq(net, sock_net(meta_sk)))) {
++ sock_gen_put(meta_sk);
++ goto begin;
++ }
++ goto found;
++ }
++ }
++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
++ * and put into another hash-table list. So, after the lookup we may
++ * end up in a different list. So, we may need to restart.
++ *
++ * See also the comment in __inet_lookup_established.
++ */
++ if (get_nulls_value(node) != hash)
++ goto begin;
++out:
++ meta_sk = NULL;
++found:
++ rcu_read_unlock();
++ return meta_sk;
++}
++
++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
++{
++ /* remove from the token hashtable */
++ rcu_read_lock_bh();
++ spin_lock(&mptcp_tk_hashlock);
++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
++ meta_tp->inside_tk_table = 0;
++ spin_unlock(&mptcp_tk_hashlock);
++ rcu_read_unlock_bh();
++}
++
++void mptcp_hash_remove(struct tcp_sock *meta_tp)
++{
++ rcu_read_lock();
++ spin_lock(&mptcp_tk_hashlock);
++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
++ meta_tp->inside_tk_table = 0;
++ spin_unlock(&mptcp_tk_hashlock);
++ rcu_read_unlock();
++}
++
++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk)
++{
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct sock *sk, *rttsk = NULL, *lastsk = NULL;
++ u32 min_time = 0, last_active = 0;
++
++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++ u32 elapsed;
++
++ if (!mptcp_sk_can_send_ack(sk) || tp->pf)
++ continue;
++
++ elapsed = keepalive_time_elapsed(tp);
++
++ /* We take the one with the lowest RTT within a reasonable
++ * (meta-RTO)-timeframe
++ */
++ if (elapsed < inet_csk(meta_sk)->icsk_rto) {
++ if (!min_time || tp->srtt_us < min_time) {
++ min_time = tp->srtt_us;
++ rttsk = sk;
++ }
++ continue;
++ }
++
++ /* Otherwise, we just take the most recent active */
++ if (!rttsk && (!last_active || elapsed < last_active)) {
++ last_active = elapsed;
++ lastsk = sk;
++ }
++ }
++
++ if (rttsk)
++ return rttsk;
++
++ return lastsk;
++}
++EXPORT_SYMBOL(mptcp_select_ack_sock);
++
++static void mptcp_sock_def_error_report(struct sock *sk)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++
++ if (!sock_flag(sk, SOCK_DEAD))
++ mptcp_sub_close(sk, 0);
++
++ if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
++ mpcb->send_infinite_mapping) {
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++
++ meta_sk->sk_err = sk->sk_err;
++ meta_sk->sk_err_soft = sk->sk_err_soft;
++
++ if (!sock_flag(meta_sk, SOCK_DEAD))
++ meta_sk->sk_error_report(meta_sk);
++
++ tcp_done(meta_sk);
++ }
++
++ sk->sk_err = 0;
++ return;
++}
++
++static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
++{
++ if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) {
++ mptcp_cleanup_path_manager(mpcb);
++ mptcp_cleanup_scheduler(mpcb);
++ kmem_cache_free(mptcp_cb_cache, mpcb);
++ }
++}
++
++static void mptcp_sock_destruct(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ inet_sock_destruct(sk);
++
++ if (!is_meta_sk(sk) && !tp->was_meta_sk) {
++ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list));
++
++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
++ tp->mptcp = NULL;
++
++ /* Taken when mpcb pointer was set */
++ sock_put(mptcp_meta_sk(sk));
++ mptcp_mpcb_put(tp->mpcb);
++ } else {
++ struct mptcp_cb *mpcb = tp->mpcb;
++ struct mptcp_tw *mptw;
++
++ /* The mpcb is disappearing - we can make the final
++ * update to the rcv_nxt of the time-wait-sock and remove
++ * its reference to the mpcb.
++ */
++ spin_lock_bh(&mpcb->tw_lock);
++ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
++ list_del_rcu(&mptw->list);
++ mptw->in_list = 0;
++ mptcp_mpcb_put(mpcb);
++ rcu_assign_pointer(mptw->mpcb, NULL);
++ }
++ spin_unlock_bh(&mpcb->tw_lock);
++
++ mptcp_mpcb_put(mpcb);
++
++ mptcp_debug("%s destroying meta-sk\n", __func__);
++ }
++
++ WARN_ON(!static_key_false(&mptcp_static_key));
++ /* Must be the last call, because is_meta_sk() above still needs the
++ * static key
++ */
++ static_key_slow_dec(&mptcp_static_key);
++}
++
++void mptcp_destroy_sock(struct sock *sk)
++{
++ if (is_meta_sk(sk)) {
++ struct sock *sk_it, *tmpsk;
++
++ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
++ mptcp_purge_ofo_queue(tcp_sk(sk));
++
++ /* We have to close all remaining subflows. Normally, they
++ * should all be about to get closed. But, if the kernel is
++ * forcing a closure (e.g., tcp_write_err), the subflows might
++ * not have been closed properly (as we are waiting for the
++ * DATA_ACK of the DATA_FIN).
++ */
++ mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
++ /* Already did call tcp_close - waiting for graceful
++ * closure, or if we are retransmitting fast-close on
++ * the subflow. The reset (or timeout) will kill the
++ * subflow..
++ */
++ if (tcp_sk(sk_it)->closing ||
++ tcp_sk(sk_it)->send_mp_fclose)
++ continue;
++
++ /* Allow the delayed work first to prevent time-wait state */
++ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
++ continue;
++
++ mptcp_sub_close(sk_it, 0);
++ }
++
++ mptcp_delete_synack_timer(sk);
++ } else {
++ mptcp_del_sock(sk);
++ }
++}
++
++static void mptcp_set_state(struct sock *sk)
++{
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++
++ /* Meta is not yet established - wake up the application */
++ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
++ sk->sk_state == TCP_ESTABLISHED) {
++ tcp_set_state(meta_sk, TCP_ESTABLISHED);
++
++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
++ meta_sk->sk_state_change(meta_sk);
++ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
++ }
++ }
++
++ if (sk->sk_state == TCP_ESTABLISHED) {
++ tcp_sk(sk)->mptcp->establish_increased = 1;
++ tcp_sk(sk)->mpcb->cnt_established++;
++ }
++}
++
++void mptcp_init_congestion_control(struct sock *sk)
++{
++ struct inet_connection_sock *icsk = inet_csk(sk);
++ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk));
++ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops;
++
++	/* The application didn't set the congestion control to use -
++	 * fallback to the default one.
++	 */
++ if (ca == &tcp_init_congestion_ops)
++ goto use_default;
++
++ /* Use the same congestion control as set by the user. If the
++ * module is not available fallback to the default one.
++ */
++ if (!try_module_get(ca->owner)) {
++ pr_warn("%s: fallback to the system default CC\n", __func__);
++ goto use_default;
++ }
++
++ icsk->icsk_ca_ops = ca;
++ if (icsk->icsk_ca_ops->init)
++ icsk->icsk_ca_ops->init(sk);
++
++ return;
++
++use_default:
++ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
++ tcp_init_congestion_control(sk);
++}
++
++u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
++u32 mptcp_seed = 0;
++
++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
++{
++ u32 workspace[SHA_WORKSPACE_WORDS];
++ u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
++ u8 input[64];
++ int i;
++
++ memset(workspace, 0, sizeof(workspace));
++
++ /* Initialize input with appropriate padding */
++ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
++ * is explicitly set too
++ */
++ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
++ input[8] = 0x80; /* Padding: First bit after message = 1 */
++ input[63] = 0x40; /* Padding: Length of the message = 64 bits */
++
++ sha_init(mptcp_hashed_key);
++ sha_transform(mptcp_hashed_key, input, workspace);
++
++ for (i = 0; i < 5; i++)
++ mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
++
++ if (token)
++ *token = mptcp_hashed_key[0];
++ if (idsn)
++ *idsn = *((u64 *)&mptcp_hashed_key[3]);
++}
++
++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
++ u32 *hash_out)
++{
++ u32 workspace[SHA_WORKSPACE_WORDS];
++ u8 input[128]; /* 2 512-bit blocks */
++ int i;
++
++ memset(workspace, 0, sizeof(workspace));
++
++ /* Generate key xored with ipad */
++ memset(input, 0x36, 64);
++ for (i = 0; i < 8; i++)
++ input[i] ^= key_1[i];
++ for (i = 0; i < 8; i++)
++ input[i + 8] ^= key_2[i];
++
++ memcpy(&input[64], rand_1, 4);
++ memcpy(&input[68], rand_2, 4);
++ input[72] = 0x80; /* Padding: First bit after message = 1 */
++ memset(&input[73], 0, 53);
++
++ /* Padding: Length of the message = 512 + 64 bits */
++ input[126] = 0x02;
++ input[127] = 0x40;
++
++ sha_init(hash_out);
++ sha_transform(hash_out, input, workspace);
++ memset(workspace, 0, sizeof(workspace));
++
++ sha_transform(hash_out, &input[64], workspace);
++ memset(workspace, 0, sizeof(workspace));
++
++ for (i = 0; i < 5; i++)
++ hash_out[i] = cpu_to_be32(hash_out[i]);
++
++ /* Prepare second part of hmac */
++ memset(input, 0x5C, 64);
++ for (i = 0; i < 8; i++)
++ input[i] ^= key_1[i];
++ for (i = 0; i < 8; i++)
++ input[i + 8] ^= key_2[i];
++
++ memcpy(&input[64], hash_out, 20);
++ input[84] = 0x80;
++ memset(&input[85], 0, 41);
++
++ /* Padding: Length of the message = 512 + 160 bits */
++ input[126] = 0x02;
++ input[127] = 0xA0;
++
++ sha_init(hash_out);
++ sha_transform(hash_out, input, workspace);
++ memset(workspace, 0, sizeof(workspace));
++
++ sha_transform(hash_out, &input[64], workspace);
++
++ for (i = 0; i < 5; i++)
++ hash_out[i] = cpu_to_be32(hash_out[i]);
++}
++
++static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
++{
++ /* Socket-options handled by sk_clone_lock while creating the meta-sk.
++ * ======
++ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
++ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
++ * TCP_NODELAY, TCP_CORK
++ *
++ * Socket-options handled in this function here
++ * ======
++ * TCP_DEFER_ACCEPT
++ * SO_KEEPALIVE
++ *
++ * Socket-options on the todo-list
++ * ======
++ * SO_BINDTODEVICE - should probably prevent creation of new subsocks
++ * across other devices. - what about the api-draft?
++ * SO_DEBUG
++ * SO_REUSEADDR - probably we don't care about this
++ * SO_DONTROUTE, SO_BROADCAST
++ * SO_OOBINLINE
++ * SO_LINGER
++ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
++ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
++ * SO_RXQ_OVFL
++ * TCP_COOKIE_TRANSACTIONS
++ * TCP_MAXSEG
++ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this
++ * in mptcp_retransmit_timer. AND we need to check what is
++ * about the subsockets.
++ * TCP_LINGER2
++ * TCP_WINDOW_CLAMP
++ * TCP_USER_TIMEOUT
++ * TCP_MD5SIG
++ *
++ * Socket-options of no concern for the meta-socket (but for the subsocket)
++ * ======
++ * SO_PRIORITY
++ * SO_MARK
++ * TCP_CONGESTION
++ * TCP_SYNCNT
++ * TCP_QUICKACK
++ */
++
++ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */
++ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
++
++ /* Keepalives are handled entirely at the MPTCP-layer */
++ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
++ inet_csk_reset_keepalive_timer(meta_sk,
++ keepalive_time_when(tcp_sk(meta_sk)));
++ sock_reset_flag(master_sk, SOCK_KEEPOPEN);
++ inet_csk_delete_keepalive_timer(master_sk);
++ }
++
++ /* Do not propagate subflow-errors up to the MPTCP-layer */
++ inet_sk(master_sk)->recverr = 0;
++}
++
++static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk)
++{
++ /* IP_TOS also goes to the subflow. */
++ if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) {
++ inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
++ sub_sk->sk_priority = meta_sk->sk_priority;
++ sk_dst_reset(sub_sk);
++ }
++
++ /* Inherit SO_REUSEADDR */
++ sub_sk->sk_reuse = meta_sk->sk_reuse;
++
++ /* Inherit snd/rcv-buffer locks */
++ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
++
++ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */
++ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
++
++ /* Keepalives are handled entirely at the MPTCP-layer */
++ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) {
++ sock_reset_flag(sub_sk, SOCK_KEEPOPEN);
++ inet_csk_delete_keepalive_timer(sub_sk);
++ }
++
++ /* Do not propagate subflow-errors up to the MPTCP-layer */
++ inet_sk(sub_sk)->recverr = 0;
++}
++
++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
++{
++	/* skb->sk may be NULL if we receive a packet immediately after the
++	 * SYN/ACK + MP_CAPABLE.
++	 */
++ struct sock *sk = skb->sk ? skb->sk : meta_sk;
++ int ret = 0;
++
++ skb->sk = NULL;
++
++ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
++ kfree_skb(skb);
++ return 0;
++ }
++
++ if (sk->sk_family == AF_INET)
++ ret = tcp_v4_do_rcv(sk, skb);
++#if IS_ENABLED(CONFIG_IPV6)
++ else
++ ret = tcp_v6_do_rcv(sk, skb);
++#endif
++
++ sock_put(sk);
++ return ret;
++}
++
++struct lock_class_key meta_key;
++struct lock_class_key meta_slock_key;
++
++static void mptcp_synack_timer_handler(unsigned long data)
++{
++ struct sock *meta_sk = (struct sock *) data;
++ struct listen_sock *lopt = inet_csk(meta_sk)->icsk_accept_queue.listen_opt;
++
++ /* Only process if socket is not in use. */
++ bh_lock_sock(meta_sk);
++
++ if (sock_owned_by_user(meta_sk)) {
++ /* Try again later. */
++ mptcp_reset_synack_timer(meta_sk, HZ/20);
++ goto out;
++ }
++
++ /* May happen if the queue got destructed in mptcp_close */
++ if (!lopt)
++ goto out;
++
++ inet_csk_reqsk_queue_prune(meta_sk, TCP_SYNQ_INTERVAL,
++ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
++
++ if (lopt->qlen)
++ mptcp_reset_synack_timer(meta_sk, TCP_SYNQ_INTERVAL);
++
++out:
++ bh_unlock_sock(meta_sk);
++ sock_put(meta_sk);
++}
++
++static const struct tcp_sock_ops mptcp_meta_specific = {
++ .__select_window = __mptcp_select_window,
++ .select_window = mptcp_select_window,
++ .select_initial_window = mptcp_select_initial_window,
++ .init_buffer_space = mptcp_init_buffer_space,
++ .set_rto = mptcp_tcp_set_rto,
++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
++ .init_congestion_control = mptcp_init_congestion_control,
++ .send_fin = mptcp_send_fin,
++ .write_xmit = mptcp_write_xmit,
++ .send_active_reset = mptcp_send_active_reset,
++ .write_wakeup = mptcp_write_wakeup,
++ .prune_ofo_queue = mptcp_prune_ofo_queue,
++ .retransmit_timer = mptcp_retransmit_timer,
++ .time_wait = mptcp_time_wait,
++ .cleanup_rbuf = mptcp_cleanup_rbuf,
++};
++
++static const struct tcp_sock_ops mptcp_sub_specific = {
++ .__select_window = __mptcp_select_window,
++ .select_window = mptcp_select_window,
++ .select_initial_window = mptcp_select_initial_window,
++ .init_buffer_space = mptcp_init_buffer_space,
++ .set_rto = mptcp_tcp_set_rto,
++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
++ .init_congestion_control = mptcp_init_congestion_control,
++ .send_fin = tcp_send_fin,
++ .write_xmit = tcp_write_xmit,
++ .send_active_reset = tcp_send_active_reset,
++ .write_wakeup = tcp_write_wakeup,
++ .prune_ofo_queue = tcp_prune_ofo_queue,
++ .retransmit_timer = tcp_retransmit_timer,
++ .time_wait = tcp_time_wait,
++ .cleanup_rbuf = tcp_cleanup_rbuf,
++};
++
++static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window)
++{
++ struct mptcp_cb *mpcb;
++ struct sock *master_sk;
++ struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk);
++ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
++ u64 idsn;
++
++ dst_release(meta_sk->sk_rx_dst);
++ meta_sk->sk_rx_dst = NULL;
++ /* This flag is set to announce sock_lock_init to
++ * reclassify the lock-class of the master socket.
++ */
++ meta_tp->is_master_sk = 1;
++ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO);
++ meta_tp->is_master_sk = 0;
++ if (!master_sk)
++ return -ENOBUFS;
++
++ master_tp = tcp_sk(master_sk);
++ master_icsk = inet_csk(master_sk);
++
++ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
++ if (!mpcb) {
++		/* sk_free (and __sk_free) requires wmem_alloc to be 1.
++		 * All the rest is set to 0 thanks to __GFP_ZERO above.
++		 */
++ atomic_set(&master_sk->sk_wmem_alloc, 1);
++ sk_free(master_sk);
++ return -ENOBUFS;
++ }
++
++#if IS_ENABLED(CONFIG_IPV6)
++ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) {
++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
++
++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
++
++ newnp = inet6_sk(master_sk);
++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
++
++ newnp->ipv6_mc_list = NULL;
++ newnp->ipv6_ac_list = NULL;
++ newnp->ipv6_fl_list = NULL;
++ newnp->opt = NULL;
++ newnp->pktoptions = NULL;
++ (void)xchg(&newnp->rxpmtu, NULL);
++ } else if (meta_sk->sk_family == AF_INET6) {
++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
++
++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
++
++ newnp = inet6_sk(master_sk);
++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
++
++ newnp->hop_limit = -1;
++ newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
++ newnp->mc_loop = 1;
++ newnp->pmtudisc = IPV6_PMTUDISC_WANT;
++ newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only;
++ }
++#endif
++
++ meta_tp->mptcp = NULL;
++
++ /* Store the keys and generate the peer's token */
++ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
++ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
++
++ /* Generate Initial data-sequence-numbers */
++ mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
++ idsn = ntohll(idsn) + 1;
++ mpcb->snd_high_order[0] = idsn >> 32;
++ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
++
++ meta_tp->write_seq = (u32)idsn;
++ meta_tp->snd_sml = meta_tp->write_seq;
++ meta_tp->snd_una = meta_tp->write_seq;
++ meta_tp->snd_nxt = meta_tp->write_seq;
++ meta_tp->pushed_seq = meta_tp->write_seq;
++ meta_tp->snd_up = meta_tp->write_seq;
++
++ mpcb->mptcp_rem_key = remote_key;
++ mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
++ idsn = ntohll(idsn) + 1;
++ mpcb->rcv_high_order[0] = idsn >> 32;
++ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
++ meta_tp->copied_seq = (u32) idsn;
++ meta_tp->rcv_nxt = (u32) idsn;
++ meta_tp->rcv_wup = (u32) idsn;
++
++ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
++ meta_tp->snd_wnd = window;
++ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
++
++ meta_tp->packets_out = 0;
++ meta_icsk->icsk_probes_out = 0;
++
++ /* Set mptcp-pointers */
++ master_tp->mpcb = mpcb;
++ master_tp->meta_sk = meta_sk;
++ meta_tp->mpcb = mpcb;
++ meta_tp->meta_sk = meta_sk;
++ mpcb->meta_sk = meta_sk;
++ mpcb->master_sk = master_sk;
++
++ meta_tp->was_meta_sk = 0;
++
++ /* Initialize the queues */
++ skb_queue_head_init(&mpcb->reinject_queue);
++ skb_queue_head_init(&master_tp->out_of_order_queue);
++ tcp_prequeue_init(master_tp);
++ INIT_LIST_HEAD(&master_tp->tsq_node);
++
++ master_tp->tsq_flags = 0;
++
++ mutex_init(&mpcb->mpcb_mutex);
++
++ /* Init the accept_queue structure, we support a queue of 32 pending
++ * connections, it does not need to be huge, since we only store here
++ * pending subflow creations.
++ */
++ if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) {
++ inet_put_port(master_sk);
++ kmem_cache_free(mptcp_cb_cache, mpcb);
++ sk_free(master_sk);
++ return -ENOMEM;
++ }
++
++ /* Redefine function-pointers as the meta-sk is now fully ready */
++ static_key_slow_inc(&mptcp_static_key);
++ meta_tp->mpc = 1;
++ meta_tp->ops = &mptcp_meta_specific;
++
++ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
++ meta_sk->sk_destruct = mptcp_sock_destruct;
++
++ /* Meta-level retransmit timer */
++ meta_icsk->icsk_rto *= 2; /* Double of initial - rto */
++
++ tcp_init_xmit_timers(master_sk);
++ /* Has been set for sending out the SYN */
++ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
++
++ if (!meta_tp->inside_tk_table) {
++ /* Adding the meta_tp in the token hashtable - coming from server-side */
++ rcu_read_lock();
++ spin_lock(&mptcp_tk_hashlock);
++
++ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
++
++ spin_unlock(&mptcp_tk_hashlock);
++ rcu_read_unlock();
++ }
++ master_tp->inside_tk_table = 0;
++
++ /* Init time-wait stuff */
++ INIT_LIST_HEAD(&mpcb->tw_list);
++ spin_lock_init(&mpcb->tw_lock);
++
++ INIT_HLIST_HEAD(&mpcb->callback_list);
++
++ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
++
++ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
++ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
++ mpcb->orig_window_clamp = meta_tp->window_clamp;
++
++ /* The meta is directly linked - set refcnt to 1 */
++ atomic_set(&mpcb->mpcb_refcnt, 1);
++
++ mptcp_init_path_manager(mpcb);
++ mptcp_init_scheduler(mpcb);
++
++ setup_timer(&mpcb->synack_timer, mptcp_synack_timer_handler,
++ (unsigned long)meta_sk);
++
++ mptcp_debug("%s: created mpcb with token %#x\n",
++ __func__, mpcb->mptcp_loc_token);
++
++ return 0;
++}
++
++void mptcp_fallback_meta_sk(struct sock *meta_sk)
++{
++ kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt);
++ kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
++}
++
++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
++ gfp_t flags)
++{
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
++ if (!tp->mptcp)
++ return -ENOMEM;
++
++ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
++ /* No more space for more subflows? */
++ if (!tp->mptcp->path_index) {
++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
++ return -EPERM;
++ }
++
++ INIT_HLIST_NODE(&tp->mptcp->cb_list);
++
++ tp->mptcp->tp = tp;
++ tp->mpcb = mpcb;
++ tp->meta_sk = meta_sk;
++
++ static_key_slow_inc(&mptcp_static_key);
++ tp->mpc = 1;
++ tp->ops = &mptcp_sub_specific;
++
++ tp->mptcp->loc_id = loc_id;
++ tp->mptcp->rem_id = rem_id;
++ if (mpcb->sched_ops->init)
++ mpcb->sched_ops->init(sk);
++
++ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
++ * included in mptcp_del_sock(), because the mpcb must remain alive
++ * until the last subsocket is completely destroyed.
++ */
++ sock_hold(meta_sk);
++ atomic_inc(&mpcb->mpcb_refcnt);
++
++ tp->mptcp->next = mpcb->connection_list;
++ mpcb->connection_list = tp;
++ tp->mptcp->attached = 1;
++
++ mpcb->cnt_subflows++;
++ atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
++ &meta_sk->sk_rmem_alloc);
++
++ mptcp_sub_inherit_sockopts(meta_sk, sk);
++ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
++
++ /* As we successfully allocated the mptcp_tcp_sock, we have to
++ * change the function-pointers here (for sk_destruct to work correctly)
++ */
++ sk->sk_error_report = mptcp_sock_def_error_report;
++ sk->sk_data_ready = mptcp_data_ready;
++ sk->sk_write_space = mptcp_write_space;
++ sk->sk_state_change = mptcp_set_state;
++ sk->sk_destruct = mptcp_sock_destruct;
++
++ if (sk->sk_family == AF_INET)
++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
++ __func__ , mpcb->mptcp_loc_token,
++ tp->mptcp->path_index,
++ &((struct inet_sock *)tp)->inet_saddr,
++ ntohs(((struct inet_sock *)tp)->inet_sport),
++ &((struct inet_sock *)tp)->inet_daddr,
++ ntohs(((struct inet_sock *)tp)->inet_dport),
++ mpcb->cnt_subflows);
++#if IS_ENABLED(CONFIG_IPV6)
++ else
++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
++ __func__ , mpcb->mptcp_loc_token,
++ tp->mptcp->path_index, &inet6_sk(sk)->saddr,
++ ntohs(((struct inet_sock *)tp)->inet_sport),
++ &sk->sk_v6_daddr,
++ ntohs(((struct inet_sock *)tp)->inet_dport),
++ mpcb->cnt_subflows);
++#endif
++
++ return 0;
++}
++
++void mptcp_del_sock(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
++ struct mptcp_cb *mpcb;
++
++ if (!tp->mptcp || !tp->mptcp->attached)
++ return;
++
++ mpcb = tp->mpcb;
++ tp_prev = mpcb->connection_list;
++
++ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
++ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
++ sk->sk_state, is_meta_sk(sk));
++
++ if (tp_prev == tp) {
++ mpcb->connection_list = tp->mptcp->next;
++ } else {
++ for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
++ if (tp_prev->mptcp->next == tp) {
++ tp_prev->mptcp->next = tp->mptcp->next;
++ break;
++ }
++ }
++ }
++ mpcb->cnt_subflows--;
++ if (tp->mptcp->establish_increased)
++ mpcb->cnt_established--;
++
++ tp->mptcp->next = NULL;
++ tp->mptcp->attached = 0;
++ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
++
++ if (!skb_queue_empty(&sk->sk_write_queue))
++ mptcp_reinject_data(sk, 0);
++
++ if (is_master_tp(tp))
++ mpcb->master_sk = NULL;
++ else if (tp->mptcp->pre_established)
++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
++
++ rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
++}
++
++/* Updates the metasocket ULID/port data, based on the given sock.
++ * The argument sock must be the sock accessible to the application.
++ * In this function, we update the meta socket info, based on the changes
++ * in the application socket (bind, address allocation, ...)
++ */
++void mptcp_update_metasocket(struct sock *sk, const struct sock *meta_sk)
++{
++ if (tcp_sk(sk)->mpcb->pm_ops->new_session)
++ tcp_sk(sk)->mpcb->pm_ops->new_session(meta_sk);
++
++ tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio;
++}
++
++/* Clean up the receive buffer for full frames taken by the user,
++ * then send an ACK if necessary. COPIED is the number of bytes
++ * tcp_recvmsg has given to the user so far, it speeds up the
++ * calculation of whether or not we must ACK for the sake of
++ * a window update.
++ */
++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct sock *sk;
++ __u32 rcv_window_now = 0;
++
++ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
++ rcv_window_now = tcp_receive_window(meta_tp);
++
++ if (2 * rcv_window_now > meta_tp->window_clamp)
++ rcv_window_now = 0;
++ }
++
++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++ const struct inet_connection_sock *icsk = inet_csk(sk);
++
++ if (!mptcp_sk_can_send_ack(sk))
++ continue;
++
++ if (!inet_csk_ack_scheduled(sk))
++ goto second_part;
++ /* Delayed ACKs frequently hit locked sockets during bulk
++ * receive.
++ */
++ if (icsk->icsk_ack.blocked ||
++ /* Once-per-two-segments ACK was not sent by tcp_input.c */
++ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
++ /* If this read emptied read buffer, we send ACK, if
++ * connection is not bidirectional, user drained
++ * receive buffer and there was a small segment
++ * in queue.
++ */
++ (copied > 0 &&
++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
++ !icsk->icsk_ack.pingpong)) &&
++ !atomic_read(&meta_sk->sk_rmem_alloc))) {
++ tcp_send_ack(sk);
++ continue;
++ }
++
++second_part:
++ /* This here is the second part of tcp_cleanup_rbuf */
++ if (rcv_window_now) {
++ __u32 new_window = tp->ops->__select_window(sk);
++
++ /* Send ACK now, if this read freed lots of space
++ * in our buffer. Certainly, new_window is new window.
++ * We can advertise it now, if it is not less than
++ * current one.
++ * "Lots" means "at least twice" here.
++ */
++ if (new_window && new_window >= 2 * rcv_window_now)
++ tcp_send_ack(sk);
++ }
++ }
++}
++
++static int mptcp_sub_send_fin(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sk_buff *skb = tcp_write_queue_tail(sk);
++ int mss_now;
++
++ /* Optimization, tack on the FIN if we have a queue of
++ * unsent frames. But be careful about outgoing SACKS
++ * and IP options.
++ */
++ mss_now = tcp_current_mss(sk);
++
++ if (tcp_send_head(sk) != NULL) {
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
++ TCP_SKB_CB(skb)->end_seq++;
++ tp->write_seq++;
++ } else {
++ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
++ if (!skb)
++ return 1;
++
++ /* Reserve space for headers and prepare control bits. */
++ skb_reserve(skb, MAX_TCP_HEADER);
++ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
++ tcp_init_nondata_skb(skb, tp->write_seq,
++ TCPHDR_ACK | TCPHDR_FIN);
++ tcp_queue_skb(sk, skb);
++ }
++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
++
++ return 0;
++}
++
++void mptcp_sub_close_wq(struct work_struct *work)
++{
++ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp;
++ struct sock *sk = (struct sock *)tp;
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++
++ mutex_lock(&tp->mpcb->mpcb_mutex);
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
++
++ if (sock_flag(sk, SOCK_DEAD))
++ goto exit;
++
++ /* We come from tcp_disconnect. We are sure that meta_sk is set */
++ if (!mptcp(tp)) {
++ tp->closing = 1;
++ sock_rps_reset_flow(sk);
++ tcp_close(sk, 0);
++ goto exit;
++ }
++
++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
++ tp->closing = 1;
++ sock_rps_reset_flow(sk);
++ tcp_close(sk, 0);
++ } else if (tcp_close_state(sk)) {
++ sk->sk_shutdown |= SEND_SHUTDOWN;
++ tcp_send_fin(sk);
++ }
++
++exit:
++ release_sock(meta_sk);
++ mutex_unlock(&tp->mpcb->mpcb_mutex);
++ sock_put(sk);
++}
++
++void mptcp_sub_close(struct sock *sk, unsigned long delay)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
++
++ /* We are already closing - e.g., call from sock_def_error_report upon
++ * tcp_disconnect in tcp_close.
++ */
++ if (tp->closing)
++ return;
++
++ /* Work already scheduled ? */
++ if (work_pending(&work->work)) {
++ /* Work present - who will be first ? */
++ if (jiffies + delay > work->timer.expires)
++ return;
++
++ /* Try canceling - if it fails, work will be executed soon */
++ if (!cancel_delayed_work(work))
++ return;
++ sock_put(sk);
++ }
++
++ if (!delay) {
++ unsigned char old_state = sk->sk_state;
++
++ /* If we are in user-context we can directly do the closing
++ * procedure. No need to schedule a work-queue.
++ */
++ if (!in_softirq()) {
++ if (sock_flag(sk, SOCK_DEAD))
++ return;
++
++ if (!mptcp(tp)) {
++ tp->closing = 1;
++ sock_rps_reset_flow(sk);
++ tcp_close(sk, 0);
++ return;
++ }
++
++ if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK ||
++ sk->sk_state == TCP_CLOSE) {
++ tp->closing = 1;
++ sock_rps_reset_flow(sk);
++ tcp_close(sk, 0);
++ } else if (tcp_close_state(sk)) {
++ sk->sk_shutdown |= SEND_SHUTDOWN;
++ tcp_send_fin(sk);
++ }
++
++ return;
++ }
++
++		/* We directly send the FIN, because it may take quite a long
++		 * time until the work-queue gets scheduled...
++		 *
++		 * If mptcp_sub_send_fin returns 1, it failed and thus we reset
++		 * the old state so that tcp_close will finally send the fin
++		 * in user-context.
++		 */
++ if (!sk->sk_err && old_state != TCP_CLOSE &&
++ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
++ if (old_state == TCP_ESTABLISHED)
++ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
++ sk->sk_state = old_state;
++ }
++ }
++
++ sock_hold(sk);
++ queue_delayed_work(mptcp_wq, work, delay);
++}
++
++void mptcp_sub_force_close(struct sock *sk)
++{
++ /* The below tcp_done may have freed the socket, if he is already dead.
++ * Thus, we are not allowed to access it afterwards. That's why
++ * we have to store the dead-state in this local variable.
++ */
++ int sock_is_dead = sock_flag(sk, SOCK_DEAD);
++
++ tcp_sk(sk)->mp_killed = 1;
++
++ if (sk->sk_state != TCP_CLOSE)
++ tcp_done(sk);
++
++ if (!sock_is_dead)
++ mptcp_sub_close(sk, 0);
++}
++EXPORT_SYMBOL(mptcp_sub_force_close);
++
++/* Update the mpcb send window, based on the contributions
++ * of each subflow
++ */
++void mptcp_update_sndbuf(const struct tcp_sock *tp)
++{
++ struct sock *meta_sk = tp->meta_sk, *sk;
++ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
++
++ mptcp_for_each_sk(tp->mpcb, sk) {
++ if (!mptcp_sk_can_send(sk))
++ continue;
++
++ new_sndbuf += sk->sk_sndbuf;
++
++ if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
++ new_sndbuf = sysctl_tcp_wmem[2];
++ break;
++ }
++ }
++ meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
++
++ /* The subflow's call to sk_write_space in tcp_new_space ends up in
++ * mptcp_write_space.
++ * It has nothing to do with waking up the application.
++ * So, we do it here.
++ */
++ if (old_sndbuf != meta_sk->sk_sndbuf)
++ meta_sk->sk_write_space(meta_sk);
++}
++
++void mptcp_close(struct sock *meta_sk, long timeout)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct sock *sk_it, *tmpsk;
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ struct sk_buff *skb;
++ int data_was_unread = 0;
++ int state;
++
++ mptcp_debug("%s: Close of meta_sk with tok %#x\n",
++ __func__, mpcb->mptcp_loc_token);
++
++ mutex_lock(&mpcb->mpcb_mutex);
++ lock_sock(meta_sk);
++
++ if (meta_tp->inside_tk_table) {
++ /* Detach the mpcb from the token hashtable */
++ mptcp_hash_remove_bh(meta_tp);
++ reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue);
++ }
++
++ meta_sk->sk_shutdown = SHUTDOWN_MASK;
++ /* We need to flush the recv. buffs. We do this only on the
++ * descriptor close, not protocol-sourced closes, because the
++ * reader process may not have drained the data yet!
++ */
++ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
++ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
++ tcp_hdr(skb)->fin;
++ data_was_unread += len;
++ __kfree_skb(skb);
++ }
++
++ sk_mem_reclaim(meta_sk);
++
++ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
++ if (meta_sk->sk_state == TCP_CLOSE) {
++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
++ if (tcp_sk(sk_it)->send_mp_fclose)
++ continue;
++ mptcp_sub_close(sk_it, 0);
++ }
++ goto adjudge_to_death;
++ }
++
++ if (data_was_unread) {
++ /* Unread data was tossed, zap the connection. */
++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
++ tcp_set_state(meta_sk, TCP_CLOSE);
++ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk,
++ meta_sk->sk_allocation);
++ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
++ /* Check zero linger _after_ checking for unread data. */
++ meta_sk->sk_prot->disconnect(meta_sk, 0);
++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
++ } else if (tcp_close_state(meta_sk)) {
++ mptcp_send_fin(meta_sk);
++ } else if (meta_tp->snd_una == meta_tp->write_seq) {
++ /* The DATA_FIN has been sent and acknowledged
++ * (e.g., by sk_shutdown). Close all the other subflows
++ */
++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
++ unsigned long delay = 0;
++ /* If we are the passive closer, don't trigger
++ * subflow-fin until the subflow has been finned
++ * by the peer. - thus we add a delay
++ */
++ if (mpcb->passive_close &&
++ sk_it->sk_state == TCP_ESTABLISHED)
++ delay = inet_csk(sk_it)->icsk_rto << 3;
++
++ mptcp_sub_close(sk_it, delay);
++ }
++ }
++
++ sk_stream_wait_close(meta_sk, timeout);
++
++adjudge_to_death:
++ state = meta_sk->sk_state;
++ sock_hold(meta_sk);
++ sock_orphan(meta_sk);
++
++ /* socket will be freed after mptcp_close - we have to prevent
++ * access from the subflows.
++ */
++ mptcp_for_each_sk(mpcb, sk_it) {
++ /* Similar to sock_orphan, but we don't set it DEAD, because
++ * the callbacks are still set and must be called.
++ */
++ write_lock_bh(&sk_it->sk_callback_lock);
++ sk_set_socket(sk_it, NULL);
++ sk_it->sk_wq = NULL;
++ write_unlock_bh(&sk_it->sk_callback_lock);
++ }
++
++ /* It is the last release_sock in its life. It will remove backlog. */
++ release_sock(meta_sk);
++
++ /* Now socket is owned by kernel and we acquire BH lock
++ * to finish close. No need to check for user refs.
++ */
++ local_bh_disable();
++ bh_lock_sock(meta_sk);
++ WARN_ON(sock_owned_by_user(meta_sk));
++
++ percpu_counter_inc(meta_sk->sk_prot->orphan_count);
++
++ /* Have we already been destroyed by a softirq or backlog? */
++ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
++ goto out;
++
++ /* This is a (useful) BSD violating of the RFC. There is a
++ * problem with TCP as specified in that the other end could
++ * keep a socket open forever with no application left this end.
++ * We use a 3 minute timeout (about the same as BSD) then kill
++ * our end. If they send after that then tough - BUT: long enough
++ * that we won't make the old 4*rto = almost no time - whoops
++ * reset mistake.
++ *
++ * Nope, it was not mistake. It is really desired behaviour
++ * f.e. on http servers, when such sockets are useless, but
++ * consume significant resources. Let's do it with special
++ * linger2 option. --ANK
++ */
++
++ if (meta_sk->sk_state == TCP_FIN_WAIT2) {
++ if (meta_tp->linger2 < 0) {
++ tcp_set_state(meta_sk, TCP_CLOSE);
++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
++ NET_INC_STATS_BH(sock_net(meta_sk),
++ LINUX_MIB_TCPABORTONLINGER);
++ } else {
++ const int tmo = tcp_fin_time(meta_sk);
++
++ if (tmo > TCP_TIMEWAIT_LEN) {
++ inet_csk_reset_keepalive_timer(meta_sk,
++ tmo - TCP_TIMEWAIT_LEN);
++ } else {
++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2,
++ tmo);
++ goto out;
++ }
++ }
++ }
++ if (meta_sk->sk_state != TCP_CLOSE) {
++ sk_mem_reclaim(meta_sk);
++ if (tcp_too_many_orphans(meta_sk, 0)) {
++ if (net_ratelimit())
++ pr_info("MPTCP: too many of orphaned sockets\n");
++ tcp_set_state(meta_sk, TCP_CLOSE);
++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
++ NET_INC_STATS_BH(sock_net(meta_sk),
++ LINUX_MIB_TCPABORTONMEMORY);
++ }
++ }
++
++
++ if (meta_sk->sk_state == TCP_CLOSE)
++ inet_csk_destroy_sock(meta_sk);
++ /* Otherwise, socket is reprieved until protocol close. */
++
++out:
++ bh_unlock_sock(meta_sk);
++ local_bh_enable();
++ mutex_unlock(&mpcb->mpcb_mutex);
++ sock_put(meta_sk); /* Taken by sock_hold */
++}
++
++void mptcp_disconnect(struct sock *sk)
++{
++ struct sock *subsk, *tmpsk;
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ mptcp_delete_synack_timer(sk);
++
++ __skb_queue_purge(&tp->mpcb->reinject_queue);
++
++ if (tp->inside_tk_table) {
++ mptcp_hash_remove_bh(tp);
++ reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
++ }
++
++ local_bh_disable();
++ mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
++ /* The socket will get removed from the subsocket-list
++ * and made non-mptcp by setting mpc to 0.
++ *
++ * This is necessary, because tcp_disconnect assumes
++ * that the connection is completely dead afterwards.
++ * Thus we need to do a mptcp_del_sock. Due to this call
++ * we have to make it non-mptcp.
++ *
++ * We have to lock the socket, because we set mpc to 0.
++ * An incoming packet would take the subsocket's lock
++ * and go on into the receive-path.
++ * This would be a race.
++ */
++
++ bh_lock_sock(subsk);
++ mptcp_del_sock(subsk);
++ tcp_sk(subsk)->mpc = 0;
++ tcp_sk(subsk)->ops = &tcp_specific;
++ mptcp_sub_force_close(subsk);
++ bh_unlock_sock(subsk);
++ }
++ local_bh_enable();
++
++ tp->was_meta_sk = 1;
++ tp->mpc = 0;
++ tp->ops = &tcp_specific;
++}
++
++
++/* Returns 1 if we should enable MPTCP for that socket. */
++int mptcp_doit(struct sock *sk)
++{
++ /* Do not allow MPTCP enabling if the MPTCP initialization failed */
++ if (mptcp_init_failed)
++ return 0;
++
++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
++ return 0;
++
++ /* Socket may already be established (e.g., called from tcp_recvmsg) */
++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->request_mptcp)
++ return 1;
++
++ /* Don't do mptcp over loopback */
++ if (sk->sk_family == AF_INET &&
++ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
++ ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
++ return 0;
++#if IS_ENABLED(CONFIG_IPV6)
++ if (sk->sk_family == AF_INET6 &&
++ (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
++ ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
++ return 0;
++#endif
++ if (mptcp_v6_is_v4_mapped(sk) &&
++ ipv4_is_loopback(inet_sk(sk)->inet_saddr))
++ return 0;
++
++#ifdef CONFIG_TCP_MD5SIG
++ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */
++ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
++ return 0;
++#endif
++
++ return 1;
++}
++
++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window)
++{
++ struct tcp_sock *master_tp;
++ struct sock *master_sk;
++
++ if (mptcp_alloc_mpcb(meta_sk, remote_key, window))
++ goto err_alloc_mpcb;
++
++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
++ master_tp = tcp_sk(master_sk);
++
++ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
++ goto err_add_sock;
++
++ if (__inet_inherit_port(meta_sk, master_sk) < 0)
++ goto err_add_sock;
++
++ meta_sk->sk_prot->unhash(meta_sk);
++
++ if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk))
++ __inet_hash_nolisten(master_sk, NULL);
++#if IS_ENABLED(CONFIG_IPV6)
++ else
++ __inet6_hash(master_sk, NULL);
++#endif
++
++ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
++
++ return 0;
++
++err_add_sock:
++ mptcp_fallback_meta_sk(meta_sk);
++
++ inet_csk_prepare_forced_close(master_sk);
++ tcp_done(master_sk);
++ inet_csk_prepare_forced_close(meta_sk);
++ tcp_done(meta_sk);
++
++err_alloc_mpcb:
++ return -ENOBUFS;
++}
++
++static int __mptcp_check_req_master(struct sock *child,
++ struct request_sock *req)
++{
++ struct tcp_sock *child_tp = tcp_sk(child);
++ struct sock *meta_sk = child;
++ struct mptcp_cb *mpcb;
++ struct mptcp_request_sock *mtreq;
++
++ /* Never contained an MP_CAPABLE */
++ if (!inet_rsk(req)->mptcp_rqsk)
++ return 1;
++
++ if (!inet_rsk(req)->saw_mpc) {
++ /* Fallback to regular TCP, because we saw one SYN without
++ * MP_CAPABLE. In tcp_check_req we continue the regular path.
++ * But, the socket has been added to the reqsk_tk_htb, so we
++ * must still remove it.
++ */
++ mptcp_reqsk_remove_tk(req);
++ return 1;
++ }
++
++ /* Just set this values to pass them to mptcp_alloc_mpcb */
++ mtreq = mptcp_rsk(req);
++ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
++ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
++
++ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
++ child_tp->snd_wnd))
++ return -ENOBUFS;
++
++ child = tcp_sk(child)->mpcb->master_sk;
++ child_tp = tcp_sk(child);
++ mpcb = child_tp->mpcb;
++
++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
++
++ mpcb->dss_csum = mtreq->dss_csum;
++ mpcb->server_side = 1;
++
++ /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */
++ mptcp_update_metasocket(child, meta_sk);
++
++ /* Needs to be done here additionally, because when accepting a
++ * new connection we pass by __reqsk_free and not reqsk_free.
++ */
++ mptcp_reqsk_remove_tk(req);
++
++ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
++ sock_put(meta_sk);
++
++ return 0;
++}
++
++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req)
++{
++ struct sock *meta_sk = child, *master_sk;
++ struct sk_buff *skb;
++ u32 new_mapping;
++ int ret;
++
++ ret = __mptcp_check_req_master(child, req);
++ if (ret)
++ return ret;
++
++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
++
++ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have
++ * pre-MPTCP data in the receive queue.
++ */
++ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt -
++ tcp_rsk(req)->rcv_isn - 1;
++
++ /* Map subflow sequence number to data sequence numbers. We need to map
++ * these data to [IDSN - len - 1, IDSN[.
++ */
++ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1;
++
++ /* There should be only one skb: the SYN + data. */
++ skb_queue_walk(&meta_sk->sk_receive_queue, skb) {
++ TCP_SKB_CB(skb)->seq += new_mapping;
++ TCP_SKB_CB(skb)->end_seq += new_mapping;
++ }
++
++ /* With fastopen we change the semantics of the relative subflow
++ * sequence numbers to deal with middleboxes that could add/remove
++ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1
++ * instead of the regular TCP ISN.
++ */
++ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1;
++
++ /* We need to update copied_seq of the master_sk to account for the
++ * already moved data to the meta receive queue.
++ */
++ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt;
++
++ /* Handled by the master_sk */
++ tcp_sk(meta_sk)->fastopen_rsk = NULL;
++
++ return 0;
++}
++
++int mptcp_check_req_master(struct sock *sk, struct sock *child,
++ struct request_sock *req,
++ struct request_sock **prev)
++{
++ struct sock *meta_sk = child;
++ int ret;
++
++ ret = __mptcp_check_req_master(child, req);
++ if (ret)
++ return ret;
++
++ inet_csk_reqsk_queue_unlink(sk, req, prev);
++ inet_csk_reqsk_queue_removed(sk, req);
++ inet_csk_reqsk_queue_add(sk, req, meta_sk);
++
++ return 0;
++}
++
++struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child,
++ struct request_sock *req,
++ struct request_sock **prev,
++ const struct mptcp_options_received *mopt)
++{
++ struct tcp_sock *child_tp = tcp_sk(child);
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ u8 hash_mac_check[20];
++
++ child_tp->inside_tk_table = 0;
++
++ if (!mopt->join_ack)
++ goto teardown;
++
++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
++ (u8 *)&mpcb->mptcp_loc_key,
++ (u8 *)&mtreq->mptcp_rem_nonce,
++ (u8 *)&mtreq->mptcp_loc_nonce,
++ (u32 *)hash_mac_check);
++
++ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20))
++ goto teardown;
++
++ /* Point it to the same struct socket and wq as the meta_sk */
++ sk_set_socket(child, meta_sk->sk_socket);
++ child->sk_wq = meta_sk->sk_wq;
++
++ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
++ /* Has been inherited, but now child_tp->mptcp is NULL */
++ child_tp->mpc = 0;
++ child_tp->ops = &tcp_specific;
++
++ /* TODO when we support acking the third ack for new subflows,
++ * we should silently discard this third ack, by returning NULL.
++ *
++ * Maybe, at the retransmission we will have enough memory to
++ * fully add the socket to the meta-sk.
++ */
++ goto teardown;
++ }
++
++ /* The child is a clone of the meta socket, we must now reset
++ * some of the fields
++ */
++ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio;
++
++ /* We should allow proper increase of the snd/rcv-buffers. Thus, we
++ * use the original values instead of the bloated up ones from the
++ * clone.
++ */
++ child->sk_sndbuf = mpcb->orig_sk_sndbuf;
++ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
++
++ child_tp->mptcp->slave_sk = 1;
++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
++ child_tp->mptcp->init_rcv_wnd = req->rcv_wnd;
++
++ child_tp->tsq_flags = 0;
++
++ /* Subflows do not use the accept queue, as they
++ * are attached immediately to the mpcb.
++ */
++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
++ reqsk_free(req);
++ return child;
++
++teardown:
++ /* Drop this request - sock creation failed. */
++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
++ reqsk_free(req);
++ inet_csk_prepare_forced_close(child);
++ tcp_done(child);
++ return meta_sk;
++}
++
++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw)
++{
++ struct mptcp_tw *mptw;
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct mptcp_cb *mpcb = tp->mpcb;
++
++ /* A subsocket in tw can only receive data. So, if we are in
++ * infinite-receive, then we should not reply with a data-ack or act
++ * upon general MPTCP-signaling. We prevent this by simply not creating
++ * the mptcp_tw_sock.
++ */
++ if (mpcb->infinite_mapping_rcv) {
++ tw->mptcp_tw = NULL;
++ return 0;
++ }
++
++ /* Alloc MPTCP-tw-sock */
++ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
++ if (!mptw)
++ return -ENOBUFS;
++
++ atomic_inc(&mpcb->mpcb_refcnt);
++
++ tw->mptcp_tw = mptw;
++ mptw->loc_key = mpcb->mptcp_loc_key;
++ mptw->meta_tw = mpcb->in_time_wait;
++ if (mptw->meta_tw) {
++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
++ if (mpcb->mptw_state != TCP_TIME_WAIT)
++ mptw->rcv_nxt++;
++ }
++ rcu_assign_pointer(mptw->mpcb, mpcb);
++
++ spin_lock(&mpcb->tw_lock);
++ list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
++ mptw->in_list = 1;
++ spin_unlock(&mpcb->tw_lock);
++
++ return 0;
++}
++
++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
++{
++ struct mptcp_cb *mpcb;
++
++ rcu_read_lock();
++ mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
++
++ /* If we are still holding a ref to the mpcb, we have to remove ourself
++ * from the list and drop the ref properly.
++ */
++ if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) {
++ spin_lock(&mpcb->tw_lock);
++ if (tw->mptcp_tw->in_list) {
++ list_del_rcu(&tw->mptcp_tw->list);
++ tw->mptcp_tw->in_list = 0;
++ }
++ spin_unlock(&mpcb->tw_lock);
++
++ /* Twice, because we increased it above */
++ mptcp_mpcb_put(mpcb);
++ mptcp_mpcb_put(mpcb);
++ }
++
++ rcu_read_unlock();
++
++ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
++}
++
++/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
++ * data-fin.
++ */
++void mptcp_time_wait(struct sock *sk, int state, int timeo)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct mptcp_tw *mptw;
++
++ /* Used for sockets that go into tw after the meta
++ * (see mptcp_init_tw_sock())
++ */
++ tp->mpcb->in_time_wait = 1;
++ tp->mpcb->mptw_state = state;
++
++ /* Update the time-wait-sock's information */
++ rcu_read_lock_bh();
++ list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) {
++ mptw->meta_tw = 1;
++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp);
++
++ /* We want to ack a DATA_FIN, but are yet in FIN_WAIT_2 -
++ * pretend as if the DATA_FIN has already reached us, that way
++ * the checks in tcp_timewait_state_process will be good as the
++ * DATA_FIN comes in.
++ */
++ if (state != TCP_TIME_WAIT)
++ mptw->rcv_nxt++;
++ }
++ rcu_read_unlock_bh();
++
++ tcp_done(sk);
++}
++
++void mptcp_tsq_flags(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++
++ /* It will be handled as a regular deferred-call */
++ if (is_meta_sk(sk))
++ return;
++
++ if (hlist_unhashed(&tp->mptcp->cb_list)) {
++ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
++ /* We need to hold it here, as the sock_hold is not assured
++ * by the release_sock as it is done in regular TCP.
++ *
++ * The subsocket may get inet_csk_destroy'd while it is inside
++ * the callback_list.
++ */
++ sock_hold(sk);
++ }
++
++ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags))
++ sock_hold(meta_sk);
++}
++
++void mptcp_tsq_sub_deferred(struct sock *meta_sk)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct mptcp_tcp_sock *mptcp;
++ struct hlist_node *tmp;
++
++ BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
++
++ __sock_put(meta_sk);
++ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
++ struct tcp_sock *tp = mptcp->tp;
++ struct sock *sk = (struct sock *)tp;
++
++ hlist_del_init(&mptcp->cb_list);
++ sk->sk_prot->release_cb(sk);
++ /* Final sock_put (cf. mptcp_tsq_flags) */
++ sock_put(sk);
++ }
++}
++
++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
++ struct sk_buff *skb)
++{
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
++ struct mptcp_options_received mopt;
++ u8 mptcp_hash_mac[20];
++
++ mptcp_init_mp_opt(&mopt);
++ tcp_parse_mptcp_options(skb, &mopt);
++
++ mtreq = mptcp_rsk(req);
++ mtreq->mptcp_mpcb = mpcb;
++ mtreq->is_sub = 1;
++ inet_rsk(req)->mptcp_rqsk = 1;
++
++ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
++
++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
++ (u8 *)&mpcb->mptcp_rem_key,
++ (u8 *)&mtreq->mptcp_loc_nonce,
++ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
++ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
++
++ mtreq->rem_id = mopt.rem_id;
++ mtreq->rcv_low_prio = mopt.low_prio;
++ inet_rsk(req)->saw_mpc = 1;
++}
++
++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb)
++{
++ struct mptcp_options_received mopt;
++ struct mptcp_request_sock *mreq = mptcp_rsk(req);
++
++ mptcp_init_mp_opt(&mopt);
++ tcp_parse_mptcp_options(skb, &mopt);
++
++ mreq->is_sub = 0;
++ inet_rsk(req)->mptcp_rqsk = 1;
++ mreq->dss_csum = mopt.dss_csum;
++ mreq->hash_entry.pprev = NULL;
++
++ mptcp_reqsk_new_mptcp(req, &mopt, skb);
++}
++
++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb)
++{
++ struct mptcp_options_received mopt;
++ const struct tcp_sock *tp = tcp_sk(sk);
++ __u32 isn = TCP_SKB_CB(skb)->when;
++ bool want_cookie = false;
++
++ if ((sysctl_tcp_syncookies == 2 ||
++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
++ want_cookie = tcp_syn_flood_action(sk, skb,
++ mptcp_request_sock_ops.slab_name);
++ if (!want_cookie)
++ goto drop;
++ }
++
++ mptcp_init_mp_opt(&mopt);
++ tcp_parse_mptcp_options(skb, &mopt);
++
++ if (mopt.is_mp_join)
++ return mptcp_do_join_short(skb, &mopt, sock_net(sk));
++ if (mopt.drop_me)
++ goto drop;
++
++ if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
++ mopt.saw_mpc = 0;
++
++ if (skb->protocol == htons(ETH_P_IP)) {
++ if (mopt.saw_mpc && !want_cookie) {
++ if (skb_rtable(skb)->rt_flags &
++ (RTCF_BROADCAST | RTCF_MULTICAST))
++ goto drop;
++
++ return tcp_conn_request(&mptcp_request_sock_ops,
++ &mptcp_request_sock_ipv4_ops,
++ sk, skb);
++ }
++
++ return tcp_v4_conn_request(sk, skb);
++#if IS_ENABLED(CONFIG_IPV6)
++ } else {
++ if (mopt.saw_mpc && !want_cookie) {
++ if (!ipv6_unicast_destination(skb))
++ goto drop;
++
++ return tcp_conn_request(&mptcp6_request_sock_ops,
++ &mptcp_request_sock_ipv6_ops,
++ sk, skb);
++ }
++
++ return tcp_v6_conn_request(sk, skb);
++#endif
++ }
++drop:
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
++ return 0;
++}
++
++struct workqueue_struct *mptcp_wq;
++EXPORT_SYMBOL(mptcp_wq);
++
++/* Output /proc/net/mptcp */
++static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
++{
++ struct tcp_sock *meta_tp;
++ const struct net *net = seq->private;
++ int i, n = 0;
++
++ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode");
++ seq_putc(seq, '\n');
++
++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
++ struct hlist_nulls_node *node;
++ rcu_read_lock_bh();
++ hlist_nulls_for_each_entry_rcu(meta_tp, node,
++ &tk_hashtable[i], tk_table) {
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ struct sock *meta_sk = (struct sock *)meta_tp;
++ struct inet_sock *isk = inet_sk(meta_sk);
++
++ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk)))
++ continue;
++
++ if (capable(CAP_NET_ADMIN)) {
++ seq_printf(seq, "%4d: %04X %04X ", n++,
++ mpcb->mptcp_loc_token,
++ mpcb->mptcp_rem_token);
++ } else {
++ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1);
++ }
++ if (meta_sk->sk_family == AF_INET ||
++ mptcp_v6_is_v4_mapped(meta_sk)) {
++ seq_printf(seq, " 0 %08X:%04X %08X:%04X ",
++ isk->inet_rcv_saddr,
++ ntohs(isk->inet_sport),
++ isk->inet_daddr,
++ ntohs(isk->inet_dport));
++#if IS_ENABLED(CONFIG_IPV6)
++ } else if (meta_sk->sk_family == AF_INET6) {
++ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr;
++ struct in6_addr *dst = &meta_sk->sk_v6_daddr;
++ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
++ src->s6_addr32[0], src->s6_addr32[1],
++ src->s6_addr32[2], src->s6_addr32[3],
++ ntohs(isk->inet_sport),
++ dst->s6_addr32[0], dst->s6_addr32[1],
++ dst->s6_addr32[2], dst->s6_addr32[3],
++ ntohs(isk->inet_dport));
++#endif
++ }
++ seq_printf(seq, " %02X %02X %08X:%08X %lu",
++ meta_sk->sk_state, mpcb->cnt_subflows,
++ meta_tp->write_seq - meta_tp->snd_una,
++ max_t(int, meta_tp->rcv_nxt -
++ meta_tp->copied_seq, 0),
++ sock_i_ino(meta_sk));
++ seq_putc(seq, '\n');
++ }
++
++ rcu_read_unlock_bh();
++ }
++
++ return 0;
++}
++
++static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
++{
++ return single_open_net(inode, file, mptcp_pm_seq_show);
++}
++
++static const struct file_operations mptcp_pm_seq_fops = {
++ .owner = THIS_MODULE,
++ .open = mptcp_pm_seq_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = single_release_net,
++};
++
++static int mptcp_pm_init_net(struct net *net)
++{
++ if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops))
++ return -ENOMEM;
++
++ return 0;
++}
++
++static void mptcp_pm_exit_net(struct net *net)
++{
++ remove_proc_entry("mptcp", net->proc_net);
++}
++
++static struct pernet_operations mptcp_pm_proc_ops = {
++ .init = mptcp_pm_init_net,
++ .exit = mptcp_pm_exit_net,
++};
++
++/* General initialization of mptcp */
++void __init mptcp_init(void)
++{
++ int i;
++ struct ctl_table_header *mptcp_sysctl;
++
++ mptcp_sock_cache = kmem_cache_create("mptcp_sock",
++ sizeof(struct mptcp_tcp_sock),
++ 0, SLAB_HWCACHE_ALIGN,
++ NULL);
++ if (!mptcp_sock_cache)
++ goto mptcp_sock_cache_failed;
++
++ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
++ NULL);
++ if (!mptcp_cb_cache)
++ goto mptcp_cb_cache_failed;
++
++ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
++ NULL);
++ if (!mptcp_tw_cache)
++ goto mptcp_tw_cache_failed;
++
++ get_random_bytes(mptcp_secret, sizeof(mptcp_secret));
++
++ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
++ if (!mptcp_wq)
++ goto alloc_workqueue_failed;
++
++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
++ INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_htb[i],
++ i + MPTCP_REQSK_NULLS_BASE);
++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
++ }
++
++ spin_lock_init(&mptcp_reqsk_hlock);
++ spin_lock_init(&mptcp_tk_hashlock);
++
++ if (register_pernet_subsys(&mptcp_pm_proc_ops))
++ goto pernet_failed;
++
++#if IS_ENABLED(CONFIG_IPV6)
++ if (mptcp_pm_v6_init())
++ goto mptcp_pm_v6_failed;
++#endif
++ if (mptcp_pm_v4_init())
++ goto mptcp_pm_v4_failed;
++
++ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
++ if (!mptcp_sysctl)
++ goto register_sysctl_failed;
++
++ if (mptcp_register_path_manager(&mptcp_pm_default))
++ goto register_pm_failed;
++
++ if (mptcp_register_scheduler(&mptcp_sched_default))
++ goto register_sched_failed;
++
++ pr_info("MPTCP: Stable release v0.89.0-rc");
++
++ mptcp_init_failed = false;
++
++ return;
++
++register_sched_failed:
++ mptcp_unregister_path_manager(&mptcp_pm_default);
++register_pm_failed:
++ unregister_net_sysctl_table(mptcp_sysctl);
++register_sysctl_failed:
++ mptcp_pm_v4_undo();
++mptcp_pm_v4_failed:
++#if IS_ENABLED(CONFIG_IPV6)
++ mptcp_pm_v6_undo();
++mptcp_pm_v6_failed:
++#endif
++ unregister_pernet_subsys(&mptcp_pm_proc_ops);
++pernet_failed:
++ destroy_workqueue(mptcp_wq);
++alloc_workqueue_failed:
++ kmem_cache_destroy(mptcp_tw_cache);
++mptcp_tw_cache_failed:
++ kmem_cache_destroy(mptcp_cb_cache);
++mptcp_cb_cache_failed:
++ kmem_cache_destroy(mptcp_sock_cache);
++mptcp_sock_cache_failed:
++ mptcp_init_failed = true;
++}
+diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c
+new file mode 100644
+index 000000000000..3a54413ce25b
+--- /dev/null
++++ b/net/mptcp/mptcp_fullmesh.c
+@@ -0,0 +1,1722 @@
++#include <linux/module.h>
++
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++
++#if IS_ENABLED(CONFIG_IPV6)
++#include <net/mptcp_v6.h>
++#include <net/addrconf.h>
++#endif
++
++enum {
++ MPTCP_EVENT_ADD = 1,
++ MPTCP_EVENT_DEL,
++ MPTCP_EVENT_MOD,
++};
++
++#define MPTCP_SUBFLOW_RETRY_DELAY 1000
++
++/* Max number of local or remote addresses we can store.
++ * When changing, see the bitfield below in fullmesh_rem4/6.
++ */
++#define MPTCP_MAX_ADDR 8
++
++struct fullmesh_rem4 {
++ u8 rem4_id;
++ u8 bitfield;
++ u8 retry_bitfield;
++ __be16 port;
++ struct in_addr addr;
++};
++
++struct fullmesh_rem6 {
++ u8 rem6_id;
++ u8 bitfield;
++ u8 retry_bitfield;
++ __be16 port;
++ struct in6_addr addr;
++};
++
++struct mptcp_loc_addr {
++ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
++ u8 loc4_bits;
++ u8 next_v4_index;
++
++ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
++ u8 loc6_bits;
++ u8 next_v6_index;
++};
++
++struct mptcp_addr_event {
++ struct list_head list;
++ unsigned short family;
++ u8 code:7,
++ low_prio:1;
++ union inet_addr addr;
++};
++
++struct fullmesh_priv {
++ /* Worker struct for subflow establishment */
++ struct work_struct subflow_work;
++ /* Delayed worker, when the routing-tables are not yet ready. */
++ struct delayed_work subflow_retry_work;
++
++ /* Remote addresses */
++ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR];
++ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR];
++
++ struct mptcp_cb *mpcb;
++
++ u16 remove_addrs; /* Addresses to remove */
++ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */
++ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */
++
++ u8 add_addr; /* Are we sending an add_addr? */
++
++ u8 rem4_bits;
++ u8 rem6_bits;
++};
++
++struct mptcp_fm_ns {
++ struct mptcp_loc_addr __rcu *local;
++ spinlock_t local_lock; /* Protecting the above pointer */
++ struct list_head events;
++ struct delayed_work address_worker;
++
++ struct net *net;
++};
++
++static struct mptcp_pm_ops full_mesh __read_mostly;
++
++static void full_mesh_create_subflows(struct sock *meta_sk);
++
++static struct mptcp_fm_ns *fm_get_ns(const struct net *net)
++{
++ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
++}
++
++static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb)
++{
++ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
++}
++
++/* Find the first free index in the bitfield */
++static int __mptcp_find_free_index(u8 bitfield, u8 base)
++{
++ int i;
++
++ /* There are anyways no free bits... */
++ if (bitfield == 0xff)
++ goto exit;
++
++ i = ffs(~(bitfield >> base)) - 1;
++ if (i < 0)
++ goto exit;
++
++ /* No free bits when starting at base, try from 0 on */
++ if (i + base >= sizeof(bitfield) * 8)
++ return __mptcp_find_free_index(bitfield, 0);
++
++ return i + base;
++exit:
++ return -1;
++}
++
++static int mptcp_find_free_index(u8 bitfield)
++{
++ return __mptcp_find_free_index(bitfield, 0);
++}
++
++static void mptcp_addv4_raddr(struct mptcp_cb *mpcb,
++ const struct in_addr *addr,
++ __be16 port, u8 id)
++{
++ int i;
++ struct fullmesh_rem4 *rem4;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
++ rem4 = &fmp->remaddr4[i];
++
++ /* Address is already in the list --- continue */
++ if (rem4->rem4_id == id &&
++ rem4->addr.s_addr == addr->s_addr && rem4->port == port)
++ return;
++
++ /* This may be the case, when the peer is behind a NAT. He is
++ * trying to JOIN, thus sending the JOIN with a certain ID.
++ * However the src_addr of the IP-packet has been changed. We
++ * update the addr in the list, because this is the address as
++ * OUR BOX sees it.
++ */
++ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
++ /* update the address */
++ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
++ __func__, &rem4->addr.s_addr,
++ &addr->s_addr, id);
++ rem4->addr.s_addr = addr->s_addr;
++ rem4->port = port;
++ mpcb->list_rcvd = 1;
++ return;
++ }
++ }
++
++ i = mptcp_find_free_index(fmp->rem4_bits);
++ /* Do we have already the maximum number of local/remote addresses? */
++ if (i < 0) {
++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
++ __func__, MPTCP_MAX_ADDR, &addr->s_addr);
++ return;
++ }
++
++ rem4 = &fmp->remaddr4[i];
++
++ /* Address is not known yet, store it */
++ rem4->addr.s_addr = addr->s_addr;
++ rem4->port = port;
++ rem4->bitfield = 0;
++ rem4->retry_bitfield = 0;
++ rem4->rem4_id = id;
++ mpcb->list_rcvd = 1;
++ fmp->rem4_bits |= (1 << i);
++
++ return;
++}
++
++static void mptcp_addv6_raddr(struct mptcp_cb *mpcb,
++ const struct in6_addr *addr,
++ __be16 port, u8 id)
++{
++ int i;
++ struct fullmesh_rem6 *rem6;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
++ rem6 = &fmp->remaddr6[i];
++
++ /* Address is already in the list --- continue */
++ if (rem6->rem6_id == id &&
++ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
++ return;
++
++ /* This may be the case, when the peer is behind a NAT. He is
++ * trying to JOIN, thus sending the JOIN with a certain ID.
++ * However the src_addr of the IP-packet has been changed. We
++ * update the addr in the list, because this is the address as
++ * OUR BOX sees it.
++ */
++ if (rem6->rem6_id == id) {
++ /* update the address */
++ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
++ __func__, &rem6->addr, addr, id);
++ rem6->addr = *addr;
++ rem6->port = port;
++ mpcb->list_rcvd = 1;
++ return;
++ }
++ }
++
++ i = mptcp_find_free_index(fmp->rem6_bits);
++ /* Do we have already the maximum number of local/remote addresses? */
++ if (i < 0) {
++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
++ __func__, MPTCP_MAX_ADDR, addr);
++ return;
++ }
++
++ rem6 = &fmp->remaddr6[i];
++
++ /* Address is not known yet, store it */
++ rem6->addr = *addr;
++ rem6->port = port;
++ rem6->bitfield = 0;
++ rem6->retry_bitfield = 0;
++ rem6->rem6_id = id;
++ mpcb->list_rcvd = 1;
++ fmp->rem6_bits |= (1 << i);
++
++ return;
++}
++
++static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
++{
++ int i;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
++ if (fmp->remaddr4[i].rem4_id == id) {
++ /* remove address from bitfield */
++ fmp->rem4_bits &= ~(1 << i);
++
++ break;
++ }
++ }
++}
++
++static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id)
++{
++ int i;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
++ if (fmp->remaddr6[i].rem6_id == id) {
++ /* remove address from bitfield */
++ fmp->rem6_bits &= ~(1 << i);
++
++ break;
++ }
++ }
++}
++
++/* Sets the bitfield of the remote-address field */
++static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb,
++ const struct in_addr *addr, u8 index)
++{
++ int i;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
++ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) {
++ fmp->remaddr4[i].bitfield |= (1 << index);
++ return;
++ }
++ }
++}
++
++/* Sets the bitfield of the remote-address field */
++static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
++ const struct in6_addr *addr, u8 index)
++{
++ int i;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
++ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) {
++ fmp->remaddr6[i].bitfield |= (1 << index);
++ return;
++ }
++ }
++}
++
++static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb,
++ const union inet_addr *addr,
++ sa_family_t family, u8 id)
++{
++ if (family == AF_INET)
++ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id);
++ else
++ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id);
++}
++
++static void retry_subflow_worker(struct work_struct *work)
++{
++ struct delayed_work *delayed_work = container_of(work,
++ struct delayed_work,
++ work);
++ struct fullmesh_priv *fmp = container_of(delayed_work,
++ struct fullmesh_priv,
++ subflow_retry_work);
++ struct mptcp_cb *mpcb = fmp->mpcb;
++ struct sock *meta_sk = mpcb->meta_sk;
++ struct mptcp_loc_addr *mptcp_local;
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
++ int iter = 0, i;
++
++ /* We need a local (stable) copy of the address-list. Really, it is not
++ * such a big deal, if the address-list is not 100% up-to-date.
++ */
++ rcu_read_lock_bh();
++ mptcp_local = rcu_dereference_bh(fm_ns->local);
++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
++ rcu_read_unlock_bh();
++
++ if (!mptcp_local)
++ return;
++
++next_subflow:
++ if (iter) {
++ release_sock(meta_sk);
++ mutex_unlock(&mpcb->mpcb_mutex);
++
++ cond_resched();
++ }
++ mutex_lock(&mpcb->mpcb_mutex);
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
++
++ iter++;
++
++ if (sock_flag(meta_sk, SOCK_DEAD))
++ goto exit;
++
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
++ struct fullmesh_rem4 *rem = &fmp->remaddr4[i];
++ /* Do we need to retry establishing a subflow ? */
++ if (rem->retry_bitfield) {
++ int i = mptcp_find_free_index(~rem->retry_bitfield);
++ struct mptcp_rem4 rem4;
++
++ rem->bitfield |= (1 << i);
++ rem->retry_bitfield &= ~(1 << i);
++
++ rem4.addr = rem->addr;
++ rem4.port = rem->port;
++ rem4.rem4_id = rem->rem4_id;
++
++ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4);
++ goto next_subflow;
++ }
++ }
++
++#if IS_ENABLED(CONFIG_IPV6)
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
++ struct fullmesh_rem6 *rem = &fmp->remaddr6[i];
++
++ /* Do we need to retry establishing a subflow ? */
++ if (rem->retry_bitfield) {
++ int i = mptcp_find_free_index(~rem->retry_bitfield);
++ struct mptcp_rem6 rem6;
++
++ rem->bitfield |= (1 << i);
++ rem->retry_bitfield &= ~(1 << i);
++
++ rem6.addr = rem->addr;
++ rem6.port = rem->port;
++ rem6.rem6_id = rem->rem6_id;
++
++ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6);
++ goto next_subflow;
++ }
++ }
++#endif
++
++exit:
++ kfree(mptcp_local);
++ release_sock(meta_sk);
++ mutex_unlock(&mpcb->mpcb_mutex);
++ sock_put(meta_sk);
++}
++
++/**
++ * Create all new subflows, by doing calls to mptcp_initX_subsockets
++ *
++ * This function uses a goto next_subflow, to allow releasing the lock between
++ * new subflows and giving other processes a chance to do some work on the
++ * socket and potentially finishing the communication.
++ **/
++static void create_subflow_worker(struct work_struct *work)
++{
++ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv,
++ subflow_work);
++ struct mptcp_cb *mpcb = fmp->mpcb;
++ struct sock *meta_sk = mpcb->meta_sk;
++ struct mptcp_loc_addr *mptcp_local;
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
++ int iter = 0, retry = 0;
++ int i;
++
++ /* We need a local (stable) copy of the address-list. Really, it is not
++ * such a big deal, if the address-list is not 100% up-to-date.
++ */
++ rcu_read_lock_bh();
++ mptcp_local = rcu_dereference_bh(fm_ns->local);
++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
++ rcu_read_unlock_bh();
++
++ if (!mptcp_local)
++ return;
++
++next_subflow:
++ if (iter) {
++ release_sock(meta_sk);
++ mutex_unlock(&mpcb->mpcb_mutex);
++
++ cond_resched();
++ }
++ mutex_lock(&mpcb->mpcb_mutex);
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
++
++ iter++;
++
++ if (sock_flag(meta_sk, SOCK_DEAD))
++ goto exit;
++
++ if (mpcb->master_sk &&
++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
++ goto exit;
++
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
++ struct fullmesh_rem4 *rem;
++ u8 remaining_bits;
++
++ rem = &fmp->remaddr4[i];
++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits;
++
++ /* Are there still combinations to handle? */
++ if (remaining_bits) {
++ int i = mptcp_find_free_index(~remaining_bits);
++ struct mptcp_rem4 rem4;
++
++ rem->bitfield |= (1 << i);
++
++ rem4.addr = rem->addr;
++ rem4.port = rem->port;
++ rem4.rem4_id = rem->rem4_id;
++
++ /* If a route is not yet available then retry once */
++ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
++ &rem4) == -ENETUNREACH)
++ retry = rem->retry_bitfield |= (1 << i);
++ goto next_subflow;
++ }
++ }
++
++#if IS_ENABLED(CONFIG_IPV6)
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
++ struct fullmesh_rem6 *rem;
++ u8 remaining_bits;
++
++ rem = &fmp->remaddr6[i];
++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits;
++
++ /* Are there still combinations to handle? */
++ if (remaining_bits) {
++ int i = mptcp_find_free_index(~remaining_bits);
++ struct mptcp_rem6 rem6;
++
++ rem->bitfield |= (1 << i);
++
++ rem6.addr = rem->addr;
++ rem6.port = rem->port;
++ rem6.rem6_id = rem->rem6_id;
++
++ /* If a route is not yet available then retry once */
++ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
++ &rem6) == -ENETUNREACH)
++ retry = rem->retry_bitfield |= (1 << i);
++ goto next_subflow;
++ }
++ }
++#endif
++
++ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) {
++ sock_hold(meta_sk);
++ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work,
++ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
++ }
++
++exit:
++ kfree(mptcp_local);
++ release_sock(meta_sk);
++ mutex_unlock(&mpcb->mpcb_mutex);
++ sock_put(meta_sk);
++}
++
++static void announce_remove_addr(u8 addr_id, struct sock *meta_sk)
++{
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++ struct sock *sk = mptcp_select_ack_sock(meta_sk);
++
++ fmp->remove_addrs |= (1 << addr_id);
++ mpcb->addr_signal = 1;
++
++ if (sk)
++ tcp_send_ack(sk);
++}
++
++static void update_addr_bitfields(struct sock *meta_sk,
++ const struct mptcp_loc_addr *mptcp_local)
++{
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++ int i;
++
++ /* The bits in announced_addrs_* always match with loc*_bits. So, a
++ * simply & operation unsets the correct bits, because these go from
++ * announced to non-announced
++ */
++ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits;
++
++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
++ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
++ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
++ }
++
++ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits;
++
++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
++ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
++ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
++ }
++}
++
++static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local,
++ sa_family_t family, const union inet_addr *addr)
++{
++ int i;
++ u8 loc_bits;
++ bool found = false;
++
++ if (family == AF_INET)
++ loc_bits = mptcp_local->loc4_bits;
++ else
++ loc_bits = mptcp_local->loc6_bits;
++
++ mptcp_for_each_bit_set(loc_bits, i) {
++ if (family == AF_INET &&
++ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) {
++ found = true;
++ break;
++ }
++ if (family == AF_INET6 &&
++ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr,
++ &addr->in6)) {
++ found = true;
++ break;
++ }
++ }
++
++ if (!found)
++ return -1;
++
++ return i;
++}
++
++static void mptcp_address_worker(struct work_struct *work)
++{
++ const struct delayed_work *delayed_work = container_of(work,
++ struct delayed_work,
++ work);
++ struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
++ struct mptcp_fm_ns,
++ address_worker);
++ struct net *net = fm_ns->net;
++ struct mptcp_addr_event *event = NULL;
++ struct mptcp_loc_addr *mptcp_local, *old;
++ int i, id = -1; /* id is used in the socket-code on a delete-event */
++ bool success; /* Used to indicate if we succeeded handling the event */
++
++next_event:
++ success = false;
++ kfree(event);
++
++ /* First, let's dequeue an event from our event-list */
++ rcu_read_lock_bh();
++ spin_lock(&fm_ns->local_lock);
++
++ event = list_first_entry_or_null(&fm_ns->events,
++ struct mptcp_addr_event, list);
++ if (!event) {
++ spin_unlock(&fm_ns->local_lock);
++ rcu_read_unlock_bh();
++ return;
++ }
++
++ list_del(&event->list);
++
++ mptcp_local = rcu_dereference_bh(fm_ns->local);
++
++ if (event->code == MPTCP_EVENT_DEL) {
++ id = mptcp_find_address(mptcp_local, event->family, &event->addr);
++
++ /* Not in the list - so we don't care */
++ if (id < 0) {
++ mptcp_debug("%s could not find id\n", __func__);
++ goto duno;
++ }
++
++ old = mptcp_local;
++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
++ GFP_ATOMIC);
++ if (!mptcp_local)
++ goto duno;
++
++ if (event->family == AF_INET)
++ mptcp_local->loc4_bits &= ~(1 << id);
++ else
++ mptcp_local->loc6_bits &= ~(1 << id);
++
++ rcu_assign_pointer(fm_ns->local, mptcp_local);
++ kfree(old);
++ } else {
++ int i = mptcp_find_address(mptcp_local, event->family, &event->addr);
++ int j = i;
++
++ if (j < 0) {
++ /* Not in the list, so we have to find an empty slot */
++ if (event->family == AF_INET)
++ i = __mptcp_find_free_index(mptcp_local->loc4_bits,
++ mptcp_local->next_v4_index);
++ if (event->family == AF_INET6)
++ i = __mptcp_find_free_index(mptcp_local->loc6_bits,
++ mptcp_local->next_v6_index);
++
++ if (i < 0) {
++ mptcp_debug("%s no more space\n", __func__);
++ goto duno;
++ }
++
++ /* It might have been a MOD-event. */
++ event->code = MPTCP_EVENT_ADD;
++ } else {
++ /* Let's check if anything changes */
++ if (event->family == AF_INET &&
++ event->low_prio == mptcp_local->locaddr4[i].low_prio)
++ goto duno;
++
++ if (event->family == AF_INET6 &&
++ event->low_prio == mptcp_local->locaddr6[i].low_prio)
++ goto duno;
++ }
++
++ old = mptcp_local;
++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
++ GFP_ATOMIC);
++ if (!mptcp_local)
++ goto duno;
++
++ if (event->family == AF_INET) {
++ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
++ mptcp_local->locaddr4[i].loc4_id = i + 1;
++ mptcp_local->locaddr4[i].low_prio = event->low_prio;
++ } else {
++ mptcp_local->locaddr6[i].addr = event->addr.in6;
++ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
++ mptcp_local->locaddr6[i].low_prio = event->low_prio;
++ }
++
++ if (j < 0) {
++ if (event->family == AF_INET) {
++ mptcp_local->loc4_bits |= (1 << i);
++ mptcp_local->next_v4_index = i + 1;
++ } else {
++ mptcp_local->loc6_bits |= (1 << i);
++ mptcp_local->next_v6_index = i + 1;
++ }
++ }
++
++ rcu_assign_pointer(fm_ns->local, mptcp_local);
++ kfree(old);
++ }
++ success = true;
++
++duno:
++ spin_unlock(&fm_ns->local_lock);
++ rcu_read_unlock_bh();
++
++ if (!success)
++ goto next_event;
++
++ /* Now we iterate over the MPTCP-sockets and apply the event. */
++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
++ const struct hlist_nulls_node *node;
++ struct tcp_sock *meta_tp;
++
++ rcu_read_lock_bh();
++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
++ tk_table) {
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ struct sock *meta_sk = (struct sock *)meta_tp, *sk;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++ bool meta_v4 = meta_sk->sk_family == AF_INET;
++
++ if (sock_net(meta_sk) != net)
++ continue;
++
++ if (meta_v4) {
++ /* skip IPv6 events if meta is IPv4 */
++ if (event->family == AF_INET6)
++ continue;
++ }
++ /* skip IPv4 events if IPV6_V6ONLY is set */
++ else if (event->family == AF_INET &&
++ inet6_sk(meta_sk)->ipv6only)
++ continue;
++
++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
++ continue;
++
++ bh_lock_sock(meta_sk);
++
++ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) ||
++ mpcb->infinite_mapping_snd ||
++ mpcb->infinite_mapping_rcv ||
++ mpcb->send_infinite_mapping)
++ goto next;
++
++ /* May be that the pm has changed in-between */
++ if (mpcb->pm_ops != &full_mesh)
++ goto next;
++
++ if (sock_owned_by_user(meta_sk)) {
++ if (!test_and_set_bit(MPTCP_PATH_MANAGER,
++ &meta_tp->tsq_flags))
++ sock_hold(meta_sk);
++
++ goto next;
++ }
++
++ if (event->code == MPTCP_EVENT_ADD) {
++ fmp->add_addr++;
++ mpcb->addr_signal = 1;
++
++ sk = mptcp_select_ack_sock(meta_sk);
++ if (sk)
++ tcp_send_ack(sk);
++
++ full_mesh_create_subflows(meta_sk);
++ }
++
++ if (event->code == MPTCP_EVENT_DEL) {
++ struct sock *sk, *tmpsk;
++ struct mptcp_loc_addr *mptcp_local;
++ bool found = false;
++
++ mptcp_local = rcu_dereference_bh(fm_ns->local);
++
++ /* In any case, we need to update our bitfields */
++ if (id >= 0)
++ update_addr_bitfields(meta_sk, mptcp_local);
++
++ /* Look for the socket and remove him */
++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
++ if ((event->family == AF_INET6 &&
++ (sk->sk_family == AF_INET ||
++ mptcp_v6_is_v4_mapped(sk))) ||
++ (event->family == AF_INET &&
++ (sk->sk_family == AF_INET6 &&
++ !mptcp_v6_is_v4_mapped(sk))))
++ continue;
++
++ if (event->family == AF_INET &&
++ (sk->sk_family == AF_INET ||
++ mptcp_v6_is_v4_mapped(sk)) &&
++ inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
++ continue;
++
++ if (event->family == AF_INET6 &&
++ sk->sk_family == AF_INET6 &&
++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
++ continue;
++
++ /* Reinject, so that pf = 1 and so we
++ * won't select this one as the
++ * ack-sock.
++ */
++ mptcp_reinject_data(sk, 0);
++
++ /* We announce the removal of this id */
++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk);
++
++ mptcp_sub_force_close(sk);
++ found = true;
++ }
++
++ if (found)
++ goto next;
++
++ /* The id may have been given by the event,
++ * matching on a local address. And it may not
++ * have matched on one of the above sockets,
++ * because the client never created a subflow.
++ * So, we have to finally remove it here.
++ */
++ if (id > 0)
++ announce_remove_addr(id, meta_sk);
++ }
++
++ if (event->code == MPTCP_EVENT_MOD) {
++ struct sock *sk;
++
++ mptcp_for_each_sk(mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++ if (event->family == AF_INET &&
++ (sk->sk_family == AF_INET ||
++ mptcp_v6_is_v4_mapped(sk)) &&
++ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
++ if (event->low_prio != tp->mptcp->low_prio) {
++ tp->mptcp->send_mp_prio = 1;
++ tp->mptcp->low_prio = event->low_prio;
++
++ tcp_send_ack(sk);
++ }
++ }
++
++ if (event->family == AF_INET6 &&
++ sk->sk_family == AF_INET6 &&
++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
++ if (event->low_prio != tp->mptcp->low_prio) {
++ tp->mptcp->send_mp_prio = 1;
++ tp->mptcp->low_prio = event->low_prio;
++
++ tcp_send_ack(sk);
++ }
++ }
++ }
++ }
++next:
++ bh_unlock_sock(meta_sk);
++ sock_put(meta_sk);
++ }
++ rcu_read_unlock_bh();
++ }
++ goto next_event;
++}
++
++static struct mptcp_addr_event *lookup_similar_event(const struct net *net,
++ const struct mptcp_addr_event *event)
++{
++ struct mptcp_addr_event *eventq;
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
++
++ list_for_each_entry(eventq, &fm_ns->events, list) {
++ if (eventq->family != event->family)
++ continue;
++ if (event->family == AF_INET) {
++ if (eventq->addr.in.s_addr == event->addr.in.s_addr)
++ return eventq;
++ } else {
++ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
++ return eventq;
++ }
++ }
++ return NULL;
++}
++
++/* We already hold the net-namespace MPTCP-lock */
++static void add_pm_event(struct net *net, const struct mptcp_addr_event *event)
++{
++ struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
++
++ if (eventq) {
++ switch (event->code) {
++ case MPTCP_EVENT_DEL:
++ mptcp_debug("%s del old_code %u\n", __func__, eventq->code);
++ list_del(&eventq->list);
++ kfree(eventq);
++ break;
++ case MPTCP_EVENT_ADD:
++ mptcp_debug("%s add old_code %u\n", __func__, eventq->code);
++ eventq->low_prio = event->low_prio;
++ eventq->code = MPTCP_EVENT_ADD;
++ return;
++ case MPTCP_EVENT_MOD:
++ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code);
++ eventq->low_prio = event->low_prio;
++ eventq->code = MPTCP_EVENT_MOD;
++ return;
++ }
++ }
++
++ /* OK, we have to add the new address to the wait queue */
++ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
++ if (!eventq)
++ return;
++
++ list_add_tail(&eventq->list, &fm_ns->events);
++
++ /* Create work-queue */
++ if (!delayed_work_pending(&fm_ns->address_worker))
++ queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
++ msecs_to_jiffies(500));
++}
++
++static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event,
++ struct net *net)
++{
++ const struct net_device *netdev = ifa->ifa_dev->dev;
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
++ struct mptcp_addr_event mpevent;
++
++ if (ifa->ifa_scope > RT_SCOPE_LINK ||
++ ipv4_is_loopback(ifa->ifa_local))
++ return;
++
++ spin_lock_bh(&fm_ns->local_lock);
++
++ mpevent.family = AF_INET;
++ mpevent.addr.in.s_addr = ifa->ifa_local;
++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
++
++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
++ mpevent.code = MPTCP_EVENT_DEL;
++ else if (event == NETDEV_UP)
++ mpevent.code = MPTCP_EVENT_ADD;
++ else if (event == NETDEV_CHANGE)
++ mpevent.code = MPTCP_EVENT_MOD;
++
++ mptcp_debug("%s created event for %pI4, code %u prio %u\n", __func__,
++ &ifa->ifa_local, mpevent.code, mpevent.low_prio);
++ add_pm_event(net, &mpevent);
++
++ spin_unlock_bh(&fm_ns->local_lock);
++ return;
++}
++
++/* React on IPv4-addr add/rem-events */
++static int mptcp_pm_inetaddr_event(struct notifier_block *this,
++ unsigned long event, void *ptr)
++{
++ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
++ struct net *net = dev_net(ifa->ifa_dev->dev);
++
++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
++ event == NETDEV_CHANGE))
++ return NOTIFY_DONE;
++
++ addr4_event_handler(ifa, event, net);
++
++ return NOTIFY_DONE;
++}
++
++static struct notifier_block mptcp_pm_inetaddr_notifier = {
++ .notifier_call = mptcp_pm_inetaddr_event,
++};
++
++#if IS_ENABLED(CONFIG_IPV6)
++
++/* IPV6-related address/interface watchers */
++struct mptcp_dad_data {
++ struct timer_list timer;
++ struct inet6_ifaddr *ifa;
++};
++
++static void dad_callback(unsigned long arg);
++static int inet6_addr_event(struct notifier_block *this,
++ unsigned long event, void *ptr);
++
++static int ipv6_is_in_dad_state(const struct inet6_ifaddr *ifa)
++{
++ return (ifa->flags & IFA_F_TENTATIVE) &&
++ ifa->state == INET6_IFADDR_STATE_DAD;
++}
++
++static void dad_init_timer(struct mptcp_dad_data *data,
++ struct inet6_ifaddr *ifa)
++{
++ data->ifa = ifa;
++ data->timer.data = (unsigned long)data;
++ data->timer.function = dad_callback;
++ if (ifa->idev->cnf.rtr_solicit_delay)
++ data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
++ else
++ data->timer.expires = jiffies + (HZ/10);
++}
++
++static void dad_callback(unsigned long arg)
++{
++ struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
++
++ if (ipv6_is_in_dad_state(data->ifa)) {
++ dad_init_timer(data, data->ifa);
++ add_timer(&data->timer);
++ } else {
++ inet6_addr_event(NULL, NETDEV_UP, data->ifa);
++ in6_ifa_put(data->ifa);
++ kfree(data);
++ }
++}
++
++static inline void dad_setup_timer(struct inet6_ifaddr *ifa)
++{
++ struct mptcp_dad_data *data;
++
++ data = kmalloc(sizeof(*data), GFP_ATOMIC);
++
++ if (!data)
++ return;
++
++ init_timer(&data->timer);
++ dad_init_timer(data, ifa);
++ add_timer(&data->timer);
++ in6_ifa_hold(ifa);
++}
++
++static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event,
++ struct net *net)
++{
++ const struct net_device *netdev = ifa->idev->dev;
++ int addr_type = ipv6_addr_type(&ifa->addr);
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
++ struct mptcp_addr_event mpevent;
++
++ if (ifa->scope > RT_SCOPE_LINK ||
++ addr_type == IPV6_ADDR_ANY ||
++ (addr_type & IPV6_ADDR_LOOPBACK) ||
++ (addr_type & IPV6_ADDR_LINKLOCAL))
++ return;
++
++ spin_lock_bh(&fm_ns->local_lock);
++
++ mpevent.family = AF_INET6;
++ mpevent.addr.in6 = ifa->addr;
++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
++
++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
++ mpevent.code = MPTCP_EVENT_DEL;
++ else if (event == NETDEV_UP)
++ mpevent.code = MPTCP_EVENT_ADD;
++ else if (event == NETDEV_CHANGE)
++ mpevent.code = MPTCP_EVENT_MOD;
++
++ mptcp_debug("%s created event for %pI6, code %u prio %u\n", __func__,
++ &ifa->addr, mpevent.code, mpevent.low_prio);
++ add_pm_event(net, &mpevent);
++
++ spin_unlock_bh(&fm_ns->local_lock);
++ return;
++}
++
++/* React on IPv6-addr add/rem-events */
++static int inet6_addr_event(struct notifier_block *this, unsigned long event,
++ void *ptr)
++{
++ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
++ struct net *net = dev_net(ifa6->idev->dev);
++
++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
++ event == NETDEV_CHANGE))
++ return NOTIFY_DONE;
++
++ if (ipv6_is_in_dad_state(ifa6))
++ dad_setup_timer(ifa6);
++ else
++ addr6_event_handler(ifa6, event, net);
++
++ return NOTIFY_DONE;
++}
++
++static struct notifier_block inet6_addr_notifier = {
++ .notifier_call = inet6_addr_event,
++};
++
++#endif
++
++/* React on ifup/down-events */
++static int netdev_event(struct notifier_block *this, unsigned long event,
++ void *ptr)
++{
++ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
++ struct in_device *in_dev;
++#if IS_ENABLED(CONFIG_IPV6)
++ struct inet6_dev *in6_dev;
++#endif
++
++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
++ event == NETDEV_CHANGE))
++ return NOTIFY_DONE;
++
++ rcu_read_lock();
++ in_dev = __in_dev_get_rtnl(dev);
++
++ if (in_dev) {
++ for_ifa(in_dev) {
++ mptcp_pm_inetaddr_event(NULL, event, ifa);
++ } endfor_ifa(in_dev);
++ }
++
++#if IS_ENABLED(CONFIG_IPV6)
++ in6_dev = __in6_dev_get(dev);
++
++ if (in6_dev) {
++ struct inet6_ifaddr *ifa6;
++ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
++ inet6_addr_event(NULL, event, ifa6);
++ }
++#endif
++
++ rcu_read_unlock();
++ return NOTIFY_DONE;
++}
++
++static struct notifier_block mptcp_pm_netdev_notifier = {
++ .notifier_call = netdev_event,
++};
++
++static void full_mesh_add_raddr(struct mptcp_cb *mpcb,
++ const union inet_addr *addr,
++ sa_family_t family, __be16 port, u8 id)
++{
++ if (family == AF_INET)
++ mptcp_addv4_raddr(mpcb, &addr->in, port, id);
++ else
++ mptcp_addv6_raddr(mpcb, &addr->in6, port, id);
++}
++
++static void full_mesh_new_session(const struct sock *meta_sk)
++{
++ struct mptcp_loc_addr *mptcp_local;
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
++ int i, index;
++ union inet_addr saddr, daddr;
++ sa_family_t family;
++ bool meta_v4 = meta_sk->sk_family == AF_INET;
++
++ /* Init local variables necessary for the rest */
++ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) {
++ saddr.ip = inet_sk(meta_sk)->inet_saddr;
++ daddr.ip = inet_sk(meta_sk)->inet_daddr;
++ family = AF_INET;
++#if IS_ENABLED(CONFIG_IPV6)
++ } else {
++ saddr.in6 = inet6_sk(meta_sk)->saddr;
++ daddr.in6 = meta_sk->sk_v6_daddr;
++ family = AF_INET6;
++#endif
++ }
++
++ rcu_read_lock();
++ mptcp_local = rcu_dereference(fm_ns->local);
++
++ index = mptcp_find_address(mptcp_local, family, &saddr);
++ if (index < 0)
++ goto fallback;
++
++ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0);
++ mptcp_set_init_addr_bit(mpcb, &daddr, family, index);
++
++ /* Initialize workqueue-struct */
++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
++ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
++ fmp->mpcb = mpcb;
++
++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
++ goto skip_ipv4;
++
++ /* Look for the address among the local addresses */
++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
++ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
++
++ /* We do not need to announce the initial subflow's address again */
++ if (family == AF_INET && saddr.ip == ifa_address)
++ continue;
++
++ fmp->add_addr++;
++ mpcb->addr_signal = 1;
++ }
++
++skip_ipv4:
++#if IS_ENABLED(CONFIG_IPV6)
++ /* skip IPv6 addresses if meta-socket is IPv4 */
++ if (meta_v4)
++ goto skip_ipv6;
++
++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
++ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
++
++ /* We do not need to announce the initial subflow's address again */
++ if (family == AF_INET6 && ipv6_addr_equal(&saddr.in6, ifa6))
++ continue;
++
++ fmp->add_addr++;
++ mpcb->addr_signal = 1;
++ }
++
++skip_ipv6:
++#endif
++
++ rcu_read_unlock();
++
++ if (family == AF_INET)
++ fmp->announced_addrs_v4 |= (1 << index);
++ else
++ fmp->announced_addrs_v6 |= (1 << index);
++
++ for (i = fmp->add_addr; i && fmp->add_addr; i--)
++ tcp_send_ack(mpcb->master_sk);
++
++ return;
++
++fallback:
++ rcu_read_unlock();
++ mptcp_fallback_default(mpcb);
++ return;
++}
++
++static void full_mesh_create_subflows(struct sock *meta_sk)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++
++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
++ mpcb->send_infinite_mapping ||
++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
++ return;
++
++ if (mpcb->master_sk &&
++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
++ return;
++
++ if (!work_pending(&fmp->subflow_work)) {
++ sock_hold(meta_sk);
++ queue_work(mptcp_wq, &fmp->subflow_work);
++ }
++}
++
++/* Called upon release_sock, if the socket was owned by the user during
++ * a path-management event.
++ */
++static void full_mesh_release_sock(struct sock *meta_sk)
++{
++ struct mptcp_loc_addr *mptcp_local;
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
++ struct sock *sk, *tmpsk;
++ bool meta_v4 = meta_sk->sk_family == AF_INET;
++ int i;
++
++ rcu_read_lock();
++ mptcp_local = rcu_dereference(fm_ns->local);
++
++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
++ goto skip_ipv4;
++
++ /* First, detect modifications or additions */
++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
++ struct in_addr ifa = mptcp_local->locaddr4[i].addr;
++ bool found = false;
++
++ mptcp_for_each_sk(mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (sk->sk_family == AF_INET6 &&
++ !mptcp_v6_is_v4_mapped(sk))
++ continue;
++
++ if (inet_sk(sk)->inet_saddr != ifa.s_addr)
++ continue;
++
++ found = true;
++
++ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
++ tp->mptcp->send_mp_prio = 1;
++ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;
++
++ tcp_send_ack(sk);
++ }
++ }
++
++ if (!found) {
++ fmp->add_addr++;
++ mpcb->addr_signal = 1;
++
++ sk = mptcp_select_ack_sock(meta_sk);
++ if (sk)
++ tcp_send_ack(sk);
++ full_mesh_create_subflows(meta_sk);
++ }
++ }
++
++skip_ipv4:
++#if IS_ENABLED(CONFIG_IPV6)
++ /* skip IPv6 addresses if meta-socket is IPv4 */
++ if (meta_v4)
++ goto removal;
++
++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
++ struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
++ bool found = false;
++
++ mptcp_for_each_sk(mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (sk->sk_family == AF_INET ||
++ mptcp_v6_is_v4_mapped(sk))
++ continue;
++
++ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
++ continue;
++
++ found = true;
++
++ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
++ tp->mptcp->send_mp_prio = 1;
++ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;
++
++ tcp_send_ack(sk);
++ }
++ }
++
++ if (!found) {
++ fmp->add_addr++;
++ mpcb->addr_signal = 1;
++
++ sk = mptcp_select_ack_sock(meta_sk);
++ if (sk)
++ tcp_send_ack(sk);
++ full_mesh_create_subflows(meta_sk);
++ }
++ }
++
++removal:
++#endif
++
++ /* Now, detect address-removals */
++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
++ bool shall_remove = true;
++
++ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
++ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
++ shall_remove = false;
++ break;
++ }
++ }
++ } else {
++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
++ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
++ shall_remove = false;
++ break;
++ }
++ }
++ }
++
++ if (shall_remove) {
++ /* Reinject, so that pf = 1 and so we
++ * won't select this one as the
++ * ack-sock.
++ */
++ mptcp_reinject_data(sk, 0);
++
++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id,
++ meta_sk);
++
++ mptcp_sub_force_close(sk);
++ }
++ }
++
++ /* Just call it optimistically. It actually cannot do any harm */
++ update_addr_bitfields(meta_sk, mptcp_local);
++
++ rcu_read_unlock();
++}
++
++static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr,
++ struct net *net, bool *low_prio)
++{
++ struct mptcp_loc_addr *mptcp_local;
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
++ int index, id = -1;
++
++ /* Handle the backup-flows */
++ rcu_read_lock();
++ mptcp_local = rcu_dereference(fm_ns->local);
++
++ index = mptcp_find_address(mptcp_local, family, addr);
++
++ if (index != -1) {
++ if (family == AF_INET) {
++ id = mptcp_local->locaddr4[index].loc4_id;
++ *low_prio = mptcp_local->locaddr4[index].low_prio;
++ } else {
++ id = mptcp_local->locaddr6[index].loc6_id;
++ *low_prio = mptcp_local->locaddr6[index].low_prio;
++ }
++ }
++
++
++ rcu_read_unlock();
++
++ return id;
++}
++
++static void full_mesh_addr_signal(struct sock *sk, unsigned *size,
++ struct tcp_out_options *opts,
++ struct sk_buff *skb)
++{
++ const struct tcp_sock *tp = tcp_sk(sk);
++ struct mptcp_cb *mpcb = tp->mpcb;
++ struct sock *meta_sk = mpcb->meta_sk;
++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
++ struct mptcp_loc_addr *mptcp_local;
++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
++ int remove_addr_len;
++ u8 unannouncedv4 = 0, unannouncedv6 = 0;
++ bool meta_v4 = meta_sk->sk_family == AF_INET;
++
++ mpcb->addr_signal = 0;
++
++ if (likely(!fmp->add_addr))
++ goto remove_addr;
++
++ rcu_read_lock();
++ mptcp_local = rcu_dereference(fm_ns->local);
++
++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
++ goto skip_ipv4;
++
++ /* IPv4 */
++ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
++ if (unannouncedv4 &&
++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
++ int ind = mptcp_find_free_index(~unannouncedv4);
++
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_ADD_ADDR;
++ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
++ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
++ opts->add_addr_v4 = 1;
++
++ if (skb) {
++ fmp->announced_addrs_v4 |= (1 << ind);
++ fmp->add_addr--;
++ }
++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
++ }
++
++ if (meta_v4)
++ goto skip_ipv6;
++
++skip_ipv4:
++ /* IPv6 */
++ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
++ if (unannouncedv6 &&
++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
++ int ind = mptcp_find_free_index(~unannouncedv6);
++
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_ADD_ADDR;
++ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
++ opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
++ opts->add_addr_v6 = 1;
++
++ if (skb) {
++ fmp->announced_addrs_v6 |= (1 << ind);
++ fmp->add_addr--;
++ }
++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
++ }
++
++skip_ipv6:
++ rcu_read_unlock();
++
++ if (!unannouncedv4 && !unannouncedv6 && skb)
++ fmp->add_addr--;
++
++remove_addr:
++ if (likely(!fmp->remove_addrs))
++ goto exit;
++
++ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
++ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
++ goto exit;
++
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_REMOVE_ADDR;
++ opts->remove_addrs = fmp->remove_addrs;
++ *size += remove_addr_len;
++ if (skb)
++ fmp->remove_addrs = 0;
++
++exit:
++ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs);
++}
++
++static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id)
++{
++ mptcp_v4_rem_raddress(mpcb, rem_id);
++ mptcp_v6_rem_raddress(mpcb, rem_id);
++}
++
++/* Output /proc/net/mptcp_fullmesh */
++static int mptcp_fm_seq_show(struct seq_file *seq, void *v)
++{
++ const struct net *net = seq->private;
++ struct mptcp_loc_addr *mptcp_local;
++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
++ int i;
++
++ seq_printf(seq, "Index, Address-ID, Backup, IP-address\n");
++
++ rcu_read_lock_bh();
++ mptcp_local = rcu_dereference(fm_ns->local);
++
++ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index);
++
++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
++ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i];
++
++ seq_printf(seq, "%u, %u, %u, %pI4\n", i, loc4->loc4_id,
++ loc4->low_prio, &loc4->addr);
++ }
++
++ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index);
++
++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
++ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i];
++
++ seq_printf(seq, "%u, %u, %u, %pI6\n", i, loc6->loc6_id,
++ loc6->low_prio, &loc6->addr);
++ }
++ rcu_read_unlock_bh();
++
++ return 0;
++}
++
++static int mptcp_fm_seq_open(struct inode *inode, struct file *file)
++{
++ return single_open_net(inode, file, mptcp_fm_seq_show);
++}
++
++static const struct file_operations mptcp_fm_seq_fops = {
++ .owner = THIS_MODULE,
++ .open = mptcp_fm_seq_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = single_release_net,
++};
++
++static int mptcp_fm_init_net(struct net *net)
++{
++ struct mptcp_loc_addr *mptcp_local;
++ struct mptcp_fm_ns *fm_ns;
++ int err = 0;
++
++ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
++ if (!fm_ns)
++ return -ENOBUFS;
++
++ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
++ if (!mptcp_local) {
++ err = -ENOBUFS;
++ goto err_mptcp_local;
++ }
++
++ if (!proc_create("mptcp_fullmesh", S_IRUGO, net->proc_net,
++ &mptcp_fm_seq_fops)) {
++ err = -ENOMEM;
++ goto err_seq_fops;
++ }
++
++ mptcp_local->next_v4_index = 1;
++
++ rcu_assign_pointer(fm_ns->local, mptcp_local);
++ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
++ INIT_LIST_HEAD(&fm_ns->events);
++ spin_lock_init(&fm_ns->local_lock);
++ fm_ns->net = net;
++ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
++
++ return 0;
++err_seq_fops:
++ kfree(mptcp_local);
++err_mptcp_local:
++ kfree(fm_ns);
++ return err;
++}
++
++static void mptcp_fm_exit_net(struct net *net)
++{
++ struct mptcp_addr_event *eventq, *tmp;
++ struct mptcp_fm_ns *fm_ns;
++ struct mptcp_loc_addr *mptcp_local;
++
++ fm_ns = fm_get_ns(net);
++ cancel_delayed_work_sync(&fm_ns->address_worker);
++
++ rcu_read_lock_bh();
++
++ mptcp_local = rcu_dereference_bh(fm_ns->local);
++ kfree(mptcp_local);
++
++ spin_lock(&fm_ns->local_lock);
++ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
++ list_del(&eventq->list);
++ kfree(eventq);
++ }
++ spin_unlock(&fm_ns->local_lock);
++
++ rcu_read_unlock_bh();
++
++ remove_proc_entry("mptcp_fullmesh", net->proc_net);
++
++ kfree(fm_ns);
++}
++
++static struct pernet_operations full_mesh_net_ops = {
++ .init = mptcp_fm_init_net,
++ .exit = mptcp_fm_exit_net,
++};
++
++static struct mptcp_pm_ops full_mesh __read_mostly = {
++ .new_session = full_mesh_new_session,
++ .release_sock = full_mesh_release_sock,
++ .fully_established = full_mesh_create_subflows,
++ .new_remote_address = full_mesh_create_subflows,
++ .get_local_id = full_mesh_get_local_id,
++ .addr_signal = full_mesh_addr_signal,
++ .add_raddr = full_mesh_add_raddr,
++ .rem_raddr = full_mesh_rem_raddr,
++ .name = "fullmesh",
++ .owner = THIS_MODULE,
++};
++
++/* General initialization of MPTCP_PM */
++static int __init full_mesh_register(void)
++{
++ int ret;
++
++ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
++
++ ret = register_pernet_subsys(&full_mesh_net_ops);
++ if (ret)
++ goto out;
++
++ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
++ if (ret)
++ goto err_reg_inetaddr;
++ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
++ if (ret)
++ goto err_reg_netdev;
++
++#if IS_ENABLED(CONFIG_IPV6)
++ ret = register_inet6addr_notifier(&inet6_addr_notifier);
++ if (ret)
++ goto err_reg_inet6addr;
++#endif
++
++ ret = mptcp_register_path_manager(&full_mesh);
++ if (ret)
++ goto err_reg_pm;
++
++out:
++ return ret;
++
++
++err_reg_pm:
++#if IS_ENABLED(CONFIG_IPV6)
++ unregister_inet6addr_notifier(&inet6_addr_notifier);
++err_reg_inet6addr:
++#endif
++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
++err_reg_netdev:
++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
++err_reg_inetaddr:
++ unregister_pernet_subsys(&full_mesh_net_ops);
++ goto out;
++}
++
++static void full_mesh_unregister(void)
++{
++#if IS_ENABLED(CONFIG_IPV6)
++ unregister_inet6addr_notifier(&inet6_addr_notifier);
++#endif
++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
++ unregister_pernet_subsys(&full_mesh_net_ops);
++ mptcp_unregister_path_manager(&full_mesh);
++}
++
++module_init(full_mesh_register);
++module_exit(full_mesh_unregister);
++
++MODULE_AUTHOR("Christoph Paasch");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("Full-Mesh MPTCP");
++MODULE_VERSION("0.88");
+diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c
+new file mode 100644
+index 000000000000..43704ccb639e
+--- /dev/null
++++ b/net/mptcp/mptcp_input.c
+@@ -0,0 +1,2405 @@
++/*
++ * MPTCP implementation - Sending side
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <asm/unaligned.h>
++
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++#include <net/mptcp_v6.h>
++
++#include <linux/kconfig.h>
++
++/* is seq1 < seq2 ? */
++static inline bool before64(const u64 seq1, const u64 seq2)
++{
++ return (s64)(seq1 - seq2) < 0;
++}
++
++/* is seq1 > seq2 ? */
++#define after64(seq1, seq2) before64(seq2, seq1)
++
++static inline void mptcp_become_fully_estab(struct sock *sk)
++{
++ tcp_sk(sk)->mptcp->fully_established = 1;
++
++ if (is_master_tp(tcp_sk(sk)) &&
++ tcp_sk(sk)->mpcb->pm_ops->fully_established)
++ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
++}
++
++/* Similar to tcp_tso_acked without any memory accounting */
++static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk,
++ struct sk_buff *skb)
++{
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ u32 packets_acked, len;
++
++ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una));
++
++ packets_acked = tcp_skb_pcount(skb);
++
++ if (skb_unclone(skb, GFP_ATOMIC))
++ return 0;
++
++ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq;
++ __pskb_trim_head(skb, len);
++
++ TCP_SKB_CB(skb)->seq += len;
++ skb->ip_summed = CHECKSUM_PARTIAL;
++ skb->truesize -= len;
++
++ /* Any change of skb->len requires recalculation of tso factor. */
++ if (tcp_skb_pcount(skb) > 1)
++ tcp_set_skb_tso_segs(meta_sk, skb, tcp_skb_mss(skb));
++ packets_acked -= tcp_skb_pcount(skb);
++
++ if (packets_acked) {
++ BUG_ON(tcp_skb_pcount(skb) == 0);
++ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
++ }
++
++ return packets_acked;
++}
++
++/**
++ * Cleans the meta-socket retransmission queue and the reinject-queue.
++ * @sk must be the metasocket.
++ */
++static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
++{
++ struct sk_buff *skb, *tmp;
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ bool acked = false;
++ u32 acked_pcount;
++
++ while ((skb = tcp_write_queue_head(meta_sk)) &&
++ skb != tcp_send_head(meta_sk)) {
++ bool fully_acked = true;
++
++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
++ if (tcp_skb_pcount(skb) == 1 ||
++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
++ break;
++
++ acked_pcount = tcp_tso_acked(meta_sk, skb);
++ if (!acked_pcount)
++ break;
++
++ fully_acked = false;
++ } else {
++ acked_pcount = tcp_skb_pcount(skb);
++ }
++
++ acked = true;
++ meta_tp->packets_out -= acked_pcount;
++ meta_tp->retrans_stamp = 0;
++
++ if (!fully_acked)
++ break;
++
++ tcp_unlink_write_queue(skb, meta_sk);
++
++ if (mptcp_is_data_fin(skb)) {
++ struct sock *sk_it;
++
++ /* DATA_FIN has been acknowledged - now we can close
++ * the subflows
++ */
++ mptcp_for_each_sk(mpcb, sk_it) {
++ unsigned long delay = 0;
++
++ /* If we are the passive closer, don't trigger
++ * subflow-fin until the subflow has been finned
++ * by the peer - thus we add a delay.
++ */
++ if (mpcb->passive_close &&
++ sk_it->sk_state == TCP_ESTABLISHED)
++ delay = inet_csk(sk_it)->icsk_rto << 3;
++
++ mptcp_sub_close(sk_it, delay);
++ }
++ }
++ sk_wmem_free_skb(meta_sk, skb);
++ }
++ /* Remove acknowledged data from the reinject queue */
++ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
++ if (tcp_skb_pcount(skb) == 1 ||
++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
++ break;
++
++ mptcp_tso_acked_reinject(meta_sk, skb);
++ break;
++ }
++
++ __skb_unlink(skb, &mpcb->reinject_queue);
++ __kfree_skb(skb);
++ }
++
++ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
++ meta_tp->snd_up = meta_tp->snd_una;
++
++ if (acked) {
++ tcp_rearm_rto(meta_sk);
++ /* Normally this is done in tcp_try_undo_loss - but MPTCP
++ * does not call this function.
++ */
++ inet_csk(meta_sk)->icsk_retransmits = 0;
++ }
++}
++
++/* Inspired by tcp_rcv_state_process */
++static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
++ const struct sk_buff *skb, u32 data_seq,
++ u16 data_len)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
++ const struct tcphdr *th = tcp_hdr(skb);
++
++ /* State-machine handling if FIN has been enqueued and he has
++ * been acked (snd_una == write_seq) - it's important that this
++ * here is after sk_wmem_free_skb because otherwise
++ * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
++ */
++ switch (meta_sk->sk_state) {
++ case TCP_FIN_WAIT1: {
++ struct dst_entry *dst;
++ int tmo;
++
++ if (meta_tp->snd_una != meta_tp->write_seq)
++ break;
++
++ tcp_set_state(meta_sk, TCP_FIN_WAIT2);
++ meta_sk->sk_shutdown |= SEND_SHUTDOWN;
++
++ dst = __sk_dst_get(sk);
++ if (dst)
++ dst_confirm(dst);
++
++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
++ /* Wake up lingering close() */
++ meta_sk->sk_state_change(meta_sk);
++ break;
++ }
++
++ if (meta_tp->linger2 < 0 ||
++ (data_len &&
++ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
++ meta_tp->rcv_nxt))) {
++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
++ tcp_done(meta_sk);
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
++ return 1;
++ }
++
++ tmo = tcp_fin_time(meta_sk);
++ if (tmo > TCP_TIMEWAIT_LEN) {
++ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
++ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) {
++ /* Bad case. We could lose such FIN otherwise.
++ * It is not a big problem, but it looks confusing
++ * and not so rare event. We still can lose it now,
++ * if it spins in bh_lock_sock(), but it is really
++ * marginal case.
++ */
++ inet_csk_reset_keepalive_timer(meta_sk, tmo);
++ } else {
++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
++ }
++ break;
++ }
++ case TCP_CLOSING:
++ case TCP_LAST_ACK:
++ if (meta_tp->snd_una == meta_tp->write_seq) {
++ tcp_done(meta_sk);
++ return 1;
++ }
++ break;
++ }
++
++ /* step 7: process the segment text */
++ switch (meta_sk->sk_state) {
++ case TCP_FIN_WAIT1:
++ case TCP_FIN_WAIT2:
++ /* RFC 793 says to queue data in these states,
++ * RFC 1122 says we MUST send a reset.
++ * BSD 4.4 also does reset.
++ */
++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
++ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
++ !mptcp_is_data_fin2(skb, tp)) {
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
++ tcp_reset(meta_sk);
++ return 1;
++ }
++ }
++ break;
++ }
++
++ return 0;
++}
++
++/**
++ * @return:
++ * i) 1: Everything's fine.
++ * ii) -1: A reset has been sent on the subflow - csum-failure
++ * iii) 0: csum-failure but no reset sent, because it's the last subflow.
++ * Last packet should not be destroyed by the caller because it has
++ * been done here.
++ */
++static int mptcp_verif_dss_csum(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sk_buff *tmp, *tmp1, *last = NULL;
++ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
++ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
++ int iter = 0;
++
++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
++ unsigned int csum_len;
++
++ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
++ /* Mapping ends in the middle of the packet -
++ * csum only these bytes
++ */
++ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
++ else
++ csum_len = tmp->len;
++
++ offset = 0;
++ if (overflowed) {
++ char first_word[4];
++ first_word[0] = 0;
++ first_word[1] = 0;
++ first_word[2] = 0;
++ first_word[3] = *(tmp->data);
++ csum_tcp = csum_partial(first_word, 4, csum_tcp);
++ offset = 1;
++ csum_len--;
++ overflowed = 0;
++ }
++
++ csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
++
++ /* Was it on an odd-length? Then we have to merge the next byte
++ * correctly (see above)
++ */
++ if (csum_len != (csum_len & (~1)))
++ overflowed = 1;
++
++ if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
++ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
++
++ /* If a 64-bit dss is present, we increase the offset
++ * by 4 bytes, as the high-order 64-bits will be added
++ * in the final csum_partial-call.
++ */
++ u32 offset = skb_transport_offset(tmp) +
++ TCP_SKB_CB(tmp)->dss_off;
++ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
++ offset += 4;
++
++ csum_tcp = skb_checksum(tmp, offset,
++ MPTCP_SUB_LEN_SEQ_CSUM,
++ csum_tcp);
++
++ csum_tcp = csum_partial(&data_seq,
++ sizeof(data_seq), csum_tcp);
++
++ dss_csum_added = 1; /* Just do it once */
++ }
++ last = tmp;
++ iter++;
++
++ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
++ !before(TCP_SKB_CB(tmp1)->seq,
++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
++ break;
++ }
++
++ /* Now, checksum must be 0 */
++ if (unlikely(csum_fold(csum_tcp))) {
++ pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
++ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq,
++ dss_csum_added, overflowed, iter);
++
++ tp->mptcp->send_mp_fail = 1;
++
++ /* map_data_seq is the data-seq number of the
++ * mapping we are currently checking
++ */
++ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
++
++ if (tp->mpcb->cnt_subflows > 1) {
++ mptcp_send_reset(sk);
++ ans = -1;
++ } else {
++ tp->mpcb->send_infinite_mapping = 1;
++
++ /* Need to purge the rcv-queue as it's no more valid */
++ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
++ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
++ kfree_skb(tmp);
++ }
++
++ ans = 0;
++ }
++ }
++
++ return ans;
++}
++
++static inline void mptcp_prepare_skb(struct sk_buff *skb,
++ const struct sock *sk)
++{
++ const struct tcp_sock *tp = tcp_sk(sk);
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
++ u32 inc = 0;
++
++ /* If skb is the end of this mapping (end is always at mapping-boundary
++ * thanks to the splitting/trimming), then we need to increase
++ * data-end-seq by 1 if this here is a data-fin.
++ *
++ * We need to do -1 because end_seq includes the subflow-FIN.
++ */
++ if (tp->mptcp->map_data_fin &&
++ (tcb->end_seq - (tcp_hdr(skb)->fin ? 1 : 0)) ==
++ (tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
++ inc = 1;
++
++ /* We manually set the fin-flag if it is a data-fin. For easy
++ * processing in tcp_recvmsg.
++ */
++ tcp_hdr(skb)->fin = 1;
++ } else {
++ /* We may have a subflow-fin with data but without data-fin */
++ tcp_hdr(skb)->fin = 0;
++ }
++
++ /* Adapt data-seq's to the packet itself. We kinda transform the
++ * dss-mapping to a per-packet granularity. This is necessary to
++ * correctly handle overlapping mappings coming from different
++ * subflows. Otherwise it would be a complete mess.
++ */
++ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
++ tcb->end_seq = tcb->seq + skb->len + inc;
++}
++
++/**
++ * @return: 1 if the segment has been eaten and can be suppressed,
++ * otherwise 0.
++ */
++static inline int mptcp_direct_copy(const struct sk_buff *skb,
++ struct sock *meta_sk)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
++ int eaten = 0;
++
++ __set_current_state(TASK_RUNNING);
++
++ local_bh_enable();
++ if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
++ meta_tp->ucopy.len -= chunk;
++ meta_tp->copied_seq += chunk;
++ eaten = (chunk == skb->len);
++ tcp_rcv_space_adjust(meta_sk);
++ }
++ local_bh_disable();
++ return eaten;
++}
++
++static inline void mptcp_reset_mapping(struct tcp_sock *tp)
++{
++ tp->mptcp->map_data_len = 0;
++ tp->mptcp->map_data_seq = 0;
++ tp->mptcp->map_subseq = 0;
++ tp->mptcp->map_data_fin = 0;
++ tp->mptcp->mapping_present = 0;
++}
++
++/* The DSS-mapping received on the sk only covers the second half of the skb
++ * (cut at seq). We trim the head from the skb.
++ * Data will be freed upon kfree().
++ *
++ * Inspired by tcp_trim_head().
++ */
++static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
++{
++ int len = seq - TCP_SKB_CB(skb)->seq;
++ u32 new_seq = TCP_SKB_CB(skb)->seq + len;
++
++ if (len < skb_headlen(skb))
++ __skb_pull(skb, len);
++ else
++ __pskb_trim_head(skb, len - skb_headlen(skb));
++
++ TCP_SKB_CB(skb)->seq = new_seq;
++
++ skb->truesize -= len;
++ atomic_sub(len, &sk->sk_rmem_alloc);
++ sk_mem_uncharge(sk, len);
++}
++
++/* The DSS-mapping received on the sk only covers the first half of the skb
++ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
++ * as further packets may resolve the mapping of the second half of data.
++ *
++ * Inspired by tcp_fragment().
++ */
++static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
++{
++ struct sk_buff *buff;
++ int nsize;
++ int nlen, len;
++
++ len = seq - TCP_SKB_CB(skb)->seq;
++ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
++ if (nsize < 0)
++ nsize = 0;
++
++ /* Get a new skb... force flag on. */
++ buff = alloc_skb(nsize, GFP_ATOMIC);
++ if (buff == NULL)
++ return -ENOMEM;
++
++ skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
++ skb_reset_transport_header(buff);
++
++ tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
++ tcp_hdr(skb)->fin = 0;
++
++ /* We absolutly need to call skb_set_owner_r before refreshing the
++ * truesize of buff, otherwise the moved data will account twice.
++ */
++ skb_set_owner_r(buff, sk);
++ nlen = skb->len - len - nsize;
++ buff->truesize += nlen;
++ skb->truesize -= nlen;
++
++ /* Correct the sequence numbers. */
++ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
++ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
++ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
++
++ skb_split(skb, buff, len);
++
++ __skb_queue_after(&sk->sk_receive_queue, skb, buff);
++
++ return 0;
++}
++
++/* @return: 0 everything is fine. Just continue processing
++ * 1 subflow is broken stop everything
++ * -1 this packet was broken - continue with the next one.
++ */
++static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
++ if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
++ !tp->mpcb->infinite_mapping_rcv) {
++ /* Remove a pure subflow-fin from the queue and increase
++ * copied_seq.
++ */
++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
++ __skb_unlink(skb, &sk->sk_receive_queue);
++ __kfree_skb(skb);
++ return -1;
++ }
++
++ /* If we are not yet fully established and do not know the mapping for
++ * this segment, this path has to fallback to infinite or be torn down.
++ */
++ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
++ !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) {
++ pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
++ __func__, tp->mpcb->mptcp_loc_token,
++ tp->mptcp->path_index, __builtin_return_address(0),
++ TCP_SKB_CB(skb)->seq);
++
++ if (!is_master_tp(tp)) {
++ mptcp_send_reset(sk);
++ return 1;
++ }
++
++ tp->mpcb->infinite_mapping_snd = 1;
++ tp->mpcb->infinite_mapping_rcv = 1;
++ /* We do a seamless fallback and should not send a inf.mapping. */
++ tp->mpcb->send_infinite_mapping = 0;
++ tp->mptcp->fully_established = 1;
++ }
++
++ /* Receiver-side becomes fully established when a whole rcv-window has
++ * been received without the need to fallback due to the previous
++ * condition.
++ */
++ if (!tp->mptcp->fully_established) {
++ tp->mptcp->init_rcv_wnd -= skb->len;
++ if (tp->mptcp->init_rcv_wnd < 0)
++ mptcp_become_fully_estab(sk);
++ }
++
++ return 0;
++}
++
++/* @return: 0 everything is fine. Just continue processing
++ * 1 subflow is broken stop everything
++ * -1 this packet was broken - continue with the next one.
++ */
++static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
++{
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
++ struct mptcp_cb *mpcb = tp->mpcb;
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
++ u32 *ptr;
++ u32 data_seq, sub_seq, data_len, tcp_end_seq;
++
++ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
++ * in-order at the data-level. Thus data-seq-numbers can be inferred
++ * from what is expected at the data-level.
++ */
++ if (mpcb->infinite_mapping_rcv) {
++ tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp);
++ tp->mptcp->map_subseq = tcb->seq;
++ tp->mptcp->map_data_len = skb->len;
++ tp->mptcp->map_data_fin = tcp_hdr(skb)->fin;
++ tp->mptcp->mapping_present = 1;
++ return 0;
++ }
++
++ /* No mapping here? Exit - it is either already set or still on its way */
++ if (!mptcp_is_data_seq(skb)) {
++ /* Too many packets without a mapping - this subflow is broken */
++ if (!tp->mptcp->mapping_present &&
++ tp->rcv_nxt - tp->copied_seq > 65536) {
++ mptcp_send_reset(sk);
++ return 1;
++ }
++
++ return 0;
++ }
++
++ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
++ ptr++;
++ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
++ ptr++;
++ data_len = get_unaligned_be16(ptr);
++
++ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
++ * The draft sets it to 0, but we really would like to have the
++ * real value, to have an easy handling afterwards here in this
++ * function.
++ */
++ if (mptcp_is_data_fin(skb) && skb->len == 0)
++ sub_seq = TCP_SKB_CB(skb)->seq;
++
++ /* If there is already a mapping - we check if it maps with the current
++ * one. If not - we reset.
++ */
++ if (tp->mptcp->mapping_present &&
++ (data_seq != (u32)tp->mptcp->map_data_seq ||
++ sub_seq != tp->mptcp->map_subseq ||
++ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
++ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
++ /* Mapping in packet is different from what we want */
++ pr_err("%s Mappings do not match!\n", __func__);
++ pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
++ __func__, data_seq, (u32)tp->mptcp->map_data_seq,
++ sub_seq, tp->mptcp->map_subseq, data_len,
++ tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
++ tp->mptcp->map_data_fin);
++ mptcp_send_reset(sk);
++ return 1;
++ }
++
++ /* If the previous check was good, the current mapping is valid and we exit. */
++ if (tp->mptcp->mapping_present)
++ return 0;
++
++ /* Mapping not yet set on this subflow - we set it here! */
++
++ if (!data_len) {
++ mpcb->infinite_mapping_rcv = 1;
++ tp->mptcp->fully_established = 1;
++ /* We need to repeat mp_fail's until the sender felt
++ * back to infinite-mapping - here we stop repeating it.
++ */
++ tp->mptcp->send_mp_fail = 0;
++
++ /* We have to fixup data_len - it must be the same as skb->len */
++ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
++ sub_seq = tcb->seq;
++
++ /* TODO kill all other subflows than this one */
++ /* data_seq and so on are set correctly */
++
++ /* At this point, the meta-ofo-queue has to be emptied,
++ * as the following data is guaranteed to be in-order at
++ * the data and subflow-level
++ */
++ mptcp_purge_ofo_queue(meta_tp);
++ }
++
++ /* We are sending mp-fail's and thus are in fallback mode.
++ * Ignore packets which do not announce the fallback and still
++ * want to provide a mapping.
++ */
++ if (tp->mptcp->send_mp_fail) {
++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
++ __skb_unlink(skb, &sk->sk_receive_queue);
++ __kfree_skb(skb);
++ return -1;
++ }
++
++ /* FIN increased the mapping-length by 1 */
++ if (mptcp_is_data_fin(skb))
++ data_len--;
++
++ /* Subflow-sequences of packet must be
++ * (at least partially) be part of the DSS-mapping's
++ * subflow-sequence-space.
++ *
++ * Basically the mapping is not valid, if either of the
++ * following conditions is true:
++ *
++ * 1. It's not a data_fin and
++ * MPTCP-sub_seq >= TCP-end_seq
++ *
++ * 2. It's a data_fin and TCP-end_seq > TCP-seq and
++ * MPTCP-sub_seq >= TCP-end_seq
++ *
++ * The previous two can be merged into:
++ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
++ * Because if it's not a data-fin, TCP-end_seq > TCP-seq
++ *
++ * 3. It's a data_fin and skb->len == 0 and
++ * MPTCP-sub_seq > TCP-end_seq
++ *
++ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
++ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
++ *
++ * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
++ */
++
++ /* subflow-fin is not part of the mapping - ignore it here ! */
++ tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
++ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
++ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
++ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
++ before(sub_seq, tp->copied_seq)) {
++ /* Subflow-sequences of packet is different from what is in the
++ * packet's dss-mapping. The peer is misbehaving - reset
++ */
++ pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
++ "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u"
++ "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
++ skb->len, data_len, tp->copied_seq);
++ mptcp_send_reset(sk);
++ return 1;
++ }
++
++ /* Does the DSS had 64-bit seqnum's ? */
++ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
++ /* Wrapped around? */
++ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
++ } else {
++ /* Else, access the default high-order bits */
++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
++ }
++ } else {
++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
++
++ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
++ /* We make sure that the data_seq is invalid.
++ * It will be dropped later.
++ */
++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
++ }
++ }
++
++ tp->mptcp->map_data_len = data_len;
++ tp->mptcp->map_subseq = sub_seq;
++ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
++ tp->mptcp->mapping_present = 1;
++
++ return 0;
++}
++
++/* Similar to tcp_sequence(...) */
++static inline bool mptcp_sequence(const struct tcp_sock *meta_tp,
++ u64 data_seq, u64 end_data_seq)
++{
++ const struct mptcp_cb *mpcb = meta_tp->mpcb;
++ u64 rcv_wup64;
++
++ /* Wrap-around? */
++ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
++ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
++ meta_tp->rcv_wup;
++ } else {
++ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
++ meta_tp->rcv_wup);
++ }
++
++ return !before64(end_data_seq, rcv_wup64) &&
++ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
++}
++
++/* @return: 0 everything is fine. Just continue processing
++ * -1 this packet was broken - continue with the next one.
++ */
++static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sk_buff *tmp, *tmp1;
++ u32 tcp_end_seq;
++
++ if (!tp->mptcp->mapping_present)
++ return 0;
++
++ /* either, the new skb gave us the mapping and the first segment
++ * in the sub-rcv-queue has to be trimmed ...
++ */
++ tmp = skb_peek(&sk->sk_receive_queue);
++ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
++ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
++ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
++
++ /* ... or the new skb (tail) has to be split at the end. */
++ tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
++ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
++ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
++ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
++ /* TODO : maybe handle this here better.
++ * We now just force meta-retransmission.
++ */
++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
++ __skb_unlink(skb, &sk->sk_receive_queue);
++ __kfree_skb(skb);
++ return -1;
++ }
++ }
++
++ /* Now, remove old sk_buff's from the receive-queue.
++ * This may happen if the mapping has been lost for these segments and
++ * the next mapping has already been received.
++ */
++ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
++ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
++ break;
++
++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
++ __skb_unlink(tmp1, &sk->sk_receive_queue);
++
++ /* Impossible that we could free skb here, because its
++ * mapping is known to be valid from previous checks
++ */
++ __kfree_skb(tmp1);
++ }
++ }
++
++ return 0;
++}
++
++/* @return: 0 everything is fine. Just continue processing
++ * 1 subflow is broken stop everything
++ * -1 this mapping has been put in the meta-receive-queue
++ * -2 this mapping has been eaten by the application
++ */
++static int mptcp_queue_skb(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++ struct mptcp_cb *mpcb = tp->mpcb;
++ struct sk_buff *tmp, *tmp1;
++ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
++ bool data_queued = false;
++
++ /* Have we not yet received the full mapping? */
++ if (!tp->mptcp->mapping_present ||
++ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
++ return 0;
++
++ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
++ * OR
++ * This mapping is out of window
++ */
++ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
++ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
++ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
++ __skb_unlink(tmp1, &sk->sk_receive_queue);
++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
++ __kfree_skb(tmp1);
++
++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
++ !before(TCP_SKB_CB(tmp)->seq,
++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
++ break;
++ }
++
++ mptcp_reset_mapping(tp);
++
++ return -1;
++ }
++
++ /* Record it, because we want to send our data_fin on the same path */
++ if (tp->mptcp->map_data_fin) {
++ mpcb->dfin_path_index = tp->mptcp->path_index;
++ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
++ }
++
++ /* Verify the checksum */
++ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
++ int ret = mptcp_verif_dss_csum(sk);
++
++ if (ret <= 0) {
++ mptcp_reset_mapping(tp);
++ return 1;
++ }
++ }
++
++ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
++ /* Seg's have to go to the meta-ofo-queue */
++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
++ mptcp_prepare_skb(tmp1, sk);
++ __skb_unlink(tmp1, &sk->sk_receive_queue);
++ /* MUST be done here, because fragstolen may be true later.
++ * Then, kfree_skb_partial will not account the memory.
++ */
++ skb_orphan(tmp1);
++
++ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
++ mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
++ else
++ __kfree_skb(tmp1);
++
++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
++ !before(TCP_SKB_CB(tmp)->seq,
++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
++ break;
++ }
++ tcp_enter_quickack_mode(sk);
++ } else {
++ /* Ready for the meta-rcv-queue */
++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
++ int eaten = 0;
++ const bool copied_early = false;
++ bool fragstolen = false;
++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
++
++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
++ mptcp_prepare_skb(tmp1, sk);
++ __skb_unlink(tmp1, &sk->sk_receive_queue);
++ /* MUST be done here, because fragstolen may be true.
++ * Then, kfree_skb_partial will not account the memory.
++ */
++ skb_orphan(tmp1);
++
++ /* This segment has already been received */
++ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
++ __kfree_skb(tmp1);
++ goto next;
++ }
++
++#ifdef CONFIG_NET_DMA
++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
++ meta_tp->ucopy.task == current &&
++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
++ tmp1->len <= meta_tp->ucopy.len &&
++ sock_owned_by_user(meta_sk) &&
++ tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
++ copied_early = true;
++ eaten = 1;
++ }
++#endif
++
++ /* Is direct copy possible ? */
++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
++ meta_tp->ucopy.task == current &&
++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
++ meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
++ !copied_early)
++ eaten = mptcp_direct_copy(tmp1, meta_sk);
++
++ if (mpcb->in_time_wait) /* In time-wait, do not receive data */
++ eaten = 1;
++
++ if (!eaten)
++ eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
++
++ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
++
++#ifdef CONFIG_NET_DMA
++ if (copied_early)
++ meta_tp->cleanup_rbuf(meta_sk, tmp1->len);
++#endif
++
++ if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
++ mptcp_fin(meta_sk);
++
++ /* Check if this fills a gap in the ofo queue */
++ if (!skb_queue_empty(&meta_tp->out_of_order_queue))
++ mptcp_ofo_queue(meta_sk);
++
++#ifdef CONFIG_NET_DMA
++ if (copied_early)
++ __skb_queue_tail(&meta_sk->sk_async_wait_queue,
++ tmp1);
++ else
++#endif
++ if (eaten)
++ kfree_skb_partial(tmp1, fragstolen);
++
++ data_queued = true;
++next:
++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
++ !before(TCP_SKB_CB(tmp)->seq,
++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
++ break;
++ }
++ }
++
++ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
++ mptcp_reset_mapping(tp);
++
++ return data_queued ? -1 : -2;
++}
++
++void mptcp_data_ready(struct sock *sk)
++{
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++ struct sk_buff *skb, *tmp;
++ int queued = 0;
++
++ /* restart before the check, because mptcp_fin might have changed the
++ * state.
++ */
++restart:
++ /* If the meta cannot receive data, there is no point in pushing data.
++ * If we are in time-wait, we may still be waiting for the final FIN.
++ * So, we should proceed with the processing.
++ */
++ if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) {
++ skb_queue_purge(&sk->sk_receive_queue);
++ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
++ goto exit;
++ }
++
++ /* Iterate over all segments, detect their mapping (if we don't have
++ * one yet), validate them and push everything one level higher.
++ */
++ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
++ int ret;
++ /* Pre-validation - e.g., early fallback */
++ ret = mptcp_prevalidate_skb(sk, skb);
++ if (ret < 0)
++ goto restart;
++ else if (ret > 0)
++ break;
++
++ /* Set the current mapping */
++ ret = mptcp_detect_mapping(sk, skb);
++ if (ret < 0)
++ goto restart;
++ else if (ret > 0)
++ break;
++
++ /* Validation */
++ if (mptcp_validate_mapping(sk, skb) < 0)
++ goto restart;
++
++ /* Push a level higher */
++ ret = mptcp_queue_skb(sk);
++ if (ret < 0) {
++ if (ret == -1)
++ queued = ret;
++ goto restart;
++ } else if (ret == 0) {
++ continue;
++ } else { /* ret == 1 */
++ break;
++ }
++ }
++
++exit:
++ if (tcp_sk(sk)->close_it) {
++ tcp_send_ack(sk);
++ tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0);
++ }
++
++ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
++ meta_sk->sk_data_ready(meta_sk);
++}
++
++
++int mptcp_check_req(struct sk_buff *skb, struct net *net)
++{
++ const struct tcphdr *th = tcp_hdr(skb);
++ struct sock *meta_sk = NULL;
++
++ /* MPTCP structures not initialized */
++ if (mptcp_init_failed)
++ return 0;
++
++ if (skb->protocol == htons(ETH_P_IP))
++ meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
++ ip_hdr(skb)->daddr, net);
++#if IS_ENABLED(CONFIG_IPV6)
++ else /* IPv6 */
++ meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
++ &ipv6_hdr(skb)->daddr, net);
++#endif /* CONFIG_IPV6 */
++
++ if (!meta_sk)
++ return 0;
++
++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
++
++ bh_lock_sock_nested(meta_sk);
++ if (sock_owned_by_user(meta_sk)) {
++ skb->sk = meta_sk;
++ if (unlikely(sk_add_backlog(meta_sk, skb,
++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
++ bh_unlock_sock(meta_sk);
++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
++ sock_put(meta_sk); /* Taken by mptcp_search_req */
++ kfree_skb(skb);
++ return 1;
++ }
++ } else if (skb->protocol == htons(ETH_P_IP)) {
++ tcp_v4_do_rcv(meta_sk, skb);
++#if IS_ENABLED(CONFIG_IPV6)
++ } else { /* IPv6 */
++ tcp_v6_do_rcv(meta_sk, skb);
++#endif /* CONFIG_IPV6 */
++ }
++ bh_unlock_sock(meta_sk);
++ sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
++ return 1;
++}
++
++struct mp_join *mptcp_find_join(const struct sk_buff *skb)
++{
++ const struct tcphdr *th = tcp_hdr(skb);
++ unsigned char *ptr;
++ int length = (th->doff * 4) - sizeof(struct tcphdr);
++
++ /* Jump through the options to check whether JOIN is there */
++ ptr = (unsigned char *)(th + 1);
++ while (length > 0) {
++ int opcode = *ptr++;
++ int opsize;
++
++ switch (opcode) {
++ case TCPOPT_EOL:
++ return NULL;
++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
++ length--;
++ continue;
++ default:
++ opsize = *ptr++;
++ if (opsize < 2) /* "silly options" */
++ return NULL;
++ if (opsize > length)
++ return NULL; /* don't parse partial options */
++ if (opcode == TCPOPT_MPTCP &&
++ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
++ return (struct mp_join *)(ptr - 2);
++ }
++ ptr += opsize - 2;
++ length -= opsize;
++ }
++ }
++ return NULL;
++}
++
++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
++{
++ const struct mptcp_cb *mpcb;
++ struct sock *meta_sk;
++ u32 token;
++ bool meta_v4;
++ struct mp_join *join_opt = mptcp_find_join(skb);
++ if (!join_opt)
++ return 0;
++
++ /* MPTCP structures were not initialized, so return error */
++ if (mptcp_init_failed)
++ return -1;
++
++ token = join_opt->u.syn.token;
++ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
++ if (!meta_sk) {
++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
++ return -1;
++ }
++
++ meta_v4 = meta_sk->sk_family == AF_INET;
++ if (meta_v4) {
++ if (skb->protocol == htons(ETH_P_IPV6)) {
++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ return -1;
++ }
++ } else if (skb->protocol == htons(ETH_P_IP) &&
++ inet6_sk(meta_sk)->ipv6only) {
++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ return -1;
++ }
++
++ mpcb = tcp_sk(meta_sk)->mpcb;
++ if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
++ /* We are in fallback-mode on the reception-side -
++ * no new subflows!
++ */
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ return -1;
++ }
++
++ /* Coming from time-wait-sock processing in tcp_v4_rcv.
++ * We have to deschedule it before continuing, because otherwise
++ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
++ */
++ if (tw) {
++ inet_twsk_deschedule(tw, &tcp_death_row);
++ inet_twsk_put(tw);
++ }
++
++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
++ /* OK, this is a new syn/join, let's create a new open request and
++ * send syn+ack
++ */
++ bh_lock_sock_nested(meta_sk);
++ if (sock_owned_by_user(meta_sk)) {
++ skb->sk = meta_sk;
++ if (unlikely(sk_add_backlog(meta_sk, skb,
++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
++ bh_unlock_sock(meta_sk);
++ NET_INC_STATS_BH(sock_net(meta_sk),
++ LINUX_MIB_TCPBACKLOGDROP);
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ kfree_skb(skb);
++ return 1;
++ }
++ } else if (skb->protocol == htons(ETH_P_IP)) {
++ tcp_v4_do_rcv(meta_sk, skb);
++#if IS_ENABLED(CONFIG_IPV6)
++ } else {
++ tcp_v6_do_rcv(meta_sk, skb);
++#endif /* CONFIG_IPV6 */
++ }
++ bh_unlock_sock(meta_sk);
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ return 1;
++}
++
++int mptcp_do_join_short(struct sk_buff *skb,
++ const struct mptcp_options_received *mopt,
++ struct net *net)
++{
++ struct sock *meta_sk;
++ u32 token;
++ bool meta_v4;
++
++ token = mopt->mptcp_rem_token;
++ meta_sk = mptcp_hash_find(net, token);
++ if (!meta_sk) {
++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
++ return -1;
++ }
++
++ meta_v4 = meta_sk->sk_family == AF_INET;
++ if (meta_v4) {
++ if (skb->protocol == htons(ETH_P_IPV6)) {
++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ return -1;
++ }
++ } else if (skb->protocol == htons(ETH_P_IP) &&
++ inet6_sk(meta_sk)->ipv6only) {
++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ return -1;
++ }
++
++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
++
++ /* OK, this is a new syn/join, let's create a new open request and
++ * send syn+ack
++ */
++ bh_lock_sock(meta_sk);
++
++ /* This check is also done in mptcp_vX_do_rcv. But, there we cannot
++ * call tcp_vX_send_reset, because we hold already two socket-locks.
++ * (the listener and the meta from above)
++ *
++ * And the send-reset will try to take yet another one (ip_send_reply).
++ * Thus, we propagate the reset up to tcp_rcv_state_process.
++ */
++ if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
++ tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
++ meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
++ bh_unlock_sock(meta_sk);
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ return -1;
++ }
++
++ if (sock_owned_by_user(meta_sk)) {
++ skb->sk = meta_sk;
++ if (unlikely(sk_add_backlog(meta_sk, skb,
++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
++ else
++ /* Must make sure that upper layers won't free the
++ * skb if it is added to the backlog-queue.
++ */
++ skb_get(skb);
++ } else {
++ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
++ * the skb will finally be freed by tcp_v4_do_rcv (where we are
++ * coming from)
++ */
++ skb_get(skb);
++ if (skb->protocol == htons(ETH_P_IP)) {
++ tcp_v4_do_rcv(meta_sk, skb);
++#if IS_ENABLED(CONFIG_IPV6)
++ } else { /* IPv6 */
++ tcp_v6_do_rcv(meta_sk, skb);
++#endif /* CONFIG_IPV6 */
++ }
++ }
++
++ bh_unlock_sock(meta_sk);
++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
++ return 0;
++}
++
++/**
++ * Equivalent of tcp_fin() for MPTCP
++ * Can be called only when the FIN is validly part
++ * of the data seqnum space. Not before when we get holes.
++ */
++void mptcp_fin(struct sock *meta_sk)
++{
++ struct sock *sk = NULL, *sk_it;
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++
++ mptcp_for_each_sk(mpcb, sk_it) {
++ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
++ sk = sk_it;
++ break;
++ }
++ }
++
++ if (!sk || sk->sk_state == TCP_CLOSE)
++ sk = mptcp_select_ack_sock(meta_sk);
++
++ inet_csk_schedule_ack(sk);
++
++ meta_sk->sk_shutdown |= RCV_SHUTDOWN;
++ sock_set_flag(meta_sk, SOCK_DONE);
++
++ switch (meta_sk->sk_state) {
++ case TCP_SYN_RECV:
++ case TCP_ESTABLISHED:
++ /* Move to CLOSE_WAIT */
++ tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
++ inet_csk(sk)->icsk_ack.pingpong = 1;
++ break;
++
++ case TCP_CLOSE_WAIT:
++ case TCP_CLOSING:
++ /* Received a retransmission of the FIN, do
++ * nothing.
++ */
++ break;
++ case TCP_LAST_ACK:
++ /* RFC793: Remain in the LAST-ACK state. */
++ break;
++
++ case TCP_FIN_WAIT1:
++ /* This case occurs when a simultaneous close
++ * happens, we must ack the received FIN and
++ * enter the CLOSING state.
++ */
++ tcp_send_ack(sk);
++ tcp_set_state(meta_sk, TCP_CLOSING);
++ break;
++ case TCP_FIN_WAIT2:
++ /* Received a FIN -- send ACK and enter TIME_WAIT. */
++ tcp_send_ack(sk);
++ meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0);
++ break;
++ default:
++ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
++ * cases we should never reach this piece of code.
++ */
++ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
++ meta_sk->sk_state);
++ break;
++ }
++
++ /* It _is_ possible, that we have something out-of-order _after_ FIN.
++ * Probably, we should reset in this case. For now drop them.
++ */
++ mptcp_purge_ofo_queue(meta_tp);
++ sk_mem_reclaim(meta_sk);
++
++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
++ meta_sk->sk_state_change(meta_sk);
++
++ /* Do not send POLL_HUP for half duplex close. */
++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
++ meta_sk->sk_state == TCP_CLOSE)
++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
++ else
++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
++ }
++
++ return;
++}
++
++static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct sk_buff *skb;
++
++ if (!meta_tp->packets_out)
++ return;
++
++ tcp_for_write_queue(skb, meta_sk) {
++ if (skb == tcp_send_head(meta_sk))
++ break;
++
++ if (mptcp_retransmit_skb(meta_sk, skb))
++ return;
++
++ if (skb == tcp_write_queue_head(meta_sk))
++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
++ inet_csk(meta_sk)->icsk_rto,
++ TCP_RTO_MAX);
++ }
++}
++
++/* Handle the DATA_ACK */
++static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
++{
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
++ u32 prior_snd_una = meta_tp->snd_una;
++ int prior_packets;
++ u32 nwin, data_ack, data_seq;
++ u16 data_len = 0;
++
++ /* A valid packet came in - subflow is operational again */
++ tp->pf = 0;
++
++ /* Even if there is no data-ack, we stop retransmitting.
++ * Except if this is a SYN/ACK. Then it is just a retransmission
++ */
++ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
++ tp->mptcp->pre_established = 0;
++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
++ }
++
++ /* If we are in infinite mapping mode, rx_opt.data_ack has been
++ * set by mptcp_clean_rtx_infinite.
++ */
++ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
++ goto exit;
++
++ data_ack = tp->mptcp->rx_opt.data_ack;
++
++ if (unlikely(!tp->mptcp->fully_established) &&
++ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
++ /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1)
++ * includes a data-ack, we are fully established
++ */
++ mptcp_become_fully_estab(sk);
++
++ /* Get the data_seq */
++ if (mptcp_is_data_seq(skb)) {
++ data_seq = tp->mptcp->rx_opt.data_seq;
++ data_len = tp->mptcp->rx_opt.data_len;
++ } else {
++ data_seq = meta_tp->snd_wl1;
++ }
++
++ /* If the ack is older than previous acks
++ * then we can probably ignore it.
++ */
++ if (before(data_ack, prior_snd_una))
++ goto exit;
++
++ /* If the ack includes data we haven't sent yet, discard
++ * this segment (RFC793 Section 3.9).
++ */
++ if (after(data_ack, meta_tp->snd_nxt))
++ goto exit;
++
++ /*** Now, update the window - inspired by tcp_ack_update_window ***/
++ nwin = ntohs(tcp_hdr(skb)->window);
++
++ if (likely(!tcp_hdr(skb)->syn))
++ nwin <<= tp->rx_opt.snd_wscale;
++
++ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
++ tcp_update_wl(meta_tp, data_seq);
++
++ /* Draft v09, Section 3.3.5:
++ * [...] It should only update its local receive window values
++ * when the largest sequence number allowed (i.e. DATA_ACK +
++ * receive window) increases. [...]
++ */
++ if (meta_tp->snd_wnd != nwin &&
++ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
++ meta_tp->snd_wnd = nwin;
++
++ if (nwin > meta_tp->max_window)
++ meta_tp->max_window = nwin;
++ }
++ }
++ /*** Done, update the window ***/
++
++ /* We passed data and got it acked, remove any soft error
++ * log. Something worked...
++ */
++ sk->sk_err_soft = 0;
++ inet_csk(meta_sk)->icsk_probes_out = 0;
++ meta_tp->rcv_tstamp = tcp_time_stamp;
++ prior_packets = meta_tp->packets_out;
++ if (!prior_packets)
++ goto no_queue;
++
++ meta_tp->snd_una = data_ack;
++
++ mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
++
++ /* We are in loss-state, and something got acked, retransmit the whole
++ * queue now!
++ */
++ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
++ after(data_ack, prior_snd_una)) {
++ mptcp_xmit_retransmit_queue(meta_sk);
++ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
++ }
++
++ /* Simplified version of tcp_new_space, because the snd-buffer
++ * is handled by all the subflows.
++ */
++ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
++ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
++ if (meta_sk->sk_socket &&
++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
++ meta_sk->sk_write_space(meta_sk);
++ }
++
++ if (meta_sk->sk_state != TCP_ESTABLISHED &&
++ mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
++ return;
++
++exit:
++ mptcp_push_pending_frames(meta_sk);
++
++ return;
++
++no_queue:
++ if (tcp_send_head(meta_sk))
++ tcp_ack_probe(meta_sk);
++
++ mptcp_push_pending_frames(meta_sk);
++
++ return;
++}
++
++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
++
++ if (!tp->mpcb->infinite_mapping_snd)
++ return;
++
++ /* The difference between both write_seq's represents the offset between
++ * data-sequence and subflow-sequence. As we are infinite, this must
++ * match.
++ *
++ * Thus, from this difference we can infer the meta snd_una.
++ */
++ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
++ tp->snd_una;
++
++ mptcp_data_ack(sk, skb);
++}
++
++/**** static functions used by mptcp_parse_options */
++
++static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
++{
++ struct sock *sk_it, *tmpsk;
++
++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
++ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
++ mptcp_reinject_data(sk_it, 0);
++ sk_it->sk_err = ECONNRESET;
++ if (tcp_need_reset(sk_it->sk_state))
++ tcp_sk(sk_it)->ops->send_active_reset(sk_it,
++ GFP_ATOMIC);
++ mptcp_sub_force_close(sk_it);
++ }
++ }
++}
++
++void mptcp_parse_options(const uint8_t *ptr, int opsize,
++ struct mptcp_options_received *mopt,
++ const struct sk_buff *skb)
++{
++ const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
++
++ /* If the socket is mp-capable we would have a mopt. */
++ if (!mopt)
++ return;
++
++ switch (mp_opt->sub) {
++ case MPTCP_SUB_CAPABLE:
++ {
++ const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
++
++ if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
++ opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
++ mptcp_debug("%s: mp_capable: bad option size %d\n",
++ __func__, opsize);
++ break;
++ }
++
++ if (!sysctl_mptcp_enabled)
++ break;
++
++ /* We only support MPTCP version 0 */
++ if (mpcapable->ver != 0)
++ break;
++
++ /* MPTCP-RFC 6824:
++ * "If receiving a message with the 'B' flag set to 1, and this
++ * is not understood, then this SYN MUST be silently ignored;
++ */
++ if (mpcapable->b) {
++ mopt->drop_me = 1;
++ break;
++ }
++
++ /* MPTCP-RFC 6824:
++ * "An implementation that only supports this method MUST set
++ * bit "H" to 1, and bits "C" through "G" to 0."
++ */
++ if (!mpcapable->h)
++ break;
++
++ mopt->saw_mpc = 1;
++ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
++
++ if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
++ mopt->mptcp_key = mpcapable->sender_key;
++
++ break;
++ }
++ case MPTCP_SUB_JOIN:
++ {
++ const struct mp_join *mpjoin = (struct mp_join *)ptr;
++
++ if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
++ opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
++ opsize != MPTCP_SUB_LEN_JOIN_ACK) {
++ mptcp_debug("%s: mp_join: bad option size %d\n",
++ __func__, opsize);
++ break;
++ }
++
++ /* saw_mpc must be set, because in tcp_check_req we assume that
++ * it is set to support falling back to reg. TCP if a rexmitted
++ * SYN has no MP_CAPABLE or MP_JOIN
++ */
++ switch (opsize) {
++ case MPTCP_SUB_LEN_JOIN_SYN:
++ mopt->is_mp_join = 1;
++ mopt->saw_mpc = 1;
++ mopt->low_prio = mpjoin->b;
++ mopt->rem_id = mpjoin->addr_id;
++ mopt->mptcp_rem_token = mpjoin->u.syn.token;
++ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
++ break;
++ case MPTCP_SUB_LEN_JOIN_SYNACK:
++ mopt->saw_mpc = 1;
++ mopt->low_prio = mpjoin->b;
++ mopt->rem_id = mpjoin->addr_id;
++ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
++ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
++ break;
++ case MPTCP_SUB_LEN_JOIN_ACK:
++ mopt->saw_mpc = 1;
++ mopt->join_ack = 1;
++ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
++ break;
++ }
++ break;
++ }
++ case MPTCP_SUB_DSS:
++ {
++ const struct mp_dss *mdss = (struct mp_dss *)ptr;
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
++
++ /* We check opsize for the csum and non-csum case. We do this,
++ * because the draft says that the csum SHOULD be ignored if
++ * it has not been negotiated in the MP_CAPABLE but still is
++ * present in the data.
++ *
++ * It will get ignored later in mptcp_queue_skb.
++ */
++ if (opsize != mptcp_sub_len_dss(mdss, 0) &&
++ opsize != mptcp_sub_len_dss(mdss, 1)) {
++ mptcp_debug("%s: mp_dss: bad option size %d\n",
++ __func__, opsize);
++ break;
++ }
++
++ ptr += 4;
++
++ if (mdss->A) {
++ tcb->mptcp_flags |= MPTCPHDR_ACK;
++
++ if (mdss->a) {
++ mopt->data_ack = (u32) get_unaligned_be64(ptr);
++ ptr += MPTCP_SUB_LEN_ACK_64;
++ } else {
++ mopt->data_ack = get_unaligned_be32(ptr);
++ ptr += MPTCP_SUB_LEN_ACK;
++ }
++ }
++
++ tcb->dss_off = (ptr - skb_transport_header(skb));
++
++ if (mdss->M) {
++ if (mdss->m) {
++ u64 data_seq64 = get_unaligned_be64(ptr);
++
++ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
++ mopt->data_seq = (u32) data_seq64;
++
++ ptr += 12; /* 64-bit dseq + subseq */
++ } else {
++ mopt->data_seq = get_unaligned_be32(ptr);
++ ptr += 8; /* 32-bit dseq + subseq */
++ }
++ mopt->data_len = get_unaligned_be16(ptr);
++
++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
++
++ /* Is a check-sum present? */
++ if (opsize == mptcp_sub_len_dss(mdss, 1))
++ tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
++
++ /* DATA_FIN only possible with DSS-mapping */
++ if (mdss->F)
++ tcb->mptcp_flags |= MPTCPHDR_FIN;
++ }
++
++ break;
++ }
++ case MPTCP_SUB_ADD_ADDR:
++ {
++#if IS_ENABLED(CONFIG_IPV6)
++ const struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
++
++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
++#else
++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
++#endif /* CONFIG_IPV6 */
++ mptcp_debug("%s: mp_add_addr: bad option size %d\n",
++ __func__, opsize);
++ break;
++ }
++
++ /* We have to manually parse the options if we got two of them. */
++ if (mopt->saw_add_addr) {
++ mopt->more_add_addr = 1;
++ break;
++ }
++ mopt->saw_add_addr = 1;
++ mopt->add_addr_ptr = ptr;
++ break;
++ }
++ case MPTCP_SUB_REMOVE_ADDR:
++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
++ mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
++ __func__, opsize);
++ break;
++ }
++
++ if (mopt->saw_rem_addr) {
++ mopt->more_rem_addr = 1;
++ break;
++ }
++ mopt->saw_rem_addr = 1;
++ mopt->rem_addr_ptr = ptr;
++ break;
++ case MPTCP_SUB_PRIO:
++ {
++ const struct mp_prio *mpprio = (struct mp_prio *)ptr;
++
++ if (opsize != MPTCP_SUB_LEN_PRIO &&
++ opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
++ mptcp_debug("%s: mp_prio: bad option size %d\n",
++ __func__, opsize);
++ break;
++ }
++
++ mopt->saw_low_prio = 1;
++ mopt->low_prio = mpprio->b;
++
++ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
++ mopt->saw_low_prio = 2;
++ mopt->prio_addr_id = mpprio->addr_id;
++ }
++ break;
++ }
++ case MPTCP_SUB_FAIL:
++ if (opsize != MPTCP_SUB_LEN_FAIL) {
++ mptcp_debug("%s: mp_fail: bad option size %d\n",
++ __func__, opsize);
++ break;
++ }
++ mopt->mp_fail = 1;
++ break;
++ case MPTCP_SUB_FCLOSE:
++ if (opsize != MPTCP_SUB_LEN_FCLOSE) {
++ mptcp_debug("%s: mp_fclose: bad option size %d\n",
++ __func__, opsize);
++ break;
++ }
++
++ mopt->mp_fclose = 1;
++ mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
++
++ break;
++ default:
++ mptcp_debug("%s: Received unkown subtype: %d\n",
++ __func__, mp_opt->sub);
++ break;
++ }
++}
++
++/** Parse only MPTCP options */
++void tcp_parse_mptcp_options(const struct sk_buff *skb,
++ struct mptcp_options_received *mopt)
++{
++ const struct tcphdr *th = tcp_hdr(skb);
++ int length = (th->doff * 4) - sizeof(struct tcphdr);
++ const unsigned char *ptr = (const unsigned char *)(th + 1);
++
++ while (length > 0) {
++ int opcode = *ptr++;
++ int opsize;
++
++ switch (opcode) {
++ case TCPOPT_EOL:
++ return;
++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
++ length--;
++ continue;
++ default:
++ opsize = *ptr++;
++ if (opsize < 2) /* "silly options" */
++ return;
++ if (opsize > length)
++ return; /* don't parse partial options */
++ if (opcode == TCPOPT_MPTCP)
++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
++ }
++ ptr += opsize - 2;
++ length -= opsize;
++ }
++}
++
++int mptcp_check_rtt(const struct tcp_sock *tp, int time)
++{
++ struct mptcp_cb *mpcb = tp->mpcb;
++ struct sock *sk;
++ u32 rtt_max = 0;
++
++ /* In MPTCP, we take the max delay across all flows,
++ * in order to take into account meta-reordering buffers.
++ */
++ mptcp_for_each_sk(mpcb, sk) {
++ if (!mptcp_sk_can_recv(sk))
++ continue;
++
++ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
++ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
++ }
++ if (time < (rtt_max >> 3) || !rtt_max)
++ return 1;
++
++ return 0;
++}
++
++static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
++{
++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++ __be16 port = 0;
++ union inet_addr addr;
++ sa_family_t family;
++
++ if (mpadd->ipver == 4) {
++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
++ port = mpadd->u.v4.port;
++ family = AF_INET;
++ addr.in = mpadd->u.v4.addr;
++#if IS_ENABLED(CONFIG_IPV6)
++ } else if (mpadd->ipver == 6) {
++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
++ port = mpadd->u.v6.port;
++ family = AF_INET6;
++ addr.in6 = mpadd->u.v6.addr;
++#endif /* CONFIG_IPV6 */
++ } else {
++ return;
++ }
++
++ if (mpcb->pm_ops->add_raddr)
++ mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id);
++}
++
++static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
++{
++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
++ int i;
++ u8 rem_id;
++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++
++ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
++ rem_id = (&mprem->addrs_id)[i];
++
++ if (mpcb->pm_ops->rem_raddr)
++ mpcb->pm_ops->rem_raddr(mpcb, rem_id);
++ mptcp_send_reset_rem_id(mpcb, rem_id);
++ }
++}
++
++static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
++{
++ struct tcphdr *th = tcp_hdr(skb);
++ unsigned char *ptr;
++ int length = (th->doff * 4) - sizeof(struct tcphdr);
++
++ /* Jump through the options to check whether ADD_ADDR is there */
++ ptr = (unsigned char *)(th + 1);
++ while (length > 0) {
++ int opcode = *ptr++;
++ int opsize;
++
++ switch (opcode) {
++ case TCPOPT_EOL:
++ return;
++ case TCPOPT_NOP:
++ length--;
++ continue;
++ default:
++ opsize = *ptr++;
++ if (opsize < 2)
++ return;
++ if (opsize > length)
++ return; /* don't parse partial options */
++ if (opcode == TCPOPT_MPTCP &&
++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
++#if IS_ENABLED(CONFIG_IPV6)
++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
++#else
++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
++#endif /* CONFIG_IPV6 */
++ goto cont;
++
++ mptcp_handle_add_addr(ptr, sk);
++ }
++ if (opcode == TCPOPT_MPTCP &&
++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
++ goto cont;
++
++ mptcp_handle_rem_addr(ptr, sk);
++ }
++cont:
++ ptr += opsize - 2;
++ length -= opsize;
++ }
++ }
++ return;
++}
++
++static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
++{
++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++
++ if (unlikely(mptcp->rx_opt.mp_fail)) {
++ mptcp->rx_opt.mp_fail = 0;
++
++ if (!th->rst && !mpcb->infinite_mapping_snd) {
++ struct sock *sk_it;
++
++ mpcb->send_infinite_mapping = 1;
++ /* We resend everything that has not been acknowledged */
++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
++
++ /* We artificially restart the whole send-queue. Thus,
++ * it is as if no packets are in flight
++ */
++ tcp_sk(meta_sk)->packets_out = 0;
++
++ /* If the snd_nxt already wrapped around, we have to
++ * undo the wrapping, as we are restarting from snd_una
++ * on.
++ */
++ if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
++ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
++ }
++ tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
++
++ /* Trigger a sending on the meta. */
++ mptcp_push_pending_frames(meta_sk);
++
++ mptcp_for_each_sk(mpcb, sk_it) {
++ if (sk != sk_it)
++ mptcp_sub_force_close(sk_it);
++ }
++ }
++
++ return 0;
++ }
++
++ if (unlikely(mptcp->rx_opt.mp_fclose)) {
++ struct sock *sk_it, *tmpsk;
++
++ mptcp->rx_opt.mp_fclose = 0;
++ if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
++ return 0;
++
++ if (tcp_need_reset(sk->sk_state))
++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
++
++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
++ mptcp_sub_force_close(sk_it);
++
++ tcp_reset(meta_sk);
++
++ return 1;
++ }
++
++ return 0;
++}
++
++static inline void mptcp_path_array_check(struct sock *meta_sk)
++{
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++
++ if (unlikely(mpcb->list_rcvd)) {
++ mpcb->list_rcvd = 0;
++ if (mpcb->pm_ops->new_remote_address)
++ mpcb->pm_ops->new_remote_address(meta_sk);
++ }
++}
++
++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
++ const struct sk_buff *skb)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
++
++ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
++ return 0;
++
++ if (mptcp_mp_fail_rcvd(sk, th))
++ return 1;
++
++ /* RFC 6824, Section 3.3:
++ * If a checksum is not present when its use has been negotiated, the
++ * receiver MUST close the subflow with a RST as it is considered broken.
++ */
++ if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
++ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
++ if (tcp_need_reset(sk->sk_state))
++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
++
++ mptcp_sub_force_close(sk);
++ return 1;
++ }
++
++ /* We have to acknowledge retransmissions of the third
++ * ack.
++ */
++ if (mopt->join_ack) {
++ tcp_send_delayed_ack(sk);
++ mopt->join_ack = 0;
++ }
++
++ if (mopt->saw_add_addr || mopt->saw_rem_addr) {
++ if (mopt->more_add_addr || mopt->more_rem_addr) {
++ mptcp_parse_addropt(skb, sk);
++ } else {
++ if (mopt->saw_add_addr)
++ mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
++ if (mopt->saw_rem_addr)
++ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
++ }
++
++ mopt->more_add_addr = 0;
++ mopt->saw_add_addr = 0;
++ mopt->more_rem_addr = 0;
++ mopt->saw_rem_addr = 0;
++ }
++ if (mopt->saw_low_prio) {
++ if (mopt->saw_low_prio == 1) {
++ tp->mptcp->rcv_low_prio = mopt->low_prio;
++ } else {
++ struct sock *sk_it;
++ mptcp_for_each_sk(tp->mpcb, sk_it) {
++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
++ if (mptcp->rem_id == mopt->prio_addr_id)
++ mptcp->rcv_low_prio = mopt->low_prio;
++ }
++ }
++ mopt->saw_low_prio = 0;
++ }
++
++ mptcp_data_ack(sk, skb);
++
++ mptcp_path_array_check(mptcp_meta_sk(sk));
++ /* Socket may have been mp_killed by a REMOVE_ADDR */
++ if (tp->mp_killed)
++ return 1;
++
++ return 0;
++}
++
++/* In case of fastopen, some data can already be in the write queue.
++ * We need to update the sequence number of the segments as they
++ * were initially TCP sequence numbers.
++ */
++static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
++ struct sk_buff *skb;
++ u32 new_mapping = meta_tp->write_seq - master_tp->snd_una;
++
++ /* There should only be one skb in write queue: the data not
++ * acknowledged in the SYN+ACK. In this case, we need to map
++ * this data to data sequence numbers.
++ */
++ skb_queue_walk(&meta_sk->sk_write_queue, skb) {
++ /* If the server only acknowledges partially the data sent in
++ * the SYN, we need to trim the acknowledged part because
++ * we don't want to retransmit this already received data.
++ * When we reach this point, tcp_ack() has already cleaned up
++ * fully acked segments. However, tcp trims partially acked
++ * segments only when retransmitting. Since MPTCP comes into
++ * play only now, we will fake an initial transmit, and
++ * retransmit_skb() will not be called. The following fragment
++ * comes from __tcp_retransmit_skb().
++ */
++ if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) {
++ BUG_ON(before(TCP_SKB_CB(skb)->end_seq,
++ master_tp->snd_una));
++ /* tcp_trim_head can only return -ENOMEM, and only if the skb is
++ * cloned. That is not the case here (see
++ * tcp_send_syn_data).
++ */
++ BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una -
++ TCP_SKB_CB(skb)->seq));
++ }
++
++ TCP_SKB_CB(skb)->seq += new_mapping;
++ TCP_SKB_CB(skb)->end_seq += new_mapping;
++ }
++
++ /* We can advance write_seq by the number of bytes unacknowledged
++ * and that were mapped in the previous loop.
++ */
++ meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una;
++
++ /* The packets from the master_sk will be reattached to it later.
++ * Until that time, its write queue is empty, and
++ * write_seq must align with snd_una.
++ */
++ master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una;
++ master_tp->packets_out = 0;
++
++ /* Although this data has already been sent over the subsk,
++ * it has never been sent over the meta_sk, so we rewind
++ * the send_head so that TCP treats it as an initial send
++ * (instead of a retransmit).
++ */
++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
++}
++
++/* The skptr is needed, because if we become MPTCP-capable, we have to switch
++ * from meta-socket to master-socket.
++ *
++ * @return: 1 - we want to reset this connection
++ * 2 - we want to discard the received syn/ack
++ * 0 - everything is fine - continue
++ */
++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
++ const struct sk_buff *skb,
++ const struct mptcp_options_received *mopt)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (mptcp(tp)) {
++ u8 hash_mac_check[20];
++ struct mptcp_cb *mpcb = tp->mpcb;
++
++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
++ (u8 *)&mpcb->mptcp_loc_key,
++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
++ (u32 *)hash_mac_check);
++ if (memcmp(hash_mac_check,
++ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
++ mptcp_sub_force_close(sk);
++ return 1;
++ }
++
++ /* Set this flag in order to postpone data sending
++ * until the 4th ack arrives.
++ */
++ tp->mptcp->pre_established = 1;
++ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
++
++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
++ (u8 *)&mpcb->mptcp_rem_key,
++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
++ (u32 *)&tp->mptcp->sender_mac[0]);
++
++ } else if (mopt->saw_mpc) {
++ struct sock *meta_sk = sk;
++
++ if (mptcp_create_master_sk(sk, mopt->mptcp_key,
++ ntohs(tcp_hdr(skb)->window)))
++ return 2;
++
++ sk = tcp_sk(sk)->mpcb->master_sk;
++ *skptr = sk;
++ tp = tcp_sk(sk);
++
++ /* If fastopen was used data might be in the send queue. We
++ * need to update their sequence number to MPTCP-level seqno.
++ * Note that it can happen in rare cases that fastopen_req is
++ * NULL and syn_data is 0 but fastopen indeed occurred and
++ * data has been queued in the write queue (but not sent).
++ * Example of such rare cases: connect is non-blocking and
++ * TFO is configured to work without cookies.
++ */
++ if (!skb_queue_empty(&meta_sk->sk_write_queue))
++ mptcp_rcv_synsent_fastopen(meta_sk);
++
++ /* -1, because the SYN consumed 1 byte. In case of TFO, we
++ * start the subflow-sequence number as if the data of the SYN
++ * is not part of any mapping.
++ */
++ tp->mptcp->snt_isn = tp->snd_una - 1;
++ tp->mpcb->dss_csum = mopt->dss_csum;
++ tp->mptcp->include_mpc = 1;
++
++ /* Ensure that fastopen is handled at the meta-level. */
++ tp->fastopen_req = NULL;
++
++ sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket);
++ sk->sk_wq = mptcp_meta_sk(sk)->sk_wq;
++
++ /* hold in sk_clone_lock due to initialization to 2 */
++ sock_put(sk);
++ } else {
++ tp->request_mptcp = 0;
++
++ if (tp->inside_tk_table)
++ mptcp_hash_remove(tp);
++ }
++
++ if (mptcp(tp))
++ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
++
++ return 0;
++}
++
++bool mptcp_should_expand_sndbuf(const struct sock *sk)
++{
++ const struct sock *sk_it;
++ const struct sock *meta_sk = mptcp_meta_sk(sk);
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ int cnt_backups = 0;
++ int backup_available = 0;
++
++ /* We circumvent this check in tcp_check_space, because we want to
++ * always call sk_write_space. So, we reproduce the check here.
++ */
++ if (!meta_sk->sk_socket ||
++ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
++ return false;
++
++ /* If the user specified a specific send buffer setting, do
++ * not modify it.
++ */
++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
++ return false;
++
++ /* If we are under global TCP memory pressure, do not expand. */
++ if (sk_under_memory_pressure(meta_sk))
++ return false;
++
++ /* If we are under soft global TCP memory pressure, do not expand. */
++ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
++ return false;
++
++
++ /* For MPTCP we look for a subsocket that could send data.
++ * If we found one, then we update the send-buffer.
++ */
++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
++ struct tcp_sock *tp_it = tcp_sk(sk_it);
++
++ if (!mptcp_sk_can_send(sk_it))
++ continue;
++
++ /* Backup-flows have to be counted - if there is no other
++ * subflow we take the backup-flow into account.
++ */
++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio)
++ cnt_backups++;
++
++ if (tp_it->packets_out < tp_it->snd_cwnd) {
++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
++ backup_available = 1;
++ continue;
++ }
++ return true;
++ }
++ }
++
++ /* Backup-flow is available for sending - update send-buffer */
++ if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available)
++ return true;
++ return false;
++}
++
++void mptcp_init_buffer_space(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ int space;
++
++ tcp_init_buffer_space(sk);
++
++ if (is_master_tp(tp)) {
++ meta_tp->rcvq_space.space = meta_tp->rcv_wnd;
++ meta_tp->rcvq_space.time = tcp_time_stamp;
++ meta_tp->rcvq_space.seq = meta_tp->copied_seq;
++
++ /* If there is only one subflow, we just use regular TCP
++ * autotuning. User-locks are handled already by
++ * tcp_init_buffer_space
++ */
++ meta_tp->window_clamp = tp->window_clamp;
++ meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
++ meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
++ meta_sk->sk_sndbuf = sk->sk_sndbuf;
++
++ return;
++ }
++
++ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK)
++ goto snd_buf;
++
++ /* Adding a new subflow to the rcv-buffer space. We make a simple
++ * addition, to give some space to allow traffic on the new subflow.
++ * Autotuning will increase it further later on.
++ */
++ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]);
++ if (space > meta_sk->sk_rcvbuf) {
++ meta_tp->window_clamp += tp->window_clamp;
++ meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
++ meta_sk->sk_rcvbuf = space;
++ }
++
++snd_buf:
++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
++ return;
++
++ /* Adding a new subflow to the send-buffer space. We make a simple
++ * addition, to give some space to allow traffic on the new subflow.
++ * Autotuning will increase it further later on.
++ */
++ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]);
++ if (space > meta_sk->sk_sndbuf) {
++ meta_sk->sk_sndbuf = space;
++ meta_sk->sk_write_space(meta_sk);
++ }
++}
++
++void mptcp_tcp_set_rto(struct sock *sk)
++{
++ tcp_set_rto(sk);
++ mptcp_set_rto(sk);
++}
+diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
+new file mode 100644
+index 000000000000..1183d1305d35
+--- /dev/null
++++ b/net/mptcp/mptcp_ipv4.c
+@@ -0,0 +1,483 @@
++/*
++ * MPTCP implementation - IPv4-specific functions
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/export.h>
++#include <linux/ip.h>
++#include <linux/list.h>
++#include <linux/skbuff.h>
++#include <linux/spinlock.h>
++#include <linux/tcp.h>
++
++#include <net/inet_common.h>
++#include <net/inet_connection_sock.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++#include <net/request_sock.h>
++#include <net/tcp.h>
++
++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
++{
++ u32 hash[MD5_DIGEST_WORDS];
++
++ hash[0] = (__force u32)saddr;
++ hash[1] = (__force u32)daddr;
++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
++ hash[3] = mptcp_seed++;
++
++ md5_transform(hash, mptcp_secret);
++
++ return hash[0];
++}
++
++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
++{
++ u32 hash[MD5_DIGEST_WORDS];
++
++ hash[0] = (__force u32)saddr;
++ hash[1] = (__force u32)daddr;
++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
++ hash[3] = mptcp_seed++;
++
++ md5_transform(hash, mptcp_secret);
++
++ return *((u64 *)hash);
++}
++
++
++static void mptcp_v4_reqsk_destructor(struct request_sock *req)
++{
++ mptcp_reqsk_destructor(req);
++
++ tcp_v4_reqsk_destructor(req);
++}
++
++static int mptcp_v4_init_req(struct request_sock *req, struct sock *sk,
++ struct sk_buff *skb)
++{
++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
++ mptcp_reqsk_init(req, skb);
++
++ return 0;
++}
++
++static int mptcp_v4_join_init_req(struct request_sock *req, struct sock *sk,
++ struct sk_buff *skb)
++{
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++ union inet_addr addr;
++ int loc_id;
++ bool low_prio = false;
++
++ /* We need to do this as early as possible. Because, if we fail later
++ * (e.g., get_local_id), then reqsk_free tries to remove the
++ * request-socket from the htb in mptcp_hash_request_remove as pprev
++ * may be different from NULL.
++ */
++ mtreq->hash_entry.pprev = NULL;
++
++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
++
++ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
++ ip_hdr(skb)->daddr,
++ tcp_hdr(skb)->source,
++ tcp_hdr(skb)->dest);
++ addr.ip = inet_rsk(req)->ir_loc_addr;
++ loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(sk), &low_prio);
++ if (loc_id == -1)
++ return -1;
++ mtreq->loc_id = loc_id;
++ mtreq->low_prio = low_prio;
++
++ mptcp_join_reqsk_init(mpcb, req, skb);
++
++ return 0;
++}
++
++/* Similar to tcp_request_sock_ops */
++struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
++ .family = PF_INET,
++ .obj_size = sizeof(struct mptcp_request_sock),
++ .rtx_syn_ack = tcp_rtx_synack,
++ .send_ack = tcp_v4_reqsk_send_ack,
++ .destructor = mptcp_v4_reqsk_destructor,
++ .send_reset = tcp_v4_send_reset,
++ .syn_ack_timeout = tcp_syn_ack_timeout,
++};
++
++static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
++ struct request_sock *req,
++ const unsigned long timeout)
++{
++ const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
++ inet_rsk(req)->ir_rmt_port,
++ 0, MPTCP_HASH_SIZE);
++ /* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not
++ * want to reset the keepalive-timer (responsible for retransmitting
++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
++ * overload the keepalive timer. Also, it's not a big deal, because the
++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
++ * if the third ACK gets lost, the client will handle the retransmission
++ * anyways. If our SYN/ACK gets lost, the client will retransmit the
++ * SYN.
++ */
++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
++ const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
++ inet_rsk(req)->ir_rmt_port,
++ lopt->hash_rnd, lopt->nr_table_entries);
++
++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
++ mptcp_reset_synack_timer(meta_sk, timeout);
++
++ rcu_read_lock();
++ spin_lock(&mptcp_reqsk_hlock);
++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
++ spin_unlock(&mptcp_reqsk_hlock);
++ rcu_read_unlock();
++}
++
++/* Similar to tcp_v4_conn_request */
++static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
++{
++ return tcp_conn_request(&mptcp_request_sock_ops,
++ &mptcp_join_request_sock_ipv4_ops,
++ meta_sk, skb);
++}
++
++/* We only process join requests here. (either the SYN or the final ACK) */
++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct sock *child, *rsk = NULL;
++ int ret;
++
++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
++ struct tcphdr *th = tcp_hdr(skb);
++ const struct iphdr *iph = ip_hdr(skb);
++ struct sock *sk;
++
++ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
++ iph->saddr, th->source, iph->daddr,
++ th->dest, inet_iif(skb));
++
++ if (!sk) {
++ kfree_skb(skb);
++ return 0;
++ }
++ if (is_meta_sk(sk)) {
++ WARN("%s Did not find a sub-sk - did found the meta!\n", __func__);
++ kfree_skb(skb);
++ sock_put(sk);
++ return 0;
++ }
++
++ if (sk->sk_state == TCP_TIME_WAIT) {
++ inet_twsk_put(inet_twsk(sk));
++ kfree_skb(skb);
++ return 0;
++ }
++
++ ret = tcp_v4_do_rcv(sk, skb);
++ sock_put(sk);
++
++ return ret;
++ }
++ TCP_SKB_CB(skb)->mptcp_flags = 0;
++
++ /* Has been removed from the tk-table. Thus, no new subflows.
++ *
++ * Check for close-state is necessary, because we may have been closed
++ * without passing by mptcp_close().
++ *
++ * When falling back, no new subflows are allowed either.
++ */
++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
++ goto reset_and_discard;
++
++ child = tcp_v4_hnd_req(meta_sk, skb);
++
++ if (!child)
++ goto discard;
++
++ if (child != meta_sk) {
++ sock_rps_save_rxhash(child, skb);
++ /* We don't call tcp_child_process here, because we hold
++ * already the meta-sk-lock and are sure that it is not owned
++ * by the user.
++ */
++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
++ bh_unlock_sock(child);
++ sock_put(child);
++ if (ret) {
++ rsk = child;
++ goto reset_and_discard;
++ }
++ } else {
++ if (tcp_hdr(skb)->syn) {
++ mptcp_v4_join_request(meta_sk, skb);
++ goto discard;
++ }
++ goto reset_and_discard;
++ }
++ return 0;
++
++reset_and_discard:
++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
++ const struct tcphdr *th = tcp_hdr(skb);
++ const struct iphdr *iph = ip_hdr(skb);
++ struct request_sock **prev, *req;
++ /* If we end up here, it means we should not have matched on the
++ * request-socket. But, because the request-sock queue is only
++ * destroyed in mptcp_close, the socket may actually already be
++ * in close-state (e.g., through shutdown()) while still having
++ * pending request sockets.
++ */
++ req = inet_csk_search_req(meta_sk, &prev, th->source,
++ iph->saddr, iph->daddr);
++ if (req) {
++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
++ req);
++ reqsk_free(req);
++ }
++ }
++
++ tcp_v4_send_reset(rsk, skb);
++discard:
++ kfree_skb(skb);
++ return 0;
++}
++
++/* After this, the ref count of the meta_sk associated with the request_sock
++ * is incremented. Thus it is the responsibility of the caller
++ * to call sock_put() when the reference is not needed anymore.
++ */
++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
++ const __be32 laddr, const struct net *net)
++{
++ const struct mptcp_request_sock *mtreq;
++ struct sock *meta_sk = NULL;
++ const struct hlist_nulls_node *node;
++ const u32 hash = inet_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
++
++ rcu_read_lock();
++begin:
++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
++ hash_entry) {
++ struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
++
++ if (ireq->ir_rmt_port == rport &&
++ ireq->ir_rmt_addr == raddr &&
++ ireq->ir_loc_addr == laddr &&
++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
++ net_eq(net, sock_net(meta_sk)))
++ goto found;
++ meta_sk = NULL;
++ }
++ /* A request-socket is destroyed by RCU. So, it might have been recycled
++ * and put into another hash-table list. So, after the lookup we may
++ * end up in a different list. So, we may need to restart.
++ *
++ * See also the comment in __inet_lookup_established.
++ */
++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
++ goto begin;
++
++found:
++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
++ meta_sk = NULL;
++ rcu_read_unlock();
++
++ return meta_sk;
++}
++
++/* Create a new IPv4 subflow.
++ *
++ * We are in user-context and meta-sock-lock is hold.
++ */
++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
++ struct mptcp_rem4 *rem)
++{
++ struct tcp_sock *tp;
++ struct sock *sk;
++ struct sockaddr_in loc_in, rem_in;
++ struct socket sock;
++ int ret;
++
++ /** First, create and prepare the new socket */
++
++ sock.type = meta_sk->sk_socket->type;
++ sock.state = SS_UNCONNECTED;
++ sock.wq = meta_sk->sk_socket->wq;
++ sock.file = meta_sk->sk_socket->file;
++ sock.ops = NULL;
++
++ ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
++ if (unlikely(ret < 0)) {
++ mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
++ return ret;
++ }
++
++ sk = sock.sk;
++ tp = tcp_sk(sk);
++
++ /* All subsockets need the MPTCP-lock-class */
++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
++
++ if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL))
++ goto error;
++
++ tp->mptcp->slave_sk = 1;
++ tp->mptcp->low_prio = loc->low_prio;
++
++ /* Initializing the timer for an MPTCP subflow */
++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
++
++ /** Then, connect the socket to the peer */
++ loc_in.sin_family = AF_INET;
++ rem_in.sin_family = AF_INET;
++ loc_in.sin_port = 0;
++ if (rem->port)
++ rem_in.sin_port = rem->port;
++ else
++ rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
++ loc_in.sin_addr = loc->addr;
++ rem_in.sin_addr = rem->addr;
++
++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in));
++ if (ret < 0) {
++ mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
++ __func__, ret);
++ goto error;
++ }
++
++ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
++ tp->mptcp->path_index, &loc_in.sin_addr,
++ ntohs(loc_in.sin_port), &rem_in.sin_addr,
++ ntohs(rem_in.sin_port));
++
++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
++
++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
++ sizeof(struct sockaddr_in), O_NONBLOCK);
++ if (ret < 0 && ret != -EINPROGRESS) {
++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
++ __func__, ret);
++ goto error;
++ }
++
++ sk_set_socket(sk, meta_sk->sk_socket);
++ sk->sk_wq = meta_sk->sk_wq;
++
++ return 0;
++
++error:
++ /* May happen if mptcp_add_sock fails first */
++ if (!mptcp(tp)) {
++ tcp_close(sk, 0);
++ } else {
++ local_bh_disable();
++ mptcp_sub_force_close(sk);
++ local_bh_enable();
++ }
++ return ret;
++}
++EXPORT_SYMBOL(mptcp_init4_subsockets);
++
++const struct inet_connection_sock_af_ops mptcp_v4_specific = {
++ .queue_xmit = ip_queue_xmit,
++ .send_check = tcp_v4_send_check,
++ .rebuild_header = inet_sk_rebuild_header,
++ .sk_rx_dst_set = inet_sk_rx_dst_set,
++ .conn_request = mptcp_conn_request,
++ .syn_recv_sock = tcp_v4_syn_recv_sock,
++ .net_header_len = sizeof(struct iphdr),
++ .setsockopt = ip_setsockopt,
++ .getsockopt = ip_getsockopt,
++ .addr2sockaddr = inet_csk_addr2sockaddr,
++ .sockaddr_len = sizeof(struct sockaddr_in),
++ .bind_conflict = inet_csk_bind_conflict,
++#ifdef CONFIG_COMPAT
++ .compat_setsockopt = compat_ip_setsockopt,
++ .compat_getsockopt = compat_ip_getsockopt,
++#endif
++};
++
++struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
++struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
++
++/* General initialization of IPv4 for MPTCP */
++int mptcp_pm_v4_init(void)
++{
++ int ret = 0;
++ struct request_sock_ops *ops = &mptcp_request_sock_ops;
++
++ mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
++ mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
++
++ mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
++ mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
++ mptcp_join_request_sock_ipv4_ops.queue_hash_add = mptcp_v4_reqsk_queue_hash_add;
++
++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
++ if (ops->slab_name == NULL) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
++ NULL);
++
++ if (ops->slab == NULL) {
++ ret = -ENOMEM;
++ goto err_reqsk_create;
++ }
++
++out:
++ return ret;
++
++err_reqsk_create:
++ kfree(ops->slab_name);
++ ops->slab_name = NULL;
++ goto out;
++}
++
++void mptcp_pm_v4_undo(void)
++{
++ kmem_cache_destroy(mptcp_request_sock_ops.slab);
++ kfree(mptcp_request_sock_ops.slab_name);
++}
+diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
+new file mode 100644
+index 000000000000..1036973aa855
+--- /dev/null
++++ b/net/mptcp/mptcp_ipv6.c
+@@ -0,0 +1,518 @@
++/*
++ * MPTCP implementation - IPv6-specific functions
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/export.h>
++#include <linux/in6.h>
++#include <linux/kernel.h>
++
++#include <net/addrconf.h>
++#include <net/flow.h>
++#include <net/inet6_connection_sock.h>
++#include <net/inet6_hashtables.h>
++#include <net/inet_common.h>
++#include <net/ipv6.h>
++#include <net/ip6_checksum.h>
++#include <net/ip6_route.h>
++#include <net/mptcp.h>
++#include <net/mptcp_v6.h>
++#include <net/tcp.h>
++#include <net/transp_v6.h>
++
++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
++ __be16 sport, __be16 dport)
++{
++ u32 secret[MD5_MESSAGE_BYTES / 4];
++ u32 hash[MD5_DIGEST_WORDS];
++ u32 i;
++
++ memcpy(hash, saddr, 16);
++ for (i = 0; i < 4; i++)
++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
++ secret[4] = mptcp_secret[4] +
++ (((__force u16)sport << 16) + (__force u16)dport);
++ secret[5] = mptcp_seed++;
++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
++ secret[i] = mptcp_secret[i];
++
++ md5_transform(hash, secret);
++
++ return hash[0];
++}
++
++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
++ __be16 sport, __be16 dport)
++{
++ u32 secret[MD5_MESSAGE_BYTES / 4];
++ u32 hash[MD5_DIGEST_WORDS];
++ u32 i;
++
++ memcpy(hash, saddr, 16);
++ for (i = 0; i < 4; i++)
++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
++ secret[4] = mptcp_secret[4] +
++ (((__force u16)sport << 16) + (__force u16)dport);
++ secret[5] = mptcp_seed++;
++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
++ secret[i] = mptcp_secret[i];
++
++ md5_transform(hash, secret);
++
++ return *((u64 *)hash);
++}
++
++static void mptcp_v6_reqsk_destructor(struct request_sock *req)
++{
++ mptcp_reqsk_destructor(req);
++
++ tcp_v6_reqsk_destructor(req);
++}
++
++static int mptcp_v6_init_req(struct request_sock *req, struct sock *sk,
++ struct sk_buff *skb)
++{
++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
++ mptcp_reqsk_init(req, skb);
++
++ return 0;
++}
++
++static int mptcp_v6_join_init_req(struct request_sock *req, struct sock *sk,
++ struct sk_buff *skb)
++{
++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++ union inet_addr addr;
++ int loc_id;
++ bool low_prio = false;
++
++ /* We need to do this as early as possible. Because, if we fail later
++ * (e.g., get_local_id), then reqsk_free tries to remove the
++ * request-socket from the htb in mptcp_hash_request_remove as pprev
++ * may be different from NULL.
++ */
++ mtreq->hash_entry.pprev = NULL;
++
++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
++
++ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
++ ipv6_hdr(skb)->daddr.s6_addr32,
++ tcp_hdr(skb)->source,
++ tcp_hdr(skb)->dest);
++ addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
++ loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(sk), &low_prio);
++ if (loc_id == -1)
++ return -1;
++ mtreq->loc_id = loc_id;
++ mtreq->low_prio = low_prio;
++
++ mptcp_join_reqsk_init(mpcb, req, skb);
++
++ return 0;
++}
++
++/* Similar to tcp6_request_sock_ops */
++struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
++ .family = AF_INET6,
++ .obj_size = sizeof(struct mptcp_request_sock),
++ .rtx_syn_ack = tcp_v6_rtx_synack,
++ .send_ack = tcp_v6_reqsk_send_ack,
++ .destructor = mptcp_v6_reqsk_destructor,
++ .send_reset = tcp_v6_send_reset,
++ .syn_ack_timeout = tcp_syn_ack_timeout,
++};
++
++static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
++ struct request_sock *req,
++ const unsigned long timeout)
++{
++ const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
++ inet_rsk(req)->ir_rmt_port,
++ 0, MPTCP_HASH_SIZE);
++ /* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not
++ * want to reset the keepalive-timer (responsible for retransmitting
++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
++ * overload the keepalive timer. Also, it's not a big deal, because the
++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
++ * if the third ACK gets lost, the client will handle the retransmission
++ * anyways. If our SYN/ACK gets lost, the client will retransmit the
++ * SYN.
++ */
++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
++ const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
++ inet_rsk(req)->ir_rmt_port,
++ lopt->hash_rnd, lopt->nr_table_entries);
++
++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
++ mptcp_reset_synack_timer(meta_sk, timeout);
++
++ rcu_read_lock();
++ spin_lock(&mptcp_reqsk_hlock);
++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
++ spin_unlock(&mptcp_reqsk_hlock);
++ rcu_read_unlock();
++}
++
++static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
++{
++ return tcp_conn_request(&mptcp6_request_sock_ops,
++ &mptcp_join_request_sock_ipv6_ops,
++ meta_sk, skb);
++}
++
++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct sock *child, *rsk = NULL;
++ int ret;
++
++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
++ struct tcphdr *th = tcp_hdr(skb);
++ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
++ struct sock *sk;
++
++ sk = __inet6_lookup_established(sock_net(meta_sk),
++ &tcp_hashinfo,
++ &ip6h->saddr, th->source,
++ &ip6h->daddr, ntohs(th->dest),
++ inet6_iif(skb));
++
++ if (!sk) {
++ kfree_skb(skb);
++ return 0;
++ }
++ if (is_meta_sk(sk)) {
++ WARN("%s Did not find a sub-sk!\n", __func__);
++ kfree_skb(skb);
++ sock_put(sk);
++ return 0;
++ }
++
++ if (sk->sk_state == TCP_TIME_WAIT) {
++ inet_twsk_put(inet_twsk(sk));
++ kfree_skb(skb);
++ return 0;
++ }
++
++ ret = tcp_v6_do_rcv(sk, skb);
++ sock_put(sk);
++
++ return ret;
++ }
++ TCP_SKB_CB(skb)->mptcp_flags = 0;
++
++ /* Has been removed from the tk-table. Thus, no new subflows.
++ *
++ * Check for close-state is necessary, because we may have been closed
++ * without passing by mptcp_close().
++ *
++ * When falling back, no new subflows are allowed either.
++ */
++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
++ goto reset_and_discard;
++
++ child = tcp_v6_hnd_req(meta_sk, skb);
++
++ if (!child)
++ goto discard;
++
++ if (child != meta_sk) {
++ sock_rps_save_rxhash(child, skb);
++ /* We don't call tcp_child_process here, because we hold
++ * already the meta-sk-lock and are sure that it is not owned
++ * by the user.
++ */
++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
++ bh_unlock_sock(child);
++ sock_put(child);
++ if (ret) {
++ rsk = child;
++ goto reset_and_discard;
++ }
++ } else {
++ if (tcp_hdr(skb)->syn) {
++ mptcp_v6_join_request(meta_sk, skb);
++ goto discard;
++ }
++ goto reset_and_discard;
++ }
++ return 0;
++
++reset_and_discard:
++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
++ const struct tcphdr *th = tcp_hdr(skb);
++ struct request_sock **prev, *req;
++ /* If we end up here, it means we should not have matched on the
++ * request-socket. But, because the request-sock queue is only
++ * destroyed in mptcp_close, the socket may actually already be
++ * in close-state (e.g., through shutdown()) while still having
++ * pending request sockets.
++ */
++ req = inet6_csk_search_req(meta_sk, &prev, th->source,
++ &ipv6_hdr(skb)->saddr,
++ &ipv6_hdr(skb)->daddr, inet6_iif(skb));
++ if (req) {
++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
++ req);
++ reqsk_free(req);
++ }
++ }
++
++ tcp_v6_send_reset(rsk, skb);
++discard:
++ kfree_skb(skb);
++ return 0;
++}
++
++/* After this, the ref count of the meta_sk associated with the request_sock
++ * is incremented. Thus it is the responsibility of the caller
++ * to call sock_put() when the reference is not needed anymore.
++ */
++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
++ const struct in6_addr *laddr, const struct net *net)
++{
++ const struct mptcp_request_sock *mtreq;
++ struct sock *meta_sk = NULL;
++ const struct hlist_nulls_node *node;
++ const u32 hash = inet6_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
++
++ rcu_read_lock();
++begin:
++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
++ hash_entry) {
++ struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq));
++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
++
++ if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport &&
++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
++ ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) &&
++ ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) &&
++ net_eq(net, sock_net(meta_sk)))
++ goto found;
++ meta_sk = NULL;
++ }
++ /* A request-socket is destroyed by RCU. So, it might have been recycled
++ * and put into another hash-table list. So, after the lookup we may
++ * end up in a different list. So, we may need to restart.
++ *
++ * See also the comment in __inet_lookup_established.
++ */
++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
++ goto begin;
++
++found:
++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
++ meta_sk = NULL;
++ rcu_read_unlock();
++
++ return meta_sk;
++}
++
++/* Create a new IPv6 subflow.
++ *
++ * We are in user-context and meta-sock-lock is hold.
++ */
++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
++ struct mptcp_rem6 *rem)
++{
++ struct tcp_sock *tp;
++ struct sock *sk;
++ struct sockaddr_in6 loc_in, rem_in;
++ struct socket sock;
++ int ret;
++
++ /** First, create and prepare the new socket */
++
++ sock.type = meta_sk->sk_socket->type;
++ sock.state = SS_UNCONNECTED;
++ sock.wq = meta_sk->sk_socket->wq;
++ sock.file = meta_sk->sk_socket->file;
++ sock.ops = NULL;
++
++ ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
++ if (unlikely(ret < 0)) {
++ mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
++ return ret;
++ }
++
++ sk = sock.sk;
++ tp = tcp_sk(sk);
++
++ /* All subsockets need the MPTCP-lock-class */
++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
++
++ if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL))
++ goto error;
++
++ tp->mptcp->slave_sk = 1;
++ tp->mptcp->low_prio = loc->low_prio;
++
++ /* Initializing the timer for an MPTCP subflow */
++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
++
++ /** Then, connect the socket to the peer */
++ loc_in.sin6_family = AF_INET6;
++ rem_in.sin6_family = AF_INET6;
++ loc_in.sin6_port = 0;
++ if (rem->port)
++ rem_in.sin6_port = rem->port;
++ else
++ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
++ loc_in.sin6_addr = loc->addr;
++ rem_in.sin6_addr = rem->addr;
++
++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in6));
++ if (ret < 0) {
++ mptcp_debug("%s: MPTCP subsocket bind()failed, error %d\n",
++ __func__, ret);
++ goto error;
++ }
++
++ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
++ tp->mptcp->path_index, &loc_in.sin6_addr,
++ ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
++ ntohs(rem_in.sin6_port));
++
++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
++
++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
++ sizeof(struct sockaddr_in6), O_NONBLOCK);
++ if (ret < 0 && ret != -EINPROGRESS) {
++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
++ __func__, ret);
++ goto error;
++ }
++
++ sk_set_socket(sk, meta_sk->sk_socket);
++ sk->sk_wq = meta_sk->sk_wq;
++
++ return 0;
++
++error:
++ /* May happen if mptcp_add_sock fails first */
++ if (!mptcp(tp)) {
++ tcp_close(sk, 0);
++ } else {
++ local_bh_disable();
++ mptcp_sub_force_close(sk);
++ local_bh_enable();
++ }
++ return ret;
++}
++EXPORT_SYMBOL(mptcp_init6_subsockets);
++
++const struct inet_connection_sock_af_ops mptcp_v6_specific = {
++ .queue_xmit = inet6_csk_xmit,
++ .send_check = tcp_v6_send_check,
++ .rebuild_header = inet6_sk_rebuild_header,
++ .sk_rx_dst_set = inet6_sk_rx_dst_set,
++ .conn_request = mptcp_conn_request,
++ .syn_recv_sock = tcp_v6_syn_recv_sock,
++ .net_header_len = sizeof(struct ipv6hdr),
++ .net_frag_header_len = sizeof(struct frag_hdr),
++ .setsockopt = ipv6_setsockopt,
++ .getsockopt = ipv6_getsockopt,
++ .addr2sockaddr = inet6_csk_addr2sockaddr,
++ .sockaddr_len = sizeof(struct sockaddr_in6),
++ .bind_conflict = inet6_csk_bind_conflict,
++#ifdef CONFIG_COMPAT
++ .compat_setsockopt = compat_ipv6_setsockopt,
++ .compat_getsockopt = compat_ipv6_getsockopt,
++#endif
++};
++
++const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
++ .queue_xmit = ip_queue_xmit,
++ .send_check = tcp_v4_send_check,
++ .rebuild_header = inet_sk_rebuild_header,
++ .sk_rx_dst_set = inet_sk_rx_dst_set,
++ .conn_request = mptcp_conn_request,
++ .syn_recv_sock = tcp_v6_syn_recv_sock,
++ .net_header_len = sizeof(struct iphdr),
++ .setsockopt = ipv6_setsockopt,
++ .getsockopt = ipv6_getsockopt,
++ .addr2sockaddr = inet6_csk_addr2sockaddr,
++ .sockaddr_len = sizeof(struct sockaddr_in6),
++ .bind_conflict = inet6_csk_bind_conflict,
++#ifdef CONFIG_COMPAT
++ .compat_setsockopt = compat_ipv6_setsockopt,
++ .compat_getsockopt = compat_ipv6_getsockopt,
++#endif
++};
++
++struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
++struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
++
++int mptcp_pm_v6_init(void)
++{
++ int ret = 0;
++ struct request_sock_ops *ops = &mptcp6_request_sock_ops;
++
++ mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
++ mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
++
++ mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
++ mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
++ mptcp_join_request_sock_ipv6_ops.queue_hash_add = mptcp_v6_reqsk_queue_hash_add;
++
++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
++ if (ops->slab_name == NULL) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
++ NULL);
++
++ if (ops->slab == NULL) {
++ ret = -ENOMEM;
++ goto err_reqsk_create;
++ }
++
++out:
++ return ret;
++
++err_reqsk_create:
++ kfree(ops->slab_name);
++ ops->slab_name = NULL;
++ goto out;
++}
++
++void mptcp_pm_v6_undo(void)
++{
++ kmem_cache_destroy(mptcp6_request_sock_ops.slab);
++ kfree(mptcp6_request_sock_ops.slab_name);
++}
+diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
+new file mode 100644
+index 000000000000..6f5087983175
+--- /dev/null
++++ b/net/mptcp/mptcp_ndiffports.c
+@@ -0,0 +1,161 @@
++#include <linux/module.h>
++
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++
++#if IS_ENABLED(CONFIG_IPV6)
++#include <net/mptcp_v6.h>
++#endif
++
++struct ndiffports_priv {
++ /* Worker struct for subflow establishment */
++ struct work_struct subflow_work;
++
++ struct mptcp_cb *mpcb;
++};
++
++static int num_subflows __read_mostly = 2;
++module_param(num_subflows, int, 0644);
++MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
++
++/**
++ * Create all new subflows, by doing calls to mptcp_initX_subsockets
++ *
++ * This function uses a goto next_subflow, to allow releasing the lock between
++ * new subflows and giving other processes a chance to do some work on the
++ * socket and potentially finishing the communication.
++ **/
++static void create_subflow_worker(struct work_struct *work)
++{
++ const struct ndiffports_priv *pm_priv = container_of(work,
++ struct ndiffports_priv,
++ subflow_work);
++ struct mptcp_cb *mpcb = pm_priv->mpcb;
++ struct sock *meta_sk = mpcb->meta_sk;
++ int iter = 0;
++
++next_subflow:
++ if (iter) {
++ release_sock(meta_sk);
++ mutex_unlock(&mpcb->mpcb_mutex);
++
++ cond_resched();
++ }
++ mutex_lock(&mpcb->mpcb_mutex);
++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
++
++ iter++;
++
++ if (sock_flag(meta_sk, SOCK_DEAD))
++ goto exit;
++
++ if (mpcb->master_sk &&
++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
++ goto exit;
++
++ if (num_subflows > iter && num_subflows > mpcb->cnt_subflows) {
++ if (meta_sk->sk_family == AF_INET ||
++ mptcp_v6_is_v4_mapped(meta_sk)) {
++ struct mptcp_loc4 loc;
++ struct mptcp_rem4 rem;
++
++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
++ loc.loc4_id = 0;
++ loc.low_prio = 0;
++
++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
++ rem.port = inet_sk(meta_sk)->inet_dport;
++ rem.rem4_id = 0; /* Default 0 */
++
++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
++ } else {
++#if IS_ENABLED(CONFIG_IPV6)
++ struct mptcp_loc6 loc;
++ struct mptcp_rem6 rem;
++
++ loc.addr = inet6_sk(meta_sk)->saddr;
++ loc.loc6_id = 0;
++ loc.low_prio = 0;
++
++ rem.addr = meta_sk->sk_v6_daddr;
++ rem.port = inet_sk(meta_sk)->inet_dport;
++ rem.rem6_id = 0; /* Default 0 */
++
++ mptcp_init6_subsockets(meta_sk, &loc, &rem);
++#endif
++ }
++ goto next_subflow;
++ }
++
++exit:
++ release_sock(meta_sk);
++ mutex_unlock(&mpcb->mpcb_mutex);
++ sock_put(meta_sk);
++}
++
++static void ndiffports_new_session(const struct sock *meta_sk)
++{
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
++
++ /* Initialize workqueue-struct */
++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
++ fmp->mpcb = mpcb;
++}
++
++static void ndiffports_create_subflows(struct sock *meta_sk)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
++
++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
++ mpcb->send_infinite_mapping ||
++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
++ return;
++
++ if (!work_pending(&pm_priv->subflow_work)) {
++ sock_hold(meta_sk);
++ queue_work(mptcp_wq, &pm_priv->subflow_work);
++ }
++}
++
++static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr,
++ struct net *net, bool *low_prio)
++{
++ return 0;
++}
++
++static struct mptcp_pm_ops ndiffports __read_mostly = {
++ .new_session = ndiffports_new_session,
++ .fully_established = ndiffports_create_subflows,
++ .get_local_id = ndiffports_get_local_id,
++ .name = "ndiffports",
++ .owner = THIS_MODULE,
++};
++
++/* General initialization of MPTCP_PM */
++static int __init ndiffports_register(void)
++{
++ BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
++
++ if (mptcp_register_path_manager(&ndiffports))
++ goto exit;
++
++ return 0;
++
++exit:
++ return -1;
++}
++
++static void ndiffports_unregister(void)
++{
++ mptcp_unregister_path_manager(&ndiffports);
++}
++
++module_init(ndiffports_register);
++module_exit(ndiffports_unregister);
++
++MODULE_AUTHOR("Christoph Paasch");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
++MODULE_VERSION("0.88");
+diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c
+new file mode 100644
+index 000000000000..ec4e98622637
+--- /dev/null
++++ b/net/mptcp/mptcp_ofo_queue.c
+@@ -0,0 +1,295 @@
++/*
++ * MPTCP implementation - Fast algorithm for MPTCP meta-reordering
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/skbuff.h>
++#include <linux/slab.h>
++#include <net/tcp.h>
++#include <net/mptcp.h>
++
++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
++ const struct sk_buff *skb)
++{
++ struct tcp_sock *tp;
++
++ mptcp_for_each_tp(mpcb, tp) {
++ if (tp->mptcp->shortcut_ofoqueue == skb) {
++ tp->mptcp->shortcut_ofoqueue = NULL;
++ return;
++ }
++ }
++}
++
++/* Does 'skb' fit after 'here' in the queue 'head'?
++ * If yes, we queue it and return 1.
++ */
++static int mptcp_ofo_queue_after(struct sk_buff_head *head,
++ struct sk_buff *skb, struct sk_buff *here,
++ const struct tcp_sock *tp)
++{
++ struct sock *meta_sk = tp->meta_sk;
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ u32 seq = TCP_SKB_CB(skb)->seq;
++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
++
++ /* We want to queue skb after here, thus seq >= end_seq */
++ if (before(seq, TCP_SKB_CB(here)->end_seq))
++ return 0;
++
++ if (seq == TCP_SKB_CB(here)->end_seq) {
++ bool fragstolen = false;
++
++ if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
++ __skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
++ return 1;
++ } else {
++ kfree_skb_partial(skb, fragstolen);
++ return -1;
++ }
++ }
++
++ /* If here is the last one, we can always queue it */
++ if (skb_queue_is_last(head, here)) {
++ __skb_queue_after(head, here, skb);
++ return 1;
++ } else {
++ struct sk_buff *skb1 = skb_queue_next(head, here);
++ /* It's not the last one, but does it fits between 'here' and
++ * the one after 'here' ? Thus, does end_seq <= after_here->seq
++ */
++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
++ __skb_queue_after(head, here, skb);
++ return 1;
++ }
++ }
++
++ return 0;
++}
++
++static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
++ struct sk_buff_head *head, struct tcp_sock *tp)
++{
++ struct sock *meta_sk = tp->meta_sk;
++ struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ struct sk_buff *skb1, *best_shortcut = NULL;
++ u32 seq = TCP_SKB_CB(skb)->seq;
++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
++ u32 distance = 0xffffffff;
++
++ /* First, check the tp's shortcut */
++ if (!shortcut) {
++ if (skb_queue_empty(head)) {
++ __skb_queue_head(head, skb);
++ goto end;
++ }
++ } else {
++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
++		/* Is the tp's shortcut a hit? If yes, we insert. */
++
++ if (ret) {
++ skb = (ret > 0) ? skb : NULL;
++ goto end;
++ }
++ }
++
++ /* Check the shortcuts of the other subsockets. */
++ mptcp_for_each_tp(mpcb, tp_it) {
++ shortcut = tp_it->mptcp->shortcut_ofoqueue;
++ /* Can we queue it here? If yes, do so! */
++ if (shortcut) {
++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
++
++ if (ret) {
++ skb = (ret > 0) ? skb : NULL;
++ goto end;
++ }
++ }
++
++ /* Could not queue it, check if we are close.
++ * We are looking for a shortcut, close enough to seq to
++ * set skb1 prematurely and thus improve the subsequent lookup,
++ * which tries to find a skb1 so that skb1->seq <= seq.
++ *
++ * So, here we only take shortcuts, whose shortcut->seq > seq,
++ * and minimize the distance between shortcut->seq and seq and
++ * set best_shortcut to this one with the minimal distance.
++ *
++ * That way, the subsequent while-loop is shortest.
++ */
++ if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
++ /* Are we closer than the current best shortcut? */
++ if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) {
++ distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq);
++ best_shortcut = shortcut;
++ }
++ }
++ }
++
++ if (best_shortcut)
++ skb1 = best_shortcut;
++ else
++ skb1 = skb_peek_tail(head);
++
++ if (seq == TCP_SKB_CB(skb1)->end_seq) {
++ bool fragstolen = false;
++
++ if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
++ __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
++ } else {
++ kfree_skb_partial(skb, fragstolen);
++ skb = NULL;
++ }
++
++ goto end;
++ }
++
++ /* Find the insertion point, starting from best_shortcut if available.
++ *
++ * Inspired from tcp_data_queue_ofo.
++ */
++ while (1) {
++ /* skb1->seq <= seq */
++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
++ break;
++ if (skb_queue_is_first(head, skb1)) {
++ skb1 = NULL;
++ break;
++ }
++ skb1 = skb_queue_prev(head, skb1);
++ }
++
++	/* Does skb overlap the previous one? */
++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
++ /* All the bits are present. */
++ __kfree_skb(skb);
++ skb = NULL;
++ goto end;
++ }
++ if (seq == TCP_SKB_CB(skb1)->seq) {
++ if (skb_queue_is_first(head, skb1))
++ skb1 = NULL;
++ else
++ skb1 = skb_queue_prev(head, skb1);
++ }
++ }
++ if (!skb1)
++ __skb_queue_head(head, skb);
++ else
++ __skb_queue_after(head, skb1, skb);
++
++ /* And clean segments covered by new one as whole. */
++ while (!skb_queue_is_last(head, skb)) {
++ skb1 = skb_queue_next(head, skb);
++
++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
++ break;
++
++ __skb_unlink(skb1, head);
++ mptcp_remove_shortcuts(mpcb, skb1);
++ __kfree_skb(skb1);
++ }
++
++end:
++ if (skb) {
++ skb_set_owner_r(skb, meta_sk);
++ tp->mptcp->shortcut_ofoqueue = skb;
++ }
++
++ return;
++}
++
++/**
++ * @sk: the subflow that received this skb.
++ */
++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
++ struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
++ &tcp_sk(meta_sk)->out_of_order_queue, tp);
++}
++
++bool mptcp_prune_ofo_queue(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ bool res = false;
++
++ if (!skb_queue_empty(&tp->out_of_order_queue)) {
++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
++ mptcp_purge_ofo_queue(tp);
++
++ /* No sack at the mptcp-level */
++ sk_mem_reclaim(sk);
++ res = true;
++ }
++
++ return res;
++}
++
++void mptcp_ofo_queue(struct sock *meta_sk)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct sk_buff *skb;
++
++ while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
++ if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
++ break;
++
++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
++ __kfree_skb(skb);
++ continue;
++ }
++
++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
++
++ __skb_queue_tail(&meta_sk->sk_receive_queue, skb);
++ meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
++
++ if (tcp_hdr(skb)->fin)
++ mptcp_fin(meta_sk);
++ }
++}
++
++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
++{
++ struct sk_buff_head *head = &meta_tp->out_of_order_queue;
++ struct sk_buff *skb, *tmp;
++
++ skb_queue_walk_safe(head, skb, tmp) {
++ __skb_unlink(skb, head);
++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
++ kfree_skb(skb);
++ }
++}
+diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c
+new file mode 100644
+index 000000000000..53f5c43bb488
+--- /dev/null
++++ b/net/mptcp/mptcp_olia.c
+@@ -0,0 +1,311 @@
++/*
++ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
++ *
++ * Algorithm design:
++ * Ramin Khalili <ramin.khalili@epfl.ch>
++ * Nicolas Gast <nicolas.gast@epfl.ch>
++ * Jean-Yves Le Boudec <jean-yves.leboudec@epfl.ch>
++ *
++ * Implementation:
++ * Ramin Khalili <ramin.khalili@epfl.ch>
++ *
++ * Ported to the official MPTCP-kernel:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++
++#include <net/tcp.h>
++#include <net/mptcp.h>
++
++#include <linux/module.h>
++
++static int scale = 10;
++
++struct mptcp_olia {
++ u32 mptcp_loss1;
++ u32 mptcp_loss2;
++ u32 mptcp_loss3;
++ int epsilon_num;
++ u32 epsilon_den;
++ int mptcp_snd_cwnd_cnt;
++};
++
++static inline int mptcp_olia_sk_can_send(const struct sock *sk)
++{
++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
++}
++
++static inline u64 mptcp_olia_scale(u64 val, int scale)
++{
++ return (u64) val << scale;
++}
++
++/* Take care of the artificial inflation of cwnd (see RFC 5681)
++ * during the fast-retransmit phase.
++ */
++static u32 mptcp_get_crt_cwnd(struct sock *sk)
++{
++ const struct inet_connection_sock *icsk = inet_csk(sk);
++
++ if (icsk->icsk_ca_state == TCP_CA_Recovery)
++ return tcp_sk(sk)->snd_ssthresh;
++ else
++ return tcp_sk(sk)->snd_cwnd;
++}
++
++/* return the denominator of the first term of the increasing term */
++static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt)
++{
++ struct sock *sk;
++ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
++
++ mptcp_for_each_sk(mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++ u64 scaled_num;
++ u32 tmp_cwnd;
++
++ if (!mptcp_olia_sk_can_send(sk))
++ continue;
++
++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
++ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
++ rate += div_u64(scaled_num , tp->srtt_us);
++ }
++ rate *= rate;
++ return rate;
++}
++
++/* find the maximum cwnd, used to find set M */
++static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb)
++{
++ struct sock *sk;
++ u32 best_cwnd = 0;
++
++ mptcp_for_each_sk(mpcb, sk) {
++ u32 tmp_cwnd;
++
++ if (!mptcp_olia_sk_can_send(sk))
++ continue;
++
++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
++ if (tmp_cwnd > best_cwnd)
++ best_cwnd = tmp_cwnd;
++ }
++ return best_cwnd;
++}
++
++static void mptcp_get_epsilon(const struct mptcp_cb *mpcb)
++{
++ struct mptcp_olia *ca;
++ struct tcp_sock *tp;
++ struct sock *sk;
++ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
++ u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd;
++ u8 M = 0, B_not_M = 0;
++
++ /* TODO - integrate this in the following loop - we just want to iterate once */
++
++ max_cwnd = mptcp_get_max_cwnd(mpcb);
++
++ /* find the best path */
++ mptcp_for_each_sk(mpcb, sk) {
++ tp = tcp_sk(sk);
++ ca = inet_csk_ca(sk);
++
++ if (!mptcp_olia_sk_can_send(sk))
++ continue;
++
++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
++ /* TODO - check here and rename variables */
++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
++ ca->mptcp_loss2 - ca->mptcp_loss1);
++
++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
++ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) {
++ best_rtt = tmp_rtt;
++ best_int = tmp_int;
++ best_cwnd = tmp_cwnd;
++ }
++ }
++
++ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
++ /* find the size of M and B_not_M */
++ mptcp_for_each_sk(mpcb, sk) {
++ tp = tcp_sk(sk);
++ ca = inet_csk_ca(sk);
++
++ if (!mptcp_olia_sk_can_send(sk))
++ continue;
++
++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
++ if (tmp_cwnd == max_cwnd) {
++ M++;
++ } else {
++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
++ ca->mptcp_loss2 - ca->mptcp_loss1);
++
++ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt)
++ B_not_M++;
++ }
++ }
++
++ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
++ mptcp_for_each_sk(mpcb, sk) {
++ tp = tcp_sk(sk);
++ ca = inet_csk_ca(sk);
++
++ if (!mptcp_olia_sk_can_send(sk))
++ continue;
++
++ if (B_not_M == 0) {
++ ca->epsilon_num = 0;
++ ca->epsilon_den = 1;
++ } else {
++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
++ ca->mptcp_loss2 - ca->mptcp_loss1);
++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
++
++ if (tmp_cwnd < max_cwnd &&
++ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) {
++ ca->epsilon_num = 1;
++ ca->epsilon_den = mpcb->cnt_established * B_not_M;
++ } else if (tmp_cwnd == max_cwnd) {
++ ca->epsilon_num = -1;
++ ca->epsilon_den = mpcb->cnt_established * M;
++ } else {
++ ca->epsilon_num = 0;
++ ca->epsilon_den = 1;
++ }
++ }
++ }
++}
++
++/* setting the initial values */
++static void mptcp_olia_init(struct sock *sk)
++{
++ const struct tcp_sock *tp = tcp_sk(sk);
++ struct mptcp_olia *ca = inet_csk_ca(sk);
++
++ if (mptcp(tp)) {
++ ca->mptcp_loss1 = tp->snd_una;
++ ca->mptcp_loss2 = tp->snd_una;
++ ca->mptcp_loss3 = tp->snd_una;
++ ca->mptcp_snd_cwnd_cnt = 0;
++ ca->epsilon_num = 0;
++ ca->epsilon_den = 1;
++ }
++}
++
++/* updating inter-loss distance and ssthresh */
++static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
++{
++ if (!mptcp(tcp_sk(sk)))
++ return;
++
++ if (new_state == TCP_CA_Loss ||
++ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
++ struct mptcp_olia *ca = inet_csk_ca(sk);
++
++ if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
++ !inet_csk(sk)->icsk_retransmits) {
++ ca->mptcp_loss1 = ca->mptcp_loss2;
++ ca->mptcp_loss2 = ca->mptcp_loss3;
++ }
++ }
++}
++
++/* main algorithm */
++static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct mptcp_olia *ca = inet_csk_ca(sk);
++ const struct mptcp_cb *mpcb = tp->mpcb;
++
++ u64 inc_num, inc_den, rate, cwnd_scaled;
++
++ if (!mptcp(tp)) {
++ tcp_reno_cong_avoid(sk, ack, acked);
++ return;
++ }
++
++ ca->mptcp_loss3 = tp->snd_una;
++
++ if (!tcp_is_cwnd_limited(sk))
++ return;
++
++ /* slow start if it is in the safe area */
++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
++ tcp_slow_start(tp, acked);
++ return;
++ }
++
++ mptcp_get_epsilon(mpcb);
++ rate = mptcp_get_rate(mpcb, tp->srtt_us);
++ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
++ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
++
++ /* calculate the increasing term, scaling is used to reduce the rounding effect */
++ if (ca->epsilon_num == -1) {
++ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
++ inc_num = rate - ca->epsilon_den *
++ cwnd_scaled * cwnd_scaled;
++ ca->mptcp_snd_cwnd_cnt -= div64_u64(
++ mptcp_olia_scale(inc_num , scale) , inc_den);
++ } else {
++ inc_num = ca->epsilon_den *
++ cwnd_scaled * cwnd_scaled - rate;
++ ca->mptcp_snd_cwnd_cnt += div64_u64(
++ mptcp_olia_scale(inc_num , scale) , inc_den);
++ }
++ } else {
++ inc_num = ca->epsilon_num * rate +
++ ca->epsilon_den * cwnd_scaled * cwnd_scaled;
++ ca->mptcp_snd_cwnd_cnt += div64_u64(
++ mptcp_olia_scale(inc_num , scale) , inc_den);
++ }
++
++
++ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
++ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
++ tp->snd_cwnd++;
++ ca->mptcp_snd_cwnd_cnt = 0;
++ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
++ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
++ ca->mptcp_snd_cwnd_cnt = 0;
++ }
++}
++
++static struct tcp_congestion_ops mptcp_olia = {
++ .init = mptcp_olia_init,
++ .ssthresh = tcp_reno_ssthresh,
++ .cong_avoid = mptcp_olia_cong_avoid,
++ .set_state = mptcp_olia_set_state,
++ .owner = THIS_MODULE,
++ .name = "olia",
++};
++
++static int __init mptcp_olia_register(void)
++{
++ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
++ return tcp_register_congestion_control(&mptcp_olia);
++}
++
++static void __exit mptcp_olia_unregister(void)
++{
++ tcp_unregister_congestion_control(&mptcp_olia);
++}
++
++module_init(mptcp_olia_register);
++module_exit(mptcp_olia_unregister);
++
++MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
++MODULE_VERSION("0.1");
+diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
+new file mode 100644
+index 000000000000..400ea254c078
+--- /dev/null
++++ b/net/mptcp/mptcp_output.c
+@@ -0,0 +1,1743 @@
++/*
++ * MPTCP implementation - Sending side
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/kconfig.h>
++#include <linux/skbuff.h>
++#include <linux/tcp.h>
++
++#include <net/mptcp.h>
++#include <net/mptcp_v4.h>
++#include <net/mptcp_v6.h>
++#include <net/sock.h>
++
++static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN +
++ MPTCP_SUB_LEN_ACK_ALIGN +
++ MPTCP_SUB_LEN_SEQ_ALIGN;
++
++static inline int mptcp_sub_len_remove_addr(u16 bitfield)
++{
++ unsigned int c;
++ for (c = 0; bitfield; c++)
++ bitfield &= bitfield - 1;
++ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
++}
++
++int mptcp_sub_len_remove_addr_align(u16 bitfield)
++{
++ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
++}
++EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
++
++/* get the data-seq and end-data-seq and store them again in the
++ * tcp_skb_cb
++ */
++static int mptcp_reconstruct_mapping(struct sk_buff *skb)
++{
++ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss;
++ u32 *p32;
++ u16 *p16;
++
++ if (!mpdss->M)
++ return 1;
++
++ /* Move the pointer to the data-seq */
++ p32 = (u32 *)mpdss;
++ p32++;
++ if (mpdss->A) {
++ p32++;
++ if (mpdss->a)
++ p32++;
++ }
++
++ TCP_SKB_CB(skb)->seq = ntohl(*p32);
++
++ /* Get the data_len to calculate the end_data_seq */
++ p32++;
++ p32++;
++ p16 = (u16 *)p32;
++ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
++
++ return 0;
++}
++
++static void mptcp_find_and_set_pathmask(const struct sock *meta_sk, struct sk_buff *skb)
++{
++ struct sk_buff *skb_it;
++
++ skb_it = tcp_write_queue_head(meta_sk);
++
++ tcp_for_write_queue_from(skb_it, meta_sk) {
++ if (skb_it == tcp_send_head(meta_sk))
++ break;
++
++ if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
++ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
++ break;
++ }
++ }
++}
++
++/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
++ * coming from the meta-retransmit-timer
++ */
++static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
++ struct sock *sk, int clone_it)
++{
++ struct sk_buff *skb, *skb1;
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ u32 seq, end_seq;
++
++ if (clone_it) {
++ /* pskb_copy is necessary here, because the TCP/IP-headers
++ * will be changed when it's going to be reinjected on another
++ * subflow.
++ */
++ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC);
++ } else {
++ __skb_unlink(orig_skb, &sk->sk_write_queue);
++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
++ sk->sk_wmem_queued -= orig_skb->truesize;
++ sk_mem_uncharge(sk, orig_skb->truesize);
++ skb = orig_skb;
++ }
++ if (unlikely(!skb))
++ return;
++
++ if (sk && mptcp_reconstruct_mapping(skb)) {
++ __kfree_skb(skb);
++ return;
++ }
++
++ skb->sk = meta_sk;
++
++ /* If it reached already the destination, we don't have to reinject it */
++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
++ __kfree_skb(skb);
++ return;
++ }
++
++ /* Only reinject segments that are fully covered by the mapping */
++ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
++ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
++ u32 seq = TCP_SKB_CB(skb)->seq;
++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
++
++ __kfree_skb(skb);
++
++ /* Ok, now we have to look for the full mapping in the meta
++ * send-queue :S
++ */
++ tcp_for_write_queue(skb, meta_sk) {
++ /* Not yet at the mapping? */
++ if (before(TCP_SKB_CB(skb)->seq, seq))
++ continue;
++ /* We have passed by the mapping */
++ if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
++ return;
++
++ __mptcp_reinject_data(skb, meta_sk, NULL, 1);
++ }
++ return;
++ }
++
++ /* Segment goes back to the MPTCP-layer. So, we need to zero the
++ * path_mask/dss.
++ */
++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
++
++ /* We need to find out the path-mask from the meta-write-queue
++ * to properly select a subflow.
++ */
++ mptcp_find_and_set_pathmask(meta_sk, skb);
++
++ /* If it's empty, just add */
++ if (skb_queue_empty(&mpcb->reinject_queue)) {
++ skb_queue_head(&mpcb->reinject_queue, skb);
++ return;
++ }
++
++ /* Find place to insert skb - or even we can 'drop' it, as the
++ * data is already covered by other skb's in the reinject-queue.
++ *
++ * This is inspired by code from tcp_data_queue.
++ */
++
++ skb1 = skb_peek_tail(&mpcb->reinject_queue);
++ seq = TCP_SKB_CB(skb)->seq;
++ while (1) {
++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
++ break;
++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
++ skb1 = NULL;
++ break;
++ }
++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
++ }
++
++ /* Do skb overlap to previous one? */
++ end_seq = TCP_SKB_CB(skb)->end_seq;
++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
++ /* All the bits are present. Don't reinject */
++ __kfree_skb(skb);
++ return;
++ }
++ if (seq == TCP_SKB_CB(skb1)->seq) {
++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
++ skb1 = NULL;
++ else
++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
++ }
++ }
++ if (!skb1)
++ __skb_queue_head(&mpcb->reinject_queue, skb);
++ else
++ __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
++
++ /* And clean segments covered by new one as whole. */
++ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
++ skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
++
++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
++ break;
++
++ __skb_unlink(skb1, &mpcb->reinject_queue);
++ __kfree_skb(skb1);
++ }
++ return;
++}
++
++/* Inserts data into the reinject queue */
++void mptcp_reinject_data(struct sock *sk, int clone_it)
++{
++ struct sk_buff *skb_it, *tmp;
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sock *meta_sk = tp->meta_sk;
++
++ /* It has already been closed - there is really no point in reinjecting */
++ if (meta_sk->sk_state == TCP_CLOSE)
++ return;
++
++ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
++ /* Subflow syn's and fin's are not reinjected.
++ *
++ * As well as empty subflow-fins with a data-fin.
++ * They are reinjected below (without the subflow-fin-flag)
++ */
++ if (tcb->tcp_flags & TCPHDR_SYN ||
++ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
++ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
++ continue;
++
++ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
++ }
++
++ skb_it = tcp_write_queue_tail(meta_sk);
++ /* If sk has sent the empty data-fin, we have to reinject it too. */
++ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
++ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
++ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
++ }
++
++ mptcp_push_pending_frames(meta_sk);
++
++ tp->pf = 1;
++}
++EXPORT_SYMBOL(mptcp_reinject_data);
++
++static void mptcp_combine_dfin(const struct sk_buff *skb, const struct sock *meta_sk,
++ struct sock *subsk)
++{
++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ struct sock *sk_it;
++ int all_empty = 1, all_acked;
++
++ /* In infinite mapping we always try to combine */
++ if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) {
++ subsk->sk_shutdown |= SEND_SHUTDOWN;
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
++ return;
++ }
++
++ /* Don't combine, if they didn't combine - otherwise we end up in
++ * TIME_WAIT, even if our app is smart enough to avoid it
++ */
++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
++ if (!mpcb->dfin_combined)
++ return;
++ }
++
++ /* If no other subflow has data to send, we can combine */
++ mptcp_for_each_sk(mpcb, sk_it) {
++ if (!mptcp_sk_can_send(sk_it))
++ continue;
++
++ if (!tcp_write_queue_empty(sk_it))
++ all_empty = 0;
++ }
++
++ /* If all data has been DATA_ACKed, we can combine.
++ * -1, because the data_fin consumed one byte
++ */
++ all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1));
++
++ if ((all_empty || all_acked) && tcp_close_state(subsk)) {
++ subsk->sk_shutdown |= SEND_SHUTDOWN;
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
++ }
++}
++
++static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
++ __be32 *ptr)
++{
++ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
++ __be32 *start = ptr;
++ __u16 data_len;
++
++ *ptr++ = htonl(tcb->seq); /* data_seq */
++
++ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
++ if (mptcp_is_data_fin(skb) && skb->len == 0)
++ *ptr++ = 0; /* subseq */
++ else
++ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
++
++ if (tcb->mptcp_flags & MPTCPHDR_INF)
++ data_len = 0;
++ else
++ data_len = tcb->end_seq - tcb->seq;
++
++ if (tp->mpcb->dss_csum && data_len) {
++ __be16 *p16 = (__be16 *)ptr;
++ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
++ __wsum csum;
++
++ *ptr = htonl(((data_len) << 16) |
++ (TCPOPT_EOL << 8) |
++ (TCPOPT_EOL));
++ csum = csum_partial(ptr - 2, 12, skb->csum);
++ p16++;
++ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
++ } else {
++ *ptr++ = htonl(((data_len) << 16) |
++ (TCPOPT_NOP << 8) |
++ (TCPOPT_NOP));
++ }
++
++ return ptr - start;
++}
++
++static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
++ __be32 *ptr)
++{
++ struct mp_dss *mdss = (struct mp_dss *)ptr;
++ __be32 *start = ptr;
++
++ mdss->kind = TCPOPT_MPTCP;
++ mdss->sub = MPTCP_SUB_DSS;
++ mdss->rsv1 = 0;
++ mdss->rsv2 = 0;
++ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0;
++ mdss->m = 0;
++ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0;
++ mdss->a = 0;
++ mdss->A = 1;
++ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
++ ptr++;
++
++ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);
++
++ return ptr - start;
++}
++
++/* RFC6824 states that once a particular subflow mapping has been sent
++ * out it must never be changed. However, packets may be split while
++ * they are in the retransmission queue (due to SACK or ACKs) and that
++ * arguably means that we would change the mapping (e.g. it splits it,
++ * or sends out a subset of the initial mapping).
++ *
++ * Furthermore, the skb checksum is not always preserved across splits
++ * (e.g. mptcp_fragment) which would mean that we need to recompute
++ * the DSS checksum in this case.
++ *
++ * To avoid this we save the initial DSS mapping which allows us to
++ * send the same DSS mapping even for fragmented retransmits.
++ */
++static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb)
++{
++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
++ __be32 *ptr = (__be32 *)tcb->dss;
++
++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
++
++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
++ ptr += mptcp_write_dss_mapping(tp, skb, ptr);
++}
++
++/* Write the saved DSS mapping to the header */
++static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb,
++ __be32 *ptr)
++{
++ __be32 *start = ptr;
++
++ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
++
++ /* update the data_ack */
++ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
++
++ /* dss is in a union with inet_skb_parm and
++ * the IP layer expects zeroed IPCB fields.
++ */
++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
++
++ return mptcp_dss_len/sizeof(*ptr);
++}
++
++static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ const struct sock *meta_sk = mptcp_meta_sk(sk);
++ const struct mptcp_cb *mpcb = tp->mpcb;
++ struct tcp_skb_cb *tcb;
++ struct sk_buff *subskb = NULL;
++
++ if (!reinject)
++ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
++ MPTCPHDR_SEQ64_INDEX : 0);
++
++ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC);
++ if (!subskb)
++ return false;
++
++ /* At the subflow-level we need to call again tcp_init_tso_segs. We
++ * force this, by setting gso_segs to 0. It has been set to 1 prior to
++ * the call to mptcp_skb_entail.
++ */
++ skb_shinfo(subskb)->gso_segs = 0;
++
++ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
++
++ if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) &&
++ skb->ip_summed == CHECKSUM_PARTIAL) {
++ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
++ subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
++ }
++
++ tcb = TCP_SKB_CB(subskb);
++
++ if (tp->mpcb->send_infinite_mapping &&
++ !tp->mpcb->infinite_mapping_snd &&
++ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
++ tp->mptcp->fully_established = 1;
++ tp->mpcb->infinite_mapping_snd = 1;
++ tp->mptcp->infinite_cutoff_seq = tp->write_seq;
++ tcb->mptcp_flags |= MPTCPHDR_INF;
++ }
++
++ if (mptcp_is_data_fin(subskb))
++ mptcp_combine_dfin(subskb, meta_sk, sk);
++
++ mptcp_save_dss_data_seq(tp, subskb);
++
++ tcb->seq = tp->write_seq;
++ tcb->sacked = 0; /* reset the sacked field: from the point of view
++ * of this subflow, we are sending a brand new
++ * segment
++ */
++ /* Take into account seg len */
++ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
++ tcb->end_seq = tp->write_seq;
++
++ /* If it's a non-payload DATA_FIN (also no subflow-fin), the
++ * segment is not part of the subflow but on a meta-only-level.
++ */
++ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
++ tcp_add_write_queue_tail(sk, subskb);
++ sk->sk_wmem_queued += subskb->truesize;
++ sk_mem_charge(sk, subskb->truesize);
++ } else {
++ int err;
++
++ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as
++ * skb->len = 0 will force tso_segs to 1.
++ */
++ tcp_init_tso_segs(sk, subskb, 1);
++ /* Empty data-fins are sent immediately on the subflow */
++ TCP_SKB_CB(subskb)->when = tcp_time_stamp;
++ err = tcp_transmit_skb(sk, subskb, 1, GFP_ATOMIC);
++
++ /* It has not been queued, we can free it now. */
++ kfree_skb(subskb);
++
++ if (err)
++ return false;
++ }
++
++ if (!tp->mptcp->fully_established) {
++ tp->mptcp->second_packet = 1;
++ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
++ }
++
++ return true;
++}
++
++/* Fragment an skb and update the mptcp meta-data. Due to reinject, we
++ * might need to undo some operations done by tcp_fragment.
++ */
++static int mptcp_fragment(struct sock *meta_sk, struct sk_buff *skb, u32 len,
++ gfp_t gfp, int reinject)
++{
++ int ret, diff, old_factor;
++ struct sk_buff *buff;
++ u8 flags;
++
++ if (skb_headlen(skb) < len)
++ diff = skb->len - len;
++ else
++ diff = skb->data_len;
++ old_factor = tcp_skb_pcount(skb);
++
++ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb.
++ * At the MPTCP-level we do not care about the absolute value. All we
++ * care about is that it is set to 1 for accurate packets_out
++ * accounting.
++ */
++ ret = tcp_fragment(meta_sk, skb, len, UINT_MAX, gfp);
++ if (ret)
++ return ret;
++
++ buff = skb->next;
++
++ flags = TCP_SKB_CB(skb)->mptcp_flags;
++ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
++ TCP_SKB_CB(buff)->mptcp_flags = flags;
++ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask;
++
++ /* If reinject == 1, the buff will be added to the reinject
++ * queue, which is currently not part of memory accounting. So
++ * undo the changes done by tcp_fragment and update the
++ * reinject queue. Also, undo changes to the packet counters.
++ */
++ if (reinject == 1) {
++ int undo = buff->truesize - diff;
++ meta_sk->sk_wmem_queued -= undo;
++ sk_mem_uncharge(meta_sk, undo);
++
++ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++;
++ meta_sk->sk_write_queue.qlen--;
++
++ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
++ undo = old_factor - tcp_skb_pcount(skb) -
++ tcp_skb_pcount(buff);
++ if (undo)
++ tcp_adjust_pcount(meta_sk, skb, -undo);
++ }
++ }
++
++ return 0;
++}
++
++/* Inspired by tcp_write_wakeup */
++int mptcp_write_wakeup(struct sock *meta_sk)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct sk_buff *skb;
++ struct sock *sk_it;
++ int ans = 0;
++
++ if (meta_sk->sk_state == TCP_CLOSE)
++ return -1;
++
++ skb = tcp_send_head(meta_sk);
++ if (skb &&
++ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
++ unsigned int mss;
++ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
++ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true);
++ struct tcp_sock *subtp;
++ if (!subsk)
++ goto window_probe;
++ subtp = tcp_sk(subsk);
++ mss = tcp_current_mss(subsk);
++
++ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq,
++ tcp_wnd_end(subtp) - subtp->write_seq);
++
++ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
++ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
++
++ /* We are probing the opening of a window
++ * but the window size is != 0
++ * must have been a result of SWS avoidance (sender)
++ */
++ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
++ skb->len > mss) {
++ seg_size = min(seg_size, mss);
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
++ if (mptcp_fragment(meta_sk, skb, seg_size,
++ GFP_ATOMIC, 0))
++ return -1;
++ } else if (!tcp_skb_pcount(skb)) {
++ /* see mptcp_write_xmit on why we use UINT_MAX */
++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
++ }
++
++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
++ if (!mptcp_skb_entail(subsk, skb, 0))
++ return -1;
++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
++
++ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
++ TCP_SKB_CB(skb)->seq);
++ tcp_event_new_data_sent(meta_sk, skb);
++
++ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH);
++
++ return 0;
++ } else {
++window_probe:
++ if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
++ meta_tp->snd_una + 0xFFFF)) {
++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
++ if (mptcp_sk_can_send_ack(sk_it))
++ tcp_xmit_probe_skb(sk_it, 1);
++ }
++ }
++
++ /* At least one of the tcp_xmit_probe_skb's has to succeed */
++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
++ int ret;
++
++ if (!mptcp_sk_can_send_ack(sk_it))
++ continue;
++
++ ret = tcp_xmit_probe_skb(sk_it, 0);
++ if (unlikely(ret > 0))
++ ans = ret;
++ }
++ return ans;
++ }
++}
++
++bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
++ int push_one, gfp_t gfp)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
++ struct sock *subsk = NULL;
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ struct sk_buff *skb;
++ unsigned int sent_pkts;
++ int reinject = 0;
++ unsigned int sublimit;
++
++ sent_pkts = 0;
++
++ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk,
++ &sublimit))) {
++ unsigned int limit;
++
++ subtp = tcp_sk(subsk);
++ mss_now = tcp_current_mss(subsk);
++
++ if (reinject == 1) {
++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
++ /* Segment already reached the peer, take the next one */
++ __skb_unlink(skb, &mpcb->reinject_queue);
++ __kfree_skb(skb);
++ continue;
++ }
++ }
++
++ /* If the segment was cloned (e.g. a meta retransmission),
++ * the header must be expanded/copied so that there is no
++ * corruption of TSO information.
++ */
++ if (skb_unclone(skb, GFP_ATOMIC))
++ break;
++
++ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now)))
++ break;
++
++ /* Force tso_segs to 1 by using UINT_MAX.
++ * We actually don't care about the exact number of segments
++ * emitted on the subflow. We need just to set tso_segs, because
++ * we still need an accurate packets_out count in
++ * tcp_event_new_data_sent.
++ */
++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
++
++ /* Check for nagle, regardless of tso_segs. If the segment is
++ * actually larger than mss_now (TSO segment), then
++ * tcp_nagle_check will have partial == false and always trigger
++ * the transmission.
++ * tcp_write_xmit has a TSO-level nagle check which is not
++ * subject to the MPTCP-level. It is based on the properties of
++ * the subflow, not the MPTCP-level.
++ */
++ if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
++ (tcp_skb_is_last(meta_sk, skb) ?
++ nonagle : TCP_NAGLE_PUSH))))
++ break;
++
++ limit = mss_now;
++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
++ * tcp_write_xmit. Otherwise split-point would return 0.
++ */
++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
++ /* We limit the size of the skb so that it fits into the
++ * window. Call tcp_mss_split_point to avoid duplicating
++ * code.
++ * We really only care about fitting the skb into the
++ * window. That's why we use UINT_MAX. If the skb does
++ * not fit into the cwnd_quota or the NIC's max-segs
++ * limitation, it will be split by the subflow's
++ * tcp_write_xmit which does the appropriate call to
++ * tcp_mss_split_point.
++ */
++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
++ UINT_MAX / mss_now,
++ nonagle);
++
++ if (sublimit)
++ limit = min(limit, sublimit);
++
++ if (skb->len > limit &&
++ unlikely(mptcp_fragment(meta_sk, skb, limit, gfp, reinject)))
++ break;
++
++ if (!mptcp_skb_entail(subsk, skb, reinject))
++ break;
++ /* Nagle is handled at the MPTCP-layer, so
++ * always push on the subflow
++ */
++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
++
++ if (!reinject) {
++ mptcp_check_sndseq_wrap(meta_tp,
++ TCP_SKB_CB(skb)->end_seq -
++ TCP_SKB_CB(skb)->seq);
++ tcp_event_new_data_sent(meta_sk, skb);
++ }
++
++ tcp_minshall_update(meta_tp, mss_now, skb);
++ sent_pkts += tcp_skb_pcount(skb);
++
++ if (reinject > 0) {
++ __skb_unlink(skb, &mpcb->reinject_queue);
++ kfree_skb(skb);
++ }
++
++ if (push_one)
++ break;
++ }
++
++ return !meta_tp->packets_out && tcp_send_head(meta_sk);
++}
++
++void mptcp_write_space(struct sock *sk)
++{
++ mptcp_push_pending_frames(mptcp_meta_sk(sk));
++}
++
++u32 __mptcp_select_window(struct sock *sk)
++{
++ struct inet_connection_sock *icsk = inet_csk(sk);
++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
++ int mss, free_space, full_space, window;
++
++ /* MSS for the peer's data. Previous versions used mss_clamp
++ * here. I don't know if the value based on our guesses
++ * of peer's MSS is better for the performance. It's more correct
++ * but may be worse for the performance because of rcv_mss
++ * fluctuations. --SAW 1998/11/1
++ */
++ mss = icsk->icsk_ack.rcv_mss;
++ free_space = tcp_space(sk);
++ full_space = min_t(int, meta_tp->window_clamp,
++ tcp_full_space(sk));
++
++ if (mss > full_space)
++ mss = full_space;
++
++ if (free_space < (full_space >> 1)) {
++ icsk->icsk_ack.quick = 0;
++
++ if (tcp_memory_pressure)
++ /* TODO this has to be adapted when we support different
++ * MSS's among the subflows.
++ */
++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
++ 4U * meta_tp->advmss);
++
++ if (free_space < mss)
++ return 0;
++ }
++
++ if (free_space > meta_tp->rcv_ssthresh)
++ free_space = meta_tp->rcv_ssthresh;
++
++ /* Don't do rounding if we are using window scaling, since the
++ * scaled window will not line up with the MSS boundary anyway.
++ */
++ window = meta_tp->rcv_wnd;
++ if (tp->rx_opt.rcv_wscale) {
++ window = free_space;
++
++ /* Advertise enough space so that it won't get scaled away.
++ * Important case: prevent zero window announcement if
++ * 1<<rcv_wscale > mss.
++ */
++ if (((window >> tp->rx_opt.rcv_wscale) << tp->
++ rx_opt.rcv_wscale) != window)
++ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
++ << tp->rx_opt.rcv_wscale);
++ } else {
++ /* Get the largest window that is a nice multiple of mss.
++ * Window clamp already applied above.
++ * If our current window offering is within 1 mss of the
++ * free space we just keep it. This prevents the divide
++ * and multiply from happening most of the time.
++ * We also don't do any window rounding when the free space
++ * is too small.
++ */
++ if (window <= free_space - mss || window > free_space)
++ window = (free_space / mss) * mss;
++ else if (mss == full_space &&
++ free_space > window + (full_space >> 1))
++ window = free_space;
++ }
++
++ return window;
++}
++
++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
++ unsigned *remaining)
++{
++ const struct tcp_sock *tp = tcp_sk(sk);
++
++ opts->options |= OPTION_MPTCP;
++ if (is_master_tp(tp)) {
++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
++ opts->mp_capable.sender_key = tp->mptcp_loc_key;
++ opts->dss_csum = !!sysctl_mptcp_checksum;
++ } else {
++ const struct mptcp_cb *mpcb = tp->mpcb;
++
++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
++ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
++ opts->mp_join_syns.token = mpcb->mptcp_rem_token;
++ opts->mp_join_syns.low_prio = tp->mptcp->low_prio;
++ opts->addr_id = tp->mptcp->loc_id;
++ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
++ }
++}
++
++void mptcp_synack_options(struct request_sock *req,
++ struct tcp_out_options *opts, unsigned *remaining)
++{
++ struct mptcp_request_sock *mtreq;
++ mtreq = mptcp_rsk(req);
++
++ opts->options |= OPTION_MPTCP;
++ /* MPCB not yet set - thus it's a new MPTCP-session */
++ if (!mtreq->is_sub) {
++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
++ opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
++ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
++ } else {
++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
++ opts->mp_join_syns.sender_truncated_mac =
++ mtreq->mptcp_hash_tmac;
++ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
++ opts->mp_join_syns.low_prio = mtreq->low_prio;
++ opts->addr_id = mtreq->loc_id;
++ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
++ }
++}
++
++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
++ struct tcp_out_options *opts, unsigned *size)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct mptcp_cb *mpcb = tp->mpcb;
++ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
++
++ /* We are coming from tcp_current_mss with the meta_sk as an argument.
++ * It does not make sense to check for the options, because when the
++ * segment gets sent, another subflow will be chosen.
++ */
++ if (!skb && is_meta_sk(sk))
++ return;
++
++ /* In fallback mp_fail-mode, we have to repeat it until the fallback
++ * has been done by the sender
++ */
++ if (unlikely(tp->mptcp->send_mp_fail)) {
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_MP_FAIL;
++ *size += MPTCP_SUB_LEN_FAIL;
++ return;
++ }
++
++ if (unlikely(tp->send_mp_fclose)) {
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_MP_FCLOSE;
++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
++ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
++ return;
++ }
++
++ /* 1. If we are the sender of the infinite-mapping, we need the
++ * MPTCPHDR_INF-flag, because a retransmission of the
++ * infinite-announcement still needs the mptcp-option.
++ *
++ * We need infinite_cutoff_seq, because retransmissions from before
++ * the infinite-cutoff-moment still need the MPTCP-signalling to stay
++ * consistent.
++ *
++ * 2. If we are the receiver of the infinite-mapping, we always skip
++ * mptcp-options, because acknowledgments from before the
++ * infinite-mapping point have already been sent out.
++ *
++ * I know, the whole infinite-mapping stuff is ugly...
++ *
++ * TODO: Handle wrapped data-sequence numbers
++ * (even if it's very unlikely)
++ */
++ if (unlikely(mpcb->infinite_mapping_snd) &&
++ ((mpcb->send_infinite_mapping && tcb &&
++ mptcp_is_data_seq(skb) &&
++ !(tcb->mptcp_flags & MPTCPHDR_INF) &&
++ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
++ !mpcb->send_infinite_mapping))
++ return;
++
++ if (unlikely(tp->mptcp->include_mpc)) {
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_MP_CAPABLE |
++ OPTION_TYPE_ACK;
++ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
++ opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
++ opts->dss_csum = mpcb->dss_csum;
++
++ if (skb)
++ tp->mptcp->include_mpc = 0;
++ }
++ if (unlikely(tp->mptcp->pre_established)) {
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
++ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
++ }
++
++ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_DATA_ACK;
++ /* If !skb, we come from tcp_current_mss and thus we always
++ * assume that the DSS-option will be set for the data-packet.
++ */
++ if (skb && !mptcp_is_data_seq(skb)) {
++ *size += MPTCP_SUB_LEN_ACK_ALIGN;
++ } else {
++ /* Doesn't matter, if csum included or not. It will be
++ * either 10 or 12, and thus aligned = 12
++ */
++ *size += MPTCP_SUB_LEN_ACK_ALIGN +
++ MPTCP_SUB_LEN_SEQ_ALIGN;
++ }
++
++ *size += MPTCP_SUB_LEN_DSS_ALIGN;
++ }
++
++ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal)
++ mpcb->pm_ops->addr_signal(sk, size, opts, skb);
++
++ if (unlikely(tp->mptcp->send_mp_prio) &&
++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
++ opts->options |= OPTION_MPTCP;
++ opts->mptcp_options |= OPTION_MP_PRIO;
++ if (skb)
++ tp->mptcp->send_mp_prio = 0;
++ *size += MPTCP_SUB_LEN_PRIO_ALIGN;
++ }
++
++ return;
++}
++
++u16 mptcp_select_window(struct sock *sk)
++{
++ u16 new_win = tcp_select_window(sk);
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
++
++ meta_tp->rcv_wnd = tp->rcv_wnd;
++ meta_tp->rcv_wup = meta_tp->rcv_nxt;
++
++ return new_win;
++}
++
++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
++ const struct tcp_out_options *opts,
++ struct sk_buff *skb)
++{
++ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
++ struct mp_capable *mpc = (struct mp_capable *)ptr;
++
++ mpc->kind = TCPOPT_MPTCP;
++
++ if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
++ (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
++ mpc->sender_key = opts->mp_capable.sender_key;
++ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
++ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
++ mpc->sender_key = opts->mp_capable.sender_key;
++ mpc->receiver_key = opts->mp_capable.receiver_key;
++ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
++ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
++ }
++
++ mpc->sub = MPTCP_SUB_CAPABLE;
++ mpc->ver = 0;
++ mpc->a = opts->dss_csum;
++ mpc->b = 0;
++ mpc->rsv = 0;
++ mpc->h = 1;
++ }
++
++ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
++ struct mp_join *mpj = (struct mp_join *)ptr;
++
++ mpj->kind = TCPOPT_MPTCP;
++ mpj->sub = MPTCP_SUB_JOIN;
++ mpj->rsv = 0;
++
++ if (OPTION_TYPE_SYN & opts->mptcp_options) {
++ mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
++ mpj->u.syn.token = opts->mp_join_syns.token;
++ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
++ mpj->b = opts->mp_join_syns.low_prio;
++ mpj->addr_id = opts->addr_id;
++ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
++ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
++ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
++ mpj->u.synack.mac =
++ opts->mp_join_syns.sender_truncated_mac;
++ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
++ mpj->b = opts->mp_join_syns.low_prio;
++ mpj->addr_id = opts->addr_id;
++ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
++ mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
++ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */
++ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
++ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
++ }
++ }
++ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
++
++ mpadd->kind = TCPOPT_MPTCP;
++ if (opts->add_addr_v4) {
++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
++ mpadd->ipver = 4;
++ mpadd->addr_id = opts->add_addr4.addr_id;
++ mpadd->u.v4.addr = opts->add_addr4.addr;
++ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
++ } else if (opts->add_addr_v6) {
++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
++ mpadd->ipver = 6;
++ mpadd->addr_id = opts->add_addr6.addr_id;
++ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
++ sizeof(mpadd->u.v6.addr));
++ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
++ }
++ }
++ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
++ u8 *addrs_id;
++ int id, len, len_align;
++
++ len = mptcp_sub_len_remove_addr(opts->remove_addrs);
++ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
++
++ mprem->kind = TCPOPT_MPTCP;
++ mprem->len = len;
++ mprem->sub = MPTCP_SUB_REMOVE_ADDR;
++ mprem->rsv = 0;
++ addrs_id = &mprem->addrs_id;
++
++ mptcp_for_each_bit_set(opts->remove_addrs, id)
++ *(addrs_id++) = id;
++
++ /* Fill the rest with NOP's */
++ if (len_align > len) {
++ int i;
++ for (i = 0; i < len_align - len; i++)
++ *(addrs_id++) = TCPOPT_NOP;
++ }
++
++ ptr += len_align >> 2;
++ }
++ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
++ struct mp_fail *mpfail = (struct mp_fail *)ptr;
++
++ mpfail->kind = TCPOPT_MPTCP;
++ mpfail->len = MPTCP_SUB_LEN_FAIL;
++ mpfail->sub = MPTCP_SUB_FAIL;
++ mpfail->rsv1 = 0;
++ mpfail->rsv2 = 0;
++ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq);
++
++ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
++ }
++ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
++ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
++
++ mpfclose->kind = TCPOPT_MPTCP;
++ mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
++ mpfclose->sub = MPTCP_SUB_FCLOSE;
++ mpfclose->rsv1 = 0;
++ mpfclose->rsv2 = 0;
++ mpfclose->key = opts->mp_capable.receiver_key;
++
++ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
++ }
++
++ if (OPTION_DATA_ACK & opts->mptcp_options) {
++ if (!mptcp_is_data_seq(skb))
++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
++ else
++ ptr += mptcp_write_dss_data_seq(tp, skb, ptr);
++ }
++ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
++ struct mp_prio *mpprio = (struct mp_prio *)ptr;
++
++ mpprio->kind = TCPOPT_MPTCP;
++ mpprio->len = MPTCP_SUB_LEN_PRIO;
++ mpprio->sub = MPTCP_SUB_PRIO;
++ mpprio->rsv = 0;
++ mpprio->b = tp->mptcp->low_prio;
++ mpprio->addr_id = TCPOPT_NOP;
++
++ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
++ }
++}
++
++/* Sends the datafin */
++void mptcp_send_fin(struct sock *meta_sk)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
++ int mss_now;
++
++ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
++ meta_tp->mpcb->passive_close = 1;
++
++ /* Optimization, tack on the FIN if we have a queue of
++ * unsent frames. But be careful about outgoing SACKS
++ * and IP options.
++ */
++ mss_now = mptcp_current_mss(meta_sk);
++
++ if (tcp_send_head(meta_sk) != NULL) {
++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
++ TCP_SKB_CB(skb)->end_seq++;
++ meta_tp->write_seq++;
++ } else {
++ /* Socket is locked, keep trying until memory is available. */
++ for (;;) {
++ skb = alloc_skb_fclone(MAX_TCP_HEADER,
++ meta_sk->sk_allocation);
++ if (skb)
++ break;
++ yield();
++ }
++ /* Reserve space for headers and prepare control bits. */
++ skb_reserve(skb, MAX_TCP_HEADER);
++
++ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
++ TCP_SKB_CB(skb)->end_seq++;
++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
++ tcp_queue_skb(meta_sk, skb);
++ }
++ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
++}
++
++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ struct sock *sk = NULL, *sk_it = NULL, *tmpsk;
++
++ if (!mpcb->cnt_subflows)
++ return;
++
++ WARN_ON(meta_tp->send_mp_fclose);
++
++ /* First - select a socket */
++ sk = mptcp_select_ack_sock(meta_sk);
++
++ /* May happen if no subflow is in an appropriate state */
++ if (!sk)
++ return;
++
++ /* We are in infinite mode - just send a reset */
++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) {
++ sk->sk_err = ECONNRESET;
++ if (tcp_need_reset(sk->sk_state))
++ tcp_send_active_reset(sk, priority);
++ mptcp_sub_force_close(sk);
++ return;
++ }
++
++
++ tcp_sk(sk)->send_mp_fclose = 1;
++ /** Reset all other subflows */
++
++ /* tcp_done must be handled with bh disabled */
++ if (!in_serving_softirq())
++ local_bh_disable();
++
++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
++ if (tcp_sk(sk_it)->send_mp_fclose)
++ continue;
++
++ sk_it->sk_err = ECONNRESET;
++ if (tcp_need_reset(sk_it->sk_state))
++ tcp_send_active_reset(sk_it, GFP_ATOMIC);
++ mptcp_sub_force_close(sk_it);
++ }
++
++ if (!in_serving_softirq())
++ local_bh_enable();
++
++ tcp_send_ack(sk);
++ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
++
++ meta_tp->send_mp_fclose = 1;
++}
++
++static void mptcp_ack_retransmit_timer(struct sock *sk)
++{
++ struct sk_buff *skb;
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct inet_connection_sock *icsk = inet_csk(sk);
++
++ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
++ goto out; /* Routing failure or similar */
++
++ if (!tp->retrans_stamp)
++ tp->retrans_stamp = tcp_time_stamp ? : 1;
++
++ if (tcp_write_timeout(sk)) {
++ tp->mptcp->pre_established = 0;
++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
++ goto out;
++ }
++
++ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
++ if (skb == NULL) {
++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
++ jiffies + icsk->icsk_rto);
++ return;
++ }
++
++ /* Reserve space for headers and prepare control bits */
++ skb_reserve(skb, MAX_TCP_HEADER);
++ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
++
++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
++ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
++ /* Retransmission failed because of local congestion,
++ * do not backoff.
++ */
++ if (!icsk->icsk_retransmits)
++ icsk->icsk_retransmits = 1;
++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
++ jiffies + icsk->icsk_rto);
++ return;
++ }
++
++
++ icsk->icsk_retransmits++;
++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
++ jiffies + icsk->icsk_rto);
++ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
++ __sk_dst_reset(sk);
++
++out:;
++}
++
++void mptcp_ack_handler(unsigned long data)
++{
++ struct sock *sk = (struct sock *)data;
++ struct sock *meta_sk = mptcp_meta_sk(sk);
++
++ bh_lock_sock(meta_sk);
++ if (sock_owned_by_user(meta_sk)) {
++ /* Try again later */
++ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
++ jiffies + (HZ / 20));
++ goto out_unlock;
++ }
++
++ if (sk->sk_state == TCP_CLOSE)
++ goto out_unlock;
++ if (!tcp_sk(sk)->mptcp->pre_established)
++ goto out_unlock;
++
++ mptcp_ack_retransmit_timer(sk);
++
++ sk_mem_reclaim(sk);
++
++out_unlock:
++ bh_unlock_sock(meta_sk);
++ sock_put(sk);
++}
++
++/* Similar to tcp_retransmit_skb
++ *
++ * The diff is that we handle the retransmission-stats (retrans_stamp) at the
++ * meta-level.
++ */
++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct sock *subsk;
++ unsigned int limit, mss_now;
++ int err = -1;
++
++ /* Do not send more than we queued. 1/4 is reserved for possible
++ * copying overhead: fragmentation, tunneling, mangling etc.
++ *
++ * This is a meta-retransmission thus we check on the meta-socket.
++ */
++ if (atomic_read(&meta_sk->sk_wmem_alloc) >
++ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
++ return -EAGAIN;
++ }
++
++ /* We need to make sure that the retransmitted segment can be sent on a
++ * subflow right now. If it is too big, it needs to be fragmented.
++ */
++ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false);
++ if (!subsk) {
++ /* We want to increase icsk_retransmits, thus return 0, so that
++ * mptcp_retransmit_timer enters the desired branch.
++ */
++ err = 0;
++ goto failed;
++ }
++ mss_now = tcp_current_mss(subsk);
++
++ /* If the segment was cloned (e.g. a meta retransmission), the header
++ * must be expanded/copied so that there is no corruption of TSO
++ * information.
++ */
++ if (skb_unclone(skb, GFP_ATOMIC)) {
++ err = -ENOMEM;
++ goto failed;
++ }
++
++ /* Must have been set by mptcp_write_xmit before */
++ BUG_ON(!tcp_skb_pcount(skb));
++
++ limit = mss_now;
++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
++ * tcp_write_xmit. Otherwise split-point would return 0.
++ */
++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
++ UINT_MAX / mss_now,
++ TCP_NAGLE_OFF);
++
++ if (skb->len > limit &&
++ unlikely(mptcp_fragment(meta_sk, skb, limit,
++ GFP_ATOMIC, 0)))
++ goto failed;
++
++ if (!mptcp_skb_entail(subsk, skb, -1))
++ goto failed;
++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
++
++ /* Update global TCP statistics. */
++ TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
++
++ /* Diff to tcp_retransmit_skb */
++
++ /* Save stamp of the first retransmit. */
++ if (!meta_tp->retrans_stamp)
++ meta_tp->retrans_stamp = TCP_SKB_CB(skb)->when;
++
++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
++
++ return 0;
++
++failed:
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL);
++ return err;
++}
++
++/* Similar to tcp_retransmit_timer
++ *
++ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
++ * and that we don't have an srtt estimation at the meta-level.
++ */
++void mptcp_retransmit_timer(struct sock *meta_sk)
++{
++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
++ struct mptcp_cb *mpcb = meta_tp->mpcb;
++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
++ int err;
++
++ /* In fallback, retransmission is handled at the subflow-level */
++ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd ||
++ mpcb->send_infinite_mapping)
++ return;
++
++ WARN_ON(tcp_write_queue_empty(meta_sk));
++
++ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
++ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
++ /* Receiver dastardly shrinks window. Our retransmits
++ * become zero probes, but we should not timeout this
++ * connection. If the socket is an orphan, time it out,
++ * we cannot allow such beasts to hang infinitely.
++ */
++ struct inet_sock *meta_inet = inet_sk(meta_sk);
++ if (meta_sk->sk_family == AF_INET) {
++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
++ &meta_inet->inet_daddr,
++ ntohs(meta_inet->inet_dport),
++ meta_inet->inet_num, meta_tp->snd_una,
++ meta_tp->snd_nxt);
++ }
++#if IS_ENABLED(CONFIG_IPV6)
++ else if (meta_sk->sk_family == AF_INET6) {
++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
++ &meta_sk->sk_v6_daddr,
++ ntohs(meta_inet->inet_dport),
++ meta_inet->inet_num, meta_tp->snd_una,
++ meta_tp->snd_nxt);
++ }
++#endif
++ if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
++ tcp_write_err(meta_sk);
++ return;
++ }
++
++ mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
++ goto out_reset_timer;
++ }
++
++ if (tcp_write_timeout(meta_sk))
++ return;
++
++ if (meta_icsk->icsk_retransmits == 0)
++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
++
++ meta_icsk->icsk_ca_state = TCP_CA_Loss;
++
++ err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
++ if (err > 0) {
++ /* Retransmission failed because of local congestion,
++ * do not backoff.
++ */
++ if (!meta_icsk->icsk_retransmits)
++ meta_icsk->icsk_retransmits = 1;
++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
++ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
++ TCP_RTO_MAX);
++ return;
++ }
++
++ /* Increase the timeout each time we retransmit. Note that
++ * we do not increase the rtt estimate. rto is initialized
++ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
++ * that doubling rto each time is the least we can get away with.
++ * In KA9Q, Karn uses this for the first few times, and then
++ * goes to quadratic. netBSD doubles, but only goes up to *64,
++ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
++ * defined in the protocol as the maximum possible RTT. I guess
++ * we'll have to use something other than TCP to talk to the
++ * University of Mars.
++ *
++ * PAWS allows us longer timeouts and large windows, so once
++ * implemented ftp to mars will work nicely. We will have to fix
++ * the 120 second clamps though!
++ */
++ meta_icsk->icsk_backoff++;
++ meta_icsk->icsk_retransmits++;
++
++out_reset_timer:
++ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
++ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
++ * might be increased if the stream oscillates between thin and thick,
++ * thus the old value might already be too high compared to the value
++ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
++ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
++ * exponential backoff behaviour to avoid continue hammering
++ * linear-timeout retransmissions into a black hole
++ */
++ if (meta_sk->sk_state == TCP_ESTABLISHED &&
++ (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
++ tcp_stream_is_thin(meta_tp) &&
++ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
++ meta_icsk->icsk_backoff = 0;
++ /* We cannot do the same as in tcp_write_timer because the
++ * srtt is not set here.
++ */
++ mptcp_set_rto(meta_sk);
++ } else {
++ /* Use normal (exponential) backoff */
++ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
++ }
++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
++
++ return;
++}
++
++/* Modify values to an mptcp-level for the initial window of new subflows */
++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
++ __u32 *window_clamp, int wscale_ok,
++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
++ const struct sock *sk)
++{
++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
++
++ *window_clamp = mpcb->orig_window_clamp;
++ __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf);
++
++ tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp,
++ wscale_ok, rcv_wscale, init_rcv_wnd, sk);
++}
++
++static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss,
++ unsigned int (*mss_cb)(struct sock *sk))
++{
++ struct sock *sk;
++ u64 rate = 0;
++
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++ int this_mss;
++ u64 this_rate;
++
++ if (!mptcp_sk_can_send(sk))
++ continue;
++
++ /* Do not consider subflows without a RTT estimation yet
++ * otherwise this_rate >>> rate.
++ */
++ if (unlikely(!tp->srtt_us))
++ continue;
++
++ this_mss = mss_cb(sk);
++
++ /* If this_mss is smaller than mss, it means that a segment will
++ * be split in two (or more) when pushed on this subflow. If
++ * you consider that mss = 1428 and this_mss = 1420 then two
++ * segments will be generated: a 1420-byte and 8-byte segment.
++ * The latter will introduce a large overhead as for a single
++ * data segment 2 slots will be used in the congestion window.
++ * Therefore reducing by ~2 the potential throughput of this
++ * subflow. Indeed, 1428 will be sent while 2840 could have been
++ * sent if mss == 1420 reducing the throughput by 2840 / 1428.
++ *
++ * The following algorithm take into account this overhead
++ * when computing the potential throughput that MPTCP can
++ * achieve when generating mss-byte segments.
++ *
++ * The formulae is the following:
++ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub}
++ * Where ratio is computed as follows:
++ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub}
++ *
++ * ratio gives the reduction factor of the theoretical
++ * throughput a subflow can achieve if MPTCP uses a specific
++ * MSS value.
++ */
++ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) *
++ max(tp->snd_cwnd, tp->packets_out),
++ (u64)tp->srtt_us *
++ DIV_ROUND_UP(mss, this_mss) * this_mss);
++ rate += this_rate;
++ }
++
++ return rate;
++}
++
++static unsigned int __mptcp_current_mss(const struct sock *meta_sk,
++ unsigned int (*mss_cb)(struct sock *sk))
++{
++ unsigned int mss = 0;
++ u64 rate = 0;
++ struct sock *sk;
++
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
++ int this_mss;
++ u64 this_rate;
++
++ if (!mptcp_sk_can_send(sk))
++ continue;
++
++ this_mss = mss_cb(sk);
++
++ /* Same mss values will produce the same throughput. */
++ if (this_mss == mss)
++ continue;
++
++ /* See whether using this mss value can theoretically improve
++ * the performances.
++ */
++ this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb);
++ if (this_rate >= rate) {
++ mss = this_mss;
++ rate = this_rate;
++ }
++ }
++
++ return mss;
++}
++
++unsigned int mptcp_current_mss(struct sock *meta_sk)
++{
++ unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss);
++
++ /* If no subflow is available, we take a default-mss from the
++ * meta-socket.
++ */
++ return !mss ? tcp_current_mss(meta_sk) : mss;
++}
++
++static unsigned int mptcp_select_size_mss(struct sock *sk)
++{
++ return tcp_sk(sk)->mss_cache;
++}
++
++int mptcp_select_size(const struct sock *meta_sk, bool sg)
++{
++ unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss);
++
++ if (sg) {
++ if (mptcp_sk_can_gso(meta_sk)) {
++ mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
++ } else {
++ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
++
++ if (mss >= pgbreak &&
++ mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
++ mss = pgbreak;
++ }
++ }
++
++ return !mss ? tcp_sk(meta_sk)->mss_cache : mss;
++}
++
++int mptcp_check_snd_buf(const struct tcp_sock *tp)
++{
++ const struct sock *sk;
++ u32 rtt_max = tp->srtt_us;
++ u64 bw_est;
++
++ if (!tp->srtt_us)
++ return tp->reordering + 1;
++
++ mptcp_for_each_sk(tp->mpcb, sk) {
++ if (!mptcp_sk_can_send(sk))
++ continue;
++
++ if (rtt_max < tcp_sk(sk)->srtt_us)
++ rtt_max = tcp_sk(sk)->srtt_us;
++ }
++
++ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
++ (u64)tp->srtt_us);
++
++ return max_t(unsigned int, (u32)(bw_est >> 16),
++ tp->reordering + 1);
++}
++
++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
++ int large_allowed)
++{
++ struct sock *sk;
++ u32 xmit_size_goal = 0;
++
++ if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
++ int this_size_goal;
++
++ if (!mptcp_sk_can_send(sk))
++ continue;
++
++ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
++ if (this_size_goal > xmit_size_goal)
++ xmit_size_goal = this_size_goal;
++ }
++ }
++
++ return max(xmit_size_goal, mss_now);
++}
++
++/* Similar to tcp_trim_head - but we correctly copy the DSS-option */
++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
++{
++ if (skb_cloned(skb)) {
++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
++ return -ENOMEM;
++ }
++
++ __pskb_trim_head(skb, len);
++
++ TCP_SKB_CB(skb)->seq += len;
++ skb->ip_summed = CHECKSUM_PARTIAL;
++
++ skb->truesize -= len;
++ sk->sk_wmem_queued -= len;
++ sk_mem_uncharge(sk, len);
++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
++
++ /* Any change of skb->len requires recalculation of tso factor. */
++ if (tcp_skb_pcount(skb) > 1)
++ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
++
++ return 0;
++}
+diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
+new file mode 100644
+index 000000000000..9542f950729f
+--- /dev/null
++++ b/net/mptcp/mptcp_pm.c
+@@ -0,0 +1,169 @@
++/*
++ * MPTCP implementation - MPTCP-subflow-management
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré <sebastien.barre@uclouvain.be>
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
++ * Gregory Detal <gregory.detal@uclouvain.be>
++ * Fabien Duchêne <fabien.duchene@uclouvain.be>
++ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
++ * Lavkesh Lahngir <lavkesh51@gmail.com>
++ * Andreas Ripke <ripke@neclab.eu>
++ * Vlad Dogaru <vlad.dogaru@intel.com>
++ * Octavian Purdila <octavian.purdila@intel.com>
++ * John Ronan <jronan@tssg.org>
++ * Catalin Nicutar <catalin.nicutar@gmail.com>
++ * Brandon Heller <brandonh@stanford.edu>
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++
++#include <linux/module.h>
++#include <net/mptcp.h>
++
++static DEFINE_SPINLOCK(mptcp_pm_list_lock);
++static LIST_HEAD(mptcp_pm_list);
++
++static int mptcp_default_id(sa_family_t family, union inet_addr *addr,
++ struct net *net, bool *low_prio)
++{
++ return 0;
++}
++
++struct mptcp_pm_ops mptcp_pm_default = {
++ .get_local_id = mptcp_default_id, /* We do not care */
++ .name = "default",
++ .owner = THIS_MODULE,
++};
++
++static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
++{
++ struct mptcp_pm_ops *e;
++
++ list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
++ if (strcmp(e->name, name) == 0)
++ return e;
++ }
++
++ return NULL;
++}
++
++int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
++{
++ int ret = 0;
++
++ if (!pm->get_local_id)
++ return -EINVAL;
++
++ spin_lock(&mptcp_pm_list_lock);
++ if (mptcp_pm_find(pm->name)) {
++ pr_notice("%s already registered\n", pm->name);
++ ret = -EEXIST;
++ } else {
++ list_add_tail_rcu(&pm->list, &mptcp_pm_list);
++ pr_info("%s registered\n", pm->name);
++ }
++ spin_unlock(&mptcp_pm_list_lock);
++
++ return ret;
++}
++EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
++
++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
++{
++ spin_lock(&mptcp_pm_list_lock);
++ list_del_rcu(&pm->list);
++ spin_unlock(&mptcp_pm_list_lock);
++}
++EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
++
++void mptcp_get_default_path_manager(char *name)
++{
++ struct mptcp_pm_ops *pm;
++
++ BUG_ON(list_empty(&mptcp_pm_list));
++
++ rcu_read_lock();
++ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
++ strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
++ rcu_read_unlock();
++}
++
++int mptcp_set_default_path_manager(const char *name)
++{
++ struct mptcp_pm_ops *pm;
++ int ret = -ENOENT;
++
++ spin_lock(&mptcp_pm_list_lock);
++ pm = mptcp_pm_find(name);
++#ifdef CONFIG_MODULES
++ if (!pm && capable(CAP_NET_ADMIN)) {
++ spin_unlock(&mptcp_pm_list_lock);
++
++ request_module("mptcp_%s", name);
++ spin_lock(&mptcp_pm_list_lock);
++ pm = mptcp_pm_find(name);
++ }
++#endif
++
++ if (pm) {
++ list_move(&pm->list, &mptcp_pm_list);
++ ret = 0;
++ } else {
++ pr_info("%s is not available\n", name);
++ }
++ spin_unlock(&mptcp_pm_list_lock);
++
++ return ret;
++}
++
++void mptcp_init_path_manager(struct mptcp_cb *mpcb)
++{
++ struct mptcp_pm_ops *pm;
++
++ rcu_read_lock();
++ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
++ if (try_module_get(pm->owner)) {
++ mpcb->pm_ops = pm;
++ break;
++ }
++ }
++ rcu_read_unlock();
++}
++
++/* Manage refcounts on socket close. */
++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
++{
++ module_put(mpcb->pm_ops->owner);
++}
++
++/* Fallback to the default path-manager. */
++void mptcp_fallback_default(struct mptcp_cb *mpcb)
++{
++ struct mptcp_pm_ops *pm;
++
++ mptcp_cleanup_path_manager(mpcb);
++ pm = mptcp_pm_find("default");
++
++ /* Cannot fail - it's the default module */
++ try_module_get(pm->owner);
++ mpcb->pm_ops = pm;
++}
++EXPORT_SYMBOL_GPL(mptcp_fallback_default);
++
++/* Set default value from kernel configuration at bootup */
++static int __init mptcp_path_manager_default(void)
++{
++ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
++}
++late_initcall(mptcp_path_manager_default);
+diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c
+new file mode 100644
+index 000000000000..93278f684069
+--- /dev/null
++++ b/net/mptcp/mptcp_rr.c
+@@ -0,0 +1,301 @@
++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
++
++#include <linux/module.h>
++#include <net/mptcp.h>
++
++static unsigned char num_segments __read_mostly = 1;
++module_param(num_segments, byte, 0644);
++MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst");
++
++static bool cwnd_limited __read_mostly = 1;
++module_param(cwnd_limited, bool, 0644);
++MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows");
++
++struct rrsched_priv {
++ unsigned char quota;
++};
++
++static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp)
++{
++ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0];
++}
++
++/* Is the sub-socket sk available to send the skb? */
++static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb,
++ bool zero_wnd_test, bool cwnd_test)
++{
++ const struct tcp_sock *tp = tcp_sk(sk);
++ unsigned int space, in_flight;
++
++ /* Set of states for which we are allowed to send data */
++ if (!mptcp_sk_can_send(sk))
++ return false;
++
++ /* We do not send data on this subflow unless it is
++ * fully established, i.e. the 4th ack has been received.
++ */
++ if (tp->mptcp->pre_established)
++ return false;
++
++ if (tp->pf)
++ return false;
++
++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
++ /* If SACK is disabled, and we got a loss, TCP does not exit
++ * the loss-state until something above high_seq has been acked.
++ * (see tcp_try_undo_recovery)
++ *
++ * high_seq is the snd_nxt at the moment of the RTO. As soon
++ * as we have an RTO, we won't push data on the subflow.
++ * Thus, snd_una can never go beyond high_seq.
++ */
++ if (!tcp_is_reno(tp))
++ return false;
++ else if (tp->snd_una != tp->high_seq)
++ return false;
++ }
++
++ if (!tp->mptcp->fully_established) {
++ /* Make sure that we send in-order data */
++ if (skb && tp->mptcp->second_packet &&
++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
++ return false;
++ }
++
++ if (!cwnd_test)
++ goto zero_wnd_test;
++
++ in_flight = tcp_packets_in_flight(tp);
++ /* Not even a single spot in the cwnd */
++ if (in_flight >= tp->snd_cwnd)
++ return false;
++
++ /* Now, check if what is queued in the subflow's send-queue
++ * already fills the cwnd.
++ */
++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
++
++ if (tp->write_seq - tp->snd_nxt > space)
++ return false;
++
++zero_wnd_test:
++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
++ return false;
++
++ return true;
++}
++
++/* Are we not allowed to reinject this skb on tp? */
++static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
++{
++ /* If the skb has already been enqueued in this sk, try to find
++ * another one.
++ */
++ return skb &&
++ /* Has the skb already been enqueued into this subsocket? */
++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
++}
++
++/* We just look for any subflow that is available */
++static struct sock *rr_get_available_subflow(struct sock *meta_sk,
++ struct sk_buff *skb,
++ bool zero_wnd_test)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct sock *sk, *bestsk = NULL, *backupsk = NULL;
++
++ /* Answer data_fin on same subflow!!! */
++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
++ skb && mptcp_is_data_fin(skb)) {
++ mptcp_for_each_sk(mpcb, sk) {
++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
++ mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
++ return sk;
++ }
++ }
++
++ /* First, find the best subflow */
++ mptcp_for_each_sk(mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
++ continue;
++
++ if (mptcp_rr_dont_reinject_skb(tp, skb)) {
++ backupsk = sk;
++ continue;
++ }
++
++ bestsk = sk;
++ }
++
++ if (bestsk) {
++ sk = bestsk;
++ } else if (backupsk) {
++ /* It has been sent on all subflows once - let's give it a
++ * chance again by restarting its pathmask.
++ */
++ if (skb)
++ TCP_SKB_CB(skb)->path_mask = 0;
++ sk = backupsk;
++ }
++
++ return sk;
++}
++
++/* Returns the next segment to be sent from the mptcp meta-queue.
++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
++ * chooses the normal write queue).
++ * Sets *@reinject to 1 if the returned segment comes from the
++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
++ * and sets it to -1 if it is a meta-level retransmission to optimize the
++ * receive-buffer.
++ */
++static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct sk_buff *skb = NULL;
++
++ *reinject = 0;
++
++ /* If we are in fallback-mode, just take from the meta-send-queue */
++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
++ return tcp_send_head(meta_sk);
++
++ skb = skb_peek(&mpcb->reinject_queue);
++
++ if (skb)
++ *reinject = 1;
++ else
++ skb = tcp_send_head(meta_sk);
++ return skb;
++}
++
++static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk,
++ int *reinject,
++ struct sock **subsk,
++ unsigned int *limit)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct sock *sk_it, *choose_sk = NULL;
++ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject);
++ unsigned char split = num_segments;
++ unsigned char iter = 0, full_subs = 0;
++
++ /* As we set it, we have to reset it as well. */
++ *limit = 0;
++
++ if (!skb)
++ return NULL;
++
++ if (*reinject) {
++ *subsk = rr_get_available_subflow(meta_sk, skb, false);
++ if (!*subsk)
++ return NULL;
++
++ return skb;
++ }
++
++retry:
++
++ /* First, we look for a subflow who is currently being used */
++ mptcp_for_each_sk(mpcb, sk_it) {
++ struct tcp_sock *tp_it = tcp_sk(sk_it);
++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
++
++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
++ continue;
++
++ iter++;
++
++ /* Is this subflow currently being used? */
++ if (rsp->quota > 0 && rsp->quota < num_segments) {
++ split = num_segments - rsp->quota;
++ choose_sk = sk_it;
++ goto found;
++ }
++
++ /* Or, it's totally unused */
++ if (!rsp->quota) {
++ split = num_segments;
++ choose_sk = sk_it;
++ }
++
++ /* Or, it must then be fully used */
++ if (rsp->quota == num_segments)
++ full_subs++;
++ }
++
++ /* All considered subflows have a full quota, and we considered at
++ * least one.
++ */
++ if (iter && iter == full_subs) {
++ /* So, we restart this round by setting quota to 0 and retry
++ * to find a subflow.
++ */
++ mptcp_for_each_sk(mpcb, sk_it) {
++ struct tcp_sock *tp_it = tcp_sk(sk_it);
++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
++
++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
++ continue;
++
++ rsp->quota = 0;
++ }
++
++ goto retry;
++ }
++
++found:
++ if (choose_sk) {
++ unsigned int mss_now;
++ struct tcp_sock *choose_tp = tcp_sk(choose_sk);
++ struct rrsched_priv *rsp = rrsched_get_priv(choose_tp);
++
++ if (!mptcp_rr_is_available(choose_sk, skb, false, true))
++ return NULL;
++
++ *subsk = choose_sk;
++ mss_now = tcp_current_mss(*subsk);
++ *limit = split * mss_now;
++
++ if (skb->len > mss_now)
++ rsp->quota += DIV_ROUND_UP(skb->len, mss_now);
++ else
++ rsp->quota++;
++
++ return skb;
++ }
++
++ return NULL;
++}
++
++static struct mptcp_sched_ops mptcp_sched_rr = {
++ .get_subflow = rr_get_available_subflow,
++ .next_segment = mptcp_rr_next_segment,
++ .name = "roundrobin",
++ .owner = THIS_MODULE,
++};
++
++static int __init rr_register(void)
++{
++ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE);
++
++ if (mptcp_register_scheduler(&mptcp_sched_rr))
++ return -1;
++
++ return 0;
++}
++
++static void rr_unregister(void)
++{
++ mptcp_unregister_scheduler(&mptcp_sched_rr);
++}
++
++module_init(rr_register);
++module_exit(rr_unregister);
++
++MODULE_AUTHOR("Christoph Paasch");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("ROUNDROBIN MPTCP");
++MODULE_VERSION("0.89");
+diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c
+new file mode 100644
+index 000000000000..6c7ff4eceac1
+--- /dev/null
++++ b/net/mptcp/mptcp_sched.c
+@@ -0,0 +1,493 @@
++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
++
++#include <linux/module.h>
++#include <net/mptcp.h>
++
++static DEFINE_SPINLOCK(mptcp_sched_list_lock);
++static LIST_HEAD(mptcp_sched_list);
++
++struct defsched_priv {
++ u32 last_rbuf_opti;
++};
++
++static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
++{
++ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
++}
++
++/* Is the sub-socket sk available to send the skb? */
++static bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
++ bool zero_wnd_test)
++{
++ const struct tcp_sock *tp = tcp_sk(sk);
++ unsigned int mss_now, space, in_flight;
++
++ /* Set of states for which we are allowed to send data */
++ if (!mptcp_sk_can_send(sk))
++ return false;
++
++ /* We do not send data on this subflow unless it is
++ * fully established, i.e. the 4th ack has been received.
++ */
++ if (tp->mptcp->pre_established)
++ return false;
++
++ if (tp->pf)
++ return false;
++
++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
++ /* If SACK is disabled, and we got a loss, TCP does not exit
++ * the loss-state until something above high_seq has been acked.
++ * (see tcp_try_undo_recovery)
++ *
++ * high_seq is the snd_nxt at the moment of the RTO. As soon
++ * as we have an RTO, we won't push data on the subflow.
++ * Thus, snd_una can never go beyond high_seq.
++ */
++ if (!tcp_is_reno(tp))
++ return false;
++ else if (tp->snd_una != tp->high_seq)
++ return false;
++ }
++
++ if (!tp->mptcp->fully_established) {
++ /* Make sure that we send in-order data */
++ if (skb && tp->mptcp->second_packet &&
++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
++ return false;
++ }
++
++ /* If TSQ is already throttling us, do not send on this subflow. When
++ * TSQ gets cleared the subflow becomes eligible again.
++ */
++ if (test_bit(TSQ_THROTTLED, &tp->tsq_flags))
++ return false;
++
++ in_flight = tcp_packets_in_flight(tp);
++ /* Not even a single spot in the cwnd */
++ if (in_flight >= tp->snd_cwnd)
++ return false;
++
++ /* Now, check if what is queued in the subflow's send-queue
++ * already fills the cwnd.
++ */
++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
++
++ if (tp->write_seq - tp->snd_nxt > space)
++ return false;
++
++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
++ return false;
++
++ mss_now = tcp_current_mss(sk);
++
++ /* Don't send on this subflow if we bypass the allowed send-window at
++ * the per-subflow level. Similar to tcp_snd_wnd_test, but manually
++ * calculated end_seq (because here at this point end_seq is still at
++ * the meta-level).
++ */
++ if (skb && !zero_wnd_test &&
++ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
++ return false;
++
++ return true;
++}
++
++/* Are we not allowed to reinject this skb on tp? */
++static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
++{
++ /* If the skb has already been enqueued in this sk, try to find
++ * another one.
++ */
++ return skb &&
++ /* Has the skb already been enqueued into this subsocket? */
++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
++}
++
++/* This is the scheduler. This function decides on which flow to send
++ * a given MSS. If all subflows are found to be busy, NULL is returned
++ * The flow is selected based on the shortest RTT.
++ * If all paths have full cong windows, we simply return NULL.
++ *
++ * Additionally, this function is aware of the backup-subflows.
++ */
++static struct sock *get_available_subflow(struct sock *meta_sk,
++ struct sk_buff *skb,
++ bool zero_wnd_test)
++{
++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
++ u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
++ int cnt_backups = 0;
++
++ /* if there is only one subflow, bypass the scheduling function */
++ if (mpcb->cnt_subflows == 1) {
++ bestsk = (struct sock *)mpcb->connection_list;
++ if (!mptcp_is_available(bestsk, skb, zero_wnd_test))
++ bestsk = NULL;
++ return bestsk;
++ }
++
++ /* Answer data_fin on same subflow!!! */
++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
++ skb && mptcp_is_data_fin(skb)) {
++ mptcp_for_each_sk(mpcb, sk) {
++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
++ mptcp_is_available(sk, skb, zero_wnd_test))
++ return sk;
++ }
++ }
++
++ /* First, find the best subflow */
++ mptcp_for_each_sk(mpcb, sk) {
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio)
++ cnt_backups++;
++
++ if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
++ tp->srtt_us < lowprio_min_time_to_peer) {
++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
++ continue;
++
++ if (mptcp_dont_reinject_skb(tp, skb)) {
++ backupsk = sk;
++ continue;
++ }
++
++ lowprio_min_time_to_peer = tp->srtt_us;
++ lowpriosk = sk;
++ } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
++ tp->srtt_us < min_time_to_peer) {
++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
++ continue;
++
++ if (mptcp_dont_reinject_skb(tp, skb)) {
++ backupsk = sk;
++ continue;
++ }
++
++ min_time_to_peer = tp->srtt_us;
++ bestsk = sk;
++ }
++ }
++
++ if (mpcb->cnt_established == cnt_backups && lowpriosk) {
++ sk = lowpriosk;
++ } else if (bestsk) {
++ sk = bestsk;
++ } else if (backupsk) {
++ /* It has been sent on all subflows once - let's give it a
++ * chance again by restarting its pathmask.
++ */
++ if (skb)
++ TCP_SKB_CB(skb)->path_mask = 0;
++ sk = backupsk;
++ }
++
++ return sk;
++}
++
++static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
++{
++ struct sock *meta_sk;
++ const struct tcp_sock *tp = tcp_sk(sk);
++ struct tcp_sock *tp_it;
++ struct sk_buff *skb_head;
++ struct defsched_priv *dsp = defsched_get_priv(tp);
++
++ if (tp->mpcb->cnt_subflows == 1)
++ return NULL;
++
++ meta_sk = mptcp_meta_sk(sk);
++ skb_head = tcp_write_queue_head(meta_sk);
++
++ if (!skb_head || skb_head == tcp_send_head(meta_sk))
++ return NULL;
++
++	/* If penalization is optional (coming from mptcp_next_segment()) and
++	 * we are not send-buffer-limited, we do not penalize. The retransmission
++ * is just an optimization to fix the idle-time due to the delay before
++ * we wake up the application.
++ */
++ if (!penal && sk_stream_memory_free(meta_sk))
++ goto retrans;
++
++ /* Only penalize again after an RTT has elapsed */
++ if (tcp_time_stamp - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
++ goto retrans;
++
++ /* Half the cwnd of the slow flow */
++ mptcp_for_each_tp(tp->mpcb, tp_it) {
++ if (tp_it != tp &&
++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
++ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
++ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
++ if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH)
++ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
++
++ dsp->last_rbuf_opti = tcp_time_stamp;
++ }
++ break;
++ }
++ }
++
++retrans:
++
++ /* Segment not yet injected into this path? Take it!!! */
++ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
++ bool do_retrans = false;
++ mptcp_for_each_tp(tp->mpcb, tp_it) {
++ if (tp_it != tp &&
++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
++ if (tp_it->snd_cwnd <= 4) {
++ do_retrans = true;
++ break;
++ }
++
++ if (4 * tp->srtt_us >= tp_it->srtt_us) {
++ do_retrans = false;
++ break;
++ } else {
++ do_retrans = true;
++ }
++ }
++ }
++
++ if (do_retrans && mptcp_is_available(sk, skb_head, false))
++ return skb_head;
++ }
++ return NULL;
++}
++
++/* Returns the next segment to be sent from the mptcp meta-queue.
++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
++ * chooses the normal write queue).
++ * Sets *@reinject to 1 if the returned segment comes from the
++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
++ * and sets it to -1 if it is a meta-level retransmission to optimize the
++ * receive-buffer.
++ */
++static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
++{
++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
++ struct sk_buff *skb = NULL;
++
++ *reinject = 0;
++
++ /* If we are in fallback-mode, just take from the meta-send-queue */
++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
++ return tcp_send_head(meta_sk);
++
++ skb = skb_peek(&mpcb->reinject_queue);
++
++ if (skb) {
++ *reinject = 1;
++ } else {
++ skb = tcp_send_head(meta_sk);
++
++ if (!skb && meta_sk->sk_socket &&
++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
++ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
++ struct sock *subsk = get_available_subflow(meta_sk, NULL,
++ false);
++ if (!subsk)
++ return NULL;
++
++ skb = mptcp_rcv_buf_optimization(subsk, 0);
++ if (skb)
++ *reinject = -1;
++ }
++ }
++ return skb;
++}
++
++static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
++ int *reinject,
++ struct sock **subsk,
++ unsigned int *limit)
++{
++ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
++ unsigned int mss_now;
++ struct tcp_sock *subtp;
++ u16 gso_max_segs;
++ u32 max_len, max_segs, window, needed;
++
++ /* As we set it, we have to reset it as well. */
++ *limit = 0;
++
++ if (!skb)
++ return NULL;
++
++ *subsk = get_available_subflow(meta_sk, skb, false);
++ if (!*subsk)
++ return NULL;
++
++ subtp = tcp_sk(*subsk);
++ mss_now = tcp_current_mss(*subsk);
++
++ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
++ skb = mptcp_rcv_buf_optimization(*subsk, 1);
++ if (skb)
++ *reinject = -1;
++ else
++ return NULL;
++ }
++
++ /* No splitting required, as we will only send one single segment */
++ if (skb->len <= mss_now)
++ return skb;
++
++ /* The following is similar to tcp_mss_split_point, but
++ * we do not care about nagle, because we will anyways
++ * use TCP_NAGLE_PUSH, which overrides this.
++ *
++ * So, we first limit according to the cwnd/gso-size and then according
++ * to the subflow's window.
++ */
++
++ gso_max_segs = (*subsk)->sk_gso_max_segs;
++ if (!gso_max_segs) /* No gso supported on the subflow's NIC */
++ gso_max_segs = 1;
++ max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
++ if (!max_segs)
++ return NULL;
++
++ max_len = mss_now * max_segs;
++ window = tcp_wnd_end(subtp) - subtp->write_seq;
++
++ needed = min(skb->len, window);
++ if (max_len <= skb->len)
++ /* Take max_win, which is actually the cwnd/gso-size */
++ *limit = max_len;
++ else
++ /* Or, take the window */
++ *limit = needed;
++
++ return skb;
++}
++
++static void defsched_init(struct sock *sk)
++{
++ struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk));
++
++ dsp->last_rbuf_opti = tcp_time_stamp;
++}
++
++struct mptcp_sched_ops mptcp_sched_default = {
++ .get_subflow = get_available_subflow,
++ .next_segment = mptcp_next_segment,
++ .init = defsched_init,
++ .name = "default",
++ .owner = THIS_MODULE,
++};
++
++static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
++{
++ struct mptcp_sched_ops *e;
++
++ list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
++ if (strcmp(e->name, name) == 0)
++ return e;
++ }
++
++ return NULL;
++}
++
++int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
++{
++ int ret = 0;
++
++ if (!sched->get_subflow || !sched->next_segment)
++ return -EINVAL;
++
++ spin_lock(&mptcp_sched_list_lock);
++ if (mptcp_sched_find(sched->name)) {
++ pr_notice("%s already registered\n", sched->name);
++ ret = -EEXIST;
++ } else {
++ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
++ pr_info("%s registered\n", sched->name);
++ }
++ spin_unlock(&mptcp_sched_list_lock);
++
++ return ret;
++}
++EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
++
++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
++{
++ spin_lock(&mptcp_sched_list_lock);
++ list_del_rcu(&sched->list);
++ spin_unlock(&mptcp_sched_list_lock);
++}
++EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
++
++void mptcp_get_default_scheduler(char *name)
++{
++ struct mptcp_sched_ops *sched;
++
++ BUG_ON(list_empty(&mptcp_sched_list));
++
++ rcu_read_lock();
++ sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
++ strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
++ rcu_read_unlock();
++}
++
++int mptcp_set_default_scheduler(const char *name)
++{
++ struct mptcp_sched_ops *sched;
++ int ret = -ENOENT;
++
++ spin_lock(&mptcp_sched_list_lock);
++ sched = mptcp_sched_find(name);
++#ifdef CONFIG_MODULES
++ if (!sched && capable(CAP_NET_ADMIN)) {
++ spin_unlock(&mptcp_sched_list_lock);
++
++ request_module("mptcp_%s", name);
++ spin_lock(&mptcp_sched_list_lock);
++ sched = mptcp_sched_find(name);
++ }
++#endif
++
++ if (sched) {
++ list_move(&sched->list, &mptcp_sched_list);
++ ret = 0;
++ } else {
++ pr_info("%s is not available\n", name);
++ }
++ spin_unlock(&mptcp_sched_list_lock);
++
++ return ret;
++}
++
++void mptcp_init_scheduler(struct mptcp_cb *mpcb)
++{
++ struct mptcp_sched_ops *sched;
++
++ rcu_read_lock();
++ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
++ if (try_module_get(sched->owner)) {
++ mpcb->sched_ops = sched;
++ break;
++ }
++ }
++ rcu_read_unlock();
++}
++
++/* Manage refcounts on socket close. */
++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
++{
++ module_put(mpcb->sched_ops->owner);
++}
++
++/* Set default value from kernel configuration at bootup */
++static int __init mptcp_scheduler_default(void)
++{
++ BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
++
++ return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
++}
++late_initcall(mptcp_scheduler_default);
+diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
+new file mode 100644
+index 000000000000..29ca1d868d17
+--- /dev/null
++++ b/net/mptcp/mptcp_wvegas.c
+@@ -0,0 +1,268 @@
++/*
++ * MPTCP implementation - WEIGHTED VEGAS
++ *
++ * Algorithm design:
++ * Yu Cao <cyAnalyst@126.com>
++ * Mingwei Xu <xmw@csnet1.cs.tsinghua.edu.cn>
++ * Xiaoming Fu <fu@cs.uni-goettinggen.de>
++ *
++ * Implementation:
++ * Yu Cao <cyAnalyst@126.com>
++ * Enhuan Dong <deh13@mails.tsinghua.edu.cn>
++ *
++ * Ported to the official MPTCP-kernel:
++ * Christoph Paasch <christoph.paasch@uclouvain.be>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/skbuff.h>
++#include <net/tcp.h>
++#include <net/mptcp.h>
++#include <linux/module.h>
++#include <linux/tcp.h>
++
++static int initial_alpha = 2;
++static int total_alpha = 10;
++static int gamma = 1;
++
++module_param(initial_alpha, int, 0644);
++MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
++module_param(total_alpha, int, 0644);
++MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
++module_param(gamma, int, 0644);
++MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
++
++#define MPTCP_WVEGAS_SCALE 16
++
++/* wVegas variables */
++struct wvegas {
++ u32 beg_snd_nxt; /* right edge during last RTT */
++ u8 doing_wvegas_now;/* if true, do wvegas for this RTT */
++
++ u16 cnt_rtt; /* # of RTTs measured within last RTT */
++ u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
++ u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
++
++ u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
++ u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
++ int alpha; /* alpha for each subflows */
++
++ u32 queue_delay; /* queue delay*/
++};
++
++
++static inline u64 mptcp_wvegas_scale(u32 val, int scale)
++{
++ return (u64) val << scale;
++}
++
++static void wvegas_enable(const struct sock *sk)
++{
++ const struct tcp_sock *tp = tcp_sk(sk);
++ struct wvegas *wvegas = inet_csk_ca(sk);
++
++ wvegas->doing_wvegas_now = 1;
++
++ wvegas->beg_snd_nxt = tp->snd_nxt;
++
++ wvegas->cnt_rtt = 0;
++ wvegas->sampled_rtt = 0;
++
++ wvegas->instant_rate = 0;
++ wvegas->alpha = initial_alpha;
++ wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
++
++ wvegas->queue_delay = 0;
++}
++
++static inline void wvegas_disable(const struct sock *sk)
++{
++ struct wvegas *wvegas = inet_csk_ca(sk);
++
++ wvegas->doing_wvegas_now = 0;
++}
++
++static void mptcp_wvegas_init(struct sock *sk)
++{
++ struct wvegas *wvegas = inet_csk_ca(sk);
++
++ wvegas->base_rtt = 0x7fffffff;
++ wvegas_enable(sk);
++}
++
++static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
++{
++ return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
++}
++
++static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
++{
++ struct wvegas *wvegas = inet_csk_ca(sk);
++ u32 vrtt;
++
++ if (rtt_us < 0)
++ return;
++
++ vrtt = rtt_us + 1;
++
++ if (vrtt < wvegas->base_rtt)
++ wvegas->base_rtt = vrtt;
++
++ wvegas->sampled_rtt += vrtt;
++ wvegas->cnt_rtt++;
++}
++
++static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
++{
++ if (ca_state == TCP_CA_Open)
++ wvegas_enable(sk);
++ else
++ wvegas_disable(sk);
++}
++
++static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
++{
++ if (event == CA_EVENT_CWND_RESTART) {
++ mptcp_wvegas_init(sk);
++ } else if (event == CA_EVENT_LOSS) {
++ struct wvegas *wvegas = inet_csk_ca(sk);
++ wvegas->instant_rate = 0;
++ }
++}
++
++static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
++{
++ return min(tp->snd_ssthresh, tp->snd_cwnd - 1);
++}
++
++static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
++{
++ u64 total_rate = 0;
++ struct sock *sub_sk;
++ const struct wvegas *wvegas = inet_csk_ca(sk);
++
++ if (!mpcb)
++ return wvegas->weight;
++
++
++ mptcp_for_each_sk(mpcb, sub_sk) {
++ struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
++
++ /* sampled_rtt is initialized by 0 */
++ if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
++ total_rate += sub_wvegas->instant_rate;
++ }
++
++ if (total_rate && wvegas->instant_rate)
++ return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
++ else
++ return wvegas->weight;
++}
++
++static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct wvegas *wvegas = inet_csk_ca(sk);
++
++ if (!wvegas->doing_wvegas_now) {
++ tcp_reno_cong_avoid(sk, ack, acked);
++ return;
++ }
++
++ if (after(ack, wvegas->beg_snd_nxt)) {
++ wvegas->beg_snd_nxt = tp->snd_nxt;
++
++ if (wvegas->cnt_rtt <= 2) {
++ tcp_reno_cong_avoid(sk, ack, acked);
++ } else {
++ u32 rtt, diff, q_delay;
++ u64 target_cwnd;
++
++ rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
++ target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
++
++ diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
++
++ if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
++ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
++
++ } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
++ tcp_slow_start(tp, acked);
++ } else {
++ if (diff >= wvegas->alpha) {
++ wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
++ wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
++ wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
++ }
++ if (diff > wvegas->alpha) {
++ tp->snd_cwnd--;
++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
++ } else if (diff < wvegas->alpha) {
++ tp->snd_cwnd++;
++ }
++
++ /* Try to drain link queue if needed*/
++ q_delay = rtt - wvegas->base_rtt;
++ if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
++ wvegas->queue_delay = q_delay;
++
++ if (q_delay >= 2 * wvegas->queue_delay) {
++ u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
++ tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
++ wvegas->queue_delay = 0;
++ }
++ }
++
++ if (tp->snd_cwnd < 2)
++ tp->snd_cwnd = 2;
++ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
++ tp->snd_cwnd = tp->snd_cwnd_clamp;
++
++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
++ }
++
++ wvegas->cnt_rtt = 0;
++ wvegas->sampled_rtt = 0;
++ }
++ /* Use normal slow start */
++ else if (tp->snd_cwnd <= tp->snd_ssthresh)
++ tcp_slow_start(tp, acked);
++}
++
++
++static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
++ .init = mptcp_wvegas_init,
++ .ssthresh = tcp_reno_ssthresh,
++ .cong_avoid = mptcp_wvegas_cong_avoid,
++ .pkts_acked = mptcp_wvegas_pkts_acked,
++ .set_state = mptcp_wvegas_state,
++ .cwnd_event = mptcp_wvegas_cwnd_event,
++
++ .owner = THIS_MODULE,
++ .name = "wvegas",
++};
++
++static int __init mptcp_wvegas_register(void)
++{
++ BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
++ tcp_register_congestion_control(&mptcp_wvegas);
++ return 0;
++}
++
++static void __exit mptcp_wvegas_unregister(void)
++{
++ tcp_unregister_congestion_control(&mptcp_wvegas);
++}
++
++module_init(mptcp_wvegas_register);
++module_exit(mptcp_wvegas_unregister);
++
++MODULE_AUTHOR("Yu Cao, Enhuan Dong");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("MPTCP wVegas");
++MODULE_VERSION("0.1");
next reply other threads:[~2014-10-06 11:38 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-10-06 11:38 Mike Pagano [this message]
-- strict thread matches above, loose matches on Subject: below --
2014-12-16 17:29 [gentoo-commits] proj/linux-patches:3.16 commit in: / Mike Pagano
2014-11-29 18:11 Mike Pagano
2014-11-29 18:05 Mike Pagano
2014-11-29 18:05 Mike Pagano
2014-11-29 18:05 Mike Pagano
2014-10-30 19:29 Mike Pagano
2014-10-15 12:42 Mike Pagano
2014-10-09 19:54 Mike Pagano
2014-10-07 1:34 Anthony G. Basile
2014-10-07 1:28 Anthony G. Basile
2014-10-06 11:39 Mike Pagano
2014-10-06 11:16 Anthony G. Basile
2014-10-06 11:16 Anthony G. Basile
2014-09-27 13:37 Mike Pagano
2014-09-26 19:40 Mike Pagano
2014-09-22 23:37 Mike Pagano
2014-09-17 22:19 Anthony G. Basile
2014-09-09 21:38 Vlastimil Babka
2014-08-26 12:16 Mike Pagano
2014-08-19 11:44 Mike Pagano
2014-08-14 11:51 ` Mike Pagano
2014-08-08 19:48 Mike Pagano
2014-08-19 11:44 ` Mike Pagano
2014-07-15 12:23 Mike Pagano
2014-07-15 12:18 Mike Pagano
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1412595522.f2ea3e49d07e5b148c974633ec003ba2382f1189.mpagano@gentoo \
--to=mpagano@gentoo.org \
--cc=gentoo-commits@lists.gentoo.org \
--cc=gentoo-dev@lists.gentoo.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox