From 77b228276c0c160b3be23cac758817df1a2ca2ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ole=20Andr=C3=A9=20Vadla=20Ravn=C3=A5s?= Date: Tue, 10 May 2022 00:47:37 +0200 Subject: [PATCH] Make it possible to build IP stack as a module This is not pretty, but it works. --- include/net/ip.h | 4 +- net/.gitignore | 4 + net/Kconfig | 2 +- net/core/secure_seq.c | 2 +- net/ipv4/Makefile | 30 +- net/ipv4/af_inet.c | 35 + net/ipv4/inet_hashtables.c | 1 + net/ipv4/inet_timewait_sock.c | 1 + net/ipv4/module_lib.c | 1370 +++++++++++++++++++++++++++++++++ net/ipv4/proc.c | 2 + net/ipv4/sysctl_net_ipv4.c | 6 + net/ipv4/sysfs_net_ipv4.c | 6 + net/ipv4/tcp.c | 8 + net/ipv4/tcp_cong.c | 7 + net/ipv4/tcp_fastopen.c | 6 + net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_memcontrol.c | 6 + net/ipv4/udp.c | 2 + net/ipv6/Makefile | 7 + net/ipv6/af_inet6.c | 4 + net/ipv6/exthdrs_offload.c | 2 +- net/ipv6/ip6_offload.c | 5 +- net/ipv6/ip6_offload.h | 1 + net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/tcpv6_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 26 files changed, 1494 insertions(+), 25 deletions(-) create mode 100644 net/.gitignore create mode 100644 net/ipv4/module_lib.c diff --git a/include/net/ip.h b/include/net/ip.h index 1f6794b2..65ac5fab 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -234,7 +234,7 @@ static inline bool ip_is_fragment(const struct iphdr *iph) return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0; } -#ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_INET) #include /* The function in 2.2 was invalid, producing wrong result for @@ -404,7 +404,7 @@ enum ip_defrag_users { }; int ip_defrag(struct sk_buff *skb, u32 user); -#ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_INET) struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user); #else static inline struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) diff --git a/net/.gitignore b/net/.gitignore new file mode 100644 index 00000000..b3f74bdb --- /dev/null +++ b/net/.gitignore @@ -0,0 +1,4 @@ +# +# Generated files +# +Module.symvers diff --git a/net/Kconfig b/net/Kconfig index 2a680dad..4e986e85 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -51,7 +51,7 @@ source "net/xfrm/Kconfig" source "net/iucv/Kconfig" config INET - bool "TCP/IP networking" + tristate "TCP/IP networking" select CRYPTO select CRYPTO_AES ---help--- diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index d0afc322..533ebfc8 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -32,7 +32,7 @@ static void net_secret_init(void) } #endif -#ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_INET) static u32 seq_scale(u32 seq) { /* diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5a9af0a9..e6188ede 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -2,7 +2,9 @@ # Makefile for the Linux TCP/IP (INET) layer. 
# -obj-y := route.o inetpeer.o protocol.o \ +obj-$(CONFIG_INET) += ipv4.o + +ipv4-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ inet_timewait_sock.o inet_connection_sock.o \ @@ -13,17 +15,21 @@ obj-y := route.o inetpeer.o protocol.o \ fib_frontend.o fib_semantics.o fib_trie.o \ inet_fragment.o ping.o +ifeq ($(CONFIG_INET),m) +ipv4-y += module_lib.o +endif + obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o -obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o -obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o -obj-$(CONFIG_PROC_FS) += proc.o -obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o -obj-$(CONFIG_IP_MROUTE) += ipmr.o +ipv4-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o +ipv4-$(CONFIG_SYSFS) += sysfs_net_ipv4.o +ipv4-$(CONFIG_PROC_FS) += proc.o +ipv4-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o +ipv4-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o -obj-$(CONFIG_SYN_COOKIES) += syncookies.o +ipv4-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o @@ -33,8 +39,8 @@ obj-$(CONFIG_INET_LRO) += inet_lro.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o -obj-$(CONFIG_IP_PNP) += ipconfig.o -obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ +ipv4-$(CONFIG_IP_PNP) += ipconfig.o +ipv4-$(CONFIG_NETFILTER) += netfilter.o netfilter/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o @@ -51,8 +57,8 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o -obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +ipv4-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o +ipv4-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ +ipv4-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f022e0e9..ca900ac8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,6 +119,10 @@ #include #endif +MODULE_AUTHOR("Cast of dozens"); +MODULE_DESCRIPTION("IPv4 protocol stack for Linux"); +MODULE_LICENSE("GPL"); + #ifdef CONFIG_ANDROID_PARANOID_NETWORK #include @@ -1702,13 +1706,23 @@ static int __init ipv4_offload_init(void) return 0; } +#ifndef CONFIG_INET_MODULE fs_initcall(ipv4_offload_init); +#endif static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, }; +#ifdef CONFIG_INET_MODULE +int tcp_congestion_init(void); +int tcp_fastopen_init(void); +int sysctl_ipv4_init(void); +int sysfs_ipv4_init(void); +int tcp_memcontrol_init(void); +#endif + static int __init inet_init(void) { struct inet_protosw *q; @@ -1717,6 +1731,12 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); +#ifdef CONFIG_INET_MODULE + tcp_congestion_init(); + tcp_fastopen_init(); + ipv4_offload_init(); +#endif + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); if (!sysctl_local_reserved_ports) goto out; @@ -1823,6 +1843,21 @@ static int __init inet_init(void) dev_add_pack(&ip_packet_type); 
+#ifdef CONFIG_INET_MODULE +#ifdef CONFIG_SYSCTL + sysctl_ipv4_init(); +#endif +#ifdef CONFIG_SYSFS + sysfs_ipv4_init(); +#endif +#ifdef CONFIG_MEMCG_KMEM + tcp_memcontrol_init(); +#endif + + /* TODO: Implement unload logic */ + try_module_get(THIS_MODULE); +#endif + rc = 0; out: return rc; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c95848d0..1e321e2b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -582,6 +582,7 @@ out: return ret; } } +EXPORT_SYMBOL(__inet_hash_connect); /* * Bind a port for a connect operation and hash it. diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1f27c9f4..0f599b21 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -65,6 +65,7 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, */ return 1; } +EXPORT_SYMBOL(inet_twsk_unhash); /* Must be called with locally disabled BHs. */ static void __inet_twsk_kill(struct inet_timewait_sock *tw, diff --git a/net/ipv4/module_lib.c b/net/ipv4/module_lib.c new file mode 100644 index 00000000..3972ee3f --- /dev/null +++ b/net/ipv4/module_lib.c @@ -0,0 +1,1370 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Bits from fs/pipe.c + */ + +void pipe_wait(struct pipe_inode_info *pipe) +{ + DEFINE_WAIT(wait); + + /* + * Pipes are system-local resources, so sleeping on them + * is considered a noninteractive wait: + */ + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->wait, &wait); + pipe_lock(pipe); +} + + +/* + * Bits from fs/splice.c + */ + +static void wakeup_pipe_readers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +} + +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + unsigned int spd_pages = spd->nr_pages; + int ret, do_wakeup, page_nr; + + ret = 0; + do_wakeup = 0; + page_nr = 0; + + pipe_lock(pipe); + + for (;;) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; + buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->files) + do_wakeup = 1; + + if (!--spd->nr_pages) + break; + if (pipe->nrbufs < pipe->buffers) + continue; + + break; + } + + if (spd->flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + pipe_unlock(pipe); + + if (do_wakeup) + wakeup_pipe_readers(pipe); + + while (page_nr < spd_pages) + spd->spd_release(spd, page_nr++); + + return ret; +} + + +/* + * Bits from kernel/res_counter.c + */ + +void res_counter_init(struct 
res_counter *counter, struct res_counter *parent) +{ + spin_lock_init(&counter->lock); + counter->limit = RESOURCE_MAX; + counter->soft_limit = RESOURCE_MAX; + counter->parent = parent; +} + +static inline unsigned long long * +res_counter_member(struct res_counter *counter, int member) +{ + switch (member) { + case RES_USAGE: + return &counter->usage; + case RES_MAX_USAGE: + return &counter->max_usage; + case RES_LIMIT: + return &counter->limit; + case RES_FAILCNT: + return &counter->failcnt; + case RES_SOFT_LIMIT: + return &counter->soft_limit; + }; + + BUG(); + return NULL; +} + +#if BITS_PER_LONG == 32 +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&counter->lock, flags); + ret = *res_counter_member(counter, member); + spin_unlock_irqrestore(&counter->lock, flags); + + return ret; +} +#else +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + return *res_counter_member(counter, member); +} +#endif + +int res_counter_memparse_write_strategy(const char *buf, + unsigned long long *res) +{ + char *end; + + /* return RESOURCE_MAX(unlimited) if "-1" is specified */ + if (*buf == '-') { + *res = simple_strtoull(buf + 1, &end, 10); + if (*res != 1 || *end != '\0') + return -EINVAL; + *res = RESOURCE_MAX; + return 0; + } + + *res = memparse(buf, &end); + if (*end != '\0') + return -EINVAL; + + *res = PAGE_ALIGN(*res); + return 0; +} + + +/* + * Bits from kernel/sysctl.c + */ + +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + +#define TMPBUFLEN 22 +static int proc_get_long(char **buf, size_t *size, + unsigned long *val, bool *neg, + const char *perm_tr, unsigned perm_tr_len, char *tr) +{ + int len; + char *p, tmp[TMPBUFLEN]; + + if (!*size) + return -EINVAL; + + len = *size; + if (len > TMPBUFLEN - 1) + len = TMPBUFLEN - 1; + + memcpy(tmp, *buf, len); + + tmp[len] = 0; + p = tmp; + if (*p == '-' && *size > 1) { + *neg = true; + p++; + } else + *neg = false; + if (!isdigit(*p)) + return -EINVAL; + + *val = simple_strtoul(p, &p, 0); + + len = p - tmp; + + /* We don't know if the next char is whitespace thus we may accept + * invalid integers (e.g. 1234...a) or two integers instead of one + * (e.g. 123...1). So lets not allow such large numbers. */ + if (len == TMPBUFLEN - 1) + return -EINVAL; + + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) + return -EINVAL; + + if (tr && (len < *size)) + *tr = *p; + + *buf += len; + *size -= len; + + return 0; +} + +static int proc_put_long(void __user **buf, size_t *size, unsigned long val, + bool neg) +{ + int len; + char tmp[TMPBUFLEN], *p = tmp; + + sprintf(p, "%s%lu", neg ? 
"-" : "", val); + len = strlen(tmp); + if (len > *size) + len = *size; + if (copy_to_user(*buf, tmp, len)) + return -EFAULT; + *size -= len; + *buf += len; + return 0; +} +#undef TMPBUFLEN + +static int proc_put_char(void __user **buf, size_t *size, char c) +{ + if (*size) { + char __user **buffer = (char __user **)buf; + if (put_user(c, *buffer)) + return -EFAULT; + (*size)--, (*buffer)++; + *buf = *buffer; + } + return 0; +} + +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + unsigned long page = 0; + char *kbuf; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + free_page(page); + return -EFAULT; + } + kbuf[left] = 0; + + tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + GFP_KERNEL); + if (!tmp_bitmap) { + free_page(page); + return -ENOMEM; + } + proc_skip_char(&kbuf, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + + err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + kbuf++; + left--; + } + + if (c == '-') { + err = proc_get_long(&kbuf, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + kbuf++; + left--; + } + } + + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); + first = 0; + proc_skip_char(&kbuf, &left, '\n'); + } + free_page(page); + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) { + err = proc_put_char(&buffer, &left, ','); + if (err) + break; + } + err = proc_put_long(&buffer, &left, bit_a, false); + if (err) + break; + if (bit_a != bit_b) { + err = proc_put_char(&buffer, &left, '-'); + if (err) + break; + err = proc_put_long(&buffer, &left, bit_b, false); + if (err) + break; + } + + first = 0; bit_b++; + } + if (!err) + err = proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); + } + kfree(tmp_bitmap); + *lenp -= left; + *ppos += *lenp; + return 0; + } else { + kfree(tmp_bitmap); + return err; + } +} + + +/* + * Bits from mm/memcontrol.c + */ + +enum mem_cgroup_stat_index { + /* + * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 
+ */ + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ + MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ + MEM_CGROUP_STAT_NSTATS, +}; + +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ + MEM_CGROUP_EVENTS_NSTATS, +}; + +enum mem_cgroup_events_target { + MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_TARGET_NUMAINFO, + MEM_CGROUP_NTARGETS, +}; + +struct mem_cgroup_stat_cpu { + long count[MEM_CGROUP_STAT_NSTATS]; + unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct mem_cgroup_reclaim_iter { + /* + * last scanned hierarchy member. Valid only if last_dead_count + * matches memcg->dead_count of the hierarchy root group. + */ + struct mem_cgroup *last_visited; + unsigned long last_dead_count; + + /* scan generation, increased every round-trip */ + unsigned int generation; +}; + +struct mem_cgroup_per_zone { + struct lruvec lruvec; + unsigned long lru_size[NR_LRU_LISTS]; + + struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + + struct rb_node tree_node; /* RB tree node */ + unsigned long long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; + struct mem_cgroup *memcg; /* Back pointer, we cannot */ + /* use container_of */ +}; + +struct mem_cgroup_per_node { + struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; +}; + +struct mem_cgroup_lru_info { + struct mem_cgroup_per_node *nodeinfo[0]; +}; + +struct mem_cgroup_thresholds { + /* Primary thresholds array */ + struct mem_cgroup_threshold_ary *primary; + /* + * Spare threshold array. + * This is needed to make mem_cgroup_unregister_event() "never fail". + * It must be able to store at least primary->size - 1 entries. + */ + struct mem_cgroup_threshold_ary *spare; +}; + +struct mem_cgroup { + struct cgroup_subsys_state css; + /* + * the counter to account for memory usage + */ + struct res_counter res; + + /* vmpressure notifications */ + struct vmpressure vmpressure; + + union { + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; + + /* + * rcu_freeing is used only when freeing struct mem_cgroup, + * so put it into a union to avoid wasting more memory. + * It must be disjoint from the css field. It could be + * in a union with the res field, but res plays a much + * larger part in mem_cgroup life than memsw, and might + * be of interest, even at time of free, when debugging. + * So share rcu_head with the less interesting memsw. + */ + struct rcu_head rcu_freeing; + /* + * We also need some space for a worker in deferred freeing. + * By the time we call it, rcu_freeing is no longer in use. + */ + struct work_struct work_freeing; + }; + + /* + * the counter to account for kernel memory usage. + */ + struct res_counter kmem; + /* + * Should the accounting and control be hierarchical, per subtree? 
+ */ + bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ + + bool oom_lock; + atomic_t under_oom; + atomic_t oom_wakeups; + + atomic_t refcnt; + + int swappiness; + /* OOM-Killer disable */ + int oom_kill_disable; + + /* set when res.limit == memsw.limit */ + bool memsw_is_minimum; + + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_thresholds thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; + + /* + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + /* + * percpu counter. + */ + struct mem_cgroup_stat_cpu __percpu *stat; + /* + * used when a cpu is offlined or other synchronizations + * See mem_cgroup_read_stat(). + */ + struct mem_cgroup_stat_cpu nocpu_base; + spinlock_t pcp_counter_lock; + + atomic_t dead_count; +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) + struct tcp_memcontrol tcp_mem; +#endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif + + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + atomic_t numainfo_events; + atomic_t numainfo_updating; +#endif + + /* + * Per cgroup active and inactive list, similar to the + * per zone LRU lists. + * + * WARNING: This has to be the last element of the struct. Don't + * add new fields after this point. + */ + struct mem_cgroup_lru_info info; +}; + +static inline +struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) +{ + return container_of(s, struct mem_cgroup, css); +} + +struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +{ + return mem_cgroup_from_css( + cgroup_subsys_state(cont, mem_cgroup_subsys_id)); +} + +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) +{ + /* + * mm_update_next_owner() may clear mm->owner to NULL + * if it races with swapoff, page migration, etc. + * So this can be called with p == NULL. + */ + if (unlikely(!p)) + return NULL; + + return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); +} + + +/* + * Bits from mm/page_alloc.c + * + * Slightly simplified to avoid depending on nr_kernel_pages, nr_all_pages, + * and alloc_bootmem_nopanic(). 
+ */ + +void *alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long low_limit, + unsigned long high_limit) +{ + unsigned long long max = high_limit; + unsigned long log2qty, size; + void *table = NULL; + + /* allow the kernel cmdline to have a say */ + if (!numentries) + numentries = PAGE_SIZE / bucketsize; + numentries = roundup_pow_of_two(numentries); + + /* limit allocation size */ + if (max == 0) { + max = 64 * 1024; + do_div(max, bucketsize); + } + max = min(max, 0x80000000ULL); + + if (numentries < low_limit) + numentries = low_limit; + if (numentries > max) + numentries = max; + + log2qty = ilog2(numentries); + + do { + size = bucketsize << log2qty; + /* + * If bucketsize is not a power-of-two, we may free + * some pages at the end of hash table which + * alloc_pages_exact() automatically does + */ + if (get_order(size) < MAX_ORDER) { + table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + } + } while (!table && size > PAGE_SIZE && --log2qty); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", + tablename, + (1UL << log2qty), + ilog2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} + + +/* + * Bits from net/core/dst.c + */ + +const u32 dst_default_metrics[RTAX_MAX + 1] = { + /* This initializer is needed to force linker to place this variable + * into const section. Otherwise it might end into bss section. + * We really want to avoid false sharing on this variable, and catch + * any writes on it. 
+ */ + [RTAX_MAX] = 0xdeadbeef, +}; + + +/* + * Bits from net/core/neighbour.c + */ + +#define PNEIGH_HASHMASK 0xF + +static u32 pneigh_hash(const void *pkey, int key_len) +{ + u32 hash_val = *(u32 *)(pkey + key_len - 4); + hash_val ^= (hash_val >> 16); + hash_val ^= hash_val >> 8; + hash_val ^= hash_val >> 4; + hash_val &= PNEIGH_HASHMASK; + return hash_val; +} + +int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n, **np; + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + write_lock_bh(&tbl->lock); + for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + np = &n->next) { + if (!memcmp(n->key, pkey, key_len) && n->dev == dev && + net_eq(pneigh_net(n), net)) { + *np = n->next; + write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) + tbl->pdestructor(n); + if (n->dev) + dev_put(n->dev); + release_net(pneigh_net(n)); + kfree(n); + return 0; + } + } + write_unlock_bh(&tbl->lock); + return -ENOENT; +} + + +/* + * Bits from net/core/request_sock.c + */ + +int reqsk_queue_alloc(struct request_sock_queue *queue, + unsigned int nr_table_entries) +{ + size_t lopt_size = sizeof(struct listen_sock); + struct listen_sock *lopt; + + nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); + nr_table_entries = max_t(u32, nr_table_entries, 8); + nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); + lopt_size += nr_table_entries * sizeof(struct request_sock *); + if (lopt_size > PAGE_SIZE) + lopt = vzalloc(lopt_size); + else + lopt = kzalloc(lopt_size, GFP_KERNEL); + if (lopt == NULL) + return -ENOMEM; + + for (lopt->max_qlen_log = 3; + (1 << lopt->max_qlen_log) < nr_table_entries; + lopt->max_qlen_log++); + + get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); + rwlock_init(&queue->syn_wait_lock); + queue->rskq_accept_head = NULL; + lopt->nr_table_entries = nr_table_entries; + + write_lock_bh(&queue->syn_wait_lock); + queue->listen_opt = lopt; + write_unlock_bh(&queue->syn_wait_lock); + + return 0; +} + +void __reqsk_queue_destroy(struct request_sock_queue *queue) +{ + struct listen_sock *lopt; + size_t lopt_size; + + /* + * this is an error recovery path only + * no locking needed and the lopt is not NULL + */ + + lopt = queue->listen_opt; + lopt_size = sizeof(struct listen_sock) + + lopt->nr_table_entries * sizeof(struct request_sock *); + + if (lopt_size > PAGE_SIZE) + vfree(lopt); + else + kfree(lopt); +} + +static inline struct listen_sock *reqsk_queue_yank_listen_sk( + struct request_sock_queue *queue) +{ + struct listen_sock *lopt; + + write_lock_bh(&queue->syn_wait_lock); + lopt = queue->listen_opt; + queue->listen_opt = NULL; + write_unlock_bh(&queue->syn_wait_lock); + + return lopt; +} + +void reqsk_queue_destroy(struct request_sock_queue *queue) +{ + /* make all the listen_opt local to us */ + struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); + size_t lopt_size = sizeof(struct listen_sock) + + lopt->nr_table_entries * sizeof(struct request_sock *); + + if (lopt->qlen != 0) { + unsigned int i; + + for (i = 0; i < lopt->nr_table_entries; i++) { + struct request_sock *req; + + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + reqsk_free(req); + } + } + } + + WARN_ON(lopt->qlen != 0); + if (lopt_size > PAGE_SIZE) + vfree(lopt); + else + kfree(lopt); +} + +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + bool reset) +{ + struct sock *lsk = tcp_rsk(req)->listener; + struct 
fastopen_queue *fastopenq = + inet_csk(lsk)->icsk_accept_queue.fastopenq; + + tcp_sk(sk)->fastopen_rsk = NULL; + spin_lock_bh(&fastopenq->lock); + fastopenq->qlen--; + tcp_rsk(req)->listener = NULL; + if (req->sk) /* the child socket hasn't been accepted yet */ + goto out; + + if (!reset || lsk->sk_state != TCP_LISTEN) { + /* If the listener has been closed don't bother with the + * special RST handling below. + */ + spin_unlock_bh(&fastopenq->lock); + sock_put(lsk); + reqsk_free(req); + return; + } + /* Wait for 60secs before removing a req that has triggered RST. + * This is a simple defense against TFO spoofing attack - by + * counting the req against fastopen.max_qlen, and disabling + * TFO when the qlen exceeds max_qlen. + * + * For more details see CoNext'11 "TCP Fast Open" paper. + */ + req->expires = jiffies + 60*HZ; + if (fastopenq->rskq_rst_head == NULL) + fastopenq->rskq_rst_head = req; + else + fastopenq->rskq_rst_tail->dl_next = req; + + req->dl_next = NULL; + fastopenq->rskq_rst_tail = req; + fastopenq->qlen++; +out: + spin_unlock_bh(&fastopenq->lock); + sock_put(lsk); + return; +} + + +/* + * Bits from net/core/secure_seq.c + */ + +#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) + +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; + +static void net_secret_init(void) +{ + u32 tmp; + int i; + + if (likely(net_secret[0])) + return; + + for (i = NET_SECRET_SIZE; i > 0;) { + do { + get_random_bytes(&tmp, sizeof(tmp)); + } while (!tmp); + cmpxchg(&net_secret[--i], 0, tmp); + } +} + +static u32 seq_scale(u32 seq) +{ + /* + * As close as possible to RFC 793, which + * suggests using a 250 kHz clock. + * Further reading shows this assumes 2 Mb/s networks. + * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate. + * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but + * we also need to limit the resolution so that the u32 seq + * overlaps less than one time per MSL (2 minutes). + * Choosing a clock of 64 ns period is OK. 
(period of 274 s) + */ + return seq + (ktime_to_ns(ktime_get_real()) >> 6); +} + +__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + + net_secret_init(); + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + return seq_scale(hash[0]); +} + +u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + + net_secret_init(); + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = (__force u32)dport ^ net_secret[14]; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + return hash[0]; +} + +#if IS_ENABLED(CONFIG_IPV6) +__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + net_secret_init(); + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[i] + (__force u32)daddr[i]; + secret[4] = net_secret[4] + + (((__force u16)sport << 16) + (__force u16)dport); + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[i]; + + md5_transform(hash, secret); + + return seq_scale(hash[0]); +} +EXPORT_SYMBOL(secure_tcpv6_sequence_number); + +u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, + __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + net_secret_init(); + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[i] + (__force u32) daddr[i]; + secret[4] = net_secret[4] + (__force u32)dport; + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[i]; + + md5_transform(hash, secret); + + return hash[0]; +} +EXPORT_SYMBOL(secure_ipv6_port_ephemeral); +#endif + + +/* + * Bits from net/core/skbuff.c + */ + +static struct page *linear_to_page(struct page *page, unsigned int *len, + unsigned int *offset, + struct sock *sk) +{ + struct page_frag *pfrag = sk_page_frag(sk); + + if (!sk_page_frag_refill(sk, pfrag)) + return NULL; + + *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); + + memcpy(page_address(pfrag->page) + pfrag->offset, + page_address(page) + *offset, *len); + *offset = pfrag->offset; + pfrag->offset += *len; + + return pfrag->page; +} + +static bool spd_can_coalesce(const struct splice_pipe_desc *spd, + struct page *page, + unsigned int offset) +{ + return spd->nr_pages && + spd->pages[spd->nr_pages - 1] == page && + (spd->partial[spd->nr_pages - 1].offset + + spd->partial[spd->nr_pages - 1].len == offset); +} + +static bool spd_fill_page(struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, struct page *page, + unsigned int *len, unsigned int offset, + bool linear, + struct sock *sk) +{ + if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) + return true; + + if (linear) { + page = linear_to_page(page, len, &offset, sk); + if (!page) + return true; + } + if (spd_can_coalesce(spd, page, offset)) { + spd->partial[spd->nr_pages - 1].len += *len; + return false; + } + get_page(page); + spd->pages[spd->nr_pages] = page; + spd->partial[spd->nr_pages].len = *len; + spd->partial[spd->nr_pages].offset = offset; + spd->nr_pages++; + + return false; +} + +static bool __splice_segment(struct page *page, unsigned int poff, + unsigned int plen, unsigned int *off, + unsigned int *len, + struct splice_pipe_desc *spd, bool linear, + 
struct sock *sk, + struct pipe_inode_info *pipe) +{ + if (!*len) + return true; + + /* skip this segment if already processed */ + if (*off >= plen) { + *off -= plen; + return false; + } + + /* ignore any bits we already processed */ + poff += *off; + plen -= *off; + *off = 0; + + do { + unsigned int flen = min(*len, plen); + + if (spd_fill_page(spd, pipe, page, &flen, poff, + linear, sk)) + return true; + poff += flen; + plen -= flen; + *len -= flen; + } while (*len && plen); + + return false; +} + +static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, + unsigned int *offset, unsigned int *len, + struct splice_pipe_desc *spd, struct sock *sk) +{ + int seg; + + /* map the linear part : + * If skb->head_frag is set, this 'linear' part is backed by a + * fragment, and if the head is not shared with any clones then + * we can avoid a copy since we own the head portion of this page. + */ + if (__splice_segment(virt_to_page(skb->data), + (unsigned long) skb->data & (PAGE_SIZE - 1), + skb_headlen(skb), + offset, len, spd, + skb_head_is_locked(skb), + sk, pipe)) + return true; + + /* + * then map the fragments + */ + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { + const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + + if (__splice_segment(skb_frag_page(f), + f->page_offset, skb_frag_size(f), + offset, len, spd, false, sk, pipe)) + return true; + } + + return false; +} + +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + put_page(spd->pages[i]); +} + +int skb_splice_bits(struct sk_buff *skb, unsigned int offset, + struct pipe_inode_info *pipe, unsigned int tlen, + unsigned int flags) +{ + struct partial_page partial[MAX_SKB_FRAGS]; + struct page *pages[MAX_SKB_FRAGS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = MAX_SKB_FRAGS, + .flags = flags, + .ops = &nosteal_pipe_buf_ops, + .spd_release = sock_spd_release, + }; + struct sk_buff *frag_iter; + struct sock *sk = skb->sk; + int ret = 0; + + /* + * __skb_splice_bits() only fails if the output has no room left, + * so no point in going over the frag_list for the error case. + */ + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) + goto done; + else if (!tlen) + goto done; + + /* + * now see if we have a frag_list to map + */ + skb_walk_frags(skb, frag_iter) { + if (!tlen) + break; + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) + break; + } + +done: + if (spd.nr_pages) { + /* + * Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. 
+ */ + release_sock(sk); + ret = splice_to_pipe(pipe, &spd); + lock_sock(sk); + } + + return ret; +} + +static int skb_prepare_for_shift(struct sk_buff *skb) +{ + return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) +{ + int from, to, merge, todo; + struct skb_frag_struct *fragfrom, *fragto; + + BUG_ON(shiftlen > skb->len); + BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + todo = shiftlen; + from = 0; + to = skb_shinfo(tgt)->nr_frags; + fragfrom = &skb_shinfo(skb)->frags[from]; + + /* Actual merge is delayed until the point when we know we can + * commit all, so that we don't have to undo partial changes + */ + if (!to || + !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), + fragfrom->page_offset)) { + merge = -1; + } else { + merge = to - 1; + + todo -= skb_frag_size(fragfrom); + if (todo < 0) { + if (skb_prepare_for_shift(skb) || + skb_prepare_for_shift(tgt)) + return 0; + + /* All previous frag pointers might be stale! */ + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + skb_frag_size_add(fragto, shiftlen); + skb_frag_size_sub(fragfrom, shiftlen); + fragfrom->page_offset += shiftlen; + + goto onlymerged; + } + + from++; + } + + /* Skip full, not-fitting skb to avoid expensive operations */ + if ((shiftlen == skb->len) && + (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) + return 0; + + if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) + return 0; + + while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { + if (to == MAX_SKB_FRAGS) + return 0; + + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[to]; + + if (todo >= skb_frag_size(fragfrom)) { + *fragto = *fragfrom; + todo -= skb_frag_size(fragfrom); + from++; + to++; + + } else { + __skb_frag_ref(fragfrom); + fragto->page = fragfrom->page; + fragto->page_offset = fragfrom->page_offset; + skb_frag_size_set(fragto, todo); + + fragfrom->page_offset += todo; + skb_frag_size_sub(fragfrom, todo); + todo = 0; + + to++; + break; + } + } + + /* Ready to "commit" this state change to tgt */ + skb_shinfo(tgt)->nr_frags = to; + + if (merge >= 0) { + fragfrom = &skb_shinfo(skb)->frags[0]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + skb_frag_size_add(fragto, skb_frag_size(fragfrom)); + __skb_frag_unref(fragfrom); + } + + /* Reposition in the original skb */ + to = 0; + while (from < skb_shinfo(skb)->nr_frags) + skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; + skb_shinfo(skb)->nr_frags = to; + + BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); + +onlymerged: + /* Most likely the tgt won't ever need its checksum anymore, skb on + * the other hand might need it if it needs to be resent + */ + tgt->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; + + /* Yak, is it really working this way? Some helper please? 
*/ + skb->len -= shiftlen; + skb->data_len -= shiftlen; + skb->truesize -= shiftlen; + tgt->len += shiftlen; + tgt->data_len += shiftlen; + tgt->truesize += shiftlen; + + return shiftlen; +} + + +/* + * Bits from net/core/sock.c + */ + +#define _SK_MEM_PACKETS 256 +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2a5bf86d..9764bf37 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,7 +59,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sockets = proto_sockets_allocated_sum_positive(&tcp_prot); local_bh_enable(); +#ifndef CONFIG_INET_MODULE socket_seq_show(seq); +#endif seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, tcp_death_row.tw_count, sockets, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2c707a91..7215c207 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -958,7 +958,11 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { .exit = ipv4_sysctl_exit_net, }; +#ifdef CONFIG_INET_MODULE +int sysctl_ipv4_init(void) +#else static __init int sysctl_ipv4_init(void) +#endif { struct ctl_table_header *hdr; struct ctl_table *i; @@ -984,4 +988,6 @@ static __init int sysctl_ipv4_init(void) return 0; } +#ifndef CONFIG_INET_MODULE __initcall(sysctl_ipv4_init); +#endif diff --git a/net/ipv4/sysfs_net_ipv4.c b/net/ipv4/sysfs_net_ipv4.c index 0cbbf100..2d441831 100644 --- a/net/ipv4/sysfs_net_ipv4.c +++ b/net/ipv4/sysfs_net_ipv4.c @@ -67,7 +67,11 @@ static struct attribute_group ipv4_attr_group = { .attrs = ipv4_attrs, }; +#ifdef CONFIG_INET_MODULE +int sysfs_ipv4_init(void) +#else static __init int sysfs_ipv4_init(void) +#endif { struct kobject *ipv4_kobject; int ret; @@ -85,4 +89,6 @@ static __init int sysfs_ipv4_init(void) return 0; } +#ifndef CONFIG_INET_MODULE subsys_initcall(sysfs_ipv4_init); +#endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8cc9b549..d9a929fb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1245,8 +1245,10 @@ out: out_nopush: release_sock(sk); +#ifndef CONFIG_INET_MODULE if (copied + copied_syn) uid_stat_tcp_snd(current_uid(), copied + copied_syn); +#endif return copied + copied_syn; do_fault: @@ -1551,7 +1553,9 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); +#ifndef CONFIG_INET_MODULE uid_stat_tcp_rcv(current_uid(), copied); +#endif } return copied; } @@ -1957,8 +1961,10 @@ skip_copy: release_sock(sk); +#ifndef CONFIG_INET_MODULE if (copied > 0) uid_stat_tcp_rcv(current_uid(), copied); +#endif return copied; out: @@ -1967,8 +1973,10 @@ out: recv_urg: err = tcp_recv_urg(sk, msg, len, flags); +#ifndef CONFIG_INET_MODULE if (err > 0) uid_stat_tcp_rcv(current_uid(), err); +#endif goto out; recv_sndq: diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 019c2389..49d655c0 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -137,12 +137,19 @@ int tcp_set_default_congestion_control(const char *name) return ret; } +#ifdef CONFIG_INET_MODULE +int tcp_congestion_init(void) +#else /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) +#endif { return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); } + +#ifndef CONFIG_INET_MODULE late_initcall(tcp_congestion_default); +#endif /* 
Build string with list of available congestion control values */ diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8f7ef0ad..3ce673b9 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -80,7 +80,11 @@ void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) rcu_read_unlock(); } +#ifdef CONFIG_INET_MODULE +int tcp_fastopen_init(void) +#else static int __init tcp_fastopen_init(void) +#endif { __u8 key[TCP_FASTOPEN_KEY_LENGTH]; @@ -89,4 +93,6 @@ static int __init tcp_fastopen_init(void) return 0; } +#ifndef CONFIG_INET_MODULE late_initcall(tcp_fastopen_init); +#endif diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ef47406d..32bac336 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2891,7 +2891,7 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG_KMEM) && !defined(CONFIG_INET_MODULE) .init_cgroup = tcp_init_cgroup, .destroy_cgroup = tcp_destroy_cgroup, .proto_cgroup = tcp_proto_cgroup, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index da14436c..377a6f8e 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -280,9 +280,15 @@ static struct cftype tcp_files[] = { { } /* terminate */ }; +#ifdef CONFIG_INET_MODULE +int tcp_memcontrol_init(void) +#else static int __init tcp_memcontrol_init(void) +#endif { WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); return 0; } +#ifndef CONFIG_INET_MODULE __initcall(tcp_memcontrol_init); +#endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 58c5dbd5..94910949 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1413,7 +1413,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) is_udplite); UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); +#ifndef CONFIG_INET_MODULE trace_udp_fail_queue_rcv_skb(rc, sk); +#endif return -1; } diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 470a9c00..da4db0e7 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -40,7 +40,14 @@ obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-$(CONFIG_IPV6_GRE) += ip6_gre.o +ifeq ($(CONFIG_INET),m) +ipv6-y += ip6_icmp.o +ipv6-y += output_core.o protocol.o $(ipv6-offload) + +ipv6-y += inet6_hashtables.o +else obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o +endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d29ae19a..08a98f0e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -77,6 +77,8 @@ static inline int current_has_network(void) } #endif +#include "ip6_offload.h" + MODULE_AUTHOR("Cast of dozens"); MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); MODULE_LICENSE("GPL"); @@ -850,6 +852,8 @@ static int __init inet6_init(void) BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); + ipv6_offload_init(); + /* Register the socket-side information for inet6_create. 
*/ for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c index 447a7fbd..9e95c0fc 100644 --- a/net/ipv6/exthdrs_offload.c +++ b/net/ipv6/exthdrs_offload.c @@ -20,7 +20,7 @@ static const struct net_offload dstopt_offload = { .flags = INET6_PROTO_GSO_EXTHDR, }; -int __init ipv6_exthdrs_offload_init(void) +int ipv6_exthdrs_offload_init(void) { int ret; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 71b766ee..e0130f35 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -264,7 +264,7 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { }, }; -static int __init ipv6_offload_init(void) +void ipv6_offload_init(void) { if (tcpv6_offload_init() < 0) @@ -275,7 +275,4 @@ static int __init ipv6_offload_init(void) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); dev_add_offload(&ipv6_packet_offload); - return 0; } - -fs_initcall(ipv6_offload_init); diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h index 2e155c65..e474f886 100644 --- a/net/ipv6/ip6_offload.h +++ b/net/ipv6/ip6_offload.h @@ -11,6 +11,7 @@ #ifndef __ip6_offload_h #define __ip6_offload_h +void ipv6_offload_init(void); int ipv6_exthdrs_offload_init(void); int udp_offload_init(void); int tcpv6_offload_init(void); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0acad490..70f40a25 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1945,7 +1945,7 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG_KMEM) && !defined(CONFIG_INET_MODULE) .proto_cgroup = tcp_proto_cgroup, #endif .clear_sk = tcp_v6_clear_sk, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 2ec6bf6a..8642491c 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -89,7 +89,7 @@ static const struct net_offload tcpv6_offload = { }, }; -int __init tcpv6_offload_init(void) +int tcpv6_offload_init(void) { return inet6_add_offload(&tcpv6_offload, IPPROTO_TCP); } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 2f65b022..3f66f0ac 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -125,7 +125,7 @@ static const struct net_offload udpv6_offload = { }, }; -int __init udp_offload_init(void) +int udp_offload_init(void) { return inet6_add_offload(&udpv6_offload, IPPROTO_UDP); }
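
For reference, the af_inet.c hunks above follow a common pattern for code that must build either built-in or as a module: the initcall registrations (fs_initcall, late_initcall, __initcall, subsys_initcall) are compiled out under CONFIG_INET_MODULE, and the corresponding setup functions are instead called explicitly from the module's init path, with try_module_get(THIS_MODULE) pinning the module because unload is not implemented yet. Below is a minimal, self-contained sketch of that pattern, not the patch's actual code: it reuses the init helpers the patch declares (tcp_congestion_init(), tcp_fastopen_init(), sysctl_ipv4_init()), while the surrounding module boilerplate and error handling are illustrative assumptions.

/* Illustrative sketch only -- not part of the patch. Shows the general
 * shape of folding former initcalls into a module init function, as
 * af_inet.c does when CONFIG_INET=m (CONFIG_INET_MODULE defined).
 */
#include <linux/module.h>
#include <linux/init.h>

int tcp_congestion_init(void);	/* was late_initcall() when built in */
int tcp_fastopen_init(void);	/* was late_initcall() when built in */
int sysctl_ipv4_init(void);	/* was __initcall() when built in */

static int __init inet_sketch_init(void)
{
	int err;

	/* Run what the initcall machinery would have run for a
	 * built-in IP stack, in roughly the same order.
	 */
	err = tcp_congestion_init();
	if (err)
		return err;

	err = tcp_fastopen_init();
	if (err)
		return err;

	return sysctl_ipv4_init();
}

static void __exit inet_sketch_exit(void)
{
	/* Unreachable in practice: the patch takes an extra module
	 * reference at init time ("TODO: Implement unload logic" in
	 * af_inet.c), so the module cannot be removed.
	 */
}

module_init(inet_sketch_init);
module_exit(inet_sketch_exit);
MODULE_LICENSE("GPL");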