#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Bits from fs/pipe.c
 */

void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

/*
 * Bits from fs/splice.c
 */

static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	pipe_lock(pipe);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->files)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < pipe->buffers)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	pipe_unlock(pipe);

	if (do_wakeup)
		wakeup_pipe_readers(pipe);

	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}

/*
 * Bits from kernel/res_counter.c
 */

void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
	spin_lock_init(&counter->lock);
	counter->limit = RESOURCE_MAX;
	counter->soft_limit = RESOURCE_MAX;
	counter->parent = parent;
}

static inline unsigned long long *
res_counter_member(struct res_counter *counter, int member)
{
	switch (member) {
	case RES_USAGE:
		return &counter->usage;
	case RES_MAX_USAGE:
		return &counter->max_usage;
	case RES_LIMIT:
		return &counter->limit;
	case RES_FAILCNT:
		return &counter->failcnt;
	case RES_SOFT_LIMIT:
		return &counter->soft_limit;
	}

	BUG();
	return NULL;
}

#if BITS_PER_LONG == 32
u64 res_counter_read_u64(struct res_counter *counter, int member)
{
	unsigned long flags;
	u64 ret;

	spin_lock_irqsave(&counter->lock, flags);
	ret = *res_counter_member(counter, member);
	spin_unlock_irqrestore(&counter->lock, flags);

	return ret;
}
#else
u64 res_counter_read_u64(struct res_counter *counter, int member)
{
	return *res_counter_member(counter, member);
}
#endif

int res_counter_memparse_write_strategy(const char *buf,
					unsigned long long *res)
{
	char *end;

	/* return RESOURCE_MAX(unlimited) if "-1" is specified */
	if (*buf == '-') {
		*res = simple_strtoull(buf + 1, &end, 10);
		if (*res != 1 || *end != '\0')
			return -EINVAL;
		*res = RESOURCE_MAX;
		return 0;
	}

	*res = memparse(buf, &end);
	if (*end != '\0')
		return -EINVAL;

	*res = PAGE_ALIGN(*res);
	return 0;
}
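/*
 * Illustrative sketch, not part of the original sources: the two input
 * forms accepted by res_counter_memparse_write_strategy().  "-1" selects
 * RESOURCE_MAX (unlimited); anything else goes through memparse(), so
 * "k"/"m"/"g" suffixes scale the value, which is then rounded up to a
 * page boundary.  The helper name is hypothetical.
 */
static int __maybe_unused res_counter_limit_example(void)
{
	unsigned long long limit;
	int err;

	err = res_counter_memparse_write_strategy("64m", &limit);
	if (err)
		return err;
	/* limit == 64 << 20, already page aligned */

	err = res_counter_memparse_write_strategy("-1", &limit);
	if (err)
		return err;
	/* limit == RESOURCE_MAX */

	return 0;
}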
/*
 * Bits from kernel/sysctl.c
 */

static void proc_skip_char(char **buf, size_t *size, const char v)
{
	while (*size) {
		if (**buf != v)
			break;
		(*size)--;
		(*buf)++;
	}
}

#define TMPBUFLEN 22

static int proc_get_long(char **buf, size_t *size,
			 unsigned long *val, bool *neg,
			 const char *perm_tr, unsigned perm_tr_len, char *tr)
{
	int len;
	char *p, tmp[TMPBUFLEN];

	if (!*size)
		return -EINVAL;

	len = *size;
	if (len > TMPBUFLEN - 1)
		len = TMPBUFLEN - 1;

	memcpy(tmp, *buf, len);

	tmp[len] = 0;
	p = tmp;
	if (*p == '-' && *size > 1) {
		*neg = true;
		p++;
	} else
		*neg = false;
	if (!isdigit(*p))
		return -EINVAL;

	*val = simple_strtoul(p, &p, 0);

	len = p - tmp;

	/* We don't know if the next char is whitespace thus we may accept
	 * invalid integers (e.g. 1234...a) or two integers instead of one
	 * (e.g. 123...1). So lets not allow such large numbers. */
	if (len == TMPBUFLEN - 1)
		return -EINVAL;

	if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
		return -EINVAL;

	if (tr && (len < *size))
		*tr = *p;

	*buf += len;
	*size -= len;

	return 0;
}

static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
			 bool neg)
{
	int len;
	char tmp[TMPBUFLEN], *p = tmp;

	sprintf(p, "%s%lu", neg ? "-" : "", val);
	len = strlen(tmp);
	if (len > *size)
		len = *size;
	if (copy_to_user(*buf, tmp, len))
		return -EFAULT;
	*size -= len;
	*buf += len;
	return 0;
}
#undef TMPBUFLEN

static int proc_put_char(void __user **buf, size_t *size, char c)
{
	if (*size) {
		char __user **buffer = (char __user **)buf;
		if (put_user(c, *buffer))
			return -EFAULT;
		(*size)--, (*buffer)++;
		*buf = *buffer;
	}
	return 0;
}

int proc_do_large_bitmap(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err = 0;
	bool first = 1;
	size_t left = *lenp;
	unsigned long bitmap_len = table->maxlen;
	unsigned long *bitmap = (unsigned long *) table->data;
	unsigned long *tmp_bitmap = NULL;
	char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;

	if (!bitmap_len || !left || (*ppos && !write)) {
		*lenp = 0;
		return 0;
	}

	if (write) {
		unsigned long page = 0;
		char *kbuf;

		if (left > PAGE_SIZE - 1)
			left = PAGE_SIZE - 1;

		page = __get_free_page(GFP_TEMPORARY);
		kbuf = (char *) page;
		if (!kbuf)
			return -ENOMEM;
		if (copy_from_user(kbuf, buffer, left)) {
			free_page(page);
			return -EFAULT;
		}
		kbuf[left] = 0;

		tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
				     GFP_KERNEL);
		if (!tmp_bitmap) {
			free_page(page);
			return -ENOMEM;
		}
		proc_skip_char(&kbuf, &left, '\n');
		while (!err && left) {
			unsigned long val_a, val_b;
			bool neg;

			err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
					    sizeof(tr_a), &c);
			if (err)
				break;
			if (val_a >= bitmap_len || neg) {
				err = -EINVAL;
				break;
			}

			val_b = val_a;
			if (left) {
				kbuf++;
				left--;
			}

			if (c == '-') {
				err = proc_get_long(&kbuf, &left, &val_b,
						    &neg, tr_b, sizeof(tr_b),
						    &c);
				if (err)
					break;
				if (val_b >= bitmap_len || neg ||
				    val_a > val_b) {
					err = -EINVAL;
					break;
				}
				if (left) {
					kbuf++;
					left--;
				}
			}

			bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
			first = 0;
			proc_skip_char(&kbuf, &left, '\n');
		}
		free_page(page);
	} else {
		unsigned long bit_a, bit_b = 0;

		while (left) {
			bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
			if (bit_a >= bitmap_len)
				break;
			bit_b = find_next_zero_bit(bitmap, bitmap_len,
						   bit_a + 1) - 1;

			if (!first) {
				err = proc_put_char(&buffer, &left, ',');
				if (err)
					break;
			}
			err = proc_put_long(&buffer, &left, bit_a, false);
			if (err)
				break;
			if (bit_a != bit_b) {
				err = proc_put_char(&buffer, &left, '-');
				if (err)
					break;
				err = proc_put_long(&buffer, &left, bit_b, false);
				if (err)
					break;
			}

			first = 0;
			bit_b++;
		}
		if (!err)
			err = proc_put_char(&buffer, &left, '\n');
	}

	if (!err) {
		if (write) {
			if (*ppos)
				bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
			else
				bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
		}
		kfree(tmp_bitmap);
		*lenp -= left;
		*ppos += *lenp;
		return 0;
	} else {
		kfree(tmp_bitmap);
		return err;
	}
}
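/*
 * Illustrative sketch, not part of the original sources: the text format
 * handled by proc_do_large_bitmap().  Writing "0,3-5,31\n" to such a
 * sysctl (at *ppos == 0) has the same effect on the bitmap as the
 * bitmap_set() calls below, and reading the file back produces the same
 * range-compressed string.  The helper name is hypothetical.
 */
static void __maybe_unused large_bitmap_format_example(unsigned long *bitmap)
{
	/* caller provides a bitmap of at least 32 bits */
	bitmap_zero(bitmap, 32);
	bitmap_set(bitmap, 0, 1);	/* "0"   -> bit 0     */
	bitmap_set(bitmap, 3, 3);	/* "3-5" -> bits 3..5 */
	bitmap_set(bitmap, 31, 1);	/* "31"  -> bit 31    */
}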
/*
 * Bits from mm/memcontrol.c
 */

enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};

struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	union {
		/*
		 * the counter to account for mem+swap usage.
		 */
		struct res_counter memsw;

		/*
		 * rcu_freeing is used only when freeing struct mem_cgroup,
		 * so put it into a union to avoid wasting more memory.
		 * It must be disjoint from the css field.  It could be
		 * in a union with the res field, but res plays a much
		 * larger part in mem_cgroup life than memsw, and might
		 * be of interest, even at time of free, when debugging.
		 * So share rcu_head with the less interesting memsw.
		 */
		struct rcu_head rcu_freeing;
		/*
		 * We also need some space for a worker in deferred freeing.
		 * By the time we call it, rcu_freeing is no longer in use.
		 */
		struct work_struct work_freeing;
	};

	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;
	atomic_t	oom_wakeups;

	atomic_t	refcnt;

	int		swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct tcp_memcontrol tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list. per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
	/* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 *
	 * WARNING: This has to be the last element of the struct. Don't
	 * add new fields after this point.
	 */
	struct mem_cgroup_lru_info info;
};

static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return container_of(s, struct mem_cgroup, css);
}

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return mem_cgroup_from_css(
		cgroup_subsys_state(cont, mem_cgroup_subsys_id));
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
}
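/*
 * Illustrative sketch, not part of the original file: the usual caller
 * pattern for mem_cgroup_from_task().  mm->owner is an RCU-protected
 * pointer that may be cleared concurrently (see the comment above), so
 * the lookup runs under rcu_read_lock() and retries until a css
 * reference is pinned.  The helper name is hypothetical and assumes a
 * memcg-enabled configuration where mm->owner exists.
 */
static __maybe_unused struct mem_cgroup *get_memcg_from_mm_example(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			break;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();

	return memcg;
}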
/*
 * Bits from mm/page_alloc.c
 *
 * Slightly simplified to avoid depending on nr_kernel_pages, nr_all_pages,
 * and alloc_bootmem_nopanic().
 */

void *alloc_large_system_hash(const char *tablename,
			      unsigned long bucketsize,
			      unsigned long numentries,
			      int scale,
			      int flags,
			      unsigned int *_hash_shift,
			      unsigned int *_hash_mask,
			      unsigned long low_limit,
			      unsigned long high_limit)
{
	unsigned long long max = high_limit;
	unsigned long log2qty, size;
	void *table = NULL;

	/* allow the kernel cmdline to have a say */
	if (!numentries)
		numentries = PAGE_SIZE / bucketsize;
	numentries = roundup_pow_of_two(numentries);

	/* limit allocation size */
	if (max == 0) {
		max = 64 * 1024;
		do_div(max, bucketsize);
	}
	max = min(max, 0x80000000ULL);

	if (numentries < low_limit)
		numentries = low_limit;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2(numentries);

	do {
		size = bucketsize << log2qty;
		/*
		 * If bucketsize is not a power-of-two, we may free
		 * some pages at the end of hash table which
		 * alloc_pages_exact() automatically does
		 */
		if (get_order(size) < MAX_ORDER) {
			table = alloc_pages_exact(size, GFP_ATOMIC);
			kmemleak_alloc(table, size, 1, GFP_ATOMIC);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);

	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);

	printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
	       tablename,
	       (1UL << log2qty),
	       ilog2(size) - PAGE_SHIFT,
	       size);

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}
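/*
 * Illustrative sketch, not part of the original sources: the sizing that
 * alloc_large_system_hash() performs for a request of roughly 3000
 * entries of 16-byte buckets with no explicit low/high limits.  The
 * helper name is hypothetical.
 */
static unsigned long __maybe_unused hash_sizing_example(void)
{
	unsigned long numentries = roundup_pow_of_two(3000);	/* 4096 */
	unsigned long log2qty = ilog2(numentries);		/* 12   */

	/* first allocation attempt: 16 << 12 == 64 KiB via alloc_pages_exact() */
	return 16UL << log2qty;
}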
/*
 * Bits from net/core/dst.c
 */

const u32 dst_default_metrics[RTAX_MAX + 1] = {
	/* This initializer is needed to force linker to place this variable
	 * into const section. Otherwise it might end into bss section.
	 * We really want to avoid false sharing on this variable, and catch
	 * any writes on it.
	 */
	[RTAX_MAX] = 0xdeadbeef,
};

/*
 * Bits from net/core/neighbour.c
 */

#define PNEIGH_HASHMASK		0xF

static u32 pneigh_hash(const void *pkey, int key_len)
{
	u32 hash_val = *(u32 *)(pkey + key_len - 4);
	hash_val ^= (hash_val >> 16);
	hash_val ^= hash_val >> 8;
	hash_val ^= hash_val >> 4;
	hash_val &= PNEIGH_HASHMASK;
	return hash_val;
}

int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
		  struct net_device *dev)
{
	struct pneigh_entry *n, **np;
	int key_len = tbl->key_len;
	u32 hash_val = pneigh_hash(pkey, key_len);

	write_lock_bh(&tbl->lock);
	for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
	     np = &n->next) {
		if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
		    net_eq(pneigh_net(n), net)) {
			*np = n->next;
			write_unlock_bh(&tbl->lock);
			if (tbl->pdestructor)
				tbl->pdestructor(n);
			if (n->dev)
				dev_put(n->dev);
			release_net(pneigh_net(n));
			kfree(n);
			return 0;
		}
	}
	write_unlock_bh(&tbl->lock);
	return -ENOENT;
}

/*
 * Bits from net/core/request_sock.c
 */

int reqsk_queue_alloc(struct request_sock_queue *queue,
		      unsigned int nr_table_entries)
{
	size_t lopt_size = sizeof(struct listen_sock);
	struct listen_sock *lopt;

	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
	nr_table_entries = max_t(u32, nr_table_entries, 8);
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
	lopt_size += nr_table_entries * sizeof(struct request_sock *);
	if (lopt_size > PAGE_SIZE)
		lopt = vzalloc(lopt_size);
	else
		lopt = kzalloc(lopt_size, GFP_KERNEL);
	if (lopt == NULL)
		return -ENOMEM;

	for (lopt->max_qlen_log = 3;
	     (1 << lopt->max_qlen_log) < nr_table_entries;
	     lopt->max_qlen_log++);

	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
	rwlock_init(&queue->syn_wait_lock);
	queue->rskq_accept_head = NULL;
	lopt->nr_table_entries = nr_table_entries;

	write_lock_bh(&queue->syn_wait_lock);
	queue->listen_opt = lopt;
	write_unlock_bh(&queue->syn_wait_lock);

	return 0;
}
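/*
 * Illustrative arithmetic, not part of the original sources: for a
 * request of 128 entries that survives the min/max clamping above,
 * roundup_pow_of_two(128 + 1) == 256, so the SYN table holds 256
 * request_sock pointers and the max_qlen_log loop stops at 8
 * (1 << 8 == 256).
 */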
void __reqsk_queue_destroy(struct request_sock_queue *queue)
{
	struct listen_sock *lopt;
	size_t lopt_size;

	/*
	 * this is an error recovery path only
	 * no locking needed and the lopt is not NULL
	 */
	lopt = queue->listen_opt;
	lopt_size = sizeof(struct listen_sock) +
		lopt->nr_table_entries * sizeof(struct request_sock *);

	if (lopt_size > PAGE_SIZE)
		vfree(lopt);
	else
		kfree(lopt);
}

static inline struct listen_sock *reqsk_queue_yank_listen_sk(
	struct request_sock_queue *queue)
{
	struct listen_sock *lopt;

	write_lock_bh(&queue->syn_wait_lock);
	lopt = queue->listen_opt;
	queue->listen_opt = NULL;
	write_unlock_bh(&queue->syn_wait_lock);

	return lopt;
}

void reqsk_queue_destroy(struct request_sock_queue *queue)
{
	/* make all the listen_opt local to us */
	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
	size_t lopt_size = sizeof(struct listen_sock) +
		lopt->nr_table_entries * sizeof(struct request_sock *);

	if (lopt->qlen != 0) {
		unsigned int i;

		for (i = 0; i < lopt->nr_table_entries; i++) {
			struct request_sock *req;

			while ((req = lopt->syn_table[i]) != NULL) {
				lopt->syn_table[i] = req->dl_next;
				lopt->qlen--;
				reqsk_free(req);
			}
		}
	}

	WARN_ON(lopt->qlen != 0);
	if (lopt_size > PAGE_SIZE)
		vfree(lopt);
	else
		kfree(lopt);
}

void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
			   bool reset)
{
	struct sock *lsk = tcp_rsk(req)->listener;
	struct fastopen_queue *fastopenq =
	    inet_csk(lsk)->icsk_accept_queue.fastopenq;

	tcp_sk(sk)->fastopen_rsk = NULL;
	spin_lock_bh(&fastopenq->lock);
	fastopenq->qlen--;
	tcp_rsk(req)->listener = NULL;
	if (req->sk)	/* the child socket hasn't been accepted yet */
		goto out;

	if (!reset || lsk->sk_state != TCP_LISTEN) {
		/* If the listener has been closed don't bother with the
		 * special RST handling below.
		 */
		spin_unlock_bh(&fastopenq->lock);
		sock_put(lsk);
		reqsk_free(req);
		return;
	}
	/* Wait for 60secs before removing a req that has triggered RST.
	 * This is a simple defense against TFO spoofing attack - by
	 * counting the req against fastopen.max_qlen, and disabling
	 * TFO when the qlen exceeds max_qlen.
	 *
	 * For more details see CoNext'11 "TCP Fast Open" paper.
	 */
	req->expires = jiffies + 60*HZ;
	if (fastopenq->rskq_rst_head == NULL)
		fastopenq->rskq_rst_head = req;
	else
		fastopenq->rskq_rst_tail->dl_next = req;

	req->dl_next = NULL;
	fastopenq->rskq_rst_tail = req;
	fastopenq->qlen++;
out:
	spin_unlock_bh(&fastopenq->lock);
	sock_put(lsk);
	return;
}

/*
 * Bits from net/core/secure_seq.c
 */

#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)

static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;

static void net_secret_init(void)
{
	u32 tmp;
	int i;

	if (likely(net_secret[0]))
		return;

	for (i = NET_SECRET_SIZE; i > 0;) {
		do {
			get_random_bytes(&tmp, sizeof(tmp));
		} while (!tmp);
		cmpxchg(&net_secret[--i], 0, tmp);
	}
}

static u32 seq_scale(u32 seq)
{
	/*
	 *	As close as possible to RFC 793, which
	 *	suggests using a 250 kHz clock.
	 *	Further reading shows this assumes 2 Mb/s networks.
	 *	For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
	 *	For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
	 *	we also need to limit the resolution so that the u32 seq
	 *	overlaps less than one time per MSL (2 minutes).
	 *	Choosing a clock of 64 ns period is OK. (period of 274 s)
	 */
	return seq + (ktime_to_ns(ktime_get_real()) >> 6);
}
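/*
 * Illustrative arithmetic, not part of the original sources: the ">> 6"
 * above turns the nanosecond real-time clock into a 64 ns tick, so the
 * 32-bit sequence space wraps only after 2^32 * 64 ns ~= 274.9 seconds,
 * comfortably longer than the 2-minute MSL mentioned in the comment.
 */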
__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
				 __be16 sport, __be16 dport)
{
	u32 hash[MD5_DIGEST_WORDS];

	net_secret_init();
	hash[0] = (__force u32)saddr;
	hash[1] = (__force u32)daddr;
	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
	hash[3] = net_secret[15];

	md5_transform(hash, net_secret);

	return seq_scale(hash[0]);
}

u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
	u32 hash[MD5_DIGEST_WORDS];

	net_secret_init();
	hash[0] = (__force u32)saddr;
	hash[1] = (__force u32)daddr;
	hash[2] = (__force u32)dport ^ net_secret[14];
	hash[3] = net_secret[15];

	md5_transform(hash, net_secret);

	return hash[0];
}

#if IS_ENABLED(CONFIG_IPV6)
__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
				   __be16 sport, __be16 dport)
{
	u32 secret[MD5_MESSAGE_BYTES / 4];
	u32 hash[MD5_DIGEST_WORDS];
	u32 i;

	net_secret_init();
	memcpy(hash, saddr, 16);
	for (i = 0; i < 4; i++)
		secret[i] = net_secret[i] + (__force u32)daddr[i];
	secret[4] = net_secret[4] +
		(((__force u16)sport << 16) + (__force u16)dport);
	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
		secret[i] = net_secret[i];

	md5_transform(hash, secret);

	return seq_scale(hash[0]);
}
EXPORT_SYMBOL(secure_tcpv6_sequence_number);

u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
			       __be16 dport)
{
	u32 secret[MD5_MESSAGE_BYTES / 4];
	u32 hash[MD5_DIGEST_WORDS];
	u32 i;

	net_secret_init();
	memcpy(hash, saddr, 16);
	for (i = 0; i < 4; i++)
		secret[i] = net_secret[i] + (__force u32) daddr[i];
	secret[4] = net_secret[4] + (__force u32)dport;
	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
		secret[i] = net_secret[i];

	md5_transform(hash, secret);

	return hash[0];
}
EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif

/*
 * Bits from net/core/skbuff.c
 */

static struct page *linear_to_page(struct page *page, unsigned int *len,
				   unsigned int *offset,
				   struct sock *sk)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return NULL;

	*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);

	memcpy(page_address(pfrag->page) + pfrag->offset,
	       page_address(page) + *offset, *len);
	*offset = pfrag->offset;
	pfrag->offset += *len;

	return pfrag->page;
}

static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
			     struct page *page,
			     unsigned int offset)
{
	return	spd->nr_pages &&
		spd->pages[spd->nr_pages - 1] == page &&
		(spd->partial[spd->nr_pages - 1].offset +
		 spd->partial[spd->nr_pages - 1].len == offset);
}

static bool spd_fill_page(struct splice_pipe_desc *spd,
			  struct pipe_inode_info *pipe, struct page *page,
			  unsigned int *len, unsigned int offset,
			  bool linear,
			  struct sock *sk)
{
	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
		return true;

	if (linear) {
		page = linear_to_page(page, len, &offset, sk);
		if (!page)
			return true;
	}
	if (spd_can_coalesce(spd, page, offset)) {
		spd->partial[spd->nr_pages - 1].len += *len;
		return false;
	}
	get_page(page);
	spd->pages[spd->nr_pages] = page;
	spd->partial[spd->nr_pages].len = *len;
	spd->partial[spd->nr_pages].offset = offset;
	spd->nr_pages++;

	return false;
}

static bool __splice_segment(struct page *page, unsigned int poff,
			     unsigned int plen, unsigned int *off,
			     unsigned int *len,
			     struct splice_pipe_desc *spd, bool linear,
			     struct sock *sk,
			     struct pipe_inode_info *pipe)
{
	if (!*len)
		return true;

	/* skip this segment if already processed */
	if (*off >= plen) {
		*off -= plen;
		return false;
	}

	/* ignore any bits we already processed */
	poff += *off;
	plen -= *off;
	*off = 0;

	do {
		unsigned int flen = min(*len, plen);

		if (spd_fill_page(spd, pipe, page, &flen, poff,
				  linear, sk))
			return true;
		poff += flen;
		plen -= flen;
		*len -= flen;
	} while (*len && plen);

	return false;
}

static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
			      unsigned int *offset, unsigned int *len,
			      struct splice_pipe_desc *spd, struct sock *sk)
{
	int seg;

	/* map the linear part :
	 * If skb->head_frag is set, this 'linear' part is backed by a
	 * fragment, and if the head is not shared with any clones then
	 * we can avoid a copy since we own the head portion of this page.
	 */
	if (__splice_segment(virt_to_page(skb->data),
			     (unsigned long) skb->data & (PAGE_SIZE - 1),
			     skb_headlen(skb),
			     offset, len, spd,
			     skb_head_is_locked(skb),
			     sk, pipe))
		return true;

	/*
	 * then map the fragments
	 */
	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];

		if (__splice_segment(skb_frag_page(f),
				     f->page_offset, skb_frag_size(f),
				     offset, len, spd, false, sk, pipe))
			return true;
	}

	return false;
}

static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
	put_page(spd->pages[i]);
}

int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
		    struct pipe_inode_info *pipe, unsigned int tlen,
		    unsigned int flags)
{
	struct partial_page partial[MAX_SKB_FRAGS];
	struct page *pages[MAX_SKB_FRAGS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = MAX_SKB_FRAGS,
		.flags = flags,
		.ops = &nosteal_pipe_buf_ops,
		.spd_release = sock_spd_release,
	};
	struct sk_buff *frag_iter;
	struct sock *sk = skb->sk;
	int ret = 0;

	/*
	 * __skb_splice_bits() only fails if the output has no room left,
	 * so no point in going over the frag_list for the error case.
	 */
	if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
		goto done;
	else if (!tlen)
		goto done;

	/*
	 * now see if we have a frag_list to map
	 */
	skb_walk_frags(skb, frag_iter) {
		if (!tlen)
			break;
		if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
			break;
	}

done:
	if (spd.nr_pages) {
		/*
		 * Drop the socket lock, otherwise we have reverse
		 * locking dependencies between sk_lock and i_mutex
		 * here as compared to sendfile(). We enter here
		 * with the socket lock held, and splice_to_pipe() will
		 * grab the pipe inode lock. For sendfile() emulation,
		 * we call into ->sendpage() with the i_mutex lock held
		 * and networking will grab the socket lock.
		 */
		release_sock(sk);
		ret = splice_to_pipe(pipe, &spd);
		lock_sock(sk);
	}

	return ret;
}
static int skb_prepare_for_shift(struct sk_buff *skb)
{
	return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}

int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
	int from, to, merge, todo;
	struct skb_frag_struct *fragfrom, *fragto;

	BUG_ON(shiftlen > skb->len);
	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */

	todo = shiftlen;
	from = 0;
	to = skb_shinfo(tgt)->nr_frags;
	fragfrom = &skb_shinfo(skb)->frags[from];

	/* Actual merge is delayed until the point when we know we can
	 * commit all, so that we don't have to undo partial changes
	 */
	if (!to ||
	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
			      fragfrom->page_offset)) {
		merge = -1;
	} else {
		merge = to - 1;

		todo -= skb_frag_size(fragfrom);
		if (todo < 0) {
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
				return 0;

			/* All previous frag pointers might be stale! */
			fragfrom = &skb_shinfo(skb)->frags[from];
			fragto = &skb_shinfo(tgt)->frags[merge];

			skb_frag_size_add(fragto, shiftlen);
			skb_frag_size_sub(fragfrom, shiftlen);
			fragfrom->page_offset += shiftlen;

			goto onlymerged;
		}

		from++;
	}

	/* Skip full, not-fitting skb to avoid expensive operations */
	if ((shiftlen == skb->len) &&
	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
		return 0;

	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
		return 0;

	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
		if (to == MAX_SKB_FRAGS)
			return 0;

		fragfrom = &skb_shinfo(skb)->frags[from];
		fragto = &skb_shinfo(tgt)->frags[to];

		if (todo >= skb_frag_size(fragfrom)) {
			*fragto = *fragfrom;
			todo -= skb_frag_size(fragfrom);
			from++;
			to++;

		} else {
			__skb_frag_ref(fragfrom);
			fragto->page = fragfrom->page;
			fragto->page_offset = fragfrom->page_offset;
			skb_frag_size_set(fragto, todo);

			fragfrom->page_offset += todo;
			skb_frag_size_sub(fragfrom, todo);
			todo = 0;

			to++;
			break;
		}
	}

	/* Ready to "commit" this state change to tgt */
	skb_shinfo(tgt)->nr_frags = to;

	if (merge >= 0) {
		fragfrom = &skb_shinfo(skb)->frags[0];
		fragto = &skb_shinfo(tgt)->frags[merge];

		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
		__skb_frag_unref(fragfrom);
	}

	/* Reposition in the original skb */
	to = 0;
	while (from < skb_shinfo(skb)->nr_frags)
		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
	skb_shinfo(skb)->nr_frags = to;

	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
	/* Most likely the tgt won't ever need its checksum anymore, skb on
	 * the other hand might need it if it needs to be resent
	 */
	tgt->ip_summed = CHECKSUM_PARTIAL;
	skb->ip_summed = CHECKSUM_PARTIAL;

	/* Yak, is it really working this way? Some helper please? */
	skb->len -= shiftlen;
	skb->data_len -= shiftlen;
	skb->truesize -= shiftlen;
	tgt->len += shiftlen;
	tgt->data_len += shiftlen;
	tgt->truesize += shiftlen;

	return shiftlen;
}

/*
 * Bits from net/core/sock.c
 */

#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
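/*
 * Illustrative arithmetic, not part of the original sources: with the
 * definitions above, the default send buffer is sized to hold 256
 * packets of 256 payload bytes each, including per-skb overhead.  On a
 * typical x86_64 build SKB_TRUESIZE(256) works out to 832 bytes, giving
 * the familiar net.core.wmem_default of 212992 (208 KiB); the exact
 * value depends on the architecture and struct layout.
 */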