#include <linux/bootmem.h>
#include <linux/cryptohash.h>
#include <linux/ctype.h>
#include <linux/res_counter.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/sysctl.h>
#include <linux/vmpressure.h>
#include <net/dst.h>
#include <net/neighbour.h>
#include <net/request_sock.h>
#include <net/secure_seq.h>
#include <net/sock.h>
#include <net/tcp.h>


/*
 * Bits from fs/pipe.c
 */

void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

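/*
 * Note on pipe_wait(): the pipe lock is dropped around schedule() so the
 * peer can make progress and issue the wakeup; by the time it returns the
 * pipe state may have changed, which is why callers such as
 * splice_to_pipe() below re-check readers/buffers in a loop after waiting.
 */
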
/*
 * Bits from fs/splice.c
 */

static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	pipe_lock(pipe);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->files)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < pipe->buffers)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	pipe_unlock(pipe);

	if (do_wakeup)
		wakeup_pipe_readers(pipe);

	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}

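/*
 * Usage sketch for splice_to_pipe(): the caller owns the page/partial
 * arrays referenced by the splice_pipe_desc and sets nr_pages to the
 * number of filled slots; anything the pipe did not take is released
 * through ->spd_release().  skb_splice_bits() later in this file does
 * exactly that; the outline (illustrative only, not compiled) is roughly:
 */
#if 0
	struct page *pages[MAX_SKB_FRAGS];
	struct partial_page partial[MAX_SKB_FRAGS];
	struct splice_pipe_desc spd = {
		.pages		= pages,
		.partial	= partial,
		.nr_pages_max	= MAX_SKB_FRAGS,
		.ops		= &nosteal_pipe_buf_ops,
		.spd_release	= sock_spd_release,
	};
	ssize_t ret;

	/* fill pages[i], partial[i].offset, partial[i].len; bump spd.nr_pages */
	ret = splice_to_pipe(pipe, &spd);
#endif
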
/*
 * Bits from kernel/res_counter.c
 */

void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
	spin_lock_init(&counter->lock);
	counter->limit = RESOURCE_MAX;
	counter->soft_limit = RESOURCE_MAX;
	counter->parent = parent;
}

static inline unsigned long long *
res_counter_member(struct res_counter *counter, int member)
{
	switch (member) {
	case RES_USAGE:
		return &counter->usage;
	case RES_MAX_USAGE:
		return &counter->max_usage;
	case RES_LIMIT:
		return &counter->limit;
	case RES_FAILCNT:
		return &counter->failcnt;
	case RES_SOFT_LIMIT:
		return &counter->soft_limit;
	};

	BUG();
	return NULL;
}

#if BITS_PER_LONG == 32
u64 res_counter_read_u64(struct res_counter *counter, int member)
{
	unsigned long flags;
	u64 ret;

	spin_lock_irqsave(&counter->lock, flags);
	ret = *res_counter_member(counter, member);
	spin_unlock_irqrestore(&counter->lock, flags);

	return ret;
}
#else
u64 res_counter_read_u64(struct res_counter *counter, int member)
{
	return *res_counter_member(counter, member);
}
#endif

int res_counter_memparse_write_strategy(const char *buf,
					unsigned long long *res)
{
	char *end;

	/* return RESOURCE_MAX(unlimited) if "-1" is specified */
	if (*buf == '-') {
		*res = simple_strtoull(buf + 1, &end, 10);
		if (*res != 1 || *end != '\0')
			return -EINVAL;
		*res = RESOURCE_MAX;
		return 0;
	}

	*res = memparse(buf, &end);
	if (*end != '\0')
		return -EINVAL;

	*res = PAGE_ALIGN(*res);
	return 0;
}

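/*
 * Example of the syntax accepted by res_counter_memparse_write_strategy():
 * "-1" maps to RESOURCE_MAX (unlimited); anything else goes through
 * memparse() and is rounded up to a page boundary, so with 4 KiB pages
 * (illustrative numbers) "100" becomes 4096 and "512K" becomes 524288.
 */
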
/*
 * Bits from kernel/sysctl.c
 */

static void proc_skip_char(char **buf, size_t *size, const char v)
{
	while (*size) {
		if (**buf != v)
			break;
		(*size)--;
		(*buf)++;
	}
}

#define TMPBUFLEN 22
static int proc_get_long(char **buf, size_t *size,
			 unsigned long *val, bool *neg,
			 const char *perm_tr, unsigned perm_tr_len, char *tr)
{
	int len;
	char *p, tmp[TMPBUFLEN];

	if (!*size)
		return -EINVAL;

	len = *size;
	if (len > TMPBUFLEN - 1)
		len = TMPBUFLEN - 1;

	memcpy(tmp, *buf, len);

	tmp[len] = 0;
	p = tmp;
	if (*p == '-' && *size > 1) {
		*neg = true;
		p++;
	} else
		*neg = false;
	if (!isdigit(*p))
		return -EINVAL;

	*val = simple_strtoul(p, &p, 0);

	len = p - tmp;

	/* We don't know if the next char is whitespace thus we may accept
	 * invalid integers (e.g. 1234...a) or two integers instead of one
	 * (e.g. 123...1). So lets not allow such large numbers. */
	if (len == TMPBUFLEN - 1)
		return -EINVAL;

	if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
		return -EINVAL;

	if (tr && (len < *size))
		*tr = *p;

	*buf += len;
	*size -= len;

	return 0;
}

static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
			 bool neg)
{
	int len;
	char tmp[TMPBUFLEN], *p = tmp;

	sprintf(p, "%s%lu", neg ? "-" : "", val);
	len = strlen(tmp);
	if (len > *size)
		len = *size;
	if (copy_to_user(*buf, tmp, len))
		return -EFAULT;
	*size -= len;
	*buf += len;
	return 0;
}
#undef TMPBUFLEN

static int proc_put_char(void __user **buf, size_t *size, char c)
{
	if (*size) {
		char __user **buffer = (char __user **)buf;
		if (put_user(c, *buffer))
			return -EFAULT;
		(*size)--, (*buffer)++;
		*buf = *buffer;
	}
	return 0;
}

int proc_do_large_bitmap(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err = 0;
	bool first = 1;
	size_t left = *lenp;
	unsigned long bitmap_len = table->maxlen;
	unsigned long *bitmap = (unsigned long *) table->data;
	unsigned long *tmp_bitmap = NULL;
	char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;

	if (!bitmap_len || !left || (*ppos && !write)) {
		*lenp = 0;
		return 0;
	}

	if (write) {
		unsigned long page = 0;
		char *kbuf;

		if (left > PAGE_SIZE - 1)
			left = PAGE_SIZE - 1;

		page = __get_free_page(GFP_TEMPORARY);
		kbuf = (char *) page;
		if (!kbuf)
			return -ENOMEM;
		if (copy_from_user(kbuf, buffer, left)) {
			free_page(page);
			return -EFAULT;
		}
		kbuf[left] = 0;

		tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
				     GFP_KERNEL);
		if (!tmp_bitmap) {
			free_page(page);
			return -ENOMEM;
		}
		proc_skip_char(&kbuf, &left, '\n');
		while (!err && left) {
			unsigned long val_a, val_b;
			bool neg;

			err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
					    sizeof(tr_a), &c);
			if (err)
				break;
			if (val_a >= bitmap_len || neg) {
				err = -EINVAL;
				break;
			}

			val_b = val_a;
			if (left) {
				kbuf++;
				left--;
			}

			if (c == '-') {
				err = proc_get_long(&kbuf, &left, &val_b,
						    &neg, tr_b, sizeof(tr_b),
						    &c);
				if (err)
					break;
				if (val_b >= bitmap_len || neg ||
				    val_a > val_b) {
					err = -EINVAL;
					break;
				}
				if (left) {
					kbuf++;
					left--;
				}
			}

			bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
			first = 0;
			proc_skip_char(&kbuf, &left, '\n');
		}
		free_page(page);
	} else {
		unsigned long bit_a, bit_b = 0;

		while (left) {
			bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
			if (bit_a >= bitmap_len)
				break;
			bit_b = find_next_zero_bit(bitmap, bitmap_len,
						   bit_a + 1) - 1;

			if (!first) {
				err = proc_put_char(&buffer, &left, ',');
				if (err)
					break;
			}
			err = proc_put_long(&buffer, &left, bit_a, false);
			if (err)
				break;
			if (bit_a != bit_b) {
				err = proc_put_char(&buffer, &left, '-');
				if (err)
					break;
				err = proc_put_long(&buffer, &left, bit_b, false);
				if (err)
					break;
			}

			first = 0; bit_b++;
		}
		if (!err)
			err = proc_put_char(&buffer, &left, '\n');
	}

	if (!err) {
		if (write) {
			if (*ppos)
				bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
			else
				bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
		}
		kfree(tmp_bitmap);
		*lenp -= left;
		*ppos += *lenp;
		return 0;
	} else {
		kfree(tmp_bitmap);
		return err;
	}
}

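/*
 * The text format handled by proc_do_large_bitmap() is a comma-separated
 * list of bit numbers and inclusive ranges terminated by a newline, for
 * example "0,3-5,128\n".  On write the list is parsed into tmp_bitmap and
 * then copied (or OR-ed, for a continued write) into table->data; on read
 * the set bits are printed back in the same form.
 */
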
/*
 * Bits from mm/memcontrol.c
 */

enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};

struct mem_cgroup_per_zone {
	struct lruvec lruvec;
	unsigned long lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node tree_node;		/* RB tree node */
	unsigned long long usage_in_excess;	/* Set to the value by which */
						/* the soft limit is exceeded */
	bool on_tree;
	struct mem_cgroup *memcg;		/* Back pointer, we cannot */
						/* use container_of */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	union {
		/*
		 * the counter to account for mem+swap usage.
		 */
		struct res_counter memsw;

		/*
		 * rcu_freeing is used only when freeing struct mem_cgroup,
		 * so put it into a union to avoid wasting more memory.
		 * It must be disjoint from the css field. It could be
		 * in a union with the res field, but res plays a much
		 * larger part in mem_cgroup life than memsw, and might
		 * be of interest, even at time of free, when debugging.
		 * So share rcu_head with the less interesting memsw.
		 */
		struct rcu_head rcu_freeing;
		/*
		 * We also need some space for a worker in deferred freeing.
		 * By the time we call it, rcu_freeing is no longer in use.
		 */
		struct work_struct work_freeing;
	};

	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long kmem_account_flags;	/* See KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;
	atomic_t	oom_wakeups;

	atomic_t	refcnt;

	int		swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct tcp_memcontrol tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list. per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
	/* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 *
	 * WARNING: This has to be the last element of the struct. Don't
	 * add new fields after this point.
	 */
	struct mem_cgroup_lru_info info;
};

static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return container_of(s, struct mem_cgroup, css);
}

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return mem_cgroup_from_css(
		cgroup_subsys_state(cont, mem_cgroup_subsys_id));
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
}

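/*
 * All three lookup helpers above reduce to container_of(): given a pointer
 * to the embedded css, mem_cgroup_from_css() recovers the enclosing
 * mem_cgroup, so for any memcg the round trip
 * mem_cgroup_from_css(&memcg->css) == memcg holds by construction.
 */
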
/*
 * Bits from mm/page_alloc.c
 *
 * Slightly simplified to avoid depending on nr_kernel_pages, nr_all_pages,
 * and alloc_bootmem_nopanic().
 */

void *alloc_large_system_hash(const char *tablename,
			      unsigned long bucketsize,
			      unsigned long numentries,
			      int scale,
			      int flags,
			      unsigned int *_hash_shift,
			      unsigned int *_hash_mask,
			      unsigned long low_limit,
			      unsigned long high_limit)
{
	unsigned long long max = high_limit;
	unsigned long log2qty, size;
	void *table = NULL;

	/* allow the kernel cmdline to have a say */
	if (!numentries)
		numentries = PAGE_SIZE / bucketsize;
	numentries = roundup_pow_of_two(numentries);

	/* limit allocation size */
	if (max == 0) {
		max = 64 * 1024;
		do_div(max, bucketsize);
	}
	max = min(max, 0x80000000ULL);

	if (numentries < low_limit)
		numentries = low_limit;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2(numentries);

	do {
		size = bucketsize << log2qty;
		/*
		 * If bucketsize is not a power-of-two, we may free
		 * some pages at the end of hash table which
		 * alloc_pages_exact() automatically does
		 */
		if (get_order(size) < MAX_ORDER) {
			table = alloc_pages_exact(size, GFP_ATOMIC);
			kmemleak_alloc(table, size, 1, GFP_ATOMIC);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);

	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);

	printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
	       tablename,
	       (1UL << log2qty),
	       ilog2(size) - PAGE_SHIFT,
	       size);

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}

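/*
 * Sizing example (illustrative numbers): with 4 KiB pages, 8-byte buckets
 * and numentries == 0, the default is PAGE_SIZE / bucketsize = 512 entries,
 * already a power of two, so log2qty = 9 and the first attempt allocates
 * size = 8 << 9 = 4096 bytes; the matching *_hash_mask would be 511.
 */
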
/*
 * Bits from net/core/dst.c
 */

const u32 dst_default_metrics[RTAX_MAX + 1] = {
	/* This initializer is needed to force linker to place this variable
	 * into const section. Otherwise it might end into bss section.
	 * We really want to avoid false sharing on this variable, and catch
	 * any writes on it.
	 */
	[RTAX_MAX] = 0xdeadbeef,
};

/*
 * Bits from net/core/neighbour.c
 */

#define PNEIGH_HASHMASK		0xF

static u32 pneigh_hash(const void *pkey, int key_len)
{
	u32 hash_val = *(u32 *)(pkey + key_len - 4);
	hash_val ^= (hash_val >> 16);
	hash_val ^= hash_val >> 8;
	hash_val ^= hash_val >> 4;
	hash_val &= PNEIGH_HASHMASK;
	return hash_val;
}

int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
		  struct net_device *dev)
{
	struct pneigh_entry *n, **np;
	int key_len = tbl->key_len;
	u32 hash_val = pneigh_hash(pkey, key_len);

	write_lock_bh(&tbl->lock);
	for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
	     np = &n->next) {
		if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
		    net_eq(pneigh_net(n), net)) {
			*np = n->next;
			write_unlock_bh(&tbl->lock);
			if (tbl->pdestructor)
				tbl->pdestructor(n);
			if (n->dev)
				dev_put(n->dev);
			release_net(pneigh_net(n));
			kfree(n);
			return 0;
		}
	}
	write_unlock_bh(&tbl->lock);
	return -ENOENT;
}

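/*
 * pneigh_hash() folds only the last four key bytes into one of 16 buckets.
 * Illustrative example: if those bytes read as the u32 0x0a000001, the
 * successive xor-folds give 0x0a000a01, 0x0a0a0a0b and 0x0aaaaaab, and the
 * final mask with PNEIGH_HASHMASK selects bucket 0xb (11).
 */
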
/*
 * Bits from net/core/request_sock.c
 */

int reqsk_queue_alloc(struct request_sock_queue *queue,
		      unsigned int nr_table_entries)
{
	size_t lopt_size = sizeof(struct listen_sock);
	struct listen_sock *lopt;

	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
	nr_table_entries = max_t(u32, nr_table_entries, 8);
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
	lopt_size += nr_table_entries * sizeof(struct request_sock *);
	if (lopt_size > PAGE_SIZE)
		lopt = vzalloc(lopt_size);
	else
		lopt = kzalloc(lopt_size, GFP_KERNEL);
	if (lopt == NULL)
		return -ENOMEM;

	for (lopt->max_qlen_log = 3;
	     (1 << lopt->max_qlen_log) < nr_table_entries;
	     lopt->max_qlen_log++);

	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
	rwlock_init(&queue->syn_wait_lock);
	queue->rskq_accept_head = NULL;
	lopt->nr_table_entries = nr_table_entries;

	write_lock_bh(&queue->syn_wait_lock);
	queue->listen_opt = lopt;
	write_unlock_bh(&queue->syn_wait_lock);

	return 0;
}

void __reqsk_queue_destroy(struct request_sock_queue *queue)
{
	struct listen_sock *lopt;
	size_t lopt_size;

	/*
	 * this is an error recovery path only
	 * no locking needed and the lopt is not NULL
	 */

	lopt = queue->listen_opt;
	lopt_size = sizeof(struct listen_sock) +
		lopt->nr_table_entries * sizeof(struct request_sock *);

	if (lopt_size > PAGE_SIZE)
		vfree(lopt);
	else
		kfree(lopt);
}

static inline struct listen_sock *reqsk_queue_yank_listen_sk(
	struct request_sock_queue *queue)
{
	struct listen_sock *lopt;

	write_lock_bh(&queue->syn_wait_lock);
	lopt = queue->listen_opt;
	queue->listen_opt = NULL;
	write_unlock_bh(&queue->syn_wait_lock);

	return lopt;
}

void reqsk_queue_destroy(struct request_sock_queue *queue)
{
	/* make all the listen_opt local to us */
	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
	size_t lopt_size = sizeof(struct listen_sock) +
		lopt->nr_table_entries * sizeof(struct request_sock *);

	if (lopt->qlen != 0) {
		unsigned int i;

		for (i = 0; i < lopt->nr_table_entries; i++) {
			struct request_sock *req;

			while ((req = lopt->syn_table[i]) != NULL) {
				lopt->syn_table[i] = req->dl_next;
				lopt->qlen--;
				reqsk_free(req);
			}
		}
	}

	WARN_ON(lopt->qlen != 0);
	if (lopt_size > PAGE_SIZE)
		vfree(lopt);
	else
		kfree(lopt);
}

void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
			   bool reset)
{
	struct sock *lsk = tcp_rsk(req)->listener;
	struct fastopen_queue *fastopenq =
		inet_csk(lsk)->icsk_accept_queue.fastopenq;

	tcp_sk(sk)->fastopen_rsk = NULL;
	spin_lock_bh(&fastopenq->lock);
	fastopenq->qlen--;
	tcp_rsk(req)->listener = NULL;
	if (req->sk)	/* the child socket hasn't been accepted yet */
		goto out;

	if (!reset || lsk->sk_state != TCP_LISTEN) {
		/* If the listener has been closed don't bother with the
		 * special RST handling below.
		 */
		spin_unlock_bh(&fastopenq->lock);
		sock_put(lsk);
		reqsk_free(req);
		return;
	}
	/* Wait for 60secs before removing a req that has triggered RST.
	 * This is a simple defense against TFO spoofing attack - by
	 * counting the req against fastopen.max_qlen, and disabling
	 * TFO when the qlen exceeds max_qlen.
	 *
	 * For more details see CoNext'11 "TCP Fast Open" paper.
	 */
	req->expires = jiffies + 60*HZ;
	if (fastopenq->rskq_rst_head == NULL)
		fastopenq->rskq_rst_head = req;
	else
		fastopenq->rskq_rst_tail->dl_next = req;

	req->dl_next = NULL;
	fastopenq->rskq_rst_tail = req;
	fastopenq->qlen++;
out:
	spin_unlock_bh(&fastopenq->lock);
	sock_put(lsk);
	return;
}

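/*
 * Sizing example for reqsk_queue_alloc() (illustrative numbers): a request
 * of nr_table_entries == 16 with sysctl_max_syn_backlog >= 16 is first
 * clamped to the [8, max_syn_backlog] range, then rounded up as
 * roundup_pow_of_two(16 + 1) = 32, so the SYN table gets 32 slots and
 * max_qlen_log ends up as 5 (1 << 5 == 32).
 */
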
/*
 * Bits from net/core/secure_seq.c
 */

#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)

static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;

static void net_secret_init(void)
{
	u32 tmp;
	int i;

	if (likely(net_secret[0]))
		return;

	for (i = NET_SECRET_SIZE; i > 0;) {
		do {
			get_random_bytes(&tmp, sizeof(tmp));
		} while (!tmp);
		cmpxchg(&net_secret[--i], 0, tmp);
	}
}

static u32 seq_scale(u32 seq)
{
	/*
	 * As close as possible to RFC 793, which
	 * suggests using a 250 kHz clock.
	 * Further reading shows this assumes 2 Mb/s networks.
	 * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
	 * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
	 * we also need to limit the resolution so that the u32 seq
	 * overlaps less than one time per MSL (2 minutes).
	 * Choosing a clock of 64 ns period is OK. (period of 274 s)
	 */
	return seq + (ktime_to_ns(ktime_get_real()) >> 6);
}

__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
				 __be16 sport, __be16 dport)
{
	u32 hash[MD5_DIGEST_WORDS];

	net_secret_init();
	hash[0] = (__force u32)saddr;
	hash[1] = (__force u32)daddr;
	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
	hash[3] = net_secret[15];

	md5_transform(hash, net_secret);

	return seq_scale(hash[0]);
}

u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
	u32 hash[MD5_DIGEST_WORDS];

	net_secret_init();
	hash[0] = (__force u32)saddr;
	hash[1] = (__force u32)daddr;
	hash[2] = (__force u32)dport ^ net_secret[14];
	hash[3] = net_secret[15];

	md5_transform(hash, net_secret);

	return hash[0];
}

#if IS_ENABLED(CONFIG_IPV6)
__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
				   __be16 sport, __be16 dport)
{
	u32 secret[MD5_MESSAGE_BYTES / 4];
	u32 hash[MD5_DIGEST_WORDS];
	u32 i;

	net_secret_init();
	memcpy(hash, saddr, 16);
	for (i = 0; i < 4; i++)
		secret[i] = net_secret[i] + (__force u32)daddr[i];
	secret[4] = net_secret[4] +
		(((__force u16)sport << 16) + (__force u16)dport);
	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
		secret[i] = net_secret[i];

	md5_transform(hash, secret);

	return seq_scale(hash[0]);
}
EXPORT_SYMBOL(secure_tcpv6_sequence_number);

u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
			       __be16 dport)
{
	u32 secret[MD5_MESSAGE_BYTES / 4];
	u32 hash[MD5_DIGEST_WORDS];
	u32 i;

	net_secret_init();
	memcpy(hash, saddr, 16);
	for (i = 0; i < 4; i++)
		secret[i] = net_secret[i] + (__force u32) daddr[i];
	secret[4] = net_secret[4] + (__force u32)dport;
	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
		secret[i] = net_secret[i];

	md5_transform(hash, secret);

	return hash[0];
}
EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif

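/*
 * Working out the numbers in the seq_scale() comment: shifting the
 * nanosecond clock right by 6 gives one tick every 2^6 = 64 ns, so the
 * 32-bit offset wraps after 2^32 * 64 ns ~= 274.9 s, which is the
 * "period of 274 s" quoted above and comfortably longer than one MSL.
 */
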
/*
 * Bits from net/core/skbuff.c
 */

static struct page *linear_to_page(struct page *page, unsigned int *len,
				   unsigned int *offset,
				   struct sock *sk)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return NULL;

	*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);

	memcpy(page_address(pfrag->page) + pfrag->offset,
	       page_address(page) + *offset, *len);
	*offset = pfrag->offset;
	pfrag->offset += *len;

	return pfrag->page;
}

static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
			     struct page *page,
			     unsigned int offset)
{
	return	spd->nr_pages &&
		spd->pages[spd->nr_pages - 1] == page &&
		(spd->partial[spd->nr_pages - 1].offset +
		 spd->partial[spd->nr_pages - 1].len == offset);
}

static bool spd_fill_page(struct splice_pipe_desc *spd,
			  struct pipe_inode_info *pipe, struct page *page,
			  unsigned int *len, unsigned int offset,
			  bool linear,
			  struct sock *sk)
{
	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
		return true;

	if (linear) {
		page = linear_to_page(page, len, &offset, sk);
		if (!page)
			return true;
	}
	if (spd_can_coalesce(spd, page, offset)) {
		spd->partial[spd->nr_pages - 1].len += *len;
		return false;
	}
	get_page(page);
	spd->pages[spd->nr_pages] = page;
	spd->partial[spd->nr_pages].len = *len;
	spd->partial[spd->nr_pages].offset = offset;
	spd->nr_pages++;

	return false;
}

static bool __splice_segment(struct page *page, unsigned int poff,
			     unsigned int plen, unsigned int *off,
			     unsigned int *len,
			     struct splice_pipe_desc *spd, bool linear,
			     struct sock *sk,
			     struct pipe_inode_info *pipe)
{
	if (!*len)
		return true;

	/* skip this segment if already processed */
	if (*off >= plen) {
		*off -= plen;
		return false;
	}

	/* ignore any bits we already processed */
	poff += *off;
	plen -= *off;
	*off = 0;

	do {
		unsigned int flen = min(*len, plen);

		if (spd_fill_page(spd, pipe, page, &flen, poff,
				  linear, sk))
			return true;
		poff += flen;
		plen -= flen;
		*len -= flen;
	} while (*len && plen);

	return false;
}

static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
			      unsigned int *offset, unsigned int *len,
			      struct splice_pipe_desc *spd, struct sock *sk)
{
	int seg;

	/* map the linear part :
	 * If skb->head_frag is set, this 'linear' part is backed by a
	 * fragment, and if the head is not shared with any clones then
	 * we can avoid a copy since we own the head portion of this page.
	 */
	if (__splice_segment(virt_to_page(skb->data),
			     (unsigned long) skb->data & (PAGE_SIZE - 1),
			     skb_headlen(skb),
			     offset, len, spd,
			     skb_head_is_locked(skb),
			     sk, pipe))
		return true;

	/*
	 * then map the fragments
	 */
	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];

		if (__splice_segment(skb_frag_page(f),
				     f->page_offset, skb_frag_size(f),
				     offset, len, spd, false, sk, pipe))
			return true;
	}

	return false;
}

static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
	put_page(spd->pages[i]);
}

int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
		    struct pipe_inode_info *pipe, unsigned int tlen,
		    unsigned int flags)
{
	struct partial_page partial[MAX_SKB_FRAGS];
	struct page *pages[MAX_SKB_FRAGS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = MAX_SKB_FRAGS,
		.flags = flags,
		.ops = &nosteal_pipe_buf_ops,
		.spd_release = sock_spd_release,
	};
	struct sk_buff *frag_iter;
	struct sock *sk = skb->sk;
	int ret = 0;

	/*
	 * __skb_splice_bits() only fails if the output has no room left,
	 * so no point in going over the frag_list for the error case.
	 */
	if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
		goto done;
	else if (!tlen)
		goto done;

	/*
	 * now see if we have a frag_list to map
	 */
	skb_walk_frags(skb, frag_iter) {
		if (!tlen)
			break;
		if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
			break;
	}

done:
	if (spd.nr_pages) {
		/*
		 * Drop the socket lock, otherwise we have reverse
		 * locking dependencies between sk_lock and i_mutex
		 * here as compared to sendfile(). We enter here
		 * with the socket lock held, and splice_to_pipe() will
		 * grab the pipe inode lock. For sendfile() emulation,
		 * we call into ->sendpage() with the i_mutex lock held
		 * and networking will grab the socket lock.
		 */
		release_sock(sk);
		ret = splice_to_pipe(pipe, &spd);
		lock_sock(sk);
	}

	return ret;
}

static int skb_prepare_for_shift(struct sk_buff *skb)
{
	return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}

int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
	int from, to, merge, todo;
	struct skb_frag_struct *fragfrom, *fragto;

	BUG_ON(shiftlen > skb->len);
	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */

	todo = shiftlen;
	from = 0;
	to = skb_shinfo(tgt)->nr_frags;
	fragfrom = &skb_shinfo(skb)->frags[from];

	/* Actual merge is delayed until the point when we know we can
	 * commit all, so that we don't have to undo partial changes
	 */
	if (!to ||
	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
			      fragfrom->page_offset)) {
		merge = -1;
	} else {
		merge = to - 1;

		todo -= skb_frag_size(fragfrom);
		if (todo < 0) {
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
				return 0;

			/* All previous frag pointers might be stale! */
			fragfrom = &skb_shinfo(skb)->frags[from];
			fragto = &skb_shinfo(tgt)->frags[merge];

			skb_frag_size_add(fragto, shiftlen);
			skb_frag_size_sub(fragfrom, shiftlen);
			fragfrom->page_offset += shiftlen;

			goto onlymerged;
		}

		from++;
	}

	/* Skip full, not-fitting skb to avoid expensive operations */
	if ((shiftlen == skb->len) &&
	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
		return 0;

	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
		return 0;

	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
		if (to == MAX_SKB_FRAGS)
			return 0;

		fragfrom = &skb_shinfo(skb)->frags[from];
		fragto = &skb_shinfo(tgt)->frags[to];

		if (todo >= skb_frag_size(fragfrom)) {
			*fragto = *fragfrom;
			todo -= skb_frag_size(fragfrom);
			from++;
			to++;

		} else {
			__skb_frag_ref(fragfrom);
			fragto->page = fragfrom->page;
			fragto->page_offset = fragfrom->page_offset;
			skb_frag_size_set(fragto, todo);

			fragfrom->page_offset += todo;
			skb_frag_size_sub(fragfrom, todo);
			todo = 0;

			to++;
			break;
		}
	}

	/* Ready to "commit" this state change to tgt */
	skb_shinfo(tgt)->nr_frags = to;

	if (merge >= 0) {
		fragfrom = &skb_shinfo(skb)->frags[0];
		fragto = &skb_shinfo(tgt)->frags[merge];

		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
		__skb_frag_unref(fragfrom);
	}

	/* Reposition in the original skb */
	to = 0;
	while (from < skb_shinfo(skb)->nr_frags)
		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
	skb_shinfo(skb)->nr_frags = to;

	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
	/* Most likely the tgt won't ever need its checksum anymore, skb on
	 * the other hand might need it if it needs to be resent
	 */
	tgt->ip_summed = CHECKSUM_PARTIAL;
	skb->ip_summed = CHECKSUM_PARTIAL;

	/* Yak, is it really working this way? Some helper please? */
	skb->len -= shiftlen;
	skb->data_len -= shiftlen;
	skb->truesize -= shiftlen;
	tgt->len += shiftlen;
	tgt->data_len += shiftlen;
	tgt->truesize += shiftlen;

	return shiftlen;
}

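/*
 * Return-value summary for skb_shift(): when the requested bytes can be
 * moved, paged data is shifted from the front of skb's fragment list onto
 * the tail of tgt, both skbs' len/data_len/truesize are adjusted, and the
 * function returns shiftlen; it returns 0 without moving any data when the
 * shift cannot be done, e.g. tgt already has MAX_SKB_FRAGS fragments or a
 * required pskb_expand_head() fails.
 */
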
/*
 * Bits from net/core/sock.c
 */

#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
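
/*
 * SK_WMEM_MAX therefore budgets for 256 in-flight packets, each charged at
 * the true size of a 256-byte payload (SKB_TRUESIZE(256), i.e. the payload
 * plus sk_buff and shared-info overhead), and that product is also used as
 * the default per-socket write buffer limit above.
 */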