linux/kernel/pid_namespace.c
<<
>>
Prefs
   1/*
   2 * Pid namespaces
   3 *
   4 * Authors:
   5 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
   6 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
   7 *     Many thanks to Oleg Nesterov for comments and help
   8 *
   9 */
  10
  11#include <linux/pid.h>
  12#include <linux/pid_namespace.h>
  13#include <linux/syscalls.h>
  14#include <linux/err.h>
  15#include <linux/acct.h>
  16#include <linux/slab.h>
  17#include <linux/proc_fs.h>
  18#include <linux/reboot.h>
  19#include <linux/export.h>
  20
  21#define BITS_PER_PAGE           (PAGE_SIZE*8)
  22
  23struct pid_cache {
  24        int nr_ids;
  25        char name[16];
  26        struct kmem_cache *cachep;
  27        struct list_head list;
  28};
  29
  30static LIST_HEAD(pid_caches_lh);
  31static DEFINE_MUTEX(pid_caches_mutex);
  32static struct kmem_cache *pid_ns_cachep;
  33
  34/*
  35 * creates the kmem cache to allocate pids from.
  36 * @nr_ids: the number of numerical ids this pid will have to carry
  37 */
  38
  39static struct kmem_cache *create_pid_cachep(int nr_ids)
  40{
  41        struct pid_cache *pcache;
  42        struct kmem_cache *cachep;
  43
  44        mutex_lock(&pid_caches_mutex);
  45        list_for_each_entry(pcache, &pid_caches_lh, list)
  46                if (pcache->nr_ids == nr_ids)
  47                        goto out;
  48
  49        pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
  50        if (pcache == NULL)
  51                goto err_alloc;
  52
  53        snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
  54        cachep = kmem_cache_create(pcache->name,
  55                        sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
  56                        0, SLAB_HWCACHE_ALIGN, NULL);
  57        if (cachep == NULL)
  58                goto err_cachep;
  59
  60        pcache->nr_ids = nr_ids;
  61        pcache->cachep = cachep;
  62        list_add(&pcache->list, &pid_caches_lh);
  63out:
  64        mutex_unlock(&pid_caches_mutex);
  65        return pcache->cachep;
  66
  67err_cachep:
  68        kfree(pcache);
  69err_alloc:
  70        mutex_unlock(&pid_caches_mutex);
  71        return NULL;
  72}
  73
  74/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
  75#define MAX_PID_NS_LEVEL 32
  76
  77static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
  78{
  79        struct pid_namespace *ns;
  80        unsigned int level = parent_pid_ns->level + 1;
  81        int i;
  82        int err;
  83
  84        if (level > MAX_PID_NS_LEVEL) {
  85                err = -EINVAL;
  86                goto out;
  87        }
  88
  89        err = -ENOMEM;
  90        ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
  91        if (ns == NULL)
  92                goto out;
  93
  94        ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
  95        if (!ns->pidmap[0].page)
  96                goto out_free;
  97
  98        ns->pid_cachep = create_pid_cachep(level + 1);
  99        if (ns->pid_cachep == NULL)
 100                goto out_free_map;
 101
 102        kref_init(&ns->kref);
 103        ns->level = level;
 104        ns->parent = get_pid_ns(parent_pid_ns);
 105
 106        set_bit(0, ns->pidmap[0].page);
 107        atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
 108
 109        for (i = 1; i < PIDMAP_ENTRIES; i++)
 110                atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
 111
 112        err = pid_ns_prepare_proc(ns);
 113        if (err)
 114                goto out_put_parent_pid_ns;
 115
 116        return ns;
 117
 118out_put_parent_pid_ns:
 119        put_pid_ns(parent_pid_ns);
 120out_free_map:
 121        kfree(ns->pidmap[0].page);
 122out_free:
 123        kmem_cache_free(pid_ns_cachep, ns);
 124out:
 125        return ERR_PTR(err);
 126}
 127
 128static void destroy_pid_namespace(struct pid_namespace *ns)
 129{
 130        int i;
 131
 132        for (i = 0; i < PIDMAP_ENTRIES; i++)
 133                kfree(ns->pidmap[i].page);
 134        kmem_cache_free(pid_ns_cachep, ns);
 135}
 136
 137struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
 138{
 139        if (!(flags & CLONE_NEWPID))
 140                return get_pid_ns(old_ns);
 141        if (flags & (CLONE_THREAD|CLONE_PARENT))
 142                return ERR_PTR(-EINVAL);
 143        return create_pid_namespace(old_ns);
 144}
 145
 146static void free_pid_ns(struct kref *kref)
 147{
 148        struct pid_namespace *ns;
 149
 150        ns = container_of(kref, struct pid_namespace, kref);
 151        destroy_pid_namespace(ns);
 152}
 153
 154void put_pid_ns(struct pid_namespace *ns)
 155{
 156        struct pid_namespace *parent;
 157
 158        while (ns != &init_pid_ns) {
 159                parent = ns->parent;
 160                if (!kref_put(&ns->kref, free_pid_ns))
 161                        break;
 162                ns = parent;
 163        }
 164}
 165EXPORT_SYMBOL_GPL(put_pid_ns);
 166
 167void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 168{
 169        int nr;
 170        int rc;
 171        struct task_struct *task, *me = current;
 172
 173        /* Ignore SIGCHLD causing any terminated children to autoreap */
 174        spin_lock_irq(&me->sighand->siglock);
 175        me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
 176        spin_unlock_irq(&me->sighand->siglock);
 177
 178        /*
 179         * The last thread in the cgroup-init thread group is terminating.
 180         * Find remaining pid_ts in the namespace, signal and wait for them
 181         * to exit.
 182         *
 183         * Note:  This signals each threads in the namespace - even those that
 184         *        belong to the same thread group, To avoid this, we would have
 185         *        to walk the entire tasklist looking a processes in this
 186         *        namespace, but that could be unnecessarily expensive if the
 187         *        pid namespace has just a few processes. Or we need to
 188         *        maintain a tasklist for each pid namespace.
 189         *
 190         */
 191        read_lock(&tasklist_lock);
 192        nr = next_pidmap(pid_ns, 1);
 193        while (nr > 0) {
 194                rcu_read_lock();
 195
 196                task = pid_task(find_vpid(nr), PIDTYPE_PID);
 197                if (task && !__fatal_signal_pending(task))
 198                        send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
 199
 200                rcu_read_unlock();
 201
 202                nr = next_pidmap(pid_ns, nr);
 203        }
 204        read_unlock(&tasklist_lock);
 205
 206        /* Firstly reap the EXIT_ZOMBIE children we may have. */
 207        do {
 208                clear_thread_flag(TIF_SIGPENDING);
 209                rc = sys_wait4(-1, NULL, __WALL, NULL);
 210        } while (rc != -ECHILD);
 211
 212        /*
 213         * sys_wait4() above can't reap the TASK_DEAD children.
 214         * Make sure they all go away, see __unhash_process().
 215         */
 216        for (;;) {
 217                bool need_wait = false;
 218
 219                read_lock(&tasklist_lock);
 220                if (!list_empty(&current->children)) {
 221                        __set_current_state(TASK_UNINTERRUPTIBLE);
 222                        need_wait = true;
 223                }
 224                read_unlock(&tasklist_lock);
 225
 226                if (!need_wait)
 227                        break;
 228                schedule();
 229        }
 230
 231        if (pid_ns->reboot)
 232                current->signal->group_exit_code = pid_ns->reboot;
 233
 234        acct_exit_ns(pid_ns);
 235        return;
 236}
 237
 238#ifdef CONFIG_CHECKPOINT_RESTORE
 239static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 240                void __user *buffer, size_t *lenp, loff_t *ppos)
 241{
 242        struct ctl_table tmp = *table;
 243
 244        if (write && !capable(CAP_SYS_ADMIN))
 245                return -EPERM;
 246
 247        /*
 248         * Writing directly to ns' last_pid field is OK, since this field
 249         * is volatile in a living namespace anyway and a code writing to
 250         * it should synchronize its usage with external means.
 251         */
 252
 253        tmp.data = &current->nsproxy->pid_ns->last_pid;
 254        return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 255}
 256
 257extern int pid_max;
 258static int zero = 0;
 259static struct ctl_table pid_ns_ctl_table[] = {
 260        {
 261                .procname = "ns_last_pid",
 262                .maxlen = sizeof(int),
 263                .mode = 0666, /* permissions are checked in the handler */
 264                .proc_handler = pid_ns_ctl_handler,
 265                .extra1 = &zero,
 266                .extra2 = &pid_max,
 267        },
 268        { }
 269};
 270static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
 271#endif  /* CONFIG_CHECKPOINT_RESTORE */
 272
 273int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 274{
 275        if (pid_ns == &init_pid_ns)
 276                return 0;
 277
 278        switch (cmd) {
 279        case LINUX_REBOOT_CMD_RESTART2:
 280        case LINUX_REBOOT_CMD_RESTART:
 281                pid_ns->reboot = SIGHUP;
 282                break;
 283
 284        case LINUX_REBOOT_CMD_POWER_OFF:
 285        case LINUX_REBOOT_CMD_HALT:
 286                pid_ns->reboot = SIGINT;
 287                break;
 288        default:
 289                return -EINVAL;
 290        }
 291
 292        read_lock(&tasklist_lock);
 293        force_sig(SIGKILL, pid_ns->child_reaper);
 294        read_unlock(&tasklist_lock);
 295
 296        do_exit(0);
 297
 298        /* Not reached */
 299        return 0;
 300}
 301
 302static __init int pid_namespaces_init(void)
 303{
 304        pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
 305
 306#ifdef CONFIG_CHECKPOINT_RESTORE
 307        register_sysctl_paths(kern_path, pid_ns_ctl_table);
 308#endif
 309        return 0;
 310}
 311
 312__initcall(pid_namespaces_init);
 313
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.