linux/kernel/pid_namespace.c
/*
 * Pid namespaces
 *
 * Authors:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/reboot.h>

#define BITS_PER_PAGE		(PAGE_SIZE*8)
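
/*
 * One pid cache exists per namespace nesting depth: all pid namespaces
 * at the same level share it, because their struct pid allocations are
 * the same size. Caches are created on demand and kept on a list
 * protected by pid_caches_mutex.
 */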
struct pid_cache {
	int nr_ids;
	char name[16];
	struct kmem_cache *cachep;
	struct list_head list;
};

static LIST_HEAD(pid_caches_lh);
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;

/*
 * creates the kmem cache to allocate pids from.
 * @nr_ids: the number of numerical ids this pid will have to carry
 */
static struct kmem_cache *create_pid_cachep(int nr_ids)
{
	struct pid_cache *pcache;
	struct kmem_cache *cachep;

	mutex_lock(&pid_caches_mutex);
	list_for_each_entry(pcache, &pid_caches_lh, list)
		if (pcache->nr_ids == nr_ids)
			goto out;

	pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
	if (pcache == NULL)
		goto err_alloc;

	snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
	cachep = kmem_cache_create(pcache->name,
			sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (cachep == NULL)
		goto err_cachep;

	pcache->nr_ids = nr_ids;
	pcache->cachep = cachep;
	list_add(&pcache->list, &pid_caches_lh);
out:
	mutex_unlock(&pid_caches_mutex);
	return pcache->cachep;

err_cachep:
	kfree(pcache);
err_alloc:
	mutex_unlock(&pid_caches_mutex);
	return NULL;
}
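
/*
 * Sizing note for the kmem_cache_create() call above: struct pid
 * already embeds one struct upid in its numbers[] array, so a pid
 * carrying nr_ids numerical ids (one per namespace level, from its own
 * namespace up to init_pid_ns) needs only nr_ids - 1 extra upids.
 * Example: a namespace at level 1 uses objects of
 * sizeof(struct pid) + sizeof(struct upid) bytes.
 */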

static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
{
	struct pid_namespace *ns;
	unsigned int level = parent_pid_ns->level + 1;
	int i, err = -ENOMEM;

	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
	if (ns == NULL)
		goto out;

	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!ns->pidmap[0].page)
		goto out_free;

	ns->pid_cachep = create_pid_cachep(level + 1);
	if (ns->pid_cachep == NULL)
		goto out_free_map;

	kref_init(&ns->kref);
	ns->level = level;
	ns->parent = get_pid_ns(parent_pid_ns);

	set_bit(0, ns->pidmap[0].page);
	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);

	for (i = 1; i < PIDMAP_ENTRIES; i++)
		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

	err = pid_ns_prepare_proc(ns);
	if (err)
		goto out_put_parent_pid_ns;

	return ns;

out_put_parent_pid_ns:
	put_pid_ns(parent_pid_ns);
out_free_map:
	kfree(ns->pidmap[0].page);
out_free:
	kmem_cache_free(pid_ns_cachep, ns);
out:
	return ERR_PTR(err);
}
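
/*
 * Worked example of the pidmap setup above: with 4K pages,
 * BITS_PER_PAGE is 32768, so each pidmap page tracks 32768 pids.
 * Bit 0 of page 0 is pre-set so that pid 0 is never handed out,
 * leaving BITS_PER_PAGE - 1 free ids there and a full BITS_PER_PAGE on
 * every later page; only the first page is allocated up front, the
 * rest on demand by the pid allocator.
 */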

static void destroy_pid_namespace(struct pid_namespace *ns)
{
	int i;

	for (i = 0; i < PIDMAP_ENTRIES; i++)
		kfree(ns->pidmap[i].page);
	kmem_cache_free(pid_ns_cachep, ns);
}

struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
{
	if (!(flags & CLONE_NEWPID))
		return get_pid_ns(old_ns);
	if (flags & (CLONE_THREAD|CLONE_PARENT))
		return ERR_PTR(-EINVAL);
	return create_pid_namespace(old_ns);
}
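
/*
 * Illustrative userspace counterpart (an assumption for documentation,
 * not part of this file): the CLONE_NEWPID branch above is reached via
 * clone(2), e.g. on x86-64:
 *
 *	pid_t child = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD,
 *			      NULL, NULL, NULL, NULL);
 *
 * In the child, getpid() returns 1; the parent sees the pid allocated
 * in its own, outer namespace.
 */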

/*
 * free_pid_ns() is the kref release callback run by put_pid_ns() when
 * the last reference to a namespace goes away. Dropping the parent's
 * reference afterwards lets a whole chain of otherwise-unreferenced
 * ancestor namespaces be torn down in turn.
 */
void free_pid_ns(struct kref *kref)
{
	struct pid_namespace *ns, *parent;

	ns = container_of(kref, struct pid_namespace, kref);

	parent = ns->parent;
	destroy_pid_namespace(ns);

	if (parent != NULL)
		put_pid_ns(parent);
}
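
/*
 * Called on the exit path when the last thread of a pid namespace's
 * init task (its child reaper) is exiting: kill and reap everything
 * else in the namespace, so that no task outlives its reaper.
 */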
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
	int nr;
	int rc;
	struct task_struct *task, *me = current;

	/* Ignore SIGCHLD so that terminated children are autoreaped */
	spin_lock_irq(&me->sighand->siglock);
	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
	spin_unlock_irq(&me->sighand->siglock);

	/*
	 * The last thread in this namespace's init thread group is
	 * terminating. Find the remaining pids in the namespace, then
	 * signal them and wait for them to exit.
	 *
	 * Note: this signals each thread in the namespace - even those
	 *	 that belong to the same thread group. To avoid that, we
	 *	 would have to walk the entire tasklist looking for
	 *	 processes in this namespace, which could be unnecessarily
	 *	 expensive if the pid namespace has just a few processes;
	 *	 alternatively, we would need to maintain a tasklist for
	 *	 each pid namespace.
	 */
	read_lock(&tasklist_lock);
	nr = next_pidmap(pid_ns, 1);
	while (nr > 0) {
		rcu_read_lock();

		task = pid_task(find_vpid(nr), PIDTYPE_PID);
		if (task && !__fatal_signal_pending(task))
			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);

		rcu_read_unlock();

		nr = next_pidmap(pid_ns, nr);
	}
	read_unlock(&tasklist_lock);

	/* First, reap any EXIT_ZOMBIE children we may have */
	do {
		clear_thread_flag(TIF_SIGPENDING);
		rc = sys_wait4(-1, NULL, __WALL, NULL);
	} while (rc != -ECHILD);

	/*
	 * sys_wait4() above can't reap the TASK_DEAD children.
	 * Make sure they all go away, see __unhash_process().
	 */
	for (;;) {
		bool need_wait = false;

		read_lock(&tasklist_lock);
		if (!list_empty(&current->children)) {
			__set_current_state(TASK_UNINTERRUPTIBLE);
			need_wait = true;
		}
		read_unlock(&tasklist_lock);

		if (!need_wait)
			break;
		schedule();
	}

	if (pid_ns->reboot)
		current->signal->group_exit_code = pid_ns->reboot;

	acct_exit_ns(pid_ns);
}

#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table tmp = *table;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * Writing directly to the namespace's last_pid field is OK: the
	 * field is volatile in a living namespace anyway, so any code
	 * writing to it must provide its own external synchronization.
	 */
	tmp.data = &current->nsproxy->pid_ns->last_pid;
	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
}
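
/*
 * Userspace usage sketch (illustrative): a checkpoint/restore tool
 * that wants the next fork() in its pid namespace to return pid N
 * writes N - 1 first, since the allocator scans upward from
 * last_pid + 1:
 *
 *	# with CAP_SYS_ADMIN, inside the target pid namespace
 *	echo 9999 > /proc/sys/kernel/ns_last_pid
 *	# the next successful fork()/clone() yields pid 10000
 */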

static struct ctl_table pid_ns_ctl_table[] = {
	{
		.procname = "ns_last_pid",
		.maxlen = sizeof(int),
		.mode = 0666, /* permissions are checked in the handler */
		.proc_handler = pid_ns_ctl_handler,
	},
	{ }
};
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
#endif	/* CONFIG_CHECKPOINT_RESTORE */

int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
	if (pid_ns == &init_pid_ns)
		return 0;

	switch (cmd) {
	case LINUX_REBOOT_CMD_RESTART2:
	case LINUX_REBOOT_CMD_RESTART:
		pid_ns->reboot = SIGHUP;
		break;

	case LINUX_REBOOT_CMD_POWER_OFF:
	case LINUX_REBOOT_CMD_HALT:
		pid_ns->reboot = SIGINT;
		break;
	default:
		return -EINVAL;
	}

	read_lock(&tasklist_lock);
	force_sig(SIGKILL, pid_ns->child_reaper);
	read_unlock(&tasklist_lock);

	do_exit(0);

	/* Not reached */
	return 0;
}
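
/*
 * How this ties into zap_pid_ns_processes() above: the SIGKILL forces
 * the namespace's init to exit through zap_pid_ns_processes(), which
 * copies pid_ns->reboot into its group_exit_code. Whoever waits on the
 * container init can therefore tell a restart request (SIGHUP) from a
 * halt or power-off (SIGINT) and react accordingly.
 */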

static __init int pid_namespaces_init(void)
{
	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);

#ifdef CONFIG_CHECKPOINT_RESTORE
	register_sysctl_paths(kern_path, pid_ns_ctl_table);
#endif
	return 0;
}

__initcall(pid_namespaces_init);