linux/fs/file.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/file.c
   3 *
   4 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
   5 *
   6 *  Manage the dynamic fd arrays in the process files_struct.
   7 */
   8
   9#include <linux/module.h>
  10#include <linux/fs.h>
  11#include <linux/mm.h>
  12#include <linux/time.h>
  13#include <linux/sched.h>
  14#include <linux/slab.h>
  15#include <linux/vmalloc.h>
  16#include <linux/file.h>
  17#include <linux/fdtable.h>
  18#include <linux/bitops.h>
  19#include <linux/interrupt.h>
  20#include <linux/spinlock.h>
  21#include <linux/rcupdate.h>
  22#include <linux/workqueue.h>
  23
/*
 * Per-CPU bookkeeping for fdtables whose release is handed off to a work
 * item (queued by free_fdtable_rcu(), drained by free_fdtable_work()).
 */
struct fdtable_defer {
	spinlock_t lock;		/* taken around every access to ->next */
	struct work_struct wq;		/* set up to run free_fdtable_work() */
	struct fdtable *next;		/* list head, chained through fdtable->next */
};
  29
/* Upper bound on fd numbers; checked in expand_files() and alloc_fdtable(). */
int sysctl_nr_open __read_mostly = 1024*1024;
/* NOTE(review): presumably the bounds applied when sysctl_nr_open is written - the handler is not in this file. */
int sysctl_nr_open_min = BITS_PER_LONG;
int sysctl_nr_open_max = 1024 * 1024; /* raised later, in files_defer_init() */
  33
/*
 * We use this list to defer freeing fdtables that have vmalloced
 * sets/arrays. By keeping a per-cpu list, we avoid having to embed
 * the work_struct in fdtable itself, which avoids a 64 byte (i386)
 * increase in this per-task structure.
 */
static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
  41
  42static inline void *alloc_fdmem(unsigned int size)
  43{
  44        void *data;
  45
  46        data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
  47        if (data != NULL)
  48                return data;
  49
  50        return vmalloc(size);
  51}
  52
  53static void free_fdmem(void *ptr)
  54{
  55        is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
  56}
  57
  58static void __free_fdtable(struct fdtable *fdt)
  59{
  60        free_fdmem(fdt->fd);
  61        free_fdmem(fdt->open_fds);
  62        kfree(fdt);
  63}
  64
  65static void free_fdtable_work(struct work_struct *work)
  66{
  67        struct fdtable_defer *f =
  68                container_of(work, struct fdtable_defer, wq);
  69        struct fdtable *fdt;
  70
  71        spin_lock_bh(&f->lock);
  72        fdt = f->next;
  73        f->next = NULL;
  74        spin_unlock_bh(&f->lock);
  75        while(fdt) {
  76                struct fdtable *next = fdt->next;
  77
  78                __free_fdtable(fdt);
  79                fdt = next;
  80        }
  81}
  82
  83void free_fdtable_rcu(struct rcu_head *rcu)
  84{
  85        struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
  86        struct fdtable_defer *fddef;
  87
  88        BUG_ON(!fdt);
  89
  90        if (fdt->max_fds <= NR_OPEN_DEFAULT) {
  91                /*
  92                 * This fdtable is embedded in the files structure and that
  93                 * structure itself is getting destroyed.
  94                 */
  95                kmem_cache_free(files_cachep,
  96                                container_of(fdt, struct files_struct, fdtab));
  97                return;
  98        }
  99        if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
 100                kfree(fdt->fd);
 101                kfree(fdt->open_fds);
 102                kfree(fdt);
 103        } else {
 104                fddef = &get_cpu_var(fdtable_defer_list);
 105                spin_lock(&fddef->lock);
 106                fdt->next = fddef->next;
 107                fddef->next = fdt;
 108                /* vmallocs are handled from the workqueue context */
 109                schedule_work(&fddef->wq);
 110                spin_unlock(&fddef->lock);
 111                put_cpu_var(fdtable_defer_list);
 112        }
 113}
 114
 115/*
 116 * Expand the fdset in the files_struct.  Called with the files spinlock
 117 * held for write.
 118 */
 119static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 120{
 121        unsigned int cpy, set;
 122
 123        BUG_ON(nfdt->max_fds < ofdt->max_fds);
 124
 125        cpy = ofdt->max_fds * sizeof(struct file *);
 126        set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
 127        memcpy(nfdt->fd, ofdt->fd, cpy);
 128        memset((char *)(nfdt->fd) + cpy, 0, set);
 129
 130        cpy = ofdt->max_fds / BITS_PER_BYTE;
 131        set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
 132        memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
 133        memset((char *)(nfdt->open_fds) + cpy, 0, set);
 134        memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
 135        memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
 136}
 137
 138static struct fdtable * alloc_fdtable(unsigned int nr)
 139{
 140        struct fdtable *fdt;
 141        char *data;
 142
 143        /*
 144         * Figure out how many fds we actually want to support in this fdtable.
 145         * Allocation steps are keyed to the size of the fdarray, since it
 146         * grows far faster than any of the other dynamic data. We try to fit
 147         * the fdarray into comfortable page-tuned chunks: starting at 1024B
 148         * and growing in powers of two from there on.
 149         */
 150        nr /= (1024 / sizeof(struct file *));
 151        nr = roundup_pow_of_two(nr + 1);
 152        nr *= (1024 / sizeof(struct file *));
 153        /*
 154         * Note that this can drive nr *below* what we had passed if sysctl_nr_open
 155         * had been set lower between the check in expand_files() and here.  Deal
 156         * with that in caller, it's cheaper that way.
 157         *
 158         * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
 159         * bitmaps handling below becomes unpleasant, to put it mildly...
 160         */
 161        if (unlikely(nr > sysctl_nr_open))
 162                nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 163
 164        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
 165        if (!fdt)
 166                goto out;
 167        fdt->max_fds = nr;
 168        data = alloc_fdmem(nr * sizeof(struct file *));
 169        if (!data)
 170                goto out_fdt;
 171        fdt->fd = (struct file **)data;
 172        data = alloc_fdmem(max_t(unsigned int,
 173                                 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
 174        if (!data)
 175                goto out_arr;
 176        fdt->open_fds = (fd_set *)data;
 177        data += nr / BITS_PER_BYTE;
 178        fdt->close_on_exec = (fd_set *)data;
 179        fdt->next = NULL;
 180
 181        return fdt;
 182
 183out_arr:
 184        free_fdmem(fdt->fd);
 185out_fdt:
 186        kfree(fdt);
 187out:
 188        return NULL;
 189}
 190
 191/*
 192 * Expand the file descriptor table.
 193 * This function will allocate a new fdtable and both fd array and fdset, of
 194 * the given size.
 195 * Return <0 error code on error; 1 on successful completion.
 196 * The files->file_lock should be held on entry, and will be held on exit.
 197 */
 198static int expand_fdtable(struct files_struct *files, int nr)
 199        __releases(files->file_lock)
 200        __acquires(files->file_lock)
 201{
 202        struct fdtable *new_fdt, *cur_fdt;
 203
 204        spin_unlock(&files->file_lock);
 205        new_fdt = alloc_fdtable(nr);
 206        spin_lock(&files->file_lock);
 207        if (!new_fdt)
 208                return -ENOMEM;
 209        /*
 210         * extremely unlikely race - sysctl_nr_open decreased between the check in
 211         * caller and alloc_fdtable().  Cheaper to catch it here...
 212         */
 213        if (unlikely(new_fdt->max_fds <= nr)) {
 214                __free_fdtable(new_fdt);
 215                return -EMFILE;
 216        }
 217        /*
 218         * Check again since another task may have expanded the fd table while
 219         * we dropped the lock
 220         */
 221        cur_fdt = files_fdtable(files);
 222        if (nr >= cur_fdt->max_fds) {
 223                /* Continue as planned */
 224                copy_fdtable(new_fdt, cur_fdt);
 225                rcu_assign_pointer(files->fdt, new_fdt);
 226                if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
 227                        free_fdtable(cur_fdt);
 228        } else {
 229                /* Somebody else expanded, so undo our attempt */
 230                __free_fdtable(new_fdt);
 231        }
 232        return 1;
 233}
 234
 235/*
 236 * Expand files.
 237 * This function will expand the file structures, if the requested size exceeds
 238 * the current capacity and there is room for expansion.
 239 * Return <0 error code on error; 0 when nothing done; 1 when files were
 240 * expanded and execution may have blocked.
 241 * The files->file_lock should be held on entry, and will be held on exit.
 242 */
 243int expand_files(struct files_struct *files, int nr)
 244{
 245        struct fdtable *fdt;
 246
 247        fdt = files_fdtable(files);
 248
 249        /*
 250         * N.B. For clone tasks sharing a files structure, this test
 251         * will limit the total number of files that can be opened.
 252         */
 253        if (nr >= rlimit(RLIMIT_NOFILE))
 254                return -EMFILE;
 255
 256        /* Do we need to expand? */
 257        if (nr < fdt->max_fds)
 258                return 0;
 259
 260        /* Can we expand? */
 261        if (nr >= sysctl_nr_open)
 262                return -EMFILE;
 263
 264        /* All good, so we try */
 265        return expand_fdtable(files, nr);
 266}
 267
 268static int count_open_files(struct fdtable *fdt)
 269{
 270        int size = fdt->max_fds;
 271        int i;
 272
 273        /* Find the last open fd */
 274        for (i = size/(8*sizeof(long)); i > 0; ) {
 275                if (fdt->open_fds->fds_bits[--i])
 276                        break;
 277        }
 278        i = (i+1) * 8 * sizeof(long);
 279        return i;
 280}
 281
 282/*
 283 * Allocate a new files structure and copy contents from the
 284 * passed in files structure.
 285 * errorp will be valid only when the returned files_struct is NULL.
 286 */
 287struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 288{
 289        struct files_struct *newf;
 290        struct file **old_fds, **new_fds;
 291        int open_files, size, i;
 292        struct fdtable *old_fdt, *new_fdt;
 293
 294        *errorp = -ENOMEM;
 295        newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
 296        if (!newf)
 297                goto out;
 298
 299        atomic_set(&newf->count, 1);
 300
 301        spin_lock_init(&newf->file_lock);
 302        newf->next_fd = 0;
 303        new_fdt = &newf->fdtab;
 304        new_fdt->max_fds = NR_OPEN_DEFAULT;
 305        new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
 306        new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
 307        new_fdt->fd = &newf->fd_array[0];
 308        new_fdt->next = NULL;
 309
 310        spin_lock(&oldf->file_lock);
 311        old_fdt = files_fdtable(oldf);
 312        open_files = count_open_files(old_fdt);
 313
 314        /*
 315         * Check whether we need to allocate a larger fd array and fd set.
 316         */
 317        while (unlikely(open_files > new_fdt->max_fds)) {
 318                spin_unlock(&oldf->file_lock);
 319
 320                if (new_fdt != &newf->fdtab)
 321                        __free_fdtable(new_fdt);
 322
 323                new_fdt = alloc_fdtable(open_files - 1);
 324                if (!new_fdt) {
 325                        *errorp = -ENOMEM;
 326                        goto out_release;
 327                }
 328
 329                /* beyond sysctl_nr_open; nothing to do */
 330                if (unlikely(new_fdt->max_fds < open_files)) {
 331                        __free_fdtable(new_fdt);
 332                        *errorp = -EMFILE;
 333                        goto out_release;
 334                }
 335
 336                /*
 337                 * Reacquire the oldf lock and a pointer to its fd table
 338                 * who knows it may have a new bigger fd table. We need
 339                 * the latest pointer.
 340                 */
 341                spin_lock(&oldf->file_lock);
 342                old_fdt = files_fdtable(oldf);
 343                open_files = count_open_files(old_fdt);
 344        }
 345
 346        old_fds = old_fdt->fd;
 347        new_fds = new_fdt->fd;
 348
 349        memcpy(new_fdt->open_fds->fds_bits,
 350                old_fdt->open_fds->fds_bits, open_files/8);
 351        memcpy(new_fdt->close_on_exec->fds_bits,
 352                old_fdt->close_on_exec->fds_bits, open_files/8);
 353
 354        for (i = open_files; i != 0; i--) {
 355                struct file *f = *old_fds++;
 356                if (f) {
 357                        get_file(f);
 358                } else {
 359                        /*
 360                         * The fd may be claimed in the fd bitmap but not yet
 361                         * instantiated in the files array if a sibling thread
 362                         * is partway through open().  So make sure that this
 363                         * fd is available to the new process.
 364                         */
 365                        FD_CLR(open_files - i, new_fdt->open_fds);
 366                }
 367                rcu_assign_pointer(*new_fds++, f);
 368        }
 369        spin_unlock(&oldf->file_lock);
 370
 371        /* compute the remainder to be cleared */
 372        size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
 373
 374        /* This is long word aligned thus could use a optimized version */
 375        memset(new_fds, 0, size);
 376
 377        if (new_fdt->max_fds > open_files) {
 378                int left = (new_fdt->max_fds-open_files)/8;
 379                int start = open_files / (8 * sizeof(unsigned long));
 380
 381                memset(&new_fdt->open_fds->fds_bits[start], 0, left);
 382                memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 383        }
 384
 385        rcu_assign_pointer(newf->fdt, new_fdt);
 386
 387        return newf;
 388
 389out_release:
 390        kmem_cache_free(files_cachep, newf);
 391out:
 392        return NULL;
 393}
 394
 395static void __devinit fdtable_defer_list_init(int cpu)
 396{
 397        struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
 398        spin_lock_init(&fddef->lock);
 399        INIT_WORK(&fddef->wq, free_fdtable_work);
 400        fddef->next = NULL;
 401}
 402
 403void __init files_defer_init(void)
 404{
 405        int i;
 406        for_each_possible_cpu(i)
 407                fdtable_defer_list_init(i);
 408        sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
 409                             -BITS_PER_LONG;
 410}
 411
 412struct files_struct init_files = {
 413        .count          = ATOMIC_INIT(1),
 414        .fdt            = &init_files.fdtab,
 415        .fdtab          = {
 416                .max_fds        = NR_OPEN_DEFAULT,
 417                .fd             = &init_files.fd_array[0],
 418                .close_on_exec  = (fd_set *)&init_files.close_on_exec_init,
 419                .open_fds       = (fd_set *)&init_files.open_fds_init,
 420        },
 421        .file_lock      = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 422};
 423
 424/*
 425 * allocate a file descriptor, mark it busy.
 426 */
 427int alloc_fd(unsigned start, unsigned flags)
 428{
 429        struct files_struct *files = current->files;
 430        unsigned int fd;
 431        int error;
 432        struct fdtable *fdt;
 433
 434        spin_lock(&files->file_lock);
 435repeat:
 436        fdt = files_fdtable(files);
 437        fd = start;
 438        if (fd < files->next_fd)
 439                fd = files->next_fd;
 440
 441        if (fd < fdt->max_fds)
 442                fd = find_next_zero_bit(fdt->open_fds->fds_bits,
 443                                           fdt->max_fds, fd);
 444
 445        error = expand_files(files, fd);
 446        if (error < 0)
 447                goto out;
 448
 449        /*
 450         * If we needed to expand the fs array we
 451         * might have blocked - try again.
 452         */
 453        if (error)
 454                goto repeat;
 455
 456        if (start <= files->next_fd)
 457                files->next_fd = fd + 1;
 458
 459        FD_SET(fd, fdt->open_fds);
 460        if (flags & O_CLOEXEC)
 461                FD_SET(fd, fdt->close_on_exec);
 462        else
 463                FD_CLR(fd, fdt->close_on_exec);
 464        error = fd;
 465#if 1
 466        /* Sanity check */
 467        if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
 468                printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
 469                rcu_assign_pointer(fdt->fd[fd], NULL);
 470        }
 471#endif
 472
 473out:
 474        spin_unlock(&files->file_lock);
 475        return error;
 476}
 477
/*
 * Convenience wrapper: allocate a descriptor for the current task,
 * searching from 0 with no flags (so close-on-exec is cleared).
 * Returns whatever alloc_fd() returns.
 */
int get_unused_fd(void)
{
	return alloc_fd(0, 0);
}
EXPORT_SYMBOL(get_unused_fd);
 483
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.