linux/kernel/acct.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/acct.c
   3 *
   4 *  BSD Process Accounting for Linux
   5 *
   6 *  Author: Marco van Wieringen <mvw@planets.elm.net>
   7 *
   8 *  Some code based on ideas and code from:
   9 *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
  10 *
  11 *  This file implements BSD-style process accounting. Whenever any
  12 *  process exits, an accounting record of type "struct acct" is
  13 *  written to the file specified with the acct() system call. It is
  14 *  up to user-level programs to do useful things with the accounting
  15 *  log. The kernel just provides the raw accounting information.
  16 *
  17 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
  18 *
  19 *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
  20 *  the file happened to be read-only. 2) If the accounting was suspended
  21 *  due to the lack of space it happily allowed to reopen it and completely
  22 *  lost the old acct_file. 3/10/98, Al Viro.
  23 *
  24 *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
  25 *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
  26 *
   27 *  Fixed a nasty interaction with sys_umount(). If the accounting
   28 *  was suspended we failed to stop it on umount(). Messy.
  29 *  Another one: remount to readonly didn't stop accounting.
  30 *      Question: what should we do if we have CAP_SYS_ADMIN but not
  31 *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
  32 *  unless we are messing with the root. In that case we are getting a
  33 *  real mess with do_remount_sb(). 9/11/98, AV.
  34 *
  35 *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
  36 *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
  37 *  one race (and leak) in BSD implementation.
  38 *  OK, that's better. ANOTHER race and leak in BSD variant. There always
  39 *  is one more bug... 10/11/98, AV.
  40 *
  41 *      Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
  42 * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
  43 * a struct file opened for write. Fixed. 2/6/2000, AV.
  44 */
  45
  46#include <linux/mm.h>
  47#include <linux/slab.h>
  48#include <linux/acct.h>
  49#include <linux/capability.h>
  50#include <linux/file.h>
  51#include <linux/tty.h>
  52#include <linux/security.h>
  53#include <linux/vfs.h>
  54#include <linux/jiffies.h>
  55#include <linux/times.h>
  56#include <linux/syscalls.h>
  57#include <linux/mount.h>
  58#include <asm/uaccess.h>
  59#include <asm/div64.h>
  60#include <linux/blkdev.h> /* sector_div */
  61#include <linux/pid_namespace.h>
  62
  63/*
  64 * These constants control the amount of freespace that suspend and
  65 * resume the process accounting system, and the time delay between
  66 * each check.
  67 * Turned into sysctl-controllable parameters. AV, 12/11/98
  68 */
  69
  70int acct_parm[3] = {4, 2, 30};
  71#define RESUME          (acct_parm[0])  /* >foo% free space - resume */
  72#define SUSPEND         (acct_parm[1])  /* <foo% free space - suspend */
  73#define ACCT_TIMEOUT    (acct_parm[2])  /* foo second timeout between checks */
  74
  75/*
  76 * External references and all of the globals.
  77 */
  78static void do_acct_process(struct bsd_acct_struct *acct,
  79                struct pid_namespace *ns, struct file *);
  80
  81/*
  82 * This structure is used so that all the data protected by lock
  83 * can be placed in the same cache line as the lock.  This primes
  84 * the cache line to have the data after getting the lock.
  85 */
  86struct bsd_acct_struct {
  87        volatile int            active;
  88        volatile int            needcheck;
  89        struct file             *file;
  90        struct pid_namespace    *ns;
  91        struct timer_list       timer;
  92        struct list_head        list;
  93};
  94
  95static DEFINE_SPINLOCK(acct_lock);
  96static LIST_HEAD(acct_list);
  97
  98/*
  99 * Called whenever the timer says to check the free space.
 100 */
 101static void acct_timeout(unsigned long x)
 102{
 103        struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
 104        acct->needcheck = 1;
 105}
 106
 107/*
 108 * Check the amount of free space and suspend/resume accordingly.
 109 */
 110static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 111{
 112        struct kstatfs sbuf;
 113        int res;
 114        int act;
 115        sector_t resume;
 116        sector_t suspend;
 117
 118        spin_lock(&acct_lock);
 119        res = acct->active;
 120        if (!file || !acct->needcheck)
 121                goto out;
 122        spin_unlock(&acct_lock);
 123
 124        /* May block */
 125        if (vfs_statfs(&file->f_path, &sbuf))
 126                return res;
 127        suspend = sbuf.f_blocks * SUSPEND;
 128        resume = sbuf.f_blocks * RESUME;
 129
 130        sector_div(suspend, 100);
 131        sector_div(resume, 100);
 132
 133        if (sbuf.f_bavail <= suspend)
 134                act = -1;
 135        else if (sbuf.f_bavail >= resume)
 136                act = 1;
 137        else
 138                act = 0;
 139
 140        /*
 141         * If some joker switched acct->file under us we'ld better be
 142         * silent and _not_ touch anything.
 143         */
 144        spin_lock(&acct_lock);
 145        if (file != acct->file) {
 146                if (act)
 147                        res = act>0;
 148                goto out;
 149        }
 150
 151        if (acct->active) {
 152                if (act < 0) {
 153                        acct->active = 0;
 154                        printk(KERN_INFO "Process accounting paused\n");
 155                }
 156        } else {
 157                if (act > 0) {
 158                        acct->active = 1;
 159                        printk(KERN_INFO "Process accounting resumed\n");
 160                }
 161        }
 162
 163        del_timer(&acct->timer);
 164        acct->needcheck = 0;
 165        acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 166        add_timer(&acct->timer);
 167        res = acct->active;
 168out:
 169        spin_unlock(&acct_lock);
 170        return res;
 171}
 172
 173/*
 174 * Close the old accounting file (if currently open) and then replace
 175 * it with file (if non-NULL).
 176 *
 177 * NOTE: acct_lock MUST be held on entry and exit.
 178 */
 179static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 180                struct pid_namespace *ns)
 181{
 182        struct file *old_acct = NULL;
 183        struct pid_namespace *old_ns = NULL;
 184
 185        if (acct->file) {
 186                old_acct = acct->file;
 187                old_ns = acct->ns;
 188                del_timer(&acct->timer);
 189                acct->active = 0;
 190                acct->needcheck = 0;
 191                acct->file = NULL;
 192                acct->ns = NULL;
 193                list_del(&acct->list);
 194        }
 195        if (file) {
 196                acct->file = file;
 197                acct->ns = ns;
 198                acct->needcheck = 0;
 199                acct->active = 1;
 200                list_add(&acct->list, &acct_list);
 201                /* It's been deleted if it was used before so this is safe */
 202                setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
 203                acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 204                add_timer(&acct->timer);
 205        }
 206        if (old_acct) {
 207                mnt_unpin(old_acct->f_path.mnt);
 208                spin_unlock(&acct_lock);
 209                do_acct_process(acct, old_ns, old_acct);
 210                filp_close(old_acct, NULL);
 211                spin_lock(&acct_lock);
 212        }
 213}
 214
 215static int acct_on(char *name)
 216{
 217        struct file *file;
 218        struct vfsmount *mnt;
 219        struct pid_namespace *ns;
 220        struct bsd_acct_struct *acct = NULL;
 221
 222        /* Difference from BSD - they don't do O_APPEND */
 223        file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 224        if (IS_ERR(file))
 225                return PTR_ERR(file);
 226
 227        if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
 228                filp_close(file, NULL);
 229                return -EACCES;
 230        }
 231
 232        if (!file->f_op->write) {
 233                filp_close(file, NULL);
 234                return -EIO;
 235        }
 236
 237        ns = task_active_pid_ns(current);
 238        if (ns->bacct == NULL) {
 239                acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
 240                if (acct == NULL) {
 241                        filp_close(file, NULL);
 242                        return -ENOMEM;
 243                }
 244        }
 245
 246        spin_lock(&acct_lock);
 247        if (ns->bacct == NULL) {
 248                ns->bacct = acct;
 249                acct = NULL;
 250        }
 251
 252        mnt = file->f_path.mnt;
 253        mnt_pin(mnt);
 254        acct_file_reopen(ns->bacct, file, ns);
 255        spin_unlock(&acct_lock);
 256
 257        mntput(mnt); /* it's pinned, now give up active reference */
 258        kfree(acct);
 259
 260        return 0;
 261}
 262
 263/**
 264 * sys_acct - enable/disable process accounting
 265 * @name: file name for accounting records or NULL to shutdown accounting
 266 *
 267 * Returns 0 for success or negative errno values for failure.
 268 *
 269 * sys_acct() is the only system call needed to implement process
 270 * accounting. It takes the name of the file where accounting records
 271 * should be written. If the filename is NULL, accounting will be
 272 * shutdown.
 273 */
 274SYSCALL_DEFINE1(acct, const char __user *, name)
 275{
 276        int error = 0;
 277
 278        if (!capable(CAP_SYS_PACCT))
 279                return -EPERM;
 280
 281        if (name) {
 282                char *tmp = getname(name);
 283                if (IS_ERR(tmp))
 284                        return (PTR_ERR(tmp));
 285                error = acct_on(tmp);
 286                putname(tmp);
 287        } else {
 288                struct bsd_acct_struct *acct;
 289
 290                acct = task_active_pid_ns(current)->bacct;
 291                if (acct == NULL)
 292                        return 0;
 293
 294                spin_lock(&acct_lock);
 295                acct_file_reopen(acct, NULL, NULL);
 296                spin_unlock(&acct_lock);
 297        }
 298
 299        return error;
 300}
 301
 302/**
 303 * acct_auto_close - turn off a filesystem's accounting if it is on
 304 * @m: vfsmount being shut down
 305 *
 306 * If the accounting is turned on for a file in the subtree pointed to
 307 * to by m, turn accounting off.  Done when m is about to die.
 308 */
 309void acct_auto_close_mnt(struct vfsmount *m)
 310{
 311        struct bsd_acct_struct *acct;
 312
 313        spin_lock(&acct_lock);
 314restart:
 315        list_for_each_entry(acct, &acct_list, list)
 316                if (acct->file && acct->file->f_path.mnt == m) {
 317                        acct_file_reopen(acct, NULL, NULL);
 318                        goto restart;
 319                }
 320        spin_unlock(&acct_lock);
 321}
 322
 323/**
 324 * acct_auto_close - turn off a filesystem's accounting if it is on
 325 * @sb: super block for the filesystem
 326 *
 327 * If the accounting is turned on for a file in the filesystem pointed
 328 * to by sb, turn accounting off.
 329 */
 330void acct_auto_close(struct super_block *sb)
 331{
 332        struct bsd_acct_struct *acct;
 333
 334        spin_lock(&acct_lock);
 335restart:
 336        list_for_each_entry(acct, &acct_list, list)
 337                if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
 338                        acct_file_reopen(acct, NULL, NULL);
 339                        goto restart;
 340                }
 341        spin_unlock(&acct_lock);
 342}
 343
 344void acct_exit_ns(struct pid_namespace *ns)
 345{
 346        struct bsd_acct_struct *acct = ns->bacct;
 347
 348        if (acct == NULL)
 349                return;
 350
 351        del_timer_sync(&acct->timer);
 352        spin_lock(&acct_lock);
 353        if (acct->file != NULL)
 354                acct_file_reopen(acct, NULL, NULL);
 355        spin_unlock(&acct_lock);
 356
 357        kfree(acct);
 358}
 359
 360/*
 361 *  encode an unsigned long into a comp_t
 362 *
 363 *  This routine has been adopted from the encode_comp_t() function in
 364 *  the kern_acct.c file of the FreeBSD operating system. The encoding
 365 *  is a 13-bit fraction with a 3-bit (base 8) exponent.
 366 */
 367
 368#define MANTSIZE        13                      /* 13 bit mantissa. */
 369#define EXPSIZE         3                       /* Base 8 (3 bit) exponent. */
 370#define MAXFRACT        ((1 << MANTSIZE) - 1)   /* Maximum fractional value. */
 371
 372static comp_t encode_comp_t(unsigned long value)
 373{
 374        int exp, rnd;
 375
 376        exp = rnd = 0;
 377        while (value > MAXFRACT) {
 378                rnd = value & (1 << (EXPSIZE - 1));     /* Round up? */
 379                value >>= EXPSIZE;      /* Base 8 exponent == 3 bit shift. */
 380                exp++;
 381        }
 382
 383        /*
 384         * If we need to round up, do it (and handle overflow correctly).
 385         */
 386        if (rnd && (++value > MAXFRACT)) {
 387                value >>= EXPSIZE;
 388                exp++;
 389        }
 390
 391        /*
 392         * Clean it up and polish it off.
 393         */
 394        exp <<= MANTSIZE;               /* Shift the exponent into place */
 395        exp += value;                   /* and add on the mantissa. */
 396        return exp;
 397}
 398
 399#if ACCT_VERSION==1 || ACCT_VERSION==2
 400/*
 401 * encode an u64 into a comp2_t (24 bits)
 402 *
 403 * Format: 5 bit base 2 exponent, 20 bits mantissa.
 404 * The leading bit of the mantissa is not stored, but implied for
 405 * non-zero exponents.
 406 * Largest encodable value is 50 bits.
 407 */
 408
 409#define MANTSIZE2       20                      /* 20 bit mantissa. */
 410#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
 411#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
 412#define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */
 413
 414static comp2_t encode_comp2_t(u64 value)
 415{
 416        int exp, rnd;
 417
 418        exp = (value > (MAXFRACT2>>1));
 419        rnd = 0;
 420        while (value > MAXFRACT2) {
 421                rnd = value & 1;
 422                value >>= 1;
 423                exp++;
 424        }
 425
 426        /*
 427         * If we need to round up, do it (and handle overflow correctly).
 428         */
 429        if (rnd && (++value > MAXFRACT2)) {
 430                value >>= 1;
 431                exp++;
 432        }
 433
 434        if (exp > MAXEXP2) {
 435                /* Overflow. Return largest representable number instead. */
 436                return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
 437        } else {
 438                return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
 439        }
 440}
 441#endif
 442
 443#if ACCT_VERSION==3
 444/*
 445 * encode an u64 into a 32 bit IEEE float
 446 */
 447static u32 encode_float(u64 value)
 448{
 449        unsigned exp = 190;
 450        unsigned u;
 451
 452        if (value==0) return 0;
 453        while ((s64)value > 0){
 454                value <<= 1;
 455                exp--;
 456        }
 457        u = (u32)(value >> 40) & 0x7fffffu;
 458        return u | (exp << 23);
 459}
 460#endif
 461
 462/*
 463 *  Write an accounting entry for an exiting process
 464 *
 465 *  The acct_process() call is the workhorse of the process
 466 *  accounting system. The struct acct is built here and then written
 467 *  into the accounting file. This function should only be called from
 468 *  do_exit() or when switching to a different output file.
 469 */
 470
 471/*
 472 *  do_acct_process does all actual work. Caller holds the reference to file.
 473 */
 474static void do_acct_process(struct bsd_acct_struct *acct,
 475                struct pid_namespace *ns, struct file *file)
 476{
 477        struct pacct_struct *pacct = &current->signal->pacct;
 478        acct_t ac;
 479        mm_segment_t fs;
 480        unsigned long flim;
 481        u64 elapsed;
 482        u64 run_time;
 483        struct timespec uptime;
 484        struct tty_struct *tty;
 485        const struct cred *orig_cred;
 486
 487        /* Perform file operations on behalf of whoever enabled accounting */
 488        orig_cred = override_creds(file->f_cred);
 489
 490        /*
 491         * First check to see if there is enough free_space to continue
 492         * the process accounting system.
 493         */
 494        if (!check_free_space(acct, file))
 495                goto out;
 496
 497        /*
 498         * Fill the accounting struct with the needed info as recorded
 499         * by the different kernel functions.
 500         */
 501        memset((caddr_t)&ac, 0, sizeof(acct_t));
 502
 503        ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 504        strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 505
 506        /* calculate run_time in nsec*/
 507        do_posix_clock_monotonic_gettime(&uptime);
 508        run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
 509        run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
 510                       + current->group_leader->start_time.tv_nsec;
 511        /* convert nsec -> AHZ */
 512        elapsed = nsec_to_AHZ(run_time);
 513#if ACCT_VERSION==3
 514        ac.ac_etime = encode_float(elapsed);
 515#else
 516        ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
 517                               (unsigned long) elapsed : (unsigned long) -1l);
 518#endif
 519#if ACCT_VERSION==1 || ACCT_VERSION==2
 520        {
 521                /* new enlarged etime field */
 522                comp2_t etime = encode_comp2_t(elapsed);
 523                ac.ac_etime_hi = etime >> 16;
 524                ac.ac_etime_lo = (u16) etime;
 525        }
 526#endif
 527        do_div(elapsed, AHZ);
 528        ac.ac_btime = get_seconds() - elapsed;
 529        /* we really need to bite the bullet and change layout */
 530        ac.ac_uid = orig_cred->uid;
 531        ac.ac_gid = orig_cred->gid;
 532#if ACCT_VERSION==2
 533        ac.ac_ahz = AHZ;
 534#endif
 535#if ACCT_VERSION==1 || ACCT_VERSION==2
 536        /* backward-compatible 16 bit fields */
 537        ac.ac_uid16 = ac.ac_uid;
 538        ac.ac_gid16 = ac.ac_gid;
 539#endif
 540#if ACCT_VERSION==3
 541        ac.ac_pid = task_tgid_nr_ns(current, ns);
 542        rcu_read_lock();
 543        ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
 544        rcu_read_unlock();
 545#endif
 546
 547        spin_lock_irq(&current->sighand->siglock);
 548        tty = current->signal->tty;     /* Safe as we hold the siglock */
 549        ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 550        ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 551        ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 552        ac.ac_flag = pacct->ac_flag;
 553        ac.ac_mem = encode_comp_t(pacct->ac_mem);
 554        ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
 555        ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
 556        ac.ac_exitcode = pacct->ac_exitcode;
 557        spin_unlock_irq(&current->sighand->siglock);
 558        ac.ac_io = encode_comp_t(0 /* current->io_usage */);    /* %% */
 559        ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 560        ac.ac_swaps = encode_comp_t(0);
 561
 562        /*
 563         * Kernel segment override to datasegment and write it
 564         * to the accounting file.
 565         */
 566        fs = get_fs();
 567        set_fs(KERNEL_DS);
 568        /*
 569         * Accounting records are not subject to resource limits.
 570         */
 571        flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
 572        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 573        file->f_op->write(file, (char *)&ac,
 574                               sizeof(acct_t), &file->f_pos);
 575        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 576        set_fs(fs);
 577out:
 578        revert_creds(orig_cred);
 579}
 580
 581/**
 582 * acct_collect - collect accounting information into pacct_struct
 583 * @exitcode: task exit code
 584 * @group_dead: not 0, if this thread is the last one in the process.
 585 */
 586void acct_collect(long exitcode, int group_dead)
 587{
 588        struct pacct_struct *pacct = &current->signal->pacct;
 589        unsigned long vsize = 0;
 590
 591        if (group_dead && current->mm) {
 592                struct vm_area_struct *vma;
 593                down_read(&current->mm->mmap_sem);
 594                vma = current->mm->mmap;
 595                while (vma) {
 596                        vsize += vma->vm_end - vma->vm_start;
 597                        vma = vma->vm_next;
 598                }
 599                up_read(&current->mm->mmap_sem);
 600        }
 601
 602        spin_lock_irq(&current->sighand->siglock);
 603        if (group_dead)
 604                pacct->ac_mem = vsize / 1024;
 605        if (thread_group_leader(current)) {
 606                pacct->ac_exitcode = exitcode;
 607                if (current->flags & PF_FORKNOEXEC)
 608                        pacct->ac_flag |= AFORK;
 609        }
 610        if (current->flags & PF_SUPERPRIV)
 611                pacct->ac_flag |= ASU;
 612        if (current->flags & PF_DUMPCORE)
 613                pacct->ac_flag |= ACORE;
 614        if (current->flags & PF_SIGNALED)
 615                pacct->ac_flag |= AXSIG;
 616        pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
 617        pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
 618        pacct->ac_minflt += current->min_flt;
 619        pacct->ac_majflt += current->maj_flt;
 620        spin_unlock_irq(&current->sighand->siglock);
 621}
 622
 623static void acct_process_in_ns(struct pid_namespace *ns)
 624{
 625        struct file *file = NULL;
 626        struct bsd_acct_struct *acct;
 627
 628        acct = ns->bacct;
 629        /*
 630         * accelerate the common fastpath:
 631         */
 632        if (!acct || !acct->file)
 633                return;
 634
 635        spin_lock(&acct_lock);
 636        file = acct->file;
 637        if (unlikely(!file)) {
 638                spin_unlock(&acct_lock);
 639                return;
 640        }
 641        get_file(file);
 642        spin_unlock(&acct_lock);
 643
 644        do_acct_process(acct, ns, file);
 645        fput(file);
 646}
 647
 648/**
 649 * acct_process - now just a wrapper around acct_process_in_ns,
 650 * which in turn is a wrapper around do_acct_process.
 651 *
 652 * handles process accounting for an exiting task
 653 */
 654void acct_process(void)
 655{
 656        struct pid_namespace *ns;
 657
 658        /*
 659         * This loop is safe lockless, since current is still
 660         * alive and holds its namespace, which in turn holds
 661         * its parent.
 662         */
 663        for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
 664                acct_process_in_ns(ns);
 665}
 666
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.