linux/kernel/acct.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/acct.c
   3 *
   4 *  BSD Process Accounting for Linux
   5 *
   6 *  Author: Marco van Wieringen <mvw@planets.elm.net>
   7 *
   8 *  Some code based on ideas and code from:
   9 *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
  10 *
  11 *  This file implements BSD-style process accounting. Whenever any
  12 *  process exits, an accounting record of type "struct acct" is
  13 *  written to the file specified with the acct() system call. It is
  14 *  up to user-level programs to do useful things with the accounting
  15 *  log. The kernel just provides the raw accounting information.
  16 *
  17 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
  18 *
  19 *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
  20 *  the file happened to be read-only. 2) If the accounting was suspended
  21 *  due to the lack of space it happily allowed to reopen it and completely
  22 *  lost the old acct_file. 3/10/98, Al Viro.
  23 *
  24 *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
  25 *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
  26 *
  27 *  Fixed a nasty interaction with sys_umount(). If the accounting
  28 *  was suspended we failed to stop it on umount(). Messy.
  29 *  Another one: remount to readonly didn't stop accounting.
  30 *      Question: what should we do if we have CAP_SYS_ADMIN but not
  31 *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
  32 *  unless we are messing with the root. In that case we are getting a
  33 *  real mess with do_remount_sb(). 9/11/98, AV.
  34 *
  35 *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
  36 *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
  37 *  one race (and leak) in BSD implementation.
  38 *  OK, that's better. ANOTHER race and leak in BSD variant. There always
  39 *  is one more bug... 10/11/98, AV.
  40 *
  41 *      Oh, fsck... Oopsable SMP race in do_acct_process() - we must hold
  42 * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
  43 * a struct file opened for write. Fixed. 2/6/2000, AV.
  44 */
  45
  46#include <linux/mm.h>
  47#include <linux/slab.h>
  48#include <linux/acct.h>
  49#include <linux/capability.h>
  50#include <linux/file.h>
  51#include <linux/tty.h>
  52#include <linux/security.h>
  53#include <linux/vfs.h>
  54#include <linux/jiffies.h>
  55#include <linux/times.h>
  56#include <linux/syscalls.h>
  57#include <linux/mount.h>
  58#include <asm/uaccess.h>
  59#include <asm/div64.h>
  60#include <linux/blkdev.h> /* sector_div */
  61
  62/*
  63 * These constants control the amount of freespace that suspend and
  64 * resume the process accounting system, and the time delay between
  65 * each check.
  66 * Turned into sysctl-controllable parameters. AV, 12/11/98
  67 */
  68
  69int acct_parm[3] = {4, 2, 30};
  70#define RESUME          (acct_parm[0])  /* >foo% free space - resume */
  71#define SUSPEND         (acct_parm[1])  /* <foo% free space - suspend */
  72#define ACCT_TIMEOUT    (acct_parm[2])  /* foo second timeout between checks */
  73
  74/*
  75 * External references and all of the globals.
  76 */
  77static void do_acct_process(struct file *);
  78
  79/*
  80 * This structure is used so that all the data protected by lock
  81 * can be placed in the same cache line as the lock.  This primes
  82 * the cache line to have the data after getting the lock.
  83 */
  84struct acct_glbs {
  85        spinlock_t              lock;
  86        volatile int            active;
  87        volatile int            needcheck;
  88        struct file             *file;
  89        struct timer_list       timer;
  90};
  91
  92static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
  93
  94/*
  95 * Called whenever the timer says to check the free space.
  96 */
  97static void acct_timeout(unsigned long unused)
  98{
  99        acct_globals.needcheck = 1;
 100}
 101
 102/*
 103 * Check the amount of free space and suspend/resume accordingly.
 104 */
 105static int check_free_space(struct file *file)
 106{
 107        struct kstatfs sbuf;
 108        int res;
 109        int act;
 110        sector_t resume;
 111        sector_t suspend;
 112
 113        spin_lock(&acct_globals.lock);
 114        res = acct_globals.active;
 115        if (!file || !acct_globals.needcheck)
 116                goto out;
 117        spin_unlock(&acct_globals.lock);
 118
 119        /* May block */
 120        if (vfs_statfs(file->f_dentry, &sbuf))
 121                return res;
 122        suspend = sbuf.f_blocks * SUSPEND;
 123        resume = sbuf.f_blocks * RESUME;
 124
 125        sector_div(suspend, 100);
 126        sector_div(resume, 100);
 127
 128        if (sbuf.f_bavail <= suspend)
 129                act = -1;
 130        else if (sbuf.f_bavail >= resume)
 131                act = 1;
 132        else
 133                act = 0;
 134
 135        /*
 136         * If some joker switched acct_globals.file under us we'ld better be
 137         * silent and _not_ touch anything.
 138         */
 139        spin_lock(&acct_globals.lock);
 140        if (file != acct_globals.file) {
 141                if (act)
 142                        res = act>0;
 143                goto out;
 144        }
 145
 146        if (acct_globals.active) {
 147                if (act < 0) {
 148                        acct_globals.active = 0;
 149                        printk(KERN_INFO "Process accounting paused\n");
 150                }
 151        } else {
 152                if (act > 0) {
 153                        acct_globals.active = 1;
 154                        printk(KERN_INFO "Process accounting resumed\n");
 155                }
 156        }
 157
 158        del_timer(&acct_globals.timer);
 159        acct_globals.needcheck = 0;
 160        acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 161        add_timer(&acct_globals.timer);
 162        res = acct_globals.active;
 163out:
 164        spin_unlock(&acct_globals.lock);
 165        return res;
 166}
 167
 168/*
 169 * Close the old accounting file (if currently open) and then replace
 170 * it with file (if non-NULL).
 171 *
 172 * NOTE: acct_globals.lock MUST be held on entry and exit.
 173 */
 174static void acct_file_reopen(struct file *file)
 175{
 176        struct file *old_acct = NULL;
 177
 178        if (acct_globals.file) {
 179                old_acct = acct_globals.file;
 180                del_timer(&acct_globals.timer);
 181                acct_globals.active = 0;
 182                acct_globals.needcheck = 0;
 183                acct_globals.file = NULL;
 184        }
 185        if (file) {
 186                acct_globals.file = file;
 187                acct_globals.needcheck = 0;
 188                acct_globals.active = 1;
 189                /* It's been deleted if it was used before so this is safe */
 190                init_timer(&acct_globals.timer);
 191                acct_globals.timer.function = acct_timeout;
 192                acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 193                add_timer(&acct_globals.timer);
 194        }
 195        if (old_acct) {
 196                mnt_unpin(old_acct->f_vfsmnt);
 197                spin_unlock(&acct_globals.lock);
 198                do_acct_process(old_acct);
 199                filp_close(old_acct, NULL);
 200                spin_lock(&acct_globals.lock);
 201        }
 202}
 203
 204static int acct_on(char *name)
 205{
 206        struct file *file;
 207        int error;
 208
 209        /* Difference from BSD - they don't do O_APPEND */
 210        file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 211        if (IS_ERR(file))
 212                return PTR_ERR(file);
 213
 214        if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
 215                filp_close(file, NULL);
 216                return -EACCES;
 217        }
 218
 219        if (!file->f_op->write) {
 220                filp_close(file, NULL);
 221                return -EIO;
 222        }
 223
 224        error = security_acct(file);
 225        if (error) {
 226                filp_close(file, NULL);
 227                return error;
 228        }
 229
 230        spin_lock(&acct_globals.lock);
 231        mnt_pin(file->f_vfsmnt);
 232        acct_file_reopen(file);
 233        spin_unlock(&acct_globals.lock);
 234
 235        mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */
 236
 237        return 0;
 238}
 239
 240/**
 241 * sys_acct - enable/disable process accounting
 242 * @name: file name for accounting records or NULL to shutdown accounting
 243 *
 244 * Returns 0 for success or negative errno values for failure.
 245 *
 246 * sys_acct() is the only system call needed to implement process
 247 * accounting. It takes the name of the file where accounting records
 248 * should be written. If the filename is NULL, accounting will be
 249 * shutdown.
 250 */
 251asmlinkage long sys_acct(const char __user *name)
 252{
 253        int error;
 254
 255        if (!capable(CAP_SYS_PACCT))
 256                return -EPERM;
 257
 258        if (name) {
 259                char *tmp = getname(name);
 260                if (IS_ERR(tmp))
 261                        return (PTR_ERR(tmp));
 262                error = acct_on(tmp);
 263                putname(tmp);
 264        } else {
 265                error = security_acct(NULL);
 266                if (!error) {
 267                        spin_lock(&acct_globals.lock);
 268                        acct_file_reopen(NULL);
 269                        spin_unlock(&acct_globals.lock);
 270                }
 271        }
 272        return error;
 273}
 274
 275/**
 276 * acct_auto_close - turn off a filesystem's accounting if it is on
 277 * @m: vfsmount being shut down
 278 *
 279 * If the accounting is turned on for a file in the subtree pointed to
 280 * to by m, turn accounting off.  Done when m is about to die.
 281 */
 282void acct_auto_close_mnt(struct vfsmount *m)
 283{
 284        spin_lock(&acct_globals.lock);
 285        if (acct_globals.file && acct_globals.file->f_vfsmnt == m)
 286                acct_file_reopen(NULL);
 287        spin_unlock(&acct_globals.lock);
 288}
 289
 290/**
 291 * acct_auto_close - turn off a filesystem's accounting if it is on
 292 * @sb: super block for the filesystem
 293 *
 294 * If the accounting is turned on for a file in the filesystem pointed
 295 * to by sb, turn accounting off.
 296 */
 297void acct_auto_close(struct super_block *sb)
 298{
 299        spin_lock(&acct_globals.lock);
 300        if (acct_globals.file &&
 301            acct_globals.file->f_vfsmnt->mnt_sb == sb) {
 302                acct_file_reopen(NULL);
 303        }
 304        spin_unlock(&acct_globals.lock);
 305}
 306
 307/*
 308 *  encode an unsigned long into a comp_t
 309 *
 310 *  This routine has been adopted from the encode_comp_t() function in
 311 *  the kern_acct.c file of the FreeBSD operating system. The encoding
 312 *  is a 13-bit fraction with a 3-bit (base 8) exponent.
 313 */
 314
 315#define MANTSIZE        13                      /* 13 bit mantissa. */
 316#define EXPSIZE         3                       /* Base 8 (3 bit) exponent. */
 317#define MAXFRACT        ((1 << MANTSIZE) - 1)   /* Maximum fractional value. */
 318
 319static comp_t encode_comp_t(unsigned long value)
 320{
 321        int exp, rnd;
 322
 323        exp = rnd = 0;
 324        while (value > MAXFRACT) {
 325                rnd = value & (1 << (EXPSIZE - 1));     /* Round up? */
 326                value >>= EXPSIZE;      /* Base 8 exponent == 3 bit shift. */
 327                exp++;
 328        }
 329
 330        /*
 331         * If we need to round up, do it (and handle overflow correctly).
 332         */
 333        if (rnd && (++value > MAXFRACT)) {
 334                value >>= EXPSIZE;
 335                exp++;
 336        }
 337
 338        /*
 339         * Clean it up and polish it off.
 340         */
 341        exp <<= MANTSIZE;               /* Shift the exponent into place */
 342        exp += value;                   /* and add on the mantissa. */
 343        return exp;
 344}
 345
 346#if ACCT_VERSION==1 || ACCT_VERSION==2
 347/*
 348 * encode an u64 into a comp2_t (24 bits)
 349 *
 350 * Format: 5 bit base 2 exponent, 20 bits mantissa.
 351 * The leading bit of the mantissa is not stored, but implied for
 352 * non-zero exponents.
 353 * Largest encodable value is 50 bits.
 354 */
 355
 356#define MANTSIZE2       20                      /* 20 bit mantissa. */
 357#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
 358#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
 359#define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */
 360
 361static comp2_t encode_comp2_t(u64 value)
 362{
 363        int exp, rnd;
 364
 365        exp = (value > (MAXFRACT2>>1));
 366        rnd = 0;
 367        while (value > MAXFRACT2) {
 368                rnd = value & 1;
 369                value >>= 1;
 370                exp++;
 371        }
 372
 373        /*
 374         * If we need to round up, do it (and handle overflow correctly).
 375         */
 376        if (rnd && (++value > MAXFRACT2)) {
 377                value >>= 1;
 378                exp++;
 379        }
 380
 381        if (exp > MAXEXP2) {
 382                /* Overflow. Return largest representable number instead. */
 383                return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
 384        } else {
 385                return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
 386        }
 387}
 388#endif
 389
 390#if ACCT_VERSION==3
 391/*
 392 * encode an u64 into a 32 bit IEEE float
 393 */
 394static u32 encode_float(u64 value)
 395{
 396        unsigned exp = 190;
 397        unsigned u;
 398
 399        if (value==0) return 0;
 400        while ((s64)value > 0){
 401                value <<= 1;
 402                exp--;
 403        }
 404        u = (u32)(value >> 40) & 0x7fffffu;
 405        return u | (exp << 23);
 406}
 407#endif
 408
 409/*
 410 *  Write an accounting entry for an exiting process
 411 *
 412 *  The acct_process() call is the workhorse of the process
 413 *  accounting system. The struct acct is built here and then written
 414 *  into the accounting file. This function should only be called from
 415 *  do_exit().
 416 */
 417
 418/*
 419 *  do_acct_process does all actual work. Caller holds the reference to file.
 420 */
 421static void do_acct_process(struct file *file)
 422{
 423        struct pacct_struct *pacct = &current->signal->pacct;
 424        acct_t ac;
 425        mm_segment_t fs;
 426        unsigned long flim;
 427        u64 elapsed;
 428        u64 run_time;
 429        struct timespec uptime;
 430
 431        /*
 432         * First check to see if there is enough free_space to continue
 433         * the process accounting system.
 434         */
 435        if (!check_free_space(file))
 436                return;
 437
 438        /*
 439         * Fill the accounting struct with the needed info as recorded
 440         * by the different kernel functions.
 441         */
 442        memset((caddr_t)&ac, 0, sizeof(acct_t));
 443
 444        ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 445        strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 446
 447        /* calculate run_time in nsec*/
 448        do_posix_clock_monotonic_gettime(&uptime);
 449        run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
 450        run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
 451                       + current->group_leader->start_time.tv_nsec;
 452        /* convert nsec -> AHZ */
 453        elapsed = nsec_to_AHZ(run_time);
 454#if ACCT_VERSION==3
 455        ac.ac_etime = encode_float(elapsed);
 456#else
 457        ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
 458                               (unsigned long) elapsed : (unsigned long) -1l);
 459#endif
 460#if ACCT_VERSION==1 || ACCT_VERSION==2
 461        {
 462                /* new enlarged etime field */
 463                comp2_t etime = encode_comp2_t(elapsed);
 464                ac.ac_etime_hi = etime >> 16;
 465                ac.ac_etime_lo = (u16) etime;
 466        }
 467#endif
 468        do_div(elapsed, AHZ);
 469        ac.ac_btime = xtime.tv_sec - elapsed;
 470        /* we really need to bite the bullet and change layout */
 471        ac.ac_uid = current->uid;
 472        ac.ac_gid = current->gid;
 473#if ACCT_VERSION==2
 474        ac.ac_ahz = AHZ;
 475#endif
 476#if ACCT_VERSION==1 || ACCT_VERSION==2
 477        /* backward-compatible 16 bit fields */
 478        ac.ac_uid16 = current->uid;
 479        ac.ac_gid16 = current->gid;
 480#endif
 481#if ACCT_VERSION==3
 482        ac.ac_pid = current->tgid;
 483        ac.ac_ppid = current->parent->tgid;
 484#endif
 485
 486        mutex_lock(&tty_mutex);
 487        /* FIXME: Whoever is responsible for current->signal locking needs
 488           to use the same locking all over the kernel and document it */
 489        read_lock(&tasklist_lock);
 490        ac.ac_tty = current->signal->tty ?
 491                old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 492        read_unlock(&tasklist_lock);
 493        mutex_unlock(&tty_mutex);
 494
 495        spin_lock_irq(&current->sighand->siglock);
 496        ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 497        ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 498        ac.ac_flag = pacct->ac_flag;
 499        ac.ac_mem = encode_comp_t(pacct->ac_mem);
 500        ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
 501        ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
 502        ac.ac_exitcode = pacct->ac_exitcode;
 503        spin_unlock_irq(&current->sighand->siglock);
 504        ac.ac_io = encode_comp_t(0 /* current->io_usage */);    /* %% */
 505        ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 506        ac.ac_swaps = encode_comp_t(0);
 507
 508        /*
 509         * Kernel segment override to datasegment and write it
 510         * to the accounting file.
 511         */
 512        fs = get_fs();
 513        set_fs(KERNEL_DS);
 514        /*
 515         * Accounting records are not subject to resource limits.
 516         */
 517        flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
 518        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 519        file->f_op->write(file, (char *)&ac,
 520                               sizeof(acct_t), &file->f_pos);
 521        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 522        set_fs(fs);
 523}
 524
 525/**
 526 * acct_init_pacct - initialize a new pacct_struct
 527 * @pacct: per-process accounting info struct to initialize
 528 */
 529void acct_init_pacct(struct pacct_struct *pacct)
 530{
 531        memset(pacct, 0, sizeof(struct pacct_struct));
 532        pacct->ac_utime = pacct->ac_stime = cputime_zero;
 533}
 534
 535/**
 536 * acct_collect - collect accounting information into pacct_struct
 537 * @exitcode: task exit code
 538 * @group_dead: not 0, if this thread is the last one in the process.
 539 */
 540void acct_collect(long exitcode, int group_dead)
 541{
 542        struct pacct_struct *pacct = &current->signal->pacct;
 543        unsigned long vsize = 0;
 544
 545        if (group_dead && current->mm) {
 546                struct vm_area_struct *vma;
 547                down_read(&current->mm->mmap_sem);
 548                vma = current->mm->mmap;
 549                while (vma) {
 550                        vsize += vma->vm_end - vma->vm_start;
 551                        vma = vma->vm_next;
 552                }
 553                up_read(&current->mm->mmap_sem);
 554        }
 555
 556        spin_lock_irq(&current->sighand->siglock);
 557        if (group_dead)
 558                pacct->ac_mem = vsize / 1024;
 559        if (thread_group_leader(current)) {
 560                pacct->ac_exitcode = exitcode;
 561                if (current->flags & PF_FORKNOEXEC)
 562                        pacct->ac_flag |= AFORK;
 563        }
 564        if (current->flags & PF_SUPERPRIV)
 565                pacct->ac_flag |= ASU;
 566        if (current->flags & PF_DUMPCORE)
 567                pacct->ac_flag |= ACORE;
 568        if (current->flags & PF_SIGNALED)
 569                pacct->ac_flag |= AXSIG;
 570        pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
 571        pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
 572        pacct->ac_minflt += current->min_flt;
 573        pacct->ac_majflt += current->maj_flt;
 574        spin_unlock_irq(&current->sighand->siglock);
 575}
 576
 577/**
 578 * acct_process - now just a wrapper around do_acct_process
 579 * @exitcode: task exit code
 580 *
 581 * handles process accounting for an exiting task
 582 */
 583void acct_process(void)
 584{
 585        struct file *file = NULL;
 586
 587        /*
 588         * accelerate the common fastpath:
 589         */
 590        if (!acct_globals.file)
 591                return;
 592
 593        spin_lock(&acct_globals.lock);
 594        file = acct_globals.file;
 595        if (unlikely(!file)) {
 596                spin_unlock(&acct_globals.lock);
 597                return;
 598        }
 599        get_file(file);
 600        spin_unlock(&acct_globals.lock);
 601
 602        do_acct_process(file);
 603        fput(file);
 604}
 605