linux/kernel/acct.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/acct.c
   3 *
   4 *  BSD Process Accounting for Linux
   5 *
   6 *  Author: Marco van Wieringen <mvw@planets.elm.net>
   7 *
   8 *  Some code based on ideas and code from:
   9 *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
  10 *
  11 *  This file implements BSD-style process accounting. Whenever any
  12 *  process exits, an accounting record of type "struct acct" is
  13 *  written to the file specified with the acct() system call. It is
  14 *  up to user-level programs to do useful things with the accounting
  15 *  log. The kernel just provides the raw accounting information.
  16 *
  17 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
  18 *
  19 *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
  20 *  the file happened to be read-only. 2) If the accounting was suspended
  21 *  due to the lack of space it happily allowed to reopen it and completely
  22 *  lost the old acct_file. 3/10/98, Al Viro.
  23 *
  24 *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
  25 *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
  26 *
  27 *  Fixed a nasty interaction with sys_umount(). If the accounting
  28 *  was suspended we failed to stop it on umount(). Messy.
  29 *  Another one: remount to readonly didn't stop accounting.
  30 *      Question: what should we do if we have CAP_SYS_ADMIN but not
  31 *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
  32 *  unless we are messing with the root. In that case we are getting a
  33 *  real mess with do_remount_sb(). 9/11/98, AV.
  34 *
  35 *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
  36 *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
  37 *  one race (and leak) in BSD implementation.
  38 *  OK, that's better. ANOTHER race and leak in BSD variant. There always
  39 *  is one more bug... 10/11/98, AV.
  40 *
  41 *      Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
  42 * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
  43 * a struct file opened for write. Fixed. 2/6/2000, AV.
  44 */
  45
  46#include <linux/mm.h>
  47#include <linux/slab.h>
  48#include <linux/acct.h>
  49#include <linux/capability.h>
  50#include <linux/file.h>
  51#include <linux/tty.h>
  52#include <linux/security.h>
  53#include <linux/vfs.h>
  54#include <linux/jiffies.h>
  55#include <linux/times.h>
  56#include <linux/syscalls.h>
  57#include <linux/mount.h>
  58#include <asm/uaccess.h>
  59#include <asm/div64.h>
  60#include <linux/blkdev.h> /* sector_div */
  61
  62/*
  63 * These constants control the amount of freespace that suspend and
  64 * resume the process accounting system, and the time delay between
  65 * each check.
  66 * Turned into sysctl-controllable parameters. AV, 12/11/98
  67 */
  68
  69int acct_parm[3] = {4, 2, 30};
  70#define RESUME          (acct_parm[0])  /* >foo% free space - resume */
  71#define SUSPEND         (acct_parm[1])  /* <foo% free space - suspend */
  72#define ACCT_TIMEOUT    (acct_parm[2])  /* foo second timeout between checks */
  73
  74/*
  75 * External references and all of the globals.
  76 */
  77static void do_acct_process(struct file *);
  78
  79/*
  80 * This structure is used so that all the data protected by lock
  81 * can be placed in the same cache line as the lock.  This primes
  82 * the cache line to have the data after getting the lock.
  83 */
  84struct acct_glbs {
  85        spinlock_t              lock;
  86        volatile int            active;
  87        volatile int            needcheck;
  88        struct file             *file;
  89        struct timer_list       timer;
  90};
  91
  92static struct acct_glbs acct_globals __cacheline_aligned =
  93        {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
  94
  95/*
  96 * Called whenever the timer says to check the free space.
  97 */
  98static void acct_timeout(unsigned long unused)
  99{
 100        acct_globals.needcheck = 1;
 101}
 102
 103/*
 104 * Check the amount of free space and suspend/resume accordingly.
 105 */
 106static int check_free_space(struct file *file)
 107{
 108        struct kstatfs sbuf;
 109        int res;
 110        int act;
 111        sector_t resume;
 112        sector_t suspend;
 113
 114        spin_lock(&acct_globals.lock);
 115        res = acct_globals.active;
 116        if (!file || !acct_globals.needcheck)
 117                goto out;
 118        spin_unlock(&acct_globals.lock);
 119
 120        /* May block */
 121        if (vfs_statfs(file->f_path.dentry, &sbuf))
 122                return res;
 123        suspend = sbuf.f_blocks * SUSPEND;
 124        resume = sbuf.f_blocks * RESUME;
 125
 126        sector_div(suspend, 100);
 127        sector_div(resume, 100);
 128
 129        if (sbuf.f_bavail <= suspend)
 130                act = -1;
 131        else if (sbuf.f_bavail >= resume)
 132                act = 1;
 133        else
 134                act = 0;
 135
 136        /*
 137         * If some joker switched acct_globals.file under us we'ld better be
 138         * silent and _not_ touch anything.
 139         */
 140        spin_lock(&acct_globals.lock);
 141        if (file != acct_globals.file) {
 142                if (act)
 143                        res = act>0;
 144                goto out;
 145        }
 146
 147        if (acct_globals.active) {
 148                if (act < 0) {
 149                        acct_globals.active = 0;
 150                        printk(KERN_INFO "Process accounting paused\n");
 151                }
 152        } else {
 153                if (act > 0) {
 154                        acct_globals.active = 1;
 155                        printk(KERN_INFO "Process accounting resumed\n");
 156                }
 157        }
 158
 159        del_timer(&acct_globals.timer);
 160        acct_globals.needcheck = 0;
 161        acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 162        add_timer(&acct_globals.timer);
 163        res = acct_globals.active;
 164out:
 165        spin_unlock(&acct_globals.lock);
 166        return res;
 167}
 168
 169/*
 170 * Close the old accounting file (if currently open) and then replace
 171 * it with file (if non-NULL).
 172 *
 173 * NOTE: acct_globals.lock MUST be held on entry and exit.
 174 */
 175static void acct_file_reopen(struct file *file)
 176{
 177        struct file *old_acct = NULL;
 178
 179        if (acct_globals.file) {
 180                old_acct = acct_globals.file;
 181                del_timer(&acct_globals.timer);
 182                acct_globals.active = 0;
 183                acct_globals.needcheck = 0;
 184                acct_globals.file = NULL;
 185        }
 186        if (file) {
 187                acct_globals.file = file;
 188                acct_globals.needcheck = 0;
 189                acct_globals.active = 1;
 190                /* It's been deleted if it was used before so this is safe */
 191                init_timer(&acct_globals.timer);
 192                acct_globals.timer.function = acct_timeout;
 193                acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 194                add_timer(&acct_globals.timer);
 195        }
 196        if (old_acct) {
 197                mnt_unpin(old_acct->f_path.mnt);
 198                spin_unlock(&acct_globals.lock);
 199                do_acct_process(old_acct);
 200                filp_close(old_acct, NULL);
 201                spin_lock(&acct_globals.lock);
 202        }
 203}
 204
 205static int acct_on(char *name)
 206{
 207        struct file *file;
 208        int error;
 209
 210        /* Difference from BSD - they don't do O_APPEND */
 211        file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 212        if (IS_ERR(file))
 213                return PTR_ERR(file);
 214
 215        if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
 216                filp_close(file, NULL);
 217                return -EACCES;
 218        }
 219
 220        if (!file->f_op->write) {
 221                filp_close(file, NULL);
 222                return -EIO;
 223        }
 224
 225        error = security_acct(file);
 226        if (error) {
 227                filp_close(file, NULL);
 228                return error;
 229        }
 230
 231        spin_lock(&acct_globals.lock);
 232        mnt_pin(file->f_path.mnt);
 233        acct_file_reopen(file);
 234        spin_unlock(&acct_globals.lock);
 235
 236        mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
 237
 238        return 0;
 239}
 240
 241/**
 242 * sys_acct - enable/disable process accounting
 243 * @name: file name for accounting records or NULL to shutdown accounting
 244 *
 245 * Returns 0 for success or negative errno values for failure.
 246 *
 247 * sys_acct() is the only system call needed to implement process
 248 * accounting. It takes the name of the file where accounting records
 249 * should be written. If the filename is NULL, accounting will be
 250 * shutdown.
 251 */
 252asmlinkage long sys_acct(const char __user *name)
 253{
 254        int error;
 255
 256        if (!capable(CAP_SYS_PACCT))
 257                return -EPERM;
 258
 259        if (name) {
 260                char *tmp = getname(name);
 261                if (IS_ERR(tmp))
 262                        return (PTR_ERR(tmp));
 263                error = acct_on(tmp);
 264                putname(tmp);
 265        } else {
 266                error = security_acct(NULL);
 267                if (!error) {
 268                        spin_lock(&acct_globals.lock);
 269                        acct_file_reopen(NULL);
 270                        spin_unlock(&acct_globals.lock);
 271                }
 272        }
 273        return error;
 274}
 275
 276/**
 277 * acct_auto_close - turn off a filesystem's accounting if it is on
 278 * @m: vfsmount being shut down
 279 *
 280 * If the accounting is turned on for a file in the subtree pointed to
 281 * to by m, turn accounting off.  Done when m is about to die.
 282 */
 283void acct_auto_close_mnt(struct vfsmount *m)
 284{
 285        spin_lock(&acct_globals.lock);
 286        if (acct_globals.file && acct_globals.file->f_path.mnt == m)
 287                acct_file_reopen(NULL);
 288        spin_unlock(&acct_globals.lock);
 289}
 290
 291/**
 292 * acct_auto_close - turn off a filesystem's accounting if it is on
 293 * @sb: super block for the filesystem
 294 *
 295 * If the accounting is turned on for a file in the filesystem pointed
 296 * to by sb, turn accounting off.
 297 */
 298void acct_auto_close(struct super_block *sb)
 299{
 300        spin_lock(&acct_globals.lock);
 301        if (acct_globals.file &&
 302            acct_globals.file->f_path.mnt->mnt_sb == sb) {
 303                acct_file_reopen(NULL);
 304        }
 305        spin_unlock(&acct_globals.lock);
 306}
 307
 308/*
 309 *  encode an unsigned long into a comp_t
 310 *
 311 *  This routine has been adopted from the encode_comp_t() function in
 312 *  the kern_acct.c file of the FreeBSD operating system. The encoding
 313 *  is a 13-bit fraction with a 3-bit (base 8) exponent.
 314 */
 315
 316#define MANTSIZE        13                      /* 13 bit mantissa. */
 317#define EXPSIZE         3                       /* Base 8 (3 bit) exponent. */
 318#define MAXFRACT        ((1 << MANTSIZE) - 1)   /* Maximum fractional value. */
 319
 320static comp_t encode_comp_t(unsigned long value)
 321{
 322        int exp, rnd;
 323
 324        exp = rnd = 0;
 325        while (value > MAXFRACT) {
 326                rnd = value & (1 << (EXPSIZE - 1));     /* Round up? */
 327                value >>= EXPSIZE;      /* Base 8 exponent == 3 bit shift. */
 328                exp++;
 329        }
 330
 331        /*
 332         * If we need to round up, do it (and handle overflow correctly).
 333         */
 334        if (rnd && (++value > MAXFRACT)) {
 335                value >>= EXPSIZE;
 336                exp++;
 337        }
 338
 339        /*
 340         * Clean it up and polish it off.
 341         */
 342        exp <<= MANTSIZE;               /* Shift the exponent into place */
 343        exp += value;                   /* and add on the mantissa. */
 344        return exp;
 345}
 346
 347#if ACCT_VERSION==1 || ACCT_VERSION==2
 348/*
 349 * encode an u64 into a comp2_t (24 bits)
 350 *
 351 * Format: 5 bit base 2 exponent, 20 bits mantissa.
 352 * The leading bit of the mantissa is not stored, but implied for
 353 * non-zero exponents.
 354 * Largest encodable value is 50 bits.
 355 */
 356
 357#define MANTSIZE2       20                      /* 20 bit mantissa. */
 358#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
 359#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
 360#define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */
 361
 362static comp2_t encode_comp2_t(u64 value)
 363{
 364        int exp, rnd;
 365
 366        exp = (value > (MAXFRACT2>>1));
 367        rnd = 0;
 368        while (value > MAXFRACT2) {
 369                rnd = value & 1;
 370                value >>= 1;
 371                exp++;
 372        }
 373
 374        /*
 375         * If we need to round up, do it (and handle overflow correctly).
 376         */
 377        if (rnd && (++value > MAXFRACT2)) {
 378                value >>= 1;
 379                exp++;
 380        }
 381
 382        if (exp > MAXEXP2) {
 383                /* Overflow. Return largest representable number instead. */
 384                return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
 385        } else {
 386                return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
 387        }
 388}
 389#endif
 390
 391#if ACCT_VERSION==3
 392/*
 393 * encode an u64 into a 32 bit IEEE float
 394 */
 395static u32 encode_float(u64 value)
 396{
 397        unsigned exp = 190;
 398        unsigned u;
 399
 400        if (value==0) return 0;
 401        while ((s64)value > 0){
 402                value <<= 1;
 403                exp--;
 404        }
 405        u = (u32)(value >> 40) & 0x7fffffu;
 406        return u | (exp << 23);
 407}
 408#endif
 409
 410/*
 411 *  Write an accounting entry for an exiting process
 412 *
 413 *  The acct_process() call is the workhorse of the process
 414 *  accounting system. The struct acct is built here and then written
 415 *  into the accounting file. This function should only be called from
 416 *  do_exit() or when switching to a different output file.
 417 */
 418
 419/*
 420 *  do_acct_process does all actual work. Caller holds the reference to file.
 421 */
 422static void do_acct_process(struct file *file)
 423{
 424        struct pacct_struct *pacct = &current->signal->pacct;
 425        acct_t ac;
 426        mm_segment_t fs;
 427        unsigned long flim;
 428        u64 elapsed;
 429        u64 run_time;
 430        struct timespec uptime;
 431        struct tty_struct *tty;
 432
 433        /*
 434         * First check to see if there is enough free_space to continue
 435         * the process accounting system.
 436         */
 437        if (!check_free_space(file))
 438                return;
 439
 440        /*
 441         * Fill the accounting struct with the needed info as recorded
 442         * by the different kernel functions.
 443         */
 444        memset((caddr_t)&ac, 0, sizeof(acct_t));
 445
 446        ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 447        strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 448
 449        /* calculate run_time in nsec*/
 450        do_posix_clock_monotonic_gettime(&uptime);
 451        run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
 452        run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
 453                       + current->group_leader->start_time.tv_nsec;
 454        /* convert nsec -> AHZ */
 455        elapsed = nsec_to_AHZ(run_time);
 456#if ACCT_VERSION==3
 457        ac.ac_etime = encode_float(elapsed);
 458#else
 459        ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
 460                               (unsigned long) elapsed : (unsigned long) -1l);
 461#endif
 462#if ACCT_VERSION==1 || ACCT_VERSION==2
 463        {
 464                /* new enlarged etime field */
 465                comp2_t etime = encode_comp2_t(elapsed);
 466                ac.ac_etime_hi = etime >> 16;
 467                ac.ac_etime_lo = (u16) etime;
 468        }
 469#endif
 470        do_div(elapsed, AHZ);
 471        ac.ac_btime = get_seconds() - elapsed;
 472        /* we really need to bite the bullet and change layout */
 473        ac.ac_uid = current->uid;
 474        ac.ac_gid = current->gid;
 475#if ACCT_VERSION==2
 476        ac.ac_ahz = AHZ;
 477#endif
 478#if ACCT_VERSION==1 || ACCT_VERSION==2
 479        /* backward-compatible 16 bit fields */
 480        ac.ac_uid16 = current->uid;
 481        ac.ac_gid16 = current->gid;
 482#endif
 483#if ACCT_VERSION==3
 484        ac.ac_pid = current->tgid;
 485        ac.ac_ppid = current->real_parent->tgid;
 486#endif
 487
 488        spin_lock_irq(&current->sighand->siglock);
 489        tty = current->signal->tty;
 490        ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 491        ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 492        ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 493        ac.ac_flag = pacct->ac_flag;
 494        ac.ac_mem = encode_comp_t(pacct->ac_mem);
 495        ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
 496        ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
 497        ac.ac_exitcode = pacct->ac_exitcode;
 498        spin_unlock_irq(&current->sighand->siglock);
 499        ac.ac_io = encode_comp_t(0 /* current->io_usage */);    /* %% */
 500        ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 501        ac.ac_swaps = encode_comp_t(0);
 502
 503        /*
 504         * Kernel segment override to datasegment and write it
 505         * to the accounting file.
 506         */
 507        fs = get_fs();
 508        set_fs(KERNEL_DS);
 509        /*
 510         * Accounting records are not subject to resource limits.
 511         */
 512        flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
 513        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 514        file->f_op->write(file, (char *)&ac,
 515                               sizeof(acct_t), &file->f_pos);
 516        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 517        set_fs(fs);
 518}
 519
 520/**
 521 * acct_init_pacct - initialize a new pacct_struct
 522 * @pacct: per-process accounting info struct to initialize
 523 */
 524void acct_init_pacct(struct pacct_struct *pacct)
 525{
 526        memset(pacct, 0, sizeof(struct pacct_struct));
 527        pacct->ac_utime = pacct->ac_stime = cputime_zero;
 528}
 529
 530/**
 531 * acct_collect - collect accounting information into pacct_struct
 532 * @exitcode: task exit code
 533 * @group_dead: not 0, if this thread is the last one in the process.
 534 */
 535void acct_collect(long exitcode, int group_dead)
 536{
 537        struct pacct_struct *pacct = &current->signal->pacct;
 538        unsigned long vsize = 0;
 539
 540        if (group_dead && current->mm) {
 541                struct vm_area_struct *vma;
 542                down_read(&current->mm->mmap_sem);
 543                vma = current->mm->mmap;
 544                while (vma) {
 545                        vsize += vma->vm_end - vma->vm_start;
 546                        vma = vma->vm_next;
 547                }
 548                up_read(&current->mm->mmap_sem);
 549        }
 550
 551        spin_lock_irq(&current->sighand->siglock);
 552        if (group_dead)
 553                pacct->ac_mem = vsize / 1024;
 554        if (thread_group_leader(current)) {
 555                pacct->ac_exitcode = exitcode;
 556                if (current->flags & PF_FORKNOEXEC)
 557                        pacct->ac_flag |= AFORK;
 558        }
 559        if (current->flags & PF_SUPERPRIV)
 560                pacct->ac_flag |= ASU;
 561        if (current->flags & PF_DUMPCORE)
 562                pacct->ac_flag |= ACORE;
 563        if (current->flags & PF_SIGNALED)
 564                pacct->ac_flag |= AXSIG;
 565        pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
 566        pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
 567        pacct->ac_minflt += current->min_flt;
 568        pacct->ac_majflt += current->maj_flt;
 569        spin_unlock_irq(&current->sighand->siglock);
 570}
 571
 572/**
 573 * acct_process - now just a wrapper around do_acct_process
 574 * @exitcode: task exit code
 575 *
 576 * handles process accounting for an exiting task
 577 */
 578void acct_process(void)
 579{
 580        struct file *file = NULL;
 581
 582        /*
 583         * accelerate the common fastpath:
 584         */
 585        if (!acct_globals.file)
 586                return;
 587
 588        spin_lock(&acct_globals.lock);
 589        file = acct_globals.file;
 590        if (unlikely(!file)) {
 591                spin_unlock(&acct_globals.lock);
 592                return;
 593        }
 594        get_file(file);
 595        spin_unlock(&acct_globals.lock);
 596
 597        do_acct_process(file);
 598        fput(file);
 599}
 600
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.