linux/kernel/acct.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/acct.c
   3 *
   4 *  BSD Process Accounting for Linux
   5 *
   6 *  Author: Marco van Wieringen <mvw@planets.elm.net>
   7 *
   8 *  Some code based on ideas and code from:
   9 *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
  10 *
  11 *  This file implements BSD-style process accounting. Whenever any
  12 *  process exits, an accounting record of type "struct acct" is
  13 *  written to the file specified with the acct() system call. It is
  14 *  up to user-level programs to do useful things with the accounting
  15 *  log. The kernel just provides the raw accounting information.
  16 *
  17 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
  18 *
  19 *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
  20 *  the file happened to be read-only. 2) If the accounting was suspended
  21 *  due to the lack of space it happily allowed to reopen it and completely
  22 *  lost the old acct_file. 3/10/98, Al Viro.
  23 *
  24 *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
  25 *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
  26 *
   27 *  Fixed a nasty interaction with sys_umount(). If the accounting
   28 *  was suspended we failed to stop it on umount(). Messy.
  29 *  Another one: remount to readonly didn't stop accounting.
  30 *      Question: what should we do if we have CAP_SYS_ADMIN but not
  31 *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
  32 *  unless we are messing with the root. In that case we are getting a
  33 *  real mess with do_remount_sb(). 9/11/98, AV.
  34 *
  35 *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
  36 *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
  37 *  one race (and leak) in BSD implementation.
  38 *  OK, that's better. ANOTHER race and leak in BSD variant. There always
  39 *  is one more bug... 10/11/98, AV.
  40 *
  41 *      Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
  42 * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
  43 * a struct file opened for write. Fixed. 2/6/2000, AV.
  44 */
  45
  46#include <linux/mm.h>
  47#include <linux/slab.h>
  48#include <linux/acct.h>
  49#include <linux/capability.h>
  50#include <linux/file.h>
  51#include <linux/tty.h>
  52#include <linux/security.h>
  53#include <linux/vfs.h>
  54#include <linux/jiffies.h>
  55#include <linux/times.h>
  56#include <linux/syscalls.h>
  57#include <linux/mount.h>
  58#include <asm/uaccess.h>
  59#include <asm/div64.h>
  60#include <linux/blkdev.h> /* sector_div */
  61#include <linux/pid_namespace.h>
  62
  63/*
  64 * These constants control the amount of freespace that suspend and
  65 * resume the process accounting system, and the time delay between
  66 * each check.
  67 * Turned into sysctl-controllable parameters. AV, 12/11/98
  68 */
  69
  70int acct_parm[3] = {4, 2, 30};
  71#define RESUME          (acct_parm[0])  /* >foo% free space - resume */
  72#define SUSPEND         (acct_parm[1])  /* <foo% free space - suspend */
  73#define ACCT_TIMEOUT    (acct_parm[2])  /* foo second timeout between checks */
  74
  75/*
  76 * External references and all of the globals.
  77 */
  78static void do_acct_process(struct pid_namespace *ns, struct file *);
  79
  80/*
  81 * This structure is used so that all the data protected by lock
  82 * can be placed in the same cache line as the lock.  This primes
  83 * the cache line to have the data after getting the lock.
  84 */
  85struct acct_glbs {
  86        spinlock_t              lock;
  87        volatile int            active;
  88        volatile int            needcheck;
  89        struct file             *file;
  90        struct pid_namespace    *ns;
  91        struct timer_list       timer;
  92};
  93
  94static struct acct_glbs acct_globals __cacheline_aligned =
  95        {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
  96
  97/*
  98 * Called whenever the timer says to check the free space.
  99 */
 100static void acct_timeout(unsigned long unused)
 101{
 102        acct_globals.needcheck = 1;
 103}
 104
 105/*
 106 * Check the amount of free space and suspend/resume accordingly.
 107 */
 108static int check_free_space(struct file *file)
 109{
 110        struct kstatfs sbuf;
 111        int res;
 112        int act;
 113        sector_t resume;
 114        sector_t suspend;
 115
 116        spin_lock(&acct_globals.lock);
 117        res = acct_globals.active;
 118        if (!file || !acct_globals.needcheck)
 119                goto out;
 120        spin_unlock(&acct_globals.lock);
 121
 122        /* May block */
 123        if (vfs_statfs(file->f_path.dentry, &sbuf))
 124                return res;
 125        suspend = sbuf.f_blocks * SUSPEND;
 126        resume = sbuf.f_blocks * RESUME;
 127
 128        sector_div(suspend, 100);
 129        sector_div(resume, 100);
 130
 131        if (sbuf.f_bavail <= suspend)
 132                act = -1;
 133        else if (sbuf.f_bavail >= resume)
 134                act = 1;
 135        else
 136                act = 0;
 137
 138        /*
 139         * If some joker switched acct_globals.file under us we'ld better be
 140         * silent and _not_ touch anything.
 141         */
 142        spin_lock(&acct_globals.lock);
 143        if (file != acct_globals.file) {
 144                if (act)
 145                        res = act>0;
 146                goto out;
 147        }
 148
 149        if (acct_globals.active) {
 150                if (act < 0) {
 151                        acct_globals.active = 0;
 152                        printk(KERN_INFO "Process accounting paused\n");
 153                }
 154        } else {
 155                if (act > 0) {
 156                        acct_globals.active = 1;
 157                        printk(KERN_INFO "Process accounting resumed\n");
 158                }
 159        }
 160
 161        del_timer(&acct_globals.timer);
 162        acct_globals.needcheck = 0;
 163        acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 164        add_timer(&acct_globals.timer);
 165        res = acct_globals.active;
 166out:
 167        spin_unlock(&acct_globals.lock);
 168        return res;
 169}
 170
 171/*
 172 * Close the old accounting file (if currently open) and then replace
 173 * it with file (if non-NULL).
 174 *
 175 * NOTE: acct_globals.lock MUST be held on entry and exit.
 176 */
 177static void acct_file_reopen(struct file *file)
 178{
 179        struct file *old_acct = NULL;
 180        struct pid_namespace *old_ns = NULL;
 181
 182        if (acct_globals.file) {
 183                old_acct = acct_globals.file;
 184                old_ns = acct_globals.ns;
 185                del_timer(&acct_globals.timer);
 186                acct_globals.active = 0;
 187                acct_globals.needcheck = 0;
 188                acct_globals.file = NULL;
 189        }
 190        if (file) {
 191                acct_globals.file = file;
 192                acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
 193                acct_globals.needcheck = 0;
 194                acct_globals.active = 1;
 195                /* It's been deleted if it was used before so this is safe */
 196                init_timer(&acct_globals.timer);
 197                acct_globals.timer.function = acct_timeout;
 198                acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 199                add_timer(&acct_globals.timer);
 200        }
 201        if (old_acct) {
 202                mnt_unpin(old_acct->f_path.mnt);
 203                spin_unlock(&acct_globals.lock);
 204                do_acct_process(old_ns, old_acct);
 205                filp_close(old_acct, NULL);
 206                put_pid_ns(old_ns);
 207                spin_lock(&acct_globals.lock);
 208        }
 209}
 210
 211static int acct_on(char *name)
 212{
 213        struct file *file;
 214        int error;
 215
 216        /* Difference from BSD - they don't do O_APPEND */
 217        file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 218        if (IS_ERR(file))
 219                return PTR_ERR(file);
 220
 221        if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
 222                filp_close(file, NULL);
 223                return -EACCES;
 224        }
 225
 226        if (!file->f_op->write) {
 227                filp_close(file, NULL);
 228                return -EIO;
 229        }
 230
 231        error = security_acct(file);
 232        if (error) {
 233                filp_close(file, NULL);
 234                return error;
 235        }
 236
 237        spin_lock(&acct_globals.lock);
 238        mnt_pin(file->f_path.mnt);
 239        acct_file_reopen(file);
 240        spin_unlock(&acct_globals.lock);
 241
 242        mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
 243
 244        return 0;
 245}
 246
 247/**
 248 * sys_acct - enable/disable process accounting
 249 * @name: file name for accounting records or NULL to shutdown accounting
 250 *
 251 * Returns 0 for success or negative errno values for failure.
 252 *
 253 * sys_acct() is the only system call needed to implement process
 254 * accounting. It takes the name of the file where accounting records
 255 * should be written. If the filename is NULL, accounting will be
 256 * shutdown.
 257 */
 258asmlinkage long sys_acct(const char __user *name)
 259{
 260        int error;
 261
 262        if (!capable(CAP_SYS_PACCT))
 263                return -EPERM;
 264
 265        if (name) {
 266                char *tmp = getname(name);
 267                if (IS_ERR(tmp))
 268                        return (PTR_ERR(tmp));
 269                error = acct_on(tmp);
 270                putname(tmp);
 271        } else {
 272                error = security_acct(NULL);
 273                if (!error) {
 274                        spin_lock(&acct_globals.lock);
 275                        acct_file_reopen(NULL);
 276                        spin_unlock(&acct_globals.lock);
 277                }
 278        }
 279        return error;
 280}
 281
 282/**
 283 * acct_auto_close - turn off a filesystem's accounting if it is on
 284 * @m: vfsmount being shut down
 285 *
 286 * If the accounting is turned on for a file in the subtree pointed to
 287 * to by m, turn accounting off.  Done when m is about to die.
 288 */
 289void acct_auto_close_mnt(struct vfsmount *m)
 290{
 291        spin_lock(&acct_globals.lock);
 292        if (acct_globals.file && acct_globals.file->f_path.mnt == m)
 293                acct_file_reopen(NULL);
 294        spin_unlock(&acct_globals.lock);
 295}
 296
 297/**
 298 * acct_auto_close - turn off a filesystem's accounting if it is on
 299 * @sb: super block for the filesystem
 300 *
 301 * If the accounting is turned on for a file in the filesystem pointed
 302 * to by sb, turn accounting off.
 303 */
 304void acct_auto_close(struct super_block *sb)
 305{
 306        spin_lock(&acct_globals.lock);
 307        if (acct_globals.file &&
 308            acct_globals.file->f_path.mnt->mnt_sb == sb) {
 309                acct_file_reopen(NULL);
 310        }
 311        spin_unlock(&acct_globals.lock);
 312}
 313
 314/*
 315 *  encode an unsigned long into a comp_t
 316 *
 317 *  This routine has been adopted from the encode_comp_t() function in
 318 *  the kern_acct.c file of the FreeBSD operating system. The encoding
 319 *  is a 13-bit fraction with a 3-bit (base 8) exponent.
 320 */
 321
 322#define MANTSIZE        13                      /* 13 bit mantissa. */
 323#define EXPSIZE         3                       /* Base 8 (3 bit) exponent. */
 324#define MAXFRACT        ((1 << MANTSIZE) - 1)   /* Maximum fractional value. */
 325
 326static comp_t encode_comp_t(unsigned long value)
 327{
 328        int exp, rnd;
 329
 330        exp = rnd = 0;
 331        while (value > MAXFRACT) {
 332                rnd = value & (1 << (EXPSIZE - 1));     /* Round up? */
 333                value >>= EXPSIZE;      /* Base 8 exponent == 3 bit shift. */
 334                exp++;
 335        }
 336
 337        /*
 338         * If we need to round up, do it (and handle overflow correctly).
 339         */
 340        if (rnd && (++value > MAXFRACT)) {
 341                value >>= EXPSIZE;
 342                exp++;
 343        }
 344
 345        /*
 346         * Clean it up and polish it off.
 347         */
 348        exp <<= MANTSIZE;               /* Shift the exponent into place */
 349        exp += value;                   /* and add on the mantissa. */
 350        return exp;
 351}
 352
 353#if ACCT_VERSION==1 || ACCT_VERSION==2
 354/*
 355 * encode an u64 into a comp2_t (24 bits)
 356 *
 357 * Format: 5 bit base 2 exponent, 20 bits mantissa.
 358 * The leading bit of the mantissa is not stored, but implied for
 359 * non-zero exponents.
 360 * Largest encodable value is 50 bits.
 361 */
 362
 363#define MANTSIZE2       20                      /* 20 bit mantissa. */
 364#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
 365#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
 366#define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */
 367
 368static comp2_t encode_comp2_t(u64 value)
 369{
 370        int exp, rnd;
 371
 372        exp = (value > (MAXFRACT2>>1));
 373        rnd = 0;
 374        while (value > MAXFRACT2) {
 375                rnd = value & 1;
 376                value >>= 1;
 377                exp++;
 378        }
 379
 380        /*
 381         * If we need to round up, do it (and handle overflow correctly).
 382         */
 383        if (rnd && (++value > MAXFRACT2)) {
 384                value >>= 1;
 385                exp++;
 386        }
 387
 388        if (exp > MAXEXP2) {
 389                /* Overflow. Return largest representable number instead. */
 390                return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
 391        } else {
 392                return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
 393        }
 394}
 395#endif
 396
 397#if ACCT_VERSION==3
 398/*
 399 * encode an u64 into a 32 bit IEEE float
 400 */
 401static u32 encode_float(u64 value)
 402{
 403        unsigned exp = 190;
 404        unsigned u;
 405
 406        if (value==0) return 0;
 407        while ((s64)value > 0){
 408                value <<= 1;
 409                exp--;
 410        }
 411        u = (u32)(value >> 40) & 0x7fffffu;
 412        return u | (exp << 23);
 413}
 414#endif
 415
 416/*
 417 *  Write an accounting entry for an exiting process
 418 *
 419 *  The acct_process() call is the workhorse of the process
 420 *  accounting system. The struct acct is built here and then written
 421 *  into the accounting file. This function should only be called from
 422 *  do_exit() or when switching to a different output file.
 423 */
 424
 425/*
 426 *  do_acct_process does all actual work. Caller holds the reference to file.
 427 */
 428static void do_acct_process(struct pid_namespace *ns, struct file *file)
 429{
 430        struct pacct_struct *pacct = &current->signal->pacct;
 431        acct_t ac;
 432        mm_segment_t fs;
 433        unsigned long flim;
 434        u64 elapsed;
 435        u64 run_time;
 436        struct timespec uptime;
 437        struct tty_struct *tty;
 438
 439        /*
 440         * First check to see if there is enough free_space to continue
 441         * the process accounting system.
 442         */
 443        if (!check_free_space(file))
 444                return;
 445
 446        /*
 447         * Fill the accounting struct with the needed info as recorded
 448         * by the different kernel functions.
 449         */
 450        memset((caddr_t)&ac, 0, sizeof(acct_t));
 451
 452        ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 453        strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 454
 455        /* calculate run_time in nsec*/
 456        do_posix_clock_monotonic_gettime(&uptime);
 457        run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
 458        run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
 459                       + current->group_leader->start_time.tv_nsec;
 460        /* convert nsec -> AHZ */
 461        elapsed = nsec_to_AHZ(run_time);
 462#if ACCT_VERSION==3
 463        ac.ac_etime = encode_float(elapsed);
 464#else
 465        ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
 466                               (unsigned long) elapsed : (unsigned long) -1l);
 467#endif
 468#if ACCT_VERSION==1 || ACCT_VERSION==2
 469        {
 470                /* new enlarged etime field */
 471                comp2_t etime = encode_comp2_t(elapsed);
 472                ac.ac_etime_hi = etime >> 16;
 473                ac.ac_etime_lo = (u16) etime;
 474        }
 475#endif
 476        do_div(elapsed, AHZ);
 477        ac.ac_btime = get_seconds() - elapsed;
 478        /* we really need to bite the bullet and change layout */
 479        ac.ac_uid = current->uid;
 480        ac.ac_gid = current->gid;
 481#if ACCT_VERSION==2
 482        ac.ac_ahz = AHZ;
 483#endif
 484#if ACCT_VERSION==1 || ACCT_VERSION==2
 485        /* backward-compatible 16 bit fields */
 486        ac.ac_uid16 = current->uid;
 487        ac.ac_gid16 = current->gid;
 488#endif
 489#if ACCT_VERSION==3
 490        ac.ac_pid = task_tgid_nr_ns(current, ns);
 491        rcu_read_lock();
 492        ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
 493        rcu_read_unlock();
 494#endif
 495
 496        spin_lock_irq(&current->sighand->siglock);
 497        tty = current->signal->tty;
 498        ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 499        ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 500        ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 501        ac.ac_flag = pacct->ac_flag;
 502        ac.ac_mem = encode_comp_t(pacct->ac_mem);
 503        ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
 504        ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
 505        ac.ac_exitcode = pacct->ac_exitcode;
 506        spin_unlock_irq(&current->sighand->siglock);
 507        ac.ac_io = encode_comp_t(0 /* current->io_usage */);    /* %% */
 508        ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 509        ac.ac_swaps = encode_comp_t(0);
 510
 511        /*
 512         * Kernel segment override to datasegment and write it
 513         * to the accounting file.
 514         */
 515        fs = get_fs();
 516        set_fs(KERNEL_DS);
 517        /*
 518         * Accounting records are not subject to resource limits.
 519         */
 520        flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
 521        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 522        file->f_op->write(file, (char *)&ac,
 523                               sizeof(acct_t), &file->f_pos);
 524        current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 525        set_fs(fs);
 526}
 527
 528/**
 529 * acct_init_pacct - initialize a new pacct_struct
 530 * @pacct: per-process accounting info struct to initialize
 531 */
 532void acct_init_pacct(struct pacct_struct *pacct)
 533{
 534        memset(pacct, 0, sizeof(struct pacct_struct));
 535        pacct->ac_utime = pacct->ac_stime = cputime_zero;
 536}
 537
 538/**
 539 * acct_collect - collect accounting information into pacct_struct
 540 * @exitcode: task exit code
 541 * @group_dead: not 0, if this thread is the last one in the process.
 542 */
 543void acct_collect(long exitcode, int group_dead)
 544{
 545        struct pacct_struct *pacct = &current->signal->pacct;
 546        unsigned long vsize = 0;
 547
 548        if (group_dead && current->mm) {
 549                struct vm_area_struct *vma;
 550                down_read(&current->mm->mmap_sem);
 551                vma = current->mm->mmap;
 552                while (vma) {
 553                        vsize += vma->vm_end - vma->vm_start;
 554                        vma = vma->vm_next;
 555                }
 556                up_read(&current->mm->mmap_sem);
 557        }
 558
 559        spin_lock_irq(&current->sighand->siglock);
 560        if (group_dead)
 561                pacct->ac_mem = vsize / 1024;
 562        if (thread_group_leader(current)) {
 563                pacct->ac_exitcode = exitcode;
 564                if (current->flags & PF_FORKNOEXEC)
 565                        pacct->ac_flag |= AFORK;
 566        }
 567        if (current->flags & PF_SUPERPRIV)
 568                pacct->ac_flag |= ASU;
 569        if (current->flags & PF_DUMPCORE)
 570                pacct->ac_flag |= ACORE;
 571        if (current->flags & PF_SIGNALED)
 572                pacct->ac_flag |= AXSIG;
 573        pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
 574        pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
 575        pacct->ac_minflt += current->min_flt;
 576        pacct->ac_majflt += current->maj_flt;
 577        spin_unlock_irq(&current->sighand->siglock);
 578}
 579
 580/**
 581 * acct_process - now just a wrapper around do_acct_process
 582 * @exitcode: task exit code
 583 *
 584 * handles process accounting for an exiting task
 585 */
 586void acct_process(void)
 587{
 588        struct file *file = NULL;
 589        struct pid_namespace *ns;
 590
 591        /*
 592         * accelerate the common fastpath:
 593         */
 594        if (!acct_globals.file)
 595                return;
 596
 597        spin_lock(&acct_globals.lock);
 598        file = acct_globals.file;
 599        if (unlikely(!file)) {
 600                spin_unlock(&acct_globals.lock);
 601                return;
 602        }
 603        get_file(file);
 604        ns = get_pid_ns(acct_globals.ns);
 605        spin_unlock(&acct_globals.lock);
 606
 607        do_acct_process(ns, file);
 608        fput(file);
 609        put_pid_ns(ns);
 610}
 611
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.